Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 30 Mar 2012 19:44:29 +0000 (12:44 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 30 Mar 2012 19:44:29 +0000 (12:44 -0700)
Pull btrfs fixes and features from Chris Mason:
 "We've merged in the error handling patches from SuSE.  These are
  already shipping in the sles kernel, and they give btrfs the ability
  to abort transactions and go readonly on errors.  It involves a lot of
  churn as they clarify BUG_ONs, and remove the ones we now properly
  deal with.
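
  As a rough sketch of the new pattern (illustrative only; the callee
  name here is made up), a failure now aborts the running transaction
  and the filesystem goes read-only instead of hitting a BUG():

      ret = btrfs_frob_metadata(trans, root);  /* hypothetical callee */
      if (ret) {
              /* record the error; the fs flips read-only */
              btrfs_abort_transaction(trans, root, ret);
              return ret;
      }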

  Josef reworked the way our metadata interacts with the page cache.
  page->private now points to the btrfs extent_buffer object, which
  makes everything faster.  He changed it so we write a whole extent
  buffer at a time instead of allowing individual pages to go down,
  which will be important for the raid5/6 code (for the 3.5 merge
  window ;)
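
  A minimal sketch of what this buys us (the same pattern appears in
  end_bio_extent_buffer_writepage() in the diff below): writeback can
  recover the extent_buffer straight from the page, with no radix tree
  lookup:

      struct extent_buffer *eb;

      BUG_ON(!PagePrivate(page));
      eb = (struct extent_buffer *)page->private;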

  Josef also made us more aggressive about dropping pages for metadata
  blocks that were freed due to COW.  Overall, our metadata caching is
  much faster now.

  We've integrated my patch for metadata bigger than the page size.
  This allows metadata blocks up to 64KB in size.  In practice 16K and
  32K seem to work best.  For workloads with lots of metadata, this cuts
  down the size of the extent allocation tree dramatically and fragments
  much less.
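
  For instance, assuming a btrfs-progs new enough to expose these knobs
  (the device name is a placeholder), a 16K block size is chosen at
  mkfs time with something like:

      mkfs.btrfs -l 16384 -n 16384 /dev/sdX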

  Scrub was updated to support the larger block sizes, which ended up
  being a fairly large change (thanks Stefan Behrens).

  We also have an assortment of fixes and updates, especially to the
  balancing code (Ilya Dryomov), the back ref walker (Jan Schmidt) and
  the defragging code (Liu Bo)."

Fixed up trivial conflicts in fs/btrfs/scrub.c that were just due to
removal of the second argument to k[un]map_atomic() in commit
7ac687d9e047.
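
For reference, the API change behind those conflicts (visible in the
kmap_atomic() hunks of the diff below) turned

    kaddr = kmap_atomic(page, KM_USER0);
    ...
    kunmap_atomic(kaddr, KM_USER0);

into

    kaddr = kmap_atomic(page);
    ...
    kunmap_atomic(kaddr);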

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (75 commits)
  Btrfs: update the checks for mixed block groups with big metadata blocks
  Btrfs: update to the right index of defragment
  Btrfs: do not bother to defrag an extent if it is a big real extent
  Btrfs: add a check to decide if we should defrag the range
  Btrfs: fix recursive defragment with autodefrag option
  Btrfs: fix the mismatch of page->mapping
  Btrfs: fix race between direct io and autodefrag
  Btrfs: fix deadlock during allocating chunks
  Btrfs: show useful info in space reservation tracepoint
  Btrfs: don't use crc items bigger than 4KB
  Btrfs: flush out and clean up any block device pages during mount
  btrfs: disallow unequal data/metadata blocksize for mixed block groups
  Btrfs: enhance superblock sanity checks
  Btrfs: change scrub to support big blocks
  Btrfs: minor cleanup in scrub
  Btrfs: introduce common define for max number of mirrors
  Btrfs: fix infinite loop in btrfs_shrink_device()
  Btrfs: fix memory leak in resolver code
  Btrfs: allow dup for data chunks in mixed mode
  Btrfs: validate target profiles only if we are going to use them
  ...

fs/btrfs/compression.c
fs/btrfs/extent_io.c
fs/btrfs/file-item.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/scrub.c
fs/btrfs/super.c

diff --combined fs/btrfs/compression.c
@@@ -120,10 -120,10 +120,10 @@@ static int check_compressed_csum(struc
                page = cb->compressed_pages[i];
                csum = ~(u32)0;
  
 -              kaddr = kmap_atomic(page, KM_USER0);
 +              kaddr = kmap_atomic(page);
                csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
                btrfs_csum_final(csum, (char *)&csum);
 -              kunmap_atomic(kaddr, KM_USER0);
 +              kunmap_atomic(kaddr);
  
                if (csum != *cb_sum) {
                        printk(KERN_INFO "btrfs csum failed ino %llu "
@@@ -226,8 -226,8 +226,8 @@@ out
   * Clear the writeback bits on all of the file
   * pages for a compressed write
   */
- static noinline int end_compressed_writeback(struct inode *inode, u64 start,
-                                            unsigned long ram_size)
+ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+                                             unsigned long ram_size)
  {
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
                index += ret;
        }
        /* the inode may be gone now */
-       return 0;
  }
  
  /*
@@@ -392,16 -391,16 +391,16 @@@ int btrfs_submit_compressed_write(struc
                         */
                        atomic_inc(&cb->pending_bios);
                        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
  
                        if (!skip_sum) {
                                ret = btrfs_csum_one_bio(root, inode, bio,
                                                         start, 1);
-                               BUG_ON(ret);
+                               BUG_ON(ret); /* -ENOMEM */
                        }
  
                        ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
  
                        bio_put(bio);
  
        bio_get(bio);
  
        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
  
        if (!skip_sum) {
                ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
        }
  
        ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
  
        bio_put(bio);
        return 0;
@@@ -497,7 -496,7 +496,7 @@@ static noinline int add_ra_bio_pages(st
                 * sure they map to this compressed extent on disk.
                 */
                set_page_extent_mapped(page);
-               lock_extent(tree, last_offset, end, GFP_NOFS);
+               lock_extent(tree, last_offset, end);
                read_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, last_offset,
                                           PAGE_CACHE_SIZE);
                    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
                    (em->block_start >> 9) != cb->orig_bio->bi_sector) {
                        free_extent_map(em);
-                       unlock_extent(tree, last_offset, end, GFP_NOFS);
+                       unlock_extent(tree, last_offset, end);
                        unlock_page(page);
                        page_cache_release(page);
                        break;
                        if (zero_offset) {
                                int zeros;
                                zeros = PAGE_CACHE_SIZE - zero_offset;
 -                              userpage = kmap_atomic(page, KM_USER0);
 +                              userpage = kmap_atomic(page);
                                memset(userpage + zero_offset, 0, zeros);
                                flush_dcache_page(page);
 -                              kunmap_atomic(userpage, KM_USER0);
 +                              kunmap_atomic(userpage);
                        }
                }
  
                        nr_pages++;
                        page_cache_release(page);
                } else {
-                       unlock_extent(tree, last_offset, end, GFP_NOFS);
+                       unlock_extent(tree, last_offset, end);
                        unlock_page(page);
                        page_cache_release(page);
                        break;
@@@ -662,7 -661,7 +661,7 @@@ int btrfs_submit_compressed_read(struc
                        bio_get(comp_bio);
  
                        ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
  
                        /*
                         * inc the count before we submit the bio so
                        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                                ret = btrfs_lookup_bio_sums(root, inode,
                                                        comp_bio, sums);
-                               BUG_ON(ret);
+                               BUG_ON(ret); /* -ENOMEM */
                        }
                        sums += (comp_bio->bi_size + root->sectorsize - 1) /
                                root->sectorsize;
  
                        ret = btrfs_map_bio(root, READ, comp_bio,
                                            mirror_num, 0);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM */
  
                        bio_put(comp_bio);
  
        bio_get(comp_bio);
  
        ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
  
        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
                ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
        }
  
        ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
  
        bio_put(comp_bio);
        return 0;
@@@ -734,7 -733,7 +733,7 @@@ struct btrfs_compress_op *btrfs_compres
        &btrfs_lzo_compress,
  };
  
- int __init btrfs_init_compress(void)
+ void __init btrfs_init_compress(void)
  {
        int i;
  
                atomic_set(&comp_alloc_workspace[i], 0);
                init_waitqueue_head(&comp_workspace_wait[i]);
        }
-       return 0;
  }
  
  /*
@@@ -993,9 -991,9 +991,9 @@@ int btrfs_decompress_buf2page(char *buf
                bytes = min(PAGE_CACHE_SIZE - *pg_offset,
                            PAGE_CACHE_SIZE - buf_offset);
                bytes = min(bytes, working_bytes);
 -              kaddr = kmap_atomic(page_out, KM_USER0);
 +              kaddr = kmap_atomic(page_out);
                memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
 -              kunmap_atomic(kaddr, KM_USER0);
 +              kunmap_atomic(kaddr);
                flush_dcache_page(page_out);
  
                *pg_offset += bytes;
diff --combined fs/btrfs/extent_io.c
@@@ -19,6 -19,7 +19,7 @@@
  #include "btrfs_inode.h"
  #include "volumes.h"
  #include "check-integrity.h"
+ #include "locking.h"
  
  static struct kmem_cache *extent_state_cache;
  static struct kmem_cache *extent_buffer_cache;
@@@ -53,6 -54,13 +54,13 @@@ struct extent_page_data 
        unsigned int sync_io:1;
  };
  
+ static noinline void flush_write_bio(void *data);
+ static inline struct btrfs_fs_info *
+ tree_fs_info(struct extent_io_tree *tree)
+ {
+       return btrfs_sb(tree->mapping->host->i_sb);
+ }
  int __init extent_io_init(void)
  {
        extent_state_cache = kmem_cache_create("extent_state",
@@@ -136,6 -144,7 +144,7 @@@ static struct extent_state *alloc_exten
  #endif
        atomic_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
+       trace_alloc_extent_state(state, mask, _RET_IP_);
        return state;
  }
  
@@@ -153,6 -162,7 +162,7 @@@ void free_extent_state(struct extent_st
                list_del(&state->leak_list);
                spin_unlock_irqrestore(&leak_lock, flags);
  #endif
+               trace_free_extent_state(state, _RET_IP_);
                kmem_cache_free(extent_state_cache, state);
        }
  }
@@@ -439,6 -449,13 +449,13 @@@ alloc_extent_state_atomic(struct extent
        return prealloc;
  }
  
+ void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+ {
+       btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+                   "Extent tree was modified by another "
+                   "thread while locked.");
+ }
  /*
   * clear some bits on a range in the tree.  This may require splitting
   * or inserting elements in the tree, so the gfp mask is used to
   *
   * the range [start, end] is inclusive.
   *
-  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
-  * bits were already set, or zero if none of the bits were already set.
+  * This takes the tree lock, and returns 0 on success and < 0 on error.
   */
  int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     int bits, int wake, int delete,
        struct rb_node *node;
        u64 last_end;
        int err;
-       int set = 0;
        int clear = 0;
  
        if (delete)
@@@ -542,12 -557,14 +557,14 @@@ hit_next
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set |= clear_state_bit(tree, state, &bits, wake);
+                       clear_state_bit(tree, state, &bits, wake);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                if (wake)
                        wake_up(&state->wq);
  
-               set |= clear_state_bit(tree, prealloc, &bits, wake);
+               clear_state_bit(tree, prealloc, &bits, wake);
  
                prealloc = NULL;
                goto out;
        }
  
-       set |= clear_state_bit(tree, state, &bits, wake);
+       clear_state_bit(tree, state, &bits, wake);
  next:
        if (last_end == (u64)-1)
                goto out;
@@@ -591,7 -610,7 +610,7 @@@ out
        if (prealloc)
                free_extent_state(prealloc);
  
-       return set;
+       return 0;
  
  search_again:
        if (start > end)
        goto again;
  }
  
- static int wait_on_state(struct extent_io_tree *tree,
-                        struct extent_state *state)
+ static void wait_on_state(struct extent_io_tree *tree,
+                         struct extent_state *state)
                __releases(tree->lock)
                __acquires(tree->lock)
  {
        schedule();
        spin_lock(&tree->lock);
        finish_wait(&state->wq, &wait);
-       return 0;
  }
  
  /*
   * The range [start, end] is inclusive.
   * The tree lock is taken by this function
   */
- int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
  {
        struct extent_state *state;
        struct rb_node *node;
@@@ -658,7 -676,6 +676,6 @@@ again
        }
  out:
        spin_unlock(&tree->lock);
-       return 0;
  }
  
  static void set_state_bits(struct extent_io_tree *tree,
@@@ -706,9 -723,10 +723,10 @@@ static void uncache_state(struct extent
   * [start, end] is inclusive This takes the tree lock.
   */
  
- int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  int bits, int exclusive_bits, u64 *failed_start,
-                  struct extent_state **cached_state, gfp_t mask)
+ static int __must_check
+ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                int bits, int exclusive_bits, u64 *failed_start,
+                struct extent_state **cached_state, gfp_t mask)
  {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@@ -742,8 -760,10 +760,10 @@@ again
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end, &bits);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
-               BUG_ON(err == -EEXIST);
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
@@@ -809,7 -829,9 +829,9 @@@ hit_next
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
                if (err)
                        goto out;
                 */
                err = insert_state(tree, prealloc, start, this_end,
                                   &bits);
-               BUG_ON(err == -EEXIST);
-               if (err) {
-                       free_extent_state(prealloc);
-                       prealloc = NULL;
-                       goto out;
-               }
+               if (err)
+                       extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                start = this_end + 1;
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
  
                set_state_bits(tree, prealloc, &bits);
                cache_state(prealloc, cached_state);
@@@ -900,6 -920,15 +920,15 @@@ search_again
        goto again;
  }
  
+ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+                  u64 *failed_start, struct extent_state **cached_state,
+                  gfp_t mask)
+ {
+       return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+                               cached_state, mask);
+ }
  /**
   * convert_extent - convert all bits in a given range from one bit to another
   * @tree:     the io tree to search
@@@ -946,7 -975,8 +975,8 @@@ again
                }
                err = insert_state(tree, prealloc, start, end, &bits);
                prealloc = NULL;
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
@@@ -1002,7 -1032,8 +1032,8 @@@ hit_next
                        goto out;
                }
                err = split_state(tree, state, prealloc, start);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
                if (err)
                        goto out;
                 */
                err = insert_state(tree, prealloc, start, this_end,
                                   &bits);
-               BUG_ON(err == -EEXIST);
-               if (err) {
-                       free_extent_state(prealloc);
-                       prealloc = NULL;
-                       goto out;
-               }
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
                }
  
                err = split_state(tree, state, prealloc, end + 1);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
  
                set_state_bits(tree, prealloc, &bits);
                clear_state_bit(tree, prealloc, &clear_bits, 0);
@@@ -1095,14 -1123,14 +1123,14 @@@ search_again
  int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
  {
-       return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+       return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
                              NULL, mask);
  }
  
  int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask)
  {
-       return set_extent_bit(tree, start, end, bits, 0, NULL,
+       return set_extent_bit(tree, start, end, bits, NULL,
                              NULL, mask);
  }
  
@@@ -1117,7 -1145,7 +1145,7 @@@ int set_extent_delalloc(struct extent_i
  {
        return set_extent_bit(tree, start, end,
                              EXTENT_DELALLOC | EXTENT_UPTODATE,
-                             0, NULL, cached_state, mask);
+                             NULL, cached_state, mask);
  }
  
  int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
  int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
  {
-       return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+       return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
                              NULL, mask);
  }
  
@@@ -1139,7 -1167,7 +1167,7 @@@ int set_extent_uptodate(struct extent_i
                        struct extent_state **cached_state, gfp_t mask)
  {
        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
-                             NULL, cached_state, mask);
+                             cached_state, mask);
  }
  
  static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
   * us if waiting is desired.
   */
  int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    int bits, struct extent_state **cached_state, gfp_t mask)
+                    int bits, struct extent_state **cached_state)
  {
        int err;
        u64 failed_start;
        while (1) {
-               err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
-                                    EXTENT_LOCKED, &failed_start,
-                                    cached_state, mask);
-               if (err == -EEXIST && (mask & __GFP_WAIT)) {
+               err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+                                      EXTENT_LOCKED, &failed_start,
+                                      cached_state, GFP_NOFS);
+               if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
-               } else {
+               } else
                        break;
-               }
                WARN_ON(start > end);
        }
        return err;
  }
  
- int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
  {
-       return lock_extent_bits(tree, start, end, 0, NULL, mask);
+       return lock_extent_bits(tree, start, end, 0, NULL);
  }
  
- int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-                   gfp_t mask)
+ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
  {
        int err;
        u64 failed_start;
  
-       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-                            &failed_start, NULL, mask);
+       err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+                              &failed_start, NULL, GFP_NOFS);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
-                                        EXTENT_LOCKED, 1, 0, NULL, mask);
+                                        EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
                return 0;
        }
        return 1;
@@@ -1203,10 -1229,10 +1229,10 @@@ int unlock_extent_cached(struct extent_
                                mask);
  }
  
- int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
  {
        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
-                               mask);
+                               GFP_NOFS);
  }
  
  /*
@@@ -1220,7 -1246,7 +1246,7 @@@ static int set_range_writeback(struct e
  
        while (index <= end_index) {
                page = find_get_page(tree->mapping, index);
-               BUG_ON(!page);
+               BUG_ON(!page); /* Pages should be in the extent_io_tree */
                set_page_writeback(page);
                page_cache_release(page);
                index++;
@@@ -1343,9 -1369,9 +1369,9 @@@ out
        return found;
  }
  
- static noinline int __unlock_for_delalloc(struct inode *inode,
-                                         struct page *locked_page,
-                                         u64 start, u64 end)
+ static noinline void __unlock_for_delalloc(struct inode *inode,
+                                          struct page *locked_page,
+                                          u64 start, u64 end)
  {
        int ret;
        struct page *pages[16];
        int i;
  
        if (index == locked_page->index && end_index == index)
-               return 0;
+               return;
  
        while (nr_pages > 0) {
                ret = find_get_pages_contig(inode->i_mapping, index,
                index += ret;
                cond_resched();
        }
-       return 0;
  }
  
  static noinline int lock_delalloc_pages(struct inode *inode,
@@@ -1500,11 -1525,10 +1525,10 @@@ again
                        goto out_failed;
                }
        }
-       BUG_ON(ret);
+       BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
  
        /* step three, lock the state bits for the whole range */
-       lock_extent_bits(tree, delalloc_start, delalloc_end,
-                        0, &cached_state, GFP_NOFS);
+       lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
  
        /* then test to make sure it is all still delalloc */
        ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@@ -1761,39 -1785,34 +1785,34 @@@ int test_range_bit(struct extent_io_tre
   * helper function to set a given page up to date if all the
   * extents in the tree for that page are up to date
   */
- static int check_page_uptodate(struct extent_io_tree *tree,
-                              struct page *page)
+ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
  {
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
                SetPageUptodate(page);
-       return 0;
  }
  
  /*
   * helper function to unlock a page if all the extents in the tree
   * for that page are unlocked
   */
- static int check_page_locked(struct extent_io_tree *tree,
-                            struct page *page)
+ static void check_page_locked(struct extent_io_tree *tree, struct page *page)
  {
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
                unlock_page(page);
-       return 0;
  }
  
  /*
   * helper function to end page writeback if all the extents
   * in the tree for that page are done with writeback
   */
- static int check_page_writeback(struct extent_io_tree *tree,
-                            struct page *page)
+ static void check_page_writeback(struct extent_io_tree *tree,
+                                struct page *page)
  {
        end_page_writeback(page);
-       return 0;
  }
  
  /*
@@@ -1912,6 -1931,26 +1931,26 @@@ int repair_io_failure(struct btrfs_mapp
        return 0;
  }
  
+ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+                        int mirror_num)
+ {
+       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+       u64 start = eb->start;
+       unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+       int ret;
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+                                       start, p, mirror_num);
+               if (ret)
+                       break;
+               start += PAGE_CACHE_SIZE;
+       }
+       return ret;
+ }
  /*
   * each time an IO finishes, we do a fast check in the IO failure tree
   * to see if we need to process or clean up an io_failure_record
@@@ -2258,6 -2297,7 +2297,7 @@@ static void end_bio_extent_readpage(str
        u64 start;
        u64 end;
        int whole_page;
+       int failed_mirror;
        int ret;
  
        if (err)
                        else
                                clean_io_failure(start, page);
                }
-               if (!uptodate) {
-                       int failed_mirror;
+               if (!uptodate)
                        failed_mirror = (int)(unsigned long)bio->bi_bdev;
+               if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+                       ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+                       if (!ret && !err &&
+                           test_bit(BIO_UPTODATE, &bio->bi_flags))
+                               uptodate = 1;
+               } else if (!uptodate) {
                        /*
                         * The generic bio_readpage_error handles errors the
                         * following way: If possible, new read requests are
                        ret = bio_readpage_error(bio, page, start, end,
                                                        failed_mirror, NULL);
                        if (ret == 0) {
- error_handled:
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
                                uncache_state(&cached);
                                continue;
                        }
-                       if (tree->ops && tree->ops->readpage_io_failed_hook) {
-                               ret = tree->ops->readpage_io_failed_hook(
-                                                       bio, page, start, end,
-                                                       failed_mirror, state);
-                               if (ret == 0)
-                                       goto error_handled;
-                       }
                }
  
-               if (uptodate) {
+               if (uptodate && tree->track_uptodate) {
                        set_extent_uptodate(tree, start, end, &cached,
                                            GFP_ATOMIC);
                }
@@@ -2386,8 -2425,12 +2425,12 @@@ btrfs_bio_alloc(struct block_device *bd
        return bio;
  }
  
- static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
-                         unsigned long bio_flags)
+ /*
+  * Since writes are async, they will only return -ENOMEM.
+  * Reads can return the full range of I/O error conditions.
+  */
+ static int __must_check submit_one_bio(int rw, struct bio *bio,
+                                      int mirror_num, unsigned long bio_flags)
  {
        int ret = 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        return ret;
  }
  
+ static int merge_bio(struct extent_io_tree *tree, struct page *page,
+                    unsigned long offset, size_t size, struct bio *bio,
+                    unsigned long bio_flags)
+ {
+       int ret = 0;
+       if (tree->ops && tree->ops->merge_bio_hook)
+               ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+                                               bio_flags);
+       BUG_ON(ret < 0);
+       return ret;
+ }
  static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              struct page *page, sector_t sector,
                              size_t size, unsigned long offset,
                                sector;
  
                if (prev_bio_flags != bio_flags || !contig ||
-                   (tree->ops && tree->ops->merge_bio_hook &&
-                    tree->ops->merge_bio_hook(page, offset, page_size, bio,
-                                              bio_flags)) ||
+                   merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
                                             prev_bio_flags);
+                       if (ret < 0)
+                               return ret;
                        bio = NULL;
                } else {
                        return 0;
        return ret;
  }
  
- void set_page_extent_mapped(struct page *page)
+ void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
  {
        if (!PagePrivate(page)) {
                SetPagePrivate(page);
                page_cache_get(page);
-               set_page_private(page, EXTENT_PAGE_PRIVATE);
+               set_page_private(page, (unsigned long)eb);
+       } else {
+               WARN_ON(page->private != (unsigned long)eb);
        }
  }
  
- static void set_page_extent_head(struct page *page, unsigned long len)
+ void set_page_extent_mapped(struct page *page)
  {
-       WARN_ON(!PagePrivate(page));
-       set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+       if (!PagePrivate(page)) {
+               SetPagePrivate(page);
+               page_cache_get(page);
+               set_page_private(page, EXTENT_PAGE_PRIVATE);
+       }
  }
  
  /*
   * basic readpage implementation.  Locked extent state structs are inserted
   * into the tree that are removed when the IO is done (by the end_io
   * handlers)
+  * XXX JDM: This needs looking at to ensure proper page locking
   */
  static int __extent_read_full_page(struct extent_io_tree *tree,
                                   struct page *page,
  
        end = page_end;
        while (1) {
-               lock_extent(tree, start, end, GFP_NOFS);
+               lock_extent(tree, start, end);
                ordered = btrfs_lookup_ordered_extent(inode, start);
                if (!ordered)
                        break;
-               unlock_extent(tree, start, end, GFP_NOFS);
+               unlock_extent(tree, start, end);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }
  
                if (zero_offset) {
                        iosize = PAGE_CACHE_SIZE - zero_offset;
 -                      userpage = kmap_atomic(page, KM_USER0);
 +                      userpage = kmap_atomic(page);
                        memset(userpage + zero_offset, 0, iosize);
                        flush_dcache_page(page);
 -                      kunmap_atomic(userpage, KM_USER0);
 +                      kunmap_atomic(userpage);
                }
        }
        while (cur <= end) {
                        struct extent_state *cached = NULL;
  
                        iosize = PAGE_CACHE_SIZE - pg_offset;
 -                      userpage = kmap_atomic(page, KM_USER0);
 +                      userpage = kmap_atomic(page);
                        memset(userpage + pg_offset, 0, iosize);
                        flush_dcache_page(page);
 -                      kunmap_atomic(userpage, KM_USER0);
 +                      kunmap_atomic(userpage);
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
                        unlock_extent_cached(tree, cur, cur + iosize - 1,
                                end - cur + 1, 0);
                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
-                       unlock_extent(tree, cur, end, GFP_NOFS);
+                       unlock_extent(tree, cur, end);
                        break;
                }
                extent_offset = cur - em->start;
                        char *userpage;
                        struct extent_state *cached = NULL;
  
 -                      userpage = kmap_atomic(page, KM_USER0);
 +                      userpage = kmap_atomic(page);
                        memset(userpage + pg_offset, 0, iosize);
                        flush_dcache_page(page);
 -                      kunmap_atomic(userpage, KM_USER0);
 +                      kunmap_atomic(userpage);
  
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
                        check_page_uptodate(tree, page);
-                       unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                 */
                if (block_start == EXTENT_MAP_INLINE) {
                        SetPageError(page);
-                       unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
                                         this_bio_flag);
+                       BUG_ON(ret == -ENOMEM);
                        nr++;
                        *bio_flags = this_bio_flag;
                }
@@@ -2756,10 -2819,10 +2819,10 @@@ static int __extent_writepage(struct pa
        if (page->index == end_index) {
                char *userpage;
  
 -              userpage = kmap_atomic(page, KM_USER0);
 +              userpage = kmap_atomic(page);
                memset(userpage + pg_offset, 0,
                       PAGE_CACHE_SIZE - pg_offset);
 -              kunmap_atomic(userpage, KM_USER0);
 +              kunmap_atomic(userpage);
                flush_dcache_page(page);
        }
        pg_offset = 0;
                                                       delalloc_end,
                                                       &page_started,
                                                       &nr_written);
-                       BUG_ON(ret);
+                       /* File system has been set read-only */
+                       if (ret) {
+                               SetPageError(page);
+                               goto done;
+                       }
                        /*
                         * delalloc_end is already one less than the total
                         * length, so we don't subtract one from
@@@ -2968,6 -3035,275 +3035,275 @@@ done_unlocked
        return 0;
  }
  
+ static int eb_wait(void *word)
+ {
+       io_schedule();
+       return 0;
+ }
+ static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+ {
+       wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+                   TASK_UNINTERRUPTIBLE);
+ }
+ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+                                    struct btrfs_fs_info *fs_info,
+                                    struct extent_page_data *epd)
+ {
+       unsigned long i, num_pages;
+       int flush = 0;
+       int ret = 0;
+       if (!btrfs_try_tree_write_lock(eb)) {
+               flush = 1;
+               flush_write_bio(epd);
+               btrfs_tree_lock(eb);
+       }
+       if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+               btrfs_tree_unlock(eb);
+               if (!epd->sync_io)
+                       return 0;
+               if (!flush) {
+                       flush_write_bio(epd);
+                       flush = 1;
+               }
+               while (1) {
+                       wait_on_extent_buffer_writeback(eb);
+                       btrfs_tree_lock(eb);
+                       if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+                               break;
+                       btrfs_tree_unlock(eb);
+               }
+       }
+       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+               set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+               btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+               spin_lock(&fs_info->delalloc_lock);
+               if (fs_info->dirty_metadata_bytes >= eb->len)
+                       fs_info->dirty_metadata_bytes -= eb->len;
+               else
+                       WARN_ON(1);
+               spin_unlock(&fs_info->delalloc_lock);
+               ret = 1;
+       }
+       btrfs_tree_unlock(eb);
+       if (!ret)
+               return ret;
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               if (!trylock_page(p)) {
+                       if (!flush) {
+                               flush_write_bio(epd);
+                               flush = 1;
+                       }
+                       lock_page(p);
+               }
+       }
+       return ret;
+ }
+ static void end_extent_buffer_writeback(struct extent_buffer *eb)
+ {
+       clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+       smp_mb__after_clear_bit();
+       wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+ }
+ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+ {
+       int uptodate = err == 0;
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct extent_buffer *eb;
+       int done;
+       do {
+               struct page *page = bvec->bv_page;
+               bvec--;
+               eb = (struct extent_buffer *)page->private;
+               BUG_ON(!eb);
+               done = atomic_dec_and_test(&eb->io_pages);
+               if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+                       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+                       ClearPageUptodate(page);
+                       SetPageError(page);
+               }
+               end_page_writeback(page);
+               if (!done)
+                       continue;
+               end_extent_buffer_writeback(eb);
+       } while (bvec >= bio->bi_io_vec);
+       bio_put(bio);
+ }
+ static int write_one_eb(struct extent_buffer *eb,
+                       struct btrfs_fs_info *fs_info,
+                       struct writeback_control *wbc,
+                       struct extent_page_data *epd)
+ {
+       struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+       u64 offset = eb->start;
+       unsigned long i, num_pages;
+       int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+       int ret;
+       clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       atomic_set(&eb->io_pages, num_pages);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               clear_page_dirty_for_io(p);
+               set_page_writeback(p);
+               ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+                                        PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+                                        -1, end_bio_extent_buffer_writepage,
+                                        0, 0, 0);
+               if (ret) {
+                       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+                       SetPageError(p);
+                       if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+                               end_extent_buffer_writeback(eb);
+                       ret = -EIO;
+                       break;
+               }
+               offset += PAGE_CACHE_SIZE;
+               update_nr_written(p, wbc, 1);
+               unlock_page(p);
+       }
+       if (unlikely(ret)) {
+               for (; i < num_pages; i++) {
+                       struct page *p = extent_buffer_page(eb, i);
+                       unlock_page(p);
+               }
+       }
+       return ret;
+ }
+ int btree_write_cache_pages(struct address_space *mapping,
+                                  struct writeback_control *wbc)
+ {
+       struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+       struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+       struct extent_buffer *eb, *prev_eb = NULL;
+       struct extent_page_data epd = {
+               .bio = NULL,
+               .tree = tree,
+               .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+       };
+       int ret = 0;
+       int done = 0;
+       int nr_to_write_done = 0;
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       int scanned = 0;
+       int tag;
+       pagevec_init(&pvec, 0);
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index; /* Start from prev offset */
+               end = -1;
+       } else {
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               scanned = 1;
+       }
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+ retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
+       while (!done && !nr_to_write_done && (index <= end) &&
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+               unsigned i;
+               scanned = 1;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       if (!PagePrivate(page))
+                               continue;
+                       if (!wbc->range_cyclic && page->index > end) {
+                               done = 1;
+                               break;
+                       }
+                       eb = (struct extent_buffer *)page->private;
+                       if (!eb) {
+                               WARN_ON(1);
+                               continue;
+                       }
+                       if (eb == prev_eb)
+                               continue;
+                       if (!atomic_inc_not_zero(&eb->refs)) {
+                               WARN_ON(1);
+                               continue;
+                       }
+                       prev_eb = eb;
+                       ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+                       if (!ret) {
+                               free_extent_buffer(eb);
+                               continue;
+                       }
+                       ret = write_one_eb(eb, fs_info, wbc, &epd);
+                       if (ret) {
+                               done = 1;
+                               free_extent_buffer(eb);
+                               break;
+                       }
+                       free_extent_buffer(eb);
+                       /*
+                        * the filesystem may choose to bump up nr_to_write.
+                        * We have to make sure to honor the new nr_to_write
+                        * at any time
+                        */
+                       nr_to_write_done = wbc->nr_to_write <= 0;
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       if (!scanned && !done) {
+               /*
+                * We hit the last page and there is more work to be done: wrap
+                * back to the start of the file
+                */
+               scanned = 1;
+               index = 0;
+               goto retry;
+       }
+       flush_write_bio(&epd);
+       return ret;
+ }
  /**
   * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
   * @mapping: address space structure to write
@@@ -3099,10 -3435,14 +3435,14 @@@ retry
  static void flush_epd_write_bio(struct extent_page_data *epd)
  {
        if (epd->bio) {
+               int rw = WRITE;
+               int ret;
                if (epd->sync_io)
-                       submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
-               else
-                       submit_one_bio(WRITE, epd->bio, 0, 0);
+                       rw = WRITE_SYNC;
+               ret = submit_one_bio(rw, epd->bio, 0, 0);
+               BUG_ON(ret < 0); /* -ENOMEM */
                epd->bio = NULL;
        }
  }
@@@ -3219,7 -3559,7 +3559,7 @@@ int extent_readpages(struct extent_io_t
        }
        BUG_ON(!list_empty(pages));
        if (bio)
-               submit_one_bio(READ, bio, 0, bio_flags);
+               return submit_one_bio(READ, bio, 0, bio_flags);
        return 0;
  }
  
@@@ -3240,7 -3580,7 +3580,7 @@@ int extent_invalidatepage(struct extent
        if (start > end)
                return 0;
  
-       lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+       lock_extent_bits(tree, start, end, 0, &cached_state);
        wait_on_page_writeback(page);
        clear_extent_bit(tree, start, end,
                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@@ -3454,7 -3794,7 +3794,7 @@@ int extent_fiemap(struct inode *inode, 
        }
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
-                        &cached_state, GFP_NOFS);
+                        &cached_state);
  
        em = get_extent_skip_holes(inode, start, last_for_get_extent,
                                   get_extent);
@@@ -3548,26 -3888,7 +3888,7 @@@ out
  inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
  {
-       struct page *p;
-       struct address_space *mapping;
-       if (i == 0)
-               return eb->first_page;
-       i += eb->start >> PAGE_CACHE_SHIFT;
-       mapping = eb->first_page->mapping;
-       if (!mapping)
-               return NULL;
-       /*
-        * extent_buffer_page is only called after pinning the page
-        * by increasing the reference count.  So we know the page must
-        * be in the radix tree.
-        */
-       rcu_read_lock();
-       p = radix_tree_lookup(&mapping->page_tree, i);
-       rcu_read_unlock();
-       return p;
+       return eb->pages[i];
  }
  
  inline unsigned long num_extent_pages(u64 start, u64 len)
                (start >> PAGE_CACHE_SHIFT);
  }
  
+ static void __free_extent_buffer(struct extent_buffer *eb)
+ {
+ #if LEAK_DEBUG
+       unsigned long flags;
+       spin_lock_irqsave(&leak_lock, flags);
+       list_del(&eb->leak_list);
+       spin_unlock_irqrestore(&leak_lock, flags);
+ #endif
+       if (eb->pages && eb->pages != eb->inline_pages)
+               kfree(eb->pages);
+       kmem_cache_free(extent_buffer_cache, eb);
+ }
  static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                                                   u64 start,
                                                   unsigned long len,
                return NULL;
        eb->start = start;
        eb->len = len;
+       eb->tree = tree;
        rwlock_init(&eb->lock);
        atomic_set(&eb->write_locks, 0);
        atomic_set(&eb->read_locks, 0);
        list_add(&eb->leak_list, &buffers);
        spin_unlock_irqrestore(&leak_lock, flags);
  #endif
+       spin_lock_init(&eb->refs_lock);
        atomic_set(&eb->refs, 1);
+       atomic_set(&eb->io_pages, 0);
+       if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+               struct page **pages;
+               int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+                       PAGE_CACHE_SHIFT;
+               pages = kzalloc(num_pages, mask);
+               if (!pages) {
+                       __free_extent_buffer(eb);
+                       return NULL;
+               }
+               eb->pages = pages;
+       } else {
+               eb->pages = eb->inline_pages;
+       }
  
        return eb;
  }
  
- static void __free_extent_buffer(struct extent_buffer *eb)
+ static int extent_buffer_under_io(struct extent_buffer *eb)
  {
- #if LEAK_DEBUG
-       unsigned long flags;
-       spin_lock_irqsave(&leak_lock, flags);
-       list_del(&eb->leak_list);
-       spin_unlock_irqrestore(&leak_lock, flags);
- #endif
-       kmem_cache_free(extent_buffer_cache, eb);
+       return (atomic_read(&eb->io_pages) ||
+               test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+               test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
  }
  
  /*
@@@ -3632,8 -3979,7 +3979,7 @@@ static void btrfs_release_extent_buffer
        unsigned long index;
        struct page *page;
  
-       if (!eb->first_page)
-               return;
+       BUG_ON(extent_buffer_under_io(eb));
  
        index = num_extent_pages(eb->start, eb->len);
        if (start_idx >= index)
        do {
                index--;
                page = extent_buffer_page(eb, index);
-               if (page)
+               if (page) {
+                       spin_lock(&page->mapping->private_lock);
+                       /*
+                        * We do this since we'll remove the pages after we've
+                        * removed the eb from the radix tree, so we could race
+                        * and have this page now attached to the new eb.  So
+                        * only clear page_private if it's still connected to
+                        * this eb.
+                        */
+                       if (PagePrivate(page) &&
+                           page->private == (unsigned long)eb) {
+                               BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+                               BUG_ON(PageDirty(page));
+                               BUG_ON(PageWriteback(page));
+                               /*
+                                * We need to make sure we haven't be attached
+                                * to a new eb.
+                                */
+                               ClearPagePrivate(page);
+                               set_page_private(page, 0);
+                               /* One for the page private */
+                               page_cache_release(page);
+                       }
+                       spin_unlock(&page->mapping->private_lock);
+                       /* One for when we alloced the page */
                        page_cache_release(page);
+               }
        } while (index != start_idx);
  }
  
@@@ -3656,9 -4028,50 +4028,50 @@@ static inline void btrfs_release_extent
        __free_extent_buffer(eb);
  }
  
+ static void check_buffer_tree_ref(struct extent_buffer *eb)
+ {
+       /* the ref bit is tricky.  We have to make sure it is set
+        * if we have the buffer dirty.   Otherwise the
+        * code to free a buffer can end up dropping a dirty
+        * page
+        *
+        * Once the ref bit is set, it won't go away while the
+        * buffer is dirty or in writeback, and it also won't
+        * go away while we have the reference count on the
+        * eb bumped.
+        *
+        * We can't just set the ref bit without bumping the
+        * ref on the eb because free_extent_buffer might
+        * see the ref bit and try to clear it.  If this happens
+        * free_extent_buffer might end up dropping our original
+        * ref by mistake and freeing the page before we are able
+        * to add one more ref.
+        *
+        * So bump the ref count first, then set the bit.  If someone
+        * beat us to it, drop the ref we added.
+        */
+       if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               atomic_inc(&eb->refs);
+               if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+                       atomic_dec(&eb->refs);
+       }
+ }
+ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+ {
+       unsigned long num_pages, i;
+       check_buffer_tree_ref(eb);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               mark_page_accessed(p);
+       }
+ }
  struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
-                                         struct page *page0)
+                                         u64 start, unsigned long len)
  {
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();
        if (!eb)
                return NULL;
  
-       if (page0) {
-               eb->first_page = page0;
-               i = 1;
-               index++;
-               page_cache_get(page0);
-               mark_page_accessed(page0);
-               set_page_extent_mapped(page0);
-               set_page_extent_head(page0, len);
-               uptodate = PageUptodate(page0);
-       } else {
-               i = 0;
-       }
-       for (; i < num_pages; i++, index++) {
+       for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS);
                if (!p) {
                        WARN_ON(1);
                        goto free_eb;
                }
-               set_page_extent_mapped(p);
-               mark_page_accessed(p);
-               if (i == 0) {
-                       eb->first_page = p;
-                       set_page_extent_head(p, len);
-               } else {
-                       set_page_private(p, EXTENT_PAGE_PRIVATE);
+               spin_lock(&mapping->private_lock);
+               if (PagePrivate(p)) {
+                       /*
+                        * An eb may already have been allocated for this page
+                        * and attached to it, so let's see if we can get a ref
+                        * on the existing eb.  If we can, we know it's good and
+                        * we can just return that one; otherwise we know we can
+                        * safely overwrite page->private.
+                        */
+                       exists = (struct extent_buffer *)p->private;
+                       if (atomic_inc_not_zero(&exists->refs)) {
+                               spin_unlock(&mapping->private_lock);
+                               unlock_page(p);
+                               mark_extent_buffer_accessed(exists);
+                               goto free_eb;
+                       }
+                       /*
+                        * Clear private so attach_extent_buffer_page() doesn't
+                        * complain, and drop the page ref the old eb held.
+                        */
+                       ClearPagePrivate(p);
+                       WARN_ON(PageDirty(p));
+                       page_cache_release(p);
                }
+               attach_extent_buffer_page(eb, p);
+               spin_unlock(&mapping->private_lock);
+               WARN_ON(PageDirty(p));
+               mark_page_accessed(p);
+               eb->pages[i] = p;
                if (!PageUptodate(p))
                        uptodate = 0;
  
                 * see below about how we avoid a nasty race with release page
                 * and why we unlock later
                 */
-               if (i != 0)
-                       unlock_page(p);
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ again:
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto free_eb;
        if (ret == -EEXIST) {
                exists = radix_tree_lookup(&tree->buffer,
                                                start >> PAGE_CACHE_SHIFT);
-               /* add one reference for the caller */
-               atomic_inc(&exists->refs);
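+               /*
+                * refs of zero means the existing eb is being freed but
+                * hasn't been deleted from the radix tree yet, so drop
+                * our locks and retry the insert from scratch.
+                */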
+               if (!atomic_inc_not_zero(&exists->refs)) {
+                       spin_unlock(&tree->buffer_lock);
+                       radix_tree_preload_end();
+                       exists = NULL;
+                       goto again;
+               }
                spin_unlock(&tree->buffer_lock);
                radix_tree_preload_end();
+               mark_extent_buffer_accessed(exists);
                goto free_eb;
        }
        /* add one reference for the tree */
-       atomic_inc(&eb->refs);
+       spin_lock(&eb->refs_lock);
+       check_buffer_tree_ref(eb);
+       spin_unlock(&eb->refs_lock);
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
  
         * after the extent buffer is in the radix tree so
         * it doesn't get lost
         */
-       set_page_extent_mapped(eb->first_page);
-       set_page_extent_head(eb->first_page, eb->len);
-       if (!page0)
-               unlock_page(eb->first_page);
+       SetPageChecked(eb->pages[0]);
+       for (i = 1; i < num_pages; i++) {
+               p = extent_buffer_page(eb, i);
+               ClearPageChecked(p);
+               unlock_page(p);
+       }
+       unlock_page(eb->pages[0]);
        return eb;
  
  free_eb:
-       if (eb->first_page && !page0)
-               unlock_page(eb->first_page);
+       for (i = 0; i < num_pages; i++) {
+               if (eb->pages[i])
+                       unlock_page(eb->pages[i]);
+       }
  
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
@@@ -3776,7 -4210,7 +4210,7 @@@ struct extent_buffer *find_extent_buffe
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();
        return NULL;
  }
  
+ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+ {
+       struct extent_buffer *eb =
+                       container_of(head, struct extent_buffer, rcu_head);
+       __free_extent_buffer(eb);
+ }
+ 
+ /* Expects to have eb->refs_lock already held */
+ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+ {
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       if (atomic_dec_and_test(&eb->refs)) {
+               struct extent_io_tree *tree = eb->tree;
+               spin_unlock(&eb->refs_lock);
+               spin_lock(&tree->buffer_lock);
+               radix_tree_delete(&tree->buffer,
+                                 eb->start >> PAGE_CACHE_SHIFT);
+               spin_unlock(&tree->buffer_lock);
+               /* Should be safe to release our pages at this point */
+               btrfs_release_extent_buffer_page(eb, 0);
+               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+               return;
+       }
+       spin_unlock(&eb->refs_lock);
+ }
+ 
  void free_extent_buffer(struct extent_buffer *eb)
  {
        if (!eb)
                return;
  
-       if (!atomic_dec_and_test(&eb->refs))
+       spin_lock(&eb->refs_lock);
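+       /*
+        * refs == 2 means ours is the only ref besides the tree's.  If
+        * the eb is stale and idle, drop the tree ref now so the release
+        * below frees it.
+        */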
+       if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+           !extent_buffer_under_io(eb) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+       /*
+        * I know this is terrible, but it's temporary until we stop tracking
+        * the uptodate bits and such for the extent buffers.
+        */
+       release_extent_buffer(eb, GFP_ATOMIC);
+ }
+ 
+ void free_extent_buffer_stale(struct extent_buffer *eb)
+ {
+       if (!eb)
                return;
  
-       WARN_ON(1);
+       spin_lock(&eb->refs_lock);
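+       /*
+        * Mark the eb stale and, if ours is the only ref besides the
+        * tree's, drop the tree's ref too so the release below frees it.
+        */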
+       set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+       if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+       release_extent_buffer(eb, GFP_NOFS);
  }
  
- int clear_extent_buffer_dirty(struct extent_io_tree *tree,
-                             struct extent_buffer *eb)
+ void clear_extent_buffer_dirty(struct extent_buffer *eb)
  {
        unsigned long i;
        unsigned long num_pages;
                lock_page(page);
                WARN_ON(!PagePrivate(page));
  
-               set_page_extent_mapped(page);
-               if (i == 0)
-                       set_page_extent_head(page, eb->len);
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
                if (!PageDirty(page)) {
                ClearPageError(page);
                unlock_page(page);
        }
-       return 0;
+       WARN_ON(atomic_read(&eb->refs) == 0);
  }
  
- int set_extent_buffer_dirty(struct extent_io_tree *tree,
-                            struct extent_buffer *eb)
+ int set_extent_buffer_dirty(struct extent_buffer *eb)
  {
        unsigned long i;
        unsigned long num_pages;
        int was_dirty = 0;
  
+       check_buffer_tree_ref(eb);
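+       /*
+        * The tree ref taken above guarantees a dirty eb can't be freed
+        * out from under the page cache.
+        */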
        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
        for (i = 0; i < num_pages; i++)
-               __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+               set_page_dirty(extent_buffer_page(eb, i));
        return was_dirty;
  }
  
- static int __eb_straddles_pages(u64 start, u64 len)
+ static int range_straddles_pages(u64 start, u64 len)
  {
        if (len < PAGE_CACHE_SIZE)
                return 1;
        if (start & (PAGE_CACHE_SIZE - 1))
                return 1;
        if ((start + len) & (PAGE_CACHE_SIZE - 1))
                return 1;
        return 0;
  }
  
- static int eb_straddles_pages(struct extent_buffer *eb)
- {
-       return __eb_straddles_pages(eb->start, eb->len);
- }
- int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb,
-                               struct extent_state **cached_state)
+ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
  {
        unsigned long i;
        struct page *page;
        unsigned long num_pages;
  
-       num_pages = num_extent_pages(eb->start, eb->len);
        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-       clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                             cached_state, GFP_NOFS);
+       num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (page)
        return 0;
  }
  
- int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb)
+ int set_extent_buffer_uptodate(struct extent_buffer *eb)
  {
        unsigned long i;
        struct page *page;
        unsigned long num_pages;
  
+       set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
-       if (eb_straddles_pages(eb)) {
-               set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                                   NULL, GFP_NOFS);
-       }
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-                   ((i == num_pages - 1) &&
-                    ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-                       check_page_uptodate(tree, page);
-                       continue;
-               }
                SetPageUptodate(page);
        }
        return 0;
@@@ -3917,7 -4382,7 +4382,7 @@@ int extent_range_uptodate(struct extent
        int uptodate;
        unsigned long index;
  
-       if (__eb_straddles_pages(start, end - start + 1)) {
+       if (range_straddles_pages(start, end - start + 1)) {
                ret = test_range_bit(tree, start, end,
                                     EXTENT_UPTODATE, 1, NULL);
                if (ret)
        return pg_uptodate;
  }
  
- int extent_buffer_uptodate(struct extent_io_tree *tree,
-                          struct extent_buffer *eb,
-                          struct extent_state *cached_state)
+ int extent_buffer_uptodate(struct extent_buffer *eb)
  {
-       int ret = 0;
-       unsigned long num_pages;
-       unsigned long i;
-       struct page *page;
-       int pg_uptodate = 1;
-       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-               return 1;
-       if (eb_straddles_pages(eb)) {
-               ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                  EXTENT_UPTODATE, 1, cached_state);
-               if (ret)
-                       return ret;
-       }
-       num_pages = num_extent_pages(eb->start, eb->len);
-       for (i = 0; i < num_pages; i++) {
-               page = extent_buffer_page(eb, i);
-               if (!PageUptodate(page)) {
-                       pg_uptodate = 0;
-                       break;
-               }
-       }
-       return pg_uptodate;
+       return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
  }
  
  int read_extent_buffer_pages(struct extent_io_tree *tree,
        int ret = 0;
        int locked_pages = 0;
        int all_uptodate = 1;
-       int inc_all_pages = 0;
        unsigned long num_pages;
+       unsigned long num_reads = 0;
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
  
        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;
  
-       if (eb_straddles_pages(eb)) {
-               if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                  EXTENT_UPTODATE, 1, NULL)) {
-                       return 0;
-               }
-       }
        if (start) {
                WARN_ON(start < eb->start);
                start_i = (start >> PAGE_CACHE_SHIFT) -
                        lock_page(page);
                }
                locked_pages++;
-               if (!PageUptodate(page))
+               if (!PageUptodate(page)) {
+                       num_reads++;
                        all_uptodate = 0;
+               }
        }
        if (all_uptodate) {
                if (start_i == 0)
                goto unlock_exit;
        }
  
+       clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       eb->failed_mirror = 0;
+       atomic_set(&eb->io_pages, num_reads);
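+       /*
+        * io_pages now counts the pages that still need reading; the read
+        * completion path uses it to decide when the whole eb is uptodate,
+        * which is why the explicit set_bit at the end of this function
+        * was dropped.
+        */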
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               WARN_ON(!PagePrivate(page));
-               set_page_extent_mapped(page);
-               if (i == 0)
-                       set_page_extent_head(page, eb->len);
-               if (inc_all_pages)
-                       page_cache_get(page);
                if (!PageUptodate(page)) {
-                       if (start_i == 0)
-                               inc_all_pages = 1;
                        ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio,
                }
        }
  
-       if (bio)
-               submit_one_bio(READ, bio, mirror_num, bio_flags);
+       if (bio) {
+               err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+               if (err)
+                       return err;
+       }
  
        if (ret || wait != WAIT_COMPLETE)
                return ret;
                        ret = -EIO;
        }
  
-       if (!ret)
-               set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        return ret;
  
  unlock_exit:
@@@ -4304,15 -4731,20 +4731,20 @@@ static void copy_pages(struct page *dst
  {
        char *dst_kaddr = page_address(dst_page);
        char *src_kaddr;
+       int must_memmove = 0;
  
        if (dst_page != src_page) {
                src_kaddr = page_address(src_page);
        } else {
                src_kaddr = dst_kaddr;
-               BUG_ON(areas_overlap(src_off, dst_off, len));
+               if (areas_overlap(src_off, dst_off, len))
+                       must_memmove = 1;
        }
  
-       memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
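+       /*
+        * Within one page the source and destination ranges can overlap,
+        * so use memmove there instead of BUGing out like the old code.
+        */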
+       if (must_memmove)
+               memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       else
+               memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
  }
  
  void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@@ -4382,7 -4814,7 +4814,7 @@@ void memmove_extent_buffer(struct exten
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
-       if (!areas_overlap(src_offset, dst_offset, len)) {
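+       /*
+        * A forward copy is safe whenever the destination starts before
+        * the source, even if the two ranges overlap.
+        */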
+       if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
        }
  }
  
- static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+ int try_release_extent_buffer(struct page *page, gfp_t mask)
  {
-       struct extent_buffer *eb =
-                       container_of(head, struct extent_buffer, rcu_head);
-       btrfs_release_extent_buffer(eb);
- }
- int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
- {
-       u64 start = page_offset(page);
        struct extent_buffer *eb;
-       int ret = 1;
  
-       spin_lock(&tree->buffer_lock);
-       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-       if (!eb) {
-               spin_unlock(&tree->buffer_lock);
-               return ret;
+       /*
+        * We need to make sure nobody is attaching this page to an eb right
+        * now.
+        */
+       spin_lock(&page->mapping->private_lock);
+       if (!PagePrivate(page)) {
+               spin_unlock(&page->mapping->private_lock);
+               return 1;
        }
  
-       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-               ret = 0;
-               goto out;
-       }
+       eb = (struct extent_buffer *)page->private;
+       BUG_ON(!eb);
  
        /*
-        * set @eb->refs to 0 if it is already 1, and then release the @eb.
-        * Or go back.
+        * This is a little awful but should be ok; we need to make sure that
+        * the eb doesn't disappear out from under us while we're looking at
+        * this page.
         */
-       if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
-               ret = 0;
-               goto out;
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+               spin_unlock(&eb->refs_lock);
+               spin_unlock(&page->mapping->private_lock);
+               return 0;
        }
+       spin_unlock(&page->mapping->private_lock);
  
-       radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- out:
-       spin_unlock(&tree->buffer_lock);
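+       /*
+        * If the caller's mask allows blocking and fs IO, normalize it
+        * to plain GFP_NOFS for the rest of the release.
+        */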
+       if ((mask & GFP_NOFS) == GFP_NOFS)
+               mask = GFP_NOFS;
  
-       /* at this point we can safely release the extent buffer */
-       if (atomic_read(&eb->refs) == 0)
-               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
-       return ret;
+       /*
+        * If tree ref isn't set then we know the ref on this eb is a real ref,
+        * so just return, this page will likely be freed soon anyway.
+        */
+       if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               spin_unlock(&eb->refs_lock);
+               return 0;
+       }
+       release_extent_buffer(eb, mask);
+       return 1;
  }
diff --combined fs/btrfs/file-item.c
  #include "transaction.h"
  #include "print-tree.h"
  
- #define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+ #define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
                                   sizeof(struct btrfs_item) * 2) / \
                                  size) - 1))
  
+ #define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
  #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
                                   sizeof(struct btrfs_ordered_sum)) / \
                                   sizeof(struct btrfs_sector_sum) * \
@@@ -59,7 -61,7 +61,7 @@@ int btrfs_insert_file_extent(struct btr
                                      sizeof(*item));
        if (ret < 0)
                goto out;
-       BUG_ON(ret);
+       BUG_ON(ret); /* Can't happen */
        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0],
                              struct btrfs_file_extent_item);
@@@ -284,6 -286,7 +286,7 @@@ int btrfs_lookup_csums_range(struct btr
        struct btrfs_ordered_sum *sums;
        struct btrfs_sector_sum *sector_sum;
        struct btrfs_csum_item *item;
+       LIST_HEAD(tmplist);
        unsigned long offset;
        int ret;
        size_t size;
                                        MAX_ORDERED_SUM_BYTES(root));
                        sums = kzalloc(btrfs_ordered_sum_size(root, size),
                                        GFP_NOFS);
-                       BUG_ON(!sums);
+                       if (!sums) {
+                               ret = -ENOMEM;
+                               goto fail;
+                       }
  
                        sector_sum = sums->sums;
                        sums->bytenr = start;
                                offset += csum_size;
                                sector_sum++;
                        }
-                       list_add_tail(&sums->list, list);
+                       list_add_tail(&sums->list, &tmplist);
                }
                path->slots[0]++;
        }
        ret = 0;
  fail:
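+       /*
+        * On error, free any sums built so far; on success, hand the
+        * whole batch to the caller via the splice below.
+        */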
+       while (ret < 0 && !list_empty(&tmplist)) {
+               sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
+               list_del(&sums->list);
+               kfree(sums);
+       }
+       list_splice_tail(&tmplist, list);
        btrfs_free_path(path);
        return ret;
  }
@@@ -420,7 -433,7 +433,7 @@@ int btrfs_csum_one_bio(struct btrfs_roo
                offset = page_offset(bvec->bv_page) + bvec->bv_offset;
  
        ordered = btrfs_lookup_ordered_extent(inode, offset);
-       BUG_ON(!ordered);
+       BUG_ON(!ordered); /* Logic error */
        sums->bytenr = ordered->start;
  
        while (bio_index < bio->bi_vcnt) {
  
                        sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
                                       GFP_NOFS);
-                       BUG_ON(!sums);
+                       BUG_ON(!sums); /* -ENOMEM */
                        sector_sum = sums->sums;
                        sums->len = bytes_left;
                        ordered = btrfs_lookup_ordered_extent(inode, offset);
-                       BUG_ON(!ordered);
+                       BUG_ON(!ordered); /* Logic error */
                        sums->bytenr = ordered->start;
                }
  
 -              data = kmap_atomic(bvec->bv_page, KM_USER0);
 +              data = kmap_atomic(bvec->bv_page);
                sector_sum->sum = ~(u32)0;
                sector_sum->sum = btrfs_csum_data(root,
                                                  data + bvec->bv_offset,
                                                  sector_sum->sum,
                                                  bvec->bv_len);
 -              kunmap_atomic(data, KM_USER0);
 +              kunmap_atomic(data);
                btrfs_csum_final(sector_sum->sum,
                                 (char *)&sector_sum->sum);
                sector_sum->bytenr = disk_bytenr;
   * This calls btrfs_truncate_item with the correct args based on the
   * overlap, and fixes up the key as required.
   */
- static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
-                                     struct btrfs_root *root,
-                                     struct btrfs_path *path,
-                                     struct btrfs_key *key,
-                                     u64 bytenr, u64 len)
+ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct btrfs_path *path,
+                                      struct btrfs_key *key,
+                                      u64 bytenr, u64 len)
  {
        struct extent_buffer *leaf;
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
        u64 csum_end;
        u64 end_byte = bytenr + len;
        u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
-       int ret;
  
        leaf = path->nodes[0];
        csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
                 */
                u32 new_size = (bytenr - key->offset) >> blocksize_bits;
                new_size *= csum_size;
-               ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+               btrfs_truncate_item(trans, root, path, new_size, 1);
        } else if (key->offset >= bytenr && csum_end > end_byte &&
                   end_byte > key->offset) {
                /*
                u32 new_size = (csum_end - end_byte) >> blocksize_bits;
                new_size *= csum_size;
  
-               ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+               btrfs_truncate_item(trans, root, path, new_size, 0);
  
                key->offset = end_byte;
-               ret = btrfs_set_item_key_safe(trans, root, path, key);
-               BUG_ON(ret);
+               btrfs_set_item_key_safe(trans, root, path, key);
        } else {
                BUG();
        }
-       return 0;
  }
  
  /*
@@@ -635,13 -645,14 +645,14 @@@ int btrfs_del_csums(struct btrfs_trans_
                         * item changed size or key
                         */
                        ret = btrfs_split_item(trans, root, path, &key, offset);
-                       BUG_ON(ret && ret != -EAGAIN);
+                       if (ret && ret != -EAGAIN) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               goto out;
+                       }
  
                        key.offset = end_byte - 1;
                } else {
-                       ret = truncate_one_csum(trans, root, path,
-                                               &key, bytenr, len);
-                       BUG_ON(ret);
+                       truncate_one_csum(trans, root, path, &key, bytenr, len);
                        if (key.offset < bytenr)
                                break;
                }
@@@ -772,7 -783,7 +783,7 @@@ again
                if (diff != csum_size)
                        goto insert;
  
-               ret = btrfs_extend_item(trans, root, path, diff);
+               btrfs_extend_item(trans, root, path, diff);
                goto csum;
        }
  
@@@ -230,11 -230,13 +230,13 @@@ int btrfs_truncate_free_space_cache(str
  
        if (ret) {
                trans->block_rsv = rsv;
-               WARN_ON(1);
+               btrfs_abort_transaction(trans, root, ret);
                return ret;
        }
  
        ret = btrfs_update_inode(trans, root, inode);
+       if (ret)
+               btrfs_abort_transaction(trans, root, ret);
        trans->block_rsv = rsv;
  
        return ret;
@@@ -869,7 -871,7 +871,7 @@@ int __btrfs_write_out_cache(struct btrf
        io_ctl_prepare_pages(&io_ctl, inode, 0);
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
-                        0, &cached_state, GFP_NOFS);
+                        0, &cached_state);
  
        node = rb_first(&ctl->free_space_offset);
        if (!node && cluster) {
@@@ -1068,7 -1070,7 +1070,7 @@@ int btrfs_write_out_cache(struct btrfs_
                spin_unlock(&block_group->lock);
                ret = 0;
  #ifdef DEBUG
 -              printk(KERN_ERR "btrfs: failed to write free space cace "
 +              printk(KERN_ERR "btrfs: failed to write free space cache "
                       "for block group %llu\n", block_group->key.objectid);
  #endif
        }
@@@ -1948,14 -1950,14 +1950,14 @@@ again
                 */
                ret = btrfs_add_free_space(block_group, old_start,
                                           offset - old_start);
-               WARN_ON(ret);
+               WARN_ON(ret); /* -ENOMEM */
                goto out;
        }
  
        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
        if (ret == -EAGAIN)
                goto again;
-       BUG_ON(ret);
+       BUG_ON(ret); /* Logic error */
  out_lock:
        spin_unlock(&ctl->tree_lock);
  out:
@@@ -2346,7 -2348,7 +2348,7 @@@ again
        rb_erase(&entry->offset_index, &ctl->free_space_offset);
        ret = tree_insert_offset(&cluster->root, entry->offset,
                                 &entry->offset_index, 1);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -EEXIST; Logic error */
  
        trace_btrfs_setup_cluster(block_group, cluster,
                                  total_found * block_group->sectorsize, 1);
@@@ -2439,7 -2441,7 +2441,7 @@@ setup_cluster_no_bitmap(struct btrfs_bl
                ret = tree_insert_offset(&cluster->root, entry->offset,
                                         &entry->offset_index, 0);
                total_size += entry->bytes;
-               BUG_ON(ret);
+               BUG_ON(ret); /* -EEXIST; Logic error */
        } while (node && entry != last);
  
        cluster->max_size = max_extent;
@@@ -2830,6 -2832,7 +2832,7 @@@ u64 btrfs_find_ino_for_alloc(struct btr
                int ret;
  
                ret = search_bitmap(ctl, entry, &offset, &count);
+               /* Logic error; should be empty if it can't find anything */
                BUG_ON(ret);
  
                ino = offset;
diff --combined fs/btrfs/inode.c
@@@ -150,7 -150,6 +150,6 @@@ static noinline int insert_inline_exten
        inode_add_bytes(inode, size);
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      datasize);
-       BUG_ON(ret);
        if (ret) {
                err = ret;
                goto fail;
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);
  
 -                      kaddr = kmap_atomic(cpage, KM_USER0);
 +                      kaddr = kmap_atomic(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
 -                      kunmap_atomic(kaddr, KM_USER0);
 +                      kunmap_atomic(kaddr);
  
                        i++;
                        ptr += cur_size;
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
                btrfs_set_file_extent_compression(leaf, ei, 0);
 -              kaddr = kmap_atomic(page, KM_USER0);
 +              kaddr = kmap_atomic(page);
                offset = start & (PAGE_CACHE_SIZE - 1);
                write_extent_buffer(leaf, kaddr + offset, ptr, size);
 -              kunmap_atomic(kaddr, KM_USER0);
 +              kunmap_atomic(kaddr);
                page_cache_release(page);
        }
        btrfs_mark_buffer_dirty(leaf);
         * could end up racing with unlink.
         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
-       btrfs_update_inode(trans, root, inode);
+       ret = btrfs_update_inode(trans, root, inode);
  
-       return 0;
+       return ret;
  fail:
        btrfs_free_path(path);
        return err;
@@@ -250,14 -249,18 +249,18 @@@ static noinline int cow_file_range_inli
  
        ret = btrfs_drop_extents(trans, inode, start, aligned_end,
                                 &hint_byte, 1);
-       BUG_ON(ret);
+       if (ret)
+               return ret;
  
        if (isize > actual_end)
                inline_len = min_t(u64, isize, actual_end);
        ret = insert_inline_extent(trans, root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
+       }
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
@@@ -293,7 -296,7 +296,7 @@@ static noinline int add_async_extent(st
        struct async_extent *async_extent;
  
        async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
-       BUG_ON(!async_extent);
+       BUG_ON(!async_extent); /* -ENOMEM */
        async_extent->start = start;
        async_extent->ram_size = ram_size;
        async_extent->compressed_size = compressed_size;
@@@ -344,8 -347,9 +347,9 @@@ static noinline int compress_file_range
        int will_compress;
        int compress_type = root->fs_info->compress_type;
  
-       /* if this is a small write inside eof, kick off a defragbot */
-       if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+       /* if this is a small write inside eof, kick off a defrag */
+       if ((end - start + 1) < 16 * 1024 &&
+           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);
  
        actual_end = min_t(u64, isize, end + 1);
@@@ -422,10 -426,10 +426,10 @@@ again
                         * sending it down to disk
                         */
                        if (offset) {
 -                              kaddr = kmap_atomic(page, KM_USER0);
 +                              kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_CACHE_SIZE - offset);
 -                              kunmap_atomic(kaddr, KM_USER0);
 +                              kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
  cont:
        if (start == 0) {
                trans = btrfs_join_transaction(root);
-               BUG_ON(IS_ERR(trans));
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       trans = NULL;
+                       goto cleanup_and_out;
+               }
                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
                /* lets try to make an inline extent */
                                                    total_compressed,
                                                    compress_type, pages);
                }
-               if (ret == 0) {
+               if (ret <= 0) {
                        /*
-                        * inline extent creation worked, we don't need
-                        * to create any more async work items.  Unlock
-                        * and free up our temp pages.
+                        * inline extent creation worked or returned error,
+                        * we don't need to create any more async work items.
+                        * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode,
                             &BTRFS_I(inode)->io_tree,
@@@ -547,7 -555,7 +555,7 @@@ cleanup_and_bail_uncompressed
        }
  
  out:
-       return 0;
+       return ret;
  
  free_pages_out:
        for (i = 0; i < nr_pages_ret; i++) {
        kfree(pages);
  
        goto out;
+ cleanup_and_out:
+       extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                    start, end, NULL,
+                                    EXTENT_CLEAR_UNLOCK_PAGE |
+                                    EXTENT_CLEAR_DIRTY |
+                                    EXTENT_CLEAR_DELALLOC |
+                                    EXTENT_SET_WRITEBACK |
+                                    EXTENT_END_WRITEBACK);
+       if (!trans || IS_ERR(trans))
+               btrfs_error(root->fs_info, ret, "Failed to join transaction");
+       else
+               btrfs_abort_transaction(trans, root, ret);
+       goto free_pages_out;
  }
  
  /*
@@@ -597,7 -619,7 +619,7 @@@ retry
  
                        lock_extent(io_tree, async_extent->start,
                                         async_extent->start +
-                                        async_extent->ram_size - 1, GFP_NOFS);
+                                        async_extent->ram_size - 1);
  
                        /* allocate blocks */
                        ret = cow_file_range(inode, async_cow->locked_page,
                                             async_extent->ram_size - 1,
                                             &page_started, &nr_written, 0);
  
+                       /* JDM XXX */
                        /*
                         * if page_started, cow_file_range inserted an
                         * inline extent and took care of all the unlocking
                }
  
                lock_extent(io_tree, async_extent->start,
-                           async_extent->start + async_extent->ram_size - 1,
-                           GFP_NOFS);
+                           async_extent->start + async_extent->ram_size - 1);
  
                trans = btrfs_join_transaction(root);
-               BUG_ON(IS_ERR(trans));
-               trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-               ret = btrfs_reserve_extent(trans, root,
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+               } else {
+                       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+                       ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
-                                          0, alloc_hint,
-                                          (u64)-1, &ins, 1);
-               btrfs_end_transaction(trans, root);
+                                          0, alloc_hint, &ins, 1);
+                       if (ret)
+                               btrfs_abort_transaction(trans, root, ret);
+                       btrfs_end_transaction(trans, root);
+               }
  
                if (ret) {
                        int i;
                        async_extent->pages = NULL;
                        unlock_extent(io_tree, async_extent->start,
                                      async_extent->start +
-                                     async_extent->ram_size - 1, GFP_NOFS);
-                       goto retry;
+                                     async_extent->ram_size - 1);
+                       if (ret == -ENOSPC)
+                               goto retry;
+                       goto out_free; /* JDM: Requeue? */
                }
  
                /*
                                        async_extent->ram_size - 1, 0);
  
                em = alloc_extent_map();
-               BUG_ON(!em);
+               BUG_ON(!em); /* -ENOMEM */
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
  
                /*
                 * clear dirty, set writeback and unlock the pages.
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
  
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
-       return 0;
+       ret = 0;
+ out:
+       return ret;
+ out_free:
+       kfree(async_extent);
+       goto out;
  }
  
  static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@@ -791,7 -824,18 +824,18 @@@ static noinline int cow_file_range(stru
  
        BUG_ON(btrfs_is_free_space_inode(root, inode));
        trans = btrfs_join_transaction(root);
-       BUG_ON(IS_ERR(trans));
+       if (IS_ERR(trans)) {
+               extent_clear_unlock_delalloc(inode,
+                            &BTRFS_I(inode)->io_tree,
+                            start, end, NULL,
+                            EXTENT_CLEAR_UNLOCK_PAGE |
+                            EXTENT_CLEAR_UNLOCK |
+                            EXTENT_CLEAR_DELALLOC |
+                            EXTENT_CLEAR_DIRTY |
+                            EXTENT_SET_WRITEBACK |
+                            EXTENT_END_WRITEBACK);
+               return PTR_ERR(trans);
+       }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        ret = 0;
  
        /* if this is a small write inside eof, kick off defrag */
-       if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+       if (num_bytes < 64 * 1024 &&
+           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(trans, inode);
  
        if (start == 0) {
                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
-                       ret = 0;
                        goto out;
+               } else if (ret < 0) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_unlock;
                }
        }
  
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
-                                          (u64)-1, &ins, 1);
-               BUG_ON(ret);
+                                          &ins, 1);
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_unlock;
+               }
  
                em = alloc_extent_map();
-               BUG_ON(!em);
+               BUG_ON(!em); /* -ENOMEM */
                em->start = start;
                em->orig_start = em->start;
                ram_size = ins.offset;
                cur_alloc_size = ins.offset;
                ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
                                               ram_size, cur_alloc_size, 0);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
  
                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, start,
                                                      cur_alloc_size);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               goto out_unlock;
+                       }
                }
  
                if (disk_num_bytes < cur_alloc_size)
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
- out:
        ret = 0;
+ out:
        btrfs_end_transaction(trans, root);
  
        return ret;
+ out_unlock:
+       extent_clear_unlock_delalloc(inode,
+                    &BTRFS_I(inode)->io_tree,
+                    start, end, NULL,
+                    EXTENT_CLEAR_UNLOCK_PAGE |
+                    EXTENT_CLEAR_UNLOCK |
+                    EXTENT_CLEAR_DELALLOC |
+                    EXTENT_CLEAR_DIRTY |
+                    EXTENT_SET_WRITEBACK |
+                    EXTENT_END_WRITEBACK);
+       goto out;
  }
  
  /*
@@@ -969,7 -1034,7 +1034,7 @@@ static int cow_file_range_async(struct 
                         1, 0, NULL, GFP_NOFS);
        while (start < end) {
                async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
-               BUG_ON(!async_cow);
+               BUG_ON(!async_cow); /* -ENOMEM */
                async_cow->inode = inode;
                async_cow->root = root;
                async_cow->locked_page = locked_page;
@@@ -1060,7 -1125,7 +1125,7 @@@ static noinline int run_delalloc_nocow(
        u64 disk_bytenr;
        u64 num_bytes;
        int extent_type;
-       int ret;
+       int ret, err;
        int type;
        int nocow;
        int check_prev = 1;
        else
                trans = btrfs_join_transaction(root);
  
-       BUG_ON(IS_ERR(trans));
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        cow_start = (u64)-1;
        while (1) {
                ret = btrfs_lookup_file_extent(trans, root, path, ino,
                                               cur_offset, 0);
-               BUG_ON(ret < 0);
+               if (ret < 0) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto error;
+               }
                if (ret > 0 && path->slots[0] > 0 && check_prev) {
                        leaf = path->nodes[0];
                        btrfs_item_key_to_cpu(leaf, &found_key,
@@@ -1100,8 -1172,10 +1172,10 @@@ next_slot
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
-                       if (ret < 0)
-                               BUG_ON(1);
+                       if (ret < 0) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               goto error;
+                       }
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
@@@ -1189,7 -1263,10 +1263,10 @@@ out_check
                        ret = cow_file_range(inode, locked_page, cow_start,
                                        found_key.offset - 1, page_started,
                                        nr_written, 1);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               goto error;
+                       }
                        cow_start = (u64)-1;
                }
  
                        struct extent_map_tree *em_tree;
                        em_tree = &BTRFS_I(inode)->extent_tree;
                        em = alloc_extent_map();
-                       BUG_ON(!em);
+                       BUG_ON(!em); /* -ENOMEM */
                        em->start = cur_offset;
                        em->orig_start = em->start;
                        em->len = num_bytes;
  
                ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
                                               num_bytes, num_bytes, type);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM */
  
                if (root->root_key.objectid ==
                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
                                                      num_bytes);
-                       BUG_ON(ret);
+                       if (ret) {
+                               btrfs_abort_transaction(trans, root, ret);
+                               goto error;
+                       }
                }
  
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
        if (cow_start != (u64)-1) {
                ret = cow_file_range(inode, locked_page, cow_start, end,
                                     page_started, nr_written, 1);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto error;
+               }
        }
  
+ error:
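+       /*
+        * End the transaction on both paths, but don't let a later
+        * error clobber the first failure code.
+        */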
        if (nolock) {
-               ret = btrfs_end_transaction_nolock(trans, root);
-               BUG_ON(ret);
+               err = btrfs_end_transaction_nolock(trans, root);
        } else {
-               ret = btrfs_end_transaction(trans, root);
-               BUG_ON(ret);
+               err = btrfs_end_transaction(trans, root);
        }
+       if (!ret)
+               ret = err;
        btrfs_free_path(path);
-       return 0;
+       return ret;
  }
  
  /*
@@@ -1425,10 -1510,11 +1510,11 @@@ int btrfs_merge_bio_hook(struct page *p
        map_length = length;
        ret = btrfs_map_block(map_tree, READ, logical,
                              &map_length, NULL, 0);
+       /* Will always return 0 or 1 with map_multi == NULL */
+       BUG_ON(ret < 0);
        if (map_length < length + size)
                return 1;
-       return ret;
+       return 0;
  }
  
  /*
@@@ -1448,7 -1534,7 +1534,7 @@@ static int __btrfs_submit_bio_start(str
        int ret = 0;
  
        ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
        return 0;
  }
  
@@@ -1479,14 -1565,16 +1565,16 @@@ static int btrfs_submit_bio_hook(struc
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        int skip_sum;
+       int metadata = 0;
  
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
  
        if (btrfs_is_free_space_inode(root, inode))
-               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
-       else
-               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-       BUG_ON(ret);
+               metadata = 2;
+       ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+       if (ret)
+               return ret;
  
        if (!(rw & REQ_WRITE)) {
                if (bio_flags & EXTENT_BIO_COMPRESSED) {
@@@ -1571,7 -1659,7 +1659,7 @@@ again
        page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
-                        &cached_state, GFP_NOFS);
+                        &cached_state);
  
        /* already ordered? We're done */
        if (PagePrivate2(page))
@@@ -1675,13 -1763,15 +1763,15 @@@ static int insert_reserved_file_extent(
         */
        ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
                                 &hint, 0);
-       BUG_ON(ret);
+       if (ret)
+               goto out;
  
        ins.objectid = btrfs_ino(inode);
        ins.offset = file_pos;
        ins.type = BTRFS_EXTENT_DATA_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
-       BUG_ON(ret);
+       if (ret)
+               goto out;
        leaf = path->nodes[0];
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
                                        btrfs_ino(inode), file_pos, &ins);
-       BUG_ON(ret);
+ out:
        btrfs_free_path(path);
  
-       return 0;
+       return ret;
  }
  
  /*
@@@ -1740,35 -1830,41 +1830,41 @@@ static int btrfs_finish_ordered_io(stru
                                             end - start + 1);
        if (!ret)
                return 0;
-       BUG_ON(!ordered_extent);
+       BUG_ON(!ordered_extent); /* Logic error */
  
        nolock = btrfs_is_free_space_inode(root, inode);
  
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
-               BUG_ON(!list_empty(&ordered_extent->list));
+               BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
                        if (nolock)
                                trans = btrfs_join_transaction_nolock(root);
                        else
                                trans = btrfs_join_transaction(root);
-                       BUG_ON(IS_ERR(trans));
+                       if (IS_ERR(trans))
+                               return PTR_ERR(trans);
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode_fallback(trans, root, inode);
-                       BUG_ON(ret);
+                       if (ret) /* -ENOMEM or corruption */
+                               btrfs_abort_transaction(trans, root, ret);
                }
                goto out;
        }
  
        lock_extent_bits(io_tree, ordered_extent->file_offset,
                         ordered_extent->file_offset + ordered_extent->len - 1,
-                        0, &cached_state, GFP_NOFS);
+                        0, &cached_state);
  
        if (nolock)
                trans = btrfs_join_transaction_nolock(root);
        else
                trans = btrfs_join_transaction(root);
-       BUG_ON(IS_ERR(trans));
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               trans = NULL;
+               goto out_unlock;
+       }
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
  
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
                                                ordered_extent->len);
-               BUG_ON(ret);
        } else {
                BUG_ON(root == root->fs_info->tree_root);
                ret = insert_reserved_file_extent(trans, inode,
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
                                   ordered_extent->file_offset,
                                   ordered_extent->len);
-               BUG_ON(ret);
        }
        unlock_extent_cached(io_tree, ordered_extent->file_offset,
                             ordered_extent->file_offset +
                             ordered_extent->len - 1, &cached_state, GFP_NOFS);
+       if (ret < 0) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
  
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
        ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
        if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                ret = btrfs_update_inode_fallback(trans, root, inode);
-               BUG_ON(ret);
+               if (ret) { /* -ENOMEM or corruption */
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
        }
        ret = 0;
  out:
        btrfs_put_ordered_extent(ordered_extent);
  
        return 0;
+ out_unlock:
+       unlock_extent_cached(io_tree, ordered_extent->file_offset,
+                            ordered_extent->file_offset +
+                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
+       goto out;
  }
  
  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
@@@ -1873,7 -1979,7 +1979,7 @@@ static int btrfs_readpage_end_io_hook(s
        } else {
                ret = get_state_private(io_tree, start, &private);
        }
 -      kaddr = kmap_atomic(page, KM_USER0);
 +      kaddr = kmap_atomic(page);
        if (ret)
                goto zeroit;
  
        if (csum != private)
                goto zeroit;
  
 -      kunmap_atomic(kaddr, KM_USER0);
 +      kunmap_atomic(kaddr);
  good:
        return 0;
  
@@@ -1894,7 -2000,7 +2000,7 @@@ zeroit
                       (unsigned long long)private);
        memset(kaddr + offset, 1, end - start + 1);
        flush_dcache_page(page);
 -      kunmap_atomic(kaddr, KM_USER0);
 +      kunmap_atomic(kaddr);
        if (private == 0)
                return 0;
        return -EIO;
@@@ -1905,6 -2011,8 +2011,8 @@@ struct delayed_iput 
        struct inode *inode;
  };
  
+ /* JDM: If this is fs-wide, why can't we add a pointer to
+  * btrfs_inode instead and avoid the allocation? */
  void btrfs_add_delayed_iput(struct inode *inode)
  {
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@@ -2051,20 -2159,27 +2159,27 @@@ int btrfs_orphan_add(struct btrfs_trans
        /* grab metadata reservation from transaction handle */
        if (reserve) {
                ret = btrfs_orphan_reserve_metadata(trans, inode);
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
        }
  
        /* insert an orphan item to track this unlinked/truncated file */
        if (insert >= 1) {
                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-               BUG_ON(ret && ret != -EEXIST);
+               if (ret && ret != -EEXIST) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       return ret;
+               }
+               ret = 0;
        }
  
        /* insert an orphan item to track subvolume contains orphan files */
        if (insert >= 2) {
                ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
                                               root->root_key.objectid);
-               BUG_ON(ret);
+               if (ret && ret != -EEXIST) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       return ret;
+               }
        }
        return 0;
  }
@@@ -2094,7 -2209,7 +2209,7 @@@ int btrfs_orphan_del(struct btrfs_trans
  
        if (trans && delete_item) {
                ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-               BUG_ON(ret);
+               BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
        }
  
        if (release_rsv)
@@@ -2228,7 -2343,7 +2343,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
                        }
                        ret = btrfs_del_orphan_item(trans, root,
                                                    found_key.objectid);
-                       BUG_ON(ret);
+                       BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
                        btrfs_end_transaction(trans, root);
                        continue;
                }
@@@ -2610,16 -2725,22 +2725,22 @@@ static int __btrfs_unlink_inode(struct 
                printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
                       "inode %llu parent %llu\n", name_len, name,
                       (unsigned long long)ino, (unsigned long long)dir_ino);
+               btrfs_abort_transaction(trans, root, ret);
                goto err;
        }
  
        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
-       if (ret)
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
                goto err;
+       }
  
        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
                                         inode, dir_ino);
-       BUG_ON(ret != 0 && ret != -ENOENT);
+       if (ret != 0 && ret != -ENOENT) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto err;
+       }
  
        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
                                           dir, index);
@@@ -2777,7 -2898,7 +2898,7 @@@ static struct btrfs_trans_handle *__unl
                        err = ret;
                        goto out;
                }
-               BUG_ON(ret == 0);
+               BUG_ON(ret == 0); /* Corruption */
                if (check_path_shared(root, path))
                        goto out;
                btrfs_release_path(path);
                err = PTR_ERR(ref);
                goto out;
        }
-       BUG_ON(!ref);
+       BUG_ON(!ref); /* Logic error */
        if (check_path_shared(root, path))
                goto out;
        index = btrfs_inode_ref_index(path->nodes[0], ref);
@@@ -2917,23 -3038,42 +3038,42 @@@ int btrfs_unlink_subvol(struct btrfs_tr
  
        di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
                                   name, name_len, -1);
-       BUG_ON(IS_ERR_OR_NULL(di));
+       if (IS_ERR_OR_NULL(di)) {
+               if (!di)
+                       ret = -ENOENT;
+               else
+                       ret = PTR_ERR(di);
+               goto out;
+       }
  
        leaf = path->nodes[0];
        btrfs_dir_item_key_to_cpu(leaf, di, &key);
        WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
        ret = btrfs_delete_one_dir_name(trans, root, path, di);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
        btrfs_release_path(path);
  
        ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
                                 objectid, root->root_key.objectid,
                                 dir_ino, &index, name, name_len);
        if (ret < 0) {
-               BUG_ON(ret != -ENOENT);
+               if (ret != -ENOENT) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
                di = btrfs_search_dir_index_item(root, path, dir_ino,
                                                 name, name_len);
-               BUG_ON(IS_ERR_OR_NULL(di));
+               if (IS_ERR_OR_NULL(di)) {
+                       if (!di)
+                               ret = -ENOENT;
+                       else
+                               ret = PTR_ERR(di);
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out;
+               }
  
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        btrfs_release_path(path);
  
        ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out;
+       }
  
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, dir);
-       BUG_ON(ret);
+       if (ret)
+               btrfs_abort_transaction(trans, root, ret);
+ out:
        btrfs_free_path(path);
-       return 0;
+       return ret;
  }
  
  static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@@ -3161,8 -3305,8 +3305,8 @@@ search_again
                                }
                                size =
                                    btrfs_file_extent_calc_inline_size(size);
-                               ret = btrfs_truncate_item(trans, root, path,
-                                                         size, 1);
+                               btrfs_truncate_item(trans, root, path,
+                                                   size, 1);
                        } else if (root->ref_cows) {
                                inode_sub_bytes(inode, item_end + 1 -
                                                found_key.offset);
@@@ -3210,7 -3354,11 +3354,11 @@@ delete
                                ret = btrfs_del_items(trans, root, path,
                                                pending_del_slot,
                                                pending_del_nr);
-                               BUG_ON(ret);
+                               if (ret) {
+                                       btrfs_abort_transaction(trans,
+                                                               root, ret);
+                                       goto error;
+                               }
                                pending_del_nr = 0;
                        }
                        btrfs_release_path(path);
        if (pending_del_nr) {
                ret = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
-               BUG_ON(ret);
+               if (ret)
+                       btrfs_abort_transaction(trans, root, ret);
        }
+ error:
        btrfs_free_path(path);
        return err;
  }
@@@ -3282,8 -3432,7 +3432,7 @@@ again
        }
        wait_on_page_writeback(page);
  
-       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
-                        GFP_NOFS);
+       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
        set_page_extent_mapped(page);
  
        ordered = btrfs_lookup_ordered_extent(inode, page_start);
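
Another mechanical change visible here and in the hunks below: lock_extent_bits() and unlock_extent() dropped their trailing gfp_t argument (callers invariably passed GFP_NOFS). A hedged sketch of the new usage, assuming io_tree, start and end are in scope; unlock_extent_cached() keeps its mask, as the surrounding hunks show:

        struct extent_state *cached_state = NULL;

        /* no GFP_NOFS argument anymore */
        lock_extent_bits(io_tree, start, end, 0, &cached_state);
        /* ... operate on the locked range ... */
        unlock_extent_cached(io_tree, start, end, &cached_state, GFP_NOFS);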
@@@ -3359,7 -3508,7 +3508,7 @@@ int btrfs_cont_expand(struct inode *ino
                btrfs_wait_ordered_range(inode, hole_start,
                                         block_end - hole_start);
                lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
-                                &cached_state, GFP_NOFS);
+                                &cached_state);
                ordered = btrfs_lookup_ordered_extent(inode, hole_start);
                if (!ordered)
                        break;
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                block_end - cur_offset, 0);
-               BUG_ON(IS_ERR_OR_NULL(em));
+               if (IS_ERR(em)) {
+                       err = PTR_ERR(em);
+                       break;
+               }
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask;
                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
                                                 cur_offset + hole_size,
                                                 &hint_byte, 1);
                        if (err) {
-                               btrfs_update_inode(trans, root, inode);
+                               btrfs_abort_transaction(trans, root, err);
                                btrfs_end_transaction(trans, root);
                                break;
                        }
                                        0, hole_size, 0, hole_size,
                                        0, 0, 0);
                        if (err) {
-                               btrfs_update_inode(trans, root, inode);
+                               btrfs_abort_transaction(trans, root, err);
                                btrfs_end_transaction(trans, root);
                                break;
                        }
@@@ -3779,7 -3931,7 +3931,7 @@@ static void inode_tree_del(struct inod
        }
  }
  
- int btrfs_invalidate_inodes(struct btrfs_root *root)
+ void btrfs_invalidate_inodes(struct btrfs_root *root)
  {
        struct rb_node *node;
        struct rb_node *prev;
@@@ -3839,7 -3991,6 +3991,6 @@@ again
                node = rb_next(node);
        }
        spin_unlock(&root->inode_lock);
-       return 0;
  }
  
  static int btrfs_init_locked_inode(struct inode *inode, void *p)
@@@ -4581,18 -4732,26 +4732,26 @@@ int btrfs_add_link(struct btrfs_trans_h
                                             parent_ino, index);
        }
  
-       if (ret == 0) {
-               ret = btrfs_insert_dir_item(trans, root, name, name_len,
-                                           parent_inode, &key,
-                                           btrfs_inode_type(inode), index);
-               if (ret)
-                       goto fail_dir_item;
+       /* Nothing to clean up yet */
+       if (ret)
+               return ret;
  
-               btrfs_i_size_write(parent_inode, parent_inode->i_size +
-                                  name_len * 2);
-               parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-               ret = btrfs_update_inode(trans, root, parent_inode);
+       ret = btrfs_insert_dir_item(trans, root, name, name_len,
+                                   parent_inode, &key,
+                                   btrfs_inode_type(inode), index);
+       if (ret == -EEXIST)
+               goto fail_dir_item;
+       else if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               return ret;
        }
+       btrfs_i_size_write(parent_inode, parent_inode->i_size +
+                          name_len * 2);
+       parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+       ret = btrfs_update_inode(trans, root, parent_inode);
+       if (ret)
+               btrfs_abort_transaction(trans, root, ret);
        return ret;
  
  fail_dir_item:
@@@ -4806,7 -4965,8 +4965,8 @@@ static int btrfs_link(struct dentry *ol
        } else {
                struct dentry *parent = dentry->d_parent;
                err = btrfs_update_inode(trans, root, inode);
-               BUG_ON(err);
+               if (err)
+                       goto fail;
                d_instantiate(dentry, inode);
                btrfs_log_new_name(trans, inode, NULL, parent);
        }
@@@ -4937,12 -5097,12 +5097,12 @@@ static noinline int uncompress_inline(s
        ret = btrfs_decompress(compress_type, tmp, page,
                               extent_offset, inline_size, max_size);
        if (ret) {
 -              char *kaddr = kmap_atomic(page, KM_USER0);
 +              char *kaddr = kmap_atomic(page);
                unsigned long copy_size = min_t(u64,
                                  PAGE_CACHE_SIZE - pg_offset,
                                  max_size - extent_offset);
                memset(kaddr + pg_offset, 0, copy_size);
 -              kunmap_atomic(kaddr, KM_USER0);
 +              kunmap_atomic(kaddr);
        }
        kfree(tmp);
        return 0;
@@@ -5137,7 -5297,7 +5297,7 @@@ again
                                ret = uncompress_inline(path, inode, page,
                                                        pg_offset,
                                                        extent_offset, item);
-                               BUG_ON(ret);
+                               BUG_ON(ret); /* -ENOMEM */
                        } else {
                                map = kmap(page);
                                read_extent_buffer(leaf, map + pg_offset, ptr,
@@@ -5252,6 -5412,7 +5412,7 @@@ out
                free_extent_map(em);
                return ERR_PTR(err);
        }
+       BUG_ON(!em); /* Error is always set */
        return em;
  }
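
The added BUG_ON(!em) leans on the kernel's ERR_PTR convention: on failure btrfs_get_extent() returns an encoded errno, never NULL, so a NULL result after the IS_ERR() check is a logic error. A minimal sketch of the convention (example_lookup() is hypothetical):

        static struct extent_map *example_lookup(struct inode *inode, u64 start)
        {
                struct extent_map *em;

                em = btrfs_get_extent(inode, NULL, 0, start, PAGE_CACHE_SIZE, 0);
                if (IS_ERR(em))
                        return em;      /* an encoded -errno, see linux/err.h */
                BUG_ON(!em);            /* success implies a non-NULL map */
                return em;
        }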
  
@@@ -5414,7 -5575,7 +5575,7 @@@ static struct extent_map *btrfs_new_ext
  
        alloc_hint = get_extent_allocation_hint(inode, start, len);
        ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
-                                  alloc_hint, (u64)-1, &ins, 1);
+                                  alloc_hint, &ins, 1);
        if (ret) {
                em = ERR_PTR(ret);
                goto out;
@@@ -5602,7 -5763,7 +5763,7 @@@ static int btrfs_get_blocks_direct(stru
                free_extent_map(em);
                /* DIO will do one hole at a time, so just unlock a sector */
                unlock_extent(&BTRFS_I(inode)->io_tree, start,
-                             start + root->sectorsize - 1, GFP_NOFS);
+                             start + root->sectorsize - 1);
                return 0;
        }
  
@@@ -5719,11 -5880,11 +5880,11 @@@ static void btrfs_endio_direct_read(str
                        unsigned long flags;
  
                        local_irq_save(flags);
 -                      kaddr = kmap_atomic(page, KM_IRQ0);
 +                      kaddr = kmap_atomic(page);
                        csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
                                               csum, bvec->bv_len);
                        btrfs_csum_final(csum, (char *)&csum);
 -                      kunmap_atomic(kaddr, KM_IRQ0);
 +                      kunmap_atomic(kaddr);
                        local_irq_restore(flags);
  
                        flush_dcache_page(bvec->bv_page);
        } while (bvec <= bvec_end);
  
        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
-                     dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+                     dip->logical_offset + dip->bytes - 1);
        bio->bi_private = dip->private;
  
        kfree(dip->csums);
@@@ -5794,7 -5955,7 +5955,7 @@@ again
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
                         ordered->file_offset + ordered->len - 1, 0,
-                        &cached_state, GFP_NOFS);
+                        &cached_state);
  
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
                ret = btrfs_mark_extent_written(trans, inode,
@@@ -5868,7 -6029,7 +6029,7 @@@ static int __btrfs_submit_bio_start_dir
        int ret;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
-       BUG_ON(ret);
+       BUG_ON(ret); /* -ENOMEM */
        return 0;
  }
  
@@@ -6209,7 -6370,7 +6370,7 @@@ static ssize_t btrfs_direct_IO(int rw, 
  
        while (1) {
                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                0, &cached_state, GFP_NOFS);
+                                0, &cached_state);
                /*
                 * We're concerned with the entire range that we're going to be
                 * doing DIO to, so we need to make sure there's no ordered
        if (writing) {
                write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
                ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                    EXTENT_DELALLOC, 0, NULL, &cached_state,
+                                    EXTENT_DELALLOC, NULL, &cached_state,
                                     GFP_NOFS);
                if (ret) {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@@ -6363,8 -6524,7 +6524,7 @@@ static void btrfs_invalidatepage(struc
                btrfs_releasepage(page, GFP_NOFS);
                return;
        }
-       lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
-                        GFP_NOFS);
+       lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
        ordered = btrfs_lookup_ordered_extent(page->mapping->host,
                                           page_offset(page));
        if (ordered) {
                }
                btrfs_put_ordered_extent(ordered);
                cached_state = NULL;
-               lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
-                                GFP_NOFS);
+               lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
        }
        clear_extent_bit(tree, page_start, page_end,
                 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@@ -6462,8 -6621,7 +6621,7 @@@ again
        }
        wait_on_page_writeback(page);
  
-       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
-                        GFP_NOFS);
+       lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
        set_page_extent_mapped(page);
  
        /*
@@@ -6737,10 -6895,9 +6895,9 @@@ int btrfs_create_subvol_root(struct btr
        btrfs_i_size_write(inode, 0);
  
        err = btrfs_update_inode(trans, new_root, inode);
-       BUG_ON(err);
  
        iput(inode);
-       return 0;
+       return err;
  }
  
  struct inode *btrfs_alloc_inode(struct super_block *sb)
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+       ei->io_tree.track_uptodate = 1;
+       ei->io_failure_tree.track_uptodate = 1;
        mutex_init(&ei->log_mutex);
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@@ -7072,7 -7231,10 +7231,10 @@@ static int btrfs_rename(struct inode *o
                if (!ret)
                        ret = btrfs_update_inode(trans, root, old_inode);
        }
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
  
        if (new_inode) {
                new_inode->i_ctime = CURRENT_TIME;
                                                 new_dentry->d_name.name,
                                                 new_dentry->d_name.len);
                }
-               BUG_ON(ret);
-               if (new_inode->i_nlink == 0) {
+               if (!ret && new_inode->i_nlink == 0) {
                        ret = btrfs_orphan_add(trans, new_dentry->d_inode);
                        BUG_ON(ret);
                }
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       goto out_fail;
+               }
        }
  
        fixup_inode_flags(new_dir, old_inode);
        ret = btrfs_add_link(trans, new_dir, old_inode,
                             new_dentry->d_name.name,
                             new_dentry->d_name.len, 0, index);
-       BUG_ON(ret);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_fail;
+       }
  
        if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
                struct dentry *parent = new_dentry->d_parent;
@@@ -7315,7 -7483,7 +7483,7 @@@ static int __btrfs_prealloc_file_range(
                }
  
                ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
-                                          0, *alloc_hint, (u64)-1, &ins, 1);
+                                          0, *alloc_hint, &ins, 1);
                if (ret) {
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
                                                  ins.offset, ins.offset,
                                                  ins.offset, 0, 0, 0,
                                                  BTRFS_FILE_EXTENT_PREALLOC);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       if (own_trans)
+                               btrfs_end_transaction(trans, root);
+                       break;
+               }
                btrfs_drop_extent_cache(inode, cur_offset,
                                        cur_offset + ins.offset -1, 0);
  
                }
  
                ret = btrfs_update_inode(trans, root, inode);
-               BUG_ON(ret);
+               if (ret) {
+                       btrfs_abort_transaction(trans, root, ret);
+                       if (own_trans)
+                               btrfs_end_transaction(trans, root);
+                       break;
+               }
  
                if (own_trans)
                        btrfs_end_transaction(trans, root);
diff --combined fs/btrfs/scrub.c
   * Future enhancements:
   *  - In case an unrepairable extent is encountered, track which files are
   *    affected and report them
-  *  - In case of a read error on files with nodatasum, map the file and read
-  *    the extent to trigger a writeback of the good copy
   *  - track and record media errors, throw out bad devices
   *  - add a mode to also read unallocated space
   */
  
- struct scrub_bio;
- struct scrub_page;
+ struct scrub_block;
  struct scrub_dev;
- static void scrub_bio_end_io(struct bio *bio, int err);
- static void scrub_checksum(struct btrfs_work *work);
- static int scrub_checksum_data(struct scrub_dev *sdev,
-                              struct scrub_page *spag, void *buffer);
- static int scrub_checksum_tree_block(struct scrub_dev *sdev,
-                                    struct scrub_page *spag, u64 logical,
-                                    void *buffer);
- static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
- static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
- static void scrub_fixup_end_io(struct bio *bio, int err);
- static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-                         struct page *page);
- static void scrub_fixup(struct scrub_bio *sbio, int ix);
  
  #define SCRUB_PAGES_PER_BIO   16      /* 64k per bio */
  #define SCRUB_BIOS_PER_DEV    16      /* 1 MB per device in flight */
+ #define SCRUB_MAX_PAGES_PER_BLOCK     16      /* 64k per node/leaf/sector */
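
Assuming the usual 4 KiB pages, these constants work out to 16 x 4 KiB = 64 KiB per bio and, with SCRUB_BIOS_PER_DEV, 1 MiB in flight per device; SCRUB_MAX_PAGES_PER_BLOCK likewise covers one 64 KiB tree block, the upper limit for the big metadata blocks introduced in this merge.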
  
  struct scrub_page {
+       struct scrub_block      *sblock;
+       struct page             *page;
+       struct block_device     *bdev;
        u64                     flags;  /* extent flags */
        u64                     generation;
-       int                     mirror_num;
-       int                     have_csum;
+       u64                     logical;
+       u64                     physical;
+       struct {
+               unsigned int    mirror_num:8;
+               unsigned int    have_csum:1;
+               unsigned int    io_error:1;
+       };
        u8                      csum[BTRFS_CSUM_SIZE];
  };
  
@@@ -77,12 -70,25 +70,25 @@@ struct scrub_bio 
        int                     err;
        u64                     logical;
        u64                     physical;
-       struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
-       u64                     count;
+       struct scrub_page       *pagev[SCRUB_PAGES_PER_BIO];
+       int                     page_count;
        int                     next_free;
        struct btrfs_work       work;
  };
  
+ struct scrub_block {
+       struct scrub_page       pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+       int                     page_count;
+       atomic_t                outstanding_pages;
+       atomic_t                ref_count; /* free mem on transition to zero */
+       struct scrub_dev        *sdev;
+       struct {
+               unsigned int    header_error:1;
+               unsigned int    checksum_error:1;
+               unsigned int    no_io_error_seen:1;
+       };
+ };
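
A scrub_block now groups the scrub_page entries that make up one node, leaf or sector, so checksums can be verified per block while I/O errors stay attributable to individual pages. A minimal sketch of walking that structure (example_dump_block() is a hypothetical debug helper, not part of the diff):

        static void example_dump_block(const struct scrub_block *sblock)
        {
                int i;

                for (i = 0; i < sblock->page_count; i++) {
                        const struct scrub_page *spage = &sblock->pagev[i];

                        printk(KERN_DEBUG
                               "scrub: page %d logical %llu physical %llu io_error %u\n",
                               i, (unsigned long long)spage->logical,
                               (unsigned long long)spage->physical,
                               spage->io_error);
                }
        }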
  struct scrub_dev {
        struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
        struct btrfs_device     *dev;
        struct list_head        csum_list;
        atomic_t                cancel_req;
        int                     readonly;
+       int                     pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+       u32                     sectorsize;
+       u32                     nodesize;
+       u32                     leafsize;
        /*
         * statistics
         */
@@@ -124,6 -134,43 +134,43 @@@ struct scrub_warning 
        int                     scratch_bufsize;
  };
  
+ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+                                    struct btrfs_mapping_tree *map_tree,
+                                    u64 length, u64 logical,
+                                    struct scrub_block *sblock);
+ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+                              struct scrub_block *sblock, int is_metadata,
+                              int have_csum, u8 *csum, u64 generation,
+                              u16 csum_size);
+ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+                                        struct scrub_block *sblock,
+                                        int is_metadata, int have_csum,
+                                        const u8 *csum, u64 generation,
+                                        u16 csum_size);
+ static void scrub_complete_bio_end_io(struct bio *bio, int err);
+ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+                                            struct scrub_block *sblock_good,
+                                            int force_write);
+ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+                                           struct scrub_block *sblock_good,
+                                           int page_num, int force_write);
+ static int scrub_checksum_data(struct scrub_block *sblock);
+ static int scrub_checksum_tree_block(struct scrub_block *sblock);
+ static int scrub_checksum_super(struct scrub_block *sblock);
+ static void scrub_block_get(struct scrub_block *sblock);
+ static void scrub_block_put(struct scrub_block *sblock);
+ static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+                                struct scrub_page *spage);
+ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+                      u64 physical, u64 flags, u64 gen, int mirror_num,
+                      u8 *csum, int force);
+ static void scrub_bio_end_io(struct bio *bio, int err);
+ static void scrub_bio_end_io_worker(struct btrfs_work *work);
+ static void scrub_block_complete(struct scrub_block *sblock);
  static void scrub_free_csums(struct scrub_dev *sdev)
  {
        while (!list_empty(&sdev->csum_list)) {
        }
  }
  
- static void scrub_free_bio(struct bio *bio)
- {
-       int i;
-       struct page *last_page = NULL;
-       if (!bio)
-               return;
-       for (i = 0; i < bio->bi_vcnt; ++i) {
-               if (bio->bi_io_vec[i].bv_page == last_page)
-                       continue;
-               last_page = bio->bi_io_vec[i].bv_page;
-               __free_page(last_page);
-       }
-       bio_put(bio);
- }
  static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
  {
        int i;
        if (!sdev)
                return;
  
+       /* this can happen when scrub is cancelled */
+       if (sdev->curr != -1) {
+               struct scrub_bio *sbio = sdev->bios[sdev->curr];
+               for (i = 0; i < sbio->page_count; i++) {
+                       BUG_ON(!sbio->pagev[i]);
+                       BUG_ON(!sbio->pagev[i]->page);
+                       scrub_block_put(sbio->pagev[i]->sblock);
+               }
+               bio_put(sbio->bio);
+       }
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio = sdev->bios[i];
  
                if (!sbio)
                        break;
-               scrub_free_bio(sbio->bio);
                kfree(sbio);
        }
  
@@@ -179,11 -219,16 +219,16 @@@ struct scrub_dev *scrub_setup_dev(struc
        struct scrub_dev *sdev;
        int             i;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+       int pages_per_bio;
  
+       pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
+                             bio_get_nr_vecs(dev->bdev));
        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
        if (!sdev)
                goto nomem;
        sdev->dev = dev;
+       sdev->pages_per_bio = pages_per_bio;
+       sdev->curr = -1;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio;
  
  
                sbio->index = i;
                sbio->sdev = sdev;
-               sbio->count = 0;
-               sbio->work.func = scrub_checksum;
+               sbio->page_count = 0;
+               sbio->work.func = scrub_bio_end_io_worker;
  
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
                        sdev->bios[i]->next_free = -1;
        }
        sdev->first_free = 0;
-       sdev->curr = -1;
+       sdev->nodesize = dev->dev_root->nodesize;
+       sdev->leafsize = dev->dev_root->leafsize;
+       sdev->sectorsize = dev->dev_root->sectorsize;
        atomic_set(&sdev->in_flight, 0);
        atomic_set(&sdev->fixup_cnt, 0);
        atomic_set(&sdev->cancel_req, 0);
@@@ -294,10 -341,9 +341,9 @@@ err
        return 0;
  }
  
- static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
-                               int ix)
+ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
  {
-       struct btrfs_device *dev = sbio->sdev->dev;
+       struct btrfs_device *dev = sblock->sdev->dev;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
        struct btrfs_path *path;
        struct btrfs_key found_key;
  
        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-       swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
-       swarn.logical = sbio->logical + ix * PAGE_SIZE;
+       BUG_ON(sblock->page_count < 1);
+       swarn.sector = (sblock->pagev[0].physical) >> 9;
+       swarn.logical = sblock->pagev[0].logical;
        swarn.errstr = errstr;
        swarn.dev = dev;
        swarn.msg_bufsize = bufsize;
                do {
                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
                                                        &ref_root, &ref_level);
-                       printk(KERN_WARNING "%s at logical %llu on dev %s, "
+                       printk(KERN_WARNING
+                               "btrfs: %s at logical %llu on dev %s, "
                                "sector %llu: metadata %s (level %d) in tree "
                                "%llu\n", errstr, swarn.logical, dev->name,
                                (unsigned long long)swarn.sector,
                } while (ret != 1);
        } else {
                swarn.path = path;
-               iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_item_pos,
+               iterate_extent_inodes(fs_info, found_key.objectid,
+                                       extent_item_pos, 1,
                                        scrub_print_warning_inode, &swarn);
        }
  
@@@ -531,9 -579,9 +579,9 @@@ out
                spin_lock(&sdev->stat_lock);
                ++sdev->stat.uncorrectable_errors;
                spin_unlock(&sdev->stat_lock);
-               printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
-                                       "(nodatasum) error at logical %llu\n",
-                                       fixup->logical);
+               printk_ratelimited(KERN_ERR
+                       "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+                       (unsigned long long)fixup->logical, sdev->dev->name);
        }
  
        btrfs_free_path(path);
  }
  
  /*
-  * scrub_recheck_error gets called when either verification of the page
-  * failed or the bio failed to read, e.g. with EIO. In the latter case,
-  * recheck_error gets called for every page in the bio, even though only
-  * one may be bad
+  * scrub_handle_errored_block gets called when either verification of the
+  * pages failed or the bio failed to read, e.g. with EIO. In the latter
+  * case, this function handles all pages in the bio, even though only one
+  * may be bad.
+  * The goal of this function is to repair the errored block by using the
+  * contents of one of the mirrors.
   */
- static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
+ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
  {
-       struct scrub_dev *sdev = sbio->sdev;
-       u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+       struct scrub_dev *sdev = sblock_to_check->sdev;
+       struct btrfs_fs_info *fs_info;
+       u64 length;
+       u64 logical;
+       u64 generation;
+       unsigned int failed_mirror_index;
+       unsigned int is_metadata;
+       unsigned int have_csum;
+       u8 *csum;
+       struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+       struct scrub_block *sblock_bad;
+       int ret;
+       int mirror_index;
+       int page_num;
+       int success;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                       DEFAULT_RATELIMIT_BURST);
+                                     DEFAULT_RATELIMIT_BURST);
+       BUG_ON(sblock_to_check->page_count < 1);
+       fs_info = sdev->dev->dev_root->fs_info;
+       length = sblock_to_check->page_count * PAGE_SIZE;
+       logical = sblock_to_check->pagev[0].logical;
+       generation = sblock_to_check->pagev[0].generation;
+       BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
+       failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
+       is_metadata = !(sblock_to_check->pagev[0].flags &
+                       BTRFS_EXTENT_FLAG_DATA);
+       have_csum = sblock_to_check->pagev[0].have_csum;
+       csum = sblock_to_check->pagev[0].csum;
  
-       if (sbio->err) {
-               if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
-                                  sbio->bio->bi_io_vec[ix].bv_page) == 0) {
-                       if (scrub_fixup_check(sbio, ix) == 0)
-                               return 0;
-               }
-               if (__ratelimit(&_rs))
-                       scrub_print_warning("i/o error", sbio, ix);
-       } else {
-               if (__ratelimit(&_rs))
-                       scrub_print_warning("checksum error", sbio, ix);
+       /*
+        * read all mirrors one after the other. This includes
+        * re-reading the extent or metadata block that failed (the
+        * read whose failure caused this fixup code to be called),
+        * this time page by page, in order to know which pages
+        * caused I/O errors and which ones are good (for all mirrors).
+        * The goal is to handle the situation when more than one
+        * mirror contains I/O errors, but the errors do not
+        * overlap, i.e. the data can be repaired by selecting the
+        * pages from those mirrors without I/O error on the
+        * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+        * would be that mirror #1 has an I/O error on the first page,
+        * the second page is good, and mirror #2 has an I/O error on
+        * the second page, but the first page is good.
+        * Then the first page of the first mirror can be repaired by
+        * taking the first page of the second mirror, and the
+        * second page of the second mirror can be repaired by
+        * copying the contents of the 2nd page of the 1st mirror.
+        * One more note: if the pages of one mirror contain I/O
+        * errors, the checksum cannot be verified. In order to get
+        * the best data for repairing, the first attempt is to find
+        * a mirror without I/O errors and with a validated checksum.
+        * Only if this is not possible are the pages picked from
+        * mirrors with I/O errors, without considering the checksum.
+        * If the latter is the case, at the end, the checksum of the
+        * repaired area is verified in order to correctly maintain
+        * the statistics.
+        */
+       sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
+                                    sizeof(*sblocks_for_recheck),
+                                    GFP_NOFS);
+       if (!sblocks_for_recheck) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.malloc_errors++;
+               sdev->stat.read_errors++;
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               goto out;
        }
  
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.read_errors;
-       spin_unlock(&sdev->stat_lock);
+       /* setup the context, map the logical blocks and alloc the pages */
+       ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+                                       logical, sblocks_for_recheck);
+       if (ret) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.read_errors++;
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               goto out;
+       }
+       BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+       sblock_bad = sblocks_for_recheck + failed_mirror_index;
  
-       scrub_fixup(sbio, ix);
-       return 1;
- }
+       /* build and submit the bios for the failed mirror, check checksums */
+       ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+                                 csum, generation, sdev->csum_size);
+       if (ret) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.read_errors++;
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               goto out;
+       }
  
- static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
- {
-       int ret = 1;
-       struct page *page;
-       void *buffer;
-       u64 flags = sbio->spag[ix].flags;
+       if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+           sblock_bad->no_io_error_seen) {
+               /*
+                * the error disappeared after reading page by page, or
+                * the area was part of a huge bio and other parts of the
+                * bio caused I/O errors, or the block layer merged several
+                * read requests into one and the error is caused by a
+                * different bio (usually one of the two latter cases is
+                * the cause)
+                */
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.unverified_errors++;
+               spin_unlock(&sdev->stat_lock);
  
-       page = sbio->bio->bi_io_vec[ix].bv_page;
-       buffer = kmap_atomic(page);
-       if (flags & BTRFS_EXTENT_FLAG_DATA) {
-               ret = scrub_checksum_data(sbio->sdev,
-                                         sbio->spag + ix, buffer);
-       } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-               ret = scrub_checksum_tree_block(sbio->sdev,
-                                               sbio->spag + ix,
-                                               sbio->logical + ix * PAGE_SIZE,
-                                               buffer);
-       } else {
-               WARN_ON(1);
+               goto out;
        }
-       kunmap_atomic(buffer);
  
-       return ret;
- }
+       if (!sblock_bad->no_io_error_seen) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.read_errors++;
+               spin_unlock(&sdev->stat_lock);
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("i/o error", sblock_to_check);
+       } else if (sblock_bad->checksum_error) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.csum_errors++;
+               spin_unlock(&sdev->stat_lock);
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("checksum error", sblock_to_check);
+       } else if (sblock_bad->header_error) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.verify_errors++;
+               spin_unlock(&sdev->stat_lock);
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("checksum/header error",
+                                           sblock_to_check);
+       }
  
- static void scrub_fixup_end_io(struct bio *bio, int err)
- {
-       complete((struct completion *)bio->bi_private);
- }
+       if (sdev->readonly)
+               goto did_not_correct_error;
  
- static void scrub_fixup(struct scrub_bio *sbio, int ix)
- {
-       struct scrub_dev *sdev = sbio->sdev;
-       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-       struct btrfs_bio *bbio = NULL;
-       struct scrub_fixup_nodatasum *fixup;
-       u64 logical = sbio->logical + ix * PAGE_SIZE;
-       u64 length;
-       int i;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(complete);
-       if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
-           (sbio->spag[ix].have_csum == 0)) {
-               fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
-               if (!fixup)
-                       goto uncorrectable;
-               fixup->sdev = sdev;
-               fixup->logical = logical;
-               fixup->root = fs_info->extent_root;
-               fixup->mirror_num = sbio->spag[ix].mirror_num;
+       if (!is_metadata && !have_csum) {
+               struct scrub_fixup_nodatasum *fixup_nodatasum;
+               /*
+                * !is_metadata and !have_csum means that the data
+                * might not be COW'ed, i.e. it might be modified
+                * concurrently. The general strategy of working on the
+                * commit root does not help in the case when COW is not
+                * used.
+                */
+               fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+               if (!fixup_nodatasum)
+                       goto did_not_correct_error;
+               fixup_nodatasum->sdev = sdev;
+               fixup_nodatasum->logical = logical;
+               fixup_nodatasum->root = fs_info->extent_root;
+               fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                /*
                 * increment scrubs_running to prevent cancel requests from
                 * completing as long as a fixup worker is running. we must also
                atomic_inc(&fs_info->scrubs_paused);
                mutex_unlock(&fs_info->scrub_lock);
                atomic_inc(&sdev->fixup_cnt);
-               fixup->work.func = scrub_fixup_nodatasum;
-               btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
-               return;
+               fixup_nodatasum->work.func = scrub_fixup_nodatasum;
+               btrfs_queue_worker(&fs_info->scrub_workers,
+                                  &fixup_nodatasum->work);
+               goto out;
        }
  
-       length = PAGE_SIZE;
-       ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-                             &bbio, 0);
-       if (ret || !bbio || length < PAGE_SIZE) {
-               printk(KERN_ERR
-                      "scrub_fixup: btrfs_map_block failed us for %llu\n",
-                      (unsigned long long)logical);
-               WARN_ON(1);
-               kfree(bbio);
-               return;
+       /*
+        * now build and submit the bios for the other mirrors, check
+        * checksums
+        */
+       for (mirror_index = 0;
+            mirror_index < BTRFS_MAX_MIRRORS &&
+            sblocks_for_recheck[mirror_index].page_count > 0;
+            mirror_index++) {
+               if (mirror_index == failed_mirror_index)
+                       continue;
+               /* build and submit the bios, check checksums */
+               ret = scrub_recheck_block(fs_info,
+                                         sblocks_for_recheck + mirror_index,
+                                         is_metadata, have_csum, csum,
+                                         generation, sdev->csum_size);
+               if (ret)
+                       goto did_not_correct_error;
        }
  
-       if (bbio->num_stripes == 1)
-               /* there aren't any replicas */
-               goto uncorrectable;
+       /*
+        * first try to pick the mirror which is completely without I/O
+        * errors and also does not have a checksum error.
+        * If one is found, and if a checksum is present, the full block
+        * that is known to contain an error is rewritten. Afterwards
+        * the block is known to be corrected.
+        * If a mirror is found which is completely correct, and no
+        * checksum is present, only those pages are rewritten that had
+        * an I/O error in the block to be repaired, since it cannot be
+        * determined which copy of the other pages is better (and it
+        * could happen otherwise that a correct page would be
+        * overwritten by a bad one).
+        */
+       for (mirror_index = 0;
+            mirror_index < BTRFS_MAX_MIRRORS &&
+            sblocks_for_recheck[mirror_index].page_count > 0;
+            mirror_index++) {
+               struct scrub_block *sblock_other = sblocks_for_recheck +
+                                                  mirror_index;
+               if (!sblock_other->header_error &&
+                   !sblock_other->checksum_error &&
+                   sblock_other->no_io_error_seen) {
+                       int force_write = is_metadata || have_csum;
+                       ret = scrub_repair_block_from_good_copy(sblock_bad,
+                                                               sblock_other,
+                                                               force_write);
+                       if (0 == ret)
+                               goto corrected_error;
+               }
+       }
  
        /*
-        * first find a good copy
+        * in case of I/O errors in the area that is supposed to be
+        * repaired, continue by picking good copies of those pages.
+        * Select the good pages from mirrors to rewrite bad pages from
+        * the area to fix. Afterwards verify the checksum of the block
+        * that is supposed to be repaired. This verification step is
+        * only done for the purpose of statistics counting and for the
+        * final scrub report on whether errors remain.
+        * A perfect algorithm could make use of the checksum and try
+        * all possible combinations of pages from the different mirrors
+        * until the checksum verification succeeds. For example, when
+        * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+        * of mirror #2 is readable but the final checksum test fails,
+        * then the 2nd page of mirror #3 could be tried, to see
+        * whether the final checksum then succeeds. But this would be a rare
+        * exception and is therefore not implemented. At least it is
+        * avoided that the good copy is overwritten.
+        * A more useful improvement would be to pick the sectors
+        * without I/O error based on sector sizes (512 bytes on legacy
+        * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
+        * mirror could be repaired by taking 512 bytes of a different
+        * mirror, even if other 512-byte sectors in the same PAGE_SIZE
+        * area are unreadable.
         */
-       for (i = 0; i < bbio->num_stripes; ++i) {
-               if (i + 1 == sbio->spag[ix].mirror_num)
-                       continue;
  
-               if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
-                                  bbio->stripes[i].physical >> 9,
-                                  sbio->bio->bi_io_vec[ix].bv_page)) {
-                       /* I/O-error, this is not a good copy */
+       /* can only fix I/O errors from here on */
+       if (sblock_bad->no_io_error_seen)
+               goto did_not_correct_error;
+       success = 1;
+       for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+               struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+               if (!page_bad->io_error)
                        continue;
+               for (mirror_index = 0;
+                    mirror_index < BTRFS_MAX_MIRRORS &&
+                    sblocks_for_recheck[mirror_index].page_count > 0;
+                    mirror_index++) {
+                       struct scrub_block *sblock_other = sblocks_for_recheck +
+                                                          mirror_index;
+                       struct scrub_page *page_other = sblock_other->pagev +
+                                                       page_num;
+                       if (!page_other->io_error) {
+                               ret = scrub_repair_page_from_good_copy(
+                                       sblock_bad, sblock_other, page_num, 0);
+                               if (0 == ret) {
+                                       page_bad->io_error = 0;
+                                       break; /* succeeded for this page */
+                               }
+                       }
                }
  
-               if (scrub_fixup_check(sbio, ix) == 0)
-                       break;
+               if (page_bad->io_error) {
+                       /* did not find a mirror to copy the page from */
+                       success = 0;
+               }
        }
-       if (i == bbio->num_stripes)
-               goto uncorrectable;
  
-       if (!sdev->readonly) {
-               /*
-                * bi_io_vec[ix].bv_page now contains good data, write it back
-                */
-               if (scrub_fixup_io(WRITE, sdev->dev->bdev,
-                                  (sbio->physical + ix * PAGE_SIZE) >> 9,
-                                  sbio->bio->bi_io_vec[ix].bv_page)) {
-                       /* I/O-error, writeback failed, give up */
-                       goto uncorrectable;
+       if (success) {
+               if (is_metadata || have_csum) {
+                       /*
+                        * need to verify the checksum now that all
+                        * sectors on disk are repaired (the write
+                        * request for data to be repaired is on its way).
+                        * Just be lazy and use scrub_recheck_block()
+                        * which re-reads the data before the checksum
+                        * is verified, but most likely the data comes out
+                        * of the page cache.
+                        */
+                       ret = scrub_recheck_block(fs_info, sblock_bad,
+                                                 is_metadata, have_csum, csum,
+                                                 generation, sdev->csum_size);
+                       if (!ret && !sblock_bad->header_error &&
+                           !sblock_bad->checksum_error &&
+                           sblock_bad->no_io_error_seen)
+                               goto corrected_error;
+                       else
+                               goto did_not_correct_error;
+               } else {
+ corrected_error:
+                       spin_lock(&sdev->stat_lock);
+                       sdev->stat.corrected_errors++;
+                       spin_unlock(&sdev->stat_lock);
+                       printk_ratelimited(KERN_ERR
+                               "btrfs: fixed up error at logical %llu on dev %s\n",
+                               (unsigned long long)logical, sdev->dev->name);
                }
+       } else {
+ did_not_correct_error:
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               printk_ratelimited(KERN_ERR
+                       "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+                       (unsigned long long)logical, sdev->dev->name);
        }
  
-       kfree(bbio);
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.corrected_errors;
-       spin_unlock(&sdev->stat_lock);
+ out:
+       if (sblocks_for_recheck) {
+               for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+                    mirror_index++) {
+                       struct scrub_block *sblock = sblocks_for_recheck +
+                                                    mirror_index;
+                       int page_index;
+                       for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
+                            page_index++)
+                               if (sblock->pagev[page_index].page)
+                                       __free_page(
+                                               sblock->pagev[page_index].page);
+               }
+               kfree(sblocks_for_recheck);
+       }
  
-       printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
-                              (unsigned long long)logical);
-       return;
+       return 0;
+ }
  
- uncorrectable:
-       kfree(bbio);
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.uncorrectable_errors;
-       spin_unlock(&sdev->stat_lock);
+ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+                                    struct btrfs_mapping_tree *map_tree,
+                                    u64 length, u64 logical,
+                                    struct scrub_block *sblocks_for_recheck)
+ {
+       int page_index;
+       int mirror_index;
+       int ret;
+       /*
+        * note: the three members sdev, ref_count and outstanding_pages
+        * are not used (and not set) in the blocks that are used for
+        * the recheck procedure
+        */
+       page_index = 0;
+       while (length > 0) {
+               u64 sublen = min_t(u64, length, PAGE_SIZE);
+               u64 mapped_length = sublen;
+               struct btrfs_bio *bbio = NULL;
+               /*
+                * with a length of PAGE_SIZE, each returned stripe
+                * represents one mirror
+                */
+               ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
+                                     &bbio, 0);
+               if (ret || !bbio || mapped_length < sublen) {
+                       kfree(bbio);
+                       return -EIO;
+               }
  
-       printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
-                               "logical %llu\n", (unsigned long long)logical);
+               BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+               for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+                    mirror_index++) {
+                       struct scrub_block *sblock;
+                       struct scrub_page *page;
+                       if (mirror_index >= BTRFS_MAX_MIRRORS)
+                               continue;
+                       sblock = sblocks_for_recheck + mirror_index;
+                       page = sblock->pagev + page_index;
+                       page->logical = logical;
+                       page->physical = bbio->stripes[mirror_index].physical;
+                       page->bdev = bbio->stripes[mirror_index].dev->bdev;
+                       page->mirror_num = mirror_index + 1;
+                       page->page = alloc_page(GFP_NOFS);
+                       if (!page->page) {
+                               spin_lock(&sdev->stat_lock);
+                               sdev->stat.malloc_errors++;
+                               spin_unlock(&sdev->stat_lock);
+                               return -ENOMEM;
+                       }
+                       sblock->page_count++;
+               }
+               kfree(bbio);
+               length -= sublen;
+               logical += sublen;
+               page_index++;
+       }
+       return 0;
  }
  
- static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-                        struct page *page)
+ /*
+  * this function checks the on-disk data for checksum errors, header
+  * errors and read I/O errors. Pages that hit an I/O error are marked
+  * as bad individually. The goal is to let scrub take the good pages
+  * from all mirrors so that the bad pages of the mirror currently
+  * being handled can be repaired.
+  */
+ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+                              struct scrub_block *sblock, int is_metadata,
+                              int have_csum, u8 *csum, u64 generation,
+                              u16 csum_size)
  {
-       struct bio *bio = NULL;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(complete);
+       int page_num;
+       sblock->no_io_error_seen = 1;
+       sblock->header_error = 0;
+       sblock->checksum_error = 0;
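+       /* read every page of the block, one synchronous bio per page */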
+       for (page_num = 0; page_num < sblock->page_count; page_num++) {
+               struct bio *bio;
+               int ret;
+               struct scrub_page *page = sblock->pagev + page_num;
+               DECLARE_COMPLETION_ONSTACK(complete);
+               BUG_ON(!page->page);
+               bio = bio_alloc(GFP_NOFS, 1);
+               bio->bi_bdev = page->bdev;
+               bio->bi_sector = page->physical >> 9;
+               bio->bi_end_io = scrub_complete_bio_end_io;
+               bio->bi_private = &complete;
+               ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
+               if (PAGE_SIZE != ret) {
+                       bio_put(bio);
+                       return -EIO;
+               }
+               btrfsic_submit_bio(READ, bio);
  
-       bio = bio_alloc(GFP_NOFS, 1);
-       bio->bi_bdev = bdev;
-       bio->bi_sector = sector;
-       bio_add_page(bio, page, PAGE_SIZE, 0);
-       bio->bi_end_io = scrub_fixup_end_io;
-       bio->bi_private = &complete;
-       btrfsic_submit_bio(rw, bio);
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
  
-       /* this will also unplug the queue */
-       wait_for_completion(&complete);
+               page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+                       sblock->no_io_error_seen = 0;
+               bio_put(bio);
+       }
  
-       ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
-       bio_put(bio);
-       return ret;
+       if (sblock->no_io_error_seen)
+               scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+                                            have_csum, csum, generation,
+                                            csum_size);
+       return 0;
  }
  
- static void scrub_bio_end_io(struct bio *bio, int err)
+ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+                                        struct scrub_block *sblock,
+                                        int is_metadata, int have_csum,
+                                        const u8 *csum, u64 generation,
+                                        u16 csum_size)
  {
-       struct scrub_bio *sbio = bio->bi_private;
-       struct scrub_dev *sdev = sbio->sdev;
-       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       int page_num;
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       struct btrfs_root *root = fs_info->extent_root;
+       void *mapped_buffer;
+       BUG_ON(!sblock->pagev[0].page);
+       if (is_metadata) {
+               struct btrfs_header *h;
 -              mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
++              mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+               h = (struct btrfs_header *)mapped_buffer;
+               if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+                   generation != le64_to_cpu(h->generation) ||
+                   memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+                   memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+                          BTRFS_UUID_SIZE))
+                       sblock->header_error = 1;
+               csum = h->csum;
+       } else {
+               if (!have_csum)
+                       return;
  
-       sbio->err = err;
-       sbio->bio = bio;
 -              mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
++              mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+       }
  
-       btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+       for (page_num = 0;;) {
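+               /*
+                * feed each mapped page into the crc; for metadata, the
+                * first page skips over the embedded checksum bytes
+                */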
+               if (page_num == 0 && is_metadata)
+                       crc = btrfs_csum_data(root,
+                               ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+                               crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+               else
+                       crc = btrfs_csum_data(root, mapped_buffer, crc,
+                                             PAGE_SIZE);
 -              kunmap_atomic(mapped_buffer, KM_USER0);
++              kunmap_atomic(mapped_buffer);
+               page_num++;
+               if (page_num >= sblock->page_count)
+                       break;
+               BUG_ON(!sblock->pagev[page_num].page);
 -              mapped_buffer = kmap_atomic(sblock->pagev[page_num].page,
 -                                          KM_USER0);
++              mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+       }
+       btrfs_csum_final(crc, calculated_csum);
+       if (memcmp(calculated_csum, csum, csum_size))
+               sblock->checksum_error = 1;
  }
  
- static void scrub_checksum(struct btrfs_work *work)
+ static void scrub_complete_bio_end_io(struct bio *bio, int err)
  {
-       struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-       struct scrub_dev *sdev = sbio->sdev;
-       struct page *page;
-       void *buffer;
-       int i;
-       u64 flags;
-       u64 logical;
-       int ret;
+       complete((struct completion *)bio->bi_private);
+ }
  
-       if (sbio->err) {
-               ret = 0;
-               for (i = 0; i < sbio->count; ++i)
-                       ret |= scrub_recheck_error(sbio, i);
-               if (!ret) {
-                       spin_lock(&sdev->stat_lock);
-                       ++sdev->stat.unverified_errors;
-                       spin_unlock(&sdev->stat_lock);
-               }
+ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+                                            struct scrub_block *sblock_good,
+                                            int force_write)
+ {
+       int page_num;
+       int ret = 0;
  
-               sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bio->bi_phys_segments = 0;
-               sbio->bio->bi_idx = 0;
+       for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+               int ret_sub;
  
-               for (i = 0; i < sbio->count; i++) {
-                       struct bio_vec *bi;
-                       bi = &sbio->bio->bi_io_vec[i];
-                       bi->bv_offset = 0;
-                       bi->bv_len = PAGE_SIZE;
-               }
-               goto out;
+               ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+                                                          sblock_good,
+                                                          page_num,
+                                                          force_write);
+               if (ret_sub)
+                       ret = ret_sub;
        }
-       for (i = 0; i < sbio->count; ++i) {
-               page = sbio->bio->bi_io_vec[i].bv_page;
-               buffer = kmap_atomic(page);
-               flags = sbio->spag[i].flags;
-               logical = sbio->logical + i * PAGE_SIZE;
-               ret = 0;
-               if (flags & BTRFS_EXTENT_FLAG_DATA) {
-                       ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
-               } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-                       ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
-                                                       logical, buffer);
-               } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
-                       BUG_ON(i);
-                       (void)scrub_checksum_super(sbio, buffer);
-               } else {
-                       WARN_ON(1);
-               }
-               kunmap_atomic(buffer);
-               if (ret) {
-                       ret = scrub_recheck_error(sbio, i);
-                       if (!ret) {
-                               spin_lock(&sdev->stat_lock);
-                               ++sdev->stat.unverified_errors;
-                               spin_unlock(&sdev->stat_lock);
-                       }
+       return ret;
+ }
+
+ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+                                           struct scrub_block *sblock_good,
+                                           int page_num, int force_write)
+ {
+       struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+       struct scrub_page *page_good = sblock_good->pagev + page_num;
+       BUG_ON(sblock_bad->pagev[page_num].page == NULL);
+       BUG_ON(sblock_good->pagev[page_num].page == NULL);
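+       /*
+        * rewrite the page in place with the good mirror's content if it
+        * is bad or if the rewrite is forced
+        */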
+       if (force_write || sblock_bad->header_error ||
+           sblock_bad->checksum_error || page_bad->io_error) {
+               struct bio *bio;
+               int ret;
+               DECLARE_COMPLETION_ONSTACK(complete);
+               bio = bio_alloc(GFP_NOFS, 1);
+               bio->bi_bdev = page_bad->bdev;
+               bio->bi_sector = page_bad->physical >> 9;
+               bio->bi_end_io = scrub_complete_bio_end_io;
+               bio->bi_private = &complete;
+               ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+               if (PAGE_SIZE != ret) {
+                       bio_put(bio);
+                       return -EIO;
                }
+               btrfsic_submit_bio(WRITE, bio);
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
+               bio_put(bio);
        }
  
- out:
-       scrub_free_bio(sbio->bio);
-       sbio->bio = NULL;
-       spin_lock(&sdev->list_lock);
-       sbio->next_free = sdev->first_free;
-       sdev->first_free = sbio->index;
-       spin_unlock(&sdev->list_lock);
-       atomic_dec(&sdev->in_flight);
-       wake_up(&sdev->list_wait);
+       return 0;
+ }
+
+ static void scrub_checksum(struct scrub_block *sblock)
+ {
+       u64 flags;
+       int ret;
+       BUG_ON(sblock->page_count < 1);
+       flags = sblock->pagev[0].flags;
+       ret = 0;
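+       /* the flags recorded with the first page select how to verify */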
+       if (flags & BTRFS_EXTENT_FLAG_DATA)
+               ret = scrub_checksum_data(sblock);
+       else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+               ret = scrub_checksum_tree_block(sblock);
+       else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+               (void)scrub_checksum_super(sblock);
+       else
+               WARN_ON(1);
+       if (ret)
+               scrub_handle_errored_block(sblock);
  }
  
- static int scrub_checksum_data(struct scrub_dev *sdev,
-                              struct scrub_page *spag, void *buffer)
+ static int scrub_checksum_data(struct scrub_block *sblock)
  {
+       struct scrub_dev *sdev = sblock->sdev;
        u8 csum[BTRFS_CSUM_SIZE];
+       u8 *on_disk_csum;
+       struct page *page;
+       void *buffer;
        u32 crc = ~(u32)0;
        int fail = 0;
        struct btrfs_root *root = sdev->dev->dev_root;
+       u64 len;
+       int index;
  
-       if (!spag->have_csum)
+       BUG_ON(sblock->page_count < 1);
+       if (!sblock->pagev[0].have_csum)
                return 0;
  
-       crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+       on_disk_csum = sblock->pagev[0].csum;
+       page = sblock->pagev[0].page;
 -      buffer = kmap_atomic(page, KM_USER0);
++      buffer = kmap_atomic(page);
+       len = sdev->sectorsize;
+       index = 0;
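+       /*
+        * a data block covers sectorsize bytes; checksum it one mapped
+        * page at a time
+        */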
+       for (;;) {
+               u64 l = min_t(u64, len, PAGE_SIZE);
+               crc = btrfs_csum_data(root, buffer, crc, l);
 -              kunmap_atomic(buffer, KM_USER0);
++              kunmap_atomic(buffer);
+               len -= l;
+               if (len == 0)
+                       break;
+               index++;
+               BUG_ON(index >= sblock->page_count);
+               BUG_ON(!sblock->pagev[index].page);
+               page = sblock->pagev[index].page;
 -              buffer = kmap_atomic(page, KM_USER0);
++              buffer = kmap_atomic(page);
+       }
        btrfs_csum_final(crc, csum);
-       if (memcmp(csum, spag->csum, sdev->csum_size))
+       if (memcmp(csum, on_disk_csum, sdev->csum_size))
                fail = 1;
  
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.data_extents_scrubbed;
-       sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
-       if (fail)
+       if (fail) {
+               spin_lock(&sdev->stat_lock);
                ++sdev->stat.csum_errors;
-       spin_unlock(&sdev->stat_lock);
+               spin_unlock(&sdev->stat_lock);
+       }
  
        return fail;
  }
  
- static int scrub_checksum_tree_block(struct scrub_dev *sdev,
-                                    struct scrub_page *spag, u64 logical,
-                                    void *buffer)
+ static int scrub_checksum_tree_block(struct scrub_block *sblock)
  {
+       struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_header *h;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u8 csum[BTRFS_CSUM_SIZE];
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       u8 on_disk_csum[BTRFS_CSUM_SIZE];
+       struct page *page;
+       void *mapped_buffer;
+       u64 mapped_size;
+       void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
        int crc_fail = 0;
 -      mapped_buffer = kmap_atomic(page, KM_USER0);
+       u64 len;
+       int index;
+       BUG_ON(sblock->page_count < 1);
+       page = sblock->pagev[0].page;
++      mapped_buffer = kmap_atomic(page);
+       h = (struct btrfs_header *)mapped_buffer;
+       memcpy(on_disk_csum, h->csum, sdev->csum_size);
  
        /*
         * we don't use the getter functions here, as we
         * a) don't have an extent buffer and
         * b) the page is already kmapped
         */
-       h = (struct btrfs_header *)buffer;
  
-       if (logical != le64_to_cpu(h->bytenr))
+       if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
                ++fail;
  
-       if (spag->generation != le64_to_cpu(h->generation))
+       if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
                ++fail;
  
        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
  
        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
                   BTRFS_UUID_SIZE))
                ++fail;
  
-       crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
-                             PAGE_SIZE - BTRFS_CSUM_SIZE);
-       btrfs_csum_final(crc, csum);
-       if (memcmp(csum, h->csum, sdev->csum_size))
+       BUG_ON(sdev->nodesize != sdev->leafsize);
+       len = sdev->nodesize - BTRFS_CSUM_SIZE;
+       mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+       p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+       index = 0;
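+       /* the checksum covers the whole node except the csum field itself */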
+       for (;;) {
+               u64 l = min_t(u64, len, mapped_size);
+               crc = btrfs_csum_data(root, p, crc, l);
 -              kunmap_atomic(mapped_buffer, KM_USER0);
++              kunmap_atomic(mapped_buffer);
+               len -= l;
+               if (len == 0)
+                       break;
+               index++;
+               BUG_ON(index >= sblock->page_count);
+               BUG_ON(!sblock->pagev[index].page);
+               page = sblock->pagev[index].page;
 -              mapped_buffer = kmap_atomic(page, KM_USER0);
++              mapped_buffer = kmap_atomic(page);
+               mapped_size = PAGE_SIZE;
+               p = mapped_buffer;
+       }
+       btrfs_csum_final(crc, calculated_csum);
+       if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++crc_fail;
  
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.tree_extents_scrubbed;
-       sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
-       if (crc_fail)
-               ++sdev->stat.csum_errors;
-       if (fail)
-               ++sdev->stat.verify_errors;
-       spin_unlock(&sdev->stat_lock);
+       if (crc_fail || fail) {
+               spin_lock(&sdev->stat_lock);
+               if (crc_fail)
+                       ++sdev->stat.csum_errors;
+               if (fail)
+                       ++sdev->stat.verify_errors;
+               spin_unlock(&sdev->stat_lock);
+       }
  
        return fail || crc_fail;
  }
  
- static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+ static int scrub_checksum_super(struct scrub_block *sblock)
  {
        struct btrfs_super_block *s;
-       u64 logical;
-       struct scrub_dev *sdev = sbio->sdev;
+       struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u8 csum[BTRFS_CSUM_SIZE];
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       u8 on_disk_csum[BTRFS_CSUM_SIZE];
+       struct page *page;
+       void *mapped_buffer;
+       u64 mapped_size;
+       void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
+       u64 len;
+       int index;
  
-       s = (struct btrfs_super_block *)buffer;
-       logical = sbio->logical;
+       BUG_ON(sblock->page_count < 1);
+       page = sblock->pagev[0].page;
 -      mapped_buffer = kmap_atomic(page, KM_USER0);
++      mapped_buffer = kmap_atomic(page);
+       s = (struct btrfs_super_block *)mapped_buffer;
+       memcpy(on_disk_csum, s->csum, sdev->csum_size);
  
-       if (logical != le64_to_cpu(s->bytenr))
+       if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
                ++fail;
  
-       if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+       if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
                ++fail;
  
        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
  
-       crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
-                             PAGE_SIZE - BTRFS_CSUM_SIZE);
-       btrfs_csum_final(crc, csum);
-       if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+       len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+       mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+       p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+       index = 0;
+       for (;;) {
+               u64 l = min_t(u64, len, mapped_size);
+               crc = btrfs_csum_data(root, p, crc, l);
 -              kunmap_atomic(mapped_buffer, KM_USER0);
++              kunmap_atomic(mapped_buffer);
+               len -= l;
+               if (len == 0)
+                       break;
+               index++;
+               BUG_ON(index >= sblock->page_count);
+               BUG_ON(!sblock->pagev[index].page);
+               page = sblock->pagev[index].page;
 -              mapped_buffer = kmap_atomic(page, KM_USER0);
++              mapped_buffer = kmap_atomic(page);
+               mapped_size = PAGE_SIZE;
+               p = mapped_buffer;
+       }
+       btrfs_csum_final(crc, calculated_csum);
+       if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++fail;
  
        if (fail) {
        return fail;
  }
  
- static int scrub_submit(struct scrub_dev *sdev)
+ static void scrub_block_get(struct scrub_block *sblock)
+ {
+       atomic_inc(&sblock->ref_count);
+ }
+
+ static void scrub_block_put(struct scrub_block *sblock)
+ {
+       if (atomic_dec_and_test(&sblock->ref_count)) {
+               int i;
+               for (i = 0; i < sblock->page_count; i++)
+                       if (sblock->pagev[i].page)
+                               __free_page(sblock->pagev[i].page);
+               kfree(sblock);
+       }
+ }
+
+ static void scrub_submit(struct scrub_dev *sdev)
  {
        struct scrub_bio *sbio;
  
        if (sdev->curr == -1)
-               return 0;
+               return;
  
        sbio = sdev->bios[sdev->curr];
-       sbio->err = 0;
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
  
        btrfsic_submit_bio(READ, sbio->bio);
-       return 0;
  }
  
- static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-                     u64 physical, u64 flags, u64 gen, int mirror_num,
-                     u8 *csum, int force)
+ static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+                                struct scrub_page *spage)
  {
+       struct scrub_block *sblock = spage->sblock;
        struct scrub_bio *sbio;
-       struct page *page;
        int ret;
  
  again:
                if (sdev->curr != -1) {
                        sdev->first_free = sdev->bios[sdev->curr]->next_free;
                        sdev->bios[sdev->curr]->next_free = -1;
-                       sdev->bios[sdev->curr]->count = 0;
+                       sdev->bios[sdev->curr]->page_count = 0;
                        spin_unlock(&sdev->list_lock);
                } else {
                        spin_unlock(&sdev->list_lock);
                }
        }
        sbio = sdev->bios[sdev->curr];
-       if (sbio->count == 0) {
+       if (sbio->page_count == 0) {
                struct bio *bio;
  
-               sbio->physical = physical;
-               sbio->logical = logical;
-               bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
-               if (!bio)
-                       return -ENOMEM;
+               sbio->physical = spage->physical;
+               sbio->logical = spage->logical;
+               bio = sbio->bio;
+               if (!bio) {
+                       bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+                       if (!bio)
+                               return -ENOMEM;
+                       sbio->bio = bio;
+               }
  
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
                bio->bi_bdev = sdev->dev->bdev;
-               bio->bi_sector = sbio->physical >> 9;
+               bio->bi_sector = spage->physical >> 9;
                sbio->err = 0;
-               sbio->bio = bio;
-       } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
-                  sbio->logical + sbio->count * PAGE_SIZE != logical) {
-               ret = scrub_submit(sdev);
-               if (ret)
-                       return ret;
+       } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+                  spage->physical ||
+                  sbio->logical + sbio->page_count * PAGE_SIZE !=
+                  spage->logical) {
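+               /*
+                * the page is not contiguous with the bio built so far;
+                * submit that bio and start a new one
+                */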
+               scrub_submit(sdev);
                goto again;
        }
-       sbio->spag[sbio->count].flags = flags;
-       sbio->spag[sbio->count].generation = gen;
-       sbio->spag[sbio->count].have_csum = 0;
-       sbio->spag[sbio->count].mirror_num = mirror_num;
-       page = alloc_page(GFP_NOFS);
-       if (!page)
-               return -ENOMEM;
  
-       ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
-       if (!ret) {
-               __free_page(page);
-               ret = scrub_submit(sdev);
-               if (ret)
-                       return ret;
+       sbio->pagev[sbio->page_count] = spage;
+       ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+       if (ret != PAGE_SIZE) {
+               if (sbio->page_count < 1) {
+                       bio_put(sbio->bio);
+                       sbio->bio = NULL;
+                       return -EIO;
+               }
+               scrub_submit(sdev);
                goto again;
        }
  
-       if (csum) {
-               sbio->spag[sbio->count].have_csum = 1;
-               memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+       scrub_block_get(sblock); /* one for the added page */
+       atomic_inc(&sblock->outstanding_pages);
+       sbio->page_count++;
+       if (sbio->page_count == sdev->pages_per_bio)
+               scrub_submit(sdev);
+       return 0;
+ }
+
+ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+                      u64 physical, u64 flags, u64 gen, int mirror_num,
+                      u8 *csum, int force)
+ {
+       struct scrub_block *sblock;
+       int index;
+       sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+       if (!sblock) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.malloc_errors++;
+               spin_unlock(&sdev->stat_lock);
+               return -ENOMEM;
        }
-       ++sbio->count;
-       if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
+       /* one ref inside this function, plus one for each page later on */
+       atomic_set(&sblock->ref_count, 1);
+       sblock->sdev = sdev;
+       sblock->no_io_error_seen = 1;
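+       /* split the block into PAGE_SIZE pieces, one scrub_page each */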
+       for (index = 0; len > 0; index++) {
+               struct scrub_page *spage = sblock->pagev + index;
+               u64 l = min_t(u64, len, PAGE_SIZE);
+               BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+               spage->page = alloc_page(GFP_NOFS);
+               if (!spage->page) {
+                       spin_lock(&sdev->stat_lock);
+                       sdev->stat.malloc_errors++;
+                       spin_unlock(&sdev->stat_lock);
+                       while (index > 0) {
+                               index--;
+                               __free_page(sblock->pagev[index].page);
+                       }
+                       kfree(sblock);
+                       return -ENOMEM;
+               }
+               spage->sblock = sblock;
+               spage->bdev = sdev->dev->bdev;
+               spage->flags = flags;
+               spage->generation = gen;
+               spage->logical = logical;
+               spage->physical = physical;
+               spage->mirror_num = mirror_num;
+               if (csum) {
+                       spage->have_csum = 1;
+                       memcpy(spage->csum, csum, sdev->csum_size);
+               } else {
+                       spage->have_csum = 0;
+               }
+               sblock->page_count++;
+               len -= l;
+               logical += l;
+               physical += l;
+       }
+       BUG_ON(sblock->page_count == 0);
+       for (index = 0; index < sblock->page_count; index++) {
+               struct scrub_page *spage = sblock->pagev + index;
                int ret;
  
-               ret = scrub_submit(sdev);
-               if (ret)
+               ret = scrub_add_page_to_bio(sdev, spage);
+               if (ret) {
+                       scrub_block_put(sblock);
                        return ret;
+               }
        }
  
+       if (force)
+               scrub_submit(sdev);
+       /* the last ref frees the block, here or in the completion of the last page */
+       scrub_block_put(sblock);
        return 0;
  }
  
+ static void scrub_bio_end_io(struct bio *bio, int err)
+ {
+       struct scrub_bio *sbio = bio->bi_private;
+       struct scrub_dev *sdev = sbio->sdev;
+       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       sbio->err = err;
+       sbio->bio = bio;
+       btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ }
+
+ static void scrub_bio_end_io_worker(struct btrfs_work *work)
+ {
+       struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+       struct scrub_dev *sdev = sbio->sdev;
+       int i;
+       BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+       if (sbio->err) {
+               for (i = 0; i < sbio->page_count; i++) {
+                       struct scrub_page *spage = sbio->pagev[i];
+                       spage->io_error = 1;
+                       spage->sblock->no_io_error_seen = 0;
+               }
+       }
+       /* now complete the scrub_block items that have all pages completed */
+       for (i = 0; i < sbio->page_count; i++) {
+               struct scrub_page *spage = sbio->pagev[i];
+               struct scrub_block *sblock = spage->sblock;
+               if (atomic_dec_and_test(&sblock->outstanding_pages))
+                       scrub_block_complete(sblock);
+               scrub_block_put(sblock);
+       }
+       if (sbio->err) {
+               /*
+                * restore the bio to a clean state; it is not obvious
+                * whether this is still needed, since the bio is
+                * released right below
+                */
+               sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+               sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+               sbio->bio->bi_phys_segments = 0;
+               sbio->bio->bi_idx = 0;
+               for (i = 0; i < sbio->page_count; i++) {
+                       struct bio_vec *bi;
+                       bi = &sbio->bio->bi_io_vec[i];
+                       bi->bv_offset = 0;
+                       bi->bv_len = PAGE_SIZE;
+               }
+       }
+       bio_put(sbio->bio);
+       sbio->bio = NULL;
+       spin_lock(&sdev->list_lock);
+       sbio->next_free = sdev->first_free;
+       sdev->first_free = sbio->index;
+       spin_unlock(&sdev->list_lock);
+       atomic_dec(&sdev->in_flight);
+       wake_up(&sdev->list_wait);
+ }
+
+ static void scrub_block_complete(struct scrub_block *sblock)
+ {
+       if (!sblock->no_io_error_seen)
+               scrub_handle_errored_block(sblock);
+       else
+               scrub_checksum(sblock);
+ }
  static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
                           u8 *csum)
  {
        int ret = 0;
        unsigned long i;
        unsigned long num_sectors;
-       u32 sectorsize = sdev->dev->dev_root->sectorsize;
  
        while (!list_empty(&sdev->csum_list)) {
                sum = list_first_entry(&sdev->csum_list,
        if (!sum)
                return 0;
  
-       num_sectors = sum->len / sectorsize;
+       num_sectors = sum->len / sdev->sectorsize;
        for (i = 0; i < num_sectors; ++i) {
                if (sum->sums[i].bytenr == logical) {
                        memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@@ -1093,9 -1710,28 +1709,28 @@@ static int scrub_extent(struct scrub_de
  {
        int ret;
        u8 csum[BTRFS_CSUM_SIZE];
+       u32 blocksize;
+       if (flags & BTRFS_EXTENT_FLAG_DATA) {
+               blocksize = sdev->sectorsize;
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.data_extents_scrubbed++;
+               sdev->stat.data_bytes_scrubbed += len;
+               spin_unlock(&sdev->stat_lock);
+       } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               BUG_ON(sdev->nodesize != sdev->leafsize);
+               blocksize = sdev->nodesize;
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.tree_extents_scrubbed++;
+               sdev->stat.tree_bytes_scrubbed += len;
+               spin_unlock(&sdev->stat_lock);
+       } else {
+               blocksize = sdev->sectorsize;
+               BUG_ON(1);
+       }
  
        while (len) {
-               u64 l = min_t(u64, len, PAGE_SIZE);
+               u64 l = min_t(u64, len, blocksize);
                int have_csum = 0;
  
                if (flags & BTRFS_EXTENT_FLAG_DATA) {
                        if (have_csum == 0)
                                ++sdev->stat.no_csum;
                }
-               ret = scrub_page(sdev, logical, l, physical, flags, gen,
-                                mirror_num, have_csum ? csum : NULL, 0);
+               ret = scrub_pages(sdev, logical, l, physical, flags, gen,
+                                 mirror_num, have_csum ? csum : NULL, 0);
                if (ret)
                        return ret;
                len -= l;
@@@ -1170,6 -1806,11 +1805,11 @@@ static noinline_for_stack int scrub_str
        if (!path)
                return -ENOMEM;
  
+       /*
+        * work on the commit root. The related disk blocks are static
+        * as long as COW is applied. This means it is safe to rewrite
+        * them to repair disk errors without any race conditions.
+        */
        path->search_commit_root = 1;
        path->skip_locking = 1;
  
@@@ -1516,15 -2157,18 +2156,18 @@@ static noinline_for_stack int scrub_sup
        struct btrfs_device *device = sdev->dev;
        struct btrfs_root *root = device->dev_root;
  
+       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+               return -EIO;
        gen = root->fs_info->last_trans_committed;
  
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
-               if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+               if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
                        break;
  
-               ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
-                                BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+               ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+                                    BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
                if (ret)
                        return ret;
        }
@@@ -1583,10 -2227,30 +2226,30 @@@ int btrfs_scrub_dev(struct btrfs_root *
        /*
         * check some assumptions
         */
-       if (root->sectorsize != PAGE_SIZE ||
-           root->sectorsize != root->leafsize ||
-           root->sectorsize != root->nodesize) {
-               printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+       if (root->nodesize != root->leafsize) {
+               printk(KERN_ERR
+                      "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+                      root->nodesize, root->leafsize);
+               return -EINVAL;
+       }
+       if (root->nodesize > BTRFS_STRIPE_LEN) {
+               /*
+                * the way scrub is implemented, it cannot calculate
+                * checksums for nodes larger than BTRFS_STRIPE_LEN.
+                * Do not handle this case at all because it cannot
+                * occur in practice.
+                */
+               printk(KERN_ERR
+                      "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+                      root->nodesize, BTRFS_STRIPE_LEN);
+               return -EINVAL;
+       }
+       if (root->sectorsize != PAGE_SIZE) {
+               /* not supported for data w/o checksums */
+               printk(KERN_ERR
+                      "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
+                      root->sectorsize, (unsigned long long)PAGE_SIZE);
                return -EINVAL;
        }
  
        return ret;
  }
  
- int btrfs_scrub_pause(struct btrfs_root *root)
+ void btrfs_scrub_pause(struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);
-       return 0;
  }
  
- int btrfs_scrub_continue(struct btrfs_root *root)
+ void btrfs_scrub_continue(struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
        atomic_dec(&fs_info->scrub_pause_req);
        wake_up(&fs_info->scrub_pause_wait);
-       return 0;
  }
  
- int btrfs_scrub_pause_super(struct btrfs_root *root)
+ void btrfs_scrub_pause_super(struct btrfs_root *root)
  {
        down_write(&root->fs_info->scrub_super_lock);
-       return 0;
  }
  
- int btrfs_scrub_continue_super(struct btrfs_root *root)
+ void btrfs_scrub_continue_super(struct btrfs_root *root)
  {
        up_write(&root->fs_info->scrub_super_lock);
-       return 0;
  }
  
- int btrfs_scrub_cancel(struct btrfs_root *root)
+ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
  {
-       struct btrfs_fs_info *fs_info = root->fs_info;
  
        mutex_lock(&fs_info->scrub_lock);
        if (!atomic_read(&fs_info->scrubs_running)) {
        return 0;
  }
  
+ int btrfs_scrub_cancel(struct btrfs_root *root)
+ {
+       return __btrfs_scrub_cancel(root->fs_info);
+ }
+
  int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
        return 0;
  }
  int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
diff --combined fs/btrfs/super.c
@@@ -76,6 -76,9 +76,9 @@@ static const char *btrfs_decode_error(s
        case -EROFS:
                errstr = "Readonly filesystem";
                break;
+       case -EEXIST:
+               errstr = "Object already exists";
+               break;
        default:
                if (nbuf) {
                        if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
@@@ -116,6 -119,8 +119,8 @@@ static void btrfs_handle_error(struct b
        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
                sb->s_flags |= MS_RDONLY;
                printk(KERN_INFO "btrfs is forced readonly\n");
+               __btrfs_scrub_cancel(fs_info);
+ //            WARN_ON(1);
        }
  }
  
   * invokes the appropriate error response.
   */
  void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
-                    unsigned int line, int errno)
+                      unsigned int line, int errno, const char *fmt, ...)
  {
        struct super_block *sb = fs_info->sb;
        char nbuf[16];
        const char *errstr;
+       va_list args;
  
        /*
         * Special case: if the error is EROFS, and we're already
         * under MS_RDONLY, then it is safe here.
         */
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+               return;
+
+       va_start(args, fmt);
+       errstr = btrfs_decode_error(fs_info, errno, nbuf);
+       if (fmt) {
+               struct va_format vaf = {
+                       .fmt = fmt,
+                       .va = &args,
+               };
+               printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
+                       sb->s_id, function, line, errstr, &vaf);
+       } else {
+               printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
+                       sb->s_id, function, line, errstr);
+       }
+       /* Don't go through full error handling during mount */
+       if (sb->s_flags & MS_BORN) {
+               save_error_info(fs_info);
+               btrfs_handle_error(fs_info);
+       }
+       va_end(args);
+ }
+
+ const char *logtypes[] = {
+       "emergency",
+       "alert",
+       "critical",
+       "error",
+       "warning",
+       "notice",
+       "info",
+       "debug",
+ };
+
+ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+ {
+       struct super_block *sb = fs_info->sb;
+       char lvl[4];
+       struct va_format vaf;
+       va_list args;
+       const char *type = logtypes[4];
+       va_start(args, fmt);
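+       /* an optional "<N>" prefix selects the printk level and log type */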
+       if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
+               strncpy(lvl, fmt, 3);
+               lvl[3] = '\0';
+               type = logtypes[fmt[1] - '0'];
+               fmt += 3;
+       } else
+               *lvl = '\0';
+       vaf.fmt = fmt;
+       vaf.va = &args;
+       printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
+ }
+
+ /*
+  * We only mark the transaction aborted and then set the file system read-only.
+  * This will prevent new transactions from starting or trying to join this
+  * one.
+  *
+  * This means that error recovery at the call site is limited to freeing
+  * any local memory allocations and passing the error code up without
+  * further cleanup. The transaction should complete as it normally would
+  * in the call path but will return -EIO.
+  *
+  * We'll complete the cleanup in btrfs_end_transaction and
+  * btrfs_commit_transaction.
+  */
+ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root, const char *function,
+                              unsigned int line, int errno)
+ {
+       WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+       trans->aborted = errno;
+       /* Nothing used. The other threads that have joined this
+        * transaction may be able to continue. */
+       if (!trans->blocks_used) {
+               btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
                return;
+       }
+       trans->transaction->aborted = errno;
+       __btrfs_std_error(root->fs_info, function, line, errno, NULL);
+ }
+
+ /*
+  * __btrfs_panic decodes unexpected, fatal errors from the caller,
+  * issues an alert, and either panics or BUGs, depending on mount options.
+  */
+ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+                  unsigned int line, int errno, const char *fmt, ...)
+ {
+       char nbuf[16];
+       char *s_id = "<unknown>";
+       const char *errstr;
+       struct va_format vaf = { .fmt = fmt };
+       va_list args;
  
-       errstr = btrfs_decode_error(fs_info, errno, nbuf);
-       printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
-               sb->s_id, function, line, errstr);
-       save_error_info(fs_info);
+       if (fs_info)
+               s_id = fs_info->sb->s_id;
  
-       btrfs_handle_error(fs_info);
+       va_start(args, fmt);
+       vaf.va = &args;
+       errstr = btrfs_decode_error(fs_info, errno, nbuf);
+       if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+               panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
+                       s_id, function, line, &vaf, errstr);
+       printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
+              s_id, function, line, &vaf, errstr);
+       va_end(args);
+       /* Caller calls BUG() */
  }
  
  static void btrfs_put_super(struct super_block *sb)
@@@ -166,7 -278,7 +278,7 @@@ enum 
        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
        Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
        Opt_check_integrity, Opt_check_integrity_including_extent_data,
-       Opt_check_integrity_print_mask,
+       Opt_check_integrity_print_mask, Opt_fatal_errors,
        Opt_err,
  };
  
@@@ -206,12 -318,14 +318,14 @@@ static match_table_t tokens = 
        {Opt_check_integrity, "check_int"},
        {Opt_check_integrity_including_extent_data, "check_int_data"},
        {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+       {Opt_fatal_errors, "fatal_errors=%s"},
        {Opt_err, NULL},
  };
  
  /*
   * Regular mount options parser.  Everything that is needed only when
   * reading in a new superblock is parsed here.
+  * XXX JDM: This needs to be cleaned up for remount.
   */
  int btrfs_parse_options(struct btrfs_root *root, char *options)
  {
                        ret = -EINVAL;
                        goto out;
  #endif
+               case Opt_fatal_errors:
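+                       /*
+                        * "panic" panics on a fatal error; "bug", the
+                        * default, keeps the BUG() behaviour
+                        */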
+                       if (strcmp(args[0].from, "panic") == 0)
+                               btrfs_set_opt(info->mount_opt,
+                                             PANIC_ON_FATAL_ERROR);
+                       else if (strcmp(args[0].from, "bug") == 0)
+                               btrfs_clear_opt(info->mount_opt,
+                                             PANIC_ON_FATAL_ERROR);
+                       else {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@@ -629,6 -755,7 +755,6 @@@ static int btrfs_fill_super(struct supe
                            void *data, int silent)
  {
        struct inode *inode;
 -      struct dentry *root_dentry;
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_key key;
        int err;
                goto fail_close;
        }
  
 -      root_dentry = d_alloc_root(inode);
 -      if (!root_dentry) {
 -              iput(inode);
 +      sb->s_root = d_make_root(inode);
 +      if (!sb->s_root) {
                err = -ENOMEM;
                goto fail_close;
        }
  
 -      sb->s_root = root_dentry;
 -
        save_mount_options(sb, data);
        cleancache_init_fs(sb);
        sb->s_flags |= MS_ACTIVE;
@@@ -762,6 -892,8 +888,8 @@@ static int btrfs_show_options(struct se
                seq_puts(seq, ",inode_cache");
        if (btrfs_test_opt(root, SKIP_BALANCE))
                seq_puts(seq, ",skip_balance");
+       if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+               seq_puts(seq, ",fatal_errors=panic");
        return 0;
  }
  
@@@ -995,11 -1127,20 +1123,20 @@@ static int btrfs_remount(struct super_b
  {
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *root = fs_info->tree_root;
+       unsigned old_flags = sb->s_flags;
+       unsigned long old_opts = fs_info->mount_opt;
+       unsigned long old_compress_type = fs_info->compress_type;
+       u64 old_max_inline = fs_info->max_inline;
+       u64 old_alloc_start = fs_info->alloc_start;
+       int old_thread_pool_size = fs_info->thread_pool_size;
+       unsigned int old_metadata_ratio = fs_info->metadata_ratio;
        int ret;
  
        ret = btrfs_parse_options(root, data);
-       if (ret)
-               return -EINVAL;
+       if (ret) {
+               ret = -EINVAL;
+               goto restore;
+       }
  
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
        if (*flags & MS_RDONLY) {
                sb->s_flags |= MS_RDONLY;
  
-               ret =  btrfs_commit_super(root);
-               WARN_ON(ret);
+               ret = btrfs_commit_super(root);
+               if (ret)
+                       goto restore;
        } else {
-               if (fs_info->fs_devices->rw_devices == 0)
-                       return -EACCES;
+               if (fs_info->fs_devices->rw_devices == 0) {
+                       ret = -EACCES;
+                       goto restore;
+               }
  
-               if (btrfs_super_log_root(fs_info->super_copy) != 0)
-                       return -EINVAL;
+               if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+                       ret = -EINVAL;
+                       goto restore;
+               }
  
                ret = btrfs_cleanup_fs_roots(fs_info);
-               WARN_ON(ret);
+               if (ret)
+                       goto restore;
  
                /* recover relocation */
                ret = btrfs_recover_relocation(root);
-               WARN_ON(ret);
+               if (ret)
+                       goto restore;
  
                sb->s_flags &= ~MS_RDONLY;
        }
  
        return 0;
+ restore:
+       /* We've hit an error - don't reset MS_RDONLY */
+       if (sb->s_flags & MS_RDONLY)
+               old_flags |= MS_RDONLY;
+       sb->s_flags = old_flags;
+       fs_info->mount_opt = old_opts;
+       fs_info->compress_type = old_compress_type;
+       fs_info->max_inline = old_max_inline;
+       fs_info->alloc_start = old_alloc_start;
+       fs_info->thread_pool_size = old_thread_pool_size;
+       fs_info->metadata_ratio = old_metadata_ratio;
+       return ret;
  }
  
  /* Used to sort the devices by max_avail(descending sort) */
@@@ -1356,9 -1515,7 +1511,7 @@@ static int __init init_btrfs_fs(void
        if (err)
                return err;
  
-       err = btrfs_init_compress();
-       if (err)
-               goto free_sysfs;
+       btrfs_init_compress();
  
        err = btrfs_init_cachep();
        if (err)
        if (err)
                goto unregister_ioctl;
  
+       btrfs_init_lockdep();
        printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
        return 0;
  
@@@ -1399,7 -1558,6 +1554,6 @@@ free_cachep
        btrfs_destroy_cachep();
  free_compress:
        btrfs_exit_compress();
- free_sysfs:
        btrfs_exit_sysfs();
        return err;
  }