diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d247fc0..75533ad 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -726,14 +726,6 @@ next:
        start = last_end + 1;
        if (start <= end && state && !need_resched())
                goto hit_next;
-       goto search_again;
-
-out:
-       spin_unlock(&tree->lock);
-       if (prealloc)
-               free_extent_state(prealloc);
-
-       return 0;
 
 search_again:
        if (start > end)
@@ -742,6 +734,14 @@ search_again:
        if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
+
+out:
+       spin_unlock(&tree->lock);
+       if (prealloc)
+               free_extent_state(prealloc);
+
+       return 0;
+
 }
 
 static void wait_on_state(struct extent_io_tree *tree,
@@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        bits |= EXTENT_FIRST_DELALLOC;
 again:
        if (!prealloc && gfpflags_allow_blocking(mask)) {
+               /*
+                * Don't worry about allocation failure here because we might
+                * end up not needing the pre-allocated extent state at all,
+                * which is the case if the tree only contains extent states
+                * that cover our input range and don't cover any other range.
+                * If we end up needing a new extent state we allocate it later.
+                */
                prealloc = alloc_extent_state(mask);
-               BUG_ON(!prealloc);
        }
 
        spin_lock(&tree->lock);
@@ -1037,7 +1043,13 @@ hit_next:
                goto out;
        }
 
-       goto search_again;
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       if (gfpflags_allow_blocking(mask))
+               cond_resched();
+       goto again;
 
 out:
        spin_unlock(&tree->lock);
@@ -1046,13 +1058,6 @@ out:
 
        return err;
 
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       if (gfpflags_allow_blocking(mask))
-               cond_resched();
-       goto again;
 }
 
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * @bits:      the bits to set in this range
  * @clear_bits:        the bits to clear in this range
  * @cached_state:      state that we're going to cache
- * @mask:      the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
  * already in this range they are set with the given bit and cleared of the
  * clear_bits.  This is only meant to be used by things that are mergeable, ie
  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
  * boundary bits like LOCK.
+ *
+ * All allocations are done with GFP_NOFS.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                       unsigned bits, unsigned clear_bits,
-                      struct extent_state **cached_state, gfp_t mask)
+                      struct extent_state **cached_state)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
-       if (!prealloc && gfpflags_allow_blocking(mask)) {
+       if (!prealloc) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
@@ -1106,7 +1112,7 @@ again:
                 * extent state allocations are needed. We'll only know this
                 * after locking the tree.
                 */
-               prealloc = alloc_extent_state(mask);
+               prealloc = alloc_extent_state(GFP_NOFS);
                if (!prealloc && !first_iteration)
                        return -ENOMEM;
        }
@@ -1263,7 +1269,13 @@ hit_next:
                goto out;
        }
 
-       goto search_again;
+search_again:
+       if (start > end)
+               goto out;
+       spin_unlock(&tree->lock);
+       cond_resched();
+       first_iteration = false;
+       goto again;
 
 out:
        spin_unlock(&tree->lock);
@@ -1271,21 +1283,11 @@ out:
                free_extent_state(prealloc);
 
        return err;
-
-search_again:
-       if (start > end)
-               goto out;
-       spin_unlock(&tree->lock);
-       if (gfpflags_allow_blocking(mask))
-               cond_resched();
-       first_iteration = false;
-       goto again;
 }
 
 /* wrappers around set/clear extent bit */
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                          unsigned bits, gfp_t mask,
-                          struct extent_changeset *changeset)
+                          unsigned bits, struct extent_changeset *changeset)
 {
        /*
         * We don't support EXTENT_LOCKED yet, as current changeset will
@@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
         */
        BUG_ON(bits & EXTENT_LOCKED);
 
-       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
                                changeset);
 }
 
@@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                            unsigned bits, gfp_t mask,
-                            struct extent_changeset *changeset)
+               unsigned bits, struct extent_changeset *changeset)
 {
        /*
         * Don't support EXTENT_LOCKED case, same reason as
@@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
         */
        BUG_ON(bits & EXTENT_LOCKED);
 
-       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
                                  changeset);
 }
 
@@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
        set_state_failrec(failure_tree, rec->start, NULL);
        ret = clear_extent_bits(failure_tree, rec->start,
                                rec->start + rec->len - 1,
-                               EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+                               EXTENT_LOCKED | EXTENT_DIRTY);
        if (ret)
                err = ret;
 
        ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
                                rec->start + rec->len - 1,
-                               EXTENT_DAMAGED, GFP_NOFS);
+                               EXTENT_DAMAGED);
        if (ret && !err)
                err = ret;
 
@@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
        bio->bi_iter.bi_size = 0;
        map_length = length;
 
+       /*
+        * Avoid races with device replace and make sure our bbio has devices
+        * associated with its stripes that don't go away while we are doing the
+        * read repair operation.
+        */
+       btrfs_bio_counter_inc_blocked(fs_info);
        ret = btrfs_map_block(fs_info, WRITE, logical,
                              &map_length, &bbio, mirror_num);
        if (ret) {
+               btrfs_bio_counter_dec(fs_info);
                bio_put(bio);
                return -EIO;
        }
@@ -2036,6 +2044,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
        dev = bbio->stripes[mirror_num-1].dev;
        btrfs_put_bbio(bbio);
        if (!dev || !dev->bdev || !dev->writeable) {
+               btrfs_bio_counter_dec(fs_info);
                bio_put(bio);
                return -EIO;
        }
@@ -2044,6 +2053,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 
        if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
                /* try to remap that extent elsewhere? */
+               btrfs_bio_counter_dec(fs_info);
                bio_put(bio);
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
@@ -2053,6 +2063,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                                  btrfs_ino(inode), start,
                                  rcu_str_deref(dev->name), sector);
+       btrfs_bio_counter_dec(fs_info);
        bio_put(bio);
        return 0;
 }
@@ -2232,13 +2243,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
 
                /* set the bits in the private failure tree */
                ret = set_extent_bits(failure_tree, start, end,
-                                       EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+                                       EXTENT_LOCKED | EXTENT_DIRTY);
                if (ret >= 0)
                        ret = set_state_failrec(failure_tree, start, failrec);
                /* set the bits in the inode's tree */
                if (ret >= 0)
-                       ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
-                                               GFP_NOFS);
+                       ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
                if (ret < 0) {
                        kfree(failrec);
                        return ret;
@@ -3200,14 +3210,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
        return ret;
 }
 
-static noinline void update_nr_written(struct page *page,
-                                     struct writeback_control *wbc,
-                                     unsigned long nr_written)
+static void update_nr_written(struct page *page, struct writeback_control *wbc,
+                             unsigned long nr_written)
 {
        wbc->nr_to_write -= nr_written;
-       if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
-           wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
-               page->mapping->writeback_index = page->index + nr_written;
 }
 
 /*
@@ -3368,6 +3374,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 
        while (cur <= end) {
                u64 em_end;
+               unsigned long max_nr;
+
                if (cur >= i_size) {
                        if (tree->ops && tree->ops->writepage_end_io_hook)
                                tree->ops->writepage_end_io_hook(page, cur,
@@ -3423,32 +3431,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
                        continue;
                }
 
-               if (tree->ops && tree->ops->writepage_io_hook) {
-                       ret = tree->ops->writepage_io_hook(page, cur,
-                                               cur + iosize - 1);
-               } else {
-                       ret = 0;
+               max_nr = (i_size >> PAGE_SHIFT) + 1;
+
+               set_range_writeback(tree, cur, cur + iosize - 1);
+               if (!PageWriteback(page)) {
+                       btrfs_err(BTRFS_I(inode)->root->fs_info,
+                                 "page %lu not writeback, cur %llu end %llu",
+                                 page->index, cur, end);
                }
-               if (ret) {
-                       SetPageError(page);
-               } else {
-                       unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1;
 
-                       set_range_writeback(tree, cur, cur + iosize - 1);
-                       if (!PageWriteback(page)) {
-                               btrfs_err(BTRFS_I(inode)->root->fs_info,
-                                          "page %lu not writeback, cur %llu end %llu",
-                                      page->index, cur, end);
-                       }
+               ret = submit_extent_page(write_flags, tree, wbc, page,
+                                        sector, iosize, pg_offset,
+                                        bdev, &epd->bio, max_nr,
+                                        end_bio_extent_writepage,
+                                        0, 0, 0, false);
+               if (ret)
+                       SetPageError(page);
 
-                       ret = submit_extent_page(write_flags, tree, wbc, page,
-                                                sector, iosize, pg_offset,
-                                                bdev, &epd->bio, max_nr,
-                                                end_bio_extent_writepage,
-                                                0, 0, 0, false);
-                       if (ret)
-                               SetPageError(page);
-               }
                cur = cur + iosize;
                pg_offset += iosize;
                nr++;
@@ -3920,12 +3919,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        struct inode *inode = mapping->host;
        int ret = 0;
        int done = 0;
-       int err = 0;
        int nr_to_write_done = 0;
        struct pagevec pvec;
        int nr_pages;
        pgoff_t index;
        pgoff_t end;            /* Inclusive */
+       pgoff_t done_index;
+       int range_whole = 0;
        int scanned = 0;
        int tag;
 
@@ -3948,6 +3948,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
        } else {
                index = wbc->range_start >> PAGE_SHIFT;
                end = wbc->range_end >> PAGE_SHIFT;
+               if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+                       range_whole = 1;
                scanned = 1;
        }
        if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3957,6 +3959,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 retry:
        if (wbc->sync_mode == WB_SYNC_ALL)
                tag_pages_for_writeback(mapping, index, end);
+       done_index = index;
        while (!done && !nr_to_write_done && (index <= end) &&
               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -3966,6 +3969,7 @@ retry:
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
+                       done_index = page->index;
                        /*
                         * At this point we hold neither mapping->tree_lock nor
                         * lock on the page itself: the page may be truncated or
@@ -4007,8 +4011,20 @@ retry:
                                unlock_page(page);
                                ret = 0;
                        }
-                       if (!err && ret < 0)
-                               err = ret;
+                       if (ret < 0) {
+                               /*
+                                * done_index is set past this page,
+                                * so media errors will not choke
+                                * background writeout for the entire
+                                * file. This has consequences for
+                                * range_cyclic semantics (i.e. it may
+                                * not be suitable for data integrity
+                                * writeout).
+                                */
+                               done_index = page->index + 1;
+                               done = 1;
+                               break;
+                       }
 
                        /*
                         * the filesystem may choose to bump up nr_to_write.
@@ -4020,7 +4036,7 @@ retry:
                pagevec_release(&pvec);
                cond_resched();
        }
-       if (!scanned && !done && !err) {
+       if (!scanned && !done) {
                /*
                 * We hit the last page and there is more work to be done: wrap
                 * back to the start of the file
@@ -4029,8 +4045,12 @@ retry:
                index = 0;
                goto retry;
        }
+
+       if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
+               mapping->writeback_index = done_index;
+
        btrfs_add_delayed_iput(inode);
-       return err;
+       return ret;
 }
 
 static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4379,8 +4399,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        if (ret < 0) {
                btrfs_free_path(path);
                return ret;
+       } else {
+               WARN_ON(!ret);
+               if (ret == 1)
+                       ret = 0;
        }
-       WARN_ON(!ret);
+
        path->slots[0]--;
        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
        found_type = found_key.type;
@@ -4591,7 +4615,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
                if (mapped)
                        spin_unlock(&page->mapping->private_lock);
 
-               /* One for when we alloced the page */
+               /* One for when we allocated the page */
                put_page(page);
        } while (index != 0);
 }
@@ -4704,16 +4728,16 @@ err:
 }
 
 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-                                               u64 start)
+                                               u64 start, u32 nodesize)
 {
        unsigned long len;
 
        if (!fs_info) {
                /*
                 * Called only from tests that don't always have a fs_info
-                * available, but we know that nodesize is 4096
+                * available
                 */
-               len = 4096;
+               len = nodesize;
        } else {
                len = fs_info->tree_root->nodesize;
        }
@@ -4809,7 +4833,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
-                                              u64 start)
+                                       u64 start, u32 nodesize)
 {
        struct extent_buffer *eb, *exists = NULL;
        int ret;
@@ -4817,12 +4841,12 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
        eb = find_extent_buffer(fs_info, start);
        if (eb)
                return eb;
-       eb = alloc_dummy_extent_buffer(fs_info, start);
+       eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
        if (!eb)
                return NULL;
        eb->fs_info = fs_info;
 again:
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+       ret = radix_tree_preload(GFP_NOFS);
        if (ret)
                goto free_eb;
        spin_lock(&fs_info->buffer_lock);
@@ -4868,18 +4892,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
        int uptodate = 1;
        int ret;
 
+       if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
+               btrfs_err(fs_info, "bad tree block start %llu", start);
+               return ERR_PTR(-EINVAL);
+       }
+
        eb = find_extent_buffer(fs_info, start);
        if (eb)
                return eb;
 
        eb = __alloc_extent_buffer(fs_info, start, len);
        if (!eb)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
-               if (!p)
+               if (!p) {
+                       exists = ERR_PTR(-ENOMEM);
                        goto free_eb;
+               }
 
                spin_lock(&mapping->private_lock);
                if (PagePrivate(p)) {
@@ -4923,9 +4954,11 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 again:
-       ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
-       if (ret)
+       ret = radix_tree_preload(GFP_NOFS);
+       if (ret) {
+               exists = ERR_PTR(ret);
                goto free_eb;
+       }
 
        spin_lock(&fs_info->buffer_lock);
        ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -5309,6 +5342,11 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
        return ret;
 }
 
+/*
+ * Return 0 if the item is found within a page.
+ * Return 1 if the item spans two pages.
+ * Return -EINVAL otherwise.
+ */
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
                               unsigned long min_len, char **map,
                               unsigned long *map_start,
@@ -5323,7 +5361,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
                PAGE_SHIFT;
 
        if (i != end_i)
-               return -EINVAL;
+               return 1;
 
        if (i == 0) {
                offset = start_offset;
@@ -5751,7 +5789,7 @@ int try_release_extent_buffer(struct page *page)
        struct extent_buffer *eb;
 
        /*
-        * We need to make sure noboody is attaching this page to an eb right
+        * We need to make sure nobody is attaching this page to an eb right
         * now.
         */
        spin_lock(&page->mapping->private_lock);