Merge branch 'upstream' of git://git.infradead.org/users/pcmoore/audit
[cascardo/linux.git] / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
28 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
29                       *root, struct btrfs_path *path, int level);
30 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
31                       *root, struct btrfs_key *ins_key,
32                       struct btrfs_path *path, int data_size, int extend);
33 static int push_node_left(struct btrfs_trans_handle *trans,
34                           struct btrfs_root *root, struct extent_buffer *dst,
35                           struct extent_buffer *src, int empty);
36 static int balance_node_right(struct btrfs_trans_handle *trans,
37                               struct btrfs_root *root,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41                     int level, int slot);
42 static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43                                  struct extent_buffer *eb);
44
45 struct btrfs_path *btrfs_alloc_path(void)
46 {
47         struct btrfs_path *path;
48         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
49         return path;
50 }
51
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		/* skip levels with no buffer or no lock held */
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		/* record the new blocking state so it can be undone later */
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}
69
/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/* make 'held' blocking first so lockdep sees a consistent
		 * ordering while we re-spin the path locks below */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* retake spinning locks from the top of the tree downwards */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
105
106 /* this also releases the path */
107 void btrfs_free_path(struct btrfs_path *p)
108 {
109         if (!p)
110                 return;
111         btrfs_release_path(p);
112         kmem_cache_free(btrfs_path_cachep, p);
113 }
114
115 /*
116  * path release drops references on the extent buffers in the path
117  * and it drops any locks held by this path
118  *
119  * It is safe to call this on paths that no locks or extent buffers held.
120  */
121 noinline void btrfs_release_path(struct btrfs_path *p)
122 {
123         int i;
124
125         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
126                 p->slots[i] = 0;
127                 if (!p->nodes[i])
128                         continue;
129                 if (p->locks[i]) {
130                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
131                         p->locks[i] = 0;
132                 }
133                 free_extent_buffer(p->nodes[i]);
134                 p->nodes[i] = NULL;
135         }
136 }
137
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* got a reference before the buffer could die */
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
171
172 /* loop around taking references on and locking the root node of the
173  * tree until you end up with a lock on the root.  A locked buffer
174  * is returned, with a reference held.
175  */
176 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
177 {
178         struct extent_buffer *eb;
179
180         while (1) {
181                 eb = btrfs_root_node(root);
182                 btrfs_tree_lock(eb);
183                 if (eb == root->node)
184                         break;
185                 btrfs_tree_unlock(eb);
186                 free_extent_buffer(eb);
187         }
188         return eb;
189 }
190
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197         struct extent_buffer *eb;
198
199         while (1) {
200                 eb = btrfs_root_node(root);
201                 btrfs_tree_read_lock(eb);
202                 if (eb == root->node)
203                         break;
204                 btrfs_tree_read_unlock(eb);
205                 free_extent_buffer(eb);
206         }
207         return eb;
208 }
209
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	/* only track roots that ask for it, and only list each root once */
	if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
	    list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}
224
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference counted roots must only be copied within the running
	 * transaction they were last modified in */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* first key of the buffer: item key for leaves, node key otherwise */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the old root's contents, then rewrite the header so the
	 * copy describes its own location, generation and owner */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* bump refs on everything the copy points to; the reloc tree uses
	 * full backrefs (last arg 1), everything else normal refs */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
283
/*
 * Operation types recorded in the tree mod log.  The REMOVE_WHILE_*
 * variants distinguish key removals that happen as a side effect of
 * freeing a whole node or of moving keys within a node.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

/* parameters of a MOD_LOG_MOVE_KEYS operation (source slot is tm->slot) */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

/* description of the old root for a MOD_LOG_ROOT_REPLACE operation */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/*
 * One entry in the tree mod log rb tree, keyed by (index, seq).
 * Only the fields relevant to a given op are meaningful.
 */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
326
/* thin wrappers around the tree mod log rwlock; searches take the read
 * side, insertions and purges take the write side */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
346
/*
 * Pull a new tree mod seq number for our operation.  The counter is a
 * 64bit atomic, so sequence numbers are unique and monotonic.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
354
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/* lock order: tree_mod_log_lock (write) outside tree_mod_seq_lock */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
377
/*
 * Drop the blocker @elem from the tree mod log's blocker list.  If no
 * older blocker remains, purge every log element whose sequence number
 * is at or below the new minimum.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem was never registered via btrfs_get_tree_mod_seq */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor before erasing the current node */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
427
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	/* assign the element's sequence number at insertion time */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		/* note: larger (index, seq) descends left here; the lookup
		 * in __tree_mod_log_search walks with the same orientation */
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
471
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	/* no blockers registered -> nobody can replay, skip logging */
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaf changes are never logged, only node (pointer) changes */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	/* re-check under the lock: the last blocker may just have gone */
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	return 0;
}
494
495 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
496 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
497                                     struct extent_buffer *eb)
498 {
499         smp_mb();
500         if (list_empty(&(fs_info)->tree_mod_seq_list))
501                 return 0;
502         if (eb && btrfs_header_level(eb) == 0)
503                 return 0;
504
505         return 1;
506 }
507
/*
 * Allocate and fill a tree mod log element for a single-key operation on
 * @slot of node @eb.  Returns NULL on allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	/* for ADD the slot has no old content worth recording */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	/* mark as not yet linked into the rb tree (checked on error paths) */
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
530
/*
 * Log a single-key operation (@op on @slot of @eb) in the tree mod log.
 * Returns 0 on success (or when logging is not needed), negative errno
 * on failure.
 */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* cheap lockless check before paying for the allocation */
	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* re-check with the write lock; on "don't log" it is not held */
	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log returned 0 with the write lock held */
	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}
558
/*
 * Log a move of @nr_items keys from @src_slot to @dst_slot within node @eb.
 * Keys overwritten by a move towards the front of the buffer are logged
 * as individual removals first.  Returns 0 on success, negative errno on
 * failure (nothing is left in the log on failure).
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;	/* tracks whether free_tms must drop the write lock */

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* pre-allocate removal records for the slots the move overwrites */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* on success this leaves the tree_mod_log write lock held */
	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	/* unwind: pull already-inserted elements back out of the rb tree */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
633
/*
 * Insert @nritems pre-allocated removal elements into the log, highest
 * slot first.  On failure, elements inserted so far are erased again and
 * the error is returned; the caller still owns (and frees) all elements.
 * Must be called with the tree_mod_log write lock held.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* roll back the insertions that already succeeded */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
654
/*
 * Log the replacement of tree root @old_root by @new_root.  When
 * @log_removal is set and the old root is a node, every pointer in it is
 * additionally logged as removed-while-freeing so the old state can be
 * rewound.  Returns 0 on success, negative errno on failure.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaves are never logged, so only record pointers for nodes */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* root replace elements are indexed by the *new* root's logical */
	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* on success this leaves the tree_mod_log write lock held */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
725
/*
 * Find a tree mod log element for the block at @start with seq >= @min_seq.
 * If @smallest is set, return the matching element with the smallest such
 * seq, otherwise the one with the largest.  Returns NULL if none exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		/* same tree orientation as __tree_mod_log_insert */
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on min_seq, cannot do better */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
768
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
780
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
791
/*
 * Log the copy of @nr_items pointers from @src (at @src_offset) to @dst
 * (at @dst_offset): each copied slot is recorded as a REMOVE on the
 * source and an ADD on the destination.  Returns 0 on success, negative
 * errno on failure (the log is left unchanged on failure).
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;	/* tracks whether free_tms must drop the write lock */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies carry no node pointers, nothing to log */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array, first half for ADD elements, second half for REMOVE */
	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* on success this leaves the tree_mod_log write lock held */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	/* unwind: pull already-inserted elements back out of the rb tree */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}
862
863 static inline void
864 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
865                      int dst_offset, int src_offset, int nr_items)
866 {
867         int ret;
868         ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
869                                        nr_items, GFP_NOFS);
870         BUG_ON(ret < 0);
871 }
872
873 static noinline void
874 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
875                           struct extent_buffer *eb, int slot, int atomic)
876 {
877         int ret;
878
879         ret = tree_mod_log_insert_key(fs_info, eb, slot,
880                                         MOD_LOG_KEY_REPLACE,
881                                         atomic ? GFP_ATOMIC : GFP_NOFS);
882         BUG_ON(ret < 0);
883 }
884
885 static noinline int
886 tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
887 {
888         struct tree_mod_elem **tm_list = NULL;
889         int nritems = 0;
890         int i;
891         int ret = 0;
892
893         if (btrfs_header_level(eb) == 0)
894                 return 0;
895
896         if (!tree_mod_need_log(fs_info, NULL))
897                 return 0;
898
899         nritems = btrfs_header_nritems(eb);
900         tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
901                           GFP_NOFS);
902         if (!tm_list)
903                 return -ENOMEM;
904
905         for (i = 0; i < nritems; i++) {
906                 tm_list[i] = alloc_tree_mod_elem(eb, i,
907                     MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
908                 if (!tm_list[i]) {
909                         ret = -ENOMEM;
910                         goto free_tms;
911                 }
912         }
913
914         if (tree_mod_dont_log(fs_info, eb))
915                 goto free_tms;
916
917         ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
918         tree_mod_log_write_unlock(fs_info);
919         if (ret)
920                 goto free_tms;
921         kfree(tm_list);
922
923         return 0;
924
925 free_tms:
926         for (i = 0; i < nritems; i++)
927                 kfree(tm_list[i]);
928         kfree(tm_list);
929
930         return ret;
931 }
932
933 static noinline void
934 tree_mod_log_set_root_pointer(struct btrfs_root *root,
935                               struct extent_buffer *new_root_node,
936                               int log_removal)
937 {
938         int ret;
939         ret = tree_mod_log_insert_root(root->fs_info, root->node,
940                                        new_root_node, GFP_NOFS, log_removal);
941         BUG_ON(ret < 0);
942 }
943
944 /*
945  * check if the tree block can be shared by multiple trees
946  */
947 int btrfs_block_can_be_shared(struct btrfs_root *root,
948                               struct extent_buffer *buf)
949 {
950         /*
951          * Tree blocks not in refernece counted trees and tree roots
952          * are never shared. If a block was allocated after the last
953          * snapshot and the block was not allocated by tree relocation,
954          * we know the block is not shared.
955          */
956         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
957             buf != root->node && buf != root->commit_root &&
958             (btrfs_header_generation(buf) <=
959              btrfs_root_last_snapshot(&root->root_item) ||
960              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
961                 return 1;
962 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
963         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
964             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
965                 return 1;
966 #endif
967         return 0;
968 }
969
/*
 * Adjust extent back references when @buf is about to be replaced by its
 * COW copy @cow.  Sets *last_ref to 1 when @buf drops to its last
 * reference so the caller can free it.  Returns 0 on success or a
 * negative errno.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* a shareable block must have at least one ref;
			 * zero means on-disk corruption */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		/* unshareable: ref count is implicitly 1 */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* block stays shared: convert to full backrefs if needed */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* last reference: move refs to the cow copy, drop the old */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root, buf);
		*last_ref = 1;
	}
	return 0;
}
1073
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* caller passed its own buffer as *cow_ret: unlock it when done */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* the block's first key seeds the disk key for the new allocation */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* copy the contents, then rewrite the header for the new location */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	/* fix up extent backrefs for the old and new copy */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret)
			return ret;
	}

	if (buf == root->node) {
		/* cowing the root: swap the root pointer itself */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* non-root: repoint the parent's slot at the new copy */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			/* old block goes away: log its contents first */
			ret = tree_mod_log_free_eb(root->fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
1206
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	/* time_seq == 0 means no rewind was requested */
	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). this has the index of the *new*
	 * root, making it the very first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* keep walking backwards through older replaced roots */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1260
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).  Called on a private copy of the buffer; the header nritems
 * is updated to reflect the rewound state at the end.
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* undo a removal: put the saved key pointer back */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* undo a replace: restore the old key in place */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* undo a move: copy the key pointers back */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		/* a different index means we left this buffer's log entries */
		if (tm->index != first_tm->index)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}
1336
/*
 * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* time_seq == 0 means current state: nothing to rewind */
	if (!time_seq)
		return eb;

	/* leaves are not tracked in the tree mod log */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * the block was freed since time_seq: rebuild its old
		 * contents purely from the log on a fresh dummy buffer.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(eb->start,
						fs_info->tree_root->nodesize);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* start from a private copy of the current contents */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* hand the lock and reference over from eb to eb_rewin */
	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}
1399
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* the root was replaced; rewind starts at the old root */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
	} else {
		logical = eb_root->start;
	}

	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* the old root block is read from disk and cloned */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(root, logical, 0);
		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
			free_extent_buffer(old);
			btrfs_warn(root->fs_info,
				"failed to read tree block %llu from get_old_root", logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* old root was fully logged while freeing; rebuild from log */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
	} else {
		/* current root, possibly with newer entries to rewind */
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		/*
		 * NOTE(review): eb_root was released via free_extent_buffer()
		 * in both old_root branches above, yet it is dereferenced
		 * here — looks like a potential use-after-free; confirm
		 * whether another reference keeps eb_root alive.
		 */
		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));

	return eb;
}
1474
1475 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1476 {
1477         struct tree_mod_elem *tm;
1478         int level;
1479         struct extent_buffer *eb_root = btrfs_root_node(root);
1480
1481         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1482         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1483                 level = tm->old_root.level;
1484         } else {
1485                 level = btrfs_header_level(eb_root);
1486         }
1487         free_extent_buffer(eb_root);
1488
1489         return level;
1490 }
1491
1492 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1493                                    struct btrfs_root *root,
1494                                    struct extent_buffer *buf)
1495 {
1496         if (btrfs_test_is_dummy_root(root))
1497                 return 0;
1498
1499         /* ensure we can see the force_cow */
1500         smp_rmb();
1501
1502         /*
1503          * We do not need to cow a block if
1504          * 1) this block is not created or changed in this transaction;
1505          * 2) this block does not belong to TREE_RELOC tree;
1506          * 3) the root is not forced COW.
1507          *
1508          * What is forced COW:
1509          *    when we create snapshot during commiting the transaction,
1510          *    after we've finished coping src root, we must COW the shared
1511          *    block to ensure the metadata consistency.
1512          */
1513         if (btrfs_header_generation(buf) == trans->transid &&
1514             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1515             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1516               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1517             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1518                 return 0;
1519         return 1;
1520 }
1521
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	/* COWing outside the running transaction indicates a logic bug */
	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	/* allocation hint: round down to the containing 1GB boundary */
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1562
1563 /*
1564  * helper function for defrag to decide if two blocks pointed to by a
1565  * node are actually close by
1566  */
1567 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1568 {
1569         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1570                 return 1;
1571         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1572                 return 1;
1573         return 0;
1574 }
1575
1576 /*
1577  * compare two keys in a memcmp fashion
1578  */
1579 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1580 {
1581         struct btrfs_key k1;
1582
1583         btrfs_disk_key_to_cpu(&k1, disk);
1584
1585         return btrfs_comp_cpu_keys(&k1, k2);
1586 }
1587
1588 /*
1589  * same as comp_keys only with two btrfs_key's
1590  */
1591 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1592 {
1593         if (k1->objectid > k2->objectid)
1594                 return 1;
1595         if (k1->objectid < k2->objectid)
1596                 return -1;
1597         if (k1->type > k2->type)
1598                 return 1;
1599         if (k1->type < k2->type)
1600                 return -1;
1601         if (k1->offset > k2->offset)
1602                 return 1;
1603         if (k1->offset < k2->offset)
1604                 return -1;
1605         return 0;
1606 }
1607
1608 /*
1609  * this is used by the defrag code to go through all the
1610  * leaves pointed to by a node and reallocate them so that
1611  * disk order is close to key order
1612  */
1613 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1614                        struct btrfs_root *root, struct extent_buffer *parent,
1615                        int start_slot, u64 *last_ret,
1616                        struct btrfs_key *progress)
1617 {
1618         struct extent_buffer *cur;
1619         u64 blocknr;
1620         u64 gen;
1621         u64 search_start = *last_ret;
1622         u64 last_block = 0;
1623         u64 other;
1624         u32 parent_nritems;
1625         int end_slot;
1626         int i;
1627         int err = 0;
1628         int parent_level;
1629         int uptodate;
1630         u32 blocksize;
1631         int progress_passed = 0;
1632         struct btrfs_disk_key disk_key;
1633
1634         parent_level = btrfs_header_level(parent);
1635
1636         WARN_ON(trans->transaction != root->fs_info->running_transaction);
1637         WARN_ON(trans->transid != root->fs_info->generation);
1638
1639         parent_nritems = btrfs_header_nritems(parent);
1640         blocksize = root->nodesize;
1641         end_slot = parent_nritems;
1642
1643         if (parent_nritems == 1)
1644                 return 0;
1645
1646         btrfs_set_lock_blocking(parent);
1647
1648         for (i = start_slot; i < end_slot; i++) {
1649                 int close = 1;
1650
1651                 btrfs_node_key(parent, &disk_key, i);
1652                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1653                         continue;
1654
1655                 progress_passed = 1;
1656                 blocknr = btrfs_node_blockptr(parent, i);
1657                 gen = btrfs_node_ptr_generation(parent, i);
1658                 if (last_block == 0)
1659                         last_block = blocknr;
1660
1661                 if (i > 0) {
1662                         other = btrfs_node_blockptr(parent, i - 1);
1663                         close = close_blocks(blocknr, other, blocksize);
1664                 }
1665                 if (!close && i < end_slot - 2) {
1666                         other = btrfs_node_blockptr(parent, i + 1);
1667                         close = close_blocks(blocknr, other, blocksize);
1668                 }
1669                 if (close) {
1670                         last_block = blocknr;
1671                         continue;
1672                 }
1673
1674                 cur = btrfs_find_tree_block(root, blocknr);
1675                 if (cur)
1676                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1677                 else
1678                         uptodate = 0;
1679                 if (!cur || !uptodate) {
1680                         if (!cur) {
1681                                 cur = read_tree_block(root, blocknr, gen);
1682                                 if (!cur || !extent_buffer_uptodate(cur)) {
1683                                         free_extent_buffer(cur);
1684                                         return -EIO;
1685                                 }
1686                         } else if (!uptodate) {
1687                                 err = btrfs_read_buffer(cur, gen);
1688                                 if (err) {
1689                                         free_extent_buffer(cur);
1690                                         return err;
1691                                 }
1692                         }
1693                 }
1694                 if (search_start == 0)
1695                         search_start = last_block;
1696
1697                 btrfs_tree_lock(cur);
1698                 btrfs_set_lock_blocking(cur);
1699                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1700                                         &cur, search_start,
1701                                         min(16 * blocksize,
1702                                             (end_slot - i) * blocksize));
1703                 if (err) {
1704                         btrfs_tree_unlock(cur);
1705                         free_extent_buffer(cur);
1706                         break;
1707                 }
1708                 search_start = cur->start;
1709                 last_block = cur->start;
1710                 *last_ret = search_start;
1711                 btrfs_tree_unlock(cur);
1712                 free_extent_buffer(cur);
1713         }
1714         return err;
1715 }
1716
1717 /*
1718  * The leaf data grows from end-to-front in the node.
1719  * this returns the address of the start of the last item,
1720  * which is the stop of the leaf data stack
1721  */
1722 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1723                                          struct extent_buffer *leaf)
1724 {
1725         u32 nr = btrfs_header_nritems(leaf);
1726         if (nr == 0)
1727                 return BTRFS_LEAF_DATA_SIZE(root);
1728         return btrfs_item_offset_nr(leaf, nr - 1);
1729 }
1730
1731
/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 *
 * returns 0 when the key is found (*slot is its index) and 1 when it is
 * not (*slot is the insertion point).
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	/* cached mapping of one contiguous chunk of the extent buffer */
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		/* remap only when this key falls outside the cached mapping */
		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				/*
				 * the key straddles a mapping boundary; copy
				 * it into an aligned local instead
				 */
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}
1798
1799 /*
1800  * simple bin_search frontend that does the right thing for
1801  * leaves vs nodes
1802  */
1803 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1804                       int level, int *slot)
1805 {
1806         if (level == 0)
1807                 return generic_bin_search(eb,
1808                                           offsetof(struct btrfs_leaf, items),
1809                                           sizeof(struct btrfs_item),
1810                                           key, btrfs_header_nritems(eb),
1811                                           slot);
1812         else
1813                 return generic_bin_search(eb,
1814                                           offsetof(struct btrfs_node, ptrs),
1815                                           sizeof(struct btrfs_key_ptr),
1816                                           key, btrfs_header_nritems(eb),
1817                                           slot);
1818 }
1819
/*
 * public wrapper around bin_search() so code outside this file can do a
 * key search in a node or leaf; same return convention as
 * generic_bin_search() (0 = found, 1 = not found, *slot set either way)
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1825
1826 static void root_add_used(struct btrfs_root *root, u32 size)
1827 {
1828         spin_lock(&root->accounting_lock);
1829         btrfs_set_root_used(&root->root_item,
1830                             btrfs_root_used(&root->root_item) + size);
1831         spin_unlock(&root->accounting_lock);
1832 }
1833
1834 static void root_sub_used(struct btrfs_root *root, u32 size)
1835 {
1836         spin_lock(&root->accounting_lock);
1837         btrfs_set_root_used(&root->root_item,
1838                             btrfs_root_used(&root->root_item) - size);
1839         spin_unlock(&root->accounting_lock);
1840 }
1841
1842 /* given a node and slot number, this reads the blocks it points to.  The
1843  * extent buffer is returned with a reference taken (but unlocked).
1844  * NULL is returned on error.
1845  */
1846 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1847                                    struct extent_buffer *parent, int slot)
1848 {
1849         int level = btrfs_header_level(parent);
1850         struct extent_buffer *eb;
1851
1852         if (slot < 0)
1853                 return NULL;
1854         if (slot >= btrfs_header_nritems(parent))
1855                 return NULL;
1856
1857         BUG_ON(level == 0);
1858
1859         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1860                              btrfs_node_ptr_generation(parent, slot));
1861         if (eb && !extent_buffer_uptodate(eb)) {
1862                 free_extent_buffer(eb);
1863                 eb = NULL;
1864         }
1865
1866         return eb;
1867 }
1868
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * expects path->nodes[level] to be write locked and already COWed in
 * this transaction (both are WARNed on below).  On success the path may
 * be re-pointed at a sibling node.  Returns 0 on success or a negative
 * errno (e.g. from COW) on failure.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can sanity-check the path at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* nodes more than a quarter full are left alone */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	/* COW both siblings up front; pushing will modify them */
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* orig_slot now counts from the start of 'left' */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is empty now: unlink and free it */
			clean_tree_block(trans, root, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right still has items; refresh its key in parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid emptied out completely: unlink and free it */
		clean_tree_block(trans, root, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the slot we want moved into 'left' */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* error and success paths converge here to drop locks and refs */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2087
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * returns 0 when items were pushed into a sibling (the path is updated
 * to point at the item's new location) and 1 when nothing could be
 * pushed; a negative COW error may be reflected in the return as well.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* pslot is only meaningful when a parent exists */
	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is (nearly) full itself, don't bother */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* we pushed; fix mid's key in the parent */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot migrated into 'left' */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is (nearly) full itself, don't bother */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* we pushed; fix right's key in the parent */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot migrated into 'right' */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2223
2224 /*
2225  * readahead one full node of leaves, finding things that are close
2226  * to the block in 'slot', and triggering ra on them.
2227  */
2228 static void reada_for_search(struct btrfs_root *root,
2229                              struct btrfs_path *path,
2230                              int level, int slot, u64 objectid)
2231 {
2232         struct extent_buffer *node;
2233         struct btrfs_disk_key disk_key;
2234         u32 nritems;
2235         u64 search;
2236         u64 target;
2237         u64 nread = 0;
2238         u64 gen;
2239         int direction = path->reada;
2240         struct extent_buffer *eb;
2241         u32 nr;
2242         u32 blocksize;
2243         u32 nscan = 0;
2244
2245         if (level != 1)
2246                 return;
2247
2248         if (!path->nodes[level])
2249                 return;
2250
2251         node = path->nodes[level];
2252
2253         search = btrfs_node_blockptr(node, slot);
2254         blocksize = root->nodesize;
2255         eb = btrfs_find_tree_block(root, search);
2256         if (eb) {
2257                 free_extent_buffer(eb);
2258                 return;
2259         }
2260
2261         target = search;
2262
2263         nritems = btrfs_header_nritems(node);
2264         nr = slot;
2265
2266         while (1) {
2267                 if (direction < 0) {
2268                         if (nr == 0)
2269                                 break;
2270                         nr--;
2271                 } else if (direction > 0) {
2272                         nr++;
2273                         if (nr >= nritems)
2274                                 break;
2275                 }
2276                 if (path->reada < 0 && objectid) {
2277                         btrfs_node_key(node, &disk_key, nr);
2278                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2279                                 break;
2280                 }
2281                 search = btrfs_node_blockptr(node, nr);
2282                 if ((search <= target && target - search <= 65536) ||
2283                     (search > target && search - target <= 65536)) {
2284                         gen = btrfs_node_ptr_generation(node, nr);
2285                         readahead_tree_block(root, search, blocksize);
2286                         nread += blocksize;
2287                 }
2288                 nscan++;
2289                 if ((nread > 65536 || nscan > 32))
2290                         break;
2291         }
2292 }
2293
2294 static noinline void reada_for_balance(struct btrfs_root *root,
2295                                        struct btrfs_path *path, int level)
2296 {
2297         int slot;
2298         int nritems;
2299         struct extent_buffer *parent;
2300         struct extent_buffer *eb;
2301         u64 gen;
2302         u64 block1 = 0;
2303         u64 block2 = 0;
2304         int blocksize;
2305
2306         parent = path->nodes[level + 1];
2307         if (!parent)
2308                 return;
2309
2310         nritems = btrfs_header_nritems(parent);
2311         slot = path->slots[level + 1];
2312         blocksize = root->nodesize;
2313
2314         if (slot > 0) {
2315                 block1 = btrfs_node_blockptr(parent, slot - 1);
2316                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2317                 eb = btrfs_find_tree_block(root, block1);
2318                 /*
2319                  * if we get -eagain from btrfs_buffer_uptodate, we
2320                  * don't want to return eagain here.  That will loop
2321                  * forever
2322                  */
2323                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2324                         block1 = 0;
2325                 free_extent_buffer(eb);
2326         }
2327         if (slot + 1 < nritems) {
2328                 block2 = btrfs_node_blockptr(parent, slot + 1);
2329                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2330                 eb = btrfs_find_tree_block(root, block2);
2331                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2332                         block2 = 0;
2333                 free_extent_buffer(eb);
2334         }
2335
2336         if (block1)
2337                 readahead_tree_block(root, block1, blocksize);
2338         if (block2)
2339                 readahead_tree_block(root, block2, blocksize);
2340 }
2341
2342
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels <= skip_level must stay locked (slot-0 / last-slot rules) */
	int skip_level = level;
	/* once set, stop raising skip_level for higher levels */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here may propagate upwards */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold the lock on last-slot levels */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower write_lock_level as write locks are released */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2398
2399 /*
2400  * This releases any locks held in the path starting at level and
2401  * going all the way up to the root.
2402  *
2403  * btrfs_search_slot will keep the lock held on higher nodes in a few
2404  * corner cases, such as COW of the block at slot zero in the node.  This
2405  * ignores those rules, and it should only be called when there are no
2406  * more updates to be done higher up in the tree.
2407  */
2408 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2409 {
2410         int i;
2411
2412         if (path->keep_locks)
2413                 return;
2414
2415         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2416                 if (!path->nodes[i])
2417                         continue;
2418                 if (!path->locks[i])
2419                         continue;
2420                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2421                 path->locks[i] = 0;
2422         }
2423 }
2424
2425 /*
2426  * helper function for btrfs_search_slot.  The goal is to find a block
2427  * in cache without setting the path to blocking.  If we find the block
2428  * we return zero and the path is unchanged.
2429  *
2430  * If we can't find the block, we set the path blocking and do some
2431  * reada.  -EAGAIN is returned and the search must be repeated.
2432  */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key, u64 time_seq)
{
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	/* target child block and the generation we expect it to carry */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);

	tmp = btrfs_find_tree_block(root, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		/* stale even after a re-read: give up on this path */
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	free_extent_buffer(tmp);	/* tmp is NULL here; harmless */
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	/* drop all locks; the caller must retry the search from the root */
	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
2506
2507 /*
2508  * helper function for btrfs_search_slot.  This does all of the checks
2509  * for node-level blocks and does any balancing required based on
2510  * the ins_len.
2511  *
2512  * If no extra work was required, zero is returned.  If we had to
2513  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2514  * start over
2515  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/*
	 * inserting (or searching for a split point): split any node
	 * within 3 pointers of full so there is room for new keys
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting modifies the parent too, need it write locked */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting: rebalance a node that dropped below half full */
		int sret;

		/* balancing also modifies the parent */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		/* balancing may have merged our node away entirely */
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2577
/*
 * Debug-only sanity check used by key_search() when a previous comparison
 * claimed @key equals the first key of this block: verify the key stored
 * at slot 0 of @b really matches.  Compiles to a no-op without
 * CONFIG_BTRFS_ASSERT.
 */
static void key_search_validate(struct extent_buffer *b,
				struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;

	btrfs_cpu_key_to_disk(&disk_key, key);

	if (level == 0)
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_leaf, items[0].key),
		    sizeof(disk_key)));
	else
		ASSERT(!memcmp_extent_buffer(b, &disk_key,
		    offsetof(struct btrfs_node, ptrs[0].key),
		    sizeof(disk_key)));
#endif
}
2597
/*
 * Binary-search @key in @b, unless the previous level already compared
 * equal (*prev_cmp == 0): equality means we followed the very first
 * pointer all the way down, so slot 0 is the answer without searching.
 */
static int key_search(struct extent_buffer *b, struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = bin_search(b, key, level, slot);
	return *prev_cmp;
}
2611
2612 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
2613                 u64 iobjectid, u64 ioff, u8 key_type,
2614                 struct btrfs_key *found_key)
2615 {
2616         int ret;
2617         struct btrfs_key key;
2618         struct extent_buffer *eb;
2619         struct btrfs_path *path;
2620
2621         key.type = key_type;
2622         key.objectid = iobjectid;
2623         key.offset = ioff;
2624
2625         if (found_path == NULL) {
2626                 path = btrfs_alloc_path();
2627                 if (!path)
2628                         return -ENOMEM;
2629         } else
2630                 path = found_path;
2631
2632         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2633         if ((ret < 0) || (found_key == NULL)) {
2634                 if (path != found_path)
2635                         btrfs_free_path(path);
2636                 return ret;
2637         }
2638
2639         eb = path->nodes[0];
2640         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2641                 ret = btrfs_next_leaf(fs_root, path);
2642                 if (ret)
2643                         return ret;
2644                 eb = path->nodes[0];
2645         }
2646
2647         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2648         if (found_key->type != key.type ||
2649                         found_key->objectid != key.objectid)
2650                 return 1;
2651
2652         return 0;
2653 }
2654
2655 /*
2656  * look for key in the tree.  path is filled in with nodes along the way
2657  * if key is found, we return zero and you can find the item in the leaf
2658  * level of the path (level 0)
2659  *
2660  * If the key isn't found, the path points to the slot where it should
2661  * be inserted, and 1 is returned.  If there are other errors during the
2662  * search a negative error number is returned.
2663  *
2664  * if ins_len > 0, nodes and leaves will be split as we walk down the
2665  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2666  * possible)
2667  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* read-only searches never need any write locks */
	if (!cow)
		write_lock_level = -1;

	/* keep_locks / lowest_level callers keep everything write locked */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/* -1 forces a real bin_search at the first level */
	prev_cmp = -1;
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		if (p->need_commit_sem)
			down_read(&root->fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&root->fs_info->commit_root_sem);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* walk down one level per iteration until we hit a leaf (level 0) */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match: bin_search points at the first key
			 * greater than ours, so step back one slot to descend
			 * into the block that covers the key
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* the node may have been split or balanced, reload */
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				/*
				 * lock the child in the mode the caller needs
				 * at its level; fall back to a blocking lock
				 * if the non-blocking attempt fails
				 */
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* leaf level: split if the insert won't fit */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2936
2937 /*
2938  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2939  * current state of the tree together with the operations recorded in the tree
2940  * modification log to search for the key in a previous version of this tree, as
2941  * denoted by the time_seq parameter.
2942  *
2943  * Naturally, there is no support for insert, delete or cow operations.
2944  *
2945  * The resulting path and return value will be set up as if we called
2946  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2947  */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* commit roots are immutable; a plain read-only search suffices */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind eb's we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * no exact match: step back one slot to descend into
			 * the block covering the key
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/*
			 * replay tree mod log entries so the buffer reflects
			 * its state at time_seq
			 */
			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3047
3048 /*
3049  * helper to use instead of search slot if no exact match is needed but
3050  * instead the next or previous item should be returned.
3051  * When find_higher is true, the next higher item is returned, the next lower
3052  * otherwise.
3053  * When return_any and find_higher are both true, and no higher item is found,
3054  * return the next lower instead.
3055  * When return_any is true and find_higher is false, and no lower item is found,
3056  * return the next higher instead.
3057  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3058  * < 0 on error
3059  */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       struct btrfs_key *key, struct btrfs_path *p,
			       int find_higher, int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	/* exact match (0) or error (< 0): we're done either way */
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				/*
				 * the previous leaf may be empty of valid
				 * slots at our position; back up one
				 */
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			/* step back to the item just below the search key */
			--p->slots[0];
		}
	}
	return 0;
}
3123
3124 /*
3125  * adjust the pointers going up the tree, starting at level
3126  * making sure the right key of each node points to 'key'.
3127  * This is used after shifting pointers to the left, so it stops
3128  * fixing up pointers when a given leaf/node is not in slot 0 of the
3129  * higher levels
3130  *
3131  */
3132 static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
3133                            struct btrfs_disk_key *key, int level)
3134 {
3135         int i;
3136         struct extent_buffer *t;
3137
3138         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3139                 int tslot = path->slots[i];
3140                 if (!path->nodes[i])
3141                         break;
3142                 t = path->nodes[i];
3143                 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
3144                 btrfs_set_node_key(t, key, tslot);
3145                 btrfs_mark_buffer_dirty(path->nodes[i]);
3146                 if (tslot != 0)
3147                         break;
3148         }
3149 }
3150
3151 /*
3152  * update item key.
3153  *
3154  * This function isn't completely safe. It's the caller's responsibility
3155  * that the new key won't break the order
3156  */
3157 void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3158                              struct btrfs_key *new_key)
3159 {
3160         struct btrfs_disk_key disk_key;
3161         struct extent_buffer *eb;
3162         int slot;
3163
3164         eb = path->nodes[0];
3165         slot = path->slots[0];
3166         if (slot > 0) {
3167                 btrfs_item_key(eb, &disk_key, slot - 1);
3168                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3169         }
3170         if (slot < btrfs_header_nritems(eb) - 1) {
3171                 btrfs_item_key(eb, &disk_key, slot + 1);
3172                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3173         }
3174
3175         btrfs_cpu_key_to_disk(&disk_key, new_key);
3176         btrfs_set_item_key(eb, &disk_key, slot);
3177         btrfs_mark_buffer_dirty(eb);
3178         if (slot == 0)
3179                 fixup_low_keys(root, path, &disk_key, 1);
3180 }
3181
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* the free slots in dst are the upper bound on what we can move */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both buffers must have been COWed in the current transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* unless we're emptying src, don't bother balancing a tiny node */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/*
	 * log the move in the tree mod log before touching the buffers so
	 * concurrent tree mod log readers can reconstruct the old state
	 */
	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	/* append the first push_items pointers of src after dst's items */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * don't call tree_mod_log_eb_move here, key removal was already
		 * fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3253
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must have been COWed in the current transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* the free slots in dst are the upper bound on what we can move */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* not worth balancing from an almost-empty node */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing pointers right to open push_items slots at 0 */
	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
			      btrfs_node_key_ptr_offset(0),
			      (dst_nritems) *
			      sizeof(struct btrfs_key_ptr));

	/* log before copying so tree mod log readers see the old state */
	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	/* move the tail of src into the hole at the front of dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3319
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* we only grow the tree from the very top: the old root level */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the new root's single key is the first key of the old root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the freshly allocated node */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);

	/* point the new root's only slot at the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* publish the new root; readers may walk root->node locklessly */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c, 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* the path takes its own reference to the new node */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK;
	path->slots[level] = 0;
	return 0;
}
3390
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* caller guarantees there is room for one more pointer */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
	if (slot != nritems) {
		/* log the shift before modifying the buffer (level 0 is
		 * a leaf and is not tracked in the tree mod log) */
		if (level)
			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
					     slot, nritems - slot);
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
					      MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3434
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try to avoid the split by pushing into the neighbors */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* the upper half [mid, c_nritems) moves into the new node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the new node's header */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);

	/* log the key removal from c before modifying the buffers */
	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
				   mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after c */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* keep the path pointing at the half that holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3536
3537 /*
3538  * how many bytes are required to store the items in a leaf.  start
3539  * and nr indicate which items in the leaf to check.  This totals up the
3540  * space used both by the item structs and the item data
3541  */
3542 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3543 {
3544         struct btrfs_item *start_item;
3545         struct btrfs_item *end_item;
3546         struct btrfs_map_token token;
3547         int data_len;
3548         int nritems = btrfs_header_nritems(l);
3549         int end = min(nritems, start + nr) - 1;
3550
3551         if (!nr)
3552                 return 0;
3553         btrfs_init_map_token(&token);
3554         start_item = btrfs_item_nr(start);
3555         end_item = btrfs_item_nr(end);
3556         data_len = btrfs_token_item_offset(l, start_item, &token) +
3557                 btrfs_token_item_size(l, start_item, &token);
3558         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3559         data_len += sizeof(struct btrfs_item) * nr;
3560         WARN_ON(data_len < 0);
3561         return data_len;
3562 }
3563
3564 /*
3565  * The space between the end of the leaf items and
3566  * the start of the leaf data.  IOW, how much room
3567  * the leaf has left for both items and data
3568  */
3569 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3570                                    struct extent_buffer *leaf)
3571 {
3572         int nritems = btrfs_header_nritems(leaf);
3573         int ret;
3574         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3575         if (ret < 0) {
3576                 btrfs_crit(root->fs_info,
3577                         "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3578                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3579                        leaf_space_used(leaf, 0, nritems), nritems);
3580         }
3581         return ret;
3582 }
3583
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/* unless emptying, never push below min_slot and keep at least one */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if the insertion point moves right, reserve room for it there */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* walk from the highest item down, counting what fits into right */
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes occupied by the items being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers to open push_items slots at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed; update the parent's pointer to it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3726
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means the leaf is the root: nothing to push into */
	if (!path->nodes[1])
		return 1;

	/* the leaf must have a right sibling under the same parent */
	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(root, upper, slot + 1);
	if (right == NULL)
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	/* cheap check before paying for the COW below */
	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaft. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right consumes our lock/ref on right on all paths */
	return __push_leaf_right(trans, root, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3805
3806 /*
3807  * push some data in the path leaf to the left, trying to free up at
3808  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3809  *
3810  * max_slot can put a limit on how far into the leaf we'll push items.  The
3811  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3812  * items
3813  */
3814 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3815                                      struct btrfs_root *root,
3816                                      struct btrfs_path *path, int data_size,
3817                                      int empty, struct extent_buffer *left,
3818                                      int free_space, u32 right_nritems,
3819                                      u32 max_slot)
3820 {
3821         struct btrfs_disk_key disk_key;
3822         struct extent_buffer *right = path->nodes[0];
3823         int i;
3824         int push_space = 0;
3825         int push_items = 0;
3826         struct btrfs_item *item;
3827         u32 old_left_nritems;
3828         u32 nr;
3829         int ret = 0;
3830         u32 this_item_size;
3831         u32 old_left_item_size;
3832         struct btrfs_map_token token;
3833
3834         btrfs_init_map_token(&token);
3835
3836         if (empty)
3837                 nr = min(right_nritems, max_slot);
3838         else
3839                 nr = min(right_nritems - 1, max_slot);
3840
3841         for (i = 0; i < nr; i++) {
3842                 item = btrfs_item_nr(i);
3843
3844                 if (!empty && push_items > 0) {
3845                         if (path->slots[0] < i)
3846                                 break;
3847                         if (path->slots[0] == i) {
3848                                 int space = btrfs_leaf_free_space(root, right);
3849                                 if (space + push_space * 2 > free_space)
3850                                         break;
3851                         }
3852                 }
3853
3854                 if (path->slots[0] == i)
3855                         push_space += data_size;
3856
3857                 this_item_size = btrfs_item_size(right, item);
3858                 if (this_item_size + sizeof(*item) + push_space > free_space)
3859                         break;
3860
3861                 push_items++;
3862                 push_space += this_item_size + sizeof(*item);
3863         }
3864
3865         if (push_items == 0) {
3866                 ret = 1;
3867                 goto out;
3868         }
3869         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3870
3871         /* push data from right to left */
3872         copy_extent_buffer(left, right,
3873                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3874                            btrfs_item_nr_offset(0),
3875                            push_items * sizeof(struct btrfs_item));
3876
3877         push_space = BTRFS_LEAF_DATA_SIZE(root) -
3878                      btrfs_item_offset_nr(right, push_items - 1);
3879
3880         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3881                      leaf_data_end(root, left) - push_space,
3882                      btrfs_leaf_data(right) +
3883                      btrfs_item_offset_nr(right, push_items - 1),
3884                      push_space);
3885         old_left_nritems = btrfs_header_nritems(left);
3886         BUG_ON(old_left_nritems <= 0);
3887
3888         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3889         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3890                 u32 ioff;
3891
3892                 item = btrfs_item_nr(i);
3893
3894                 ioff = btrfs_token_item_offset(left, item, &token);
3895                 btrfs_set_token_item_offset(left, item,
3896                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3897                       &token);
3898         }
3899         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3900
3901         /* fixup right node */
3902         if (push_items > right_nritems)
3903                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3904                        right_nritems);
3905
3906         if (push_items < right_nritems) {
3907                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3908                                                   leaf_data_end(root, right);
3909                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3910                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
3911                                       btrfs_leaf_data(right) +
3912                                       leaf_data_end(root, right), push_space);
3913
3914                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3915                               btrfs_item_nr_offset(push_items),
3916                              (btrfs_header_nritems(right) - push_items) *
3917                              sizeof(struct btrfs_item));
3918         }
3919         right_nritems -= push_items;
3920         btrfs_set_header_nritems(right, right_nritems);
3921         push_space = BTRFS_LEAF_DATA_SIZE(root);
3922         for (i = 0; i < right_nritems; i++) {
3923                 item = btrfs_item_nr(i);
3924
3925                 push_space = push_space - btrfs_token_item_size(right,
3926                                                                 item, &token);
3927                 btrfs_set_token_item_offset(right, item, push_space, &token);
3928         }
3929
3930         btrfs_mark_buffer_dirty(left);
3931         if (right_nritems)
3932                 btrfs_mark_buffer_dirty(right);
3933         else
3934                 clean_tree_block(trans, root, right);
3935
3936         btrfs_item_key(right, &disk_key, 0);
3937         fixup_low_keys(root, path, &disk_key, 1);
3938
3939         /* then fixup the leaf pointer in the path */
3940         if (path->slots[0] < push_items) {
3941                 path->slots[0] += old_left_nritems;
3942                 btrfs_tree_unlock(path->nodes[0]);
3943                 free_extent_buffer(path->nodes[0]);
3944                 path->nodes[0] = left;
3945                 path->slots[1] -= 1;
3946         } else {
3947                 btrfs_tree_unlock(left);
3948                 free_extent_buffer(left);
3949                 path->slots[0] -= push_items;
3950         }
3951         BUG_ON(path->slots[0] < 0);
3952         return ret;
3953 out:
3954         btrfs_tree_unlock(left);
3955         free_extent_buffer(left);
3956         return ret;
3957 }
3958
3959 /*
3960  * push some data in the path leaf to the left, trying to free up at
3961  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3962  *
3963  * max_slot can put a limit on how far into the leaf we'll push items.  The
3964  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3965  * items
3966  */
/*
 * Try to free up at least data_size bytes in the path's leaf by moving
 * items into its left sibling.  Returns 0 if the push worked (via
 * __push_leaf_left), 1 if it could not, and a negative errno on error.
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* slot 0 in the parent means there is no left sibling to push into */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* no parent node at all: this leaf is the root, nothing to push to */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	/* read the left sibling (slot - 1 in the parent) */
	left = read_node_slot(root, path->nodes[1], slot - 1);
	/* slot - 1 failed to read, we can't do the push */
	if (left == NULL)
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	/* cheap check before paying for the COW below */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* the COW may have changed the leaf, re-check the free space */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left takes ownership of 'left' from here on */
	return __push_leaf_left(trans, root, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4027
4028 /*
4029  * split the path's leaf in two, making sure there is at least data_size
4030  * available for the resulting leaf level of the path.
4031  */
/*
 * Move items [mid, nritems) of leaf 'l' into the freshly allocated leaf
 * 'right', link 'right' into the parent just after 'l', and repoint the
 * path at whichever leaf now contains the original slot.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* nritems becomes the number of items moving to the right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	/* bytes of item data occupied by items mid..end of 'l' */
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers to the front of the right leaf */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data; data packs against the end of the leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/* how far the data moved; rebase each copied item offset by this */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* the left leaf keeps items [0, mid) */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	/* link the new leaf into the parent, just after the old one */
	insert_ptr(trans, root, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now holds the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		/* slot stays in 'l'; drop our reference to the right leaf */
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4094
4095 /*
4096  * double splits happen when we need to insert a big item in the middle
4097  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4098  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4099  *          A                 B                 C
4100  *
4101  * We avoid this by trying to push the items on either side of our target
4102  * into the adjacent leaves.  If all goes well we can avoid the double split
4103  * completely.
4104  */
4105 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4106                                           struct btrfs_root *root,
4107                                           struct btrfs_path *path,
4108                                           int data_size)
4109 {
4110         int ret;
4111         int progress = 0;
4112         int slot;
4113         u32 nritems;
4114         int space_needed = data_size;
4115
4116         slot = path->slots[0];
4117         if (slot < btrfs_header_nritems(path->nodes[0]))
4118                 space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4119
4120         /*
4121          * try to push all the items after our slot into the
4122          * right leaf
4123          */
4124         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4125         if (ret < 0)
4126                 return ret;
4127
4128         if (ret == 0)
4129                 progress++;
4130
4131         nritems = btrfs_header_nritems(path->nodes[0]);
4132         /*
4133          * our goal is to get our slot at the start or end of a leaf.  If
4134          * we've done so we're done
4135          */
4136         if (path->slots[0] == 0 || path->slots[0] == nritems)
4137                 return 0;
4138
4139         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4140                 return 0;
4141
4142         /* try to push all the items before our slot into the next leaf */
4143         slot = path->slots[0];
4144         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4145         if (ret < 0)
4146                 return ret;
4147
4148         if (ret == 0)
4149                 progress++;
4150
4151         if (progress)
4152                 return 0;
4153         return 1;
4154 }
4155
4156 /*
4157  * split the path's leaf in two, making sure there is at least data_size
4158  * available for the resulting leaf level of the path.
4159  *
4160  * returns 0 if all went well and < 0 on failure.
4161  */
4162 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4163                                struct btrfs_root *root,
4164                                struct btrfs_key *ins_key,
4165                                struct btrfs_path *path, int data_size,
4166                                int extend)
4167 {
4168         struct btrfs_disk_key disk_key;
4169         struct extent_buffer *l;
4170         u32 nritems;
4171         int mid;
4172         int slot;
4173         struct extent_buffer *right;
4174         int ret = 0;
4175         int wret;
4176         int split;
4177         int num_doubles = 0;
4178         int tried_avoid_double = 0;
4179
4180         l = path->nodes[0];
4181         slot = path->slots[0];
4182         if (extend && data_size + btrfs_item_size_nr(l, slot) +
4183             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4184                 return -EOVERFLOW;
4185
4186         /* first try to make some room by pushing left and right */
4187         if (data_size && path->nodes[1]) {
4188                 int space_needed = data_size;
4189
4190                 if (slot < btrfs_header_nritems(l))
4191                         space_needed -= btrfs_leaf_free_space(root, l);
4192
4193                 wret = push_leaf_right(trans, root, path, space_needed,
4194                                        space_needed, 0, 0);
4195                 if (wret < 0)
4196                         return wret;
4197                 if (wret) {
4198                         wret = push_leaf_left(trans, root, path, space_needed,
4199                                               space_needed, 0, (u32)-1);
4200                         if (wret < 0)
4201                                 return wret;
4202                 }
4203                 l = path->nodes[0];
4204
4205                 /* did the pushes work? */
4206                 if (btrfs_leaf_free_space(root, l) >= data_size)
4207                         return 0;
4208         }
4209
4210         if (!path->nodes[1]) {
4211                 ret = insert_new_root(trans, root, path, 1);
4212                 if (ret)
4213                         return ret;
4214         }
4215 again:
4216         split = 1;
4217         l = path->nodes[0];
4218         slot = path->slots[0];
4219         nritems = btrfs_header_nritems(l);
4220         mid = (nritems + 1) / 2;
4221
4222         if (mid <= slot) {
4223                 if (nritems == 1 ||
4224                     leaf_space_used(l, mid, nritems - mid) + data_size >
4225                         BTRFS_LEAF_DATA_SIZE(root)) {
4226                         if (slot >= nritems) {
4227                                 split = 0;
4228                         } else {
4229                                 mid = slot;
4230                                 if (mid != nritems &&
4231                                     leaf_space_used(l, mid, nritems - mid) +
4232                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4233                                         if (data_size && !tried_avoid_double)
4234                                                 goto push_for_double;
4235                                         split = 2;
4236                                 }
4237                         }
4238                 }
4239         } else {
4240                 if (leaf_space_used(l, 0, mid) + data_size >
4241                         BTRFS_LEAF_DATA_SIZE(root)) {
4242                         if (!extend && data_size && slot == 0) {
4243                                 split = 0;
4244                         } else if ((extend || !data_size) && slot == 0) {
4245                                 mid = 1;
4246                         } else {
4247                                 mid = slot;
4248                                 if (mid != nritems &&
4249                                     leaf_space_used(l, mid, nritems - mid) +
4250                                     data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4251                                         if (data_size && !tried_avoid_double)
4252                                                 goto push_for_double;
4253                                         split = 2;
4254                                 }
4255                         }
4256                 }
4257         }
4258
4259         if (split == 0)
4260                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4261         else
4262                 btrfs_item_key(l, &disk_key, mid);
4263
4264         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4265                         &disk_key, 0, l->start, 0);
4266         if (IS_ERR(right))
4267                 return PTR_ERR(right);
4268
4269         root_add_used(root, root->nodesize);
4270
4271         memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4272         btrfs_set_header_bytenr(right, right->start);
4273         btrfs_set_header_generation(right, trans->transid);
4274         btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4275         btrfs_set_header_owner(right, root->root_key.objectid);
4276         btrfs_set_header_level(right, 0);
4277         write_extent_buffer(right, root->fs_info->fsid,
4278                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
4279
4280         write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4281                             btrfs_header_chunk_tree_uuid(right),
4282                             BTRFS_UUID_SIZE);
4283
4284         if (split == 0) {
4285                 if (mid <= slot) {
4286                         btrfs_set_header_nritems(right, 0);
4287                         insert_ptr(trans, root, path, &disk_key, right->start,
4288                                    path->slots[1] + 1, 1);
4289                         btrfs_tree_unlock(path->nodes[0]);
4290                         free_extent_buffer(path->nodes[0]);
4291                         path->nodes[0] = right;
4292                         path->slots[0] = 0;
4293                         path->slots[1] += 1;
4294                 } else {
4295                         btrfs_set_header_nritems(right, 0);
4296                         insert_ptr(trans, root, path, &disk_key, right->start,
4297                                           path->slots[1], 1);
4298                         btrfs_tree_unlock(path->nodes[0]);
4299                         free_extent_buffer(path->nodes[0]);
4300                         path->nodes[0] = right;
4301                         path->slots[0] = 0;
4302                         if (path->slots[1] == 0)
4303                                 fixup_low_keys(root, path, &disk_key, 1);
4304                 }
4305                 btrfs_mark_buffer_dirty(right);
4306                 return ret;
4307         }
4308
4309         copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4310
4311         if (split == 2) {
4312                 BUG_ON(num_doubles != 0);
4313                 num_doubles++;
4314                 goto again;
4315         }
4316
4317         return 0;
4318
4319 push_for_double:
4320         push_for_double_split(trans, root, path, data_size);
4321         tried_avoid_double = 1;
4322         if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4323                 return 0;
4324         goto again;
4325 }
4326
4327 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4328                                          struct btrfs_root *root,
4329                                          struct btrfs_path *path, int ins_len)
4330 {
4331         struct btrfs_key key;
4332         struct extent_buffer *leaf;
4333         struct btrfs_file_extent_item *fi;
4334         u64 extent_len = 0;
4335         u32 item_size;
4336         int ret;
4337
4338         leaf = path->nodes[0];
4339         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4340
4341         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4342                key.type != BTRFS_EXTENT_CSUM_KEY);
4343
4344         if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4345                 return 0;
4346
4347         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4348         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4349                 fi = btrfs_item_ptr(leaf, path->slots[0],
4350                                     struct btrfs_file_extent_item);
4351                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4352         }
4353         btrfs_release_path(path);
4354
4355         path->keep_locks = 1;
4356         path->search_for_split = 1;
4357         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4358         path->search_for_split = 0;
4359         if (ret < 0)
4360                 goto err;
4361
4362         ret = -EAGAIN;
4363         leaf = path->nodes[0];
4364         /* if our item isn't there or got smaller, return now */
4365         if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4366                 goto err;
4367
4368         /* the leaf has  changed, it now has room.  return now */
4369         if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4370                 goto err;
4371
4372         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4373                 fi = btrfs_item_ptr(leaf, path->slots[0],
4374                                     struct btrfs_file_extent_item);
4375                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4376                         goto err;
4377         }
4378
4379         btrfs_set_path_blocking(path);
4380         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4381         if (ret)
4382                 goto err;
4383
4384         path->keep_locks = 0;
4385         btrfs_unlock_up_safe(path, 1);
4386         return 0;
4387 err:
4388         path->keep_locks = 0;
4389         return ret;
4390 }
4391
/*
 * Split the item at path->slots[0] in two at split_offset, in place.
 * The original item keeps bytes [0, split_offset); the new item (keyed
 * by new_key, inserted at the next slot) gets the remaining bytes.
 * The leaf must already have room for one extra struct btrfs_item.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* the caller guarantees room for one more item header */
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data so it can be rewritten in two parts */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes in the slot right after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* new item covers the tail [split_offset, item_size) of the data */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* the original item shrinks to the head [0, split_offset) */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
4462
4463 /*
4464  * This function splits a single item into two items,
4465  * giving 'new_key' to the new item and splitting the
4466  * old one at split_offset (from the start of the item).
4467  *
4468  * The path may be released by this operation.  After
4469  * the split, the path is pointing to the old item.  The
4470  * new item is going to be in the same node as the old one.
4471  *
 * Note, the item being split must be small enough to live alone on
4473  * a tree block with room for one extra struct btrfs_item
4474  *
4475  * This allows us to split the item in place, keeping a lock on the
4476  * leaf the entire time.
4477  */
4478 int btrfs_split_item(struct btrfs_trans_handle *trans,
4479                      struct btrfs_root *root,
4480                      struct btrfs_path *path,
4481                      struct btrfs_key *new_key,
4482                      unsigned long split_offset)
4483 {
4484         int ret;
4485         ret = setup_leaf_for_split(trans, root, path,
4486                                    sizeof(struct btrfs_item));
4487         if (ret)
4488                 return ret;
4489
4490         ret = split_item(trans, root, path, new_key, split_offset);
4491         return ret;
4492 }
4493
4494 /*
 * This function duplicates an item, giving 'new_key' to the new item.
4496  * It guarantees both items live in the same tree leaf and the new item
4497  * is contiguous with the original item.
4498  *
 * This allows us to split a file extent in place, keeping a lock on the
4500  * leaf the entire time.
4501  */
4502 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4503                          struct btrfs_root *root,
4504                          struct btrfs_path *path,
4505                          struct btrfs_key *new_key)
4506 {
4507         struct extent_buffer *leaf;
4508         int ret;
4509         u32 item_size;
4510
4511         leaf = path->nodes[0];
4512         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4513         ret = setup_leaf_for_split(trans, root, path,
4514                                    item_size + sizeof(struct btrfs_item));
4515         if (ret)
4516                 return ret;
4517
4518         path->slots[0]++;
4519         setup_items_for_insert(root, path, new_key, &item_size,
4520                                item_size, item_size +
4521                                sizeof(struct btrfs_item), 1);
4522         leaf = path->nodes[0];
4523         memcpy_extent_buffer(leaf,
4524                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4525                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4526                              item_size);
4527         return 0;
4528 }
4529
4530 /*
4531  * make the item pointed to by the path smaller.  new_size indicates
4532  * how small to make it, and from_end tells us if we just chop bytes
4533  * off the end of the item or if we shift the item to chop bytes off
4534  * the front.
4535  */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* nothing to do if the item is already the requested size */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	/* number of bytes being chopped off */
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		/* data for items at/after slot moves up by size_diff bytes */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* chop bytes off the end: shift data below this item up */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/*
			 * the item body will land size_diff bytes later in
			 * the leaf; 'fi' is where it will be after the move
			 */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				/*
				 * keep the inline extent header in place so
				 * only the inline data itself is truncated
				 */
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* chop from the front: shift everything including this item */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		/* the key offset advances by the number of bytes removed */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(root, path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4631
4632 /*
4633  * make the item pointed to by the path bigger, data_size is the added size.
4634  */
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* the leaf must already have data_size bytes free */
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
		       slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		/* data for items at/after slot moves down by data_size bytes */
		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);

	/* grow the item at slot by data_size bytes at its end */
	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4699
4700 /*
4701  * this is a helper for btrfs_insert_empty_items, the main goal here is
4702  * to save stack depth by doing the bulk of the work in a function
4703  * that doesn't call btrfs_search_slot
4704  */
4705 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4706                             struct btrfs_key *cpu_key, u32 *data_size,
4707                             u32 total_data, u32 total_size, int nr)
4708 {
4709         struct btrfs_item *item;
4710         int i;
4711         u32 nritems;
4712         unsigned int data_end;
4713         struct btrfs_disk_key disk_key;
4714         struct extent_buffer *leaf;
4715         int slot;
4716         struct btrfs_map_token token;
4717
4718         if (path->slots[0] == 0) {
4719                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4720                 fixup_low_keys(root, path, &disk_key, 1);
4721         }
4722         btrfs_unlock_up_safe(path, 1);
4723
4724         btrfs_init_map_token(&token);
4725
4726         leaf = path->nodes[0];
4727         slot = path->slots[0];
4728
4729         nritems = btrfs_header_nritems(leaf);
4730         data_end = leaf_data_end(root, leaf);
4731
4732         if (btrfs_leaf_free_space(root, leaf) < total_size) {
4733                 btrfs_print_leaf(root, leaf);
4734                 btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4735                        total_size, btrfs_leaf_free_space(root, leaf));
4736                 BUG();
4737         }
4738
4739         if (slot != nritems) {
4740                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4741
4742                 if (old_data < data_end) {
4743                         btrfs_print_leaf(root, leaf);
4744                         btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4745                                slot, old_data, data_end);
4746                         BUG_ON(1);
4747                 }
4748                 /*
4749                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4750                  */
4751                 /* first correct the data pointers */
4752                 for (i = slot; i < nritems; i++) {
4753                         u32 ioff;
4754
4755                         item = btrfs_item_nr( i);
4756                         ioff = btrfs_token_item_offset(leaf, item, &token);
4757                         btrfs_set_token_item_offset(leaf, item,
4758                                                     ioff - total_data, &token);
4759                 }
4760                 /* shift the items */
4761                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4762                               btrfs_item_nr_offset(slot),
4763                               (nritems - slot) * sizeof(struct btrfs_item));
4764
4765                 /* shift the data */
4766                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4767                               data_end - total_data, btrfs_leaf_data(leaf) +
4768                               data_end, old_data - data_end);
4769                 data_end = old_data;
4770         }
4771
4772         /* setup the item for the new data */
4773         for (i = 0; i < nr; i++) {
4774                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4775                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4776                 item = btrfs_item_nr(slot + i);
4777                 btrfs_set_token_item_offset(leaf, item,
4778                                             data_end - data_size[i], &token);
4779                 data_end -= data_size[i];
4780                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4781         }
4782
4783         btrfs_set_header_nritems(leaf, nritems + nr);
4784         btrfs_mark_buffer_dirty(leaf);
4785
4786         if (btrfs_leaf_free_space(root, leaf) < 0) {
4787                 btrfs_print_leaf(root, leaf);
4788                 BUG();
4789         }
4790 }
4791
4792 /*
4793  * Given a key and some data, insert items into the tree.
4794  * This does all the path init required, making room in the tree if needed.
4795  */
4796 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4797                             struct btrfs_root *root,
4798                             struct btrfs_path *path,
4799                             struct btrfs_key *cpu_key, u32 *data_size,
4800                             int nr)
4801 {
4802         int ret = 0;
4803         int slot;
4804         int i;
4805         u32 total_size = 0;
4806         u32 total_data = 0;
4807
4808         for (i = 0; i < nr; i++)
4809                 total_data += data_size[i];
4810
4811         total_size = total_data + (nr * sizeof(struct btrfs_item));
4812         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4813         if (ret == 0)
4814                 return -EEXIST;
4815         if (ret < 0)
4816                 return ret;
4817
4818         slot = path->slots[0];
4819         BUG_ON(slot < 0);
4820
4821         setup_items_for_insert(root, path, cpu_key, data_size,
4822                                total_data, total_size, nr);
4823         return 0;
4824 }
4825
4826 /*
4827  * Given a key and some data, insert an item into the tree.
4828  * This does all the path init required, making room in the tree if needed.
4829  */
4830 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4831                       *root, struct btrfs_key *cpu_key, void *data, u32
4832                       data_size)
4833 {
4834         int ret = 0;
4835         struct btrfs_path *path;
4836         struct extent_buffer *leaf;
4837         unsigned long ptr;
4838
4839         path = btrfs_alloc_path();
4840         if (!path)
4841                 return -ENOMEM;
4842         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4843         if (!ret) {
4844                 leaf = path->nodes[0];
4845                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4846                 write_extent_buffer(leaf, data, ptr, data_size);
4847                 btrfs_mark_buffer_dirty(leaf);
4848         }
4849         btrfs_free_path(path);
4850         return ret;
4851 }
4852
4853 /*
4854  * delete the pointer from a given node.
4855  *
4856  * the tree should have been previously balanced so the deletion does not
4857  * empty a node.
4858  */
4859 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4860                     int level, int slot)
4861 {
4862         struct extent_buffer *parent = path->nodes[level];
4863         u32 nritems;
4864         int ret;
4865
4866         nritems = btrfs_header_nritems(parent);
4867         if (slot != nritems - 1) {
4868                 if (level)
4869                         tree_mod_log_eb_move(root->fs_info, parent, slot,
4870                                              slot + 1, nritems - slot - 1);
4871                 memmove_extent_buffer(parent,
4872                               btrfs_node_key_ptr_offset(slot),
4873                               btrfs_node_key_ptr_offset(slot + 1),
4874                               sizeof(struct btrfs_key_ptr) *
4875                               (nritems - slot - 1));
4876         } else if (level) {
4877                 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4878                                               MOD_LOG_KEY_REMOVE, GFP_NOFS);
4879                 BUG_ON(ret < 0);
4880         }
4881
4882         nritems--;
4883         btrfs_set_header_nritems(parent, nritems);
4884         if (nritems == 0 && parent == root->node) {
4885                 BUG_ON(btrfs_header_level(root->node) != 1);
4886                 /* just turn the root into a leaf and break */
4887                 btrfs_set_header_level(root->node, 0);
4888         } else if (slot == 0) {
4889                 struct btrfs_disk_key disk_key;
4890
4891                 btrfs_node_key(parent, &disk_key, 0);
4892                 fixup_low_keys(root, path, &disk_key, level + 1);
4893         }
4894         btrfs_mark_buffer_dirty(parent);
4895 }
4896
4897 /*
4898  * a helper function to delete the leaf pointed to by path->slots[1] and
4899  * path->nodes[1].
4900  *
4901  * This deletes the pointer in path->nodes[1] and frees the leaf
4902  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4903  *
4904  * The path must have already been setup for deleting the leaf, including
4905  * all the proper balancing.  path->nodes[1] must be locked.
4906  */
4907 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4908                                     struct btrfs_root *root,
4909                                     struct btrfs_path *path,
4910                                     struct extent_buffer *leaf)
4911 {
4912         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4913         del_ptr(root, path, 1, path->slots[1]);
4914
4915         /*
4916          * btrfs_free_extent is expensive, we want to make sure we
4917          * aren't holding any locks when we call it
4918          */
4919         btrfs_unlock_up_safe(path, 0);
4920
4921         root_sub_used(root, leaf->len);
4922
4923         extent_buffer_get(leaf);
4924         btrfs_free_tree_block(trans, root, leaf, 0, 1);
4925         free_extent_buffer_stale(leaf);
4926 }
4927 /*
4928  * delete the item at the leaf level in path.  If that empties
4929  * the leaf, remove it from the tree
4930  */
4931 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4932                     struct btrfs_path *path, int slot, int nr)
4933 {
4934         struct extent_buffer *leaf;
4935         struct btrfs_item *item;
4936         int last_off;
4937         int dsize = 0;
4938         int ret = 0;
4939         int wret;
4940         int i;
4941         u32 nritems;
4942         struct btrfs_map_token token;
4943
4944         btrfs_init_map_token(&token);
4945
4946         leaf = path->nodes[0];
4947         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4948
4949         for (i = 0; i < nr; i++)
4950                 dsize += btrfs_item_size_nr(leaf, slot + i);
4951
4952         nritems = btrfs_header_nritems(leaf);
4953
4954         if (slot + nr != nritems) {
4955                 int data_end = leaf_data_end(root, leaf);
4956
4957                 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4958                               data_end + dsize,
4959                               btrfs_leaf_data(leaf) + data_end,
4960                               last_off - data_end);
4961
4962                 for (i = slot + nr; i < nritems; i++) {
4963                         u32 ioff;
4964
4965                         item = btrfs_item_nr(i);
4966                         ioff = btrfs_token_item_offset(leaf, item, &token);
4967                         btrfs_set_token_item_offset(leaf, item,
4968                                                     ioff + dsize, &token);
4969                 }
4970
4971                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4972                               btrfs_item_nr_offset(slot + nr),
4973                               sizeof(struct btrfs_item) *
4974                               (nritems - slot - nr));
4975         }
4976         btrfs_set_header_nritems(leaf, nritems - nr);
4977         nritems -= nr;
4978
4979         /* delete the leaf if we've emptied it */
4980         if (nritems == 0) {
4981                 if (leaf == root->node) {
4982                         btrfs_set_header_level(leaf, 0);
4983                 } else {
4984                         btrfs_set_path_blocking(path);
4985                         clean_tree_block(trans, root, leaf);
4986                         btrfs_del_leaf(trans, root, path, leaf);
4987                 }
4988         } else {
4989                 int used = leaf_space_used(leaf, 0, nritems);
4990                 if (slot == 0) {
4991                         struct btrfs_disk_key disk_key;
4992
4993                         btrfs_item_key(leaf, &disk_key, 0);
4994                         fixup_low_keys(root, path, &disk_key, 1);
4995                 }
4996
4997                 /* delete the leaf if it is mostly empty */
4998                 if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4999                         /* push_leaf_left fixes the path.
5000                          * make sure the path still points to our leaf
5001                          * for possible call to del_ptr below
5002                          */
5003                         slot = path->slots[1];
5004                         extent_buffer_get(leaf);
5005
5006                         btrfs_set_path_blocking(path);
5007                         wret = push_leaf_left(trans, root, path, 1, 1,
5008                                               1, (u32)-1);
5009                         if (wret < 0 && wret != -ENOSPC)
5010                                 ret = wret;
5011
5012                         if (path->nodes[0] == leaf &&
5013                             btrfs_header_nritems(leaf)) {
5014                                 wret = push_leaf_right(trans, root, path, 1,
5015                                                        1, 1, 0);
5016                                 if (wret < 0 && wret != -ENOSPC)
5017                                         ret = wret;
5018                         }
5019
5020                         if (btrfs_header_nritems(leaf) == 0) {
5021                                 path->slots[1] = slot;
5022                                 btrfs_del_leaf(trans, root, path, leaf);
5023                                 free_extent_buffer(leaf);
5024                                 ret = 0;
5025                         } else {
5026                                 /* if we're still in the path, make sure
5027                                  * we're dirty.  Otherwise, one of the
5028                                  * push_leaf functions must have already
5029                                  * dirtied this buffer
5030                                  */
5031                                 if (path->nodes[0] == leaf)
5032                                         btrfs_mark_buffer_dirty(leaf);
5033                                 free_extent_buffer(leaf);
5034                         }
5035                 } else {
5036                         btrfs_mark_buffer_dirty(leaf);
5037                 }
5038         }
5039         return ret;
5040 }
5041
5042 /*
5043  * search the tree again to find a leaf with lesser keys
5044  * returns 0 if it found something or 1 if there are no lesser leaves.
5045  * returns < 0 on io errors.
5046  *
5047  * This may release the path, and so you may lose any locks held at the
5048  * time you call it.
5049  */
5050 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5051 {
5052         struct btrfs_key key;
5053         struct btrfs_disk_key found_key;
5054         int ret;
5055
5056         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5057
5058         if (key.offset > 0) {
5059                 key.offset--;
5060         } else if (key.type > 0) {
5061                 key.type--;
5062                 key.offset = (u64)-1;
5063         } else if (key.objectid > 0) {
5064                 key.objectid--;
5065                 key.type = (u8)-1;
5066                 key.offset = (u64)-1;
5067         } else {
5068                 return 1;
5069         }
5070
5071         btrfs_release_path(path);
5072         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5073         if (ret < 0)
5074                 return ret;
5075         btrfs_item_key(path->nodes[0], &found_key, 0);
5076         ret = comp_keys(&found_key, &key);
5077         /*
5078          * We might have had an item with the previous key in the tree right
5079          * before we released our path. And after we released our path, that
5080          * item might have been pushed to the first slot (0) of the leaf we
5081          * were holding due to a tree balance. Alternatively, an item with the
5082          * previous key can exist as the only element of a leaf (big fat item).
5083          * Therefore account for these 2 cases, so that our callers (like
5084          * btrfs_previous_item) don't miss an existing item with a key matching
5085          * the previous key we computed above.
5086          */
5087         if (ret <= 0)
5088                 return 0;
5089         return 1;
5090 }
5091
5092 /*
5093  * A helper function to walk down the tree starting at min_key, and looking
5094  * for nodes or leaves that are have a minimum transaction id.
5095  * This is used by the btree defrag code, and tree logging
5096  *
5097  * This does not cow, but it does stuff the starting key it finds back
5098  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5099  * key and get a writable path.
5100  *
5101  * This does lock as it descends, and path->keep_locks should be set
5102  * to 1 by the caller.
5103  *
5104  * This honors path->lowest_level to prevent descent past a given level
5105  * of the tree.
5106  *
5107  * min_trans indicates the oldest transaction that you are interested
5108  * in walking through.  Any nodes or leaves older than min_trans are
5109  * skipped over (without reading them).
5110  *
5111  * returns zero if something useful was found, < 0 on error and 1 if there
5112  * was nothing in the tree that matched the search criteria.
5113  */
5114 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5115                          struct btrfs_path *path,
5116                          u64 min_trans)
5117 {
5118         struct extent_buffer *cur;
5119         struct btrfs_key found_key;
5120         int slot;
5121         int sret;
5122         u32 nritems;
5123         int level;
5124         int ret = 1;
5125         int keep_locks = path->keep_locks;
5126
5127         path->keep_locks = 1;
5128 again:
5129         cur = btrfs_read_lock_root_node(root);
5130         level = btrfs_header_level(cur);
5131         WARN_ON(path->nodes[level]);
5132         path->nodes[level] = cur;
5133         path->locks[level] = BTRFS_READ_LOCK;
5134
5135         if (btrfs_header_generation(cur) < min_trans) {
5136                 ret = 1;
5137                 goto out;
5138         }
5139         while (1) {
5140                 nritems = btrfs_header_nritems(cur);
5141                 level = btrfs_header_level(cur);
5142                 sret = bin_search(cur, min_key, level, &slot);
5143
5144                 /* at the lowest level, we're done, setup the path and exit */
5145                 if (level == path->lowest_level) {
5146                         if (slot >= nritems)
5147                                 goto find_next_key;
5148                         ret = 0;
5149                         path->slots[level] = slot;
5150                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5151                         goto out;
5152                 }
5153                 if (sret && slot > 0)
5154                         slot--;
5155                 /*
5156                  * check this node pointer against the min_trans parameters.
5157                  * If it is too old, old, skip to the next one.
5158                  */
5159                 while (slot < nritems) {
5160                         u64 gen;
5161
5162                         gen = btrfs_node_ptr_generation(cur, slot);
5163                         if (gen < min_trans) {
5164                                 slot++;
5165                                 continue;
5166                         }
5167                         break;
5168                 }
5169 find_next_key:
5170                 /*
5171                  * we didn't find a candidate key in this node, walk forward
5172                  * and find another one
5173                  */
5174                 if (slot >= nritems) {
5175                         path->slots[level] = slot;
5176                         btrfs_set_path_blocking(path);
5177                         sret = btrfs_find_next_key(root, path, min_key, level,
5178                                                   min_trans);
5179                         if (sret == 0) {
5180                                 btrfs_release_path(path);
5181                                 goto again;
5182                         } else {
5183                                 goto out;
5184                         }
5185                 }
5186                 /* save our key for returning back */
5187                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5188                 path->slots[level] = slot;
5189                 if (level == path->lowest_level) {
5190                         ret = 0;
5191                         goto out;
5192                 }
5193                 btrfs_set_path_blocking(path);
5194                 cur = read_node_slot(root, cur, slot);
5195                 BUG_ON(!cur); /* -ENOMEM */
5196
5197                 btrfs_tree_read_lock(cur);
5198
5199                 path->locks[level - 1] = BTRFS_READ_LOCK;
5200                 path->nodes[level - 1] = cur;
5201                 unlock_up(path, level, 1, 0, NULL);
5202                 btrfs_clear_path_blocking(path, NULL, 0);
5203         }
5204 out:
5205         path->keep_locks = keep_locks;
5206         if (ret == 0) {
5207                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5208                 btrfs_set_path_blocking(path);
5209                 memcpy(min_key, &found_key, sizeof(found_key));
5210         }
5211         return ret;
5212 }
5213
5214 static void tree_move_down(struct btrfs_root *root,
5215                            struct btrfs_path *path,
5216                            int *level, int root_level)
5217 {
5218         BUG_ON(*level == 0);
5219         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5220                                         path->slots[*level]);
5221         path->slots[*level - 1] = 0;
5222         (*level)--;
5223 }
5224
/*
 * Advance to the next slot at the current level; when the level is
 * exhausted, move up and advance there instead.  Returns 0 if we moved
 * only sideways, 1 if we had to go up at least one level, and -1 when
 * the root level itself is exhausted.
 */
static int tree_move_next_or_upnext(struct btrfs_root *root,
				    struct btrfs_path *path,
				    int *level, int root_level)
{
	int ret = 0;
	int nritems;
	nritems = btrfs_header_nritems(path->nodes[*level]);

	path->slots[*level]++;

	while (path->slots[*level] >= nritems) {
		if (*level == root_level)
			return -1;

		/* move upnext */
		path->slots[*level] = 0;
		free_extent_buffer(path->nodes[*level]);
		path->nodes[*level] = NULL;
		(*level)++;
		path->slots[*level]++;

		nritems = btrfs_header_nritems(path->nodes[*level]);
		ret = 1;
	}
	return ret;
}
5251
5252 /*
5253  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5254  * or down.
5255  */
5256 static int tree_advance(struct btrfs_root *root,
5257                         struct btrfs_path *path,
5258                         int *level, int root_level,
5259                         int allow_down,
5260                         struct btrfs_key *key)
5261 {
5262         int ret;
5263
5264         if (*level == 0 || !allow_down) {
5265                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5266         } else {
5267                 tree_move_down(root, path, level, root_level);
5268                 ret = 0;
5269         }
5270         if (ret >= 0) {
5271                 if (*level == 0)
5272                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5273                                         path->slots[*level]);
5274                 else
5275                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5276                                         path->slots[*level]);
5277         }
5278         return ret;
5279 }
5280
5281 static int tree_compare_item(struct btrfs_root *left_root,
5282                              struct btrfs_path *left_path,
5283                              struct btrfs_path *right_path,
5284                              char *tmp_buf)
5285 {
5286         int cmp;
5287         int len1, len2;
5288         unsigned long off1, off2;
5289
5290         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5291         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5292         if (len1 != len2)
5293                 return 1;
5294
5295         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5296         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5297                                 right_path->slots[0]);
5298
5299         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5300
5301         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5302         if (cmp)
5303                 return 1;
5304         return 0;
5305 }
5306
5307 #define ADVANCE 1
5308 #define ADVANCE_ONLY_NEXT -1
5309
5310 /*
5311  * This function compares two trees and calls the provided callback for
5312  * every changed/new/deleted item it finds.
5313  * If shared tree blocks are encountered, whole subtrees are skipped, making
5314  * the compare pretty fast on snapshotted subvolumes.
5315  *
5316  * This currently works on commit roots only. As commit roots are read only,
5317  * we don't do any locking. The commit roots are protected with transactions.
5318  * Transactions are ended and rejoined when a commit is tried in between.
5319  *
5320  * This function checks for modifications done to the trees while comparing.
5321  * If it detects a change, it aborts immediately.
5322  */
5323 int btrfs_compare_trees(struct btrfs_root *left_root,
5324                         struct btrfs_root *right_root,
5325                         btrfs_changed_cb_t changed_cb, void *ctx)
5326 {
5327         int ret;
5328         int cmp;
5329         struct btrfs_path *left_path = NULL;
5330         struct btrfs_path *right_path = NULL;
5331         struct btrfs_key left_key;
5332         struct btrfs_key right_key;
5333         char *tmp_buf = NULL;
5334         int left_root_level;
5335         int right_root_level;
5336         int left_level;
5337         int right_level;
5338         int left_end_reached;
5339         int right_end_reached;
5340         int advance_left;
5341         int advance_right;
5342         u64 left_blockptr;
5343         u64 right_blockptr;
5344         u64 left_gen;
5345         u64 right_gen;
5346
5347         left_path = btrfs_alloc_path();
5348         if (!left_path) {
5349                 ret = -ENOMEM;
5350                 goto out;
5351         }
5352         right_path = btrfs_alloc_path();
5353         if (!right_path) {
5354                 ret = -ENOMEM;
5355                 goto out;
5356         }
5357
5358         tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5359         if (!tmp_buf) {
5360                 ret = -ENOMEM;
5361                 goto out;
5362         }
5363
5364         left_path->search_commit_root = 1;
5365         left_path->skip_locking = 1;
5366         right_path->search_commit_root = 1;
5367         right_path->skip_locking = 1;
5368
5369         /*
5370          * Strategy: Go to the first items of both trees. Then do
5371          *
5372          * If both trees are at level 0
5373          *   Compare keys of current items
5374          *     If left < right treat left item as new, advance left tree
5375          *       and repeat
5376          *     If left > right treat right item as deleted, advance right tree
5377          *       and repeat
5378          *     If left == right do deep compare of items, treat as changed if
5379          *       needed, advance both trees and repeat
5380          * If both trees are at the same level but not at level 0
5381          *   Compare keys of current nodes/leafs
5382          *     If left < right advance left tree and repeat
5383          *     If left > right advance right tree and repeat
5384          *     If left == right compare blockptrs of the next nodes/leafs
5385          *       If they match advance both trees but stay at the same level
5386          *         and repeat
5387          *       If they don't match advance both trees while allowing to go
5388          *         deeper and repeat
5389          * If tree levels are different
5390          *   Advance the tree that needs it and repeat
5391          *
5392          * Advancing a tree means:
5393          *   If we are at level 0, try to go to the next slot. If that's not
5394          *   possible, go one level up and repeat. Stop when we found a level
5395          *   where we could go to the next slot. We may at this point be on a
5396          *   node or a leaf.
5397          *
5398          *   If we are not at level 0 and not on shared tree blocks, go one
5399          *   level deeper.
5400          *
5401          *   If we are not at level 0 and on shared tree blocks, go one slot to
5402          *   the right if possible or go up and right.
5403          */
5404
5405         down_read(&left_root->fs_info->commit_root_sem);
5406         left_level = btrfs_header_level(left_root->commit_root);
5407         left_root_level = left_level;
5408         left_path->nodes[left_level] = left_root->commit_root;
5409         extent_buffer_get(left_path->nodes[left_level]);
5410
5411         right_level = btrfs_header_level(right_root->commit_root);
5412         right_root_level = right_level;
5413         right_path->nodes[right_level] = right_root->commit_root;
5414         extent_buffer_get(right_path->nodes[right_level]);
5415         up_read(&left_root->fs_info->commit_root_sem);
5416
5417         if (left_level == 0)
5418                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5419                                 &left_key, left_path->slots[left_level]);
5420         else
5421                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5422                                 &left_key, left_path->slots[left_level]);
5423         if (right_level == 0)
5424                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5425                                 &right_key, right_path->slots[right_level]);
5426         else
5427                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5428                                 &right_key, right_path->slots[right_level]);
5429
5430         left_end_reached = right_end_reached = 0;
5431         advance_left = advance_right = 0;
5432
5433         while (1) {
5434                 if (advance_left && !left_end_reached) {
5435                         ret = tree_advance(left_root, left_path, &left_level,
5436                                         left_root_level,
5437                                         advance_left != ADVANCE_ONLY_NEXT,
5438                                         &left_key);
5439                         if (ret < 0)
5440                                 left_end_reached = ADVANCE;
5441                         advance_left = 0;
5442                 }
5443                 if (advance_right && !right_end_reached) {
5444                         ret = tree_advance(right_root, right_path, &right_level,
5445                                         right_root_level,
5446                                         advance_right != ADVANCE_ONLY_NEXT,
5447                                         &right_key);
5448                         if (ret < 0)
5449                                 right_end_reached = ADVANCE;
5450                         advance_right = 0;
5451                 }
5452
5453                 if (left_end_reached && right_end_reached) {
5454                         ret = 0;
5455                         goto out;
5456                 } else if (left_end_reached) {
5457                         if (right_level == 0) {
5458                                 ret = changed_cb(left_root, right_root,
5459                                                 left_path, right_path,
5460                                                 &right_key,
5461                                                 BTRFS_COMPARE_TREE_DELETED,
5462                                                 ctx);
5463                                 if (ret < 0)
5464                                         goto out;
5465                         }
5466                         advance_right = ADVANCE;
5467                         continue;
5468                 } else if (right_end_reached) {
5469                         if (left_level == 0) {
5470                                 ret = changed_cb(left_root, right_root,
5471                                                 left_path, right_path,
5472                                                 &left_key,
5473                                                 BTRFS_COMPARE_TREE_NEW,
5474                                                 ctx);
5475                                 if (ret < 0)
5476                                         goto out;
5477                         }
5478                         advance_left = ADVANCE;
5479                         continue;
5480                 }
5481
5482                 if (left_level == 0 && right_level == 0) {
5483                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5484                         if (cmp < 0) {
5485                                 ret = changed_cb(left_root, right_root,
5486                                                 left_path, right_path,
5487                                                 &left_key,
5488                                                 BTRFS_COMPARE_TREE_NEW,
5489                                                 ctx);
5490                                 if (ret < 0)
5491                                         goto out;
5492                                 advance_left = ADVANCE;
5493                         } else if (cmp > 0) {
5494                                 ret = changed_cb(left_root, right_root,
5495                                                 left_path, right_path,
5496                                                 &right_key,
5497                                                 BTRFS_COMPARE_TREE_DELETED,
5498                                                 ctx);
5499                                 if (ret < 0)
5500                                         goto out;
5501                                 advance_right = ADVANCE;
5502                         } else {
5503                                 enum btrfs_compare_tree_result result;
5504
5505                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5506                                 ret = tree_compare_item(left_root, left_path,
5507                                                 right_path, tmp_buf);
5508                                 if (ret)
5509                                         result = BTRFS_COMPARE_TREE_CHANGED;
5510                                 else
5511                                         result = BTRFS_COMPARE_TREE_SAME;
5512                                 ret = changed_cb(left_root, right_root,
5513                                                  left_path, right_path,
5514                                                  &left_key, result, ctx);
5515                                 if (ret < 0)
5516                                         goto out;
5517                                 advance_left = ADVANCE;
5518                                 advance_right = ADVANCE;
5519                         }
5520                 } else if (left_level == right_level) {
5521                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5522                         if (cmp < 0) {
5523                                 advance_left = ADVANCE;
5524                         } else if (cmp > 0) {
5525                                 advance_right = ADVANCE;
5526                         } else {
5527                                 left_blockptr = btrfs_node_blockptr(
5528                                                 left_path->nodes[left_level],
5529                                                 left_path->slots[left_level]);
5530                                 right_blockptr = btrfs_node_blockptr(
5531                                                 right_path->nodes[right_level],
5532                                                 right_path->slots[right_level]);
5533                                 left_gen = btrfs_node_ptr_generation(
5534                                                 left_path->nodes[left_level],
5535                                                 left_path->slots[left_level]);
5536                                 right_gen = btrfs_node_ptr_generation(
5537                                                 right_path->nodes[right_level],
5538                                                 right_path->slots[right_level]);
5539                                 if (left_blockptr == right_blockptr &&
5540                                     left_gen == right_gen) {
5541                                         /*
5542                                          * As we're on a shared block, don't
5543                                          * allow to go deeper.
5544                                          */
5545                                         advance_left = ADVANCE_ONLY_NEXT;
5546                                         advance_right = ADVANCE_ONLY_NEXT;
5547                                 } else {
5548                                         advance_left = ADVANCE;
5549                                         advance_right = ADVANCE;
5550                                 }
5551                         }
5552                 } else if (left_level < right_level) {
5553                         advance_right = ADVANCE;
5554                 } else {
5555                         advance_left = ADVANCE;
5556                 }
5557         }
5558
5559 out:
5560         btrfs_free_path(left_path);
5561         btrfs_free_path(right_path);
5562         kfree(tmp_buf);
5563         return ret;
5564 }
5565
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* The caller must have searched with keep_locks set (see header). */
	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		/* Nothing cached at this level: no higher key exists. */
		if (!path->nodes[level])
			return 1;

		/* Candidate is the slot right after the current position. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			/* Ran off the end with no parent to climb into. */
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * Parent is still locked, so it is safe to just move
			 * up one level and keep scanning there.
			 */
			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			/*
			 * Parent is unlocked; it may have changed under us.
			 * Remember the last key of this node and re-search
			 * down to this level to revalidate the path.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			/* Restore lowest_level whether or not search failed. */
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match: step past the key we already returned. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/*
			 * Skip pointers whose generation is older than
			 * min_trans; keep scanning within this node.
			 */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5641
5642 /*
5643  * search the tree again to find a leaf with greater keys
5644  * returns 0 if it found something or 1 if there are no greater leaves.
5645  * returns < 0 on io errors.
5646  */
5647 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5648 {
5649         return btrfs_next_old_leaf(root, path, 0);
5650 }
5651
/*
 * Advance the path to the leaf that follows the current one, optionally
 * against an old version of the tree selected by @time_seq (0 means the
 * current tree).
 *
 * Returns 0 if the path now points at the next leaf, 1 if there are no
 * greater leaves, and < 0 on errors.  On -EAGAIN from the block reader
 * the whole walk is retried from scratch via the 'again' label.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	/* Saved so the caller's spinning/blocking preference is restored. */
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Key of the last item in the current leaf: our re-search anchor. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Climb until we find a level with a slot to the right of us. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			/* This node is exhausted; try the parent. */
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* Drop the buffer picked up on a previous loop iteration. */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* Try-lock failed: block and take it for real. */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* Now descend along the leftmost edge down to the leaf. */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		/* Replace the old node at this level with the new subtree. */
		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* Drop every lock except the leaf's and restore caller's mode. */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5824
5825 /*
5826  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5827  * searching until it gets past min_objectid or finds an item of 'type'
5828  *
5829  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5830  */
5831 int btrfs_previous_item(struct btrfs_root *root,
5832                         struct btrfs_path *path, u64 min_objectid,
5833                         int type)
5834 {
5835         struct btrfs_key found_key;
5836         struct extent_buffer *leaf;
5837         u32 nritems;
5838         int ret;
5839
5840         while (1) {
5841                 if (path->slots[0] == 0) {
5842                         btrfs_set_path_blocking(path);
5843                         ret = btrfs_prev_leaf(root, path);
5844                         if (ret != 0)
5845                                 return ret;
5846                 } else {
5847                         path->slots[0]--;
5848                 }
5849                 leaf = path->nodes[0];
5850                 nritems = btrfs_header_nritems(leaf);
5851                 if (nritems == 0)
5852                         return 1;
5853                 if (path->slots[0] == nritems)
5854                         path->slots[0]--;
5855
5856                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5857                 if (found_key.objectid < min_objectid)
5858                         break;
5859                 if (found_key.type == type)
5860                         return 0;
5861                 if (found_key.objectid == min_objectid &&
5862                     found_key.type < type)
5863                         break;
5864         }
5865         return 1;
5866 }
5867
5868 /*
5869  * search in extent tree to find a previous Metadata/Data extent item with
5870  * min objecitd.
5871  *
5872  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5873  */
5874 int btrfs_previous_extent_item(struct btrfs_root *root,
5875                         struct btrfs_path *path, u64 min_objectid)
5876 {
5877         struct btrfs_key found_key;
5878         struct extent_buffer *leaf;
5879         u32 nritems;
5880         int ret;
5881
5882         while (1) {
5883                 if (path->slots[0] == 0) {
5884                         btrfs_set_path_blocking(path);
5885                         ret = btrfs_prev_leaf(root, path);
5886                         if (ret != 0)
5887                                 return ret;
5888                 } else {
5889                         path->slots[0]--;
5890                 }
5891                 leaf = path->nodes[0];
5892                 nritems = btrfs_header_nritems(leaf);
5893                 if (nritems == 0)
5894                         return 1;
5895                 if (path->slots[0] == nritems)
5896                         path->slots[0]--;
5897
5898                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5899                 if (found_key.objectid < min_objectid)
5900                         break;
5901                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5902                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5903                         return 0;
5904                 if (found_key.objectid == min_objectid &&
5905                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5906                         break;
5907         }
5908         return 1;
5909 }