powerpc/powernv: Check sysfs size before copying
[cascardo/linux.git] / drivers / md / dm-thin-metadata.c
1 /*
2  * Copyright (C) 2011-2012 Red Hat, Inc.
3  *
4  * This file is released under the GPL.
5  */
6
7 #include "dm-thin-metadata.h"
8 #include "persistent-data/dm-btree.h"
9 #include "persistent-data/dm-space-map.h"
10 #include "persistent-data/dm-space-map-disk.h"
11 #include "persistent-data/dm-transaction-manager.h"
12
13 #include <linux/list.h>
14 #include <linux/device-mapper.h>
15 #include <linux/workqueue.h>
16
17 /*--------------------------------------------------------------------------
18  * As far as the metadata goes, there is:
19  *
20  * - A superblock in block zero, taking up fewer than 512 bytes for
21  *   atomic writes.
22  *
23  * - A space map managing the metadata blocks.
24  *
25  * - A space map managing the data blocks.
26  *
27  * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28  *
29  * - A hierarchical btree, with 2 levels which effectively maps (thin
30  *   dev id, virtual block) -> block_time.  Block time is a 64-bit
31  *   field holding the time in the low 24 bits, and the block in the
32  *   top 40 bits.
33  *
34  * BTrees consist solely of btree_nodes, that fill a block.  Some are
35  * internal nodes, as such their values are a __le64 pointing to other
36  * nodes.  Leaf nodes can store data of any reasonable size (ie. much
37  * smaller than the block size).  The nodes consist of the header,
38  * followed by an array of keys, followed by an array of values.  We have
39  * to binary search on the keys so they're all held together to help the
40  * cpu cache.
41  *
42  * Space maps have 2 btrees:
43  *
44  * - One maps a uint64_t onto a struct index_entry.  Which points to a
45  *   bitmap block, and has some details about how many free entries there
46  *   are etc.
47  *
48  * - The bitmap blocks have a header (for the checksum).  Then the rest
49  *   of the block is pairs of bits.  With the meaning being:
50  *
51  *   0 - ref count is 0
52  *   1 - ref count is 1
53  *   2 - ref count is 2
54  *   3 - ref count is higher than 2
55  *
56  * - If the count is higher than 2 then the ref count is entered in a
57  *   second btree that directly maps the block_address to a uint32_t ref
58  *   count.
59  *
60  * The space map metadata variant doesn't have a bitmaps btree.  Instead
61  * it has a single block's worth of index_entries.  This avoids
62  * recursive issues with the bitmap btree needing to allocate space in
63  * order to insert.  With a small data block size such as 64k the
64  * metadata supports data devices that are hundreds of terabytes.
65  *
66  * The space maps allocate space linearly from front to back.  Space that
67  * is freed in a transaction is never recycled within that transaction.
68  * To try and avoid fragmenting _free_ space the allocator always goes
69  * back and fills in gaps.
70  *
71  * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72  * from the block manager.
73  *--------------------------------------------------------------------------*/
74
75 #define DM_MSG_PREFIX   "thin metadata"
76
77 #define THIN_SUPERBLOCK_MAGIC 27022010
78 #define THIN_SUPERBLOCK_LOCATION 0
79 #define THIN_VERSION 2
80 #define THIN_METADATA_CACHE_SIZE 64
81 #define SECTOR_TO_BLOCK_SHIFT 3
82
83 /*
84  *  3 for btree insert +
85  *  2 for btree lookup used within space map
86  */
87 #define THIN_MAX_CONCURRENT_LOCKS 5
88
89 /* This should be plenty */
90 #define SPACE_MAP_ROOT_SIZE 128
91
92 /*
93  * Little endian on-disk superblock and device details.
94  */
95 struct thin_disk_superblock {
96         __le32 csum;    /* Checksum of superblock except for this field. */
97         __le32 flags;
98         __le64 blocknr; /* This block number, dm_block_t. */
99
100         __u8 uuid[16];
101         __le64 magic;
102         __le32 version;
103         __le32 time;
104
105         __le64 trans_id;
106
107         /*
108          * Root held by userspace transactions.
109          */
110         __le64 held_root;
111
112         __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
113         __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
114
115         /*
116          * 2-level btree mapping (dev_id, (dev block, time)) -> data block
117          */
118         __le64 data_mapping_root;
119
120         /*
121          * Device detail root mapping dev_id -> device_details
122          */
123         __le64 device_details_root;
124
125         __le32 data_block_size;         /* In 512-byte sectors. */
126
127         __le32 metadata_block_size;     /* In 512-byte sectors. */
128         __le64 metadata_nr_blocks;
129
130         __le32 compat_flags;
131         __le32 compat_ro_flags;
132         __le32 incompat_flags;
133 } __packed;
134
135 struct disk_device_details {
136         __le64 mapped_blocks;
137         __le64 transaction_id;          /* When created. */
138         __le32 creation_time;
139         __le32 snapshotted_time;
140 } __packed;
141
142 struct dm_pool_metadata {
143         struct hlist_node hash;
144
145         struct block_device *bdev;
146         struct dm_block_manager *bm;
147         struct dm_space_map *metadata_sm;
148         struct dm_space_map *data_sm;
149         struct dm_transaction_manager *tm;
150         struct dm_transaction_manager *nb_tm;
151
152         /*
153          * Two-level btree.
154          * First level holds thin_dev_t.
155          * Second level holds mappings.
156          */
157         struct dm_btree_info info;
158
159         /*
160          * Non-blocking version of the above.
161          */
162         struct dm_btree_info nb_info;
163
164         /*
165          * Just the top level for deleting whole devices.
166          */
167         struct dm_btree_info tl_info;
168
169         /*
170          * Just the bottom level for creating new devices.
171          */
172         struct dm_btree_info bl_info;
173
174         /*
175          * Describes the device details btree.
176          */
177         struct dm_btree_info details_info;
178
179         struct rw_semaphore root_lock;
180         uint32_t time;
181         dm_block_t root;
182         dm_block_t details_root;
183         struct list_head thin_devices;
184         uint64_t trans_id;
185         unsigned long flags;
186         sector_t data_block_size;
187         bool read_only:1;
188
189         /*
190          * Set if a transaction has to be aborted but the attempt to roll back
191          * to the previous (good) transaction failed.  The only pool metadata
192          * operation possible in this state is the closing of the device.
193          */
194         bool fail_io:1;
195
196         /*
197          * Reading the space map roots can fail, so we read it into these
198          * buffers before the superblock is locked and updated.
199          */
200         __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
201         __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
202 };
203
204 struct dm_thin_device {
205         struct list_head list;
206         struct dm_pool_metadata *pmd;
207         dm_thin_id id;
208
209         int open_count;
210         bool changed:1;
211         bool aborted_with_changes:1;
212         uint64_t mapped_blocks;
213         uint64_t transaction_id;
214         uint32_t creation_time;
215         uint32_t snapshotted_time;
216 };
217
218 /*----------------------------------------------------------------
219  * superblock validator
220  *--------------------------------------------------------------*/
221
222 #define SUPERBLOCK_CSUM_XOR 160774
223
224 static void sb_prepare_for_write(struct dm_block_validator *v,
225                                  struct dm_block *b,
226                                  size_t block_size)
227 {
228         struct thin_disk_superblock *disk_super = dm_block_data(b);
229
230         disk_super->blocknr = cpu_to_le64(dm_block_location(b));
231         disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
232                                                       block_size - sizeof(__le32),
233                                                       SUPERBLOCK_CSUM_XOR));
234 }
235
236 static int sb_check(struct dm_block_validator *v,
237                     struct dm_block *b,
238                     size_t block_size)
239 {
240         struct thin_disk_superblock *disk_super = dm_block_data(b);
241         __le32 csum_le;
242
243         if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
244                 DMERR("sb_check failed: blocknr %llu: "
245                       "wanted %llu", le64_to_cpu(disk_super->blocknr),
246                       (unsigned long long)dm_block_location(b));
247                 return -ENOTBLK;
248         }
249
250         if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
251                 DMERR("sb_check failed: magic %llu: "
252                       "wanted %llu", le64_to_cpu(disk_super->magic),
253                       (unsigned long long)THIN_SUPERBLOCK_MAGIC);
254                 return -EILSEQ;
255         }
256
257         csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
258                                              block_size - sizeof(__le32),
259                                              SUPERBLOCK_CSUM_XOR));
260         if (csum_le != disk_super->csum) {
261                 DMERR("sb_check failed: csum %u: wanted %u",
262                       le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
263                 return -EILSEQ;
264         }
265
266         return 0;
267 }
268
269 static struct dm_block_validator sb_validator = {
270         .name = "superblock",
271         .prepare_for_write = sb_prepare_for_write,
272         .check = sb_check
273 };
274
275 /*----------------------------------------------------------------
276  * Methods for the btree value types
277  *--------------------------------------------------------------*/
278
279 static uint64_t pack_block_time(dm_block_t b, uint32_t t)
280 {
281         return (b << 24) | t;
282 }
283
284 static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
285 {
286         *b = v >> 24;
287         *t = v & ((1 << 24) - 1);
288 }
289
290 static void data_block_inc(void *context, const void *value_le)
291 {
292         struct dm_space_map *sm = context;
293         __le64 v_le;
294         uint64_t b;
295         uint32_t t;
296
297         memcpy(&v_le, value_le, sizeof(v_le));
298         unpack_block_time(le64_to_cpu(v_le), &b, &t);
299         dm_sm_inc_block(sm, b);
300 }
301
302 static void data_block_dec(void *context, const void *value_le)
303 {
304         struct dm_space_map *sm = context;
305         __le64 v_le;
306         uint64_t b;
307         uint32_t t;
308
309         memcpy(&v_le, value_le, sizeof(v_le));
310         unpack_block_time(le64_to_cpu(v_le), &b, &t);
311         dm_sm_dec_block(sm, b);
312 }
313
314 static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
315 {
316         __le64 v1_le, v2_le;
317         uint64_t b1, b2;
318         uint32_t t;
319
320         memcpy(&v1_le, value1_le, sizeof(v1_le));
321         memcpy(&v2_le, value2_le, sizeof(v2_le));
322         unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
323         unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
324
325         return b1 == b2;
326 }
327
328 static void subtree_inc(void *context, const void *value)
329 {
330         struct dm_btree_info *info = context;
331         __le64 root_le;
332         uint64_t root;
333
334         memcpy(&root_le, value, sizeof(root_le));
335         root = le64_to_cpu(root_le);
336         dm_tm_inc(info->tm, root);
337 }
338
339 static void subtree_dec(void *context, const void *value)
340 {
341         struct dm_btree_info *info = context;
342         __le64 root_le;
343         uint64_t root;
344
345         memcpy(&root_le, value, sizeof(root_le));
346         root = le64_to_cpu(root_le);
347         if (dm_btree_del(info, root))
348                 DMERR("btree delete failed\n");
349 }
350
351 static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
352 {
353         __le64 v1_le, v2_le;
354         memcpy(&v1_le, value1_le, sizeof(v1_le));
355         memcpy(&v2_le, value2_le, sizeof(v2_le));
356
357         return v1_le == v2_le;
358 }
359
360 /*----------------------------------------------------------------*/
361
362 static int superblock_lock_zero(struct dm_pool_metadata *pmd,
363                                 struct dm_block **sblock)
364 {
365         return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
366                                      &sb_validator, sblock);
367 }
368
369 static int superblock_lock(struct dm_pool_metadata *pmd,
370                            struct dm_block **sblock)
371 {
372         return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
373                                 &sb_validator, sblock);
374 }
375
376 static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
377 {
378         int r;
379         unsigned i;
380         struct dm_block *b;
381         __le64 *data_le, zero = cpu_to_le64(0);
382         unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
383
384         /*
385          * We can't use a validator here - it may be all zeroes.
386          */
387         r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
388         if (r)
389                 return r;
390
391         data_le = dm_block_data(b);
392         *result = 1;
393         for (i = 0; i < block_size; i++) {
394                 if (data_le[i] != zero) {
395                         *result = 0;
396                         break;
397                 }
398         }
399
400         return dm_bm_unlock(b);
401 }
402
403 static void __setup_btree_details(struct dm_pool_metadata *pmd)
404 {
405         pmd->info.tm = pmd->tm;
406         pmd->info.levels = 2;
407         pmd->info.value_type.context = pmd->data_sm;
408         pmd->info.value_type.size = sizeof(__le64);
409         pmd->info.value_type.inc = data_block_inc;
410         pmd->info.value_type.dec = data_block_dec;
411         pmd->info.value_type.equal = data_block_equal;
412
413         memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
414         pmd->nb_info.tm = pmd->nb_tm;
415
416         pmd->tl_info.tm = pmd->tm;
417         pmd->tl_info.levels = 1;
418         pmd->tl_info.value_type.context = &pmd->bl_info;
419         pmd->tl_info.value_type.size = sizeof(__le64);
420         pmd->tl_info.value_type.inc = subtree_inc;
421         pmd->tl_info.value_type.dec = subtree_dec;
422         pmd->tl_info.value_type.equal = subtree_equal;
423
424         pmd->bl_info.tm = pmd->tm;
425         pmd->bl_info.levels = 1;
426         pmd->bl_info.value_type.context = pmd->data_sm;
427         pmd->bl_info.value_type.size = sizeof(__le64);
428         pmd->bl_info.value_type.inc = data_block_inc;
429         pmd->bl_info.value_type.dec = data_block_dec;
430         pmd->bl_info.value_type.equal = data_block_equal;
431
432         pmd->details_info.tm = pmd->tm;
433         pmd->details_info.levels = 1;
434         pmd->details_info.value_type.context = NULL;
435         pmd->details_info.value_type.size = sizeof(struct disk_device_details);
436         pmd->details_info.value_type.inc = NULL;
437         pmd->details_info.value_type.dec = NULL;
438         pmd->details_info.value_type.equal = NULL;
439 }
440
441 static int save_sm_roots(struct dm_pool_metadata *pmd)
442 {
443         int r;
444         size_t len;
445
446         r = dm_sm_root_size(pmd->metadata_sm, &len);
447         if (r < 0)
448                 return r;
449
450         r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
451         if (r < 0)
452                 return r;
453
454         r = dm_sm_root_size(pmd->data_sm, &len);
455         if (r < 0)
456                 return r;
457
458         return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
459 }
460
461 static void copy_sm_roots(struct dm_pool_metadata *pmd,
462                           struct thin_disk_superblock *disk)
463 {
464         memcpy(&disk->metadata_space_map_root,
465                &pmd->metadata_space_map_root,
466                sizeof(pmd->metadata_space_map_root));
467
468         memcpy(&disk->data_space_map_root,
469                &pmd->data_space_map_root,
470                sizeof(pmd->data_space_map_root));
471 }
472
473 static int __write_initial_superblock(struct dm_pool_metadata *pmd)
474 {
475         int r;
476         struct dm_block *sblock;
477         struct thin_disk_superblock *disk_super;
478         sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
479
480         if (bdev_size > THIN_METADATA_MAX_SECTORS)
481                 bdev_size = THIN_METADATA_MAX_SECTORS;
482
483         r = dm_sm_commit(pmd->data_sm);
484         if (r < 0)
485                 return r;
486
487         r = save_sm_roots(pmd);
488         if (r < 0)
489                 return r;
490
491         r = dm_tm_pre_commit(pmd->tm);
492         if (r < 0)
493                 return r;
494
495         r = superblock_lock_zero(pmd, &sblock);
496         if (r)
497                 return r;
498
499         disk_super = dm_block_data(sblock);
500         disk_super->flags = 0;
501         memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
502         disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
503         disk_super->version = cpu_to_le32(THIN_VERSION);
504         disk_super->time = 0;
505         disk_super->trans_id = 0;
506         disk_super->held_root = 0;
507
508         copy_sm_roots(pmd, disk_super);
509
510         disk_super->data_mapping_root = cpu_to_le64(pmd->root);
511         disk_super->device_details_root = cpu_to_le64(pmd->details_root);
512         disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
513         disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
514         disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
515
516         return dm_tm_commit(pmd->tm, sblock);
517 }
518
519 static int __format_metadata(struct dm_pool_metadata *pmd)
520 {
521         int r;
522
523         r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
524                                  &pmd->tm, &pmd->metadata_sm);
525         if (r < 0) {
526                 DMERR("tm_create_with_sm failed");
527                 return r;
528         }
529
530         pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
531         if (IS_ERR(pmd->data_sm)) {
532                 DMERR("sm_disk_create failed");
533                 r = PTR_ERR(pmd->data_sm);
534                 goto bad_cleanup_tm;
535         }
536
537         pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
538         if (!pmd->nb_tm) {
539                 DMERR("could not create non-blocking clone tm");
540                 r = -ENOMEM;
541                 goto bad_cleanup_data_sm;
542         }
543
544         __setup_btree_details(pmd);
545
546         r = dm_btree_empty(&pmd->info, &pmd->root);
547         if (r < 0)
548                 goto bad_cleanup_nb_tm;
549
550         r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
551         if (r < 0) {
552                 DMERR("couldn't create devices root");
553                 goto bad_cleanup_nb_tm;
554         }
555
556         r = __write_initial_superblock(pmd);
557         if (r)
558                 goto bad_cleanup_nb_tm;
559
560         return 0;
561
562 bad_cleanup_nb_tm:
563         dm_tm_destroy(pmd->nb_tm);
564 bad_cleanup_data_sm:
565         dm_sm_destroy(pmd->data_sm);
566 bad_cleanup_tm:
567         dm_tm_destroy(pmd->tm);
568         dm_sm_destroy(pmd->metadata_sm);
569
570         return r;
571 }
572
573 static int __check_incompat_features(struct thin_disk_superblock *disk_super,
574                                      struct dm_pool_metadata *pmd)
575 {
576         uint32_t features;
577
578         features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
579         if (features) {
580                 DMERR("could not access metadata due to unsupported optional features (%lx).",
581                       (unsigned long)features);
582                 return -EINVAL;
583         }
584
585         /*
586          * Check for read-only metadata to skip the following RDWR checks.
587          */
588         if (get_disk_ro(pmd->bdev->bd_disk))
589                 return 0;
590
591         features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
592         if (features) {
593                 DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
594                       (unsigned long)features);
595                 return -EINVAL;
596         }
597
598         return 0;
599 }
600
601 static int __open_metadata(struct dm_pool_metadata *pmd)
602 {
603         int r;
604         struct dm_block *sblock;
605         struct thin_disk_superblock *disk_super;
606
607         r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
608                             &sb_validator, &sblock);
609         if (r < 0) {
610                 DMERR("couldn't read superblock");
611                 return r;
612         }
613
614         disk_super = dm_block_data(sblock);
615
616         r = __check_incompat_features(disk_super, pmd);
617         if (r < 0)
618                 goto bad_unlock_sblock;
619
620         r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
621                                disk_super->metadata_space_map_root,
622                                sizeof(disk_super->metadata_space_map_root),
623                                &pmd->tm, &pmd->metadata_sm);
624         if (r < 0) {
625                 DMERR("tm_open_with_sm failed");
626                 goto bad_unlock_sblock;
627         }
628
629         pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
630                                        sizeof(disk_super->data_space_map_root));
631         if (IS_ERR(pmd->data_sm)) {
632                 DMERR("sm_disk_open failed");
633                 r = PTR_ERR(pmd->data_sm);
634                 goto bad_cleanup_tm;
635         }
636
637         pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
638         if (!pmd->nb_tm) {
639                 DMERR("could not create non-blocking clone tm");
640                 r = -ENOMEM;
641                 goto bad_cleanup_data_sm;
642         }
643
644         __setup_btree_details(pmd);
645         return dm_bm_unlock(sblock);
646
647 bad_cleanup_data_sm:
648         dm_sm_destroy(pmd->data_sm);
649 bad_cleanup_tm:
650         dm_tm_destroy(pmd->tm);
651         dm_sm_destroy(pmd->metadata_sm);
652 bad_unlock_sblock:
653         dm_bm_unlock(sblock);
654
655         return r;
656 }
657
658 static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
659 {
660         int r, unformatted;
661
662         r = __superblock_all_zeroes(pmd->bm, &unformatted);
663         if (r)
664                 return r;
665
666         if (unformatted)
667                 return format_device ? __format_metadata(pmd) : -EPERM;
668
669         return __open_metadata(pmd);
670 }
671
672 static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
673 {
674         int r;
675
676         pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
677                                           THIN_METADATA_CACHE_SIZE,
678                                           THIN_MAX_CONCURRENT_LOCKS);
679         if (IS_ERR(pmd->bm)) {
680                 DMERR("could not create block manager");
681                 return PTR_ERR(pmd->bm);
682         }
683
684         r = __open_or_format_metadata(pmd, format_device);
685         if (r)
686                 dm_block_manager_destroy(pmd->bm);
687
688         return r;
689 }
690
691 static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
692 {
693         dm_sm_destroy(pmd->data_sm);
694         dm_sm_destroy(pmd->metadata_sm);
695         dm_tm_destroy(pmd->nb_tm);
696         dm_tm_destroy(pmd->tm);
697         dm_block_manager_destroy(pmd->bm);
698 }
699
700 static int __begin_transaction(struct dm_pool_metadata *pmd)
701 {
702         int r;
703         struct thin_disk_superblock *disk_super;
704         struct dm_block *sblock;
705
706         /*
707          * We re-read the superblock every time.  Shouldn't need to do this
708          * really.
709          */
710         r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
711                             &sb_validator, &sblock);
712         if (r)
713                 return r;
714
715         disk_super = dm_block_data(sblock);
716         pmd->time = le32_to_cpu(disk_super->time);
717         pmd->root = le64_to_cpu(disk_super->data_mapping_root);
718         pmd->details_root = le64_to_cpu(disk_super->device_details_root);
719         pmd->trans_id = le64_to_cpu(disk_super->trans_id);
720         pmd->flags = le32_to_cpu(disk_super->flags);
721         pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
722
723         dm_bm_unlock(sblock);
724         return 0;
725 }
726
727 static int __write_changed_details(struct dm_pool_metadata *pmd)
728 {
729         int r;
730         struct dm_thin_device *td, *tmp;
731         struct disk_device_details details;
732         uint64_t key;
733
734         list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
735                 if (!td->changed)
736                         continue;
737
738                 key = td->id;
739
740                 details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
741                 details.transaction_id = cpu_to_le64(td->transaction_id);
742                 details.creation_time = cpu_to_le32(td->creation_time);
743                 details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
744                 __dm_bless_for_disk(&details);
745
746                 r = dm_btree_insert(&pmd->details_info, pmd->details_root,
747                                     &key, &details, &pmd->details_root);
748                 if (r)
749                         return r;
750
751                 if (td->open_count)
752                         td->changed = 0;
753                 else {
754                         list_del(&td->list);
755                         kfree(td);
756                 }
757         }
758
759         return 0;
760 }
761
762 static int __commit_transaction(struct dm_pool_metadata *pmd)
763 {
764         int r;
765         size_t metadata_len, data_len;
766         struct thin_disk_superblock *disk_super;
767         struct dm_block *sblock;
768
769         /*
770          * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
771          */
772         BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
773
774         r = __write_changed_details(pmd);
775         if (r < 0)
776                 return r;
777
778         r = dm_sm_commit(pmd->data_sm);
779         if (r < 0)
780                 return r;
781
782         r = dm_tm_pre_commit(pmd->tm);
783         if (r < 0)
784                 return r;
785
786         r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
787         if (r < 0)
788                 return r;
789
790         r = dm_sm_root_size(pmd->data_sm, &data_len);
791         if (r < 0)
792                 return r;
793
794         r = save_sm_roots(pmd);
795         if (r < 0)
796                 return r;
797
798         r = superblock_lock(pmd, &sblock);
799         if (r)
800                 return r;
801
802         disk_super = dm_block_data(sblock);
803         disk_super->time = cpu_to_le32(pmd->time);
804         disk_super->data_mapping_root = cpu_to_le64(pmd->root);
805         disk_super->device_details_root = cpu_to_le64(pmd->details_root);
806         disk_super->trans_id = cpu_to_le64(pmd->trans_id);
807         disk_super->flags = cpu_to_le32(pmd->flags);
808
809         copy_sm_roots(pmd, disk_super);
810
811         return dm_tm_commit(pmd->tm, sblock);
812 }
813
814 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
815                                                sector_t data_block_size,
816                                                bool format_device)
817 {
818         int r;
819         struct dm_pool_metadata *pmd;
820
821         pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
822         if (!pmd) {
823                 DMERR("could not allocate metadata struct");
824                 return ERR_PTR(-ENOMEM);
825         }
826
827         init_rwsem(&pmd->root_lock);
828         pmd->time = 0;
829         INIT_LIST_HEAD(&pmd->thin_devices);
830         pmd->read_only = false;
831         pmd->fail_io = false;
832         pmd->bdev = bdev;
833         pmd->data_block_size = data_block_size;
834
835         r = __create_persistent_data_objects(pmd, format_device);
836         if (r) {
837                 kfree(pmd);
838                 return ERR_PTR(r);
839         }
840
841         r = __begin_transaction(pmd);
842         if (r < 0) {
843                 if (dm_pool_metadata_close(pmd) < 0)
844                         DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
845                 return ERR_PTR(r);
846         }
847
848         return pmd;
849 }
850
851 int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
852 {
853         int r;
854         unsigned open_devices = 0;
855         struct dm_thin_device *td, *tmp;
856
857         down_read(&pmd->root_lock);
858         list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
859                 if (td->open_count)
860                         open_devices++;
861                 else {
862                         list_del(&td->list);
863                         kfree(td);
864                 }
865         }
866         up_read(&pmd->root_lock);
867
868         if (open_devices) {
869                 DMERR("attempt to close pmd when %u device(s) are still open",
870                        open_devices);
871                 return -EBUSY;
872         }
873
874         if (!pmd->read_only && !pmd->fail_io) {
875                 r = __commit_transaction(pmd);
876                 if (r < 0)
877                         DMWARN("%s: __commit_transaction() failed, error = %d",
878                                __func__, r);
879         }
880
881         if (!pmd->fail_io)
882                 __destroy_persistent_data_objects(pmd);
883
884         kfree(pmd);
885         return 0;
886 }
887
888 /*
889  * __open_device: Returns @td corresponding to device with id @dev,
890  * creating it if @create is set and incrementing @td->open_count.
891  * On failure, @td is undefined.
892  */
893 static int __open_device(struct dm_pool_metadata *pmd,
894                          dm_thin_id dev, int create,
895                          struct dm_thin_device **td)
896 {
897         int r, changed = 0;
898         struct dm_thin_device *td2;
899         uint64_t key = dev;
900         struct disk_device_details details_le;
901
902         /*
903          * If the device is already open, return it.
904          */
905         list_for_each_entry(td2, &pmd->thin_devices, list)
906                 if (td2->id == dev) {
907                         /*
908                          * May not create an already-open device.
909                          */
910                         if (create)
911                                 return -EEXIST;
912
913                         td2->open_count++;
914                         *td = td2;
915                         return 0;
916                 }
917
918         /*
919          * Check the device exists.
920          */
921         r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
922                             &key, &details_le);
923         if (r) {
924                 if (r != -ENODATA || !create)
925                         return r;
926
927                 /*
928                  * Create new device.
929                  */
930                 changed = 1;
931                 details_le.mapped_blocks = 0;
932                 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
933                 details_le.creation_time = cpu_to_le32(pmd->time);
934                 details_le.snapshotted_time = cpu_to_le32(pmd->time);
935         }
936
937         *td = kmalloc(sizeof(**td), GFP_NOIO);
938         if (!*td)
939                 return -ENOMEM;
940
941         (*td)->pmd = pmd;
942         (*td)->id = dev;
943         (*td)->open_count = 1;
944         (*td)->changed = changed;
945         (*td)->aborted_with_changes = false;
946         (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
947         (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
948         (*td)->creation_time = le32_to_cpu(details_le.creation_time);
949         (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
950
951         list_add(&(*td)->list, &pmd->thin_devices);
952
953         return 0;
954 }
955
956 static void __close_device(struct dm_thin_device *td)
957 {
958         --td->open_count;
959 }
960
961 static int __create_thin(struct dm_pool_metadata *pmd,
962                          dm_thin_id dev)
963 {
964         int r;
965         dm_block_t dev_root;
966         uint64_t key = dev;
967         struct disk_device_details details_le;
968         struct dm_thin_device *td;
969         __le64 value;
970
971         r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
972                             &key, &details_le);
973         if (!r)
974                 return -EEXIST;
975
976         /*
977          * Create an empty btree for the mappings.
978          */
979         r = dm_btree_empty(&pmd->bl_info, &dev_root);
980         if (r)
981                 return r;
982
983         /*
984          * Insert it into the main mapping tree.
985          */
986         value = cpu_to_le64(dev_root);
987         __dm_bless_for_disk(&value);
988         r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
989         if (r) {
990                 dm_btree_del(&pmd->bl_info, dev_root);
991                 return r;
992         }
993
994         r = __open_device(pmd, dev, 1, &td);
995         if (r) {
996                 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
997                 dm_btree_del(&pmd->bl_info, dev_root);
998                 return r;
999         }
1000         __close_device(td);
1001
1002         return r;
1003 }
1004
1005 int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
1006 {
1007         int r = -EINVAL;
1008
1009         down_write(&pmd->root_lock);
1010         if (!pmd->fail_io)
1011                 r = __create_thin(pmd, dev);
1012         up_write(&pmd->root_lock);
1013
1014         return r;
1015 }
1016
1017 static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1018                                   struct dm_thin_device *snap,
1019                                   dm_thin_id origin, uint32_t time)
1020 {
1021         int r;
1022         struct dm_thin_device *td;
1023
1024         r = __open_device(pmd, origin, 0, &td);
1025         if (r)
1026                 return r;
1027
1028         td->changed = 1;
1029         td->snapshotted_time = time;
1030
1031         snap->mapped_blocks = td->mapped_blocks;
1032         snap->snapshotted_time = time;
1033         __close_device(td);
1034
1035         return 0;
1036 }
1037
1038 static int __create_snap(struct dm_pool_metadata *pmd,
1039                          dm_thin_id dev, dm_thin_id origin)
1040 {
1041         int r;
1042         dm_block_t origin_root;
1043         uint64_t key = origin, dev_key = dev;
1044         struct dm_thin_device *td;
1045         struct disk_device_details details_le;
1046         __le64 value;
1047
1048         /* check this device is unused */
1049         r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1050                             &dev_key, &details_le);
1051         if (!r)
1052                 return -EEXIST;
1053
1054         /* find the mapping tree for the origin */
1055         r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1056         if (r)
1057                 return r;
1058         origin_root = le64_to_cpu(value);
1059
1060         /* clone the origin, an inc will do */
1061         dm_tm_inc(pmd->tm, origin_root);
1062
1063         /* insert into the main mapping tree */
1064         value = cpu_to_le64(origin_root);
1065         __dm_bless_for_disk(&value);
1066         key = dev;
1067         r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1068         if (r) {
1069                 dm_tm_dec(pmd->tm, origin_root);
1070                 return r;
1071         }
1072
1073         pmd->time++;
1074
1075         r = __open_device(pmd, dev, 1, &td);
1076         if (r)
1077                 goto bad;
1078
1079         r = __set_snapshot_details(pmd, td, origin, pmd->time);
1080         __close_device(td);
1081
1082         if (r)
1083                 goto bad;
1084
1085         return 0;
1086
1087 bad:
1088         dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1089         dm_btree_remove(&pmd->details_info, pmd->details_root,
1090                         &key, &pmd->details_root);
1091         return r;
1092 }
1093
1094 int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1095                                  dm_thin_id dev,
1096                                  dm_thin_id origin)
1097 {
1098         int r = -EINVAL;
1099
1100         down_write(&pmd->root_lock);
1101         if (!pmd->fail_io)
1102                 r = __create_snap(pmd, dev, origin);
1103         up_write(&pmd->root_lock);
1104
1105         return r;
1106 }
1107
1108 static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1109 {
1110         int r;
1111         uint64_t key = dev;
1112         struct dm_thin_device *td;
1113
1114         /* TODO: failure should mark the transaction invalid */
1115         r = __open_device(pmd, dev, 0, &td);
1116         if (r)
1117                 return r;
1118
1119         if (td->open_count > 1) {
1120                 __close_device(td);
1121                 return -EBUSY;
1122         }
1123
1124         list_del(&td->list);
1125         kfree(td);
1126         r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1127                             &key, &pmd->details_root);
1128         if (r)
1129                 return r;
1130
1131         r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1132         if (r)
1133                 return r;
1134
1135         return 0;
1136 }
1137
1138 int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1139                                dm_thin_id dev)
1140 {
1141         int r = -EINVAL;
1142
1143         down_write(&pmd->root_lock);
1144         if (!pmd->fail_io)
1145                 r = __delete_device(pmd, dev);
1146         up_write(&pmd->root_lock);
1147
1148         return r;
1149 }
1150
1151 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1152                                         uint64_t current_id,
1153                                         uint64_t new_id)
1154 {
1155         int r = -EINVAL;
1156
1157         down_write(&pmd->root_lock);
1158
1159         if (pmd->fail_io)
1160                 goto out;
1161
1162         if (pmd->trans_id != current_id) {
1163                 DMERR("mismatched transaction id");
1164                 goto out;
1165         }
1166
1167         pmd->trans_id = new_id;
1168         r = 0;
1169
1170 out:
1171         up_write(&pmd->root_lock);
1172
1173         return r;
1174 }
1175
1176 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1177                                         uint64_t *result)
1178 {
1179         int r = -EINVAL;
1180
1181         down_read(&pmd->root_lock);
1182         if (!pmd->fail_io) {
1183                 *result = pmd->trans_id;
1184                 r = 0;
1185         }
1186         up_read(&pmd->root_lock);
1187
1188         return r;
1189 }
1190
1191 static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1192 {
1193         int r, inc;
1194         struct thin_disk_superblock *disk_super;
1195         struct dm_block *copy, *sblock;
1196         dm_block_t held_root;
1197
1198         /*
1199          * Copy the superblock.
1200          */
1201         dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1202         r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1203                                &sb_validator, &copy, &inc);
1204         if (r)
1205                 return r;
1206
1207         BUG_ON(!inc);
1208
1209         held_root = dm_block_location(copy);
1210         disk_super = dm_block_data(copy);
1211
1212         if (le64_to_cpu(disk_super->held_root)) {
1213                 DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1214
1215                 dm_tm_dec(pmd->tm, held_root);
1216                 dm_tm_unlock(pmd->tm, copy);
1217                 return -EBUSY;
1218         }
1219
1220         /*
1221          * Wipe the spacemap since we're not publishing this.
1222          */
1223         memset(&disk_super->data_space_map_root, 0,
1224                sizeof(disk_super->data_space_map_root));
1225         memset(&disk_super->metadata_space_map_root, 0,
1226                sizeof(disk_super->metadata_space_map_root));
1227
1228         /*
1229          * Increment the data structures that need to be preserved.
1230          */
1231         dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1232         dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1233         dm_tm_unlock(pmd->tm, copy);
1234
1235         /*
1236          * Write the held root into the superblock.
1237          */
1238         r = superblock_lock(pmd, &sblock);
1239         if (r) {
1240                 dm_tm_dec(pmd->tm, held_root);
1241                 return r;
1242         }
1243
1244         disk_super = dm_block_data(sblock);
1245         disk_super->held_root = cpu_to_le64(held_root);
1246         dm_bm_unlock(sblock);
1247         return 0;
1248 }
1249
1250 int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1251 {
1252         int r = -EINVAL;
1253
1254         down_write(&pmd->root_lock);
1255         if (!pmd->fail_io)
1256                 r = __reserve_metadata_snap(pmd);
1257         up_write(&pmd->root_lock);
1258
1259         return r;
1260 }
1261
1262 static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1263 {
1264         int r;
1265         struct thin_disk_superblock *disk_super;
1266         struct dm_block *sblock, *copy;
1267         dm_block_t held_root;
1268
1269         r = superblock_lock(pmd, &sblock);
1270         if (r)
1271                 return r;
1272
1273         disk_super = dm_block_data(sblock);
1274         held_root = le64_to_cpu(disk_super->held_root);
1275         disk_super->held_root = cpu_to_le64(0);
1276
1277         dm_bm_unlock(sblock);
1278
1279         if (!held_root) {
1280                 DMWARN("No pool metadata snapshot found: nothing to release.");
1281                 return -EINVAL;
1282         }
1283
1284         r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1285         if (r)
1286                 return r;
1287
1288         disk_super = dm_block_data(copy);
1289         dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
1290         dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
1291         dm_sm_dec_block(pmd->metadata_sm, held_root);
1292
1293         return dm_tm_unlock(pmd->tm, copy);
1294 }
1295
1296 int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1297 {
1298         int r = -EINVAL;
1299
1300         down_write(&pmd->root_lock);
1301         if (!pmd->fail_io)
1302                 r = __release_metadata_snap(pmd);
1303         up_write(&pmd->root_lock);
1304
1305         return r;
1306 }
1307
1308 static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1309                                dm_block_t *result)
1310 {
1311         int r;
1312         struct thin_disk_superblock *disk_super;
1313         struct dm_block *sblock;
1314
1315         r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1316                             &sb_validator, &sblock);
1317         if (r)
1318                 return r;
1319
1320         disk_super = dm_block_data(sblock);
1321         *result = le64_to_cpu(disk_super->held_root);
1322
1323         return dm_bm_unlock(sblock);
1324 }
1325
1326 int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1327                               dm_block_t *result)
1328 {
1329         int r = -EINVAL;
1330
1331         down_read(&pmd->root_lock);
1332         if (!pmd->fail_io)
1333                 r = __get_metadata_snap(pmd, result);
1334         up_read(&pmd->root_lock);
1335
1336         return r;
1337 }
1338
1339 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1340                              struct dm_thin_device **td)
1341 {
1342         int r = -EINVAL;
1343
1344         down_write(&pmd->root_lock);
1345         if (!pmd->fail_io)
1346                 r = __open_device(pmd, dev, 0, td);
1347         up_write(&pmd->root_lock);
1348
1349         return r;
1350 }
1351
1352 int dm_pool_close_thin_device(struct dm_thin_device *td)
1353 {
1354         down_write(&td->pmd->root_lock);
1355         __close_device(td);
1356         up_write(&td->pmd->root_lock);
1357
1358         return 0;
1359 }
1360
1361 dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1362 {
1363         return td->id;
1364 }
1365
1366 /*
1367  * Check whether @time (of block creation) is older than @td's last snapshot.
1368  * If so then the associated block is shared with the last snapshot device.
1369  * Any block on a device created *after* the device last got snapshotted is
1370  * necessarily not shared.
1371  */
1372 static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1373 {
1374         return td->snapshotted_time > time;
1375 }
1376
1377 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1378                        int can_block, struct dm_thin_lookup_result *result)
1379 {
1380         int r = -EINVAL;
1381         uint64_t block_time = 0;
1382         __le64 value;
1383         struct dm_pool_metadata *pmd = td->pmd;
1384         dm_block_t keys[2] = { td->id, block };
1385         struct dm_btree_info *info;
1386
1387         if (can_block) {
1388                 down_read(&pmd->root_lock);
1389                 info = &pmd->info;
1390         } else if (down_read_trylock(&pmd->root_lock))
1391                 info = &pmd->nb_info;
1392         else
1393                 return -EWOULDBLOCK;
1394
1395         if (pmd->fail_io)
1396                 goto out;
1397
1398         r = dm_btree_lookup(info, pmd->root, keys, &value);
1399         if (!r)
1400                 block_time = le64_to_cpu(value);
1401
1402 out:
1403         up_read(&pmd->root_lock);
1404
1405         if (!r) {
1406                 dm_block_t exception_block;
1407                 uint32_t exception_time;
1408                 unpack_block_time(block_time, &exception_block,
1409                                   &exception_time);
1410                 result->block = exception_block;
1411                 result->shared = __snapshotted_since(td, exception_time);
1412         }
1413
1414         return r;
1415 }
1416
1417 static int __insert(struct dm_thin_device *td, dm_block_t block,
1418                     dm_block_t data_block)
1419 {
1420         int r, inserted;
1421         __le64 value;
1422         struct dm_pool_metadata *pmd = td->pmd;
1423         dm_block_t keys[2] = { td->id, block };
1424
1425         value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1426         __dm_bless_for_disk(&value);
1427
1428         r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1429                                    &pmd->root, &inserted);
1430         if (r)
1431                 return r;
1432
1433         td->changed = 1;
1434         if (inserted)
1435                 td->mapped_blocks++;
1436
1437         return 0;
1438 }
1439
1440 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1441                          dm_block_t data_block)
1442 {
1443         int r = -EINVAL;
1444
1445         down_write(&td->pmd->root_lock);
1446         if (!td->pmd->fail_io)
1447                 r = __insert(td, block, data_block);
1448         up_write(&td->pmd->root_lock);
1449
1450         return r;
1451 }
1452
1453 static int __remove(struct dm_thin_device *td, dm_block_t block)
1454 {
1455         int r;
1456         struct dm_pool_metadata *pmd = td->pmd;
1457         dm_block_t keys[2] = { td->id, block };
1458
1459         r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1460         if (r)
1461                 return r;
1462
1463         td->mapped_blocks--;
1464         td->changed = 1;
1465
1466         return 0;
1467 }
1468
1469 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1470 {
1471         int r = -EINVAL;
1472
1473         down_write(&td->pmd->root_lock);
1474         if (!td->pmd->fail_io)
1475                 r = __remove(td, block);
1476         up_write(&td->pmd->root_lock);
1477
1478         return r;
1479 }
1480
1481 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1482 {
1483         int r;
1484         uint32_t ref_count;
1485
1486         down_read(&pmd->root_lock);
1487         r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1488         if (!r)
1489                 *result = (ref_count != 0);
1490         up_read(&pmd->root_lock);
1491
1492         return r;
1493 }
1494
1495 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1496 {
1497         int r;
1498
1499         down_read(&td->pmd->root_lock);
1500         r = td->changed;
1501         up_read(&td->pmd->root_lock);
1502
1503         return r;
1504 }
1505
1506 bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1507 {
1508         bool r = false;
1509         struct dm_thin_device *td, *tmp;
1510
1511         down_read(&pmd->root_lock);
1512         list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1513                 if (td->changed) {
1514                         r = td->changed;
1515                         break;
1516                 }
1517         }
1518         up_read(&pmd->root_lock);
1519
1520         return r;
1521 }
1522
1523 bool dm_thin_aborted_changes(struct dm_thin_device *td)
1524 {
1525         bool r;
1526
1527         down_read(&td->pmd->root_lock);
1528         r = td->aborted_with_changes;
1529         up_read(&td->pmd->root_lock);
1530
1531         return r;
1532 }
1533
1534 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1535 {
1536         int r = -EINVAL;
1537
1538         down_write(&pmd->root_lock);
1539         if (!pmd->fail_io)
1540                 r = dm_sm_new_block(pmd->data_sm, result);
1541         up_write(&pmd->root_lock);
1542
1543         return r;
1544 }
1545
1546 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1547 {
1548         int r = -EINVAL;
1549
1550         down_write(&pmd->root_lock);
1551         if (pmd->fail_io)
1552                 goto out;
1553
1554         r = __commit_transaction(pmd);
1555         if (r <= 0)
1556                 goto out;
1557
1558         /*
1559          * Open the next transaction.
1560          */
1561         r = __begin_transaction(pmd);
1562 out:
1563         up_write(&pmd->root_lock);
1564         return r;
1565 }
1566
1567 static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1568 {
1569         struct dm_thin_device *td;
1570
1571         list_for_each_entry(td, &pmd->thin_devices, list)
1572                 td->aborted_with_changes = td->changed;
1573 }
1574
1575 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1576 {
1577         int r = -EINVAL;
1578
1579         down_write(&pmd->root_lock);
1580         if (pmd->fail_io)
1581                 goto out;
1582
1583         __set_abort_with_changes_flags(pmd);
1584         __destroy_persistent_data_objects(pmd);
1585         r = __create_persistent_data_objects(pmd, false);
1586         if (r)
1587                 pmd->fail_io = true;
1588
1589 out:
1590         up_write(&pmd->root_lock);
1591
1592         return r;
1593 }
1594
1595 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1596 {
1597         int r = -EINVAL;
1598
1599         down_read(&pmd->root_lock);
1600         if (!pmd->fail_io)
1601                 r = dm_sm_get_nr_free(pmd->data_sm, result);
1602         up_read(&pmd->root_lock);
1603
1604         return r;
1605 }
1606
1607 int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1608                                           dm_block_t *result)
1609 {
1610         int r = -EINVAL;
1611
1612         down_read(&pmd->root_lock);
1613         if (!pmd->fail_io)
1614                 r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1615         up_read(&pmd->root_lock);
1616
1617         return r;
1618 }
1619
1620 int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1621                                   dm_block_t *result)
1622 {
1623         int r = -EINVAL;
1624
1625         down_read(&pmd->root_lock);
1626         if (!pmd->fail_io)
1627                 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1628         up_read(&pmd->root_lock);
1629
1630         return r;
1631 }
1632
1633 int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1634 {
1635         down_read(&pmd->root_lock);
1636         *result = pmd->data_block_size;
1637         up_read(&pmd->root_lock);
1638
1639         return 0;
1640 }
1641
1642 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1643 {
1644         int r = -EINVAL;
1645
1646         down_read(&pmd->root_lock);
1647         if (!pmd->fail_io)
1648                 r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1649         up_read(&pmd->root_lock);
1650
1651         return r;
1652 }
1653
1654 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1655 {
1656         int r = -EINVAL;
1657         struct dm_pool_metadata *pmd = td->pmd;
1658
1659         down_read(&pmd->root_lock);
1660         if (!pmd->fail_io) {
1661                 *result = td->mapped_blocks;
1662                 r = 0;
1663         }
1664         up_read(&pmd->root_lock);
1665
1666         return r;
1667 }
1668
1669 static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1670 {
1671         int r;
1672         __le64 value_le;
1673         dm_block_t thin_root;
1674         struct dm_pool_metadata *pmd = td->pmd;
1675
1676         r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1677         if (r)
1678                 return r;
1679
1680         thin_root = le64_to_cpu(value_le);
1681
1682         return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1683 }
1684
1685 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1686                                      dm_block_t *result)
1687 {
1688         int r = -EINVAL;
1689         struct dm_pool_metadata *pmd = td->pmd;
1690
1691         down_read(&pmd->root_lock);
1692         if (!pmd->fail_io)
1693                 r = __highest_block(td, result);
1694         up_read(&pmd->root_lock);
1695
1696         return r;
1697 }
1698
1699 static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
1700 {
1701         int r;
1702         dm_block_t old_count;
1703
1704         r = dm_sm_get_nr_blocks(sm, &old_count);
1705         if (r)
1706                 return r;
1707
1708         if (new_count == old_count)
1709                 return 0;
1710
1711         if (new_count < old_count) {
1712                 DMERR("cannot reduce size of space map");
1713                 return -EINVAL;
1714         }
1715
1716         return dm_sm_extend(sm, new_count - old_count);
1717 }
1718
1719 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1720 {
1721         int r = -EINVAL;
1722
1723         down_write(&pmd->root_lock);
1724         if (!pmd->fail_io)
1725                 r = __resize_space_map(pmd->data_sm, new_count);
1726         up_write(&pmd->root_lock);
1727
1728         return r;
1729 }
1730
1731 int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1732 {
1733         int r = -EINVAL;
1734
1735         down_write(&pmd->root_lock);
1736         if (!pmd->fail_io)
1737                 r = __resize_space_map(pmd->metadata_sm, new_count);
1738         up_write(&pmd->root_lock);
1739
1740         return r;
1741 }
1742
1743 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1744 {
1745         down_write(&pmd->root_lock);
1746         pmd->read_only = true;
1747         dm_bm_set_read_only(pmd->bm);
1748         up_write(&pmd->root_lock);
1749 }
1750
1751 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
1752 {
1753         down_write(&pmd->root_lock);
1754         pmd->read_only = false;
1755         dm_bm_set_read_write(pmd->bm);
1756         up_write(&pmd->root_lock);
1757 }
1758
1759 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1760                                         dm_block_t threshold,
1761                                         dm_sm_threshold_fn fn,
1762                                         void *context)
1763 {
1764         int r;
1765
1766         down_write(&pmd->root_lock);
1767         r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
1768         up_write(&pmd->root_lock);
1769
1770         return r;
1771 }
1772
1773 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1774 {
1775         int r;
1776         struct dm_block *sblock;
1777         struct thin_disk_superblock *disk_super;
1778
1779         down_write(&pmd->root_lock);
1780         pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1781
1782         r = superblock_lock(pmd, &sblock);
1783         if (r) {
1784                 DMERR("couldn't read superblock");
1785                 goto out;
1786         }
1787
1788         disk_super = dm_block_data(sblock);
1789         disk_super->flags = cpu_to_le32(pmd->flags);
1790
1791         dm_bm_unlock(sblock);
1792 out:
1793         up_write(&pmd->root_lock);
1794         return r;
1795 }
1796
1797 bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1798 {
1799         bool needs_check;
1800
1801         down_read(&pmd->root_lock);
1802         needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1803         up_read(&pmd->root_lock);
1804
1805         return needs_check;
1806 }