Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs...
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Jan 2011 22:43:43 +0000 (14:43 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 17 Jan 2011 22:43:43 +0000 (14:43 -0800)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (25 commits)
  Btrfs: forced readonly mounts on errors
  btrfs: Require CAP_SYS_ADMIN for filesystem rebalance
  Btrfs: don't warn if we get ENOSPC in btrfs_block_rsv_check
  btrfs: Fix memory leak in btrfs_read_fs_root_no_radix()
  btrfs: check NULL or not
  btrfs: Don't pass NULL ptr to func that may deref it.
  btrfs: mount failure return value fix
  btrfs: Mem leak in btrfs_get_acl()
  btrfs: fix wrong free space information of btrfs
  btrfs: make the chunk allocator utilize the devices better
  btrfs: restructure find_free_dev_extent()
  btrfs: fix wrong calculation of stripe size
  btrfs: try to reclaim some space when chunk allocation fails
  btrfs: fix wrong data space statistics
  fs/btrfs: Fix build of ctree
  Btrfs: fix off by one while setting block groups readonly
  Btrfs: Add BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS ioctls
  Btrfs: Add readonly snapshots support
  Btrfs: Refactor btrfs_ioctl_snap_create()
  btrfs: Extract duplicate decompress code
  ...

29 files changed:
fs/btrfs/Kconfig
fs/btrfs/Makefile
fs/btrfs/acl.c
fs/btrfs/btrfs_inode.h
fs/btrfs/compression.c
fs/btrfs/compression.h
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/lzo.c [new file with mode: 0644]
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/btrfs/zlib.c

index 7bb3c02..ecb9fd3 100644 (file)
@@ -4,6 +4,8 @@ config BTRFS_FS
        select LIBCRC32C
        select ZLIB_INFLATE
        select ZLIB_DEFLATE
+       select LZO_COMPRESS
+       select LZO_DECOMPRESS
        help
          Btrfs is a new filesystem with extents, writable snapshotting,
          support for multiple devices and many more features.
index a35eb36..31610ea 100644 (file)
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           transaction.o inode.o file.o tree-defrag.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-          export.o tree-log.o acl.o free-space-cache.o zlib.o \
+          export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o
index 6ae2c8c..15b5ca2 100644 (file)
@@ -60,8 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
                size = __btrfs_getxattr(inode, name, value, size);
                if (size > 0) {
                        acl = posix_acl_from_xattr(value, size);
-                       if (IS_ERR(acl))
+                       if (IS_ERR(acl)) {
+                               kfree(value);
                                return acl;
+                       }
                        set_cached_acl(inode, type, acl);
                }
                kfree(value);
index 6ad63f1..ccc991c 100644 (file)
@@ -157,7 +157,7 @@ struct btrfs_inode {
        /*
         * always compress this one file
         */
-       unsigned force_compress:1;
+       unsigned force_compress:4;
 
        struct inode vfs_inode;
 };
index b50bc4b..f745287 100644 (file)
@@ -62,6 +62,9 @@ struct compressed_bio {
        /* number of bytes on disk */
        unsigned long compressed_len;
 
+       /* the compression algorithm for this bio */
+       int compress_type;
+
        /* number of compressed pages in the array */
        unsigned long nr_pages;
 
@@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
        /* ok, we're the last bio for this extent, let's start
         * the decompression.
         */
-       ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
-                                       cb->start,
-                                       cb->orig_bio->bi_io_vec,
-                                       cb->orig_bio->bi_vcnt,
-                                       cb->compressed_len);
+       ret = btrfs_decompress_biovec(cb->compress_type,
+                                     cb->compressed_pages,
+                                     cb->start,
+                                     cb->orig_bio->bi_io_vec,
+                                     cb->orig_bio->bi_vcnt,
+                                     cb->compressed_len);
 csum_failed:
        if (ret)
                cb->errors = 1;
@@ -588,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
        cb->len = uncompressed_len;
        cb->compressed_len = compressed_len;
+       cb->compress_type = extent_compress_type(bio_flags);
        cb->orig_bio = bio;
 
        nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -677,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        bio_put(comp_bio);
        return 0;
 }
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+       &btrfs_zlib_compress,
+       &btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               INIT_LIST_HEAD(&comp_idle_workspace[i]);
+               spin_lock_init(&comp_workspace_lock[i]);
+               atomic_set(&comp_alloc_workspace[i], 0);
+               init_waitqueue_head(&comp_workspace_wait[i]);
+       }
+       return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+       struct list_head *workspace;
+       int cpus = num_online_cpus();
+       int idx = type - 1;
+
+       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+       int *num_workspace                      = &comp_num_workspace[idx];
+again:
+       spin_lock(workspace_lock);
+       if (!list_empty(idle_workspace)) {
+               workspace = idle_workspace->next;
+               list_del(workspace);
+               (*num_workspace)--;
+               spin_unlock(workspace_lock);
+               return workspace;
+
+       }
+       if (atomic_read(alloc_workspace) > cpus) {
+               DEFINE_WAIT(wait);
+
+               spin_unlock(workspace_lock);
+               prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+                       schedule();
+               finish_wait(workspace_wait, &wait);
+               goto again;
+       }
+       atomic_inc(alloc_workspace);
+       spin_unlock(workspace_lock);
+
+       workspace = btrfs_compress_op[idx]->alloc_workspace();
+       if (IS_ERR(workspace)) {
+               atomic_dec(alloc_workspace);
+               wake_up(workspace_wait);
+       }
+       return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+       int idx = type - 1;
+       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+       int *num_workspace                      = &comp_num_workspace[idx];
+
+       spin_lock(workspace_lock);
+       if (*num_workspace < num_online_cpus()) {
+               list_add_tail(workspace, idle_workspace);
+               (*num_workspace)++;
+               spin_unlock(workspace_lock);
+               goto wake;
+       }
+       spin_unlock(workspace_lock);
+
+       btrfs_compress_op[idx]->free_workspace(workspace);
+       atomic_dec(alloc_workspace);
+wake:
+       if (waitqueue_active(workspace_wait))
+               wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+       struct list_head *workspace;
+       int i;
+
+       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               while (!list_empty(&comp_idle_workspace[i])) {
+                       workspace = comp_idle_workspace[i].next;
+                       list_del(workspace);
+                       btrfs_compress_op[i]->free_workspace(workspace);
+                       atomic_dec(&comp_alloc_workspace[i]);
+               }
+       }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                        u64 start, unsigned long len,
+                        struct page **pages,
+                        unsigned long nr_dest_pages,
+                        unsigned long *out_pages,
+                        unsigned long *total_in,
+                        unsigned long *total_out,
+                        unsigned long max_out)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -1;
+
+       ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+                                                     start, len, pages,
+                                                     nr_dest_pages, out_pages,
+                                                     total_in, total_out,
+                                                     max_out);
+       free_workspace(type, workspace);
+       return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                           struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+                                                        disk_start,
+                                                        bvec, vcnt, srclen);
+       free_workspace(type, workspace);
+       return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                    unsigned long start_byte, size_t srclen, size_t destlen)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+                                                 dest_page, start_byte,
+                                                 srclen, destlen);
+
+       free_workspace(type, workspace);
+       return ret;
+}
+
+void __exit btrfs_exit_compress(void)
+{
+       free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset of the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                             unsigned long total_out, u64 disk_start,
+                             struct bio_vec *bvec, int vcnt,
+                             unsigned long *page_index,
+                             unsigned long *pg_offset)
+{
+       unsigned long buf_offset;
+       unsigned long current_buf_start;
+       unsigned long start_byte;
+       unsigned long working_bytes = total_out - buf_start;
+       unsigned long bytes;
+       char *kaddr;
+       struct page *page_out = bvec[*page_index].bv_page;
+
+       /*
+        * start byte is the first byte of the page we're currently
+        * copying into relative to the start of the compressed data.
+        */
+       start_byte = page_offset(page_out) - disk_start;
+
+       /* we haven't yet hit data corresponding to this page */
+       if (total_out <= start_byte)
+               return 1;
+
+       /*
+        * the start of the data we care about is offset into
+        * the middle of our working buffer
+        */
+       if (total_out > start_byte && buf_start < start_byte) {
+               buf_offset = start_byte - buf_start;
+               working_bytes -= buf_offset;
+       } else {
+               buf_offset = 0;
+       }
+       current_buf_start = buf_start;
+
+       /* copy bytes from the working buffer into the pages */
+       while (working_bytes > 0) {
+               bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+                           PAGE_CACHE_SIZE - buf_offset);
+               bytes = min(bytes, working_bytes);
+               kaddr = kmap_atomic(page_out, KM_USER0);
+               memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+               kunmap_atomic(kaddr, KM_USER0);
+               flush_dcache_page(page_out);
+
+               *pg_offset += bytes;
+               buf_offset += bytes;
+               working_bytes -= bytes;
+               current_buf_start += bytes;
+
+               /* check if we need to pick another page */
+               if (*pg_offset == PAGE_CACHE_SIZE) {
+                       (*page_index)++;
+                       if (*page_index >= vcnt)
+                               return 0;
+
+                       page_out = bvec[*page_index].bv_page;
+                       *pg_offset = 0;
+                       start_byte = page_offset(page_out) - disk_start;
+
+                       /*
+                        * make sure our new page is covered by this
+                        * working buffer
+                        */
+                       if (total_out <= start_byte)
+                               return 1;
+
+                       /*
+                        * the next page in the biovec might not be adjacent
+                        * to the last page, but it might still be found
+                        * inside this working buffer. bump our offset pointer
+                        */
+                       if (total_out > start_byte &&
+                           current_buf_start < start_byte) {
+                               buf_offset = start_byte - buf_start;
+                               working_bytes = total_out - start_byte;
+                               current_buf_start = buf_start + buf_offset;
+                       }
+               }
+       }
+
+       return 1;
+}
index 421f5b4..5100017 100644 (file)
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_zlib_decompress(unsigned char *data_in,
-                         struct page *dest_page,
-                         unsigned long start_byte,
-                         size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                             u64 start, unsigned long len,
-                             struct page **pages,
-                             unsigned long nr_dest_pages,
-                             unsigned long *out_pages,
-                             unsigned long *total_in,
-                             unsigned long *total_out,
-                             unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                             u64 disk_start,
-                             struct bio_vec *bvec,
-                             int vcnt,
-                             size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                        u64 start, unsigned long len,
+                        struct page **pages,
+                        unsigned long nr_dest_pages,
+                        unsigned long *out_pages,
+                        unsigned long *total_in,
+                        unsigned long *total_out,
+                        unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                           struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                    unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                             unsigned long total_out, u64 disk_start,
+                             struct bio_vec *bvec, int vcnt,
+                             unsigned long *page_index,
+                             unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long len, u64 disk_start,
                                  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+       struct list_head *(*alloc_workspace)(void);
+
+       void (*free_workspace)(struct list_head *workspace);
+
+       int (*compress_pages)(struct list_head *workspace,
+                             struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out);
+
+       int (*decompress_biovec)(struct list_head *workspace,
+                                struct page **pages_in,
+                                u64 disk_start,
+                                struct bio_vec *bvec,
+                                int vcnt,
+                                size_t srclen);
+
+       int (*decompress)(struct list_head *workspace,
+                         unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
index 9ac1715..b5baff0 100644 (file)
@@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
+       if (!p)
+               return;
        btrfs_release_path(NULL, p);
        kmem_cache_free(btrfs_path_cachep, p);
 }
@@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);
 
        right = read_node_slot(root, upper, slot + 1);
+       if (right == NULL)
+               return 1;
+
        btrfs_tree_lock(right);
        btrfs_set_lock_blocking(right);
 
@@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);
 
        left = read_node_slot(root, path->nodes[1], slot - 1);
+       if (left == NULL)
+               return 1;
+
        btrfs_tree_lock(left);
        btrfs_set_lock_blocking(left);
 
index b875d44..2c98b3a 100644 (file)
@@ -295,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 #define BTRFS_FSID_SIZE 16
 #define BTRFS_HEADER_FLAG_WRITTEN      (1ULL << 0)
 #define BTRFS_HEADER_FLAG_RELOC                (1ULL << 1)
+
+/*
+ * File system states
+ */
+
+/* Errors detected */
+#define BTRFS_SUPER_FLAG_ERROR         (1ULL << 2)
+
 #define BTRFS_SUPER_FLAG_SEEDING       (1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP      (1ULL << 33)
 
@@ -399,13 +407,15 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF   (1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS    (1ULL << 2)
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO    (1ULL << 3)
 
 #define BTRFS_FEATURE_COMPAT_SUPP              0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP           0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP                    \
        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |         \
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
-        BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+        BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |          \
+        BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -552,9 +562,11 @@ struct btrfs_timespec {
 } __attribute__ ((__packed__));
 
 enum btrfs_compression_type {
-       BTRFS_COMPRESS_NONE = 0,
-       BTRFS_COMPRESS_ZLIB = 1,
-       BTRFS_COMPRESS_LAST = 2,
+       BTRFS_COMPRESS_NONE  = 0,
+       BTRFS_COMPRESS_ZLIB  = 1,
+       BTRFS_COMPRESS_LZO   = 2,
+       BTRFS_COMPRESS_TYPES = 2,
+       BTRFS_COMPRESS_LAST  = 3,
 };
 
 struct btrfs_inode_item {
@@ -598,6 +610,8 @@ struct btrfs_dir_item {
        u8 type;
 } __attribute__ ((__packed__));
 
+#define BTRFS_ROOT_SUBVOL_RDONLY       (1ULL << 0)
+
 struct btrfs_root_item {
        struct btrfs_inode_item inode;
        __le64 generation;
@@ -896,7 +910,8 @@ struct btrfs_fs_info {
         */
        u64 last_trans_log_full_commit;
        u64 open_ioctl_trans;
-       unsigned long mount_opt;
+       unsigned long mount_opt:20;
+       unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
        struct btrfs_transaction *running_transaction;
@@ -1051,6 +1066,9 @@ struct btrfs_fs_info {
        unsigned metadata_ratio;
 
        void *bdev_holder;
+
+       /* filesystem state */
+       u64 fs_state;
 };
 
 /*
@@ -1894,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
                         last_snapshot, 64);
 
+static inline bool btrfs_root_readonly(struct btrfs_root *root)
+{
+       return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+}
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2146,6 +2169,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
@@ -2189,6 +2213,12 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
+int btrfs_error_unpin_extent_range(struct btrfs_root *root,
+                                  u64 start, u64 end);
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+                              u64 num_bytes);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2542,6 +2572,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+                    unsigned int line, int errno);
+
+#define btrfs_std_error(fs_info, errno)                                \
+do {                                                           \
+       if ((errno))                                            \
+               __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+} while (0)
 
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
index 51d2e4d..b531c36 100644 (file)
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                                   int read_only);
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                     struct btrfs_root *root);
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                       struct extent_io_tree *dirty_pages,
+                                       int mark);
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                      struct extent_io_tree *pinned_extents);
+static int btrfs_cleanup_transaction(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -353,6 +367,10 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        WARN_ON(len == 0);
 
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+       if (eb == NULL) {
+               WARN_ON(1);
+               goto out;
+       }
        ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
                                             btrfs_header_generation(eb));
        BUG_ON(ret);
@@ -427,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
        WARN_ON(len == 0);
 
        eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+       if (eb == NULL) {
+               ret = -EIO;
+               goto out;
+       }
 
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
@@ -1145,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
        }
        btrfs_free_path(path);
        if (ret) {
+               kfree(root);
                if (ret > 0)
                        ret = -ENOENT;
                return ERR_PTR(ret);
@@ -1713,8 +1736,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
        bh = btrfs_read_dev_super(fs_devices->latest_bdev);
-       if (!bh)
+       if (!bh) {
+               err = -EINVAL;
                goto fail_iput;
+       }
 
        memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
        memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1727,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        if (!btrfs_super_root(disk_super))
                goto fail_iput;
 
+       /* check FS state, whether FS is broken. */
+       fs_info->fs_state |= btrfs_super_flags(disk_super);
+
+       btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+
        ret = btrfs_parse_options(tree_root, options);
        if (ret) {
                err = ret;
@@ -1744,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        }
 
        features = btrfs_super_incompat_flags(disk_super);
-       if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
-               features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-               btrfs_set_super_incompat_flags(disk_super, features);
-       }
+       features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+       if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+               features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+       btrfs_set_super_incompat_flags(disk_super, features);
 
        features = btrfs_super_compat_ro_flags(disk_super) &
                ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1957,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
 
-       if (btrfs_super_log_root(disk_super) != 0) {
+       /* do not make disk changes in broken FS */
+       if (btrfs_super_log_root(disk_super) != 0 &&
+           !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                u64 bytenr = btrfs_super_log_root(disk_super);
 
                if (fs_devices->rw_devices == 0) {
@@ -2442,8 +2474,28 @@ int close_ctree(struct btrfs_root *root)
        smp_mb();
 
        btrfs_put_block_group_cache(fs_info);
+
+       /*
+        * There are 2 situations in which a broken btrfs flips readonly:
+        *
+        * 1. when btrfs flips readonly somewhere else before
+        * btrfs_commit_super, sb->s_flags has the MS_RDONLY flag set,
+        * and btrfs will skip writing the sb directly in order to keep
+        * the ERROR state on disk.
+        *
+        * 2. when btrfs flips readonly inside btrfs_commit_super itself;
+        * in that case btrfs cannot write the sb via btrfs_commit_super,
+        * and since fs_state has the BTRFS_SUPER_FLAG_ERROR flag set,
+        * btrfs will clean up all FS resources first and then write the sb.
+        */
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
-               ret =  btrfs_commit_super(root);
+               ret = btrfs_commit_super(root);
+               if (ret)
+                       printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+       }
+
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               ret = btrfs_error_commit_super(root);
                if (ret)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
@@ -2619,6 +2671,352 @@ out:
        return 0;
 }
 
+/*
+ * Print a warning when a filesystem whose fs_state carries
+ * BTRFS_SUPER_FLAG_ERROR is used without @read_only being set.
+ * Nothing is printed for the read-only case.
+ */
+static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+                             int read_only)
+{
+       int has_error = !!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR);
+
+       if (!read_only && has_error)
+               printk(KERN_WARNING "warning: mount fs with errors, "
+                      "running btrfsck is recommended\n");
+}
+
+/*
+ * Error-state variant of the super commit: run the delayed iputs, clean up
+ * all transactions via btrfs_cleanup_transaction() and then write the super
+ * block directly.
+ *
+ * Returns the result of write_ctree_super().
+ */
+int btrfs_error_commit_super(struct btrfs_root *root)
+{
+       int ret;
+
+       /* run the delayed iputs while holding the cleaner mutex */
+       mutex_lock(&root->fs_info->cleaner_mutex);
+       btrfs_run_delayed_iputs(root);
+       mutex_unlock(&root->fs_info->cleaner_mutex);
+
+       /*
+        * The semaphore is taken for write and dropped right away;
+        * presumably this only waits for current holders of
+        * cleanup_work_sem to finish - TODO confirm.
+        */
+       down_write(&root->fs_info->cleanup_work_sem);
+       up_write(&root->fs_info->cleanup_work_sem);
+
+       /* cleanup FS via transaction */
+       btrfs_cleanup_transaction(root);
+
+       /* no transaction handle is passed (NULL) and max_mirrors is 0 */
+       ret = write_ctree_super(NULL, root, 0);
+
+       return ret;
+}
+
+/*
+ * Empty fs_info->ordered_operations: every inode queued there is unlinked
+ * from the list and btrfs_invalidate_inodes() is called on its root.
+ *
+ * Always returns 0.
+ */
+static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+{
+       struct btrfs_inode *btrfs_inode;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       mutex_lock(&root->fs_info->ordered_operations_mutex);
+       spin_lock(&root->fs_info->ordered_extent_lock);
+
+       /* move all entries to a private list, leaving the shared one empty */
+       list_splice_init(&root->fs_info->ordered_operations, &splice);
+       while (!list_empty(&splice)) {
+               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                        ordered_operations);
+
+               list_del_init(&btrfs_inode->ordered_operations);
+
+               /*
+                * NOTE(review): this is called with ordered_extent_lock (a
+                * spinlock) held - confirm btrfs_invalidate_inodes() never
+                * sleeps.
+                */
+               btrfs_invalidate_inodes(btrfs_inode->root);
+       }
+
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+       mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+       return 0;
+}
+
+/*
+ * Empty fs_info->ordered_extents: every ordered extent queued there is
+ * unlinked from the list and handed to btrfs_put_ordered_extent() with its
+ * reference count forced to one.
+ *
+ * Always returns 0.
+ */
+static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+       struct list_head splice;
+       struct btrfs_ordered_extent *ordered;
+       struct inode *inode;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&root->fs_info->ordered_extent_lock);
+
+       /* move all entries to a private list, leaving the shared one empty */
+       list_splice_init(&root->fs_info->ordered_extents, &splice);
+       while (!list_empty(&splice)) {
+               ordered = list_entry(splice.next, struct btrfs_ordered_extent,
+                                    root_extent_list);
+
+               list_del_init(&ordered->root_extent_list);
+               atomic_inc(&ordered->refs);
+
+               /* the inode may be getting freed (in sys_unlink path). */
+               inode = igrab(ordered->inode);
+
+               /* the lock is dropped for the iput and the put below */
+               spin_unlock(&root->fs_info->ordered_extent_lock);
+               if (inode)
+                       iput(inode);
+
+               /*
+                * The count is overwritten with 1 regardless of its previous
+                * value; presumably so that the put below releases the last
+                * reference - TODO confirm.
+                */
+               atomic_set(&ordered->refs, 1);
+               btrfs_put_ordered_extent(ordered);
+
+               spin_lock(&root->fs_info->ordered_extent_lock);
+       }
+
+       spin_unlock(&root->fs_info->ordered_extent_lock);
+
+       return 0;
+}
+
+/*
+ * Drop every delayed ref still queued on @trans without running it.
+ *
+ * Each node is erased from the rb-tree, its reference count is forced to
+ * one and btrfs_put_delayed_ref() is called on it.  For head nodes the
+ * extent_op is freed and the head counters are adjusted.
+ * delayed_refs->lock is released on every return path.
+ *
+ * Always returns 0.
+ */
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+                                     struct btrfs_root *root)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       int ret = 0;
+
+       delayed_refs = &trans->delayed_refs;
+
+       spin_lock(&delayed_refs->lock);
+       if (delayed_refs->num_entries == 0) {
+               /* nothing queued: drop the lock before bailing out */
+               spin_unlock(&delayed_refs->lock);
+               printk(KERN_INFO "delayed_refs has NO entry\n");
+               return ret;
+       }
+
+       node = rb_first(&delayed_refs->root);
+       while (node) {
+               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+               /* fetch the successor before the current node is erased */
+               node = rb_next(node);
+
+               ref->in_tree = 0;
+               rb_erase(&ref->rb_node, &delayed_refs->root);
+               delayed_refs->num_entries--;
+
+               atomic_set(&ref->refs, 1);
+               if (btrfs_delayed_ref_is_head(ref)) {
+                       struct btrfs_delayed_ref_head *head;
+
+                       head = btrfs_delayed_node_to_head(ref);
+                       /*
+                        * NOTE(review): head->mutex is taken while
+                        * delayed_refs->lock (a spinlock) is held - confirm
+                        * this cannot block here.
+                        */
+                       mutex_lock(&head->mutex);
+                       kfree(head->extent_op);
+                       delayed_refs->num_heads--;
+                       if (list_empty(&head->cluster))
+                               delayed_refs->num_heads_ready--;
+                       list_del_init(&head->cluster);
+                       mutex_unlock(&head->mutex);
+               }
+
+               /* the put and the reschedule point run without the lock */
+               spin_unlock(&delayed_refs->lock);
+               btrfs_put_delayed_ref(ref);
+
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+
+       spin_unlock(&delayed_refs->lock);
+
+       return ret;
+}
+
+/*
+ * Free every pending snapshot queued on transaction @t.  The entries are
+ * first moved to a private list, then unlinked and kfree()d one by one.
+ *
+ * Always returns 0.
+ */
+static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+{
+       struct btrfs_pending_snapshot *pending;
+       struct list_head doomed;
+
+       INIT_LIST_HEAD(&doomed);
+       list_splice_init(&t->pending_snapshots, &doomed);
+
+       for (;;) {
+               if (list_empty(&doomed))
+                       break;
+
+               pending = list_entry(doomed.next,
+                                    struct btrfs_pending_snapshot, list);
+               list_del_init(&pending->list);
+               kfree(pending);
+       }
+
+       return 0;
+}
+
+/*
+ * Empty fs_info->delalloc_inodes: every inode queued there is unlinked from
+ * the list and btrfs_invalidate_inodes() is called on its root.
+ *
+ * The shared list is spliced onto the private one only after delalloc_lock
+ * has been taken, so the list is never modified without the lock.
+ *
+ * Always returns 0.
+ */
+static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+{
+       struct btrfs_inode *btrfs_inode;
+       struct list_head splice;
+
+       INIT_LIST_HEAD(&splice);
+
+       spin_lock(&root->fs_info->delalloc_lock);
+
+       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+
+       while (!list_empty(&splice)) {
+               btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                   delalloc_inodes);
+
+               list_del_init(&btrfs_inode->delalloc_inodes);
+
+               /*
+                * NOTE(review): this is called with delalloc_lock (a
+                * spinlock) held - confirm btrfs_invalidate_inodes() never
+                * sleeps.
+                */
+               btrfs_invalidate_inodes(btrfs_inode->root);
+       }
+
+       spin_unlock(&root->fs_info->delalloc_lock);
+
+       return 0;
+}
+
+/*
+ * Throw away the btree pages covered by ranges of @dirty_pages that have
+ * @mark set.
+ *
+ * For each such range the bits are cleared, and for every page of the btree
+ * inode found in the range: the EXTENT_BUFFER_DIRTY bit of the matching
+ * extent buffer (if any) is cleared, pending writeback is ended, the page's
+ * dirty state and dirty tag are cleared and the page is invalidated.  The
+ * reference taken by find_get_page() is dropped once the page is unlocked.
+ *
+ * The scan only stops when find_first_extent_bit() returns non-zero, and
+ * that non-zero value is what this function returns.
+ */
+static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+                                       struct extent_io_tree *dirty_pages,
+                                       int mark)
+{
+       int ret;
+       struct page *page;
+       struct inode *btree_inode = root->fs_info->btree_inode;
+       struct extent_buffer *eb;
+       u64 start = 0;
+       u64 end;
+       u64 offset;
+       unsigned long index;
+
+       while (1) {
+               ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+                                           mark);
+               if (ret)
+                       break;
+
+               clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+               while (start <= end) {
+                       index = start >> PAGE_CACHE_SHIFT;
+                       /* advance to the next page before any 'continue' */
+                       start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+                       page = find_get_page(btree_inode->i_mapping, index);
+                       if (!page)
+                               continue;
+                       offset = page_offset(page);
+
+                       /*
+                        * NOTE(review): the lock taken belongs to
+                        * @dirty_pages while the lookup is done in the io_tree
+                        * of the page's host inode - confirm this is the
+                        * intended lock.
+                        */
+                       spin_lock(&dirty_pages->buffer_lock);
+                       eb = radix_tree_lookup(
+                            &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
+                                              offset >> PAGE_CACHE_SHIFT);
+                       spin_unlock(&dirty_pages->buffer_lock);
+                       if (eb) {
+                               ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+                                                        &eb->bflags);
+                               atomic_set(&eb->refs, 1);
+                       }
+                       if (PageWriteback(page))
+                               end_page_writeback(page);
+
+                       lock_page(page);
+                       if (PageDirty(page)) {
+                               clear_page_dirty_for_io(page);
+                               spin_lock_irq(&page->mapping->tree_lock);
+                               radix_tree_tag_clear(&page->mapping->page_tree,
+                                                       page_index(page),
+                                                       PAGECACHE_TAG_DIRTY);
+                               spin_unlock_irq(&page->mapping->tree_lock);
+                       }
+
+                       page->mapping->a_ops->invalidatepage(page, 0);
+                       unlock_page(page);
+                       /* drop the reference obtained from find_get_page() */
+                       page_cache_release(page);
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Release every range marked EXTENT_DIRTY in @pinned_extents: each range is
+ * passed to btrfs_error_discard_extent(), its dirty bit is cleared and it is
+ * handed to btrfs_error_unpin_extent_range().
+ *
+ * Always returns 0; the result of the discard is stored but never checked.
+ */
+static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+                                      struct extent_io_tree *pinned_extents)
+{
+       struct extent_io_tree *unpin;
+       u64 start;
+       u64 end;
+       int ret;
+
+       unpin = pinned_extents;
+       while (1) {
+               /*
+                * Always search from offset 0: the range found on the
+                * previous pass has had its dirty bit cleared below.
+                */
+               ret = find_first_extent_bit(unpin, 0, &start, &end,
+                                           EXTENT_DIRTY);
+               if (ret)
+                       break;
+
+               /* opt_discard */
+               /* end is inclusive, hence the "+ 1" in the length */
+               ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+
+               clear_extent_dirty(unpin, start, end, GFP_NOFS);
+               btrfs_error_unpin_extent_range(root, start, end);
+               cond_resched();
+       }
+
+       return 0;
+}
+
+/*
+ * Throw away every transaction on fs_info->trans_list.
+ *
+ * For each transaction the ordered operations, ordered extents, delayed
+ * refs, pending snapshots, delalloc inodes, marked dirty pages and pinned
+ * extents are destroyed, waiters on the transaction wait queues are woken,
+ * and the transaction object is freed.
+ *
+ * Always returns 0.
+ */
+static int btrfs_cleanup_transaction(struct btrfs_root *root)
+{
+       struct btrfs_transaction *t;
+       LIST_HEAD(list);
+
+       /* unconditional warning every time this path is entered */
+       WARN_ON(1);
+
+       /*
+        * NOTE(review): trans_mutex is taken before transaction_kthread_mutex
+        * here - confirm this matches the order used by other lockers.
+        */
+       mutex_lock(&root->fs_info->trans_mutex);
+       mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+       /* move all transactions to a private list */
+       list_splice_init(&root->fs_info->trans_list, &list);
+       while (!list_empty(&list)) {
+               t = list_entry(list.next, struct btrfs_transaction, list);
+               /*
+                * NOTE(review): list_entry() on a non-empty list should not
+                * yield NULL, so this check looks dead - confirm.
+                */
+               if (!t)
+                       break;
+
+               btrfs_destroy_ordered_operations(root);
+
+               btrfs_destroy_ordered_extents(root);
+
+               btrfs_destroy_delayed_refs(t, root);
+
+               btrfs_block_rsv_release(root,
+                                       &root->fs_info->trans_block_rsv,
+                                       t->dirty_pages.dirty_bytes);
+
+               /* FIXME: cleanup wait for commit */
+               t->in_commit = 1;
+               t->blocked = 1;
+               if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
+                       wake_up(&root->fs_info->transaction_blocked_wait);
+
+               t->blocked = 0;
+               if (waitqueue_active(&root->fs_info->transaction_wait))
+                       wake_up(&root->fs_info->transaction_wait);
+               /*
+                * trans_mutex is dropped and immediately re-taken here and
+                * again below; presumably to let woken waiters make
+                * progress - TODO confirm.
+                */
+               mutex_unlock(&root->fs_info->trans_mutex);
+
+               mutex_lock(&root->fs_info->trans_mutex);
+               t->commit_done = 1;
+               if (waitqueue_active(&t->commit_wait))
+                       wake_up(&t->commit_wait);
+               mutex_unlock(&root->fs_info->trans_mutex);
+
+               mutex_lock(&root->fs_info->trans_mutex);
+
+               btrfs_destroy_pending_snapshots(t);
+
+               btrfs_destroy_delalloc_inodes(root);
+
+               /* there is no running transaction any more */
+               spin_lock(&root->fs_info->new_trans_lock);
+               root->fs_info->running_transaction = NULL;
+               spin_unlock(&root->fs_info->new_trans_lock);
+
+               btrfs_destroy_marked_extents(root, &t->dirty_pages,
+                                            EXTENT_DIRTY);
+
+               btrfs_destroy_pinned_extent(root,
+                                           root->fs_info->pinned_extents);
+
+               /* unlink, zero and free the transaction object */
+               t->use_count = 0;
+               list_del_init(&t->list);
+               memset(t, 0, sizeof(*t));
+               kmem_cache_free(btrfs_transaction_cachep, t);
+       }
+
+       mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+       mutex_unlock(&root->fs_info->trans_mutex);
+
+       return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
index 88e825a..07b20dc 100644 (file)
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
+int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
index 227e581..b552693 100644 (file)
@@ -3089,7 +3089,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
        return btrfs_reduce_alloc_profile(root, flags);
 }
 
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
+u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
        u64 flags;
 
@@ -3161,8 +3161,12 @@ alloc:
                                             bytes + 2 * 1024 * 1024,
                                             alloc_target, 0);
                        btrfs_end_transaction(trans, root);
-                       if (ret < 0)
-                               return ret;
+                       if (ret < 0) {
+                               if (ret != -ENOSPC)
+                                       return ret;
+                               else
+                                       goto commit_trans;
+                       }
 
                        if (!data_sinfo) {
                                btrfs_set_inode_space_info(root, inode);
@@ -3173,6 +3177,7 @@ alloc:
                spin_unlock(&data_sinfo->lock);
 
                /* commit the current transaction and try again */
+commit_trans:
                if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
@@ -3721,11 +3726,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                return 0;
        }
 
-       WARN_ON(1);
-       printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
-               block_rsv->size, block_rsv->reserved,
-               block_rsv->freed[0], block_rsv->freed[1]);
-
        return -ENOSPC;
 }
 
@@ -7970,13 +7970,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
 
        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
            sinfo->bytes_may_use + sinfo->bytes_readonly +
-           cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+           cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
                sinfo->bytes_reserved += cache->reserved_pinned;
                cache->reserved_pinned = 0;
                cache->ro = 1;
                ret = 0;
        }
+
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
        return ret;
@@ -8012,6 +8013,62 @@ out:
        return ret;
 }
 
+/*
+ * Sum the unused bytes of all read-only block groups on @groups_list.
+ * Block groups flagged RAID1, RAID10 or DUP are counted twice to take
+ * mirrors into account; writable block groups are ignored.
+ */
+static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+{
+       struct btrfs_block_group_cache *bg;
+       u64 total = 0;
+       int factor;
+
+       list_for_each_entry(bg, groups_list, list) {
+               spin_lock(&bg->lock);
+
+               if (bg->ro) {
+                       factor = (bg->flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                                              BTRFS_BLOCK_GROUP_RAID10 |
+                                              BTRFS_BLOCK_GROUP_DUP)) ? 2 : 1;
+
+                       total += (bg->key.offset -
+                                 btrfs_block_group_used(&bg->item)) *
+                                 factor;
+               }
+
+               spin_unlock(&bg->lock);
+       }
+
+       return total;
+}
+
+/*
+ * Sum the unused bytes of all read-only block groups in @sinfo, walking
+ * each of its BTRFS_NR_RAID_TYPES lists under sinfo->lock.  Mirrors are
+ * taken into account by the per-list helper.
+ */
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
+{
+       u64 total = 0;
+       int idx;
+
+       spin_lock(&sinfo->lock);
+
+       for (idx = 0; idx < BTRFS_NR_RAID_TYPES; idx++) {
+               struct list_head *groups = &sinfo->block_groups[idx];
+
+               if (list_empty(groups))
+                       continue;
+
+               total += __btrfs_get_ro_block_group_free_space(groups);
+       }
+
+       spin_unlock(&sinfo->lock);
+
+       return total;
+}
+
 int btrfs_set_block_group_rw(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
@@ -8092,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        mutex_lock(&root->fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 min_free = btrfs_block_group_used(&block_group->item);
-               u64 dev_offset, max_avail;
+               u64 dev_offset;
 
                /*
                 * check to make sure we can actually find a chunk with enough
@@ -8100,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
                        ret = find_free_dev_extent(NULL, device, min_free,
-                                                  &dev_offset, &max_avail);
+                                                  &dev_offset, NULL);
                        if (!ret)
                                break;
                        ret = -1;
@@ -8584,3 +8641,14 @@ out:
        btrfs_free_path(path);
        return ret;
 }
+
+/*
+ * Non-static wrapper around unpin_extent_range(); arguments and return
+ * value are passed through unchanged.
+ */
+int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
+{
+       return unpin_extent_range(root, start, end);
+}
+
+/*
+ * Non-static wrapper around btrfs_discard_extent(); arguments and return
+ * value are passed through unchanged.
+ */
+int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
+                              u64 num_bytes)
+{
+       return btrfs_discard_extent(root, bytenr, num_bytes);
+}
index 3e86b9f..2e993cf 100644 (file)
@@ -2028,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                BUG_ON(extent_map_end(em) <= cur);
                BUG_ON(end < cur);
 
-               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        this_bio_flag = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&this_bio_flag,
+                                                em->compress_type);
+               }
 
                iosize = min(extent_map_end(em) - cur, end - cur + 1);
                cur_end = min(extent_map_end(em) - 1, end);
@@ -3072,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 #endif
 
        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+       if (eb == NULL)
+               return NULL;
        eb->start = start;
        eb->len = len;
        spin_lock_init(&eb->lock);
index 4183c81..7083cfa 100644 (file)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
-/* flags for bio submission */
+/*
+ * flags for bio submission. The high bits indicate the compression
+ * type for this bio
+ */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
@@ -135,6 +139,17 @@ struct extent_buffer {
        wait_queue_head_t lock_wq;
 };
 
+/*
+ * OR @compress_type, shifted left by EXTENT_BIO_FLAG_SHIFT, into
+ * *@bio_flags.  Bits already set in *@bio_flags are left untouched.
+ */
+static inline void extent_set_compress_type(unsigned long *bio_flags,
+                                           int compress_type)
+{
+       *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
+}
+
+/*
+ * Return the part of @bio_flags at and above EXTENT_BIO_FLAG_SHIFT, i.e.
+ * the value stored by extent_set_compress_type().
+ */
+static inline int extent_compress_type(unsigned long bio_flags)
+{
+       return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
+}
+
 struct extent_map_tree;
 
 static inline struct extent_state *extent_state_next(struct extent_state *state)
index 23cb8da..b0e1fce 100644 (file)
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include "ctree.h"
 #include "extent_map.h"
 
 
@@ -54,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
                return em;
        em->in_tree = 0;
        em->flags = 0;
+       em->compress_type = BTRFS_COMPRESS_NONE;
        atomic_set(&em->refs, 1);
        return em;
 }
index ab6d74b..28b44db 100644 (file)
@@ -26,7 +26,8 @@ struct extent_map {
        unsigned long flags;
        struct block_device *bdev;
        atomic_t refs;
-       int in_tree;
+       unsigned int in_tree:1;
+       unsigned int compress_type:4;
 };
 
 struct extent_map_tree {
index a9e0a4e..c800d58 100644 (file)
@@ -225,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
                        split->bdev = em->bdev;
                        split->flags = flags;
+                       split->compress_type = em->compress_type;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
@@ -239,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
                        split->flags = flags;
+                       split->compress_type = em->compress_type;
 
                        if (compressed) {
                                split->block_len = em->block_len;
@@ -891,6 +893,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        if (err)
                goto out;
 
+       /*
+        * If BTRFS flips readonly due to some impossible error
+        * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+        * although we have opened a file as writable, we have
+        * to stop this write operation to ensure FS consistency.
+        */
+       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               err = -EROFS;
+               goto out;
+       }
+
        file_update_time(file);
        BTRFS_I(inode)->sequence++;
 
index 902afbf..160b55b 100644 (file)
@@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        size_t cur_size = size;
        size_t datasize;
        unsigned long offset;
-       int use_compress = 0;
+       int compress_type = BTRFS_COMPRESS_NONE;
 
        if (compressed_size && compressed_pages) {
-               use_compress = 1;
+               compress_type = root->fs_info->compress_type;
                cur_size = compressed_size;
        }
 
@@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        btrfs_set_file_extent_ram_bytes(leaf, ei, size);
        ptr = btrfs_file_extent_inline_start(ei);
 
-       if (use_compress) {
+       if (compress_type != BTRFS_COMPRESS_NONE) {
                struct page *cpage;
                int i = 0;
                while (compressed_size > 0) {
@@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                        compressed_size -= cur_size;
                }
                btrfs_set_file_extent_compression(leaf, ei,
-                                                 BTRFS_COMPRESS_ZLIB);
+                                                 compress_type);
        } else {
                page = find_get_page(inode->i_mapping,
                                     start >> PAGE_CACHE_SHIFT);
@@ -263,6 +263,7 @@ struct async_extent {
        u64 compressed_size;
        struct page **pages;
        unsigned long nr_pages;
+       int compress_type;
        struct list_head list;
 };
 
@@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
                                     u64 start, u64 ram_size,
                                     u64 compressed_size,
                                     struct page **pages,
-                                    unsigned long nr_pages)
+                                    unsigned long nr_pages,
+                                    int compress_type)
 {
        struct async_extent *async_extent;
 
@@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
        async_extent->compressed_size = compressed_size;
        async_extent->pages = pages;
        async_extent->nr_pages = nr_pages;
+       async_extent->compress_type = compress_type;
        list_add_tail(&async_extent->list, &cow->extents);
        return 0;
 }
@@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
+       int compress_type = root->fs_info->compress_type;
 
        actual_end = min_t(u64, isize, end + 1);
 again:
@@ -381,12 +385,16 @@ again:
                WARN_ON(pages);
                pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
-               ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
-                                               total_compressed, pages,
-                                               nr_pages, &nr_pages_ret,
-                                               &total_in,
-                                               &total_compressed,
-                                               max_compressed);
+               if (BTRFS_I(inode)->force_compress)
+                       compress_type = BTRFS_I(inode)->force_compress;
+
+               ret = btrfs_compress_pages(compress_type,
+                                          inode->i_mapping, start,
+                                          total_compressed, pages,
+                                          nr_pages, &nr_pages_ret,
+                                          &total_in,
+                                          &total_compressed,
+                                          max_compressed);
 
                if (!ret) {
                        unsigned long offset = total_compressed &
@@ -493,7 +501,8 @@ again:
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
-                                total_compressed, pages, nr_pages_ret);
+                                total_compressed, pages, nr_pages_ret,
+                                compress_type);
 
                if (start + num_bytes < end) {
                        start += num_bytes;
@@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed:
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
-               add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+               add_async_extent(async_cow, start, end - start + 1,
+                                0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }
 
@@ -640,6 +650,7 @@ retry:
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
+               em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
@@ -656,11 +667,13 @@ retry:
                                                async_extent->ram_size - 1, 0);
                }
 
-               ret = btrfs_add_ordered_extent(inode, async_extent->start,
-                                              ins.objectid,
-                                              async_extent->ram_size,
-                                              ins.offset,
-                                              BTRFS_ORDERED_COMPRESSED);
+               ret = btrfs_add_ordered_extent_compress(inode,
+                                               async_extent->start,
+                                               ins.objectid,
+                                               async_extent->ram_size,
+                                               ins.offset,
+                                               BTRFS_ORDERED_COMPRESSED,
+                                               async_extent->compress_type);
                BUG_ON(ret);
 
                /*
@@ -1670,7 +1683,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
-       int compressed = 0;
+       int compress_type = 0;
        int ret;
        bool nolock = false;
 
@@ -1711,9 +1724,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
-               compressed = 1;
+               compress_type = ordered_extent->compress_type;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-               BUG_ON(compressed);
+               BUG_ON(compress_type);
                ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
@@ -1727,7 +1740,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->disk_len,
                                                ordered_extent->len,
                                                ordered_extent->len,
-                                               compressed, 0, 0,
+                                               compress_type, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
                                   ordered_extent->file_offset,
@@ -1829,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                        logical = em->block_start;
                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+                       extent_set_compress_type(&failrec->bio_flags,
+                                                em->compress_type);
                }
                failrec->logical = logical;
                free_extent_map(em);
@@ -3671,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
        int err;
 
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
        err = inode_change_ok(inode, attr);
        if (err)
                return err;
@@ -4928,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
        size_t max_size;
        unsigned long inline_size;
        unsigned long ptr;
+       int compress_type;
 
        WARN_ON(pg_offset != 0);
+       compress_type = btrfs_file_extent_compression(leaf, item);
        max_size = btrfs_file_extent_ram_bytes(leaf, item);
        inline_size = btrfs_file_extent_inline_item_len(leaf,
                                        btrfs_item_nr(leaf, path->slots[0]));
@@ -4939,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
        read_extent_buffer(leaf, tmp, ptr, inline_size);
 
        max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
-       ret = btrfs_zlib_decompress(tmp, page, extent_offset,
-                                   inline_size, max_size);
+       ret = btrfs_decompress(compress_type, tmp, page,
+                              extent_offset, inline_size, max_size);
        if (ret) {
                char *kaddr = kmap_atomic(page, KM_USER0);
                unsigned long copy_size = min_t(u64,
@@ -4982,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_trans_handle *trans = NULL;
-       int compressed;
+       int compress_type;
 
 again:
        read_lock(&em_tree->lock);
@@ -5041,7 +5062,7 @@ again:
 
        found_type = btrfs_file_extent_type(leaf, item);
        extent_start = found_key.offset;
-       compressed = btrfs_file_extent_compression(leaf, item);
+       compress_type = btrfs_file_extent_compression(leaf, item);
        if (found_type == BTRFS_FILE_EXTENT_REG ||
            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
                extent_end = extent_start +
@@ -5087,8 +5108,9 @@ again:
                        em->block_start = EXTENT_MAP_HOLE;
                        goto insert;
                }
-               if (compressed) {
+               if (compress_type != BTRFS_COMPRESS_NONE) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                       em->compress_type = compress_type;
                        em->block_start = bytenr;
                        em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
                                                                         item);
@@ -5122,12 +5144,14 @@ again:
                em->len = (copy_size + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
                em->orig_start = EXTENT_MAP_INLINE;
-               if (compressed)
+               if (compress_type) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+                       em->compress_type = compress_type;
+               }
                ptr = btrfs_file_extent_inline_start(item) + extent_offset;
                if (create == 0 && !PageUptodate(page)) {
-                       if (btrfs_file_extent_compression(leaf, item) ==
-                           BTRFS_COMPRESS_ZLIB) {
+                       if (btrfs_file_extent_compression(leaf, item) !=
+                           BTRFS_COMPRESS_NONE) {
                                ret = uncompress_inline(path, inode, page,
                                                        pg_offset,
                                                        extent_offset, item);
@@ -6477,7 +6501,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->ordered_data_close = 0;
        ei->orphan_meta_reserved = 0;
        ei->dummy_inode = 0;
-       ei->force_compress = 0;
+       ei->force_compress = BTRFS_COMPRESS_NONE;
 
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
@@ -7105,6 +7129,10 @@ static int btrfs_set_page_dirty(struct page *page)
 
 static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
 {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
+               return -EROFS;
        if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
                return -EACCES;
        return generic_permission(inode, mask, flags, btrfs_check_acl);
index f87552a..a506a22 100644 (file)
@@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        unsigned int flags, oldflags;
        int ret;
 
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
        if (copy_from_user(&flags, arg, sizeof(flags)))
                return -EFAULT;
 
@@ -360,7 +363,8 @@ fail:
 }
 
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-                          char *name, int namelen, u64 *async_transid)
+                          char *name, int namelen, u64 *async_transid,
+                          bool readonly)
 {
        struct inode *inode;
        struct dentry *parent;
@@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        btrfs_init_block_rsv(&pending_snapshot->block_rsv);
        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
+       pending_snapshot->readonly = readonly;
 
        trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
        if (IS_ERR(trans)) {
@@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
 static noinline int btrfs_mksubvol(struct path *parent,
                                   char *name, int namelen,
                                   struct btrfs_root *snap_src,
-                                  u64 *async_transid)
+                                  u64 *async_transid, bool readonly)
 {
        struct inode *dir  = parent->dentry->d_inode;
        struct dentry *dentry;
@@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
 
        if (snap_src) {
                error = create_snapshot(snap_src, dentry,
-                                       name, namelen, async_transid);
+                                       name, namelen, async_transid, readonly);
        } else {
                error = create_subvol(BTRFS_I(dir)->root, dentry,
                                      name, namelen, async_transid);
@@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file,
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_ordered_extent *ordered;
        struct page *page;
+       struct btrfs_super_block *disk_super;
        unsigned long last_index;
        unsigned long ra_pages = root->fs_info->bdi.ra_pages;
        unsigned long total_read = 0;
+       u64 features;
        u64 page_start;
        u64 page_end;
        u64 last_len = 0;
@@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file,
        u64 defrag_end = 0;
        unsigned long i;
        int ret;
+       int compress_type = BTRFS_COMPRESS_ZLIB;
+
+       if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+               if (range->compress_type > BTRFS_COMPRESS_TYPES)
+                       return -EINVAL;
+               if (range->compress_type)
+                       compress_type = range->compress_type;
+       }
 
        if (inode->i_size == 0)
                return 0;
@@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file,
                total_read++;
                mutex_lock(&inode->i_mutex);
                if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-                       BTRFS_I(inode)->force_compress = 1;
+                       BTRFS_I(inode)->force_compress = compress_type;
 
                ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
                if (ret)
@@ -781,10 +796,17 @@ loop_unlock:
                atomic_dec(&root->fs_info->async_submit_draining);
 
                mutex_lock(&inode->i_mutex);
-               BTRFS_I(inode)->force_compress = 0;
+               BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
                mutex_unlock(&inode->i_mutex);
        }
 
+       disk_super = &root->fs_info->super_copy;
+       features = btrfs_super_incompat_flags(disk_super);
+       if (range->compress_type == BTRFS_COMPRESS_LZO) {
+               features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+               btrfs_set_super_incompat_flags(disk_super, features);
+       }
+
        return 0;
 
 err_reservations:
@@ -901,7 +923,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
                                                    char *name,
                                                    unsigned long fd,
                                                    int subvol,
-                                                   u64 *transid)
+                                                   u64 *transid,
+                                                   bool readonly)
 {
        struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct file *src_file;
@@ -919,7 +942,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 
        if (subvol) {
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
-                                    NULL, transid);
+                                    NULL, transid, readonly);
        } else {
                struct inode *src_inode;
                src_file = fget(fd);
@@ -938,7 +961,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
                }
                ret = btrfs_mksubvol(&file->f_path, name, namelen,
                                     BTRFS_I(src_inode)->root,
-                                    transid);
+                                    transid, readonly);
                fput(src_file);
        }
 out:
@@ -946,58 +969,139 @@ out:
 }
 
 static noinline int btrfs_ioctl_snap_create(struct file *file,
-                                           void __user *arg, int subvol,
-                                           int v2)
+                                           void __user *arg, int subvol)
 {
-       struct btrfs_ioctl_vol_args *vol_args = NULL;
-       struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL;
-       char *name;
-       u64 fd;
+       struct btrfs_ioctl_vol_args *vol_args;
        int ret;
 
-       if (v2) {
-               u64 transid = 0;
-               u64 *ptr = NULL;
+       vol_args = memdup_user(arg, sizeof(*vol_args));
+       if (IS_ERR(vol_args))
+               return PTR_ERR(vol_args);
+       vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-               vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2));
-               if (IS_ERR(vol_args_v2))
-                       return PTR_ERR(vol_args_v2);
+       ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+                                             vol_args->fd, subvol,
+                                             NULL, false);
 
-               if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-
-               name = vol_args_v2->name;
-               fd = vol_args_v2->fd;
-               vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
+       kfree(vol_args);
+       return ret;
+}
 
-               if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC)
-                       ptr = &transid;
+static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
+                                              void __user *arg, int subvol)
+{
+       struct btrfs_ioctl_vol_args_v2 *vol_args;
+       int ret;
+       u64 transid = 0;
+       u64 *ptr = NULL;
+       bool readonly = false;
 
-               ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-                                                     subvol, ptr);
+       vol_args = memdup_user(arg, sizeof(*vol_args));
+       if (IS_ERR(vol_args))
+               return PTR_ERR(vol_args);
+       vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 
-               if (ret == 0 && ptr &&
-                   copy_to_user(arg +
-                                offsetof(struct btrfs_ioctl_vol_args_v2,
-                                         transid), ptr, sizeof(*ptr)))
-                       ret = -EFAULT;
-       } else {
-               vol_args = memdup_user(arg, sizeof(*vol_args));
-               if (IS_ERR(vol_args))
-                       return PTR_ERR(vol_args);
-               name = vol_args->name;
-               fd = vol_args->fd;
-               vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-
-               ret = btrfs_ioctl_snap_create_transid(file, name, fd,
-                                                     subvol, NULL);
+       if (vol_args->flags &
+           ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+               ret = -EOPNOTSUPP;
+               goto out;
        }
+
+       if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
+               ptr = &transid;
+       if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
+               readonly = true;
+
+       ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
+                                             vol_args->fd, subvol,
+                                             ptr, readonly);
+
+       if (ret == 0 && ptr &&
+           copy_to_user(arg +
+                        offsetof(struct btrfs_ioctl_vol_args_v2,
+                                 transid), ptr, sizeof(*ptr)))
+               ret = -EFAULT;
 out:
        kfree(vol_args);
-       kfree(vol_args_v2);
+       return ret;
+}
 
+/*
+ * BTRFS_IOC_SUBVOL_GETFLAGS: copy the flag word of the subvolume that
+ * @file belongs to out to the u64 at @arg.
+ *
+ * The request is rejected with -EINVAL unless the inode number is
+ * BTRFS_FIRST_FREE_OBJECTID (presumably the subvolume's root directory).
+ * The only flag reported is BTRFS_SUBVOL_RDONLY.  Returns 0, -EINVAL or
+ * -EFAULT.
+ */
+static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+                                               void __user *arg)
+{
+       struct inode *dir = fdentry(file)->d_inode;
+       struct btrfs_root *subvol = BTRFS_I(dir)->root;
+       u64 out_flags = 0;
+
+       if (dir->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+               return -EINVAL;
+
+       /* sample the read-only state while holding subvol_sem for read */
+       down_read(&subvol->fs_info->subvol_sem);
+       if (btrfs_root_readonly(subvol))
+               out_flags = BTRFS_SUBVOL_RDONLY;
+       up_read(&subvol->fs_info->subvol_sem);
+
+       if (copy_to_user(arg, &out_flags, sizeof(out_flags)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * BTRFS_IOC_SUBVOL_SETFLAGS: set or clear the read-only flag of the
+ * subvolume that @file belongs to.
+ *
+ * @arg points to a u64 flag word.  BTRFS_SUBVOL_RDONLY is the only flag
+ * that can be changed here: BTRFS_SUBVOL_CREATE_ASYNC is rejected with
+ * -EINVAL and any other bit with -EOPNOTSUPP.
+ *
+ * Returns 0 on success (also when the flag already has the requested
+ * value), -EROFS on a read-only mount, -EINVAL if the inode number is
+ * not BTRFS_FIRST_FREE_OBJECTID, -EFAULT if @arg cannot be read, or the
+ * error from starting the transaction / updating the root item.
+ */
+static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
+                                               void __user *arg)
+{
+       struct inode *inode = fdentry(file)->d_inode;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_trans_handle *trans;
+       u64 root_flags;
+       u64 flags;
+       int ret = 0;
+
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+               return -EINVAL;
+
+       if (copy_from_user(&flags, arg, sizeof(flags)))
+               return -EFAULT;
+
+       /*
+        * ASYNC only makes sense at creation time.  Test the bit itself:
+        * masking with its complement would reject BTRFS_SUBVOL_RDONLY
+        * and make this ioctl unusable.
+        */
+       if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
+               return -EINVAL;
+
+       if (flags & ~BTRFS_SUBVOL_RDONLY)
+               return -EOPNOTSUPP;
+
+       down_write(&root->fs_info->subvol_sem);
+
+       /* nothing to do */
+       if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
+               goto out;
+
+       /* remember the old flags so they can be restored on failure */
+       root_flags = btrfs_root_flags(&root->root_item);
+       if (flags & BTRFS_SUBVOL_RDONLY)
+               btrfs_set_root_flags(&root->root_item,
+                                    root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
+       else
+               btrfs_set_root_flags(&root->root_item,
+                                    root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_reset;
+       }
+
+       /*
+        * The root item is stored in the tree of tree roots, not in the
+        * subvolume's own tree, so that is where it must be updated.
+        */
+       ret = btrfs_update_root(trans, root->fs_info->tree_root,
+                               &root->root_key, &root->root_item);
+
+       btrfs_commit_transaction(trans, root);
+out_reset:
+       if (ret)
+               btrfs_set_root_flags(&root->root_item, root_flags);
+out:
+       up_write(&root->fs_info->subvol_sem);
        return ret;
 }
 
@@ -1509,6 +1613,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
        struct btrfs_ioctl_defrag_range_args *range;
        int ret;
 
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
        ret = mnt_want_write(file->f_path.mnt);
        if (ret)
                return ret;
@@ -1637,6 +1744,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
                return -EINVAL;
 
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
        ret = mnt_want_write(file->f_path.mnt);
        if (ret)
                return ret;
@@ -1958,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
        if (file->private_data)
                goto out;
 
+       ret = -EROFS;
+       if (btrfs_root_readonly(root))
+               goto out;
+
        ret = mnt_want_write(file->f_path.mnt);
        if (ret)
                goto out;
@@ -2257,13 +2371,17 @@ long btrfs_ioctl(struct file *file, unsigned int
        case FS_IOC_GETVERSION:
                return btrfs_ioctl_getversion(file, argp);
        case BTRFS_IOC_SNAP_CREATE:
-               return btrfs_ioctl_snap_create(file, argp, 0, 0);
+               return btrfs_ioctl_snap_create(file, argp, 0);
        case BTRFS_IOC_SNAP_CREATE_V2:
-               return btrfs_ioctl_snap_create(file, argp, 0, 1);
+               return btrfs_ioctl_snap_create_v2(file, argp, 0);
        case BTRFS_IOC_SUBVOL_CREATE:
-               return btrfs_ioctl_snap_create(file, argp, 1, 0);
+               return btrfs_ioctl_snap_create(file, argp, 1);
        case BTRFS_IOC_SNAP_DESTROY:
                return btrfs_ioctl_snap_destroy(file, argp);
+       case BTRFS_IOC_SUBVOL_GETFLAGS:
+               return btrfs_ioctl_subvol_getflags(file, argp);
+       case BTRFS_IOC_SUBVOL_SETFLAGS:
+               return btrfs_ioctl_subvol_setflags(file, argp);
        case BTRFS_IOC_DEFAULT_SUBVOL:
                return btrfs_ioctl_default_subvol(file, argp);
        case BTRFS_IOC_DEFRAG:
index c344d12..8fb3821 100644 (file)
@@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args {
 };
 
 #define BTRFS_SUBVOL_CREATE_ASYNC      (1ULL << 0)
+#define BTRFS_SUBVOL_RDONLY            (1ULL << 1)
 
 #define BTRFS_SUBVOL_NAME_MAX 4039
 struct btrfs_ioctl_vol_args_v2 {
@@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args {
         */
        __u32 extent_thresh;
 
+       /*
+        * which compression method to use if turning on compression
+        * for this defrag operation.  If unspecified, zlib will
+        * be used
+        */
+       __u32 compress_type;
+
        /* spare for later */
-       __u32 unused[5];
+       __u32 unused[4];
 };
 
 struct btrfs_ioctl_space_info {
@@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args {
 #define BTRFS_IOC_WAIT_SYNC  _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
 #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
                                   struct btrfs_ioctl_vol_args_v2)
+#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
+#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
 #endif
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644 (file)
index 0000000..cc9b450
--- /dev/null
@@ -0,0 +1,420 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+#include <linux/lzo.h>
+#include "compression.h"
+
+#define LZO_LEN        4
+
+struct workspace {
+       void *mem;      /* scratch memory handed to lzo1x_1_compress() */
+       void *buf;      /* where decompressed data goes */
+       void *cbuf;     /* where compressed data goes */
+       struct list_head list;
+};
+
+/* Tear down a workspace created by lzo_alloc_workspace(). */
+static void lzo_free_workspace(struct list_head *ws)
+{
+       struct workspace *w = list_entry(ws, struct workspace, list);
+
+       /* members may be NULL when called from the allocation error path */
+       vfree(w->mem);
+       vfree(w->cbuf);
+       vfree(w->buf);
+       kfree(w);
+}
+
+/*
+ * Build a workspace: the compressor's scratch memory plus two bounce
+ * buffers, each sized for the worst-case LZO expansion of one page.
+ *
+ * Returns the workspace's embedded list head, or ERR_PTR(-ENOMEM).
+ */
+static struct list_head *lzo_alloc_workspace(void)
+{
+       struct workspace *w = kzalloc(sizeof(*w), GFP_NOFS);
+
+       if (!w)
+               return ERR_PTR(-ENOMEM);
+
+       w->mem = vmalloc(LZO1X_MEM_COMPRESS);
+       w->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+       w->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+
+       if (w->mem && w->buf && w->cbuf) {
+               INIT_LIST_HEAD(&w->list);
+               return &w->list;
+       }
+
+       /* at least one allocation failed; release whatever we did get */
+       lzo_free_workspace(&w->list);
+       return ERR_PTR(-ENOMEM);
+}
+
+/* Store @len at @buf as a little-endian 32-bit value (LZO_LEN bytes). */
+static inline void write_compress_length(char *buf, size_t len)
+{
+       __le32 le_len = cpu_to_le32(len);
+
+       memcpy(buf, &le_len, LZO_LEN);
+}
+
+/* Fetch the little-endian 32-bit length stored in LZO_LEN bytes at @buf. */
+static inline size_t read_compress_length(char *buf)
+{
+       __le32 le_len;
+
+       memcpy(&le_len, buf, LZO_LEN);
+
+       return le32_to_cpu(le_len);
+}
+
+/*
+ * Compress @len bytes of page cache data from @mapping, starting at file
+ * offset @start, into freshly allocated pages stored in @pages.
+ *
+ * Input is fed to LZO at most one page at a time.  The output stream is:
+ *
+ *   LZO_LEN bytes   total size of the stream (tot_out)
+ *   then, per input chunk:
+ *   LZO_LEN bytes   size of the compressed segment
+ *   segment data    (may continue onto the next output page)
+ *
+ * A segment length header never straddles a page: if fewer than LZO_LEN
+ * bytes are left in the output page once a segment ends, the remainder
+ * is zero-filled (and counted in tot_out) and the next header starts on
+ * a new page.
+ *
+ * At most @nr_dest_pages output pages are used.  *out_pages always
+ * receives the number of pages placed in @pages, even on failure.
+ * *total_in and *total_out are only filled in on success and are 0
+ * otherwise.  The loop stops early once the output exceeds @max_out.
+ *
+ * Returns 0 on success, -ENOMEM if an output page cannot be allocated,
+ * and -1 if LZO reports an error or @nr_dest_pages is exhausted.  When
+ * the function gives up because the output is larger than the input,
+ * ret still holds the result of the last lzo1x_1_compress() call and the
+ * totals stay 0.
+ */
+static int lzo_compress_pages(struct list_head *ws,
+                             struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out)
+{
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
+       int ret = 0;
+       char *data_in;
+       char *cpage_out;
+       int nr_pages = 0;
+       struct page *in_page = NULL;
+       struct page *out_page = NULL;
+       unsigned long bytes_left;
+
+       size_t in_len;
+       size_t out_len;
+       char *buf;
+       unsigned long tot_in = 0;
+       unsigned long tot_out = 0;
+       unsigned long pg_bytes_left;
+       unsigned long out_offset;
+       unsigned long bytes;
+
+       *out_pages = 0;
+       *total_out = 0;
+       *total_in = 0;
+
+       in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+       data_in = kmap(in_page);
+
+       /*
+        * store the size of all chunks of compressed data in
+        * the first 4 bytes
+        */
+       out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (out_page == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       cpage_out = kmap(out_page);
+       /* leave room for the total-length header, written at the end */
+       out_offset = LZO_LEN;
+       tot_out = LZO_LEN;
+       pages[0] = out_page;
+       nr_pages = 1;
+       pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+       /* compress at most one page of data each time */
+       in_len = min(len, PAGE_CACHE_SIZE);
+       while (tot_in < len) {
+               ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
+                                      &out_len, workspace->mem);
+               if (ret != LZO_E_OK) {
+                       printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+                              ret);
+                       ret = -1;
+                       goto out;
+               }
+
+               /* store the size of this chunk of compressed data */
+               write_compress_length(cpage_out + out_offset, out_len);
+               tot_out += LZO_LEN;
+               out_offset += LZO_LEN;
+               pg_bytes_left -= LZO_LEN;
+
+               tot_in += in_len;
+               tot_out += out_len;
+
+               /* copy bytes from the working buffer into the pages */
+               buf = workspace->cbuf;
+               while (out_len) {
+                       bytes = min_t(unsigned long, pg_bytes_left, out_len);
+
+                       memcpy(cpage_out + out_offset, buf, bytes);
+
+                       out_len -= bytes;
+                       pg_bytes_left -= bytes;
+                       buf += bytes;
+                       out_offset += bytes;
+
+                       /*
+                        * we need another page for writing out.
+                        *
+                        * Note if there's less than 4 bytes left, we just
+                        * skip to a new page.
+                        */
+                       if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
+                           pg_bytes_left == 0) {
+                               /* zero the unusable tail and account for it */
+                               if (pg_bytes_left) {
+                                       memset(cpage_out + out_offset, 0,
+                                              pg_bytes_left);
+                                       tot_out += pg_bytes_left;
+                               }
+
+                               /* we're done, don't allocate new page */
+                               if (out_len == 0 && tot_in >= len)
+                                       break;
+
+                               kunmap(out_page);
+                               if (nr_pages == nr_dest_pages) {
+                                       /* already unmapped; keep "out" from unmapping again */
+                                       out_page = NULL;
+                                       ret = -1;
+                                       goto out;
+                               }
+
+                               out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                               if (out_page == NULL) {
+                                       ret = -ENOMEM;
+                                       goto out;
+                               }
+                               cpage_out = kmap(out_page);
+                               pages[nr_pages++] = out_page;
+
+                               pg_bytes_left = PAGE_CACHE_SIZE;
+                               out_offset = 0;
+                       }
+               }
+
+               /* we're making it bigger, give up */
+               if (tot_in > 8192 && tot_in < tot_out)
+                       goto out;
+
+               /* we're all done */
+               if (tot_in >= len)
+                       break;
+
+               if (tot_out > max_out)
+                       break;
+
+               /* advance to the next input page */
+               bytes_left = len - tot_in;
+               kunmap(in_page);
+               page_cache_release(in_page);
+
+               start += PAGE_CACHE_SIZE;
+               in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+               data_in = kmap(in_page);
+               in_len = min(bytes_left, PAGE_CACHE_SIZE);
+       }
+
+       /* no gain from compressing: leave the totals at 0 */
+       if (tot_out > tot_in)
+               goto out;
+
+       /* store the size of all chunks of compressed data */
+       cpage_out = kmap(pages[0]);
+       write_compress_length(cpage_out, tot_out);
+
+       kunmap(pages[0]);
+
+       ret = 0;
+       *total_out = tot_out;
+       *total_in = tot_in;
+out:
+       *out_pages = nr_pages;
+       if (out_page)
+               kunmap(out_page);
+
+       if (in_page) {
+               kunmap(in_page);
+               page_cache_release(in_page);
+       }
+
+       return ret;
+}
+
+/*
+ * Decompress an LZO stream held in @pages_in (@srclen bytes) into the
+ * pages described by @bvec / @vcnt.
+ *
+ * The stream starts with an LZO_LEN-byte total length, followed by
+ * segments that are each prefixed with their own LZO_LEN-byte length.
+ * A segment header never straddles a page; when fewer than LZO_LEN
+ * bytes are left in an input page after a segment, the rest of that
+ * page is skipped.
+ *
+ * Segments that sit entirely inside one input page are decompressed in
+ * place; segments that cross a page boundary are first gathered into
+ * workspace->cbuf.  Decompressed data lands in workspace->buf and is
+ * handed to btrfs_decompress_buf2page().
+ *
+ * Returns 0 on success and -1 if a segment length is larger than the
+ * bounce buffer, the stream runs past @srclen, or LZO rejects the data.
+ */
+static int lzo_decompress_biovec(struct list_head *ws,
+                                struct page **pages_in,
+                                u64 disk_start,
+                                struct bio_vec *bvec,
+                                int vcnt,
+                                size_t srclen)
+{
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
+       int ret = 0, ret2;
+       char *data_in;
+       unsigned long page_in_index = 0;
+       unsigned long page_out_index = 0;
+       unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+                                       PAGE_CACHE_SIZE;
+       unsigned long buf_start;
+       unsigned long buf_offset = 0;
+       unsigned long bytes;
+       unsigned long working_bytes;
+       unsigned long pg_offset;
+
+       size_t in_len;
+       size_t out_len;
+       unsigned long in_offset;
+       unsigned long in_page_bytes_left;
+       unsigned long tot_in;
+       unsigned long tot_out;
+       unsigned long tot_len;
+       char *buf;
+
+       data_in = kmap(pages_in[0]);
+       tot_len = read_compress_length(data_in);
+
+       tot_in = LZO_LEN;
+       in_offset = LZO_LEN;
+       /* never trust the stored total beyond what was actually read */
+       tot_len = min_t(size_t, srclen, tot_len);
+       in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+
+       tot_out = 0;
+       pg_offset = 0;
+
+       while (tot_in < tot_len) {
+               in_len = read_compress_length(data_in + in_offset);
+
+               /*
+                * in_len comes straight from disk.  Each segment is the
+                * compressed form of at most one page, so it can never
+                * exceed the worst-case size that workspace->cbuf was
+                * allocated with.  Anything larger is corruption and
+                * would overflow the bounce buffer in the copy loop below.
+                */
+               if (in_len > lzo1x_worst_compress(PAGE_CACHE_SIZE)) {
+                       printk(KERN_WARNING "btrfs decompress failed\n");
+                       ret = -1;
+                       goto done;
+               }
+
+               in_page_bytes_left -= LZO_LEN;
+               in_offset += LZO_LEN;
+               tot_in += LZO_LEN;
+
+               tot_in += in_len;
+               working_bytes = in_len;
+
+               /* fast path: avoid using the working buffer */
+               if (in_page_bytes_left >= in_len) {
+                       buf = data_in + in_offset;
+                       bytes = in_len;
+                       goto cont;
+               }
+
+               /* copy bytes from the pages into the working buffer */
+               buf = workspace->cbuf;
+               buf_offset = 0;
+               while (working_bytes) {
+                       bytes = min(working_bytes, in_page_bytes_left);
+
+                       memcpy(buf + buf_offset, data_in + in_offset, bytes);
+                       buf_offset += bytes;
+cont:
+                       working_bytes -= bytes;
+                       in_page_bytes_left -= bytes;
+                       in_offset += bytes;
+
+                       /* check if we need to pick another page */
+                       if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
+                           || in_page_bytes_left == 0) {
+                               /* skip the padding at the end of the page */
+                               tot_in += in_page_bytes_left;
+
+                               if (working_bytes == 0 && tot_in >= tot_len)
+                                       break;
+
+                               kunmap(pages_in[page_in_index]);
+                               page_in_index++;
+                               if (page_in_index >= total_pages_in) {
+                                       ret = -1;
+                                       /* nothing is mapped any more */
+                                       data_in = NULL;
+                                       goto done;
+                               }
+                               data_in = kmap(pages_in[page_in_index]);
+
+                               in_page_bytes_left = PAGE_CACHE_SIZE;
+                               in_offset = 0;
+                       }
+               }
+
+               /* capacity of workspace->buf */
+               out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+               ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
+                                           &out_len);
+               if (ret != LZO_E_OK) {
+                       printk(KERN_WARNING "btrfs decompress failed\n");
+                       ret = -1;
+                       break;
+               }
+
+               buf_start = tot_out;
+               tot_out += out_len;
+
+               ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+                                                tot_out, disk_start,
+                                                bvec, vcnt,
+                                                &page_out_index, &pg_offset);
+               /* a zero return ends the loop; ret stays 0 */
+               if (ret2 == 0)
+                       break;
+       }
+done:
+       if (data_in)
+               kunmap(pages_in[page_in_index]);
+       return ret;
+}
+
+/*
+ * Decompress a single-segment LZO stream held in the linear buffer
+ * @data_in (@srclen bytes) and copy the result into @dest_page.
+ *
+ * The buffer starts with an LZO_LEN-byte total length followed by an
+ * LZO_LEN-byte segment length and the segment itself.  Only that first
+ * segment is decoded.  Up to @destlen bytes of the decompressed data,
+ * starting at offset @start_byte, are copied to the start of @dest_page.
+ *
+ * Returns 0 on success, or -1 if the buffer is too short for its
+ * headers, the segment length points past the end of the buffer, LZO
+ * rejects the data, or the output is shorter than @start_byte.
+ */
+static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen)
+{
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
+       size_t in_len;
+       size_t out_len;
+       int ret = 0;
+       char *kaddr;
+       unsigned long bytes;
+
+       /*
+        * Both length headers are read below, so the input must hold at
+        * least two of them.  This is on-disk data: fail the read rather
+        * than taking the machine down.
+        */
+       if (srclen < 2 * LZO_LEN) {
+               printk(KERN_WARNING "btrfs decompress failed!\n");
+               ret = -1;
+               goto out;
+       }
+
+       /* skip the total length; it is not needed for a single segment */
+       data_in += LZO_LEN;
+
+       in_len = read_compress_length(data_in);
+       data_in += LZO_LEN;
+
+       /* the segment must lie entirely inside the buffer we were given */
+       if (in_len > srclen - 2 * LZO_LEN) {
+               printk(KERN_WARNING "btrfs decompress failed!\n");
+               ret = -1;
+               goto out;
+       }
+
+       out_len = PAGE_CACHE_SIZE;
+       ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
+       if (ret != LZO_E_OK) {
+               printk(KERN_WARNING "btrfs decompress failed!\n");
+               ret = -1;
+               goto out;
+       }
+
+       if (out_len < start_byte) {
+               ret = -1;
+               goto out;
+       }
+
+       bytes = min_t(unsigned long, destlen, out_len - start_byte);
+
+       kaddr = kmap_atomic(dest_page, KM_USER0);
+       memcpy(kaddr, workspace->buf + start_byte, bytes);
+       kunmap_atomic(kaddr, KM_USER0);
+out:
+       return ret;
+}
+
+/* Operation table that exposes the LZO routines defined in this file. */
+struct btrfs_compress_op btrfs_lzo_compress = {
+       .alloc_workspace        = lzo_alloc_workspace,
+       .free_workspace         = lzo_free_workspace,
+       .compress_pages         = lzo_compress_pages,
+       .decompress_biovec      = lzo_decompress_biovec,
+       .decompress             = lzo_decompress,
+};
index ae7737e..2b61e1d 100644 (file)
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
-                                     int type, int dio)
+                                     int type, int dio, int compress_type)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->disk_len = disk_len;
        entry->bytes_left = len;
        entry->inode = inode;
+       entry->compress_type = compress_type;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);
 
@@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type)
 {
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
-                                         disk_len, type, 0);
+                                         disk_len, type, 0,
+                                         BTRFS_COMPRESS_NONE);
 }
 
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
                                 u64 start, u64 len, u64 disk_len, int type)
 {
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
-                                         disk_len, type, 1);
+                                         disk_len, type, 1,
+                                         BTRFS_COMPRESS_NONE);
+}
+
+/*
+ * Add an ordered extent while recording a caller-supplied compression type.
+ * Forwards to __btrfs_add_ordered_extent() with the dio argument set to 0
+ * and @compress_type passed through unchanged.
+ */
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+                                     u64 start, u64 len, u64 disk_len,
+                                     int type, int compress_type)
+{
+       return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+                                         disk_len, type, 0,
+                                         compress_type);
 }
 
 /*
index 61dca83..ff1f69a 100644 (file)
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
-#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
 
 #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
 
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
        /* flags (described above) */
        unsigned long flags;
 
+       /* compression algorithm */
+       int compress_type;
+
        /* reference count */
        atomic_t refs;
 
@@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
                                 u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
+                                     u64 start, u64 len, u64 disk_len,
+                                     int type, int compress_type);
 int btrfs_add_ordered_sum(struct inode *inode,
                          struct btrfs_ordered_extent *entry,
                          struct btrfs_ordered_sum *sum);
index 22acdaa..b2130c4 100644 (file)
 
 static const struct super_operations btrfs_super_ops;
 
+/*
+ * Translate a negative errno value into a human readable string.
+ *
+ * -EIO, -ENOMEM and -EROFS map to fixed strings.  Any other value is
+ * formatted as "error <n>" into @nbuf (16 bytes), where <n> is the negated
+ * @errno.  In that fallback case NULL is returned if @nbuf is NULL or the
+ * formatting fails.  @fs_info is currently unused.
+ */
+static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
+                                     char nbuf[16])
+{
+       /* points at string literals or at nbuf; it is only ever read */
+       const char *errstr = NULL;
+
+       switch (errno) {
+       case -EIO:
+               errstr = "IO failure";
+               break;
+       case -ENOMEM:
+               errstr = "Out of memory";
+               break;
+       case -EROFS:
+               errstr = "Readonly filesystem";
+               break;
+       default:
+               if (nbuf) {
+                       if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+                               errstr = nbuf;
+               }
+               break;
+       }
+
+       return errstr;
+}
+
+/*
+ * Record in memory that the filesystem has hit an error.
+ *
+ * fs_state is tested as a bit mask by its readers, so the error flag is
+ * OR-ed in; a plain assignment would wipe any other bits already set.
+ */
+static void __save_error_info(struct btrfs_fs_info *fs_info)
+{
+       /*
+        * today we only save the error info into ram.  Long term we'll
+        * also send it down to the disk
+        */
+       fs_info->fs_state |= BTRFS_SUPER_FLAG_ERROR;
+}
+
+/*
+ * Mark the filesystem as having encountered an error.  Currently this only
+ * forwards to __save_error_info().
+ *
+ * NOTE:
+ *     The write_super work was moved to umount time in order to avoid a
+ *     deadlock, since umount holds all the locks.
+ */
+static void save_error_info(struct btrfs_fs_info *fs_info)
+{
+       __save_error_info(fs_info);
+}
+
+/*
+ * Respond to a recorded error by switching the filesystem to read-only.
+ * Nothing is done when the superblock is already read-only or when no
+ * error flag is set in fs_state.
+ */
+static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
+{
+       struct super_block *super = fs_info->sb;
+
+       /* already read-only: nothing to force */
+       if (super->s_flags & MS_RDONLY)
+               return;
+
+       /* no error recorded: leave the mount writable */
+       if (!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR))
+               return;
+
+       super->s_flags |= MS_RDONLY;
+       printk(KERN_INFO "btrfs is forced readonly\n");
+}
+
+/*
+ * __btrfs_std_error decodes expected errors from the caller and
+ * invokes the appropriate error response.
+ *
+ * @fs_info:  filesystem the error occurred on
+ * @function: name of the reporting function, printed in the log message
+ * @line:     source line of the report, printed in the log message
+ * @errno:    negative errno value describing the failure
+ *
+ * The error is logged, recorded via save_error_info() and then handled by
+ * btrfs_handle_error().
+ */
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+                    unsigned int line, int errno)
+{
+       struct super_block *sb = fs_info->sb;
+       char nbuf[16];
+       const char *errstr;
+
+       /*
+        * Special case: if the error is EROFS, and we're already
+        * under MS_RDONLY, then it is safe here.
+        */
+       if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+               return;
+
+       errstr = btrfs_decode_error(fs_info, errno, nbuf);
+       /* line is unsigned int, so it is printed with %u */
+       printk(KERN_CRIT "BTRFS error (device %s) in %s:%u: %s\n",
+               sb->s_id, function, line, errstr);
+       save_error_info(fs_info);
+
+       btrfs_handle_error(fs_info);
+}
+
 static void btrfs_put_super(struct super_block *sb)
 {
        struct btrfs_root *root = btrfs_sb(sb);
@@ -69,9 +153,9 @@ enum {
        Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
        Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
        Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
-       Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
-       Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err,
-       Opt_user_subvol_rm_allowed,
+       Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
+       Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
+       Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -86,7 +170,9 @@ static match_table_t tokens = {
        {Opt_alloc_start, "alloc_start=%s"},
        {Opt_thread_pool, "thread_pool=%d"},
        {Opt_compress, "compress"},
+       {Opt_compress_type, "compress=%s"},
        {Opt_compress_force, "compress-force"},
+       {Opt_compress_force_type, "compress-force=%s"},
        {Opt_ssd, "ssd"},
        {Opt_ssd_spread, "ssd_spread"},
        {Opt_nossd, "nossd"},
@@ -112,6 +198,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
        char *p, *num, *orig;
        int intarg;
        int ret = 0;
+       char *compress_type;
+       bool compress_force = false;
 
        if (!options)
                return 0;
@@ -154,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        btrfs_set_opt(info->mount_opt, NODATACOW);
                        btrfs_set_opt(info->mount_opt, NODATASUM);
                        break;
-               case Opt_compress:
-                       printk(KERN_INFO "btrfs: use compression\n");
-                       btrfs_set_opt(info->mount_opt, COMPRESS);
-                       break;
                case Opt_compress_force:
-                       printk(KERN_INFO "btrfs: forcing compression\n");
-                       btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+               case Opt_compress_force_type:
+                       compress_force = true;
+               case Opt_compress:
+               case Opt_compress_type:
+                       if (token == Opt_compress ||
+                           token == Opt_compress_force ||
+                           strcmp(args[0].from, "zlib") == 0) {
+                               compress_type = "zlib";
+                               info->compress_type = BTRFS_COMPRESS_ZLIB;
+                       } else if (strcmp(args[0].from, "lzo") == 0) {
+                               compress_type = "lzo";
+                               info->compress_type = BTRFS_COMPRESS_LZO;
+                       } else {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
                        btrfs_set_opt(info->mount_opt, COMPRESS);
+                       if (compress_force) {
+                               btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
+                               pr_info("btrfs: force %s compression\n",
+                                       compress_type);
+                       } else
+                               pr_info("btrfs: use %s compression\n",
+                                       compress_type);
                        break;
                case Opt_ssd:
                        printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -753,6 +859,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        return 0;
 }
 
+/*
+ * The helper to calc the free space on the devices that can be used to store
+ * file data.
+ *
+ * On success 0 is returned and the result is stored in @free_bytes.
+ * Returns -ENOMEM if the temporary device array cannot be allocated, or the
+ * error from btrfs_account_dev_extents_size(); @free_bytes is not written
+ * in either failure case.
+ */
+static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_device_info *devices_info;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       u64 skip_space;
+       u64 type;
+       u64 avail_space;
+       u64 used_space;
+       u64 min_stripe_size;
+       int min_stripes = 1;
+       int i = 0, nr_devices;
+       int ret;
+
+       nr_devices = fs_info->fs_devices->rw_devices;
+       BUG_ON(!nr_devices);
+
+       devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+                              GFP_NOFS);
+       if (!devices_info)
+               return -ENOMEM;
+
+       /* calc min stripe number for data space allocation */
+       type = btrfs_get_alloc_profile(root, 1);
+       if (type & BTRFS_BLOCK_GROUP_RAID0)
+               min_stripes = 2;
+       else if (type & BTRFS_BLOCK_GROUP_RAID1)
+               min_stripes = 2;
+       else if (type & BTRFS_BLOCK_GROUP_RAID10)
+               min_stripes = 4;
+
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               min_stripe_size = 2 * BTRFS_STRIPE_LEN;
+       else
+               min_stripe_size = BTRFS_STRIPE_LEN;
+
+       list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+               if (!device->in_fs_metadata)
+                       continue;
+
+               avail_space = device->total_bytes - device->bytes_used;
+
+               /* align with stripe_len */
+               do_div(avail_space, BTRFS_STRIPE_LEN);
+               avail_space *= BTRFS_STRIPE_LEN;
+
+               /*
+                * In order to avoid overwriting the superblock on the drive,
+                * btrfs starts at an offset of at least 1MB when doing chunk
+                * allocation.
+                */
+               skip_space = 1024 * 1024;
+
+               /* user can set the offset in fs_info->alloc_start. */
+               if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+                   device->total_bytes)
+                       skip_space = max(fs_info->alloc_start, skip_space);
+
+               /*
+                * btrfs can not use the free space in [0, skip_space - 1],
+                * we must subtract it from the total. In order to implement
+                * it, we account the used space in this range first.
+                */
+               ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
+                                                    &used_space);
+               if (ret) {
+                       kfree(devices_info);
+                       return ret;
+               }
+
+               /* calc the free space in [0, skip_space - 1] */
+               skip_space -= used_space;
+
+               /*
+                * we can not use the free space in [0, skip_space - 1],
+                * subtract it from the total.
+                */
+               if (avail_space && avail_space >= skip_space)
+                       avail_space -= skip_space;
+               else
+                       avail_space = 0;
+
+               if (avail_space < min_stripe_size)
+                       continue;
+
+               /*
+                * NOTE(review): i is not checked against the nr_devices
+                * entries allocated above; this presumably relies on
+                * alloc_list holding at most rw_devices devices - confirm.
+                */
+               devices_info[i].dev = device;
+               devices_info[i].max_avail = avail_space;
+
+               i++;
+       }
+
+       /* from here on nr_devices counts only the devices collected above */
+       nr_devices = i;
+
+       btrfs_descending_sort_devices(devices_info, nr_devices);
+
+       /*
+        * Start at the last array entry (presumably the device with the
+        * least space after the sort) and drop one device per iteration
+        * while at least min_stripes devices remain.  When the current
+        * device still has min_stripe_size available, count its space once
+        * per stripe and subtract that amount from the min_stripes entries
+        * ending at index i.
+        */
+       i = nr_devices - 1;
+       avail_space = 0;
+       while (nr_devices >= min_stripes) {
+               if (devices_info[i].max_avail >= min_stripe_size) {
+                       int j;
+                       u64 alloc_size;
+
+                       avail_space += devices_info[i].max_avail * min_stripes;
+                       alloc_size = devices_info[i].max_avail;
+                       for (j = i + 1 - min_stripes; j <= i; j++)
+                               devices_info[j].max_avail -= alloc_size;
+               }
+               i--;
+               nr_devices--;
+       }
+
+       kfree(devices_info);
+       *free_bytes = avail_space;
+       return 0;
+}
+
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -760,17 +987,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
-       u64 total_used_data = 0;
+       u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
+       int ret;
 
+       /* holding chunk_mutex to avoid allocating new chunks */
+       mutex_lock(&root->fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
-               if (found->flags & (BTRFS_BLOCK_GROUP_METADATA |
-                                   BTRFS_BLOCK_GROUP_SYSTEM))
-                       total_used_data += found->disk_total;
-               else
-                       total_used_data += found->disk_used;
+               if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
+                       total_free_data += found->disk_total - found->disk_used;
+                       total_free_data -=
+                               btrfs_account_ro_block_groups_free_space(found);
+               }
+
                total_used += found->disk_used;
        }
        rcu_read_unlock();
@@ -778,9 +1009,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-       buf->f_bavail = buf->f_blocks - (total_used_data >> bits);
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
+       buf->f_bavail = total_free_data;
+       ret = btrfs_calc_avail_data_space(root, &total_free_data);
+       if (ret) {
+               mutex_unlock(&root->fs_info->chunk_mutex);
+               return ret;
+       }
+       buf->f_bavail += total_free_data;
+       buf->f_bavail = buf->f_bavail >> bits;
+       mutex_unlock(&root->fs_info->chunk_mutex);
 
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
@@ -897,10 +1136,14 @@ static int __init init_btrfs_fs(void)
        if (err)
                return err;
 
-       err = btrfs_init_cachep();
+       err = btrfs_init_compress();
        if (err)
                goto free_sysfs;
 
+       err = btrfs_init_cachep();
+       if (err)
+               goto free_compress;
+
        err = extent_io_init();
        if (err)
                goto free_cachep;
@@ -928,6 +1171,8 @@ free_extent_io:
        extent_io_exit();
 free_cachep:
        btrfs_destroy_cachep();
+free_compress:
+       btrfs_exit_compress();
 free_sysfs:
        btrfs_exit_sysfs();
        return err;
@@ -942,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
        unregister_filesystem(&btrfs_fs_type);
        btrfs_exit_sysfs();
        btrfs_cleanup_fs_uuids();
-       btrfs_zlib_exit();
+       btrfs_exit_compress();
 }
 
 module_init(init_btrfs_fs)
index f50e931..bae5c7b 100644 (file)
@@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
        int ret;
+
+       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+               return ERR_PTR(-EROFS);
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
@@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        u64 to_reserve = 0;
        u64 index = 0;
        u64 objectid;
+       u64 root_flags;
 
        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
@@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
+       root_flags = btrfs_root_flags(new_root_item);
+       if (pending->readonly)
+               root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
+       else
+               root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
+       btrfs_set_root_flags(new_root_item, root_flags);
+
        old = btrfs_lock_root_node(root);
        btrfs_cow_block(trans, root, old, NULL, 0, &old);
        btrfs_set_lock_blocking(old);
index f104b57..229a594 100644 (file)
@@ -62,6 +62,7 @@ struct btrfs_pending_snapshot {
        struct btrfs_block_rsv block_rsv;
        /* extra metadata reseration for relocation */
        int error;
+       bool readonly;
        struct list_head list;
 };
 
index 1718e1a..d158530 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/random.h>
 #include <linux/iocontext.h>
+#include <linux/capability.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -600,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                set_blocksize(bdev, 4096);
 
                bh = btrfs_read_dev_super(bdev);
-               if (!bh)
+               if (!bh) {
+                       ret = -EINVAL;
                        goto error_close;
+               }
 
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -703,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
-               ret = -EIO;
+               ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -729,59 +732,167 @@ error:
        return ret;
 }
 
+/*
+ * helper to account the used device space in the range
+ *
+ * Sums, into @length, the bytes of the inclusive range [@start, @end] that
+ * are covered by dev extent items of @device.  @length is reset to 0 first
+ * and stays 0 when @start lies at or beyond device->total_bytes.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the negative
+ * error reported by the tree search helpers.
+ */
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+                                  u64 end, u64 *length)
+{
+       struct btrfs_key key;
+       struct btrfs_root *root = device->dev_root;
+       struct btrfs_dev_extent *dev_extent;
+       struct btrfs_path *path;
+       u64 extent_end;
+       int ret;
+       int slot;
+       struct extent_buffer *l;
+
+       *length = 0;
+
+       if (start >= device->total_bytes)
+               return 0;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       path->reada = 2;
+
+       key.objectid = device->devid;
+       key.offset = start;
+       key.type = BTRFS_DEV_EXTENT_KEY;
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       if (ret > 0) {
+               /*
+                * no exact match: step back one item, since an extent that
+                * begins before start may still reach into the range
+                */
+               ret = btrfs_previous_item(root, path, key.objectid, key.type);
+               if (ret < 0)
+                       goto out;
+       }
+
+       while (1) {
+               l = path->nodes[0];
+               slot = path->slots[0];
+               if (slot >= btrfs_header_nritems(l)) {
+                       /* ran off this leaf: move on, or stop if none left */
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret == 0)
+                               continue;
+                       if (ret < 0)
+                               goto out;
+
+                       break;
+               }
+               btrfs_item_key_to_cpu(l, &key, slot);
+
+               if (key.objectid < device->devid)
+                       goto next;
+
+               if (key.objectid > device->devid)
+                       break;
+
+               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                       goto next;
+
+               dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+               /* extent_end is exclusive: first byte after the extent */
+               extent_end = key.offset + btrfs_dev_extent_length(l,
+                                                                 dev_extent);
+               if (key.offset <= start && extent_end > end) {
+                       /* extent covers the whole range */
+                       *length = end - start + 1;
+                       break;
+               } else if (key.offset <= start && extent_end > start)
+                       /* extent overlaps the head of the range */
+                       *length += extent_end - start;
+               else if (key.offset > start && extent_end <= end)
+                       /* extent lies inside the range */
+                       *length += extent_end - key.offset;
+               else if (key.offset > start && key.offset <= end) {
+                       /* extent begins inside the range, runs up to/past end */
+                       *length += end - key.offset + 1;
+                       break;
+               } else if (key.offset > end)
+                       /* extent begins after the range: nothing more to add */
+                       break;
+
+next:
+               path->slots[0]++;
+       }
+       ret = 0;
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
 /*
+ * find_free_dev_extent - find free space in the specified device
+ * @trans:     transaction handler
+ * @device:    the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @start:     store the start of the free space.
+ * @len:       the size of the free space that we find, or the size of the max
+ *             free space if we don't find suitable free space
+ *
  * this uses a pretty simple search, the expectation is that it is
  * called very infrequently and that a given device has a small number
  * of extents
+ *
+ * @start is used to store the start of the free space if we find. But if we
+ * don't find suitable free space, it will be used to store the start position
+ * of the max free space.
+ *
+ * @len is used to store the size of the free space that we find.
+ * But if we don't find suitable free space, it is used to store the size of
+ * the max free space.
  */
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
-                        u64 *start, u64 *max_avail)
+                        u64 *start, u64 *len)
 {
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
-       struct btrfs_dev_extent *dev_extent = NULL;
+       struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
-       u64 hole_size = 0;
-       u64 last_byte = 0;
-       u64 search_start = 0;
+       u64 hole_size;
+       u64 max_hole_start;
+       u64 max_hole_size;
+       u64 extent_end;
+       u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
-       int slot = 0;
-       int start_found;
+       int slot;
        struct extent_buffer *l;
 
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       path->reada = 2;
-       start_found = 0;
-
        /* FIXME use last free of some kind */
 
        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
-       search_start = max((u64)1024 * 1024, search_start);
+       search_start = 1024 * 1024;
 
-       if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+       if (root->fs_info->alloc_start + num_bytes <= search_end)
                search_start = max(root->fs_info->alloc_start, search_start);
 
+       max_hole_start = search_start;
+       max_hole_size = 0;
+
+       if (search_start >= search_end) {
+               ret = -ENOSPC;
+               goto error;
+       }
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto error;
+       }
+       path->reada = 2;
+
        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
+
        ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
        if (ret < 0)
-               goto error;
+               goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
-                       goto error;
-               if (ret > 0)
-                       start_found = 1;
+                       goto out;
        }
-       l = path->nodes[0];
-       btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
@@ -790,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
                        if (ret == 0)
                                continue;
                        if (ret < 0)
-                               goto error;
-no_more_items:
-                       if (!start_found) {
-                               if (search_start >= search_end) {
-                                       ret = -ENOSPC;
-                                       goto error;
-                               }
-                               *start = search_start;
-                               start_found = 1;
-                               goto check_pending;
-                       }
-                       *start = last_byte > search_start ?
-                               last_byte : search_start;
-                       if (search_end <= *start) {
-                               ret = -ENOSPC;
-                               goto error;
-                       }
-                       goto check_pending;
+                               goto out;
+
+                       break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);
 
@@ -815,48 +911,62 @@ no_more_items:
                        goto next;
 
                if (key.objectid > device->devid)
-                       goto no_more_items;
+                       break;
 
-               if (key.offset >= search_start && key.offset > last_byte &&
-                   start_found) {
-                       if (last_byte < search_start)
-                               last_byte = search_start;
-                       hole_size = key.offset - last_byte;
+               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+                       goto next;
 
-                       if (hole_size > *max_avail)
-                               *max_avail = hole_size;
+               if (key.offset > search_start) {
+                       hole_size = key.offset - search_start;
 
-                       if (key.offset > last_byte &&
-                           hole_size >= num_bytes) {
-                               *start = last_byte;
-                               goto check_pending;
+                       if (hole_size > max_hole_size) {
+                               max_hole_start = search_start;
+                               max_hole_size = hole_size;
+                       }
+
+                       /*
+                        * If this free space is greater than which we need,
+                        * it must be the max free space that we have found
+                        * until now, so max_hole_start must point to the start
+                        * of this free space and the length of this free space
+                        * is stored in max_hole_size. Thus, we return
+                        * max_hole_start and max_hole_size and go back to the
+                        * caller.
+                        */
+                       if (hole_size >= num_bytes) {
+                               ret = 0;
+                               goto out;
                        }
                }
-               if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
-                       goto next;
 
-               start_found = 1;
                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-               last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+               extent_end = key.offset + btrfs_dev_extent_length(l,
+                                                                 dev_extent);
+               if (extent_end > search_start)
+                       search_start = extent_end;
 next:
                path->slots[0]++;
                cond_resched();
        }
-check_pending:
-       /* we have to make sure we didn't find an extent that has already
-        * been allocated by the map tree or the original allocation
-        */
-       BUG_ON(*start < search_start);
 
-       if (*start + num_bytes > search_end) {
-               ret = -ENOSPC;
-               goto error;
+       hole_size = search_end- search_start;
+       if (hole_size > max_hole_size) {
+               max_hole_start = search_start;
+               max_hole_size = hole_size;
        }
-       /* check for pending inserts here */
-       ret = 0;
 
-error:
+       /* See above. */
+       if (hole_size < num_bytes)
+               ret = -ENOSPC;
+       else
+               ret = 0;
+
+out:
        btrfs_free_path(path);
+error:
+       *start = max_hole_start;
+       if (len)
+               *len = max_hole_size;
        return ret;
 }
 
@@ -1196,7 +1306,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                set_blocksize(bdev, 4096);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
-                       ret = -EIO;
+                       ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1916,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
        if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
        mutex_lock(&dev_root->fs_info->volume_mutex);
        dev_root = dev_root->fs_info->dev_root;
 
@@ -2154,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
                return calc_size * num_stripes;
 }
 
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root,
-                              struct map_lookup **map_ret,
-                              u64 *num_bytes, u64 *stripe_size,
-                              u64 start, u64 type)
+/* Used to sort the devices by max_avail (descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
 {
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct btrfs_device *device = NULL;
-       struct btrfs_fs_devices *fs_devices = info->fs_devices;
-       struct list_head *cur;
-       struct map_lookup *map = NULL;
-       struct extent_map_tree *em_tree;
-       struct extent_map *em;
-       struct list_head private_devs;
-       int min_stripe_size = 1 * 1024 * 1024;
-       u64 calc_size = 1024 * 1024 * 1024;
-       u64 max_chunk_size = calc_size;
-       u64 min_free;
-       u64 avail;
-       u64 max_avail = 0;
-       u64 dev_offset;
-       int num_stripes = 1;
-       int min_stripes = 1;
-       int sub_stripes = 0;
-       int looped = 0;
-       int ret;
-       int index;
-       int stripe_len = 64 * 1024;
+       if (((struct btrfs_device_info *)dev_info1)->max_avail >
+           ((struct btrfs_device_info *)dev_info2)->max_avail)
+               return -1;
+       else if (((struct btrfs_device_info *)dev_info1)->max_avail <
+                ((struct btrfs_device_info *)dev_info2)->max_avail)
+               return 1;
+       else
+               return 0;
+}
 
-       if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
-           (type & BTRFS_BLOCK_GROUP_DUP)) {
-               WARN_ON(1);
-               type &= ~BTRFS_BLOCK_GROUP_DUP;
-       }
-       if (list_empty(&fs_devices->alloc_list))
-               return -ENOSPC;
+static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
+                                int *num_stripes, int *min_stripes,
+                                int *sub_stripes)
+{
+       *num_stripes = 1;
+       *min_stripes = 1;
+       *sub_stripes = 0;
 
        if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-               num_stripes = fs_devices->rw_devices;
-               min_stripes = 2;
+               *num_stripes = fs_devices->rw_devices;
+               *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-               num_stripes = 2;
-               min_stripes = 2;
+               *num_stripes = 2;
+               *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
                if (fs_devices->rw_devices < 2)
                        return -ENOSPC;
-               num_stripes = 2;
-               min_stripes = 2;
+               *num_stripes = 2;
+               *min_stripes = 2;
        }
        if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-               num_stripes = fs_devices->rw_devices;
-               if (num_stripes < 4)
+               *num_stripes = fs_devices->rw_devices;
+               if (*num_stripes < 4)
                        return -ENOSPC;
-               num_stripes &= ~(u32)1;
-               sub_stripes = 2;
-               min_stripes = 4;
+               *num_stripes &= ~(u32)1;
+               *sub_stripes = 2;
+               *min_stripes = 4;
        }
 
+       return 0;
+}
+
+static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
+                                   u64 proposed_size, u64 type,
+                                   int num_stripes, int small_stripe)
+{
+       int min_stripe_size = 1 * 1024 * 1024;
+       u64 calc_size = proposed_size;
+       u64 max_chunk_size = calc_size;
+       int ncopies = 1;
+
+       if (type & (BTRFS_BLOCK_GROUP_RAID1 |
+                   BTRFS_BLOCK_GROUP_DUP |
+                   BTRFS_BLOCK_GROUP_RAID10))
+               ncopies = 2;
+
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_chunk_size = 10 * calc_size;
                min_stripe_size = 64 * 1024 * 1024;
@@ -2230,51 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
                             max_chunk_size);
 
-again:
-       max_avail = 0;
-       if (!map || map->num_stripes != num_stripes) {
-               kfree(map);
-               map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-               if (!map)
-                       return -ENOMEM;
-               map->num_stripes = num_stripes;
-       }
-
-       if (calc_size * num_stripes > max_chunk_size) {
-               calc_size = max_chunk_size;
+       if (calc_size * num_stripes > max_chunk_size * ncopies) {
+               calc_size = max_chunk_size * ncopies;
                do_div(calc_size, num_stripes);
-               do_div(calc_size, stripe_len);
-               calc_size *= stripe_len;
+               do_div(calc_size, BTRFS_STRIPE_LEN);
+               calc_size *= BTRFS_STRIPE_LEN;
        }
 
        /* we don't want tiny stripes */
-       if (!looped)
+       if (!small_stripe)
                calc_size = max_t(u64, min_stripe_size, calc_size);
 
        /*
-        * we're about to do_div by the stripe_len so lets make sure
+        * we're about to do_div by the BTRFS_STRIPE_LEN so let's make sure
         * we end up with something bigger than a stripe
         */
-       calc_size = max_t(u64, calc_size, stripe_len * 4);
+       calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+
+       do_div(calc_size, BTRFS_STRIPE_LEN);
+       calc_size *= BTRFS_STRIPE_LEN;
+
+       return calc_size;
+}
+
+static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
+                                                     int num_stripes)
+{
+       struct map_lookup *new;
+       size_t len = map_lookup_size(num_stripes);
+
+       BUG_ON(map->num_stripes < num_stripes);
+
+       if (map->num_stripes == num_stripes)
+               return map;
+
+       new = kmalloc(len, GFP_NOFS);
+       if (!new) {
+               /* just change map->num_stripes */
+               map->num_stripes = num_stripes;
+               return map;
+       }
+
+       memcpy(new, map, len);
+       new->num_stripes = num_stripes;
+       kfree(map);
+       return new;
+}
+
+/*
+ * helper to allocate device space from btrfs_device_info, in which we stored
+ * max free space information of every device. It is used when we can not
+ * allocate chunks by default size.
+ *
+ * By this helper, we can allocate a new chunk as large as possible.
+ */
+static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
+                                   struct btrfs_fs_devices *fs_devices,
+                                   struct btrfs_device_info *devices,
+                                   int nr_device, u64 type,
+                                   struct map_lookup **map_lookup,
+                                   int min_stripes, u64 *stripe_size)
+{
+       int i, index, sort_again = 0;
+       int min_devices = min_stripes;
+       u64 max_avail, min_free;
+       struct map_lookup *map = *map_lookup;
+       int ret;
+
+       if (nr_device < min_stripes)
+               return -ENOSPC;
+
+       btrfs_descending_sort_devices(devices, nr_device);
+
+       max_avail = devices[0].max_avail;
+       if (!max_avail)
+               return -ENOSPC;
+
+       for (i = 0; i < nr_device; i++) {
+               /*
+                * if dev_offset = 0, it means the free space of this device
+                * is less than what we need, and we didn't search max avail
+                * extent on this device, so do it now.
+                */
+               if (!devices[i].dev_offset) {
+                       ret = find_free_dev_extent(trans, devices[i].dev,
+                                                  max_avail,
+                                                  &devices[i].dev_offset,
+                                                  &devices[i].max_avail);
+                       if (ret != 0 && ret != -ENOSPC)
+                               return ret;
+                       sort_again = 1;
+               }
+       }
+
+       /* we updated the max avail free extent of each device, so sort again */
+       if (sort_again)
+               btrfs_descending_sort_devices(devices, nr_device);
+
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               min_devices = 1;
+
+       if (!devices[min_devices - 1].max_avail)
+               return -ENOSPC;
+
+       max_avail = devices[min_devices - 1].max_avail;
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               do_div(max_avail, 2);
+
+       max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
+                                            min_stripes, 1);
+       if (type & BTRFS_BLOCK_GROUP_DUP)
+               min_free = max_avail * 2;
+       else
+               min_free = max_avail;
+
+       if (min_free > devices[min_devices - 1].max_avail)
+               return -ENOSPC;
+
+       map = __shrink_map_lookup_stripes(map, min_stripes);
+       *stripe_size = max_avail;
+
+       index = 0;
+       for (i = 0; i < min_stripes; i++) {
+               map->stripes[i].dev = devices[index].dev;
+               map->stripes[i].physical = devices[index].dev_offset;
+               if (type & BTRFS_BLOCK_GROUP_DUP) {
+                       i++;
+                       map->stripes[i].dev = devices[index].dev;
+                       map->stripes[i].physical = devices[index].dev_offset +
+                                                  max_avail;
+               }
+               index++;
+       }
+       *map_lookup = map;
+
+       return 0;
+}
 
-       do_div(calc_size, stripe_len);
-       calc_size *= stripe_len;
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *extent_root,
+                              struct map_lookup **map_ret,
+                              u64 *num_bytes, u64 *stripe_size,
+                              u64 start, u64 type)
+{
+       struct btrfs_fs_info *info = extent_root->fs_info;
+       struct btrfs_device *device = NULL;
+       struct btrfs_fs_devices *fs_devices = info->fs_devices;
+       struct list_head *cur;
+       struct map_lookup *map;
+       struct extent_map_tree *em_tree;
+       struct extent_map *em;
+       struct btrfs_device_info *devices_info;
+       struct list_head private_devs;
+       u64 calc_size = 1024 * 1024 * 1024;
+       u64 min_free;
+       u64 avail;
+       u64 dev_offset;
+       int num_stripes;
+       int min_stripes;
+       int sub_stripes;
+       int min_devices;        /* the min number of devices we need */
+       int i;
+       int ret;
+       int index;
+
+       if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+           (type & BTRFS_BLOCK_GROUP_DUP)) {
+               WARN_ON(1);
+               type &= ~BTRFS_BLOCK_GROUP_DUP;
+       }
+       if (list_empty(&fs_devices->alloc_list))
+               return -ENOSPC;
+
+       ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
+                                   &min_stripes, &sub_stripes);
+       if (ret)
+               return ret;
+
+       devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+                              GFP_NOFS);
+       if (!devices_info)
+               return -ENOMEM;
+
+       map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+       if (!map) {
+               ret = -ENOMEM;
+               goto error;
+       }
+       map->num_stripes = num_stripes;
 
        cur = fs_devices->alloc_list.next;
        index = 0;
+       i = 0;
 
-       if (type & BTRFS_BLOCK_GROUP_DUP)
+       calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
+                                            num_stripes, 0);
+
+       if (type & BTRFS_BLOCK_GROUP_DUP) {
                min_free = calc_size * 2;
-       else
+               min_devices = 1;
+       } else {
                min_free = calc_size;
-
-       /*
-        * we add 1MB because we never use the first 1MB of the device, unless
-        * we've looped, then we are likely allocating the maximum amount of
-        * space left already
-        */
-       if (!looped)
-               min_free += 1024 * 1024;
+               min_devices = min_stripes;
+       }
 
        INIT_LIST_HEAD(&private_devs);
        while (index < num_stripes) {
@@ -2287,27 +2559,39 @@ again:
                cur = cur->next;
 
                if (device->in_fs_metadata && avail >= min_free) {
-                       ret = find_free_dev_extent(trans, device,
-                                                  min_free, &dev_offset,
-                                                  &max_avail);
+                       ret = find_free_dev_extent(trans, device, min_free,
+                                                  &devices_info[i].dev_offset,
+                                                  &devices_info[i].max_avail);
                        if (ret == 0) {
                                list_move_tail(&device->dev_alloc_list,
                                               &private_devs);
                                map->stripes[index].dev = device;
-                               map->stripes[index].physical = dev_offset;
+                               map->stripes[index].physical =
+                                               devices_info[i].dev_offset;
                                index++;
                                if (type & BTRFS_BLOCK_GROUP_DUP) {
                                        map->stripes[index].dev = device;
                                        map->stripes[index].physical =
-                                               dev_offset + calc_size;
+                                               devices_info[i].dev_offset +
+                                               calc_size;
                                        index++;
                                }
-                       }
-               } else if (device->in_fs_metadata && avail > max_avail)
-                       max_avail = avail;
+                       } else if (ret != -ENOSPC)
+                               goto error;
+
+                       devices_info[i].dev = device;
+                       i++;
+               } else if (device->in_fs_metadata &&
+                          avail >= BTRFS_STRIPE_LEN) {
+                       devices_info[i].dev = device;
+                       devices_info[i].max_avail = avail;
+                       i++;
+               }
+
                if (cur == &fs_devices->alloc_list)
                        break;
        }
+
        list_splice(&private_devs, &fs_devices->alloc_list);
        if (index < num_stripes) {
                if (index >= min_stripes) {
@@ -2316,34 +2600,36 @@ again:
                                num_stripes /= sub_stripes;
                                num_stripes *= sub_stripes;
                        }
-                       looped = 1;
-                       goto again;
-               }
-               if (!looped && max_avail > 0) {
-                       looped = 1;
-                       calc_size = max_avail;
-                       goto again;
+
+                       map = __shrink_map_lookup_stripes(map, num_stripes);
+               } else if (i >= min_devices) {
+                       ret = __btrfs_alloc_tiny_space(trans, fs_devices,
+                                                      devices_info, i, type,
+                                                      &map, min_stripes,
+                                                      &calc_size);
+                       if (ret)
+                               goto error;
+               } else {
+                       ret = -ENOSPC;
+                       goto error;
                }
-               kfree(map);
-               return -ENOSPC;
        }
        map->sector_size = extent_root->sectorsize;
-       map->stripe_len = stripe_len;
-       map->io_align = stripe_len;
-       map->io_width = stripe_len;
+       map->stripe_len = BTRFS_STRIPE_LEN;
+       map->io_align = BTRFS_STRIPE_LEN;
+       map->io_width = BTRFS_STRIPE_LEN;
        map->type = type;
-       map->num_stripes = num_stripes;
        map->sub_stripes = sub_stripes;
 
        *map_ret = map;
        *stripe_size = calc_size;
        *num_bytes = chunk_bytes_by_type(type, calc_size,
-                                        num_stripes, sub_stripes);
+                                        map->num_stripes, sub_stripes);
 
        em = alloc_extent_map(GFP_NOFS);
        if (!em) {
-               kfree(map);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto error;
        }
        em->bdev = (struct block_device *)map;
        em->start = start;
@@ -2376,7 +2662,13 @@ again:
                index++;
        }
 
+       kfree(devices_info);
        return 0;
+
+error:
+       kfree(map);
+       kfree(devices_info);
+       return ret;
 }
 
 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
index 1be7810..7fb59d4 100644 (file)
 #define __BTRFS_VOLUMES_
 
 #include <linux/bio.h>
+#include <linux/sort.h>
 #include "async-thread.h"
 
+#define BTRFS_STRIPE_LEN       (64 * 1024)
+
 struct buffer_head;
 struct btrfs_pending_bios {
        struct bio *head;
@@ -136,6 +139,30 @@ struct btrfs_multi_bio {
        struct btrfs_bio_stripe stripes[];
 };
 
+struct btrfs_device_info {
+       struct btrfs_device *dev;
+       u64 dev_offset;
+       u64 max_avail;
+};
+
+/* Used to sort the devices by max_avail (descending sort) */
+int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+
+/*
+ * sort the devices by max_avail, in which the max free extent size of each
+ * device is stored (descending sort).
+ */
+static inline void btrfs_descending_sort_devices(
+                                       struct btrfs_device_info *devices,
+                                       size_t nr_devices)
+{
+       sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+            btrfs_cmp_device_free_bytes, NULL);
+}
+
+int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
+                                  u64 end, u64 *length);
+
 #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))
 
index 698fdd2..a577653 100644 (file)
@@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
 int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                   size_t size, int flags)
 {
+       struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+
+       /*
+        * The permission on security.* and system.* is not checked
+        * in permission().
+        */
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
        /*
         * If this is a request for a synthetic attribute in the system.*
         * namespace use the generic infrastructure to resolve a handler
@@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 int btrfs_removexattr(struct dentry *dentry, const char *name)
 {
+       struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+
+       /*
+        * The permission on security.* and system.* is not checked
+        * in permission().
+        */
+       if (btrfs_root_readonly(root))
+               return -EROFS;
+
        /*
         * If this is a request for a synthetic attribute in the system.*
         * namespace use the generic infrastructure to resolve a handler
index b9cd544..f5ec2d4 100644 (file)
 #include <linux/bio.h>
 #include "compression.h"
 
-/* Plan: call deflate() with avail_in == *sourcelen,
-       avail_out = *dstlen - 12 and flush == Z_FINISH.
-       If it doesn't manage to finish, call it again with
-       avail_in == 0 and avail_out set to the remaining 12
-       bytes for it to clean up.
-   Q: Is 12 bytes sufficient?
-*/
-#define STREAM_END_SPACE 12
-
 struct workspace {
        z_stream inf_strm;
        z_stream def_strm;
@@ -48,152 +39,51 @@ struct workspace {
        struct list_head list;
 };
 
-static LIST_HEAD(idle_workspace);
-static DEFINE_SPINLOCK(workspace_lock);
-static unsigned long num_workspace;
-static atomic_t alloc_workspace = ATOMIC_INIT(0);
-static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
+static void zlib_free_workspace(struct list_head *ws)
+{
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
 
-/*
- * this finds an available zlib workspace or allocates a new one
- * NULL or an ERR_PTR is returned if things go bad.
- */
-static struct workspace *find_zlib_workspace(void)
+       vfree(workspace->def_strm.workspace);
+       vfree(workspace->inf_strm.workspace);
+       kfree(workspace->buf);
+       kfree(workspace);
+}
+
+static struct list_head *zlib_alloc_workspace(void)
 {
        struct workspace *workspace;
-       int ret;
-       int cpus = num_online_cpus();
-
-again:
-       spin_lock(&workspace_lock);
-       if (!list_empty(&idle_workspace)) {
-               workspace = list_entry(idle_workspace.next, struct workspace,
-                                      list);
-               list_del(&workspace->list);
-               num_workspace--;
-               spin_unlock(&workspace_lock);
-               return workspace;
 
-       }
-       spin_unlock(&workspace_lock);
-       if (atomic_read(&alloc_workspace) > cpus) {
-               DEFINE_WAIT(wait);
-               prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-               if (atomic_read(&alloc_workspace) > cpus)
-                       schedule();
-               finish_wait(&workspace_wait, &wait);
-               goto again;
-       }
-       atomic_inc(&alloc_workspace);
        workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
-       if (!workspace) {
-               ret = -ENOMEM;
-               goto fail;
-       }
+       if (!workspace)
+               return ERR_PTR(-ENOMEM);
 
        workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
-       if (!workspace->def_strm.workspace) {
-               ret = -ENOMEM;
-               goto fail;
-       }
        workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
-       if (!workspace->inf_strm.workspace) {
-               ret = -ENOMEM;
-               goto fail_inflate;
-       }
        workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
-       if (!workspace->buf) {
-               ret = -ENOMEM;
-               goto fail_kmalloc;
-       }
-       return workspace;
-
-fail_kmalloc:
-       vfree(workspace->inf_strm.workspace);
-fail_inflate:
-       vfree(workspace->def_strm.workspace);
-fail:
-       kfree(workspace);
-       atomic_dec(&alloc_workspace);
-       wake_up(&workspace_wait);
-       return ERR_PTR(ret);
-}
-
-/*
- * put a workspace struct back on the list or free it if we have enough
- * idle ones sitting around
- */
-static int free_workspace(struct workspace *workspace)
-{
-       spin_lock(&workspace_lock);
-       if (num_workspace < num_online_cpus()) {
-               list_add_tail(&workspace->list, &idle_workspace);
-               num_workspace++;
-               spin_unlock(&workspace_lock);
-               if (waitqueue_active(&workspace_wait))
-                       wake_up(&workspace_wait);
-               return 0;
-       }
-       spin_unlock(&workspace_lock);
-       vfree(workspace->def_strm.workspace);
-       vfree(workspace->inf_strm.workspace);
-       kfree(workspace->buf);
-       kfree(workspace);
+       if (!workspace->def_strm.workspace ||
+           !workspace->inf_strm.workspace || !workspace->buf)
+               goto fail;
 
-       atomic_dec(&alloc_workspace);
-       if (waitqueue_active(&workspace_wait))
-               wake_up(&workspace_wait);
-       return 0;
-}
+       INIT_LIST_HEAD(&workspace->list);
 
-/*
- * cleanup function for module exit
- */
-static void free_workspaces(void)
-{
-       struct workspace *workspace;
-       while (!list_empty(&idle_workspace)) {
-               workspace = list_entry(idle_workspace.next, struct workspace,
-                                      list);
-               list_del(&workspace->list);
-               vfree(workspace->def_strm.workspace);
-               vfree(workspace->inf_strm.workspace);
-               kfree(workspace->buf);
-               kfree(workspace);
-               atomic_dec(&alloc_workspace);
-       }
+       return &workspace->list;
+fail:
+       zlib_free_workspace(&workspace->list);
+       return ERR_PTR(-ENOMEM);
 }
 
-/*
- * given an address space and start/len, compress the bytes.
- *
- * pages are allocated to hold the compressed result and stored
- * in 'pages'
- *
- * out_pages is used to return the number of pages allocated.  There
- * may be pages allocated even if we return an error
- *
- * total_in is used to return the number of bytes actually read.  It
- * may be smaller then len if we had to exit early because we
- * ran out of room in the pages array or because we cross the
- * max_out threshold.
- *
- * total_out is used to return the total number of compressed bytes
- *
- * max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
- */
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                             u64 start, unsigned long len,
-                             struct page **pages,
-                             unsigned long nr_dest_pages,
-                             unsigned long *out_pages,
-                             unsigned long *total_in,
-                             unsigned long *total_out,
-                             unsigned long max_out)
+static int zlib_compress_pages(struct list_head *ws,
+                              struct address_space *mapping,
+                              u64 start, unsigned long len,
+                              struct page **pages,
+                              unsigned long nr_dest_pages,
+                              unsigned long *out_pages,
+                              unsigned long *total_in,
+                              unsigned long *total_out,
+                              unsigned long max_out)
 {
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
        int ret;
-       struct workspace *workspace;
        char *data_in;
        char *cpage_out;
        int nr_pages = 0;
@@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        *total_out = 0;
        *total_in = 0;
 
-       workspace = find_zlib_workspace();
-       if (IS_ERR(workspace))
-               return -1;
-
        if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
                printk(KERN_WARNING "deflateInit failed\n");
                ret = -1;
@@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        data_in = kmap(in_page);
 
        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (out_page == NULL) {
+               ret = -1;
+               goto out;
+       }
        cpage_out = kmap(out_page);
        pages[0] = out_page;
        nr_pages = 1;
@@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
                                goto out;
                        }
                        out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+                       if (out_page == NULL) {
+                               ret = -1;
+                               goto out;
+                       }
                        cpage_out = kmap(out_page);
                        pages[nr_pages] = out_page;
                        nr_pages++;
@@ -314,55 +208,26 @@ out:
                kunmap(in_page);
                page_cache_release(in_page);
        }
-       free_workspace(workspace);
        return ret;
 }
 
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * bvec is a bio_vec of pages from the file that we want to decompress into
- *
- * vcnt is the count of pages in the biovec
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous.  They all correspond to the range of bytes covered by
- * the compressed extent.
- */
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                             u64 disk_start,
-                             struct bio_vec *bvec,
-                             int vcnt,
-                             size_t srclen)
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+                                 u64 disk_start,
+                                 struct bio_vec *bvec,
+                                 int vcnt,
+                                 size_t srclen)
 {
-       int ret = 0;
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
+       int ret = 0, ret2;
        int wbits = MAX_WBITS;
-       struct workspace *workspace;
        char *data_in;
        size_t total_out = 0;
-       unsigned long page_bytes_left;
        unsigned long page_in_index = 0;
        unsigned long page_out_index = 0;
-       struct page *page_out;
        unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
                                        PAGE_CACHE_SIZE;
        unsigned long buf_start;
-       unsigned long buf_offset;
-       unsigned long bytes;
-       unsigned long working_bytes;
        unsigned long pg_offset;
-       unsigned long start_byte;
-       unsigned long current_buf_start;
-       char *kaddr;
-
-       workspace = find_zlib_workspace();
-       if (IS_ERR(workspace))
-               return -ENOMEM;
 
        data_in = kmap(pages_in[page_in_index]);
        workspace->inf_strm.next_in = data_in;
@@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
        workspace->inf_strm.total_out = 0;
        workspace->inf_strm.next_out = workspace->buf;
        workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
-       page_out = bvec[page_out_index].bv_page;
-       page_bytes_left = PAGE_CACHE_SIZE;
        pg_offset = 0;
 
        /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 
        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
                printk(KERN_WARNING "inflateInit failed\n");
-               ret = -1;
-               goto out;
+               return -1;
        }
        while (workspace->inf_strm.total_in < srclen) {
                ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
                if (ret != Z_OK && ret != Z_STREAM_END)
                        break;
-               /*
-                * buf start is the byte offset we're of the start of
-                * our workspace buffer
-                */
-               buf_start = total_out;
 
-               /* total_out is the last byte of the workspace buffer */
+               buf_start = total_out;
                total_out = workspace->inf_strm.total_out;
 
-               working_bytes = total_out - buf_start;
-
-               /*
-                * start byte is the first byte of the page we're currently
-                * copying into relative to the start of the compressed data.
-                */
-               start_byte = page_offset(page_out) - disk_start;
-
-               if (working_bytes == 0) {
-                       /* we didn't make progress in this inflate
-                        * call, we're done
-                        */
-                       if (ret != Z_STREAM_END)
-                               ret = -1;
+               /* we didn't make progress in this inflate call, we're done */
+               if (buf_start == total_out)
                        break;
-               }
 
-               /* we haven't yet hit data corresponding to this page */
-               if (total_out <= start_byte)
-                       goto next;
-
-               /*
-                * the start of the data we care about is offset into
-                * the middle of our working buffer
-                */
-               if (total_out > start_byte && buf_start < start_byte) {
-                       buf_offset = start_byte - buf_start;
-                       working_bytes -= buf_offset;
-               } else {
-                       buf_offset = 0;
-               }
-               current_buf_start = buf_start;
-
-               /* copy bytes from the working buffer into the pages */
-               while (working_bytes > 0) {
-                       bytes = min(PAGE_CACHE_SIZE - pg_offset,
-                                   PAGE_CACHE_SIZE - buf_offset);
-                       bytes = min(bytes, working_bytes);
-                       kaddr = kmap_atomic(page_out, KM_USER0);
-                       memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
-                              bytes);
-                       kunmap_atomic(kaddr, KM_USER0);
-                       flush_dcache_page(page_out);
-
-                       pg_offset += bytes;
-                       page_bytes_left -= bytes;
-                       buf_offset += bytes;
-                       working_bytes -= bytes;
-                       current_buf_start += bytes;
-
-                       /* check if we need to pick another page */
-                       if (page_bytes_left == 0) {
-                               page_out_index++;
-                               if (page_out_index >= vcnt) {
-                                       ret = 0;
-                                       goto done;
-                               }
-
-                               page_out = bvec[page_out_index].bv_page;
-                               pg_offset = 0;
-                               page_bytes_left = PAGE_CACHE_SIZE;
-                               start_byte = page_offset(page_out) - disk_start;
-
-                               /*
-                                * make sure our new page is covered by this
-                                * working buffer
-                                */
-                               if (total_out <= start_byte)
-                                       goto next;
-
-                               /* the next page in the biovec might not
-                                * be adjacent to the last page, but it
-                                * might still be found inside this working
-                                * buffer.  bump our offset pointer
-                                */
-                               if (total_out > start_byte &&
-                                   current_buf_start < start_byte) {
-                                       buf_offset = start_byte - buf_start;
-                                       working_bytes = total_out - start_byte;
-                                       current_buf_start = buf_start +
-                                               buf_offset;
-                               }
-                       }
+               ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+                                                total_out, disk_start,
+                                                bvec, vcnt,
+                                                &page_out_index, &pg_offset);
+               if (ret2 == 0) {
+                       ret = 0;
+                       goto done;
                }
-next:
+
                workspace->inf_strm.next_out = workspace->buf;
                workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 
@@ -516,35 +301,21 @@ done:
        zlib_inflateEnd(&workspace->inf_strm);
        if (data_in)
                kunmap(pages_in[page_in_index]);
-out:
-       free_workspace(workspace);
        return ret;
 }
 
-/*
- * a less complex decompression routine.  Our compressed data fits in a
- * single page, and we want to read a single page out of it.
- * start_byte tells us the offset into the compressed data we're interested in
- */
-int btrfs_zlib_decompress(unsigned char *data_in,
-                         struct page *dest_page,
-                         unsigned long start_byte,
-                         size_t srclen, size_t destlen)
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+                          struct page *dest_page,
+                          unsigned long start_byte,
+                          size_t srclen, size_t destlen)
 {
+       struct workspace *workspace = list_entry(ws, struct workspace, list);
        int ret = 0;
        int wbits = MAX_WBITS;
-       struct workspace *workspace;
        unsigned long bytes_left = destlen;
        unsigned long total_out = 0;
        char *kaddr;
 
-       if (destlen > PAGE_CACHE_SIZE)
-               return -ENOMEM;
-
-       workspace = find_zlib_workspace();
-       if (IS_ERR(workspace))
-               return -ENOMEM;
-
        workspace->inf_strm.next_in = data_in;
        workspace->inf_strm.avail_in = srclen;
        workspace->inf_strm.total_in = 0;
@@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 
        if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
                printk(KERN_WARNING "inflateInit failed\n");
-               ret = -1;
-               goto out;
+               return -1;
        }
 
        while (bytes_left > 0) {
@@ -616,12 +386,13 @@ next:
                ret = 0;
 
        zlib_inflateEnd(&workspace->inf_strm);
-out:
-       free_workspace(workspace);
        return ret;
 }
 
-void btrfs_zlib_exit(void)
-{
-    free_workspaces();
-}
+struct btrfs_compress_op btrfs_zlib_compress = {
+       .alloc_workspace        = zlib_alloc_workspace,
+       .free_workspace         = zlib_free_workspace,
+       .compress_pages         = zlib_compress_pages,
+       .decompress_biovec      = zlib_decompress_biovec,
+       .decompress             = zlib_decompress,
+};