Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
author     Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 4 Apr 2014 22:39:39 +0000 (15:39 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Fri, 4 Apr 2014 22:39:39 +0000 (15:39 -0700)
Pull ext4 updates from Ted Ts'o:
 "Major changes for 3.14 include support for the newly added ZERO_RANGE
  and COLLAPSE_RANGE fallocate operations, and scalability improvements
  in the jbd2 layer and in xattr handling when the extended attributes
  spill over into an external block.

  Other than that, the usual clean ups and minor bug fixes"
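
Both new operations are driven from userspace through fallocate(2).  As a
rough illustration only (it assumes libc headers new enough to expose the
new FALLOC_FL_* flags; the file name, offsets and lengths are made up, and
the file is assumed to be larger than the ranges touched):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("data.bin", O_RDWR);

            if (fd < 0)
                    return 1;

            /* Zero 1 MiB at offset 4 MiB without changing i_size. */
            if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                          4 << 20, 1 << 20))
                    perror("FALLOC_FL_ZERO_RANGE");

            /* Cut 1 MiB out at offset 4 MiB; later data shifts down. */
            if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4 << 20, 1 << 20))
                    perror("FALLOC_FL_COLLAPSE_RANGE");

            close(fd);
            return 0;
    }

COLLAPSE_RANGE additionally requires block-aligned offset and length, which
is what the alignment check in ext4_collapse_range() below enforces.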

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (42 commits)
  ext4: fix premature freeing of partial clusters split across leaf blocks
  ext4: remove unneeded test of ret variable
  ext4: fix comment typo
  ext4: make ext4_block_zero_page_range static
  ext4: atomically set inode->i_flags in ext4_set_inode_flags()
  ext4: optimize Hurd tests when reading/writing inodes
  ext4: kill i_version support for Hurd-castrated file systems
  ext4: each filesystem creates and uses its own mb_cache
  fs/mbcache.c: doucple the locking of local from global data
  fs/mbcache.c: change block and index hash chain to hlist_bl_node
  ext4: Introduce FALLOC_FL_ZERO_RANGE flag for fallocate
  ext4: refactor ext4_fallocate code
  ext4: Update inode i_size after the preallocation
  ext4: fix partial cluster handling for bigalloc file systems
  ext4: delete path dealloc code in ext4_ext_handle_uninitialized_extents
  ext4: only call sync_filesystem() when remounting read-only
  fs: push sync_filesystem() down to the file system's remount_fs()
  jbd2: improve error messages for inconsistent journal heads
  jbd2: minimize region locked by j_list_lock in jbd2_journal_forget()
  jbd2: minimize region locked by j_list_lock in journal_get_create_access()
  ...

64 files changed:
fs/adfs/super.c
fs/affs/super.c
fs/befs/linuxvfs.c
fs/btrfs/super.c
fs/cifs/cifsfs.c
fs/coda/inode.c
fs/cramfs/inode.c
fs/debugfs/inode.c
fs/devpts/inode.c
fs/efs/super.c
fs/ext2/super.c
fs/ext3/super.c
fs/ext4/ext4.h
fs/ext4/ext4_jbd2.c
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/extents_status.h
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/move_extent.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/f2fs/super.c
fs/fat/inode.c
fs/freevxfs/vxfs_super.c
fs/fuse/inode.c
fs/gfs2/super.c
fs/hfs/super.c
fs/hfsplus/super.c
fs/hpfs/super.c
fs/inode.c
fs/isofs/inode.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/jffs2/super.c
fs/jfs/super.c
fs/mbcache.c
fs/minix/inode.c
fs/ncpfs/inode.c
fs/nfs/super.c
fs/nilfs2/super.c
fs/ntfs/super.c
fs/ocfs2/super.c
fs/openpromfs/inode.c
fs/proc/root.c
fs/pstore/inode.c
fs/qnx4/inode.c
fs/qnx6/inode.c
fs/reiserfs/super.c
fs/romfs/super.c
fs/squashfs/super.c
fs/super.c
fs/sysv/inode.c
fs/ubifs/super.c
fs/udf/super.c
fs/ufs/super.c
fs/xfs/xfs_super.c
include/linux/fs.h
include/linux/mbcache.h
include/trace/events/ext4.h

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7b3003c..952aeb0 100644
@@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_NODIRATIME;
        return parse_options(sb, data);
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d098731..3074530 100644
@@ -530,6 +530,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 
        pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
+       sync_filesystem(sb);
        *flags |= MS_NODIRATIME;
 
        memcpy(volume, sbi->s_volume, 32);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 5188f12..d626756 100644
@@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 static int
 befs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        if (!(*flags & MS_RDONLY))
                return -EINVAL;
        return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d4878dd..9dbf423 100644
@@ -1380,6 +1380,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
        unsigned int old_metadata_ratio = fs_info->metadata_ratio;
        int ret;
 
+       sync_filesystem(sb);
        btrfs_remount_prepare(fs_info);
 
        ret = btrfs_parse_options(root, data);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ab8ad25..2c70cbe 100644
@@ -541,6 +541,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
 
 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_NODIRATIME;
        return 0;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 626abc0..d9c7751 100644
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)
 
 static int coda_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_NOATIME;
        return 0;
 }
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a1f801c..ddcfe59 100644
@@ -243,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 
 static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_RDONLY;
        return 0;
 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index ca4a08f..8c41b52 100644
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
        int err;
        struct debugfs_fs_info *fsi = sb->s_fs_info;
 
+       sync_filesystem(sb);
        err = debugfs_parse_options(data, &fsi->mount_opts);
        if (err)
                goto fail;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index a726b9f..c710380 100644
@@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
        struct pts_fs_info *fsi = DEVPTS_SB(sb);
        struct pts_mount_opts *opts = &fsi->mount_opts;
 
+       sync_filesystem(sb);
        err = parse_mount_options(data, PARSE_REMOUNT, opts);
 
        /*
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f8def1a..3befcc9 100644
@@ -114,6 +114,7 @@ static void destroy_inodecache(void)
 
 static int efs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_RDONLY;
        return 0;
 }
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 20d6697..d260115 100644
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        unsigned long old_sb_flags;
        int err;
 
+       sync_filesystem(sb);
        spin_lock(&sbi->s_lock);
 
        /* Store the old options */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31e..95c6c5a 100644
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
        int i;
 #endif
 
+       sync_filesystem(sb);
+
        /* Store the original options */
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
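
Every remount hunk above is the same change, from the "fs: push
sync_filesystem() down to the file system's remount_fs()" commit in the
list above: the sync_filesystem() call moves out of the VFS remount path
and into each filesystem's ->remount_fs, so each implementation now writes
out its own dirty data first (ext4, per "ext4: only call sync_filesystem()
when remounting read-only", only does so when going read-only).  A minimal
sketch of the resulting callback shape - "foofs" and foofs_parse_options()
are placeholders, not code from this series:

    static int foofs_remount(struct super_block *sb, int *flags, char *data)
    {
            /* Flush dirty data and metadata before the options change. */
            sync_filesystem(sb);

            return foofs_parse_options(sb, data, flags);
    }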
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3a534f..f1c65dc 100644
@@ -31,6 +31,7 @@
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
+#include <linux/falloc.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -567,6 +568,8 @@ enum {
 #define EXT4_GET_BLOCKS_NO_LOCK                        0x0100
        /* Do not put hole in extent cache */
 #define EXT4_GET_BLOCKS_NO_PUT_HOLE            0x0200
+       /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN      0x0400
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -998,6 +1001,8 @@ struct ext4_inode_info {
 #define EXT4_MOUNT2_STD_GROUP_SIZE     0x00000002 /* We have standard group
                                                      size of blocksize * 8
                                                      blocks */
+#define EXT4_MOUNT2_HURD_COMPAT                0x00000004 /* Support HURD-castrated
+                                                     file systems */
 
 #define clear_opt(sb, opt)             EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt
@@ -1326,6 +1331,7 @@ struct ext4_sb_info {
        struct list_head s_es_lru;
        unsigned long s_es_last_sorted;
        struct percpu_counter s_extent_cache_cnt;
+       struct mb_cache *s_mb_cache;
        spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 
        /* Ratelimit ext4 messages. */
@@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
-               struct address_space *mapping, loff_t from, loff_t length);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2757,6 +2761,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2766,6 +2771,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+                           struct ext4_extent **extent);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
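
Of the additions above, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN is the hook the
new ZERO_RANGE code is built on: it asks the mapping path to turn an
already-written extent back into an unwritten one rather than allocate
anything.  A rough sketch of a caller, mirroring the flag combination
ext4_zero_range() uses in the extents.c diff below (handle, inode, lblk
and max_blocks are assumed to come from the usual surrounding code):

    struct ext4_map_blocks map = {
            .m_lblk = lblk,         /* first logical block of the range */
            .m_len  = max_blocks,   /* number of blocks to convert */
    };
    int flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
                EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
    int ret = ext4_map_blocks(handle, inode, &map, flags);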
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3fe29de..c3fb607 100644
@@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                if (WARN_ON_ONCE(err)) {
                        ext4_journal_abort_handle(where, line, __func__, bh,
                                                  handle, err);
+                       if (inode == NULL) {
+                               pr_err("EXT4: jbd2_journal_dirty_metadata "
+                                      "failed: handle type %u started at "
+                                      "line %u, credits %u/%u, errcode %d",
+                                      handle->h_type,
+                                      handle->h_line_no,
+                                      handle->h_requested_credits,
+                                      handle->h_buffer_credits, err);
+                               return err;
+                       }
                        ext4_error_inode(inode, where, line,
                                         bh->b_blocknr,
                                         "journal_dirty_metadata failed: "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74bc2d5..82df3ce 100644
@@ -37,7 +37,6 @@
 #include <linux/quotaops.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <linux/falloc.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
 #include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         * the extent that was written properly split out and conversion to
         * initialized is trivial.
         */
-       if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
+       if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
                return 0;
 
        ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
         */
        if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
                return 0;
+       if (ext4_ext_is_uninitialized(ex1) &&
+           (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+            atomic_read(&EXT4_I(inode)->i_unwritten) ||
+            (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
+               return 0;
 #ifdef AGGRESSIVE_TEST
        if (ext1_ee_len >= 4)
                return 0;
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
 {
        struct ext4_extent_header *eh;
        unsigned int depth, len;
-       int merge_done = 0;
+       int merge_done = 0, uninit;
 
        depth = ext_depth(inode);
        BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
                if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
                        break;
                /* merge with next extent! */
+               uninit = ext4_ext_is_uninitialized(ex);
                ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                + ext4_ext_get_actual_len(ex + 1));
+               if (uninit)
+                       ext4_ext_mark_uninitialized(ex);
 
                if (ex + 1 < EXT_LAST_EXTENT(eh)) {
                        len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
        struct ext4_ext_path *npath = NULL;
        int depth, len, err;
        ext4_lblk_t next;
-       int mb_flags = 0;
+       int mb_flags = 0, uninit;
 
        if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
                EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
                                                  path + depth);
                        if (err)
                                return err;
-
+                       uninit = ext4_ext_is_uninitialized(ex);
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
+                       if (uninit)
+                               ext4_ext_mark_uninitialized(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
@@ -1971,10 +1980,13 @@ prepend:
                        if (err)
                                return err;
 
+                       uninit = ext4_ext_is_uninitialized(ex);
                        ex->ee_block = newext->ee_block;
                        ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
                        ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
                                        + ext4_ext_get_actual_len(newext));
+                       if (uninit)
+                               ext4_ext_mark_uninitialized(ex);
                        eh = path[depth].p_hdr;
                        nearex = ex;
                        goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        ex_ee_block = le32_to_cpu(ex->ee_block);
        ex_ee_len = ext4_ext_get_actual_len(ex);
 
+       /*
+        * If we're starting with an extent other than the last one in the
+        * node, we need to see if it shares a cluster with the extent to
+        * the right (towards the end of the file). If its leftmost cluster
+        * is this extent's rightmost cluster and it is not cluster aligned,
+        * we'll mark it as a partial that is not to be deallocated.
+        */
+
+       if (ex != EXT_LAST_EXTENT(eh)) {
+               ext4_fsblk_t current_pblk, right_pblk;
+               long long current_cluster, right_cluster;
+
+               current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+               current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+               right_pblk = ext4_ext_pblock(ex + 1);
+               right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+               if (current_cluster == right_cluster &&
+                       EXT4_PBLK_COFF(sbi, right_pblk))
+                       *partial_cluster = -right_cluster;
+       }
+
        trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
 
        while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
                err = ext4_ext_correct_indexes(handle, inode, path);
 
        /*
-        * Free the partial cluster only if the current extent does not
-        * reference it. Otherwise we might free used cluster.
+        * If there's a partial cluster and at least one extent remains in
+        * the leaf, free the partial cluster if it isn't shared with the
+        * current extent.  If there's a partial cluster and no extents
+        * remain in the leaf, it can't be freed here.  It can only be
+        * freed when it's possible to determine if it's not shared with
+        * any other extent - when the next leaf is processed or when space
+        * removal is complete.
         */
-       if (*partial_cluster > 0 &&
+       if (*partial_cluster > 0 && eh->eh_entries &&
            (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
             *partial_cluster)) {
                int flags = get_default_free_blocks_flags(inode);
@@ -3569,6 +3607,8 @@ out:
  *   b> Splits in two extents: Write is happening at either end of the extent
  *   c> Splits in three extents: Somone is writing in middle of the extent
  *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
  * One of more index blocks maybe needed if the extent tree grow after
  * the uninitialized extent split. To prevent ENOSPC occur at the IO
  * complete, we need to split the uninitialized extent before DIO submit
@@ -3579,7 +3619,7 @@ out:
  *
  * Returns the size of uninitialized extent to be written on success.
  */
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
                                        struct inode *inode,
                                        struct ext4_map_blocks *map,
                                        struct ext4_ext_path *path,
@@ -3591,9 +3631,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        unsigned int ee_len;
        int split_flag = 0, depth;
 
-       ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
-               "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)map->m_lblk, map->m_len);
+       ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+                 __func__, inode->i_ino,
+                 (unsigned long long)map->m_lblk, map->m_len);
 
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3648,73 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
 
-       split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
-       split_flag |= EXT4_EXT_MARK_UNINIT2;
-       if (flags & EXT4_GET_BLOCKS_CONVERT)
-               split_flag |= EXT4_EXT_DATA_VALID2;
+       /* Convert to unwritten */
+       if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+               split_flag |= EXT4_EXT_DATA_VALID1;
+       /* Convert to initialized */
+       } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+               split_flag |= ee_block + ee_len <= eof_block ?
+                             EXT4_EXT_MAY_ZEROOUT : 0;
+               split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
+       }
        flags |= EXT4_GET_BLOCKS_PRE_IO;
        return ext4_split_extent(handle, inode, path, map, split_flag, flags);
 }
 
+static int ext4_convert_initialized_extents(handle_t *handle,
+                                           struct inode *inode,
+                                           struct ext4_map_blocks *map,
+                                           struct ext4_ext_path *path)
+{
+       struct ext4_extent *ex;
+       ext4_lblk_t ee_block;
+       unsigned int ee_len;
+       int depth;
+       int err = 0;
+
+       depth = ext_depth(inode);
+       ex = path[depth].p_ext;
+       ee_block = le32_to_cpu(ex->ee_block);
+       ee_len = ext4_ext_get_actual_len(ex);
+
+       ext_debug("%s: inode %lu, logical"
+               "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+                 (unsigned long long)ee_block, ee_len);
+
+       if (ee_block != map->m_lblk || ee_len > map->m_len) {
+               err = ext4_split_convert_extents(handle, inode, map, path,
+                               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+               if (err < 0)
+                       goto out;
+               ext4_ext_drop_refs(path);
+               path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+               if (IS_ERR(path)) {
+                       err = PTR_ERR(path);
+                       goto out;
+               }
+               depth = ext_depth(inode);
+               ex = path[depth].p_ext;
+       }
+
+       err = ext4_ext_get_access(handle, inode, path + depth);
+       if (err)
+               goto out;
+       /* first mark the extent as uninitialized */
+       ext4_ext_mark_uninitialized(ex);
+
+       /* note: ext4_ext_correct_indexes() isn't needed here because
+        * borders are not changed
+        */
+       ext4_ext_try_to_merge(handle, inode, path, ex);
+
+       /* Mark modified extent as dirty */
+       err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+       ext4_ext_show_leaf(inode, path);
+       return err;
+}
+
+
 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                                                struct inode *inode,
                                                struct ext4_map_blocks *map,
@@ -3649,8 +3748,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
                             inode->i_ino, (unsigned long long)ee_block, ee_len,
                             (unsigned long long)map->m_lblk, map->m_len);
 #endif
-               err = ext4_split_unwritten_extents(handle, inode, map, path,
-                                                  EXT4_GET_BLOCKS_CONVERT);
+               err = ext4_split_convert_extents(handle, inode, map, path,
+                                                EXT4_GET_BLOCKS_CONVERT);
                if (err < 0)
                        goto out;
                ext4_ext_drop_refs(path);
@@ -3850,6 +3949,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
        return allocated_clusters;
 }
 
+static int
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+                       struct ext4_map_blocks *map,
+                       struct ext4_ext_path *path, int flags,
+                       unsigned int allocated, ext4_fsblk_t newblock)
+{
+       int ret = 0;
+       int err = 0;
+
+       /*
+        * Make sure that the extent is no bigger than we support with
+        * uninitialized extent
+        */
+       if (map->m_len > EXT_UNINIT_MAX_LEN)
+               map->m_len = EXT_UNINIT_MAX_LEN / 2;
+
+       ret = ext4_convert_initialized_extents(handle, inode, map,
+                                               path);
+       if (ret >= 0) {
+               ext4_update_inode_fsync_trans(handle, inode, 1);
+               err = check_eofblocks_fl(handle, inode, map->m_lblk,
+                                        path, map->m_len);
+       } else
+               err = ret;
+       map->m_flags |= EXT4_MAP_UNWRITTEN;
+       if (allocated > map->m_len)
+               allocated = map->m_len;
+       map->m_len = allocated;
+
+       return err ? err : allocated;
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        struct ext4_map_blocks *map,
@@ -3877,8 +4008,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
        /* get_block() before submit the IO, split the extent */
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-               ret = ext4_split_unwritten_extents(handle, inode, map,
-                                                  path, flags);
+               ret = ext4_split_convert_extents(handle, inode, map,
+                                        path, flags | EXT4_GET_BLOCKS_CONVERT);
                if (ret <= 0)
                        goto out;
                /*
@@ -3993,10 +4124,6 @@ out1:
        map->m_pblk = newblock;
        map->m_len = allocated;
 out2:
-       if (path) {
-               ext4_ext_drop_refs(path);
-               kfree(path);
-       }
        return err ? err : allocated;
 }
 
@@ -4128,7 +4255,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
        struct ext4_extent newex, *ex, *ex2;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_fsblk_t newblock = 0;
-       int free_on_err = 0, err = 0, depth;
+       int free_on_err = 0, err = 0, depth, ret;
        unsigned int allocated = 0, offset = 0;
        unsigned int allocated_clusters = 0;
        struct ext4_allocation_request ar;
@@ -4170,6 +4297,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                unsigned short ee_len;
 
+
                /*
                 * Uninitialized extents are treated as holes, except that
                 * we split out initialized portions during a write.
@@ -4186,13 +4314,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
                                  ee_block, ee_len, newblock);
 
-                       if (!ext4_ext_is_uninitialized(ex))
+                       /*
+                        * If the extent is initialized check whether the
+                        * caller wants to convert it to unwritten.
+                        */
+                       if ((!ext4_ext_is_uninitialized(ex)) &&
+                           (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+                               allocated = ext4_ext_convert_initialized_extent(
+                                               handle, inode, map, path, flags,
+                                               allocated, newblock);
+                               goto out2;
+                       } else if (!ext4_ext_is_uninitialized(ex))
                                goto out;
 
-                       allocated = ext4_ext_handle_uninitialized_extents(
+                       ret = ext4_ext_handle_uninitialized_extents(
                                handle, inode, map, path, flags,
                                allocated, newblock);
-                       goto out3;
+                       if (ret < 0)
+                               err = ret;
+                       else
+                               allocated = ret;
+                       goto out2;
                }
        }
 
@@ -4473,7 +4615,6 @@ out2:
                kfree(path);
        }
 
-out3:
        trace_ext4_ext_map_blocks_exit(inode, flags, map,
                                       err ? err : allocated);
        ext4_es_lru_add(inode);
@@ -4514,34 +4655,200 @@ retry:
        ext4_std_error(inode->i_sb, err);
 }
 
-static void ext4_falloc_update_inode(struct inode *inode,
-                               int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+                                 ext4_lblk_t len, int flags, int mode)
 {
-       struct timespec now;
+       struct inode *inode = file_inode(file);
+       handle_t *handle;
+       int ret = 0;
+       int ret2 = 0;
+       int retries = 0;
+       struct ext4_map_blocks map;
+       unsigned int credits;
 
-       if (update_ctime) {
-               now = current_fs_time(inode->i_sb);
-               if (!timespec_equal(&inode->i_ctime, &now))
-                       inode->i_ctime = now;
+       map.m_lblk = offset;
+       /*
+        * Don't normalize the request if it can fit in one extent so
+        * that it doesn't get unnecessarily split into multiple
+        * extents.
+        */
+       if (len <= EXT_UNINIT_MAX_LEN)
+               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+       /*
+        * credits to insert 1 extent into extent tree
+        */
+       credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+       while (ret >= 0 && ret < len) {
+               map.m_lblk = map.m_lblk + ret;
+               map.m_len = len = len - ret;
+               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+                                           credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       break;
+               }
+               ret = ext4_map_blocks(handle, inode, &map, flags);
+               if (ret <= 0) {
+                       ext4_debug("inode #%lu: block %u: len %u: "
+                                  "ext4_ext_map_blocks returned %d",
+                                  inode->i_ino, map.m_lblk,
+                                  map.m_len, ret);
+                       ext4_mark_inode_dirty(handle, inode);
+                       ret2 = ext4_journal_stop(handle);
+                       break;
+               }
+               ret2 = ext4_journal_stop(handle);
+               if (ret2)
+                       break;
+       }
+       if (ret == -ENOSPC &&
+                       ext4_should_retry_alloc(inode->i_sb, &retries)) {
+               ret = 0;
+               goto retry;
        }
+
+       return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+                           loff_t len, int mode)
+{
+       struct inode *inode = file_inode(file);
+       handle_t *handle = NULL;
+       unsigned int max_blocks;
+       loff_t new_size = 0;
+       int ret = 0;
+       int flags;
+       int partial;
+       loff_t start, end;
+       ext4_lblk_t lblk;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned int blkbits = inode->i_blkbits;
+
+       trace_ext4_zero_range(inode, offset, len, mode);
+
+       /*
+        * Write out all dirty pages to avoid race conditions
+        * Then release them.
+        */
+       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               ret = filemap_write_and_wait_range(mapping, offset,
+                                                  offset + len - 1);
+               if (ret)
+                       return ret;
+       }
+
        /*
-        * Update only when preallocation was requested beyond
-        * the file size.
+        * Round up offset. This is not fallocate, we neet to zero out
+        * blocks, so convert interior block aligned part of the range to
+        * unwritten and possibly manually zero out unaligned parts of the
+        * range.
         */
-       if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+       start = round_up(offset, 1 << blkbits);
+       end = round_down((offset + len), 1 << blkbits);
+
+       if (start < offset || end > offset + len)
+               return -EINVAL;
+       partial = (offset + len) & ((1 << blkbits) - 1);
+
+       lblk = start >> blkbits;
+       max_blocks = (end >> blkbits);
+       if (max_blocks < lblk)
+               max_blocks = 0;
+       else
+               max_blocks -= lblk;
+
+       flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+               EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+       mutex_lock(&inode->i_mutex);
+
+       /*
+        * Indirect files do not support unwritten extnets
+        */
+       if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out_mutex;
+               /*
+                * If we have a partial block after EOF we have to allocate
+                * the entire block.
+                */
+               if (partial)
+                       max_blocks += 1;
+       }
+
+       if (max_blocks > 0) {
+
+               /* Now release the pages and zero block aligned part of pages*/
+               truncate_pagecache_range(inode, start, end - 1);
+
+               /* Wait all existing dio workers, newcomers will block on i_mutex */
+               ext4_inode_block_unlocked_dio(inode);
+               inode_dio_wait(inode);
+
+               /*
+                * Remove entire range from the extent status tree.
+                */
+               ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+               if (ret)
+                       goto out_dio;
+
+               ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+                                            mode);
+               if (ret)
+                       goto out_dio;
+       }
+
+       handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               ext4_std_error(inode->i_sb, ret);
+               goto out_dio;
+       }
+
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+       if (new_size) {
                if (new_size > i_size_read(inode))
                        i_size_write(inode, new_size);
                if (new_size > EXT4_I(inode)->i_disksize)
                        ext4_update_i_disksize(inode, new_size);
        } else {
                /*
-                * Mark that we allocate beyond EOF so the subsequent truncate
-                * can proceed even if the new size is the same as i_size.
-                */
-               if (new_size > i_size_read(inode))
+               * Mark that we allocate beyond EOF so the subsequent truncate
+               * can proceed even if the new size is the same as i_size.
+               */
+               if ((offset + len) > i_size_read(inode))
                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
 
+       ext4_mark_inode_dirty(handle, inode);
+
+       /* Zero out partial block at the edges of the range */
+       ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+       if (file->f_flags & O_SYNC)
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
 }
 
 /*
@@ -4555,22 +4862,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
        struct inode *inode = file_inode(file);
        handle_t *handle;
-       loff_t new_size;
+       loff_t new_size = 0;
        unsigned int max_blocks;
        int ret = 0;
-       int ret2 = 0;
-       int retries = 0;
        int flags;
-       struct ext4_map_blocks map;
-       unsigned int credits, blkbits = inode->i_blkbits;
+       ext4_lblk_t lblk;
+       struct timespec tv;
+       unsigned int blkbits = inode->i_blkbits;
 
        /* Return error if mode is not supported */
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
 
        if (mode & FALLOC_FL_PUNCH_HOLE)
                return ext4_punch_hole(inode, offset, len);
 
+       if (mode & FALLOC_FL_COLLAPSE_RANGE)
+               return ext4_collapse_range(inode, offset, len);
+
        ret = ext4_convert_inline_data(inode);
        if (ret)
                return ret;
@@ -4582,83 +4892,66 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EOPNOTSUPP;
 
+       if (mode & FALLOC_FL_ZERO_RANGE)
+               return ext4_zero_range(file, offset, len, mode);
+
        trace_ext4_fallocate_enter(inode, offset, len, mode);
-       map.m_lblk = offset >> blkbits;
+       lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because
         * If blocksize = 4096 offset = 3072 and len = 2048
         */
        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-               - map.m_lblk;
-       /*
-        * credits to insert 1 extent into extent tree
-        */
-       credits = ext4_chunk_trans_blocks(inode, max_blocks);
-       mutex_lock(&inode->i_mutex);
-       ret = inode_newsize_ok(inode, (len + offset));
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-               return ret;
-       }
+               - lblk;
+
        flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
        if (mode & FALLOC_FL_KEEP_SIZE)
                flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
-       /*
-        * Don't normalize the request if it can fit in one extent so
-        * that it doesn't get unnecessarily split into multiple
-        * extents.
-        */
-       if (len <= EXT_UNINIT_MAX_LEN << blkbits)
-               flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-retry:
-       while (ret >= 0 && ret < max_blocks) {
-               map.m_lblk = map.m_lblk + ret;
-               map.m_len = max_blocks = max_blocks - ret;
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           credits);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       break;
-               }
-               ret = ext4_map_blocks(handle, inode, &map, flags);
-               if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
-                       ext4_warning(inode->i_sb,
-                                    "inode #%lu: block %u: len %u: "
-                                    "ext4_ext_map_blocks returned %d",
-                                    inode->i_ino, map.m_lblk,
-                                    map.m_len, ret);
-#endif
-                       ext4_mark_inode_dirty(handle, inode);
-                       ret2 = ext4_journal_stop(handle);
-                       break;
-               }
-               if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
-                                               blkbits) >> blkbits))
-                       new_size = offset + len;
-               else
-                       new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+       mutex_lock(&inode->i_mutex);
 
-               ext4_falloc_update_inode(inode, mode, new_size,
-                                        (map.m_flags & EXT4_MAP_NEW));
-               ext4_mark_inode_dirty(handle, inode);
-               if ((file->f_flags & O_SYNC) && ret >= max_blocks)
-                       ext4_handle_sync(handle);
-               ret2 = ext4_journal_stop(handle);
-               if (ret2)
-                       break;
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+            offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               ret = inode_newsize_ok(inode, new_size);
+               if (ret)
+                       goto out;
        }
-       if (ret == -ENOSPC &&
-                       ext4_should_retry_alloc(inode->i_sb, &retries)) {
-               ret = 0;
-               goto retry;
+
+       ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+       if (ret)
+               goto out;
+
+       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+       if (IS_ERR(handle))
+               goto out;
+
+       tv = inode->i_ctime = ext4_current_time(inode);
+
+       if (new_size) {
+               if (new_size > i_size_read(inode)) {
+                       i_size_write(inode, new_size);
+                       inode->i_mtime = tv;
+               }
+               if (new_size > EXT4_I(inode)->i_disksize)
+                       ext4_update_i_disksize(inode, new_size);
+       } else {
+               /*
+               * Mark that we allocate beyond EOF so the subsequent truncate
+               * can proceed even if the new size is the same as i_size.
+               */
+               if ((offset + len) > i_size_read(inode))
+                       ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
+       ext4_mark_inode_dirty(handle, inode);
+       if (file->f_flags & O_SYNC)
+               ext4_handle_sync(handle);
+
+       ext4_journal_stop(handle);
+out:
        mutex_unlock(&inode->i_mutex);
-       trace_ext4_fallocate_exit(inode, offset, max_blocks,
-                               ret > 0 ? ret2 : ret);
-       return ret > 0 ? ret2 : ret;
+       trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+       return ret;
 }
 
 /*
@@ -4869,3 +5162,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        ext4_es_lru_add(inode);
        return error;
 }
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+               struct ext4_ext_path *path)
+{
+       int credits, err;
+
+       if (!ext4_handle_valid(handle))
+               return 0;
+
+       /*
+        * Check if need to extend journal credits
+        * 3 for leaf, sb, and inode plus 2 (bmap and group
+        * descriptor) for each block group; assume two block
+        * groups
+        */
+       if (handle->h_buffer_credits < 7) {
+               credits = ext4_writepage_trans_blocks(inode);
+               err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+               /* EAGAIN is success */
+               if (err && err != -EAGAIN)
+                       return err;
+       }
+
+       err = ext4_ext_get_access(handle, inode, path);
+       return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+                           struct inode *inode, handle_t *handle,
+                           ext4_lblk_t *start)
+{
+       int depth, err = 0;
+       struct ext4_extent *ex_start, *ex_last;
+       bool update = 0;
+       depth = path->p_depth;
+
+       while (depth >= 0) {
+               if (depth == path->p_depth) {
+                       ex_start = path[depth].p_ext;
+                       if (!ex_start)
+                               return -EIO;
+
+                       ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+                       if (!ex_last)
+                               return -EIO;
+
+                       err = ext4_access_path(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+                               update = 1;
+
+                       *start = ex_last->ee_block +
+                               ext4_ext_get_actual_len(ex_last);
+
+                       while (ex_start <= ex_last) {
+                               ex_start->ee_block -= shift;
+                               if (ex_start >
+                                       EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+                                       if (ext4_ext_try_to_merge_right(inode,
+                                               path, ex_start - 1))
+                                               ex_last--;
+                               }
+                               ex_start++;
+                       }
+                       err = ext4_ext_dirty(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       if (--depth < 0 || !update)
+                               break;
+               }
+
+               /* Update index too */
+               err = ext4_access_path(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               path[depth].p_idx->ei_block -= shift;
+               err = ext4_ext_dirty(handle, inode, path + depth);
+               if (err)
+                       goto out;
+
+               /* we are done if current index is not a starting index */
+               if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+                       break;
+
+               depth--;
+       }
+
+out:
+       return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+                      ext4_lblk_t start, ext4_lblk_t shift)
+{
+       struct ext4_ext_path *path;
+       int ret = 0, depth;
+       struct ext4_extent *extent;
+       ext4_lblk_t stop_block, current_block;
+       ext4_lblk_t ex_start, ex_end;
+
+       /* Let path point to the last extent */
+       path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+
+       depth = path->p_depth;
+       extent = path[depth].p_ext;
+       if (!extent) {
+               ext4_ext_drop_refs(path);
+               kfree(path);
+               return ret;
+       }
+
+       stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
+       ext4_ext_drop_refs(path);
+       kfree(path);
+
+       /* Nothing to shift, if hole is at the end of file */
+       if (start >= stop_block)
+               return ret;
+
+       /*
+        * Don't start shifting extents until we make sure the hole is big
+        * enough to accomodate the shift.
+        */
+       path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+       depth = path->p_depth;
+       extent =  path[depth].p_ext;
+       ex_start = extent->ee_block;
+       ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
+       ext4_ext_drop_refs(path);
+       kfree(path);
+
+       if ((start == ex_start && shift > ex_start) ||
+           (shift > start - ex_end))
+               return -EINVAL;
+
+       /* Its safe to start updating extents */
+       while (start < stop_block) {
+               path = ext4_ext_find_extent(inode, start, NULL, 0);
+               if (IS_ERR(path))
+                       return PTR_ERR(path);
+               depth = path->p_depth;
+               extent = path[depth].p_ext;
+               current_block = extent->ee_block;
+               if (start > current_block) {
+                       /* Hole, move to the next extent */
+                       ret = mext_next_extent(inode, path, &extent);
+                       if (ret != 0) {
+                               ext4_ext_drop_refs(path);
+                               kfree(path);
+                               if (ret == 1)
+                                       ret = 0;
+                               break;
+                       }
+               }
+               ret = ext4_ext_shift_path_extents(path, shift, inode,
+                               handle, &start);
+               ext4_ext_drop_refs(path);
+               kfree(path);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+       struct super_block *sb = inode->i_sb;
+       ext4_lblk_t punch_start, punch_stop;
+       handle_t *handle;
+       unsigned int credits;
+       loff_t new_size;
+       int ret;
+
+       BUG_ON(offset + len > i_size_read(inode));
+
+       /* Collapse range works only on fs block size aligned offsets. */
+       if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+           len & (EXT4_BLOCK_SIZE(sb) - 1))
+               return -EINVAL;
+
+       if (!S_ISREG(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       trace_ext4_collapse_range(inode, offset, len);
+
+       punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+       punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+       /* Write out all dirty pages */
+       ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1);
+       if (ret)
+               return ret;
+
+       /* Take mutex lock */
+       mutex_lock(&inode->i_mutex);
+
+       /* It's not possible punch hole on append only file */
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+               ret = -EPERM;
+               goto out_mutex;
+       }
+
+       if (IS_SWAPFILE(inode)) {
+               ret = -ETXTBSY;
+               goto out_mutex;
+       }
+
+       /* Currently just for extent based files */
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               ret = -EOPNOTSUPP;
+               goto out_mutex;
+       }
+
+       truncate_pagecache_range(inode, offset, -1);
+
+       /* Wait for existing dio to complete */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
+       credits = ext4_writepage_trans_blocks(inode);
+       handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out_dio;
+       }
+
+       down_write(&EXT4_I(inode)->i_data_sem);
+       ext4_discard_preallocations(inode);
+
+       ret = ext4_es_remove_extent(inode, punch_start,
+                                   EXT_MAX_BLOCKS - punch_start - 1);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+                                    punch_stop - punch_start);
+       if (ret) {
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto out_stop;
+       }
+
+       new_size = i_size_read(inode) - len;
+       truncate_setsize(inode, new_size);
+       EXT4_I(inode)->i_disksize = new_size;
+
+       ext4_discard_preallocations(inode);
+       up_write(&EXT4_I(inode)->i_data_sem);
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
+       inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+       ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+       ext4_journal_stop(handle);
+out_dio:
+       ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
+}
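
To make the collapse arithmetic concrete: with a 4096-byte block size
(blkbits = 12), collapsing offset 8192, length 4096 gives
punch_start = 8192 >> 12 = 2 and punch_stop = (8192 + 4096) >> 12 = 3.
Block 2 is removed from the extent tree, every extent from block 3 onward
is shifted down by punch_stop - punch_start = 1 block, and i_size shrinks
by the 4096 collapsed bytes.  Ranges that are not block aligned are
rejected with -EINVAL by the alignment check at the top of
ext4_collapse_range().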
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff7..0a014a7 100644
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
        while (node) {
                struct extent_status *es;
                es = rb_entry(node, struct extent_status, rb_node);
-               printk(KERN_DEBUG " [%u/%u) %llu %llx",
+               printk(KERN_DEBUG " [%u/%u) %llu %x",
                       es->es_lblk, es->es_len,
                       ext4_es_pblock(es), ext4_es_status(es));
                node = rb_next(node);
@@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu we can find an extent "
                                        "at block [%d/%d/%llu/%c], but we "
-                                       "want to add an delayed/hole extent "
-                                       "[%d/%d/%llu/%llx]\n",
+                                       "want to add a delayed/hole extent "
+                                       "[%d/%d/%llu/%x]\n",
                                        inode->i_ino, ee_block, ee_len,
                                        ee_start, ee_status ? 'u' : 'w',
                                        es->es_lblk, es->es_len,
@@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "can't find an extent at block %d but we want "
-                               "to add an written/unwritten extent "
-                               "[%d/%d/%llu/%llx]\n", inode->i_ino,
+                               "to add a written/unwritten extent "
+                               "[%d/%d/%llu/%x]\n", inode->i_ino,
                                es->es_lblk, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                }
@@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                         */
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can find blocks but we want to add a "
-                               "delayed/hole extent [%d/%d/%llu/%llx]\n",
+                               "delayed/hole extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
@@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
                if (ext4_es_is_written(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can't find the block but we want to add "
-                               "an written extent [%d/%d/%llu/%llx]\n",
+                               "a written extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
@@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 
        newes.es_lblk = lblk;
        newes.es_len = len;
-       ext4_es_store_pblock(&newes, pblk);
-       ext4_es_store_status(&newes, status);
+       ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_insert_extent(inode, &newes);
 
        ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
 
        newes.es_lblk = lblk;
        newes.es_len = len;
-       ext4_es_store_pblock(&newes, pblk);
-       ext4_es_store_status(&newes, status);
+       ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_cache_extent(inode, &newes);
 
        if (!len)
@@ -812,13 +810,13 @@ retry:
 
                        newes.es_lblk = end + 1;
                        newes.es_len = len2;
+                       block = 0x7FDEADBEEF;
                        if (ext4_es_is_written(&orig_es) ||
-                           ext4_es_is_unwritten(&orig_es)) {
+                           ext4_es_is_unwritten(&orig_es))
                                block = ext4_es_pblock(&orig_es) +
                                        orig_es.es_len - len2;
-                               ext4_es_store_pblock(&newes, block);
-                       }
-                       ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+                       ext4_es_store_pblock_status(&newes, block,
+                                                   ext4_es_status(&orig_es));
                        err = __es_insert_extent(inode, &newes);
                        if (err) {
                                es->es_lblk = orig_es.es_lblk;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab..f1b62a4 100644
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
                       (es->es_pblk & ~ES_MASK));
 }
 
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+                                              ext4_fsblk_t pb,
+                                              unsigned int status)
+{
+       es->es_pblk = (((ext4_fsblk_t)
+                       (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+                      (pb & ~ES_MASK));
+}
+
 extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
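
The helper above lets callers fill the packed es_pblk field in one store:
the status flags are shifted up by ES_SHIFT and the physical block number
occupies the bits outside ES_MASK.  The extents_status.c hunks earlier in
this diff switch their callers from the separate ext4_es_store_pblock()
and ext4_es_store_status() pair to the combined form, so a new entry is
now set up roughly like this (lblk, len, pblk and status as in those
callers):

    newes.es_lblk = lblk;
    newes.es_len = len;
    ext4_es_store_pblock_status(&newes, pblk, status);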
index 175c3f9..5b0d2c7 100644 (file)
@@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
        struct extent_status es;
        int retval;
+       int ret = 0;
 #ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;
 
@@ -515,6 +516,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
                  (unsigned long) map->m_lblk);
 
+       /*
+        * ext4_map_blocks returns an int, and m_len is an unsigned int
+        */
+       if (unlikely(map->m_len > INT_MAX))
+               map->m_len = INT_MAX;
+
        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
                ext4_es_lru_add(inode);
@@ -553,7 +560,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
                                             EXT4_GET_BLOCKS_KEEP_SIZE);
        }
        if (retval > 0) {
-               int ret;
                unsigned int status;
 
                if (unlikely(retval != map->m_len)) {
@@ -580,7 +586,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 found:
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-               int ret = check_block_validity(inode, map);
+               ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
@@ -597,7 +603,13 @@ found:
         * with buffer head unmapped.
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-               return retval;
+               /*
+                * If we need to convert extent to unwritten
+                * we continue and do the actual work in
+                * ext4_ext_map_blocks()
+                */
+               if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+                       return retval;
 
        /*
         * Here we clear m_flags because after allocating a new extent,
@@ -653,7 +665,6 @@ found:
                ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
 
        if (retval > 0) {
-               int ret;
                unsigned int status;
 
                if (unlikely(retval != map->m_len)) {
@@ -688,7 +699,7 @@ found:
 has_zeroout:
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-               int ret = check_block_validity(inode, map);
+               ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }
@@ -3312,26 +3323,6 @@ void ext4_set_aops(struct inode *inode)
                inode->i_mapping->a_ops = &ext4_aops;
 }
 
-/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
-               struct address_space *mapping, loff_t from)
-{
-       unsigned offset = from & (PAGE_CACHE_SIZE-1);
-       unsigned length;
-       unsigned blocksize;
-       struct inode *inode = mapping->host;
-
-       blocksize = inode->i_sb->s_blocksize;
-       length = blocksize - (offset & (blocksize - 1));
-
-       return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
 /*
  * ext4_block_zero_page_range() zeros out a mapping of length 'length'
  * starting from file offset 'from'.  The range to be zero'd must
@@ -3339,7 +3330,7 @@ int ext4_block_truncate_page(handle_t *handle,
  * the end of the block it will be shortened to the end of the block
  * that corresponds to 'from'
  */
-int ext4_block_zero_page_range(handle_t *handle,
+static int ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
 {
        ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3429,6 +3420,26 @@ unlock:
        return err;
 }
 
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle,
+               struct address_space *mapping, loff_t from)
+{
+       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned length;
+       unsigned blocksize;
+       struct inode *inode = mapping->host;
+
+       blocksize = inode->i_sb->s_blocksize;
+       length = blocksize - (offset & (blocksize - 1));
+
+       return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t length)
 {
@@ -3502,7 +3513,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
        if (!S_ISREG(inode->i_mode))
                return -EOPNOTSUPP;
 
-       trace_ext4_punch_hole(inode, offset, length);
+       trace_ext4_punch_hole(inode, offset, length, 0);
 
        /*
         * Write out all dirty pages to avoid race conditions
@@ -3609,6 +3620,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
        up_write(&EXT4_I(inode)->i_data_sem);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
+
+       /* Now release the pages again to reduce race window */
+       if (last_block_offset > first_block_offset)
+               truncate_pagecache_range(inode, first_block_offset,
+                                        last_block_offset);
+
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
 out_stop:
@@ -3682,7 +3699,7 @@ void ext4_truncate(struct inode *inode)
 
        /*
         * There is a possibility that we're either freeing the inode
-        * or it completely new indode. In those cases we might not
+        * or it's a completely new inode. In those cases we might not
         * have i_mutex locked because it's not necessary.
         */
        if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,8 +3951,8 @@ void ext4_set_inode_flags(struct inode *inode)
                new_fl |= S_NOATIME;
        if (flags & EXT4_DIRSYNC_FL)
                new_fl |= S_DIRSYNC;
-       set_mask_bits(&inode->i_flags,
-                     S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+       inode_set_flags(inode, new_fl,
+                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 }
 
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4154,11 +4171,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
 
-       inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
-       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-               if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-                       inode->i_version |=
-                       (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+       if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+               inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+               if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+                       if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+                               inode->i_version |=
+                   (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+               }
        }
 
        ret = 0;
@@ -4328,8 +4347,7 @@ static int ext4_do_update_inode(handle_t *handle,
                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
-       if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-           cpu_to_le32(EXT4_OS_HURD))
+       if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4374,12 +4392,15 @@ static int ext4_do_update_inode(handle_t *handle,
                        raw_inode->i_block[block] = ei->i_data[block];
        }
 
-       raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
-       if (ei->i_extra_isize) {
-               if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-                       raw_inode->i_version_hi =
-                       cpu_to_le32(inode->i_version >> 32);
-               raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+       if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+               raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+               if (ei->i_extra_isize) {
+                       if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+                               raw_inode->i_version_hi =
+                                       cpu_to_le32(inode->i_version >> 32);
+                       raw_inode->i_extra_isize =
+                               cpu_to_le16(ei->i_extra_isize);
+               }
        }
 
        ext4_inode_csum_set(inode, raw_inode, ei);
@@ -4446,7 +4467,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
                        return -EIO;
                }
 
-               if (wbc->sync_mode != WB_SYNC_ALL)
+               /*
+                * No need to force transaction in WB_SYNC_NONE mode. Also
+                * ext4_sync_fs() will force the commit after everything is
+                * written.
+                */
+               if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                        return 0;
 
                err = ext4_force_commit(inode->i_sb);
@@ -4456,7 +4482,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
                err = __ext4_get_inode_loc(inode, &iloc, 0);
                if (err)
                        return err;
-               if (wbc->sync_mode == WB_SYNC_ALL)
+               /*
+                * sync(2) will flush the whole buffer cache. No need to do
+                * it here separately for each inode.
+                */
+               if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
                        EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
index a2a837f..0f2252e 100644 (file)
@@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
        struct ext4_inode_info *ei_bl;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
-               err = -EINVAL;
-               goto swap_boot_out;
-       }
+       if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+               return -EINVAL;
 
-       if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
-               err = -EPERM;
-               goto swap_boot_out;
-       }
+       if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+               return -EPERM;
 
        inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
-       if (IS_ERR(inode_bl)) {
-               err = PTR_ERR(inode_bl);
-               goto swap_boot_out;
-       }
+       if (IS_ERR(inode_bl))
+               return PTR_ERR(inode_bl);
        ei_bl = EXT4_I(inode_bl);
 
        filemap_flush(inode->i_mapping);
@@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
                        ext4_mark_inode_dirty(handle, inode);
                }
        }
-
        ext4_journal_stop(handle);
-
        ext4_double_up_write_data_sem(inode, inode_bl);
 
 journal_err_out:
        ext4_inode_resume_unlocked_dio(inode);
        ext4_inode_resume_unlocked_dio(inode_bl);
-
        unlock_two_nondirectories(inode, inode_bl);
-
        iput(inode_bl);
-
-swap_boot_out:
        return err;
 }
 
index 04a5c75..a888cac 100644 (file)
@@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
        ext4_lock_group(ac->ac_sb, group);
        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
                             ac->ac_g_ex.fe_len, &ex);
+       ex.fe_logical = 0xDEADFA11; /* debug value */
 
        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
                ext4_fsblk_t start;
@@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
                         */
                        break;
                }
-
+               ex.fe_logical = 0xDEADC0DE; /* debug value */
                ext4_mb_measure_extent(ac, &ex, e4b);
 
                i += ex.fe_len;
@@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
                        max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
                        if (max >= sbi->s_stripe) {
                                ac->ac_found++;
+                               ex.fe_logical = 0xDEADF00D; /* debug value */
                                ac->ac_b_ex = ex;
                                ext4_mb_use_best_found(ac, e4b);
                                break;
@@ -4006,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
                        (unsigned long)ac->ac_b_ex.fe_len,
                        (unsigned long)ac->ac_b_ex.fe_logical,
                        (int)ac->ac_criteria);
-       ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
-                ac->ac_ex_scanned, ac->ac_found);
+       ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
        ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
        ngroups = ext4_get_groups_count(sb);
        for (i = 0; i < ngroups; i++) {
index 08481ee..d634e18 100644 (file)
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
                }                                                       \
        } while (0)
 #else
-#define mb_debug(n, fmt, a...)
+#define mb_debug(n, fmt, a...)         no_printk(fmt, ## a)
 #endif
 
 #define EXT4_MB_HISTORY_ALLOC          1       /* allocation */
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
        /* copy of the best found extent taken before preallocation efforts */
        struct ext4_free_extent ac_f_ex;
 
-       /* number of iterations done. we have to track to limit searching */
-       unsigned long ac_ex_scanned;
        __u16 ac_groups_scanned;
        __u16 ac_found;
        __u16 ac_tail;
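
The mb_debug() hunk above replaces an empty stub with no_printk(), a standard
kernel idiom: the call still compiles to nothing when mballoc debugging is
disabled, but the compiler keeps type-checking the format string and its
arguments. A minimal sketch of the idiom for a hypothetical subsystem
(foo_debug and FOO_DEBUG are illustrative names):

/* Sketch: debug macro that compiles away but still checks printf-style args. */
#ifdef FOO_DEBUG
#define foo_debug(fmt, a...)   printk(KERN_DEBUG "foo: " fmt, ## a)
#else
#define foo_debug(fmt, a...)   no_printk(fmt, ## a)
#endif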
index 773b503..58ee7dc 100644 (file)
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
  * ext4_ext_path structure refers to the last extent, or a negative error
  * value on failure.
  */
-static int
+int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                      struct ext4_extent **extent)
 {
@@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
                        }
                        if (!buffer_mapped(bh)) {
                                zero_user(page, block_start, blocksize);
-                               if (!err)
-                                       set_buffer_uptodate(bh);
+                               set_buffer_uptodate(bh);
                                continue;
                        }
                }
index 710fed2..f3c6670 100644 (file)
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
 static struct ext4_lazy_init *ext4_li_info;
 static struct mutex ext4_li_mtx;
 static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
                invalidate_bdev(sbi->journal_bdev);
                ext4_blkdev_remove(sbi);
        }
+       if (sbi->s_mb_cache) {
+               ext4_xattr_destroy_cache(sbi->s_mb_cache);
+               sbi->s_mb_cache = NULL;
+       }
        if (sbi->s_mmp_tsk)
                kthread_stop(sbi->s_mmp_tsk);
        sb->s_fs_info = NULL;
@@ -940,7 +945,7 @@ static void init_once(void *foo)
        inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
        ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
                                             sizeof(struct ext4_inode_info),
@@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                       "feature flags set on rev 0 fs, "
                       "running e2fsck is recommended");
 
+       if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+               set_opt2(sb, HURD_COMPAT);
+               if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+                                             EXT4_FEATURE_INCOMPAT_64BIT)) {
+                       ext4_msg(sb, KERN_ERR,
+                                "The Hurd can't support 64-bit file systems");
+                       goto failed_mount;
+               }
+       }
+
        if (IS_EXT2_SB(sb)) {
                if (ext2_feature_set_ok(sb))
                        ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -4010,6 +4025,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
 
 no_journal:
+       if (ext4_mballoc_ready) {
+               sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+               if (!sbi->s_mb_cache) {
+                       ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+                       goto failed_mount_wq;
+               }
+       }
+
        /*
         * Get the # of file system overhead blocks from the
         * superblock if present.
@@ -4835,6 +4858,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                }
 
                if (*flags & MS_RDONLY) {
+                       err = sync_filesystem(sb);
+                       if (err < 0)
+                               goto restore_opts;
                        err = dquot_suspend(sb, -1);
                        if (err < 0)
                                goto restore_opts;
@@ -5515,12 +5541,10 @@ static int __init ext4_init_fs(void)
                goto out4;
 
        err = ext4_init_mballoc();
-       if (err)
-               goto out3;
-
-       err = ext4_init_xattr();
        if (err)
                goto out2;
+       else
+               ext4_mballoc_ready = 1;
        err = init_inodecache();
        if (err)
                goto out1;
@@ -5536,10 +5560,9 @@ out:
        unregister_as_ext3();
        destroy_inodecache();
 out1:
-       ext4_exit_xattr();
-out2:
+       ext4_mballoc_ready = 0;
        ext4_exit_mballoc();
-out3:
+out2:
        ext4_exit_feat_adverts();
 out4:
        if (ext4_proc_root)
@@ -5562,7 +5585,6 @@ static void __exit ext4_exit_fs(void)
        unregister_as_ext3();
        unregister_filesystem(&ext4_fs_type);
        destroy_inodecache();
-       ext4_exit_xattr();
        ext4_exit_mballoc();
        ext4_exit_feat_adverts();
        remove_proc_entry("fs/ext4", NULL);
index e175e94..1f5cf58 100644 (file)
@@ -81,7 +81,7 @@
 # define ea_bdebug(bh, fmt, ...)       no_printk(fmt, ##__VA_ARGS__)
 #endif
 
-static void ext4_xattr_cache_insert(struct buffer_head *);
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
 static struct buffer_head *ext4_xattr_cache_find(struct inode *,
                                                 struct ext4_xattr_header *,
                                                 struct mb_cache_entry **);
@@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
 static int ext4_xattr_list(struct dentry *dentry, char *buffer,
                           size_t buffer_size);
 
-static struct mb_cache *ext4_xattr_cache;
-
 static const struct xattr_handler *ext4_xattr_handler_map[] = {
        [EXT4_XATTR_INDEX_USER]              = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
        NULL
 };
 
+#define EXT4_GET_MB_CACHE(inode)       (((struct ext4_sb_info *) \
+                               inode->i_sb->s_fs_info)->s_mb_cache)
+
 static __le32 ext4_xattr_block_csum(struct inode *inode,
                                    sector_t block_nr,
                                    struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
        struct ext4_xattr_entry *entry;
        size_t size;
        int error;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
        ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
                  name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
                error = -EIO;
                goto cleanup;
        }
-       ext4_xattr_cache_insert(bh);
+       ext4_xattr_cache_insert(ext4_mb_cache, bh);
        entry = BFIRST(bh);
        error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
        if (error == -EIO)
@@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        struct inode *inode = dentry->d_inode;
        struct buffer_head *bh = NULL;
        int error;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
        ea_idebug(inode, "buffer=%p, buffer_size=%ld",
                  buffer, (long)buffer_size);
@@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                error = -EIO;
                goto cleanup;
        }
-       ext4_xattr_cache_insert(bh);
+       ext4_xattr_cache_insert(ext4_mb_cache, bh);
        error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
@@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 {
        struct mb_cache_entry *ce = NULL;
        int error = 0;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
-       ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+       ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
        error = ext4_journal_get_write_access(handle, bh);
        if (error)
                goto out;
@@ -567,12 +571,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
                                    size_t *min_offs, void *base, int *total)
 {
        for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
-               *total += EXT4_XATTR_LEN(last->e_name_len);
                if (!last->e_value_block && last->e_value_size) {
                        size_t offs = le16_to_cpu(last->e_value_offs);
                        if (offs < *min_offs)
                                *min_offs = offs;
                }
+               if (total)
+                       *total += EXT4_XATTR_LEN(last->e_name_len);
        }
        return (*min_offs - ((void *)last - base) - sizeof(__u32));
 }
@@ -745,13 +750,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
        struct ext4_xattr_search *s = &bs->s;
        struct mb_cache_entry *ce = NULL;
        int error = 0;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
        if (i->value && i->value_len > sb->s_blocksize)
                return -ENOSPC;
        if (s->base) {
-               ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+               ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
                                        bs->bh->b_blocknr);
                error = ext4_journal_get_write_access(handle, bs->bh);
                if (error)
@@ -769,7 +775,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                                if (!IS_LAST_ENTRY(s->first))
                                        ext4_xattr_rehash(header(s->base),
                                                          s->here);
-                               ext4_xattr_cache_insert(bs->bh);
+                               ext4_xattr_cache_insert(ext4_mb_cache,
+                                       bs->bh);
                        }
                        unlock_buffer(bs->bh);
                        if (error == -EIO)
@@ -905,7 +912,7 @@ getblk_failed:
                        memcpy(new_bh->b_data, s->base, new_bh->b_size);
                        set_buffer_uptodate(new_bh);
                        unlock_buffer(new_bh);
-                       ext4_xattr_cache_insert(new_bh);
+                       ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
                        error = ext4_handle_dirty_xattr_block(handle,
                                                              inode, new_bh);
                        if (error)
@@ -1228,7 +1235,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
        struct ext4_xattr_block_find *bs = NULL;
        char *buffer = NULL, *b_entry_name = NULL;
        size_t min_offs, free;
-       int total_ino, total_blk;
+       int total_ino;
        void *base, *start, *end;
        int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
        int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1293,7 @@ retry:
                first = BFIRST(bh);
                end = bh->b_data + bh->b_size;
                min_offs = end - base;
-               free = ext4_xattr_free_space(first, &min_offs, base,
-                                            &total_blk);
+               free = ext4_xattr_free_space(first, &min_offs, base, NULL);
                if (free < new_extra_isize) {
                        if (!tried_min_extra_isize && s_min_extra_isize) {
                                tried_min_extra_isize++;
@@ -1495,13 +1501,13 @@ ext4_xattr_put_super(struct super_block *sb)
  * Returns 0, or a negative error number on failure.
  */
 static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
 {
        __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
        struct mb_cache_entry *ce;
        int error;
 
-       ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+       ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
        if (!ce) {
                ea_bdebug(bh, "out of memory");
                return;
@@ -1573,12 +1579,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
 {
        __u32 hash = le32_to_cpu(header->h_hash);
        struct mb_cache_entry *ce;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
        if (!header->h_hash)
                return NULL;  /* never share */
        ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-       ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+       ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
                                       hash);
        while (ce) {
                struct buffer_head *bh;
@@ -1676,19 +1683,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
 
 #undef BLOCK_HASH_SHIFT
 
-int __init
-ext4_init_xattr(void)
+#define        HASH_BUCKET_BITS        10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
 {
-       ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
-       if (!ext4_xattr_cache)
-               return -ENOMEM;
-       return 0;
+       return mb_cache_create(name, HASH_BUCKET_BITS);
 }
 
-void
-ext4_exit_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
 {
-       if (ext4_xattr_cache)
-               mb_cache_destroy(ext4_xattr_cache);
-       ext4_xattr_cache = NULL;
+       if (cache)
+               mb_cache_destroy(cache);
 }
+
index 819d639..29bedf5 100644 (file)
@@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
                            struct ext4_inode *raw_inode, handle_t *handle);
 
-extern int __init ext4_init_xattr(void);
-extern void ext4_exit_xattr(void);
-
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
 extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
                                       struct ext4_xattr_info *i,
                                       struct ext4_xattr_ibody_find *is);
 
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
+
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
                              struct inode *dir, const struct qstr *qstr);
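
With ext4_xattr_create_cache()/ext4_xattr_destroy_cache() declared here, the
xattr block cache is owned by each mounted filesystem instead of being a single
module-wide cache (see the ext4_fill_super()/ext4_put_super() hunks earlier).
A rough sketch of the same pattern for a hypothetical filesystem; foofs_sb_info
and the foofs_* names are illustrative only:

/* Sketch: per-superblock mb_cache lifetime, mirroring the ext4 change above. */
static int foofs_create_xattr_cache(struct super_block *sb)
{
        struct foofs_sb_info *sbi = sb->s_fs_info;      /* hypothetical sb info */

        sbi->s_mb_cache = mb_cache_create(sb->s_id, 10); /* 2^10 hash buckets */
        return sbi->s_mb_cache ? 0 : -ENOMEM;
}

static void foofs_destroy_xattr_cache(struct super_block *sb)
{
        struct foofs_sb_info *sbi = sb->s_fs_info;

        if (sbi->s_mb_cache) {
                mb_cache_destroy(sbi->s_mb_cache);
                sbi->s_mb_cache = NULL;
        }
}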
index 1a85f83..856bdf9 100644 (file)
@@ -568,6 +568,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
        struct f2fs_mount_info org_mount_opt;
        int err, active_logs;
 
+       sync_filesystem(sb);
+
        /*
         * Save the old mount options in case we
         * need to restore them.
index c68d9f2..b3361fe 100644 (file)
@@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
 
+       sync_filesystem(sb);
+
        /* make sure we update state on remount. */
        new_rdonly = *flags & MS_RDONLY;
        if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
index e37eb27..7ca8c75 100644 (file)
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 
 static int vxfs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_RDONLY;
        return 0;
 }
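
The one-line sync_filesystem(sb) addition seen in the remount handlers above
and below follows from pushing the sync out of the generic remount path and
into each filesystem, so every ->remount_fs() now flushes dirty state itself
before it starts changing options. A sketch of the resulting shape for a
hypothetical filesystem (foofs_parse_options is an illustrative helper):

/* Sketch: remount handler once sync_filesystem() lives in the filesystem. */
static int foofs_remount(struct super_block *sb, int *flags, char *data)
{
        sync_filesystem(sb);            /* flush dirty data and metadata first */
        return foofs_parse_options(sb, data);   /* then apply the new options */
}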
index b4bff1b..8d61169 100644 (file)
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
 
 static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        if (*flags & MS_MANDLOCK)
                return -EINVAL;
 
index 033ee97..de8afad 100644 (file)
@@ -1167,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
        struct gfs2_tune *gt = &sdp->sd_tune;
        int error;
 
+       sync_filesystem(sb);
+
        spin_lock(&gt->gt_spin);
        args.ar_commit = gt->gt_logd_secs;
        args.ar_quota_quantum = gt->gt_quota_quantum;
index 2d2039e..eee7206 100644 (file)
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int hfs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_NODIRATIME;
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
index a6abf87..a513d2d 100644 (file)
@@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
        if (!(*flags & MS_RDONLY)) {
index 4534ff6..fe3463a 100644 (file)
@@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
        struct hpfs_sb_info *sbi = hpfs_sb(s);
        char *new_opts = kstrdup(data, GFP_KERNEL);
        
+       sync_filesystem(s);
+
        *flags |= MS_NOATIME;
        
        hpfs_lock(s);
index fb59ba7..f96d2a6 100644 (file)
@@ -1898,3 +1898,34 @@ void inode_dio_done(struct inode *inode)
                wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
 }
 EXPORT_SYMBOL(inode_dio_done);
+
+/*
+ * inode_set_flags - atomically set some inode flags
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that
+ * they have exclusive access to the inode structure (i.e., while the
+ * inode is being instantiated).  The reason for the cmpxchg() loop
+ * --- which wouldn't be necessary if all code paths which modify
+ * i_flags actually followed this rule --- is that there is at least one
+ * code path which doesn't today --- for example,
+ * __generic_file_aio_write() calls file_remove_suid() without holding
+ * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ *
+ * In the long run, i_mutex is overkill, and we should probably look
+ * at using the i_lock spinlock to protect i_flags, and then make sure
+ * it is so documented in include/linux/fs.h and that all code follows
+ * the locking convention!!
+ */
+void inode_set_flags(struct inode *inode, unsigned int flags,
+                    unsigned int mask)
+{
+       unsigned int old_flags, new_flags;
+
+       WARN_ON_ONCE(flags & ~mask);
+       do {
+               old_flags = ACCESS_ONCE(inode->i_flags);
+               new_flags = (old_flags & ~mask) | flags;
+       } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
+                                 new_flags) != old_flags));
+}
+EXPORT_SYMBOL(inode_set_flags);
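
A usage sketch for the new inode_set_flags() helper, mirroring the
ext4_set_inode_flags() hunk earlier in this series; the FOOFS_* on-disk flag
bits are hypothetical:

/* Sketch: translate hypothetical on-disk flags into i_flags atomically. */
static void foofs_set_inode_flags(struct inode *inode, unsigned int disk_flags)
{
        unsigned int new_fl = 0;

        if (disk_flags & FOOFS_SYNC_FL)         /* hypothetical on-disk bit */
                new_fl |= S_SYNC;
        if (disk_flags & FOOFS_NOATIME_FL)      /* hypothetical on-disk bit */
                new_fl |= S_NOATIME;

        /* Only the bits named in the mask are replaced; the rest are untouched. */
        inode_set_flags(inode, new_fl, S_SYNC | S_NOATIME);
}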
index 4a9e10e..6af66ee 100644 (file)
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
 
 static int isofs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        if (!(*flags & MS_RDONLY))
                return -EROFS;
        return 0;
index cf2fc05..5f26139 100644 (file)
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        blk_start_plug(&plug);
        jbd2_journal_write_revoke_records(journal, commit_transaction,
                                          &log_bufs, WRITE_SYNC);
-       blk_finish_plug(&plug);
 
        jbd_debug(3, "JBD2: commit phase 2b\n");
 
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        err = 0;
        bufs = 0;
        descriptor = NULL;
-       blk_start_plug(&plug);
        while (commit_transaction->t_buffers) {
 
                /* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
                goto restart_loop;
        }
 
+       /* Add the transaction to the checkpoint list.
+        * __journal_remove_checkpoint() cannot destroy the transaction
+        * under us because it is not yet marked as T_FINISHED. */
+       if (journal->j_checkpoint_transactions == NULL) {
+               journal->j_checkpoint_transactions = commit_transaction;
+               commit_transaction->t_cpnext = commit_transaction;
+               commit_transaction->t_cpprev = commit_transaction;
+       } else {
+               commit_transaction->t_cpnext =
+                       journal->j_checkpoint_transactions;
+               commit_transaction->t_cpprev =
+                       commit_transaction->t_cpnext->t_cpprev;
+               commit_transaction->t_cpnext->t_cpprev =
+                       commit_transaction;
+               commit_transaction->t_cpprev->t_cpnext =
+                               commit_transaction;
+       }
+       spin_unlock(&journal->j_list_lock);
+
        /* Done with this transaction! */
 
        jbd_debug(3, "JBD2: commit phase 7\n");
@@ -1085,24 +1102,7 @@ restart_loop:
                atomic_read(&commit_transaction->t_handle_count);
        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
                             commit_transaction->t_tid, &stats.run);
-
-       /*
-        * Calculate overall stats
-        */
-       spin_lock(&journal->j_history_lock);
-       journal->j_stats.ts_tid++;
-       if (commit_transaction->t_requested)
-               journal->j_stats.ts_requested++;
-       journal->j_stats.run.rs_wait += stats.run.rs_wait;
-       journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
-       journal->j_stats.run.rs_running += stats.run.rs_running;
-       journal->j_stats.run.rs_locked += stats.run.rs_locked;
-       journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
-       journal->j_stats.run.rs_logging += stats.run.rs_logging;
-       journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
-       journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
-       journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
-       spin_unlock(&journal->j_history_lock);
+       stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
 
        commit_transaction->t_state = T_COMMIT_CALLBACK;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
 
        write_unlock(&journal->j_state_lock);
 
-       if (journal->j_checkpoint_transactions == NULL) {
-               journal->j_checkpoint_transactions = commit_transaction;
-               commit_transaction->t_cpnext = commit_transaction;
-               commit_transaction->t_cpprev = commit_transaction;
-       } else {
-               commit_transaction->t_cpnext =
-                       journal->j_checkpoint_transactions;
-               commit_transaction->t_cpprev =
-                       commit_transaction->t_cpnext->t_cpprev;
-               commit_transaction->t_cpnext->t_cpprev =
-                       commit_transaction;
-               commit_transaction->t_cpprev->t_cpnext =
-                               commit_transaction;
-       }
-       spin_unlock(&journal->j_list_lock);
-       /* Drop all spin_locks because commit_callback may be block.
-        * __journal_remove_checkpoint() can not destroy transaction
-        * under us because it is not marked as T_FINISHED yet */
        if (journal->j_commit_callback)
                journal->j_commit_callback(journal, commit_transaction);
 
@@ -1150,7 +1132,7 @@ restart_loop:
        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        commit_transaction->t_state = T_FINISHED;
-       /* Recheck checkpoint lists after j_list_lock was dropped */
+       /* Check if the transaction can be dropped now that we are finished */
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
        spin_unlock(&journal->j_list_lock);
        write_unlock(&journal->j_state_lock);
        wake_up(&journal->j_wait_done_commit);
+
+       /*
+        * Calculate overall stats
+        */
+       spin_lock(&journal->j_history_lock);
+       journal->j_stats.ts_tid++;
+       journal->j_stats.ts_requested += stats.ts_requested;
+       journal->j_stats.run.rs_wait += stats.run.rs_wait;
+       journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
+       journal->j_stats.run.rs_running += stats.run.rs_running;
+       journal->j_stats.run.rs_locked += stats.run.rs_locked;
+       journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+       journal->j_stats.run.rs_logging += stats.run.rs_logging;
+       journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+       journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+       journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
+       spin_unlock(&journal->j_history_lock);
 }
index 5fa344a..67b8e30 100644 (file)
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
 #endif
 
 /* Checksumming functions */
-int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 {
        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
                return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
        return cpu_to_be32(csum);
 }
 
-int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
 {
        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
                return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
        return sb->s_checksum == jbd2_superblock_csum(j, sb);
 }
 
-void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
 {
        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
                return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
        journal->j_flags |= JBD2_UNMOUNT;
 
        while (journal->j_task) {
-               wake_up(&journal->j_wait_commit);
                write_unlock(&journal->j_state_lock);
+               wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
                write_lock(&journal->j_state_lock);
        }
@@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
        while (tid_gt(tid, journal->j_commit_sequence)) {
                jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
                                  tid, journal->j_commit_sequence);
-               wake_up(&journal->j_wait_commit);
                read_unlock(&journal->j_state_lock);
+               wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit,
                                !tid_gt(tid, journal->j_commit_sequence));
                read_lock(&journal->j_state_lock);
index 60bb365..38cfcf5 100644 (file)
@@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
         * reused here.
         */
        jbd_lock_bh_state(bh);
-       spin_lock(&journal->j_list_lock);
        J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
                jh->b_transaction == NULL ||
                (jh->b_transaction == journal->j_committing_transaction &&
@@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
                jh->b_modified = 0;
 
                JBUFFER_TRACE(jh, "file as BJ_Reserved");
+               spin_lock(&journal->j_list_lock);
                __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
        } else if (jh->b_transaction == journal->j_committing_transaction) {
                /* first access by this transaction */
                jh->b_modified = 0;
 
                JBUFFER_TRACE(jh, "set next transaction");
+               spin_lock(&journal->j_list_lock);
                jh->b_next_transaction = transaction;
        }
        spin_unlock(&journal->j_list_lock);
@@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
                             journal->j_running_transaction)) {
                        printk(KERN_ERR "JBD2: %s: "
                               "jh->b_transaction (%llu, %p, %u) != "
-                              "journal->j_running_transaction (%p, %u)",
+                              "journal->j_running_transaction (%p, %u)\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
                               jh->b_transaction,
@@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
         */
        if (jh->b_transaction != transaction) {
                JBUFFER_TRACE(jh, "already on other transaction");
-               if (unlikely(jh->b_transaction !=
-                            journal->j_committing_transaction)) {
-                       printk(KERN_ERR "JBD2: %s: "
-                              "jh->b_transaction (%llu, %p, %u) != "
-                              "journal->j_committing_transaction (%p, %u)",
+               if (unlikely(((jh->b_transaction !=
+                              journal->j_committing_transaction)) ||
+                            (jh->b_next_transaction != transaction))) {
+                       printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
+                              "bad jh for block %llu: "
+                              "transaction (%p, %u), "
+                              "jh->b_transaction (%p, %u), "
+                              "jh->b_next_transaction (%p, %u), jlist %u\n",
                               journal->j_devname,
                               (unsigned long long) bh->b_blocknr,
+                              transaction, transaction->t_tid,
                               jh->b_transaction,
-                              jh->b_transaction ? jh->b_transaction->t_tid : 0,
-                              journal->j_committing_transaction,
-                              journal->j_committing_transaction ?
-                              journal->j_committing_transaction->t_tid : 0);
-                       ret = -EINVAL;
-               }
-               if (unlikely(jh->b_next_transaction != transaction)) {
-                       printk(KERN_ERR "JBD2: %s: "
-                              "jh->b_next_transaction (%llu, %p, %u) != "
-                              "transaction (%p, %u)",
-                              journal->j_devname,
-                              (unsigned long long) bh->b_blocknr,
+                              jh->b_transaction ?
+                              jh->b_transaction->t_tid : 0,
                               jh->b_next_transaction,
                               jh->b_next_transaction ?
                               jh->b_next_transaction->t_tid : 0,
-                              transaction, transaction->t_tid);
+                              jh->b_jlist);
+                       WARN_ON(1);
                        ret = -EINVAL;
                }
                /* And this case is illegal: we can't reuse another
@@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
        BUFFER_TRACE(bh, "entry");
 
        jbd_lock_bh_state(bh);
-       spin_lock(&journal->j_list_lock);
 
        if (!buffer_jbd(bh))
                goto not_jbd;
@@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                 * we know to remove the checkpoint after we commit.
                 */
 
+               spin_lock(&journal->j_list_lock);
                if (jh->b_cp_transaction) {
                        __jbd2_journal_temp_unlink_buffer(jh);
                        __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
                                goto drop;
                        }
                }
+               spin_unlock(&journal->j_list_lock);
        } else if (jh->b_transaction) {
                J_ASSERT_JH(jh, (jh->b_transaction ==
                                 journal->j_committing_transaction));
@@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 
                if (jh->b_next_transaction) {
                        J_ASSERT(jh->b_next_transaction == transaction);
+                       spin_lock(&journal->j_list_lock);
                        jh->b_next_transaction = NULL;
+                       spin_unlock(&journal->j_list_lock);
 
                        /*
                         * only drop a reference if this transaction modified
@@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
        }
 
 not_jbd:
-       spin_unlock(&journal->j_list_lock);
        jbd_unlock_bh_state(bh);
        __brelse(bh);
 drop:
@@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
        if (buffer_locked(bh) || buffer_dirty(bh))
                goto out;
 
-       if (jh->b_next_transaction != NULL)
+       if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
                goto out;
 
        spin_lock(&journal->j_list_lock);
-       if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+       if (jh->b_cp_transaction != NULL) {
                /* written-back checkpointed metadata buffer */
                JBUFFER_TRACE(jh, "remove from checkpoint list");
                __jbd2_journal_remove_checkpoint(jh);
index 0defb1c..0918f0e 100644 (file)
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
        struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
        int err;
 
+       sync_filesystem(sb);
        err = jffs2_parse_options(c, data);
        if (err)
                return -EINVAL;
index e2b7483..97f7fda 100644 (file)
@@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
        int flag = JFS_SBI(sb)->flag;
        int ret;
 
+       sync_filesystem(sb);
        if (!parse_options(data, sb, &newLVSize, &flag)) {
                return -EINVAL;
        }
index e519e45..bf166e3 100644 (file)
  * back on the lru list.
  */
 
+/*
+ * Lock descriptions and usage:
+ *
+ * Each hash chain of both the block and index hash tables now contains
+ * a built-in lock used to serialize accesses to the hash chain.
+ *
+ * Accesses to global data structures mb_cache_list and mb_cache_lru_list
+ * are serialized via the global spinlock mb_cache_spinlock.
+ *
+ * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
+ * accesses to its local data, such as e_used and e_queued.
+ *
+ * Lock ordering:
+ *
+ * Each block hash chain's lock has the highest lock order, followed by an
+ * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
+ * lock), and mb_cache_spinlock, with the lowest order.  While holding
+ * either a block or index hash chain lock, a thread can acquire
+ * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
+ *
+ * Synchronization:
+ *
+ * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
+ * index hash chains, they need to lock the corresponding hash chain.  For each
+ * mb_cache_entry within the chain, they also lock the mb_cache_entry itself to
+ * prevent any simultaneous release or free of the entry and to serialize
+ * accesses to its e_used and e_queued members.
+ *
+ * To avoid having a dangling reference to an already freed
+ * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
+ * block hash chain and no longer referenced, i.e. both e_used and
+ * e_queued are 0.  When an mb_cache_entry is explicitly freed, it is
+ * first removed from its block hash chain.
+ */
+
 #include <linux/kernel.h>
 #include <linux/module.h>
 
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
-#include <linux/init.h>
+#include <linux/list_bl.h>
 #include <linux/mbcache.h>
-
+#include <linux/init.h>
+#include <linux/blockgroup_lock.h>
 
 #ifdef MB_CACHE_DEBUG
 # define mb_debug(f...) do { \
 
 #define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
 
+#define MB_CACHE_ENTRY_LOCK_BITS       __builtin_log2(NR_BG_LOCKS)
+#define        MB_CACHE_ENTRY_LOCK_INDEX(ce)                   \
+       (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
+
 static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
-               
+static struct blockgroup_lock *mb_cache_bg_lock;
+static struct kmem_cache *mb_cache_kmem_cache;
+
 MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
 MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
 MODULE_LICENSE("GPL");
@@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
 
+static inline void
+__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
+{
+       spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
+               MB_CACHE_ENTRY_LOCK_INDEX(ce)));
+}
+
+static inline void
+__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
+{
+       spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
+               MB_CACHE_ENTRY_LOCK_INDEX(ce)));
+}
+
 static inline int
-__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
+__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
 {
-       return !list_empty(&ce->e_block_list);
+       return !hlist_bl_unhashed(&ce->e_block_list);
 }
 
 
-static void
-__mb_cache_entry_unhash(struct mb_cache_entry *ce)
+static inline void
+__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
 {
-       if (__mb_cache_entry_is_hashed(ce)) {
-               list_del_init(&ce->e_block_list);
-               list_del(&ce->e_index.o_list);
-       }
+       if (__mb_cache_entry_is_block_hashed(ce))
+               hlist_bl_del_init(&ce->e_block_list);
 }
 
+static inline int
+__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+{
+       return !hlist_bl_unhashed(&ce->e_index.o_list);
+}
+
+static inline void
+__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
+{
+       if (__mb_cache_entry_is_index_hashed(ce))
+               hlist_bl_del_init(&ce->e_index.o_list);
+}
+
+/*
+ * __mb_cache_entry_unhash_unlock()
+ *
+ * This function is called to unhash the entry from both the block and
+ * index hash chains.
+ * It assumes both hash chains are locked upon entry,
+ * and it unlocks both of them on exit.
+ */
+static inline void
+__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
+{
+       __mb_cache_entry_unhash_index(ce);
+       hlist_bl_unlock(ce->e_index_hash_p);
+       __mb_cache_entry_unhash_block(ce);
+       hlist_bl_unlock(ce->e_block_hash_p);
+}
 
 static void
 __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 {
        struct mb_cache *cache = ce->e_cache;
 
-       mb_assert(!(ce->e_used || ce->e_queued));
+       mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
        kmem_cache_free(cache->c_entry_cache, ce);
        atomic_dec(&cache->c_entry_count);
 }
 
-
 static void
-__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
-       __releases(mb_cache_spinlock)
+__mb_cache_entry_release(struct mb_cache_entry *ce)
 {
+       /* First lock the entry to serialize access to its local data. */
+       __spin_lock_mb_cache_entry(ce);
        /* Wake up all processes queuing for this cache entry. */
        if (ce->e_queued)
                wake_up_all(&mb_cache_queue);
        if (ce->e_used >= MB_CACHE_WRITER)
                ce->e_used -= MB_CACHE_WRITER;
+       /*
+        * Make sure that all cache entries on the lru_list have
+        * both e_used and e_queued equal to 0.
+        */
        ce->e_used--;
-       if (!(ce->e_used || ce->e_queued)) {
-               if (!__mb_cache_entry_is_hashed(ce))
+       if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
+               if (!__mb_cache_entry_is_block_hashed(ce)) {
+                       __spin_unlock_mb_cache_entry(ce);
                        goto forget;
-               mb_assert(list_empty(&ce->e_lru_list));
-               list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
+               }
+               /*
+                * Need access to lru list, first drop entry lock,
+                * then reacquire the lock in the proper order.
+                */
+               spin_lock(&mb_cache_spinlock);
+               if (list_empty(&ce->e_lru_list))
+                       list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
+               spin_unlock(&mb_cache_spinlock);
        }
-       spin_unlock(&mb_cache_spinlock);
+       __spin_unlock_mb_cache_entry(ce);
        return;
 forget:
-       spin_unlock(&mb_cache_spinlock);
+       mb_assert(list_empty(&ce->e_lru_list));
        __mb_cache_entry_forget(ce, GFP_KERNEL);
 }
 
-
 /*
  * mb_cache_shrink_scan()  memory pressure callback
  *
@@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 
        mb_debug("trying to free %d entries", nr_to_scan);
        spin_lock(&mb_cache_spinlock);
-       while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
+       while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
                struct mb_cache_entry *ce =
                        list_entry(mb_cache_lru_list.next,
-                                  struct mb_cache_entry, e_lru_list);
-               list_move_tail(&ce->e_lru_list, &free_list);
-               __mb_cache_entry_unhash(ce);
-               freed++;
+                               struct mb_cache_entry, e_lru_list);
+               list_del_init(&ce->e_lru_list);
+               if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
+                       continue;
+               spin_unlock(&mb_cache_spinlock);
+               /* Prevent any find or get operation on the entry */
+               hlist_bl_lock(ce->e_block_hash_p);
+               hlist_bl_lock(ce->e_index_hash_p);
+               /* Ignore if it is touched by a find/get */
+               if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
+                       !list_empty(&ce->e_lru_list)) {
+                       hlist_bl_unlock(ce->e_index_hash_p);
+                       hlist_bl_unlock(ce->e_block_hash_p);
+                       spin_lock(&mb_cache_spinlock);
+                       continue;
+               }
+               __mb_cache_entry_unhash_unlock(ce);
+               list_add_tail(&ce->e_lru_list, &free_list);
+               spin_lock(&mb_cache_spinlock);
        }
        spin_unlock(&mb_cache_spinlock);
+
        list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
                __mb_cache_entry_forget(entry, gfp_mask);
+               freed++;
        }
        return freed;
 }
@@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits)
        int n, bucket_count = 1 << bucket_bits;
        struct mb_cache *cache = NULL;
 
+       if (!mb_cache_bg_lock) {
+               mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
+                       GFP_KERNEL);
+               if (!mb_cache_bg_lock)
+                       return NULL;
+               bgl_lock_init(mb_cache_bg_lock);
+       }
+
        cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
        if (!cache)
                return NULL;
        cache->c_name = name;
        atomic_set(&cache->c_entry_count, 0);
        cache->c_bucket_bits = bucket_bits;
-       cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
-                                     GFP_KERNEL);
+       cache->c_block_hash = kmalloc(bucket_count *
+               sizeof(struct hlist_bl_head), GFP_KERNEL);
        if (!cache->c_block_hash)
                goto fail;
        for (n=0; n<bucket_count; n++)
-               INIT_LIST_HEAD(&cache->c_block_hash[n]);
-       cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
-                                     GFP_KERNEL);
+               INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
+       cache->c_index_hash = kmalloc(bucket_count *
+               sizeof(struct hlist_bl_head), GFP_KERNEL);
        if (!cache->c_index_hash)
                goto fail;
        for (n=0; n<bucket_count; n++)
-               INIT_LIST_HEAD(&cache->c_index_hash[n]);
-       cache->c_entry_cache = kmem_cache_create(name,
-               sizeof(struct mb_cache_entry), 0,
-               SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
-       if (!cache->c_entry_cache)
-               goto fail2;
+               INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
+       if (!mb_cache_kmem_cache) {
+               mb_cache_kmem_cache = kmem_cache_create(name,
+                       sizeof(struct mb_cache_entry), 0,
+                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+               if (!mb_cache_kmem_cache)
+                       goto fail2;
+       }
+       cache->c_entry_cache = mb_cache_kmem_cache;
 
        /*
         * Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +395,47 @@ void
 mb_cache_shrink(struct block_device *bdev)
 {
        LIST_HEAD(free_list);
-       struct list_head *l, *ltmp;
+       struct list_head *l;
+       struct mb_cache_entry *ce, *tmp;
 
+       l = &mb_cache_lru_list;
        spin_lock(&mb_cache_spinlock);
-       list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
-               struct mb_cache_entry *ce =
-                       list_entry(l, struct mb_cache_entry, e_lru_list);
+       while (!list_is_last(l, &mb_cache_lru_list)) {
+               l = l->next;
+               ce = list_entry(l, struct mb_cache_entry, e_lru_list);
                if (ce->e_bdev == bdev) {
-                       list_move_tail(&ce->e_lru_list, &free_list);
-                       __mb_cache_entry_unhash(ce);
+                       list_del_init(&ce->e_lru_list);
+                       if (ce->e_used || ce->e_queued ||
+                               atomic_read(&ce->e_refcnt))
+                               continue;
+                       spin_unlock(&mb_cache_spinlock);
+                       /*
+                        * Prevent any find or get operation on the entry.
+                        */
+                       hlist_bl_lock(ce->e_block_hash_p);
+                       hlist_bl_lock(ce->e_index_hash_p);
+                       /* Ignore if it is touched by a find/get */
+                       if (ce->e_used || ce->e_queued ||
+                               atomic_read(&ce->e_refcnt) ||
+                               !list_empty(&ce->e_lru_list)) {
+                               hlist_bl_unlock(ce->e_index_hash_p);
+                               hlist_bl_unlock(ce->e_block_hash_p);
+                               l = &mb_cache_lru_list;
+                               spin_lock(&mb_cache_spinlock);
+                               continue;
+                       }
+                       __mb_cache_entry_unhash_unlock(ce);
+                       mb_assert(!(ce->e_used || ce->e_queued ||
+                               atomic_read(&ce->e_refcnt)));
+                       list_add_tail(&ce->e_lru_list, &free_list);
+                       l = &mb_cache_lru_list;
+                       spin_lock(&mb_cache_spinlock);
                }
        }
        spin_unlock(&mb_cache_spinlock);
-       list_for_each_safe(l, ltmp, &free_list) {
-               __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-                                                  e_lru_list), GFP_KERNEL);
+
+       list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
+               __mb_cache_entry_forget(ce, GFP_KERNEL);
        }
 }
 
@@ -303,23 +451,27 @@ void
 mb_cache_destroy(struct mb_cache *cache)
 {
        LIST_HEAD(free_list);
-       struct list_head *l, *ltmp;
+       struct mb_cache_entry *ce, *tmp;
 
        spin_lock(&mb_cache_spinlock);
-       list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
-               struct mb_cache_entry *ce =
-                       list_entry(l, struct mb_cache_entry, e_lru_list);
-               if (ce->e_cache == cache) {
+       list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
+               if (ce->e_cache == cache)
                        list_move_tail(&ce->e_lru_list, &free_list);
-                       __mb_cache_entry_unhash(ce);
-               }
        }
        list_del(&cache->c_cache_list);
        spin_unlock(&mb_cache_spinlock);
 
-       list_for_each_safe(l, ltmp, &free_list) {
-               __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
-                                                  e_lru_list), GFP_KERNEL);
+       list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
+               list_del_init(&ce->e_lru_list);
+               /*
+                * Prevent any find or get operation on the entry.
+                */
+               hlist_bl_lock(ce->e_block_hash_p);
+               hlist_bl_lock(ce->e_index_hash_p);
+               mb_assert(!(ce->e_used || ce->e_queued ||
+                       atomic_read(&ce->e_refcnt)));
+               __mb_cache_entry_unhash_unlock(ce);
+               __mb_cache_entry_forget(ce, GFP_KERNEL);
        }
 
        if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache)
                          atomic_read(&cache->c_entry_count));
        }
 
-       kmem_cache_destroy(cache->c_entry_cache);
-
+       if (list_empty(&mb_cache_list)) {
+               kmem_cache_destroy(mb_cache_kmem_cache);
+               mb_cache_kmem_cache = NULL;
+       }
        kfree(cache->c_index_hash);
        kfree(cache->c_block_hash);
        kfree(cache);
@@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache)
 struct mb_cache_entry *
 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 {
-       struct mb_cache_entry *ce = NULL;
+       struct mb_cache_entry *ce;
 
        if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+               struct list_head *l;
+
+               l = &mb_cache_lru_list;
                spin_lock(&mb_cache_spinlock);
-               if (!list_empty(&mb_cache_lru_list)) {
-                       ce = list_entry(mb_cache_lru_list.next,
-                                       struct mb_cache_entry, e_lru_list);
-                       list_del_init(&ce->e_lru_list);
-                       __mb_cache_entry_unhash(ce);
+               while (!list_is_last(l, &mb_cache_lru_list)) {
+                       l = l->next;
+                       ce = list_entry(l, struct mb_cache_entry, e_lru_list);
+                       if (ce->e_cache == cache) {
+                               list_del_init(&ce->e_lru_list);
+                               if (ce->e_used || ce->e_queued ||
+                                       atomic_read(&ce->e_refcnt))
+                                       continue;
+                               spin_unlock(&mb_cache_spinlock);
+                               /*
+                                * Prevent any find or get operation on the
+                                * entry.
+                                */
+                               hlist_bl_lock(ce->e_block_hash_p);
+                               hlist_bl_lock(ce->e_index_hash_p);
+                               /* Ignore if it is touched by a find/get */
+                               if (ce->e_used || ce->e_queued ||
+                                       atomic_read(&ce->e_refcnt) ||
+                                       !list_empty(&ce->e_lru_list)) {
+                                       hlist_bl_unlock(ce->e_index_hash_p);
+                                       hlist_bl_unlock(ce->e_block_hash_p);
+                                       l = &mb_cache_lru_list;
+                                       spin_lock(&mb_cache_spinlock);
+                                       continue;
+                               }
+                               mb_assert(list_empty(&ce->e_lru_list));
+                               mb_assert(!(ce->e_used || ce->e_queued ||
+                                       atomic_read(&ce->e_refcnt)));
+                               __mb_cache_entry_unhash_unlock(ce);
+                               goto found;
+                       }
                }
                spin_unlock(&mb_cache_spinlock);
        }
-       if (!ce) {
-               ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
-               if (!ce)
-                       return NULL;
-               atomic_inc(&cache->c_entry_count);
-               INIT_LIST_HEAD(&ce->e_lru_list);
-               INIT_LIST_HEAD(&ce->e_block_list);
-               ce->e_cache = cache;
-               ce->e_queued = 0;
-       }
+
+       ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+       if (!ce)
+               return NULL;
+       atomic_inc(&cache->c_entry_count);
+       INIT_LIST_HEAD(&ce->e_lru_list);
+       INIT_HLIST_BL_NODE(&ce->e_block_list);
+       INIT_HLIST_BL_NODE(&ce->e_index.o_list);
+       ce->e_cache = cache;
+       ce->e_queued = 0;
+       atomic_set(&ce->e_refcnt, 0);
+found:
+       ce->e_block_hash_p = &cache->c_block_hash[0];
+       ce->e_index_hash_p = &cache->c_index_hash[0];
        ce->e_used = 1 + MB_CACHE_WRITER;
        return ce;
 }
@@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
 {
        struct mb_cache *cache = ce->e_cache;
        unsigned int bucket;
-       struct list_head *l;
-       int error = -EBUSY;
+       struct hlist_bl_node *l;
+       struct hlist_bl_head *block_hash_p;
+       struct hlist_bl_head *index_hash_p;
+       struct mb_cache_entry *lce;
 
+       mb_assert(ce);
        bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 
                           cache->c_bucket_bits);
-       spin_lock(&mb_cache_spinlock);
-       list_for_each_prev(l, &cache->c_block_hash[bucket]) {
-               struct mb_cache_entry *ce =
-                       list_entry(l, struct mb_cache_entry, e_block_list);
-               if (ce->e_bdev == bdev && ce->e_block == block)
-                       goto out;
+       block_hash_p = &cache->c_block_hash[bucket];
+       hlist_bl_lock(block_hash_p);
+       hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
+               if (lce->e_bdev == bdev && lce->e_block == block) {
+                       hlist_bl_unlock(block_hash_p);
+                       return -EBUSY;
+               }
        }
-       __mb_cache_entry_unhash(ce);
+       mb_assert(!__mb_cache_entry_is_block_hashed(ce));
+       __mb_cache_entry_unhash_block(ce);
+       __mb_cache_entry_unhash_index(ce);
        ce->e_bdev = bdev;
        ce->e_block = block;
-       list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
+       ce->e_block_hash_p = block_hash_p;
        ce->e_index.o_key = key;
+       hlist_bl_add_head(&ce->e_block_list, block_hash_p);
+       hlist_bl_unlock(block_hash_p);
        bucket = hash_long(key, cache->c_bucket_bits);
-       list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
-       error = 0;
-out:
-       spin_unlock(&mb_cache_spinlock);
-       return error;
+       index_hash_p = &cache->c_index_hash[bucket];
+       hlist_bl_lock(index_hash_p);
+       ce->e_index_hash_p = index_hash_p;
+       hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
+       hlist_bl_unlock(index_hash_p);
+       return 0;
 }
 
 
@@ -429,24 +625,26 @@ out:
 void
 mb_cache_entry_release(struct mb_cache_entry *ce)
 {
-       spin_lock(&mb_cache_spinlock);
-       __mb_cache_entry_release_unlock(ce);
+       __mb_cache_entry_release(ce);
 }
 
 
 /*
  * mb_cache_entry_free()
  *
- * This is equivalent to the sequence mb_cache_entry_takeout() --
- * mb_cache_entry_release().
  */
 void
 mb_cache_entry_free(struct mb_cache_entry *ce)
 {
-       spin_lock(&mb_cache_spinlock);
+       mb_assert(ce);
        mb_assert(list_empty(&ce->e_lru_list));
-       __mb_cache_entry_unhash(ce);
-       __mb_cache_entry_release_unlock(ce);
+       hlist_bl_lock(ce->e_index_hash_p);
+       __mb_cache_entry_unhash_index(ce);
+       hlist_bl_unlock(ce->e_index_hash_p);
+       hlist_bl_lock(ce->e_block_hash_p);
+       __mb_cache_entry_unhash_block(ce);
+       hlist_bl_unlock(ce->e_block_hash_p);
+       __mb_cache_entry_release(ce);
 }
 
 
@@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
                   sector_t block)
 {
        unsigned int bucket;
-       struct list_head *l;
+       struct hlist_bl_node *l;
        struct mb_cache_entry *ce;
+       struct hlist_bl_head *block_hash_p;
 
        bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
                           cache->c_bucket_bits);
-       spin_lock(&mb_cache_spinlock);
-       list_for_each(l, &cache->c_block_hash[bucket]) {
-               ce = list_entry(l, struct mb_cache_entry, e_block_list);
+       block_hash_p = &cache->c_block_hash[bucket];
+       /* First serialize access to the hash chain corresponding to the block. */
+       hlist_bl_lock(block_hash_p);
+       hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
+               mb_assert(ce->e_block_hash_p == block_hash_p);
                if (ce->e_bdev == bdev && ce->e_block == block) {
-                       DEFINE_WAIT(wait);
+                       /*
+                        * Prevent a free from removing the entry.
+                        */
+                       atomic_inc(&ce->e_refcnt);
+                       hlist_bl_unlock(block_hash_p);
+                       __spin_lock_mb_cache_entry(ce);
+                       atomic_dec(&ce->e_refcnt);
+                       if (ce->e_used > 0) {
+                               DEFINE_WAIT(wait);
+                               while (ce->e_used > 0) {
+                                       ce->e_queued++;
+                                       prepare_to_wait(&mb_cache_queue, &wait,
+                                                       TASK_UNINTERRUPTIBLE);
+                                       __spin_unlock_mb_cache_entry(ce);
+                                       schedule();
+                                       __spin_lock_mb_cache_entry(ce);
+                                       ce->e_queued--;
+                               }
+                               finish_wait(&mb_cache_queue, &wait);
+                       }
+                       ce->e_used += 1 + MB_CACHE_WRITER;
+                       __spin_unlock_mb_cache_entry(ce);
 
-                       if (!list_empty(&ce->e_lru_list))
+                       if (!list_empty(&ce->e_lru_list)) {
+                               spin_lock(&mb_cache_spinlock);
                                list_del_init(&ce->e_lru_list);
-
-                       while (ce->e_used > 0) {
-                               ce->e_queued++;
-                               prepare_to_wait(&mb_cache_queue, &wait,
-                                               TASK_UNINTERRUPTIBLE);
                                spin_unlock(&mb_cache_spinlock);
-                               schedule();
-                               spin_lock(&mb_cache_spinlock);
-                               ce->e_queued--;
                        }
-                       finish_wait(&mb_cache_queue, &wait);
-                       ce->e_used += 1 + MB_CACHE_WRITER;
-
-                       if (!__mb_cache_entry_is_hashed(ce)) {
-                               __mb_cache_entry_release_unlock(ce);
+                       if (!__mb_cache_entry_is_block_hashed(ce)) {
+                               __mb_cache_entry_release(ce);
                                return NULL;
                        }
-                       goto cleanup;
+                       return ce;
                }
        }
-       ce = NULL;
-
-cleanup:
-       spin_unlock(&mb_cache_spinlock);
-       return ce;
+       hlist_bl_unlock(block_hash_p);
+       return NULL;
 }
 
 #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
 
 static struct mb_cache_entry *
-__mb_cache_entry_find(struct list_head *l, struct list_head *head,
+__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
                      struct block_device *bdev, unsigned int key)
 {
-       while (l != head) {
+
+       /* The index hash chain is already locked by the caller. */
+       while (l != NULL) {
                struct mb_cache_entry *ce =
-                       list_entry(l, struct mb_cache_entry, e_index.o_list);
+                       hlist_bl_entry(l, struct mb_cache_entry,
+                               e_index.o_list);
+               mb_assert(ce->e_index_hash_p == head);
                if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
-                       DEFINE_WAIT(wait);
-
-                       if (!list_empty(&ce->e_lru_list))
-                               list_del_init(&ce->e_lru_list);
-
+                       /*
+                        * Prevent a free from removing the entry.
+                        */
+                       atomic_inc(&ce->e_refcnt);
+                       hlist_bl_unlock(head);
+                       __spin_lock_mb_cache_entry(ce);
+                       atomic_dec(&ce->e_refcnt);
+                       ce->e_used++;
                        /* Incrementing before holding the lock gives readers
                           priority over writers. */
-                       ce->e_used++;
-                       while (ce->e_used >= MB_CACHE_WRITER) {
-                               ce->e_queued++;
-                               prepare_to_wait(&mb_cache_queue, &wait,
-                                               TASK_UNINTERRUPTIBLE);
-                               spin_unlock(&mb_cache_spinlock);
-                               schedule();
-                               spin_lock(&mb_cache_spinlock);
-                               ce->e_queued--;
+                       if (ce->e_used >= MB_CACHE_WRITER) {
+                               DEFINE_WAIT(wait);
+
+                               while (ce->e_used >= MB_CACHE_WRITER) {
+                                       ce->e_queued++;
+                                       prepare_to_wait(&mb_cache_queue, &wait,
+                                                       TASK_UNINTERRUPTIBLE);
+                                       __spin_unlock_mb_cache_entry(ce);
+                                       schedule();
+                                       __spin_lock_mb_cache_entry(ce);
+                                       ce->e_queued--;
+                               }
+                               finish_wait(&mb_cache_queue, &wait);
                        }
-                       finish_wait(&mb_cache_queue, &wait);
-
-                       if (!__mb_cache_entry_is_hashed(ce)) {
-                               __mb_cache_entry_release_unlock(ce);
+                       __spin_unlock_mb_cache_entry(ce);
+                       if (!list_empty(&ce->e_lru_list)) {
                                spin_lock(&mb_cache_spinlock);
+                               list_del_init(&ce->e_lru_list);
+                               spin_unlock(&mb_cache_spinlock);
+                       }
+                       if (!__mb_cache_entry_is_block_hashed(ce)) {
+                               __mb_cache_entry_release(ce);
                                return ERR_PTR(-EAGAIN);
                        }
                        return ce;
                }
                l = l->next;
        }
+       hlist_bl_unlock(head);
        return NULL;
 }
 
@@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
                          unsigned int key)
 {
        unsigned int bucket = hash_long(key, cache->c_bucket_bits);
-       struct list_head *l;
-       struct mb_cache_entry *ce;
-
-       spin_lock(&mb_cache_spinlock);
-       l = cache->c_index_hash[bucket].next;
-       ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
-       spin_unlock(&mb_cache_spinlock);
+       struct hlist_bl_node *l;
+       struct mb_cache_entry *ce = NULL;
+       struct hlist_bl_head *index_hash_p;
+
+       index_hash_p = &cache->c_index_hash[bucket];
+       hlist_bl_lock(index_hash_p);
+       if (!hlist_bl_empty(index_hash_p)) {
+               l = hlist_bl_first(index_hash_p);
+               ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+       } else
+               hlist_bl_unlock(index_hash_p);
        return ce;
 }
 
@@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
 {
        struct mb_cache *cache = prev->e_cache;
        unsigned int bucket = hash_long(key, cache->c_bucket_bits);
-       struct list_head *l;
+       struct hlist_bl_node *l;
        struct mb_cache_entry *ce;
+       struct hlist_bl_head *index_hash_p;
 
-       spin_lock(&mb_cache_spinlock);
+       index_hash_p = &cache->c_index_hash[bucket];
+       mb_assert(prev->e_index_hash_p == index_hash_p);
+       hlist_bl_lock(index_hash_p);
+       mb_assert(!hlist_bl_empty(index_hash_p));
        l = prev->e_index.o_list.next;
-       ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
-       __mb_cache_entry_release_unlock(prev);
+       ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+       __mb_cache_entry_release(prev);
        return ce;
 }
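
For orientation, here is a rough sketch of how a filesystem drives the mb_cache API reworked above: allocate an entry, insert it, then drop the writer reference. The helper name, the GFP_NOFS choice and the way bdev/block/key are obtained are hypothetical; real callers derive them from on-disk metadata, and the per-chain and per-entry locking shown in the hunks stays entirely internal to mbcache.

/*
 * Hypothetical caller of the reworked mbcache API.  mb_cache_entry_insert()
 * returns -EBUSY when an entry for (bdev, block) is already hashed; on
 * success the entry stays in the cache and we only drop our reference.
 */
static int example_cache_block(struct mb_cache *cache,
			       struct block_device *bdev,
			       sector_t block, unsigned int key)
{
	struct mb_cache_entry *ce;
	int error;

	ce = mb_cache_entry_alloc(cache, GFP_NOFS);
	if (!ce)
		return -ENOMEM;
	error = mb_cache_entry_insert(ce, bdev, block, key);
	if (error)
		mb_cache_entry_free(ce);	/* duplicate entry: discard ours */
	else
		mb_cache_entry_release(ce);	/* entry remains hashed in the cache */
	return error;
}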
 
index 0ad2ec9..f007a33 100644 (file)
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
        struct minix_sb_info * sbi = minix_sb(sb);
        struct minix_super_block * ms;
 
+       sync_filesystem(sb);
        ms = sbi->s_ms;
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
index ee59d35..647d86d 100644 (file)
@@ -99,6 +99,7 @@ static void destroy_inodecache(void)
 
 static int ncp_remount(struct super_block *sb, int *flags, char* data)
 {
+       sync_filesystem(sb);
        *flags |= MS_NODIRATIME;
        return 0;
 }
index 910ed90..2cb5694 100644 (file)
@@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
        struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
        u32 nfsvers = nfss->nfs_client->rpc_ops->version;
 
+       sync_filesystem(sb);
+
        /*
         * Userspace mount programs that send binary options generally send
         * them populated with default values. We have no way to know which
index 7ac2a12..8c532b2 100644 (file)
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        unsigned long old_mount_opt;
        int err;
 
+       sync_filesystem(sb);
        old_sb_flags = sb->s_flags;
        old_mount_opt = nilfs->ns_mount_opt;
 
index 82650d5..bd5610d 100644 (file)
@@ -468,6 +468,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
        ntfs_debug("Entering with remount options string: %s", opt);
 
+       sync_filesystem(sb);
+
 #ifndef NTFS_RW
        /* For read-only compiled driver, enforce read-only flag. */
        *flags |= MS_RDONLY;
index 1aecd62..a7cdd56 100644 (file)
@@ -634,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
        struct ocfs2_super *osb = OCFS2_SB(sb);
        u32 tmp;
 
+       sync_filesystem(sb);
+
        if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
            !ocfs2_check_set_options(sb, &parsed_options)) {
                ret = -EINVAL;
index 8c0ceb8..15e4500 100644 (file)
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
 
 static int openprom_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_NOATIME;
        return 0;
 }
index 7bbeb52..5dbadec 100644 (file)
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
 int proc_remount(struct super_block *sb, int *flags, char *data)
 {
        struct pid_namespace *pid = sb->s_fs_info;
+
+       sync_filesystem(sb);
        return !proc_parse_options(data, pid);
 }
 
index 1282384..192297b 100644 (file)
@@ -249,6 +249,7 @@ static void parse_options(char *options)
 
 static int pstore_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        parse_options(data);
 
        return 0;
index 8955881..c4bcb77 100644 (file)
@@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
 {
        struct qnx4_sb_info *qs;
 
+       sync_filesystem(sb);
        qs = qnx4_sb(sb);
        qs->Version = QNX4_VERSION;
        *flags |= MS_RDONLY;
index 8d941ed..65cdaab 100644 (file)
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
 
 static int qnx6_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_RDONLY;
        return 0;
 }
index ed54a04..9fb2042 100644 (file)
@@ -1318,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        int i;
 #endif
 
+       sync_filesystem(s);
        reiserfs_write_lock(s);
 
 #ifdef CONFIG_QUOTA
index d841878..ef90e8b 100644 (file)
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  */
 static int romfs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_RDONLY;
        return 0;
 }
index 202df63..031c8d6 100644 (file)
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 {
+       sync_filesystem(sb);
        *flags |= MS_RDONLY;
        return 0;
 }
index 80d5cf2..e9dc3c3 100644 (file)
@@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
                }
        }
 
-       sync_filesystem(sb);
-
        if (sb->s_op->remount_fs) {
                retval = sb->s_op->remount_fs(sb, &flags, data);
                if (retval) {
index 5625ca9..8895630 100644 (file)
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
        struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+       sync_filesystem(sb);
        if (sbi->s_forced_ro)
                *flags |= MS_RDONLY;
        return 0;
index 48f943f..a126608 100644 (file)
@@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
        int err;
        struct ubifs_info *c = sb->s_fs_info;
 
+       sync_filesystem(sb);
        dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
 
        err = ubifs_parse_options(c, data, 1);
index 3306b9f..64f2b73 100644 (file)
@@ -646,6 +646,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        int error = 0;
        struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
 
+       sync_filesystem(sb);
        if (lvidiu) {
                int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
                if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
index 329f2f5..b8c6791 100644 (file)
@@ -1280,6 +1280,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
        unsigned new_mount_opt, ufstype;
        unsigned flags;
 
+       sync_filesystem(sb);
        lock_ufs(sb);
        mutex_lock(&UFS_SB(sb)->s_lock);
        uspi = UFS_SB(sb)->s_uspi;
index 0ef5992..2053767 100644 (file)
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
        char                    *p;
        int                     error;
 
+       sync_filesystem(sb);
        while ((p = strsep(&options, ",")) != NULL) {
                int token;
 
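
All of the ->remount_fs() hunks above follow the convention established by the fs/super.c change: the VFS no longer syncs before calling into the filesystem, so each implementation starts with sync_filesystem(sb). A minimal sketch for a hypothetical filesystem follows; examplefs_remount is not a real function, and the MS_RDONLY line is just a stand-in for whatever policy the filesystem enforces.

static int examplefs_remount(struct super_block *sb, int *flags, char *data)
{
	/* Flush dirty data first; do_remount_sb() no longer does this for us. */
	sync_filesystem(sb);
	*flags |= MS_RDONLY;	/* stand-in for this fs's own remount policy */
	return 0;
}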
index a877ed3..ea80f1c 100644 (file)
@@ -2572,6 +2572,9 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 void inode_dio_wait(struct inode *inode);
 void inode_dio_done(struct inode *inode);
 
+extern void inode_set_flags(struct inode *inode, unsigned int flags,
+                           unsigned int mask);
+
 extern const struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
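
The inode_set_flags() helper declared above replaces open-coded read-modify-write updates of inode->i_flags: only the bits within the mask are changed, so concurrent updaters do not clobber each other's bits. A hedged usage sketch, with an arbitrary choice of flags:

	/* Set S_IMMUTABLE and clear S_NOATIME without disturbing other i_flags bits. */
	inode_set_flags(inode, S_IMMUTABLE, S_IMMUTABLE | S_NOATIME);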
index 5525d37..6a392e7 100644 (file)
@@ -3,19 +3,21 @@
 
   (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
-
 struct mb_cache_entry {
        struct list_head                e_lru_list;
        struct mb_cache                 *e_cache;
        unsigned short                  e_used;
        unsigned short                  e_queued;
+       atomic_t                        e_refcnt;
        struct block_device             *e_bdev;
        sector_t                        e_block;
-       struct list_head                e_block_list;
+       struct hlist_bl_node            e_block_list;
        struct {
-               struct list_head        o_list;
+               struct hlist_bl_node    o_list;
                unsigned int            o_key;
        } e_index;
+       struct hlist_bl_head            *e_block_hash_p;
+       struct hlist_bl_head            *e_index_hash_p;
 };
 
 struct mb_cache {
@@ -25,8 +27,8 @@ struct mb_cache {
        int                             c_max_entries;
        int                             c_bucket_bits;
        struct kmem_cache               *c_entry_cache;
-       struct list_head                *c_block_hash;
-       struct list_head                *c_index_hash;
+       struct hlist_bl_head            *c_block_hash;
+       struct hlist_bl_head            *c_index_hash;
 };
 
 /* Functions on caches */
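
The header change above swaps both hash tables from list_head chains guarded by one global spinlock to hlist_bl_head chains, each carrying its own bit spinlock in bit 0 of the head pointer. A self-contained sketch of that per-bucket locking pattern; example_node and example_add are illustrative and not part of mbcache.

#include <linux/list_bl.h>

struct example_node {
	struct hlist_bl_node	link;
	unsigned int		key;
};

static void example_add(struct hlist_bl_head *bucket, struct example_node *n)
{
	hlist_bl_lock(bucket);		/* bit spinlock embedded in bucket->first */
	hlist_bl_add_head(&n->link, bucket);
	hlist_bl_unlock(bucket);
}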
index 197d312..010ea89 100644 (file)
@@ -16,6 +16,15 @@ struct mpage_da_data;
 struct ext4_map_blocks;
 struct extent_status;
 
+/* shim until we merge in the xfs_collapse_range branch */
+#ifndef FALLOC_FL_COLLAPSE_RANGE
+#define FALLOC_FL_COLLAPSE_RANGE       0x08
+#endif
+
+#ifndef FALLOC_FL_ZERO_RANGE
+#define FALLOC_FL_ZERO_RANGE           0x10
+#endif
+
 #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
 
 #define show_mballoc_flags(flags) __print_flags(flags, "|",    \
@@ -68,6 +77,13 @@ struct extent_status;
        { EXTENT_STATUS_DELAYED,        "D" },                  \
        { EXTENT_STATUS_HOLE,           "H" })
 
+#define show_falloc_mode(mode) __print_flags(mode, "|",                \
+       { FALLOC_FL_KEEP_SIZE,          "KEEP_SIZE"},           \
+       { FALLOC_FL_PUNCH_HOLE,         "PUNCH_HOLE"},          \
+       { FALLOC_FL_NO_HIDE_STALE,      "NO_HIDE_STALE"},       \
+       { FALLOC_FL_COLLAPSE_RANGE,     "COLLAPSE_RANGE"},      \
+       { FALLOC_FL_ZERO_RANGE,         "ZERO_RANGE"})
+
 
 TRACE_EVENT(ext4_free_inode,
        TP_PROTO(struct inode *inode),
@@ -1328,7 +1344,7 @@ TRACE_EVENT(ext4_direct_IO_exit,
                  __entry->rw, __entry->ret)
 );
 
-TRACE_EVENT(ext4_fallocate_enter,
+DECLARE_EVENT_CLASS(ext4__fallocate_mode,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
 
        TP_ARGS(inode, offset, len, mode),
@@ -1336,23 +1352,45 @@ TRACE_EVENT(ext4_fallocate_enter,
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
                __field(        ino_t,  ino                     )
-               __field(        loff_t, pos                     )
-               __field(        loff_t, len                     )
+               __field(        loff_t, offset                  )
+               __field(        loff_t, len                     )
                __field(        int,    mode                    )
        ),
 
        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
-               __entry->pos    = offset;
+               __entry->offset = offset;
                __entry->len    = len;
                __entry->mode   = mode;
        ),
 
-       TP_printk("dev %d,%d ino %lu pos %lld len %lld mode %d",
+       TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino, __entry->pos,
-                 __entry->len, __entry->mode)
+                 (unsigned long) __entry->ino,
+                 __entry->offset, __entry->len,
+                 show_falloc_mode(__entry->mode))
+);
+
+DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
+
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+       TP_ARGS(inode, offset, len, mode)
+);
+
+DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
+
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+       TP_ARGS(inode, offset, len, mode)
+);
+
+DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
+
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),
+
+       TP_ARGS(inode, offset, len, mode)
 );
 
 TRACE_EVENT(ext4_fallocate_exit,
@@ -1384,31 +1422,6 @@ TRACE_EVENT(ext4_fallocate_exit,
                  __entry->ret)
 );
 
-TRACE_EVENT(ext4_punch_hole,
-       TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
-
-       TP_ARGS(inode, offset, len),
-
-       TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
-               __field(        loff_t, offset                  )
-               __field(        loff_t, len                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
-               __entry->ino    = inode->i_ino;
-               __entry->offset = offset;
-               __entry->len    = len;
-       ),
-
-       TP_printk("dev %d,%d ino %lu offset %lld len %lld",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long) __entry->ino,
-                 __entry->offset, __entry->len)
-);
-
 TRACE_EVENT(ext4_unlink_enter,
        TP_PROTO(struct inode *parent, struct dentry *dentry),
 
@@ -2410,6 +2423,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
                  __entry->shrunk_nr, __entry->cache_cnt)
 );
 
+TRACE_EVENT(ext4_collapse_range,
+       TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
+
+       TP_ARGS(inode, offset, len),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(loff_t, offset)
+               __field(loff_t, len)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = inode->i_sb->s_dev;
+               __entry->ino    = inode->i_ino;
+               __entry->offset = offset;
+               __entry->len    = len;
+       ),
+
+       TP_printk("dev %d,%d ino %lu offset %lld len %lld",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 __entry->offset, __entry->len)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
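
Because ext4_fallocate_enter, ext4_punch_hole and ext4_zero_range now share the ext4__fallocate_mode class, any further event taking the same arguments needs only a DEFINE_EVENT stanza. For example, a hypothetical ext4_example_range event would be declared as:

DEFINE_EVENT(ext4__fallocate_mode, ext4_example_range,

	TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode),

	TP_ARGS(inode, offset, len, mode)
);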