writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
[cascardo/linux.git] / mm / filemap.c
index 12548d0..bfc1ab0 100644 (file)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock       (page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
@@ -174,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and be inside a
+ * mem_cgroup_begin_page_stat() section, passing its return value as memcg.
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+                             struct mem_cgroup *memcg)
 {
        struct address_space *mapping = page->mapping;
 
@@ -210,7 +213,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping);
+               account_page_cleaned(page, mapping, memcg,
+                                    inode_to_wb(mapping->host));
 }
 
 /**
@@ -224,14 +228,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
+       struct mem_cgroup *memcg;
+       unsigned long flags;
+
        void (*freepage)(struct page *);
 
        BUG_ON(!PageLocked(page));
 
        freepage = mapping->a_ops->freepage;
-       spin_lock_irq(&mapping->tree_lock);
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
 
        if (freepage)
                freepage(page);
@@ -281,7 +291,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
 
+       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
+       wbc_detach_inode(&wbc);
        return ret;
 }
 
@@ -470,6 +482,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
+               struct mem_cgroup *memcg;
+               unsigned long flags;
 
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
@@ -478,15 +492,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                new->mapping = mapping;
                new->index = offset;
 
-               spin_lock_irq(&mapping->tree_lock);
-               __delete_from_page_cache(old, NULL);
+               memcg = mem_cgroup_begin_page_stat(old);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
                __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                mem_cgroup_migrate(old, new, true);
                radix_tree_preload_end();
                if (freepage)
@@ -1693,7 +1709,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
        loff_t *ppos = &iocb->ki_pos;
        loff_t pos = *ppos;
 
-       if (io_is_direct(file)) {
+       if (iocb->ki_flags & IOCB_DIRECT) {
                struct address_space *mapping = file->f_mapping;
                struct inode *inode = mapping->host;
                size_t count = iov_iter_count(iter);
@@ -1706,7 +1722,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
                                        pos + count - 1);
                if (!retval) {
                        struct iov_iter data = *iter;
-                       retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
+                       retval = mapping->a_ops->direct_IO(iocb, &data, pos);
                }
 
                if (retval > 0) {
@@ -2259,41 +2275,38 @@ EXPORT_SYMBOL(read_cache_page_gfp);
  * Returns appropriate error code that caller should return or
  * zero in case that write should be allowed.
  */
-inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
 {
+       struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        unsigned long limit = rlimit(RLIMIT_FSIZE);
+       loff_t pos;
 
-        if (unlikely(*pos < 0))
-                return -EINVAL;
+       if (!iov_iter_count(from))
+               return 0;
 
-       if (!isblk) {
-               /* FIXME: this is for backwards compatibility with 2.4 */
-               if (file->f_flags & O_APPEND)
-                        *pos = i_size_read(inode);
+       /* FIXME: this is for backwards compatibility with 2.4 */
+       if (iocb->ki_flags & IOCB_APPEND)
+               iocb->ki_pos = i_size_read(inode);
 
-               if (limit != RLIM_INFINITY) {
-                       if (*pos >= limit) {
-                               send_sig(SIGXFSZ, current, 0);
-                               return -EFBIG;
-                       }
-                       if (*count > limit - (typeof(limit))*pos) {
-                               *count = limit - (typeof(limit))*pos;
-                       }
+       pos = iocb->ki_pos;
+
+       if (limit != RLIM_INFINITY) {
+               if (iocb->ki_pos >= limit) {
+                       send_sig(SIGXFSZ, current, 0);
+                       return -EFBIG;
                }
+               iov_iter_truncate(from, limit - (unsigned long)pos);
        }
 
        /*
         * LFS rule
         */
-       if (unlikely(*pos + *count > MAX_NON_LFS &&
+       if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
                                !(file->f_flags & O_LARGEFILE))) {
-               if (*pos >= MAX_NON_LFS) {
+               if (pos >= MAX_NON_LFS)
                        return -EFBIG;
-               }
-               if (*count > MAX_NON_LFS - (unsigned long)*pos) {
-                       *count = MAX_NON_LFS - (unsigned long)*pos;
-               }
+               iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
        }
 
        /*
@@ -2303,34 +2316,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
         * exceeded without writing data we send a signal and return EFBIG.
         * Linus frestrict idea will clean these up nicely..
         */
-       if (likely(!isblk)) {
-               if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
-                       if (*count || *pos > inode->i_sb->s_maxbytes) {
-                               return -EFBIG;
-                       }
-                       /* zero-length writes at ->s_maxbytes are OK */
-               }
+       if (unlikely(pos >= inode->i_sb->s_maxbytes))
+               return -EFBIG;
 
-               if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
-                       *count = inode->i_sb->s_maxbytes - *pos;
-       } else {
-#ifdef CONFIG_BLOCK
-               loff_t isize;
-               if (bdev_read_only(I_BDEV(inode)))
-                       return -EPERM;
-               isize = i_size_read(inode);
-               if (*pos >= isize) {
-                       if (*count || *pos > isize)
-                               return -ENOSPC;
-               }
-
-               if (*pos + *count > isize)
-                       *count = isize - *pos;
-#else
-               return -EPERM;
-#endif
-       }
-       return 0;
+       iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+       return iov_iter_count(from);
 }
 EXPORT_SYMBOL(generic_write_checks);
 
@@ -2394,7 +2384,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
        }
 
        data = *from;
-       written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+       written = mapping->a_ops->direct_IO(iocb, &data, pos);
 
        /*
         * Finally, try again to invalidate clean pages which might have been
@@ -2556,23 +2546,12 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct file *file = iocb->ki_filp;
        struct address_space * mapping = file->f_mapping;
        struct inode    *inode = mapping->host;
-       loff_t          pos = iocb->ki_pos;
        ssize_t         written = 0;
        ssize_t         err;
        ssize_t         status;
-       size_t          count = iov_iter_count(from);
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
-       err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
-       if (err)
-               goto out;
-
-       if (count == 0)
-               goto out;
-
-       iov_iter_truncate(from, count);
-
        err = file_remove_suid(file);
        if (err)
                goto out;
@@ -2581,10 +2560,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (err)
                goto out;
 
-       if (io_is_direct(file)) {
-               loff_t endbyte;
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               loff_t pos, endbyte;
 
-               written = generic_file_direct_write(iocb, from, pos);
+               written = generic_file_direct_write(iocb, from, iocb->ki_pos);
                /*
                 * If the write stopped short of completing, fall back to
                 * buffered writes.  Some filesystems do this for writes to
@@ -2592,13 +2571,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                 * not succeed (even if it did, DAX does not handle dirty
                 * page-cache pages correctly).
                 */
-               if (written < 0 || written == count || IS_DAX(inode))
+               if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
                        goto out;
 
-               pos += written;
-               count -= written;
-
-               status = generic_perform_write(file, from, pos);
+               status = generic_perform_write(file, from, pos = iocb->ki_pos);
                /*
                 * If generic_perform_write() returned a synchronous error
                 * then we want to return the number of bytes which were
@@ -2610,15 +2586,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                        err = status;
                        goto out;
                }
-               iocb->ki_pos = pos + status;
                /*
                 * We need to ensure that the page cache pages are written to
                 * disk and invalidated to preserve the expected O_DIRECT
                 * semantics.
                 */
                endbyte = pos + status - 1;
-               err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+               err = filemap_write_and_wait_range(mapping, pos, endbyte);
                if (err == 0) {
+                       iocb->ki_pos = endbyte + 1;
                        written += status;
                        invalidate_mapping_pages(mapping,
                                                 pos >> PAGE_CACHE_SHIFT,
@@ -2630,9 +2606,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
                         */
                }
        } else {
-               written = generic_perform_write(file, from, pos);
-               if (likely(written >= 0))
-                       iocb->ki_pos = pos + written;
+               written = generic_perform_write(file, from, iocb->ki_pos);
+               if (likely(written > 0))
+                       iocb->ki_pos += written;
        }
 out:
        current->backing_dev_info = NULL;
@@ -2656,7 +2632,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        ssize_t ret;
 
        mutex_lock(&inode->i_mutex);
-       ret = __generic_file_write_iter(iocb, from);
+       ret = generic_write_checks(iocb, from);
+       if (ret > 0)
+               ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);
 
        if (ret > 0) {