Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next
[cascardo/linux.git] / fs / fuse / file.c
index 4598345..7e70506 100644 (file)
@@ -334,7 +334,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
 
                BUG_ON(req->inode != inode);
                curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
-               if (curr_index == index) {
+               if (curr_index <= index &&
+                   index < curr_index + req->num_pages) {
                        found = true;
                        break;
                }
@@ -1409,8 +1410,13 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
-       __free_page(req->pages[0]);
-       fuse_file_put(req->ff, false);
+       int i;
+
+       for (i = 0; i < req->num_pages; i++)
+               __free_page(req->pages[i]);
+
+       if (req->ff)
+               fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1418,30 +1424,34 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
        struct inode *inode = req->inode;
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+       int i;
 
        list_del(&req->writepages_entry);
-       dec_bdi_stat(bdi, BDI_WRITEBACK);
-       dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
-       bdi_writeout_inc(bdi);
+       for (i = 0; i < req->num_pages; i++) {
+               dec_bdi_stat(bdi, BDI_WRITEBACK);
+               dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+               bdi_writeout_inc(bdi);
+       }
        wake_up(&fi->page_waitq);
 }
 
 /* Called under fc->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+                               loff_t size)
 __releases(fc->lock)
 __acquires(fc->lock)
 {
        struct fuse_inode *fi = get_fuse_inode(req->inode);
-       loff_t size = i_size_read(req->inode);
        struct fuse_write_in *inarg = &req->misc.write.in;
+       __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
 
        if (!fc->connected)
                goto out_free;
 
-       if (inarg->offset + PAGE_CACHE_SIZE <= size) {
-               inarg->size = PAGE_CACHE_SIZE;
+       if (inarg->offset + data_size <= size) {
+               inarg->size = data_size;
        } else if (inarg->offset < size) {
-               inarg->size = size & (PAGE_CACHE_SIZE - 1);
+               inarg->size = size - inarg->offset;
        } else {
                /* Got truncated off completely */
                goto out_free;
@@ -1472,12 +1482,13 @@ __acquires(fc->lock)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
+       size_t crop = i_size_read(inode);
        struct fuse_req *req;
 
        while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
                req = list_entry(fi->queued_writes.next, struct fuse_req, list);
                list_del_init(&req->list);
-               fuse_send_writepage(fc, req);
+               fuse_send_writepage(fc, req, crop);
        }
 }
 
@@ -1488,12 +1499,62 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 
        mapping_set_error(inode->i_mapping, req->out.h.error);
        spin_lock(&fc->lock);
+       while (req->misc.write.next) {
+               struct fuse_conn *fc = get_fuse_conn(inode);
+               struct fuse_write_in *inarg = &req->misc.write.in;
+               struct fuse_req *next = req->misc.write.next;
+               req->misc.write.next = next->misc.write.next;
+               next->misc.write.next = NULL;
+               next->ff = fuse_file_get(req->ff);
+               list_add(&next->writepages_entry, &fi->writepages);
+
+               /*
+                * Skip fuse_flush_writepages() to make it easy to crop requests
+                * based on primary request size.
+                *
+                * 1st case (trivial): there are no concurrent activities using
+                * fuse_set/release_nowrite.  Then we're on safe side because
+                * fuse_flush_writepages() would call fuse_send_writepage()
+                * anyway.
+                *
+                * 2nd case: someone called fuse_set_nowrite and it is waiting
+                * now for completion of all in-flight requests.  This happens
+                * rarely and no more than once per page, so this should be
+                * okay.
+                *
+                * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
+                * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
+                * that fuse_set_nowrite returned implies that all in-flight
+                * requests were completed along with all of their secondary
+                * requests.  Further primary requests are blocked by negative
+                * writectr.  Hence there cannot be any in-flight requests and
+                * no invocations of fuse_writepage_end() while we're in
+                * fuse_set_nowrite..fuse_release_nowrite section.
+                */
+               fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+       }
        fi->writectr--;
        fuse_writepage_finish(fc, req);
        spin_unlock(&fc->lock);
        fuse_writepage_free(fc, req);
 }
 
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+                                            struct fuse_inode *fi)
+{
+       struct fuse_file *ff = NULL;
+
+       spin_lock(&fc->lock);
+       if (!WARN_ON(list_empty(&fi->write_files))) {
+               ff = list_entry(fi->write_files.next, struct fuse_file,
+                               write_entry);
+               fuse_file_get(ff);
+       }
+       spin_unlock(&fc->lock);
+
+       return ff;
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
        struct address_space *mapping = page->mapping;
@@ -1501,8 +1562,8 @@ static int fuse_writepage_locked(struct page *page)
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
        struct fuse_req *req;
-       struct fuse_file *ff;
        struct page *tmp_page;
+       int error = -ENOMEM;
 
        set_page_writeback(page);
 
@@ -1515,16 +1576,16 @@ static int fuse_writepage_locked(struct page *page)
        if (!tmp_page)
                goto err_free;
 
-       spin_lock(&fc->lock);
-       BUG_ON(list_empty(&fi->write_files));
-       ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
-       req->ff = fuse_file_get(ff);
-       spin_unlock(&fc->lock);
+       error = -EIO;
+       req->ff = fuse_write_file_get(fc, fi);
+       if (!req->ff)
+               goto err_free;
 
-       fuse_write_fill(req, ff, page_offset(page), 0);
+       fuse_write_fill(req, req->ff, page_offset(page), 0);
 
        copy_highpage(tmp_page, page);
        req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+       req->misc.write.next = NULL;
        req->in.argpages = 1;
        req->num_pages = 1;
        req->pages[0] = tmp_page;
@@ -1550,19 +1611,263 @@ err_free:
        fuse_request_free(req);
 err:
        end_page_writeback(page);
-       return -ENOMEM;
+       return error;
 }
 
 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 {
        int err;
 
+       if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+               /*
+                * ->writepages() should be called for sync() and friends.  We
+                * should only get here on direct reclaim and then we are
+                * allowed to skip a page which is already in flight
+                */
+               WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
+               redirty_page_for_writepage(wbc, page);
+               return 0;
+       }
+
        err = fuse_writepage_locked(page);
        unlock_page(page);
 
        return err;
 }
 
+struct fuse_fill_wb_data {
+       struct fuse_req *req;
+       struct fuse_file *ff;
+       struct inode *inode;
+       struct page **orig_pages;
+};
+
+static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+{
+       struct fuse_req *req = data->req;
+       struct inode *inode = data->inode;
+       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_inode *fi = get_fuse_inode(inode);
+       int num_pages = req->num_pages;
+       int i;
+
+       req->ff = fuse_file_get(data->ff);
+       spin_lock(&fc->lock);
+       list_add_tail(&req->list, &fi->queued_writes);
+       fuse_flush_writepages(inode);
+       spin_unlock(&fc->lock);
+
+       for (i = 0; i < num_pages; i++)
+               end_page_writeback(data->orig_pages[i]);
+}
+
+static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+                                    struct page *page)
+{
+       struct fuse_conn *fc = get_fuse_conn(new_req->inode);
+       struct fuse_inode *fi = get_fuse_inode(new_req->inode);
+       struct fuse_req *tmp;
+       struct fuse_req *old_req;
+       bool found = false;
+       pgoff_t curr_index;
+
+       BUG_ON(new_req->num_pages != 0);
+
+       spin_lock(&fc->lock);
+       list_del(&new_req->writepages_entry);
+       list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
+               BUG_ON(old_req->inode != new_req->inode);
+               curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+               if (curr_index <= page->index &&
+                   page->index < curr_index + old_req->num_pages) {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               list_add(&new_req->writepages_entry, &fi->writepages);
+               goto out_unlock;
+       }
+
+       new_req->num_pages = 1;
+       for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
+               BUG_ON(tmp->inode != new_req->inode);
+               curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+               if (tmp->num_pages == 1 &&
+                   curr_index == page->index) {
+                       old_req = tmp;
+               }
+       }
+
+       if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
+                                       old_req->state == FUSE_REQ_PENDING)) {
+               struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+
+               copy_highpage(old_req->pages[0], page);
+               spin_unlock(&fc->lock);
+
+               dec_bdi_stat(bdi, BDI_WRITEBACK);
+               dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+               bdi_writeout_inc(bdi);
+               fuse_writepage_free(fc, new_req);
+               fuse_request_free(new_req);
+               goto out;
+       } else {
+               new_req->misc.write.next = old_req->misc.write.next;
+               old_req->misc.write.next = new_req;
+       }
+out_unlock:
+       spin_unlock(&fc->lock);
+out:
+       return found;
+}
+
+static int fuse_writepages_fill(struct page *page,
+               struct writeback_control *wbc, void *_data)
+{
+       struct fuse_fill_wb_data *data = _data;
+       struct fuse_req *req = data->req;
+       struct inode *inode = data->inode;
+       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct page *tmp_page;
+       bool is_writeback;
+       int err;
+
+       if (!data->ff) {
+               err = -EIO;
+               data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+               if (!data->ff)
+                       goto out_unlock;
+       }
+
+       /*
+        * Being under writeback is unlikely but possible.  For example direct
+        * read to an mmaped fuse file will set the page dirty twice; once when
+        * the pages are faulted with get_user_pages(), and then after the read
+        * completed.
+        */
+       is_writeback = fuse_page_is_writeback(inode, page->index);
+
+       if (req && req->num_pages &&
+           (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+            (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+            data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+               fuse_writepages_send(data);
+               data->req = NULL;
+       }
+       err = -ENOMEM;
+       tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       if (!tmp_page)
+               goto out_unlock;
+
+       /*
+        * The page must not be redirtied until the writeout is completed
+        * (i.e. userspace has sent a reply to the write request).  Otherwise
+        * there could be more than one temporary page instance for each real
+        * page.
+        *
+        * This is ensured by holding the page lock in page_mkwrite() while
+        * checking fuse_page_is_writeback().  We already hold the page lock
+        * since clear_page_dirty_for_io() and keep it held until we add the
+        * request to the fi->writepages list and increment req->num_pages.
+        * After this fuse_page_is_writeback() will indicate that the page is
+        * under writeback, so we can release the page lock.
+        */
+       if (data->req == NULL) {
+               struct fuse_inode *fi = get_fuse_inode(inode);
+
+               err = -ENOMEM;
+               req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+               if (!req) {
+                       __free_page(tmp_page);
+                       goto out_unlock;
+               }
+
+               fuse_write_fill(req, data->ff, page_offset(page), 0);
+               req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+               req->misc.write.next = NULL;
+               req->in.argpages = 1;
+               req->background = 1;
+               req->num_pages = 0;
+               req->end = fuse_writepage_end;
+               req->inode = inode;
+
+               spin_lock(&fc->lock);
+               list_add(&req->writepages_entry, &fi->writepages);
+               spin_unlock(&fc->lock);
+
+               data->req = req;
+       }
+       set_page_writeback(page);
+
+       copy_highpage(tmp_page, page);
+       req->pages[req->num_pages] = tmp_page;
+       req->page_descs[req->num_pages].offset = 0;
+       req->page_descs[req->num_pages].length = PAGE_SIZE;
+
+       inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+       inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+       err = 0;
+       if (is_writeback && fuse_writepage_in_flight(req, page)) {
+               end_page_writeback(page);
+               data->req = NULL;
+               goto out_unlock;
+       }
+       data->orig_pages[req->num_pages] = page;
+
+       /*
+        * Protected by fc->lock against concurrent access by
+        * fuse_page_is_writeback().
+        */
+       spin_lock(&fc->lock);
+       req->num_pages++;
+       spin_unlock(&fc->lock);
+
+out_unlock:
+       unlock_page(page);
+
+       return err;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+                          struct writeback_control *wbc)
+{
+       struct inode *inode = mapping->host;
+       struct fuse_fill_wb_data data;
+       int err;
+
+       err = -EIO;
+       if (is_bad_inode(inode))
+               goto out;
+
+       data.inode = inode;
+       data.req = NULL;
+       data.ff = NULL;
+
+       err = -ENOMEM;
+       data.orig_pages = kzalloc(sizeof(struct page *) *
+                                 FUSE_MAX_PAGES_PER_REQ,
+                                 GFP_NOFS);
+       if (!data.orig_pages)
+               goto out;
+
+       err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+       if (data.req) {
+               /* Ignore errors if we can write at least one page */
+               BUG_ON(!data.req->num_pages);
+               fuse_writepages_send(&data);
+               err = 0;
+       }
+       if (data.ff)
+               fuse_file_put(data.ff, false);
+
+       kfree(data.orig_pages);
+out:
+       return err;
+}
+
 static int fuse_launder_page(struct page *page)
 {
        int err = 0;
@@ -1602,14 +1907,17 @@ static void fuse_vma_close(struct vm_area_struct *vma)
 static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct page *page = vmf->page;
-       /*
-        * Don't use page->mapping as it may become NULL from a
-        * concurrent truncate.
-        */
-       struct inode *inode = vma->vm_file->f_mapping->host;
+       struct inode *inode = file_inode(vma->vm_file);
+
+       file_update_time(vma->vm_file);
+       lock_page(page);
+       if (page->mapping != inode->i_mapping) {
+               unlock_page(page);
+               return VM_FAULT_NOPAGE;
+       }
 
        fuse_wait_on_page_writeback(inode, page->index);
-       return 0;
+       return VM_FAULT_LOCKED;
 }
 
 static const struct vm_operations_struct fuse_file_vm_ops = {
@@ -2581,6 +2889,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 static const struct address_space_operations fuse_file_aops  = {
        .readpage       = fuse_readpage,
        .writepage      = fuse_writepage,
+       .writepages     = fuse_writepages,
        .launder_page   = fuse_launder_page,
        .readpages      = fuse_readpages,
        .set_page_dirty = __set_page_dirty_nobuffers,