writeback: fix race that causes writeback hang
author Junxiao Bi <junxiao.bi@oracle.com>
Wed, 11 Sep 2013 21:23:04 +0000 (14:23 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Sep 2013 22:58:13 +0000 (15:58 -0700)
There is a race between __mark_inode_dirty() and the writeback thread; see
the following scenario.  In this case, the writeback thread will not run even
though there is dirty_io.

__mark_inode_dirty()                                          bdi_writeback_workfn()
...                                                        ...
spin_lock(&inode->i_lock);
...
if (bdi_cap_writeback_dirty(bdi)) {
    <<< assume wb has dirty_io, so wakeup_bdi is false.
    <<< subsequent inode dirtying also leaves wakeup_bdi false.
    if (!wb_has_dirty_io(&bdi->wb))
        wakeup_bdi = true;
}
spin_unlock(&inode->i_lock);
                                                            <<< assume last dirty_io is removed here.
                                                            pages_written = wb_do_writeback(wb);
                                                            ...
                                                            <<< work_list empty and wb has no dirty_io,
                                                            <<< delayed_work will not be queued.
                                                            if (!list_empty(&bdi->work_list) ||
                                                                (wb_has_dirty_io(wb) && dirty_writeback_interval))
                                                                queue_delayed_work(bdi_wq, &wb->dwork,
                                                                    msecs_to_jiffies(dirty_writeback_interval * 10));
spin_lock(&bdi->wb.list_lock);
inode->dirtied_when = jiffies;
<<< new dirty_io is added.
list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);

<<< although there is dirty_io, wakeup_bdi is false,
<<< so the writeback thread will not be woken up and
<<< the new dirty_io will not be flushed.
if (wakeup_bdi)
    bdi_wakeup_thread_delayed(bdi);

Writeback will not run until new flush work is queued.  This may cause
a lot of dirty pages to stay in memory for a long time.

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/fs-writeback.c

index 54b3c31..30f6f27 100644 (file)
@@ -1171,6 +1171,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                        bool wakeup_bdi = false;
                        bdi = inode_to_bdi(inode);
 
+                       spin_unlock(&inode->i_lock);
+                       spin_lock(&bdi->wb.list_lock);
                        if (bdi_cap_writeback_dirty(bdi)) {
                                WARN(!test_bit(BDI_registered, &bdi->state),
                                     "bdi-%s not registered\n", bdi->name);
@@ -1185,8 +1187,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                                        wakeup_bdi = true;
                        }
 
-                       spin_unlock(&inode->i_lock);
-                       spin_lock(&bdi->wb.list_lock);
                        inode->dirtied_when = jiffies;
                        list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
                        spin_unlock(&bdi->wb.list_lock);