void bio_init(struct bio *bio)
{
memset(bio, 0, sizeof(*bio));
- bio->bi_flags = 1 << BIO_UPTODATE;
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
}
__bio_free(bio);
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags | (1 << BIO_UPTODATE);
+ bio->bi_flags = flags;
atomic_set(&bio->__bi_remaining, 1);
}
EXPORT_SYMBOL(bio_reset);
-static void bio_chain_endio(struct bio *bio, int error)
+static void bio_chain_endio(struct bio *bio)
{
- bio_endio(bio->bi_private, error);
+ struct bio *parent = bio->bi_private;
+
+ parent->bi_error = bio->bi_error;
+ bio_endio(parent);
bio_put(bio);
}
*/
static inline void bio_inc_remaining(struct bio *bio)
{
- bio->bi_flags |= (1 << BIO_CHAIN);
+ bio_set_flag(bio, BIO_CHAIN);
smp_mb__before_atomic();
atomic_inc(&bio->__bi_remaining);
}
if (unlikely(!bvl))
goto err_free;
- bio->bi_flags |= 1 << BIO_OWNS_VEC;
+ bio_set_flag(bio, BIO_OWNS_VEC);
} else if (nr_iovecs) {
bvl = bio->bi_inline_vecs;
}
* so we don't set nor calculate new physical/hw segment counts here
*/
bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_flags |= 1 << BIO_CLONED;
+ bio_set_flag(bio, BIO_CLONED);
bio->bi_rw = bio_src->bi_rw;
bio->bi_iter = bio_src->bi_iter;
bio->bi_io_vec = bio_src->bi_io_vec;
EXPORT_SYMBOL(bio_clone_bioset);
/**
- * bio_get_nr_vecs - return approx number of vecs
- * @bdev: I/O target
+ * bio_add_pc_page - attempt to add page to bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
*
- * Return the approximate number of pages we can send to this target.
- * There's no guarantee that you will be able to fit this number of pages
- * into a bio, it does not account for dynamic restrictions that vary
- * on offset.
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by REQ_PC bios.
*/
-int bio_get_nr_vecs(struct block_device *bdev)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = min_t(unsigned,
- queue_max_segments(q),
- queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
-
- return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
-
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors)
+int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
+ *page, unsigned int len, unsigned int offset)
{
int retried_segments = 0;
struct bio_vec *bvec;
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+ if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
return 0;
/*
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
- unsigned int prev_bv_len = prev->bv_len;
prev->bv_len += len;
-
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- /* prev_bvec is already charged in
- bi_size, discharge it in order to
- simulate merging updated prev_bvec
- as new bvec. */
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = bio->bi_iter.bi_size -
- prev_bv_len,
- .bi_rw = bio->bi_rw,
- };
-
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
- prev->bv_len -= len;
- return 0;
- }
- }
-
bio->bi_iter.bi_size += len;
goto done;
}
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
- bvec_gap_to_prev(prev, offset))
+ if (bvec_gap_to_prev(q, prev, offset))
return 0;
}
blk_recount_segments(q, bio);
}
- /*
- * if queue has other restrictions (eg varying max sector size
- * depending on offset), it can specify a merge_bvec_fn in the
- * queue to get further control
- */
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_iter.bi_sector,
- .bi_size = bio->bi_iter.bi_size - len,
- .bi_rw = bio->bi_rw,
- };
-
- /*
- * merge_bvec_fn() returns number of bytes it can accept
- * at this offset
- */
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
- goto failed;
- }
-
/* If we may be able to merge these biovecs, force a recount */
if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+ bio_clear_flag(bio, BIO_SEG_VALID);
done:
return len;
blk_recount_segments(q, bio);
return 0;
}
-
-/**
- * bio_add_pc_page - attempt to add page to bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- *
- * This should only be used by REQ_PC bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- return __bio_add_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q));
-}
EXPORT_SYMBOL(bio_add_pc_page);
/**
* @len: vec entry length
* @offset: vec entry offset
*
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
+ * Attempt to add a page to the bio_vec maplist. This will only fail
+ * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
*/
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
- unsigned int offset)
+int bio_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int max_sectors;
+ struct bio_vec *bv;
+
+ /*
+ * cloned bio must not modify vec list
+ */
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return 0;
- max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
- if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size)
- max_sectors = len >> 9;
+ /*
+ * For filesystems with a blocksize smaller than the pagesize
+ * we will often be called with the same page as last time and
+ * a consecutive offset. Optimize this special case.
+ */
+ if (bio->bi_vcnt > 0) {
+ bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
- return __bio_add_page(q, bio, page, len, offset, max_sectors);
+ if (page == bv->bv_page &&
+ offset == bv->bv_offset + bv->bv_len) {
+ bv->bv_len += len;
+ goto done;
+ }
+ }
+
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return 0;
+
+ bv = &bio->bi_io_vec[bio->bi_vcnt];
+ bv->bv_page = page;
+ bv->bv_len = len;
+ bv->bv_offset = offset;
+
+ bio->bi_vcnt++;
+done:
+ bio->bi_iter.bi_size += len;
+ return len;
}
EXPORT_SYMBOL(bio_add_page);
int error;
};
-static void submit_bio_wait_endio(struct bio *bio, int error)
+static void submit_bio_wait_endio(struct bio *bio)
{
struct submit_bio_ret *ret = bio->bi_private;
- ret->error = error;
+ ret->error = bio->bi_error;
complete(&ret->event);
}
if (iter->type & WRITE)
bio->bi_rw |= REQ_WRITE;
- bio->bi_flags |= (1 << BIO_USER_MAPPED);
+ bio_set_flag(bio, BIO_USER_MAPPED);
/*
* subtle -- if __bio_map_user() ended up bouncing a bio,
}
EXPORT_SYMBOL(bio_unmap_user);
-static void bio_map_kern_endio(struct bio *bio, int err)
+static void bio_map_kern_endio(struct bio *bio)
{
bio_put(bio);
}
}
EXPORT_SYMBOL(bio_map_kern);
-static void bio_copy_kern_endio(struct bio *bio, int err)
+static void bio_copy_kern_endio(struct bio *bio)
{
bio_free_pages(bio);
bio_put(bio);
}
-static void bio_copy_kern_endio_read(struct bio *bio, int err)
+static void bio_copy_kern_endio_read(struct bio *bio)
{
char *p = bio->bi_private;
struct bio_vec *bvec;
p += bvec->bv_len;
}
- bio_copy_kern_endio(bio, err);
+ bio_copy_kern_endio(bio);
}
/**
BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
if (atomic_dec_and_test(&bio->__bi_remaining)) {
- clear_bit(BIO_CHAIN, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_CHAIN);
return true;
}
/**
* bio_endio - end I/O on a bio
* @bio: bio
- * @error: error, if any
*
* Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
+ * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
+ * way to end I/O on a bio. No one should call bi_end_io() directly on a
+ * bio unless they own it and thus know that it has an end_io function.
**/
-void bio_endio(struct bio *bio, int error)
+void bio_endio(struct bio *bio)
{
while (bio) {
- if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
-
if (unlikely(!bio_remaining_done(bio)))
break;
*/
if (bio->bi_end_io == bio_chain_endio) {
struct bio *parent = bio->bi_private;
+ parent->bi_error = bio->bi_error;
bio_put(bio);
bio = parent;
} else {
if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
+ bio->bi_end_io(bio);
bio = NULL;
}
}
if (offset == 0 && size == bio->bi_iter.bi_size)
return;
- clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+ bio_clear_flag(bio, BIO_SEG_VALID);
bio_advance(bio, offset << 9);
get_io_context_active(ioc);
bio->bi_ioc = ioc;
- bio->bi_css = task_get_css(current, blkio_cgrp_id);
+ bio->bi_css = task_get_css(current, io_cgrp_id);
return 0;
}
EXPORT_SYMBOL_GPL(bio_associate_current);
unsigned int nbytes, int error)
{
if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
+ bio->bi_error = error;
if (unlikely(rq->cmd_flags & REQ_QUIET))
- set_bit(BIO_QUIET, &bio->bi_flags);
+ bio_set_flag(bio, BIO_QUIET);
bio_advance(bio, nbytes);
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
- bio_endio(bio, error);
+ bio_endio(bio);
}
void blk_dump_rq_flags(struct request *rq, char *msg)
if (q->id < 0)
goto fail_q;
+ q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!q->bio_split)
+ goto fail_id;
+
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
err = bdi_init(&q->backing_dev_info);
if (err)
- goto fail_id;
+ goto fail_split;
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
fail_bdi:
bdi_destroy(&q->backing_dev_info);
+fail_split:
+ bioset_free(q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
fail_q:
struct request *req;
unsigned int request_count = 0;
+ blk_queue_split(q, &bio, q->bio_split);
+
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_endio(bio, -EIO);
+ bio->bi_error = -EIO;
+ bio_endio(bio);
return;
}
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
- bio_endio(bio, PTR_ERR(req)); /* @q is dead */
+ bio->bi_error = PTR_ERR(req);
+ bio_endio(bio);
goto out_unlock;
}
goto end_io;
}
- if (likely(bio_is_rw(bio) &&
- nr_sectors > queue_max_hw_sectors(q))) {
- printk(KERN_ERR "bio too big device %s (%u > %u)\n",
- bdevname(bio->bi_bdev, b),
- bio_sectors(bio),
- queue_max_hw_sectors(q));
- goto end_io;
- }
-
part = bio->bi_bdev->bd_part;
if (should_fail_request(part, bio->bi_iter.bi_size) ||
should_fail_request(&part_to_disk(part)->part0,
*/
create_io_context(GFP_ATOMIC, q->node);
- if (blk_throtl_bio(q, bio))
- return false; /* throttled, will be resubmitted later */
+ if (!blkcg_bio_issue_check(q, bio))
+ return false;
trace_block_bio_queue(q, bio);
return true;
end_io:
- bio_endio(bio, err);
+ bio->bi_error = err;
+ bio_endio(bio);
return false;
}
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
struct request *flush_rq;
+
+ /*
+ * flush_rq shares tag with this rq, both can't be active
+ * at the same time
+ */
+ struct request *orig_rq;
spinlock_t mq_flush_lock;
};
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
- extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
- static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
- {
- return false;
- }
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
- unsigned int single_wait:1;
- unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
static inline struct inode *wb_inode(struct list_head *head)
{
- return list_entry(head, struct inode, i_wb_list);
+ return list_entry(head, struct inode, i_io_list);
}
/*
}
/**
- * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
* @inode: inode to be moved
* @wb: target bdi_writeback
* @head: one of @wb->b_{dirty|io|more_io}
*
- * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
* Returns %true if @inode is the first occupant of the !dirty_time IO
* lists; otherwise, %false.
*/
-static bool inode_wb_list_move_locked(struct inode *inode,
+static bool inode_io_list_move_locked(struct inode *inode,
struct bdi_writeback *wb,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
- list_move(&inode->i_wb_list, head);
+ list_move(&inode->i_io_list, head);
/* dirty_time doesn't count as dirty_io until expiration */
if (head != &wb->b_dirty_time)
}
/**
- * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
* @inode: inode to be removed
* @wb: bdi_writeback @inode is being removed from
*
* Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
* clear %WB_has_dirty_io if all are empty afterwards.
*/
-static void inode_wb_list_del_locked(struct inode *inode,
+static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
- list_del_init(&inode->i_wb_list);
+ list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- trace_writeback_queue(wb->bdi, work);
+ trace_writeback_queue(wb, work);
spin_lock_bh(&wb->work_lock);
- if (!test_bit(WB_registered, &wb->state)) {
- if (work->single_wait)
- work->single_done = 1;
+ if (!test_bit(WB_registered, &wb->state))
goto out_unlock;
- }
if (work->done)
atomic_inc(&work->done->cnt);
list_add_tail(&work->list, &wb->work_list);
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
- * the inode and we shouldn't modify ->i_wb_list.
+ * the inode and we shouldn't modify ->i_io_list.
*/
if (unlikely(inode->i_state & I_FREEING))
goto skip_switch;
* is always correct including from ->b_dirty_time. The transfer
* preserves @inode->dirtied_when ordering.
*/
- if (!list_empty(&inode->i_wb_list)) {
+ if (!list_empty(&inode->i_io_list)) {
struct inode *pos;
- inode_wb_list_del_locked(inode, old_wb);
+ inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
- list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
if (time_after_eq(inode->dirtied_when,
pos->dirtied_when))
break;
- inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
} else {
inode->i_wb = new_wb;
}
/**
* inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
}
EXPORT_SYMBOL_GPL(inode_congested);
- /**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's. The caller must have set @work->single_wait before
- * issuing it. This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
- static void wb_wait_for_single_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
- {
- if (WARN_ON_ONCE(!work->single_wait))
- return;
-
- wait_event(bdi->wb_waitq, work->single_done);
-
- /*
- * Paired with smp_wmb() in wb_do_writeback() and ensures that all
- * modifications to @work prior to assertion of ->single_done is
- * visible to the caller once this function returns.
- */
- smp_rmb();
- }
-
/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
}
- /**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb. If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned. In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion. @base_work never is.
- */
- static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
- struct wb_writeback_work *base_work)
- {
- struct wb_writeback_work *work;
-
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- *work = *base_work;
- work->auto_free = 1;
- work->single_wait = 0;
- } else {
- work = base_work;
- work->auto_free = 0;
- work->single_wait = 1;
- }
- work->single_done = 0;
- wb_queue_work(wb, work);
- return work != base_work;
- }
-
/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- long nr_pages = base_work->nr_pages;
- int next_blkcg_id = 0;
+ int next_memcg_id = 0;
struct bdi_writeback *wb;
struct wb_iter iter;
might_sleep();
-
- if (!bdi_has_dirty_io(bdi))
- return;
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+ DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+ struct wb_writeback_work fallback_work;
+ struct wb_writeback_work *work;
+ long nr_pages;
+
- if (!wb_has_dirty_io(wb) ||
- (skip_if_busy && writeback_in_progress(wb)))
+ /* SYNC_ALL writes out I_DIRTY_TIME too */
+ if (!wb_has_dirty_io(wb) &&
+ (base_work->sync_mode == WB_SYNC_NONE ||
+ list_empty(&wb->b_dirty_time)))
+ continue;
+ if (skip_if_busy && writeback_in_progress(wb))
continue;
- base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
- if (!wb_clone_and_queue_work(wb, base_work)) {
- next_blkcg_id = wb->blkcg_css->id + 1;
- rcu_read_unlock();
- wb_wait_for_single_work(bdi, base_work);
- goto restart;
+ nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 1;
+ wb_queue_work(wb, work);
+ continue;
}
+
+ /* alloc failed, execute synchronously using on-stack fallback */
+ work = &fallback_work;
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 0;
+ work->done = &fallback_work_done;
+
+ wb_queue_work(wb, work);
+
+ next_memcg_id = wb->memcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_completion(bdi, &fallback_work_done);
+ goto restart;
}
rcu_read_unlock();
}
{
might_sleep();
- if (bdi_has_dirty_io(bdi) &&
- (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+ if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
- base_work->single_wait = 0;
- base_work->single_done = 0;
wb_queue_work(&bdi->wb, base_work);
}
}
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(wb->bdi);
+ trace_writeback_nowork(wb);
wb_wakeup(wb);
return;
}
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(wb->bdi);
+ trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
/*
* Remove the inode from the writeback list it is on.
*/
-void inode_wb_list_del(struct inode *inode)
+void inode_io_list_del(struct inode *inode)
{
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
}
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}
/*
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
+ inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
- list_move(&inode->i_wb_list, &tmp);
+ list_move(&inode->i_io_list, &tmp);
moved++;
if (flags & EXPIRE_DIRTY_ATIME)
set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
list_for_each_prev_safe(pos, node, &tmp) {
inode = wb_inode(pos);
if (inode->i_sb == sb)
- list_move(&inode->i_wb_list, dispatch_queue);
+ list_move(&inode->i_io_list, dispatch_queue);
}
}
out:
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
- inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
}
}
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
unsigned long start_time = jiffies;
long write_chunk;
long wrote = 0; /* count both pages and inodes */
+ struct blk_plug plug;
+ blk_start_plug(&plug);
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
break;
}
}
+ blk_finish_plug(&plug);
return wrote;
}
} else if (work->for_background)
oldest_jif = jiffies;
- trace_writeback_start(wb->bdi, work);
+ trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
- trace_writeback_written(wb->bdi, work);
+ trace_writeback_written(wb, work);
wb_update_bandwidth(wb, wb_start);
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb->bdi, work);
+ trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
set_bit(WB_writeback_running, &wb->state);
while ((work = get_next_work_item(wb)) != NULL) {
struct wb_completion *done = work->done;
- bool need_wake_up = false;
- trace_writeback_exec(wb->bdi, work);
+ trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
- if (work->single_wait) {
- WARN_ON_ONCE(work->auto_free);
- /* paired w/ rmb in wb_wait_for_single_work() */
- smp_wmb();
- work->single_done = 1;
- need_wake_up = true;
- } else if (work->auto_free) {
+ if (work->auto_free)
kfree(work);
- }
-
if (done && atomic_dec_and_test(&done->cnt))
- need_wake_up = true;
-
- if (need_wake_up)
wake_up_all(&wb->bdi->wb_waitq);
}
else
dirty_list = &wb->b_dirty_time;
- wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+ wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * The @s_sync_lock is used to serialise concurrent sync operations
+ * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
+ * Concurrent callers will block on the s_sync_lock rather than doing contending
+ * walks. The queueing maintains sync(2) required behaviour as all the IO that
+ * has been issued up to the time this function is enter is guaranteed to be
+ * completed by the time we have gained the lock and waited for all IO that is
+ * in progress regardless of the order callers are granted the lock.
+ */
static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- spin_lock(&inode_sb_list_lock);
+ mutex_lock(&sb->s_sync_lock);
+ spin_lock(&sb->s_inode_list_lock);
/*
* Data integrity sync. Must wait for all pages under writeback,
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock. We cannot iput the inode now as we can
+ * s_inode_list_lock. We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
cond_resched();
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
+ mutex_unlock(&sb->s_sync_lock);
}
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
};
struct backing_dev_info *bdi = sb->s_bdi;
- /* Nothing to do? */
- if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+ /*
+ * Can't skip on !bdi_has_dirty() because we should wait for !dirty
+ * inodes under writeback and I_DIRTY_TIME inodes ignored by
+ * bdi_has_dirty() need to be written out too.
+ */
+ if (bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
+
+/*
+ * This file *must* be included with SUBSYS() defined.
+ * SUBSYS_TAG() is a noop if undefined.
+ */
+
+#ifndef SUBSYS_TAG
+#define __TMP_SUBSYS_TAG
+#define SUBSYS_TAG(_x)
+#endif
+
#if IS_ENABLED(CONFIG_CPUSETS)
SUBSYS(cpuset)
#endif
#endif
#if IS_ENABLED(CONFIG_BLK_CGROUP)
- SUBSYS(blkio)
+ SUBSYS(io)
#endif
#if IS_ENABLED(CONFIG_MEMCG)
SUBSYS(hugetlb)
#endif
+/*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(CANFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
+SUBSYS_TAG(CANFORK_END)
+
/*
* The following subsystems are not supported on the default hierarchy.
*/
#if IS_ENABLED(CONFIG_CGROUP_DEBUG)
SUBSYS(debug)
#endif
+
+#ifdef __TMP_SUBSYS_TAG
+#undef __TMP_SUBSYS_TAG
+#undef SUBSYS_TAG
+#endif
+
/*
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
*/
nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
- list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
+ list_for_each_entry(inode, &wb->b_dirty, i_io_list)
nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_wb_list)
+ list_for_each_entry(inode, &wb->b_io, i_io_list)
nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
+ list_for_each_entry(inode, &wb->b_more_io, i_io_list)
nr_more_io++;
- list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+ list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
if (inode->i_state & I_DIRTY_TIME)
nr_dirty_time++;
spin_unlock(&wb->list_lock);
int ret = 0;
memcg = mem_cgroup_from_css(memcg_css);
- blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
blkcg_cgwb_list = &blkcg->cgwb_list;
/* see whether the blkcg association has changed */
blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
- &blkio_cgrp_subsys);
+ &io_cgrp_subsys);
if (unlikely(wb->blkcg_css != blkcg_css ||
!wb_tryget(wb)))
wb = NULL;
wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
- trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
+ trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
* do a reset, as it may be a light dirtier.
*/
if (pause < min_pause) {
- trace_balance_dirty_pages(bdi,
+ trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
}
pause:
- trace_balance_dirty_pages(bdi,
+ trace_balance_dirty_pages(wb,
sdtc->thresh,
sdtc->bg_thresh,
sdtc->dirty,
*/
void __init page_writeback_init(void)
{
+ BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
+
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
-
- BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
}
/**