switch generic_file_splice_read() to use of ->read_iter()
[cascardo/linux.git] / fs / splice.c
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files, network, direct splicing, etc and
13  * fixing lots of bugs.
14  *
15  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18  *
19  */
20 #include <linux/fs.h>
21 #include <linux/file.h>
22 #include <linux/pagemap.h>
23 #include <linux/splice.h>
24 #include <linux/memcontrol.h>
25 #include <linux/mm_inline.h>
26 #include <linux/swap.h>
27 #include <linux/writeback.h>
28 #include <linux/export.h>
29 #include <linux/syscalls.h>
30 #include <linux/uio.h>
31 #include <linux/security.h>
32 #include <linux/gfp.h>
33 #include <linux/socket.h>
34 #include <linux/compat.h>
35 #include "internal.h"
36
37 /*
38  * Attempt to steal a page from a pipe buffer. This should perhaps go into
39  * a vm helper function, it's already simplified quite a bit by the
40  * addition of remove_mapping(). If success is returned, the caller may
41  * attempt to reuse this page for another destination.
42  */
43 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
44                                      struct pipe_buffer *buf)
45 {
46         struct page *page = buf->page;
47         struct address_space *mapping;
48
49         lock_page(page);
50
51         mapping = page_mapping(page);
52         if (mapping) {
53                 WARN_ON(!PageUptodate(page));
54
55                 /*
56                  * At least for ext2 with nobh option, we need to wait on
57                  * writeback completing on this page, since we'll remove it
58                  * from the pagecache.  Otherwise truncate wont wait on the
59                  * page, allowing the disk blocks to be reused by someone else
60                  * before we actually wrote our data to them. fs corruption
61                  * ensues.
62                  */
63                 wait_on_page_writeback(page);
64
65                 if (page_has_private(page) &&
66                     !try_to_release_page(page, GFP_KERNEL))
67                         goto out_unlock;
68
69                 /*
70                  * If we succeeded in removing the mapping, set LRU flag
71                  * and return good.
72                  */
73                 if (remove_mapping(mapping, page)) {
74                         buf->flags |= PIPE_BUF_FLAG_LRU;
75                         return 0;
76                 }
77         }
78
79         /*
80          * Raced with truncate or failed to remove page from current
81          * address space, unlock and return failure.
82          */
83 out_unlock:
84         unlock_page(page);
85         return 1;
86 }
87
88 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
89                                         struct pipe_buffer *buf)
90 {
91         put_page(buf->page);
92         buf->flags &= ~PIPE_BUF_FLAG_LRU;
93 }
94
95 /*
96  * Check whether the contents of buf is OK to access. Since the content
97  * is a page cache page, IO may be in flight.
98  */
99 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
100                                        struct pipe_buffer *buf)
101 {
102         struct page *page = buf->page;
103         int err;
104
105         if (!PageUptodate(page)) {
106                 lock_page(page);
107
108                 /*
109                  * Page got truncated/unhashed. This will cause a 0-byte
110                  * splice, if this is the first page.
111                  */
112                 if (!page->mapping) {
113                         err = -ENODATA;
114                         goto error;
115                 }
116
117                 /*
118                  * Uh oh, read-error from disk.
119                  */
120                 if (!PageUptodate(page)) {
121                         err = -EIO;
122                         goto error;
123                 }
124
125                 /*
126                  * Page is ok afterall, we are done.
127                  */
128                 unlock_page(page);
129         }
130
131         return 0;
132 error:
133         unlock_page(page);
134         return err;
135 }
136
137 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
138         .can_merge = 0,
139         .confirm = page_cache_pipe_buf_confirm,
140         .release = page_cache_pipe_buf_release,
141         .steal = page_cache_pipe_buf_steal,
142         .get = generic_pipe_buf_get,
143 };
144
145 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
146                                     struct pipe_buffer *buf)
147 {
148         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
149                 return 1;
150
151         buf->flags |= PIPE_BUF_FLAG_LRU;
152         return generic_pipe_buf_steal(pipe, buf);
153 }
154
155 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
156         .can_merge = 0,
157         .confirm = generic_pipe_buf_confirm,
158         .release = page_cache_pipe_buf_release,
159         .steal = user_page_pipe_buf_steal,
160         .get = generic_pipe_buf_get,
161 };
162
163 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
164 {
165         smp_mb();
166         if (waitqueue_active(&pipe->wait))
167                 wake_up_interruptible(&pipe->wait);
168         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
169 }
170
171 /**
172  * splice_to_pipe - fill passed data into a pipe
173  * @pipe:       pipe to fill
174  * @spd:        data to fill
175  *
176  * Description:
177  *    @spd contains a map of pages and len/offset tuples, along with
178  *    the struct pipe_buf_operations associated with these pages. This
179  *    function will link that data to the pipe.
180  *
181  */
182 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
183                        struct splice_pipe_desc *spd)
184 {
185         unsigned int spd_pages = spd->nr_pages;
186         int ret = 0, page_nr = 0;
187
188         if (!spd_pages)
189                 return 0;
190
191         if (unlikely(!pipe->readers)) {
192                 send_sig(SIGPIPE, current, 0);
193                 ret = -EPIPE;
194                 goto out;
195         }
196
197         while (pipe->nrbufs < pipe->buffers) {
198                 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
199                 struct pipe_buffer *buf = pipe->bufs + newbuf;
200
201                 buf->page = spd->pages[page_nr];
202                 buf->offset = spd->partial[page_nr].offset;
203                 buf->len = spd->partial[page_nr].len;
204                 buf->private = spd->partial[page_nr].private;
205                 buf->ops = spd->ops;
206
207                 pipe->nrbufs++;
208                 page_nr++;
209                 ret += buf->len;
210
211                 if (!--spd->nr_pages)
212                         break;
213         }
214
215         if (!ret)
216                 ret = -EAGAIN;
217
218 out:
219         while (page_nr < spd_pages)
220                 spd->spd_release(spd, page_nr++);
221
222         return ret;
223 }
224 EXPORT_SYMBOL_GPL(splice_to_pipe);
225
226 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
227 {
228         int ret;
229
230         if (unlikely(!pipe->readers)) {
231                 send_sig(SIGPIPE, current, 0);
232                 ret = -EPIPE;
233         } else if (pipe->nrbufs == pipe->buffers) {
234                 ret = -EAGAIN;
235         } else {
236                 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
237                 pipe->bufs[newbuf] = *buf;
238                 pipe->nrbufs++;
239                 return buf->len;
240         }
241         buf->ops->release(pipe, buf);
242         buf->ops = NULL;
243         return ret;
244 }
245 EXPORT_SYMBOL(add_to_pipe);
246
247 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
248 {
249         put_page(spd->pages[i]);
250 }
251
252 /*
253  * Check if we need to grow the arrays holding pages and partial page
254  * descriptions.
255  */
256 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
257 {
258         unsigned int buffers = ACCESS_ONCE(pipe->buffers);
259
260         spd->nr_pages_max = buffers;
261         if (buffers <= PIPE_DEF_BUFFERS)
262                 return 0;
263
264         spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
265         spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
266
267         if (spd->pages && spd->partial)
268                 return 0;
269
270         kfree(spd->pages);
271         kfree(spd->partial);
272         return -ENOMEM;
273 }
274
275 void splice_shrink_spd(struct splice_pipe_desc *spd)
276 {
277         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
278                 return;
279
280         kfree(spd->pages);
281         kfree(spd->partial);
282 }
283
284 /**
285  * generic_file_splice_read - splice data from file to a pipe
286  * @in:         file to splice from
287  * @ppos:       position in @in
288  * @pipe:       pipe to splice to
289  * @len:        number of bytes to splice
290  * @flags:      splice modifier flags
291  *
292  * Description:
293  *    Will read pages from given file and fill them into a pipe. Can be
294  *    used as long as it has more or less sane ->read_iter().
295  *
296  */
297 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
298                                  struct pipe_inode_info *pipe, size_t len,
299                                  unsigned int flags)
300 {
301         struct iov_iter to;
302         struct kiocb kiocb;
303         loff_t isize;
304         int idx, ret;
305
306         isize = i_size_read(in->f_mapping->host);
307         if (unlikely(*ppos >= isize))
308                 return 0;
309
310         iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
311         idx = to.idx;
312         init_sync_kiocb(&kiocb, in);
313         kiocb.ki_pos = *ppos;
314         ret = in->f_op->read_iter(&kiocb, &to);
315         if (ret > 0) {
316                 *ppos = kiocb.ki_pos;
317                 file_accessed(in);
318         } else if (ret < 0) {
319                 if (WARN_ON(to.idx != idx || to.iov_offset)) {
320                         /*
321                          * a bogus ->read_iter() has copied something and still
322                          * returned an error instead of a short read.
323                          */
324                         to.idx = idx;
325                         to.iov_offset = 0;
326                         iov_iter_advance(&to, 0); /* to free what was emitted */
327                 }
328                 /*
329                  * callers of ->splice_read() expect -EAGAIN on
330                  * "can't put anything in there", rather than -EFAULT.
331                  */
332                 if (ret == -EFAULT)
333                         ret = -EAGAIN;
334         }
335
336         return ret;
337 }
338 EXPORT_SYMBOL(generic_file_splice_read);
339
340 const struct pipe_buf_operations default_pipe_buf_ops = {
341         .can_merge = 0,
342         .confirm = generic_pipe_buf_confirm,
343         .release = generic_pipe_buf_release,
344         .steal = generic_pipe_buf_steal,
345         .get = generic_pipe_buf_get,
346 };
347
/*
 * ->steal() implementation that always refuses: it returns 1
 * unconditionally and touches neither @pipe nor @buf.
 */
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
353
354 /* Pipe buffer operations for a socket and similar. */
355 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
356         .can_merge = 0,
357         .confirm = generic_pipe_buf_confirm,
358         .release = generic_pipe_buf_release,
359         .steal = generic_pipe_buf_nosteal,
360         .get = generic_pipe_buf_get,
361 };
362 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
363
364 static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
365                             unsigned long vlen, loff_t offset)
366 {
367         mm_segment_t old_fs;
368         loff_t pos = offset;
369         ssize_t res;
370
371         old_fs = get_fs();
372         set_fs(get_ds());
373         /* The cast to a user pointer is valid due to the set_fs() */
374         res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
375         set_fs(old_fs);
376
377         return res;
378 }
379
380 ssize_t kernel_write(struct file *file, const char *buf, size_t count,
381                             loff_t pos)
382 {
383         mm_segment_t old_fs;
384         ssize_t res;
385
386         old_fs = get_fs();
387         set_fs(get_ds());
388         /* The cast to a user pointer is valid due to the set_fs() */
389         res = vfs_write(file, (__force const char __user *)buf, count, &pos);
390         set_fs(old_fs);
391
392         return res;
393 }
394 EXPORT_SYMBOL(kernel_write);
395
396 static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
397                                  struct pipe_inode_info *pipe, size_t len,
398                                  unsigned int flags)
399 {
400         unsigned int nr_pages;
401         unsigned int nr_freed;
402         size_t offset;
403         struct page *pages[PIPE_DEF_BUFFERS];
404         struct partial_page partial[PIPE_DEF_BUFFERS];
405         struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
406         ssize_t res;
407         size_t this_len;
408         int error;
409         int i;
410         struct splice_pipe_desc spd = {
411                 .pages = pages,
412                 .partial = partial,
413                 .nr_pages_max = PIPE_DEF_BUFFERS,
414                 .flags = flags,
415                 .ops = &default_pipe_buf_ops,
416                 .spd_release = spd_release_page,
417         };
418
419         if (splice_grow_spd(pipe, &spd))
420                 return -ENOMEM;
421
422         res = -ENOMEM;
423         vec = __vec;
424         if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
425                 vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
426                 if (!vec)
427                         goto shrink_ret;
428         }
429
430         offset = *ppos & ~PAGE_MASK;
431         nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
432
433         for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
434                 struct page *page;
435
436                 page = alloc_page(GFP_USER);
437                 error = -ENOMEM;
438                 if (!page)
439                         goto err;
440
441                 this_len = min_t(size_t, len, PAGE_SIZE - offset);
442                 vec[i].iov_base = (void __user *) page_address(page);
443                 vec[i].iov_len = this_len;
444                 spd.pages[i] = page;
445                 spd.nr_pages++;
446                 len -= this_len;
447                 offset = 0;
448         }
449
450         res = kernel_readv(in, vec, spd.nr_pages, *ppos);
451         if (res < 0) {
452                 error = res;
453                 goto err;
454         }
455
456         error = 0;
457         if (!res)
458                 goto err;
459
460         nr_freed = 0;
461         for (i = 0; i < spd.nr_pages; i++) {
462                 this_len = min_t(size_t, vec[i].iov_len, res);
463                 spd.partial[i].offset = 0;
464                 spd.partial[i].len = this_len;
465                 if (!this_len) {
466                         __free_page(spd.pages[i]);
467                         spd.pages[i] = NULL;
468                         nr_freed++;
469                 }
470                 res -= this_len;
471         }
472         spd.nr_pages -= nr_freed;
473
474         res = splice_to_pipe(pipe, &spd);
475         if (res > 0)
476                 *ppos += res;
477
478 shrink_ret:
479         if (vec != __vec)
480                 kfree(vec);
481         splice_shrink_spd(&spd);
482         return res;
483
484 err:
485         for (i = 0; i < spd.nr_pages; i++)
486                 __free_page(spd.pages[i]);
487
488         res = error;
489         goto shrink_ret;
490 }
491
492 /*
493  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
494  * using sendpage(). Return the number of bytes sent.
495  */
496 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
497                             struct pipe_buffer *buf, struct splice_desc *sd)
498 {
499         struct file *file = sd->u.file;
500         loff_t pos = sd->pos;
501         int more;
502
503         if (!likely(file->f_op->sendpage))
504                 return -EINVAL;
505
506         more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
507
508         if (sd->len < sd->total_len && pipe->nrbufs > 1)
509                 more |= MSG_SENDPAGE_NOTLAST;
510
511         return file->f_op->sendpage(file, buf->page, buf->offset,
512                                     sd->len, &pos, more);
513 }
514
515 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
516 {
517         smp_mb();
518         if (waitqueue_active(&pipe->wait))
519                 wake_up_interruptible(&pipe->wait);
520         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
521 }
522
523 /**
524  * splice_from_pipe_feed - feed available data from a pipe to a file
525  * @pipe:       pipe to splice from
526  * @sd:         information to @actor
527  * @actor:      handler that splices the data
528  *
529  * Description:
530  *    This function loops over the pipe and calls @actor to do the
531  *    actual moving of a single struct pipe_buffer to the desired
532  *    destination.  It returns when there's no more buffers left in
533  *    the pipe or if the requested number of bytes (@sd->total_len)
534  *    have been copied.  It returns a positive number (one) if the
535  *    pipe needs to be filled with more data, zero if the required
536  *    number of bytes have been copied and -errno on error.
537  *
538  *    This, together with splice_from_pipe_{begin,end,next}, may be
539  *    used to implement the functionality of __splice_from_pipe() when
540  *    locking is required around copying the pipe buffers to the
541  *    destination.
542  */
543 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
544                           splice_actor *actor)
545 {
546         int ret;
547
548         while (pipe->nrbufs) {
549                 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
550                 const struct pipe_buf_operations *ops = buf->ops;
551
552                 sd->len = buf->len;
553                 if (sd->len > sd->total_len)
554                         sd->len = sd->total_len;
555
556                 ret = buf->ops->confirm(pipe, buf);
557                 if (unlikely(ret)) {
558                         if (ret == -ENODATA)
559                                 ret = 0;
560                         return ret;
561                 }
562
563                 ret = actor(pipe, buf, sd);
564                 if (ret <= 0)
565                         return ret;
566
567                 buf->offset += ret;
568                 buf->len -= ret;
569
570                 sd->num_spliced += ret;
571                 sd->len -= ret;
572                 sd->pos += ret;
573                 sd->total_len -= ret;
574
575                 if (!buf->len) {
576                         buf->ops = NULL;
577                         ops->release(pipe, buf);
578                         pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
579                         pipe->nrbufs--;
580                         if (pipe->files)
581                                 sd->need_wakeup = true;
582                 }
583
584                 if (!sd->total_len)
585                         return 0;
586         }
587
588         return 1;
589 }
590
591 /**
592  * splice_from_pipe_next - wait for some data to splice from
593  * @pipe:       pipe to splice from
594  * @sd:         information about the splice operation
595  *
596  * Description:
597  *    This function will wait for some data and return a positive
598  *    value (one) if pipe buffers are available.  It will return zero
599  *    or -errno if no more data needs to be spliced.
600  */
601 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
602 {
603         /*
604          * Check for signal early to make process killable when there are
605          * always buffers available
606          */
607         if (signal_pending(current))
608                 return -ERESTARTSYS;
609
610         while (!pipe->nrbufs) {
611                 if (!pipe->writers)
612                         return 0;
613
614                 if (!pipe->waiting_writers && sd->num_spliced)
615                         return 0;
616
617                 if (sd->flags & SPLICE_F_NONBLOCK)
618                         return -EAGAIN;
619
620                 if (signal_pending(current))
621                         return -ERESTARTSYS;
622
623                 if (sd->need_wakeup) {
624                         wakeup_pipe_writers(pipe);
625                         sd->need_wakeup = false;
626                 }
627
628                 pipe_wait(pipe);
629         }
630
631         return 1;
632 }
633
634 /**
635  * splice_from_pipe_begin - start splicing from pipe
636  * @sd:         information about the splice operation
637  *
638  * Description:
639  *    This function should be called before a loop containing
640  *    splice_from_pipe_next() and splice_from_pipe_feed() to
641  *    initialize the necessary fields of @sd.
642  */
643 static void splice_from_pipe_begin(struct splice_desc *sd)
644 {
645         sd->num_spliced = 0;
646         sd->need_wakeup = false;
647 }
648
649 /**
650  * splice_from_pipe_end - finish splicing from pipe
651  * @pipe:       pipe to splice from
652  * @sd:         information about the splice operation
653  *
654  * Description:
655  *    This function will wake up pipe writers if necessary.  It should
656  *    be called after a loop containing splice_from_pipe_next() and
657  *    splice_from_pipe_feed().
658  */
659 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
660 {
661         if (sd->need_wakeup)
662                 wakeup_pipe_writers(pipe);
663 }
664
665 /**
666  * __splice_from_pipe - splice data from a pipe to given actor
667  * @pipe:       pipe to splice from
668  * @sd:         information to @actor
669  * @actor:      handler that splices the data
670  *
671  * Description:
672  *    This function does little more than loop over the pipe and call
673  *    @actor to do the actual moving of a single struct pipe_buffer to
674  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
675  *    pipe_to_user.
676  *
677  */
678 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
679                            splice_actor *actor)
680 {
681         int ret;
682
683         splice_from_pipe_begin(sd);
684         do {
685                 cond_resched();
686                 ret = splice_from_pipe_next(pipe, sd);
687                 if (ret > 0)
688                         ret = splice_from_pipe_feed(pipe, sd, actor);
689         } while (ret > 0);
690         splice_from_pipe_end(pipe, sd);
691
692         return sd->num_spliced ? sd->num_spliced : ret;
693 }
694 EXPORT_SYMBOL(__splice_from_pipe);
695
696 /**
697  * splice_from_pipe - splice data from a pipe to a file
698  * @pipe:       pipe to splice from
699  * @out:        file to splice to
700  * @ppos:       position in @out
701  * @len:        how many bytes to splice
702  * @flags:      splice modifier flags
703  * @actor:      handler that splices the data
704  *
705  * Description:
706  *    See __splice_from_pipe. This function locks the pipe inode,
707  *    otherwise it's identical to __splice_from_pipe().
708  *
709  */
710 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
711                          loff_t *ppos, size_t len, unsigned int flags,
712                          splice_actor *actor)
713 {
714         ssize_t ret;
715         struct splice_desc sd = {
716                 .total_len = len,
717                 .flags = flags,
718                 .pos = *ppos,
719                 .u.file = out,
720         };
721
722         pipe_lock(pipe);
723         ret = __splice_from_pipe(pipe, &sd, actor);
724         pipe_unlock(pipe);
725
726         return ret;
727 }
728
/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 *    The pipe's buffers are gathered into a bio_vec array and written
 *    with vfs_iter_write(); buffers that were fully written are released
 *    and a partially written one is trimmed.  *@ppos is updated after
 *    each successful write.
 *
 *    Returns the number of bytes spliced if non-zero, otherwise the last
 *    status seen (zero or -errno, including -ENOMEM if the bio_vec array
 *    cannot be allocated).
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	int nbufs = pipe->buffers;
	/* one bio_vec per pipe slot; sized before the pipe lock is taken */
	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
	ssize_t ret;

	if (unlikely(!array))
		return -ENOMEM;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	while (sd.total_len) {
		struct iov_iter from;
		size_t left;
		int n, idx;

		/* wait for data; <= 0 means nothing more to splice or an error */
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		/* the pipe now has more slots than the array: reallocate it */
		if (unlikely(nbufs < pipe->buffers)) {
			kfree(array);
			nbufs = pipe->buffers;
			array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
			if (!array) {
				ret = -ENOMEM;
				break;
			}
		}

		/* build the vector */
		left = sd.total_len;
		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
			struct pipe_buffer *buf = pipe->bufs + idx;
			size_t this_len = buf->len;

			if (this_len > left)
				this_len = left;

			/* wrap around: the loop's idx++ turns -1 into slot 0 */
			if (idx == pipe->buffers - 1)
				idx = -1;

			ret = buf->ops->confirm(pipe, buf);
			if (unlikely(ret)) {
				/* -ENODATA is not reported as an error */
				if (ret == -ENODATA)
					ret = 0;
				goto done;
			}

			array[n].bv_page = buf->page;
			array[n].bv_len = this_len;
			array[n].bv_offset = buf->offset;
			left -= this_len;
		}

		/* sd.total_len - left is the number of bytes gathered above */
		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
			      sd.total_len - left);
		ret = vfs_iter_write(out, &from, &sd.pos);
		if (ret <= 0)
			break;

		sd.num_spliced += ret;
		sd.total_len -= ret;
		*ppos = sd.pos;

		/* dismiss the fully eaten buffers, adjust the partial one */
		while (ret) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			if (ret >= buf->len) {
				const struct pipe_buf_operations *ops = buf->ops;
				ret -= buf->len;
				buf->len = 0;
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
				pipe->nrbufs--;
				if (pipe->files)
					sd.need_wakeup = true;
			} else {
				buf->offset += ret;
				buf->len -= ret;
				ret = 0;
			}
		}
	}
done:
	/* array may be NULL here after a failed reallocation; kfree(NULL) is fine */
	kfree(array);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	/* a non-zero byte count takes precedence over the last status */
	if (sd.num_spliced)
		ret = sd.num_spliced;

	return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);
852
853 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
854                           struct splice_desc *sd)
855 {
856         int ret;
857         void *data;
858         loff_t tmp = sd->pos;
859
860         data = kmap(buf->page);
861         ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
862         kunmap(buf->page);
863
864         return ret;
865 }
866
867 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
868                                          struct file *out, loff_t *ppos,
869                                          size_t len, unsigned int flags)
870 {
871         ssize_t ret;
872
873         ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
874         if (ret > 0)
875                 *ppos += ret;
876
877         return ret;
878 }
879
880 /**
881  * generic_splice_sendpage - splice data from a pipe to a socket
882  * @pipe:       pipe to splice from
883  * @out:        socket to write to
884  * @ppos:       position in @out
885  * @len:        number of bytes to splice
886  * @flags:      splice modifier flags
887  *
888  * Description:
889  *    Will send @len bytes from the pipe to a network socket. No data copying
890  *    is involved.
891  *
892  */
893 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
894                                 loff_t *ppos, size_t len, unsigned int flags)
895 {
896         return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
897 }
898
899 EXPORT_SYMBOL(generic_splice_sendpage);
900
901 /*
902  * Attempt to initiate a splice from pipe to file.
903  */
904 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
905                            loff_t *ppos, size_t len, unsigned int flags)
906 {
907         ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
908                                 loff_t *, size_t, unsigned int);
909
910         if (out->f_op->splice_write)
911                 splice_write = out->f_op->splice_write;
912         else
913                 splice_write = default_file_splice_write;
914
915         return splice_write(pipe, out, ppos, len, flags);
916 }
917
918 /*
919  * Attempt to initiate a splice from a file to a pipe.
920  */
921 static long do_splice_to(struct file *in, loff_t *ppos,
922                          struct pipe_inode_info *pipe, size_t len,
923                          unsigned int flags)
924 {
925         ssize_t (*splice_read)(struct file *, loff_t *,
926                                struct pipe_inode_info *, size_t, unsigned int);
927         int ret;
928
929         if (unlikely(!(in->f_mode & FMODE_READ)))
930                 return -EBADF;
931
932         ret = rw_verify_area(READ, in, ppos, len);
933         if (unlikely(ret < 0))
934                 return ret;
935
936         if (unlikely(len > MAX_RW_COUNT))
937                 len = MAX_RW_COUNT;
938
939         if (in->f_op->splice_read)
940                 splice_read = in->f_op->splice_read;
941         else
942                 splice_read = default_file_splice_read;
943
944         return splice_read(in, ppos, pipe, len, flags);
945 }
946
947 /**
948  * splice_direct_to_actor - splices data directly between two non-pipes
949  * @in:         file to splice from
950  * @sd:         actor information on where to splice to
951  * @actor:      handles the data splicing
952  *
953  * Description:
954  *    This is a special case helper to splice directly between two
955  *    points, without requiring an explicit pipe. Internally an allocated
956  *    pipe is cached in the process, and reused during the lifetime of
957  *    that process.
958  *
959  */
960 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
961                                splice_direct_actor *actor)
962 {
963         struct pipe_inode_info *pipe;
964         long ret, bytes;
965         umode_t i_mode;
966         size_t len;
967         int i, flags, more;
968
969         /*
970          * We require the input being a regular file, as we don't want to
971          * randomly drop data for eg socket -> socket splicing. Use the
972          * piped splicing for that!
973          */
974         i_mode = file_inode(in)->i_mode;
975         if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
976                 return -EINVAL;
977
978         /*
979          * neither in nor out is a pipe, setup an internal pipe attached to
980          * 'out' and transfer the wanted data from 'in' to 'out' through that
981          */
982         pipe = current->splice_pipe;
983         if (unlikely(!pipe)) {
984                 pipe = alloc_pipe_info();
985                 if (!pipe)
986                         return -ENOMEM;
987
988                 /*
989                  * We don't have an immediate reader, but we'll read the stuff
990                  * out of the pipe right after the splice_to_pipe(). So set
991                  * PIPE_READERS appropriately.
992                  */
993                 pipe->readers = 1;
994
995                 current->splice_pipe = pipe;
996         }
997
998         /*
999          * Do the splice.
1000          */
1001         ret = 0;
1002         bytes = 0;
1003         len = sd->total_len;
1004         flags = sd->flags;
1005
1006         /*
1007          * Don't block on output, we have to drain the direct pipe.
1008          */
1009         sd->flags &= ~SPLICE_F_NONBLOCK;
1010         more = sd->flags & SPLICE_F_MORE;
1011
1012         while (len) {
1013                 size_t read_len;
1014                 loff_t pos = sd->pos, prev_pos = pos;
1015
1016                 ret = do_splice_to(in, &pos, pipe, len, flags);
1017                 if (unlikely(ret <= 0))
1018                         goto out_release;
1019
1020                 read_len = ret;
1021                 sd->total_len = read_len;
1022
1023                 /*
1024                  * If more data is pending, set SPLICE_F_MORE
1025                  * If this is the last data and SPLICE_F_MORE was not set
1026                  * initially, clears it.
1027                  */
1028                 if (read_len < len)
1029                         sd->flags |= SPLICE_F_MORE;
1030                 else if (!more)
1031                         sd->flags &= ~SPLICE_F_MORE;
1032                 /*
1033                  * NOTE: nonblocking mode only applies to the input. We
1034                  * must not do the output in nonblocking mode as then we
1035                  * could get stuck data in the internal pipe:
1036                  */
1037                 ret = actor(pipe, sd);
1038                 if (unlikely(ret <= 0)) {
1039                         sd->pos = prev_pos;
1040                         goto out_release;
1041                 }
1042
1043                 bytes += ret;
1044                 len -= ret;
1045                 sd->pos = pos;
1046
1047                 if (ret < read_len) {
1048                         sd->pos = prev_pos + ret;
1049                         goto out_release;
1050                 }
1051         }
1052
1053 done:
1054         pipe->nrbufs = pipe->curbuf = 0;
1055         file_accessed(in);
1056         return bytes;
1057
1058 out_release:
1059         /*
1060          * If we did an incomplete transfer we must release
1061          * the pipe buffers in question:
1062          */
1063         for (i = 0; i < pipe->buffers; i++) {
1064                 struct pipe_buffer *buf = pipe->bufs + i;
1065
1066                 if (buf->ops) {
1067                         buf->ops->release(pipe, buf);
1068                         buf->ops = NULL;
1069                 }
1070         }
1071
1072         if (!bytes)
1073                 bytes = ret;
1074
1075         goto done;
1076 }
1077 EXPORT_SYMBOL(splice_direct_to_actor);
1078
1079 static int direct_splice_actor(struct pipe_inode_info *pipe,
1080                                struct splice_desc *sd)
1081 {
1082         struct file *file = sd->u.file;
1083
1084         return do_splice_from(pipe, file, sd->opos, sd->total_len,
1085                               sd->flags);
1086 }
1087
1088 /**
1089  * do_splice_direct - splices data directly between two files
1090  * @in:         file to splice from
1091  * @ppos:       input file offset
1092  * @out:        file to splice to
1093  * @opos:       output file offset
1094  * @len:        number of bytes to splice
1095  * @flags:      splice modifier flags
1096  *
1097  * Description:
1098  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1099  *    doing it in the application would incur an extra system call
1100  *    (splice in + splice out, as compared to just sendfile()). So this helper
1101  *    can splice directly through a process-private pipe.
1102  *
1103  */
1104 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1105                       loff_t *opos, size_t len, unsigned int flags)
1106 {
1107         struct splice_desc sd = {
1108                 .len            = len,
1109                 .total_len      = len,
1110                 .flags          = flags,
1111                 .pos            = *ppos,
1112                 .u.file         = out,
1113                 .opos           = opos,
1114         };
1115         long ret;
1116
1117         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1118                 return -EBADF;
1119
1120         if (unlikely(out->f_flags & O_APPEND))
1121                 return -EINVAL;
1122
1123         ret = rw_verify_area(WRITE, out, opos, len);
1124         if (unlikely(ret < 0))
1125                 return ret;
1126
1127         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1128         if (ret > 0)
1129                 *ppos = sd.pos;
1130
1131         return ret;
1132 }
1133 EXPORT_SYMBOL(do_splice_direct);
1134
1135 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1136 {
1137         while (pipe->nrbufs == pipe->buffers) {
1138                 if (flags & SPLICE_F_NONBLOCK)
1139                         return -EAGAIN;
1140                 if (signal_pending(current))
1141                         return -ERESTARTSYS;
1142                 pipe->waiting_writers++;
1143                 pipe_wait(pipe);
1144                 pipe->waiting_writers--;
1145         }
1146         return 0;
1147 }
1148
1149 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1150                                struct pipe_inode_info *opipe,
1151                                size_t len, unsigned int flags);
1152
1153 /*
1154  * Determine where to splice to/from.
1155  */
1156 static long do_splice(struct file *in, loff_t __user *off_in,
1157                       struct file *out, loff_t __user *off_out,
1158                       size_t len, unsigned int flags)
1159 {
1160         struct pipe_inode_info *ipipe;
1161         struct pipe_inode_info *opipe;
1162         loff_t offset;
1163         long ret;
1164
1165         ipipe = get_pipe_info(in);
1166         opipe = get_pipe_info(out);
1167
1168         if (ipipe && opipe) {
1169                 if (off_in || off_out)
1170                         return -ESPIPE;
1171
1172                 if (!(in->f_mode & FMODE_READ))
1173                         return -EBADF;
1174
1175                 if (!(out->f_mode & FMODE_WRITE))
1176                         return -EBADF;
1177
1178                 /* Splicing to self would be fun, but... */
1179                 if (ipipe == opipe)
1180                         return -EINVAL;
1181
1182                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1183         }
1184
1185         if (ipipe) {
1186                 if (off_in)
1187                         return -ESPIPE;
1188                 if (off_out) {
1189                         if (!(out->f_mode & FMODE_PWRITE))
1190                                 return -EINVAL;
1191                         if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1192                                 return -EFAULT;
1193                 } else {
1194                         offset = out->f_pos;
1195                 }
1196
1197                 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1198                         return -EBADF;
1199
1200                 if (unlikely(out->f_flags & O_APPEND))
1201                         return -EINVAL;
1202
1203                 ret = rw_verify_area(WRITE, out, &offset, len);
1204                 if (unlikely(ret < 0))
1205                         return ret;
1206
1207                 file_start_write(out);
1208                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1209                 file_end_write(out);
1210
1211                 if (!off_out)
1212                         out->f_pos = offset;
1213                 else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1214                         ret = -EFAULT;
1215
1216                 return ret;
1217         }
1218
1219         if (opipe) {
1220                 if (off_out)
1221                         return -ESPIPE;
1222                 if (off_in) {
1223                         if (!(in->f_mode & FMODE_PREAD))
1224                                 return -EINVAL;
1225                         if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1226                                 return -EFAULT;
1227                 } else {
1228                         offset = in->f_pos;
1229                 }
1230
1231                 pipe_lock(opipe);
1232                 ret = wait_for_space(opipe, flags);
1233                 if (!ret)
1234                         ret = do_splice_to(in, &offset, opipe, len, flags);
1235                 pipe_unlock(opipe);
1236                 if (ret > 0)
1237                         wakeup_pipe_readers(opipe);
1238                 if (!off_in)
1239                         in->f_pos = offset;
1240                 else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1241                         ret = -EFAULT;
1242
1243                 return ret;
1244         }
1245
1246         return -EINVAL;
1247 }
1248
1249 static int iter_to_pipe(struct iov_iter *from,
1250                         struct pipe_inode_info *pipe,
1251                         unsigned flags)
1252 {
1253         struct pipe_buffer buf = {
1254                 .ops = &user_page_pipe_buf_ops,
1255                 .flags = flags
1256         };
1257         size_t total = 0;
1258         int ret = 0;
1259         bool failed = false;
1260
1261         while (iov_iter_count(from) && !failed) {
1262                 struct page *pages[16];
1263                 ssize_t copied;
1264                 size_t start;
1265                 int n;
1266
1267                 copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1268                 if (copied <= 0) {
1269                         ret = copied;
1270                         break;
1271                 }
1272
1273                 for (n = 0; copied; n++, start = 0) {
1274                         int size = min_t(int, copied, PAGE_SIZE - start);
1275                         if (!failed) {
1276                                 buf.page = pages[n];
1277                                 buf.offset = start;
1278                                 buf.len = size;
1279                                 ret = add_to_pipe(pipe, &buf);
1280                                 if (unlikely(ret < 0)) {
1281                                         failed = true;
1282                                 } else {
1283                                         iov_iter_advance(from, ret);
1284                                         total += ret;
1285                                 }
1286                         } else {
1287                                 put_page(pages[n]);
1288                         }
1289                         copied -= size;
1290                 }
1291         }
1292         return total ? total : ret;
1293 }
1294
1295 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1296                         struct splice_desc *sd)
1297 {
1298         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1299         return n == sd->len ? n : -EFAULT;
1300 }
1301
1302 /*
1303  * For lack of a better implementation, implement vmsplice() to userspace
1304  * as a simple copy of the pipes pages to the user iov.
1305  */
1306 static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1307                              unsigned long nr_segs, unsigned int flags)
1308 {
1309         struct pipe_inode_info *pipe;
1310         struct splice_desc sd;
1311         long ret;
1312         struct iovec iovstack[UIO_FASTIOV];
1313         struct iovec *iov = iovstack;
1314         struct iov_iter iter;
1315
1316         pipe = get_pipe_info(file);
1317         if (!pipe)
1318                 return -EBADF;
1319
1320         ret = import_iovec(READ, uiov, nr_segs,
1321                            ARRAY_SIZE(iovstack), &iov, &iter);
1322         if (ret < 0)
1323                 return ret;
1324
1325         sd.total_len = iov_iter_count(&iter);
1326         sd.len = 0;
1327         sd.flags = flags;
1328         sd.u.data = &iter;
1329         sd.pos = 0;
1330
1331         if (sd.total_len) {
1332                 pipe_lock(pipe);
1333                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1334                 pipe_unlock(pipe);
1335         }
1336
1337         kfree(iov);
1338         return ret;
1339 }
1340
1341 /*
1342  * vmsplice splices a user address range into a pipe. It can be thought of
1343  * as splice-from-memory, where the regular splice is splice-from-file (or
1344  * to file). In both cases the output is a pipe, naturally.
1345  */
1346 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
1347                              unsigned long nr_segs, unsigned int flags)
1348 {
1349         struct pipe_inode_info *pipe;
1350         struct iovec iovstack[UIO_FASTIOV];
1351         struct iovec *iov = iovstack;
1352         struct iov_iter from;
1353         long ret;
1354         unsigned buf_flag = 0;
1355
1356         if (flags & SPLICE_F_GIFT)
1357                 buf_flag = PIPE_BUF_FLAG_GIFT;
1358
1359         pipe = get_pipe_info(file);
1360         if (!pipe)
1361                 return -EBADF;
1362
1363         ret = import_iovec(WRITE, uiov, nr_segs,
1364                            ARRAY_SIZE(iovstack), &iov, &from);
1365         if (ret < 0)
1366                 return ret;
1367
1368         pipe_lock(pipe);
1369         ret = wait_for_space(pipe, flags);
1370         if (!ret)
1371                 ret = iter_to_pipe(&from, pipe, buf_flag);
1372         pipe_unlock(pipe);
1373         if (ret > 0)
1374                 wakeup_pipe_readers(pipe);
1375         kfree(iov);
1376         return ret;
1377 }
1378
1379 /*
1380  * Note that vmsplice only really supports true splicing _from_ user memory
1381  * to a pipe, not the other way around. Splicing from user memory is a simple
1382  * operation that can be supported without any funky alignment restrictions
1383  * or nasty vm tricks. We simply map in the user memory and fill them into
1384  * a pipe. The reverse isn't quite as easy, though. There are two possible
1385  * solutions for that:
1386  *
1387  *      - memcpy() the data internally, at which point we might as well just
1388  *        do a regular read() on the buffer anyway.
1389  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1390  *        has restriction limitations on both ends of the pipe).
1391  *
1392  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1393  *
1394  */
1395 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1396                 unsigned long, nr_segs, unsigned int, flags)
1397 {
1398         struct fd f;
1399         long error;
1400
1401         if (unlikely(nr_segs > UIO_MAXIOV))
1402                 return -EINVAL;
1403         else if (unlikely(!nr_segs))
1404                 return 0;
1405
1406         error = -EBADF;
1407         f = fdget(fd);
1408         if (f.file) {
1409                 if (f.file->f_mode & FMODE_WRITE)
1410                         error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1411                 else if (f.file->f_mode & FMODE_READ)
1412                         error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1413
1414                 fdput(f);
1415         }
1416
1417         return error;
1418 }
1419
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for vmsplice(): convert the caller's array of
 * struct compat_iovec into an array of native struct iovec placed in the
 * area obtained from compat_alloc_user_space(), then call sys_vmsplice()
 * on the converted array.
 *
 * Returns -EINVAL for more than UIO_MAXIOV segments and -EFAULT if any
 * element cannot be read or written.
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec __user *iov;
	unsigned i;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));

	for (i = 0; i < nr_segs; i++) {
		struct compat_iovec v;

		/* Fetch one compat element ... */
		if (get_user(v.iov_base, &iov32[i].iov_base))
			return -EFAULT;
		if (get_user(v.iov_len, &iov32[i].iov_len))
			return -EFAULT;

		/* ... and store its native equivalent. */
		if (put_user(compat_ptr(v.iov_base), &iov[i].iov_base))
			return -EFAULT;
		if (put_user(v.iov_len, &iov[i].iov_len))
			return -EFAULT;
	}

	return sys_vmsplice(fd, iov, nr_segs, flags);
}
#endif
1440
1441 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1442                 int, fd_out, loff_t __user *, off_out,
1443                 size_t, len, unsigned int, flags)
1444 {
1445         struct fd in, out;
1446         long error;
1447
1448         if (unlikely(!len))
1449                 return 0;
1450
1451         error = -EBADF;
1452         in = fdget(fd_in);
1453         if (in.file) {
1454                 if (in.file->f_mode & FMODE_READ) {
1455                         out = fdget(fd_out);
1456                         if (out.file) {
1457                                 if (out.file->f_mode & FMODE_WRITE)
1458                                         error = do_splice(in.file, off_in,
1459                                                           out.file, off_out,
1460                                                           len, flags);
1461                                 fdput(out);
1462                         }
1463                 }
1464                 fdput(in);
1465         }
1466         return error;
1467 }
1468
1469 /*
1470  * Make sure there's data to read. Wait for input if we can, otherwise
1471  * return an appropriate error.
1472  */
1473 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1474 {
1475         int ret;
1476
1477         /*
1478          * Check ->nrbufs without the inode lock first. This function
1479          * is speculative anyways, so missing one is ok.
1480          */
1481         if (pipe->nrbufs)
1482                 return 0;
1483
1484         ret = 0;
1485         pipe_lock(pipe);
1486
1487         while (!pipe->nrbufs) {
1488                 if (signal_pending(current)) {
1489                         ret = -ERESTARTSYS;
1490                         break;
1491                 }
1492                 if (!pipe->writers)
1493                         break;
1494                 if (!pipe->waiting_writers) {
1495                         if (flags & SPLICE_F_NONBLOCK) {
1496                                 ret = -EAGAIN;
1497                                 break;
1498                         }
1499                 }
1500                 pipe_wait(pipe);
1501         }
1502
1503         pipe_unlock(pipe);
1504         return ret;
1505 }
1506
1507 /*
1508  * Make sure there's writeable room. Wait for room if we can, otherwise
1509  * return an appropriate error.
1510  */
1511 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1512 {
1513         int ret;
1514
1515         /*
1516          * Check ->nrbufs without the inode lock first. This function
1517          * is speculative anyways, so missing one is ok.
1518          */
1519         if (pipe->nrbufs < pipe->buffers)
1520                 return 0;
1521
1522         ret = 0;
1523         pipe_lock(pipe);
1524
1525         while (pipe->nrbufs >= pipe->buffers) {
1526                 if (!pipe->readers) {
1527                         send_sig(SIGPIPE, current, 0);
1528                         ret = -EPIPE;
1529                         break;
1530                 }
1531                 if (flags & SPLICE_F_NONBLOCK) {
1532                         ret = -EAGAIN;
1533                         break;
1534                 }
1535                 if (signal_pending(current)) {
1536                         ret = -ERESTARTSYS;
1537                         break;
1538                 }
1539                 pipe->waiting_writers++;
1540                 pipe_wait(pipe);
1541                 pipe->waiting_writers--;
1542         }
1543
1544         pipe_unlock(pipe);
1545         return ret;
1546 }
1547
1548 /*
1549  * Splice contents of ipipe to opipe.
1550  */
1551 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1552                                struct pipe_inode_info *opipe,
1553                                size_t len, unsigned int flags)
1554 {
1555         struct pipe_buffer *ibuf, *obuf;
1556         int ret = 0, nbuf;
1557         bool input_wakeup = false;
1558
1559
1560 retry:
1561         ret = ipipe_prep(ipipe, flags);
1562         if (ret)
1563                 return ret;
1564
1565         ret = opipe_prep(opipe, flags);
1566         if (ret)
1567                 return ret;
1568
1569         /*
1570          * Potential ABBA deadlock, work around it by ordering lock
1571          * grabbing by pipe info address. Otherwise two different processes
1572          * could deadlock (one doing tee from A -> B, the other from B -> A).
1573          */
1574         pipe_double_lock(ipipe, opipe);
1575
1576         do {
1577                 if (!opipe->readers) {
1578                         send_sig(SIGPIPE, current, 0);
1579                         if (!ret)
1580                                 ret = -EPIPE;
1581                         break;
1582                 }
1583
1584                 if (!ipipe->nrbufs && !ipipe->writers)
1585                         break;
1586
1587                 /*
1588                  * Cannot make any progress, because either the input
1589                  * pipe is empty or the output pipe is full.
1590                  */
1591                 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1592                         /* Already processed some buffers, break */
1593                         if (ret)
1594                                 break;
1595
1596                         if (flags & SPLICE_F_NONBLOCK) {
1597                                 ret = -EAGAIN;
1598                                 break;
1599                         }
1600
1601                         /*
1602                          * We raced with another reader/writer and haven't
1603                          * managed to process any buffers.  A zero return
1604                          * value means EOF, so retry instead.
1605                          */
1606                         pipe_unlock(ipipe);
1607                         pipe_unlock(opipe);
1608                         goto retry;
1609                 }
1610
1611                 ibuf = ipipe->bufs + ipipe->curbuf;
1612                 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1613                 obuf = opipe->bufs + nbuf;
1614
1615                 if (len >= ibuf->len) {
1616                         /*
1617                          * Simply move the whole buffer from ipipe to opipe
1618                          */
1619                         *obuf = *ibuf;
1620                         ibuf->ops = NULL;
1621                         opipe->nrbufs++;
1622                         ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1623                         ipipe->nrbufs--;
1624                         input_wakeup = true;
1625                 } else {
1626                         /*
1627                          * Get a reference to this pipe buffer,
1628                          * so we can copy the contents over.
1629                          */
1630                         ibuf->ops->get(ipipe, ibuf);
1631                         *obuf = *ibuf;
1632
1633                         /*
1634                          * Don't inherit the gift flag, we need to
1635                          * prevent multiple steals of this page.
1636                          */
1637                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1638
1639                         obuf->len = len;
1640                         opipe->nrbufs++;
1641                         ibuf->offset += obuf->len;
1642                         ibuf->len -= obuf->len;
1643                 }
1644                 ret += obuf->len;
1645                 len -= obuf->len;
1646         } while (len);
1647
1648         pipe_unlock(ipipe);
1649         pipe_unlock(opipe);
1650
1651         /*
1652          * If we put data in the output pipe, wakeup any potential readers.
1653          */
1654         if (ret > 0)
1655                 wakeup_pipe_readers(opipe);
1656
1657         if (input_wakeup)
1658                 wakeup_pipe_writers(ipipe);
1659
1660         return ret;
1661 }
1662
1663 /*
1664  * Link contents of ipipe to opipe.
1665  */
1666 static int link_pipe(struct pipe_inode_info *ipipe,
1667                      struct pipe_inode_info *opipe,
1668                      size_t len, unsigned int flags)
1669 {
1670         struct pipe_buffer *ibuf, *obuf;
1671         int ret = 0, i = 0, nbuf;
1672
1673         /*
1674          * Potential ABBA deadlock, work around it by ordering lock
1675          * grabbing by pipe info address. Otherwise two different processes
1676          * could deadlock (one doing tee from A -> B, the other from B -> A).
1677          */
1678         pipe_double_lock(ipipe, opipe);
1679
1680         do {
1681                 if (!opipe->readers) {
1682                         send_sig(SIGPIPE, current, 0);
1683                         if (!ret)
1684                                 ret = -EPIPE;
1685                         break;
1686                 }
1687
1688                 /*
1689                  * If we have iterated all input buffers or ran out of
1690                  * output room, break.
1691                  */
1692                 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1693                         break;
1694
1695                 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1696                 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1697
1698                 /*
1699                  * Get a reference to this pipe buffer,
1700                  * so we can copy the contents over.
1701                  */
1702                 ibuf->ops->get(ipipe, ibuf);
1703
1704                 obuf = opipe->bufs + nbuf;
1705                 *obuf = *ibuf;
1706
1707                 /*
1708                  * Don't inherit the gift flag, we need to
1709                  * prevent multiple steals of this page.
1710                  */
1711                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1712
1713                 if (obuf->len > len)
1714                         obuf->len = len;
1715
1716                 opipe->nrbufs++;
1717                 ret += obuf->len;
1718                 len -= obuf->len;
1719                 i++;
1720         } while (len);
1721
1722         /*
1723          * return EAGAIN if we have the potential of some data in the
1724          * future, otherwise just return 0
1725          */
1726         if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1727                 ret = -EAGAIN;
1728
1729         pipe_unlock(ipipe);
1730         pipe_unlock(opipe);
1731
1732         /*
1733          * If we put data in the output pipe, wakeup any potential readers.
1734          */
1735         if (ret > 0)
1736                 wakeup_pipe_readers(opipe);
1737
1738         return ret;
1739 }
1740
1741 /*
1742  * This is a tee(1) implementation that works on pipes. It doesn't copy
1743  * any data, it simply references the 'in' pages on the 'out' pipe.
1744  * The 'flags' used are the SPLICE_F_* variants, currently the only
1745  * applicable one is SPLICE_F_NONBLOCK.
1746  */
1747 static long do_tee(struct file *in, struct file *out, size_t len,
1748                    unsigned int flags)
1749 {
1750         struct pipe_inode_info *ipipe = get_pipe_info(in);
1751         struct pipe_inode_info *opipe = get_pipe_info(out);
1752         int ret = -EINVAL;
1753
1754         /*
1755          * Duplicate the contents of ipipe to opipe without actually
1756          * copying the data.
1757          */
1758         if (ipipe && opipe && ipipe != opipe) {
1759                 /*
1760                  * Keep going, unless we encounter an error. The ipipe/opipe
1761                  * ordering doesn't really matter.
1762                  */
1763                 ret = ipipe_prep(ipipe, flags);
1764                 if (!ret) {
1765                         ret = opipe_prep(opipe, flags);
1766                         if (!ret)
1767                                 ret = link_pipe(ipipe, opipe, len, flags);
1768                 }
1769         }
1770
1771         return ret;
1772 }
1773
1774 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1775 {
1776         struct fd in;
1777         int error;
1778
1779         if (unlikely(!len))
1780                 return 0;
1781
1782         error = -EBADF;
1783         in = fdget(fdin);
1784         if (in.file) {
1785                 if (in.file->f_mode & FMODE_READ) {
1786                         struct fd out = fdget(fdout);
1787                         if (out.file) {
1788                                 if (out.file->f_mode & FMODE_WRITE)
1789                                         error = do_tee(in.file, out.file,
1790                                                         len, flags);
1791                                 fdput(out);
1792                         }
1793                 }
1794                 fdput(in);
1795         }
1796
1797         return error;
1798 }