2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Alan Cox : verify_area() now used correctly
14 * Alan Cox : new skbuff lists, look ma no backlogs!
15 * Alan Cox : tidied skbuff lists.
16 * Alan Cox : Now uses generic datagram routines I
17 * added. Also fixed the peek/read crash
18 * from all old Linux datagram code.
19 * Alan Cox : Uses the improved datagram code.
20 * Alan Cox : Added NULL's for socket options.
21 * Alan Cox : Re-commented the code.
22 * Alan Cox : Use new kernel side addressing
23 * Rob Janssen : Correct MTU usage.
24 * Dave Platt : Counter leaks caused by incorrect
25 * interrupt locking and some slightly
26 * dubious gcc output. Can you read
27 * compiler: it said _VOLATILE_
28 * Richard Kooijman : Timestamp fixes.
29 * Alan Cox : New buffers. Use sk->mac.raw.
30 * Alan Cox : sendmsg/recvmsg support.
31 * Alan Cox : Protocol setting support
32 * Alexey Kuznetsov : Untied from IPv4 stack.
33 * Cyrus Durgin : Fixed kerneld for kmod.
34 * Michal Ostrowski : Module initialization cleanup.
35 * Ulises Alonso : Frame number limit removal and
36 * packet_set_ring memory leak.
37 * Eric Biederman : Allow for > 8 byte hardware addresses.
38 * The convention is that longer addresses
39 * will simply extend the hardware address
40 * byte arrays at the end of sockaddr_ll
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
50 * as published by the Free Software Foundation; either version
51 * 2 of the License, or (at your option) any later version.
55 #include <linux/types.h>
57 #include <linux/capability.h>
58 #include <linux/fcntl.h>
59 #include <linux/socket.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_packet.h>
64 #include <linux/wireless.h>
65 #include <linux/kernel.h>
66 #include <linux/kmod.h>
67 #include <linux/slab.h>
68 #include <linux/vmalloc.h>
69 #include <net/net_namespace.h>
71 #include <net/protocol.h>
72 #include <linux/skbuff.h>
74 #include <linux/errno.h>
75 #include <linux/timer.h>
76 #include <asm/uaccess.h>
77 #include <asm/ioctls.h>
79 #include <asm/cacheflush.h>
81 #include <linux/proc_fs.h>
82 #include <linux/seq_file.h>
83 #include <linux/poll.h>
84 #include <linux/module.h>
85 #include <linux/init.h>
86 #include <linux/mutex.h>
87 #include <linux/if_vlan.h>
88 #include <linux/virtio_net.h>
89 #include <linux/errqueue.h>
90 #include <linux/net_tstamp.h>
91 #include <linux/percpu.h>
93 #include <net/inet_common.h>
100 - if a device has no dev->hard_header routine, it adds and removes the ll header
101 inside itself. In this case the ll header is invisible outside of the device,
102 but higher levels should still reserve dev->hard_header_len.
103 Some devices are clever enough to reallocate the skb when the header
104 will not fit into the reserved space (tunnels), while others are silly
106 - a packet socket receives packets with the ll header already pulled,
107 so SOCK_RAW should push it back.
112 Incoming, dev->hard_header!=NULL
113 mac_header -> ll header
116 Outgoing, dev->hard_header!=NULL
117 mac_header -> ll header
120 Incoming, dev->hard_header==NULL
121 mac_header -> UNKNOWN position. It very likely points to the ll
122 header. PPP does this, which is wrong, because it introduces
123 asymmetry between the rx and tx paths.
126 Outgoing, dev->hard_header==NULL
127 mac_header -> data. ll header is still not built!
131 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
137 dev->hard_header != NULL
138 mac_header -> ll header
141 dev->hard_header == NULL (ll header is added by device, we cannot control it)
145 We should set nh.raw on output to the correct position;
146 the packet classifier depends on it.
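As a rough illustration of the SOCK_RAW/SOCK_DGRAM split described above, a
minimal user-space sketch (not part of this file; error handling omitted,
CAP_NET_RAW required) looks roughly like this: on the raw socket the receive
buffer starts with the ll header, while on the dgram socket it starts at the
network header and the ll address is reported through sockaddr_ll instead.

	int raw = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
	int dgr = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
	unsigned char buf[2048];
	struct sockaddr_ll sll;
	socklen_t slen = sizeof(sll);

	recvfrom(raw, buf, sizeof(buf), 0, (struct sockaddr *)&sll, &slen);
	recvfrom(dgr, buf, sizeof(buf), 0, (struct sockaddr *)&sll, &slen);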
149 /* Private packet socket structures. */
151 /* identical to struct packet_mreq except it has
152 * a longer address field.
154 struct packet_mreq_max {
156 unsigned short mr_type;
157 unsigned short mr_alen;
158 unsigned char mr_address[MAX_ADDR_LEN];
162 struct tpacket_hdr *h1;
163 struct tpacket2_hdr *h2;
164 struct tpacket3_hdr *h3;
168 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
169 int closing, int tx_ring);
171 #define V3_ALIGNMENT (8)
173 #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
175 #define BLK_PLUS_PRIV(sz_of_priv) \
176 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
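/* Illustrative arithmetic only: with V3_ALIGNMENT == 8, a hypothetical private
 * area of 13 bytes is padded to ALIGN(13, 8) == 16, so BLK_PLUS_PRIV(13) ==
 * BLK_HDR_LEN + 16; prb_open_block() below places the first frame of each
 * block at exactly this offset (nxt_offset).
 */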
178 #define PGV_FROM_VMALLOC 1
180 #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
181 #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
182 #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
183 #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
184 #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
185 #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
186 #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
189 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
190 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
191 struct packet_type *pt, struct net_device *orig_dev);
193 static void *packet_previous_frame(struct packet_sock *po,
194 struct packet_ring_buffer *rb,
196 static void packet_increment_head(struct packet_ring_buffer *buff);
197 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
198 struct tpacket_block_desc *);
199 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
200 struct packet_sock *);
201 static void prb_retire_current_block(struct tpacket_kbdq_core *,
202 struct packet_sock *, unsigned int status);
203 static int prb_queue_frozen(struct tpacket_kbdq_core *);
204 static void prb_open_block(struct tpacket_kbdq_core *,
205 struct tpacket_block_desc *);
206 static void prb_retire_rx_blk_timer_expired(unsigned long);
207 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
208 static void prb_init_blk_timer(struct packet_sock *,
209 struct tpacket_kbdq_core *,
210 void (*func) (unsigned long));
211 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
212 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
213 struct tpacket3_hdr *);
214 static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
215 struct tpacket3_hdr *);
216 static void packet_flush_mclist(struct sock *sk);
218 struct packet_skb_cb {
220 struct sockaddr_pkt pkt;
222 /* Trick: alias skb original length with
223 * ll.sll_family and ll.sll_protocol in order to save room.
226 unsigned int origlen;
227 struct sockaddr_ll ll;
232 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
234 #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
235 #define GET_PBLOCK_DESC(x, bid) \
236 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
237 #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
238 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
239 #define GET_NEXT_PRB_BLK_NUM(x) \
240 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
241 ((x)->kactive_blk_num+1) : 0)
243 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
244 static void __fanout_link(struct sock *sk, struct packet_sock *po);
246 static int packet_direct_xmit(struct sk_buff *skb)
248 struct net_device *dev = skb->dev;
249 netdev_features_t features;
250 struct netdev_queue *txq;
251 int ret = NETDEV_TX_BUSY;
253 if (unlikely(!netif_running(dev) ||
254 !netif_carrier_ok(dev)))
257 features = netif_skb_features(skb);
258 if (skb_needs_linearize(skb, features) &&
259 __skb_linearize(skb))
262 txq = skb_get_tx_queue(dev, skb);
266 HARD_TX_LOCK(dev, txq, smp_processor_id());
267 if (!netif_xmit_frozen_or_drv_stopped(txq))
268 ret = netdev_start_xmit(skb, dev, txq, false);
269 HARD_TX_UNLOCK(dev, txq);
273 if (!dev_xmit_complete(ret))
278 atomic_long_inc(&dev->tx_dropped);
280 return NET_XMIT_DROP;
283 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
285 struct net_device *dev;
288 dev = rcu_dereference(po->cached_dev);
296 static void packet_cached_dev_assign(struct packet_sock *po,
297 struct net_device *dev)
299 rcu_assign_pointer(po->cached_dev, dev);
302 static void packet_cached_dev_reset(struct packet_sock *po)
304 RCU_INIT_POINTER(po->cached_dev, NULL);
307 static bool packet_use_direct_xmit(const struct packet_sock *po)
309 return po->xmit == packet_direct_xmit;
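/* po->xmit is pointed at packet_direct_xmit() when the socket opts out of the
 * qdisc layer; from user space that is requested roughly as follows
 * (illustrative sketch only, CAP_NET_RAW required):
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
 */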
312 static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
314 return (u16) raw_smp_processor_id() % dev->real_num_tx_queues;
317 static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb)
319 const struct net_device_ops *ops = dev->netdev_ops;
322 if (ops->ndo_select_queue) {
323 queue_index = ops->ndo_select_queue(dev, skb, NULL,
324 __packet_pick_tx_queue);
325 queue_index = netdev_cap_txqueue(dev, queue_index);
327 queue_index = __packet_pick_tx_queue(dev, skb);
330 skb_set_queue_mapping(skb, queue_index);
333 /* register_prot_hook must be invoked with the po->bind_lock held,
334 * or from a context in which asynchronous accesses to the packet
335 * socket is not possible (packet_create()).
337 static void register_prot_hook(struct sock *sk)
339 struct packet_sock *po = pkt_sk(sk);
343 __fanout_link(sk, po);
345 dev_add_pack(&po->prot_hook);
352 /* {,__}unregister_prot_hook() must be invoked with the po->bind_lock
353 * held. If the sync parameter is true, we will temporarily drop
354 * the po->bind_lock and do a synchronize_net to make sure no
355 * asynchronous packet processing paths still refer to the elements
356 * of po->prot_hook. If the sync parameter is false, it is the
357 * callers responsibility to take care of this.
359 static void __unregister_prot_hook(struct sock *sk, bool sync)
361 struct packet_sock *po = pkt_sk(sk);
366 __fanout_unlink(sk, po);
368 __dev_remove_pack(&po->prot_hook);
373 spin_unlock(&po->bind_lock);
375 spin_lock(&po->bind_lock);
379 static void unregister_prot_hook(struct sock *sk, bool sync)
381 struct packet_sock *po = pkt_sk(sk);
384 __unregister_prot_hook(sk, sync);
387 static inline struct page * __pure pgv_to_page(void *addr)
389 if (is_vmalloc_addr(addr))
390 return vmalloc_to_page(addr);
391 return virt_to_page(addr);
394 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
396 union tpacket_uhdr h;
399 switch (po->tp_version) {
401 h.h1->tp_status = status;
402 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
405 h.h2->tp_status = status;
406 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
410 WARN(1, "TPACKET version not supported.\n");
417 static int __packet_get_status(struct packet_sock *po, void *frame)
419 union tpacket_uhdr h;
424 switch (po->tp_version) {
426 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
427 return h.h1->tp_status;
429 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
430 return h.h2->tp_status;
433 WARN(1, "TPACKET version not supported.\n");
439 static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
442 struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
445 (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
446 ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
447 return TP_STATUS_TS_RAW_HARDWARE;
449 if (ktime_to_timespec_cond(skb->tstamp, ts))
450 return TP_STATUS_TS_SOFTWARE;
455 static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
458 union tpacket_uhdr h;
462 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
466 switch (po->tp_version) {
468 h.h1->tp_sec = ts.tv_sec;
469 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
472 h.h2->tp_sec = ts.tv_sec;
473 h.h2->tp_nsec = ts.tv_nsec;
477 WARN(1, "TPACKET version not supported.\n");
481 /* one flush is safe, as both fields always lie on the same cacheline */
482 flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
488 static void *packet_lookup_frame(struct packet_sock *po,
489 struct packet_ring_buffer *rb,
490 unsigned int position,
493 unsigned int pg_vec_pos, frame_offset;
494 union tpacket_uhdr h;
496 pg_vec_pos = position / rb->frames_per_block;
497 frame_offset = position % rb->frames_per_block;
499 h.raw = rb->pg_vec[pg_vec_pos].buffer +
500 (frame_offset * rb->frame_size);
502 if (status != __packet_get_status(po, h.raw))
508 static void *packet_current_frame(struct packet_sock *po,
509 struct packet_ring_buffer *rb,
512 return packet_lookup_frame(po, rb, rb->head, status);
515 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
517 del_timer_sync(&pkc->retire_blk_timer);
520 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
522 struct sk_buff_head *rb_queue)
524 struct tpacket_kbdq_core *pkc;
526 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
527 GET_PBDQC_FROM_RB(&po->rx_ring);
529 spin_lock_bh(&rb_queue->lock);
530 pkc->delete_blk_timer = 1;
531 spin_unlock_bh(&rb_queue->lock);
533 prb_del_retire_blk_timer(pkc);
536 static void prb_init_blk_timer(struct packet_sock *po,
537 struct tpacket_kbdq_core *pkc,
538 void (*func) (unsigned long))
540 init_timer(&pkc->retire_blk_timer);
541 pkc->retire_blk_timer.data = (long)po;
542 pkc->retire_blk_timer.function = func;
543 pkc->retire_blk_timer.expires = jiffies;
546 static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
548 struct tpacket_kbdq_core *pkc;
553 pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) :
554 GET_PBDQC_FROM_RB(&po->rx_ring);
555 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
558 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
559 int blk_size_in_bytes)
561 struct net_device *dev;
562 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
563 struct ethtool_cmd ecmd;
568 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
569 if (unlikely(!dev)) {
571 return DEFAULT_PRB_RETIRE_TOV;
573 err = __ethtool_get_settings(dev, &ecmd);
574 speed = ethtool_cmd_speed(&ecmd);
578 * If the link speed is that slow, you don't really
579 * need to worry about perf anyway
581 if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
582 return DEFAULT_PRB_RETIRE_TOV;
589 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
601 static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
602 union tpacket_req_u *req_u)
604 p1->feature_req_word = req_u->req3.tp_feature_req_word;
607 static void init_prb_bdqc(struct packet_sock *po,
608 struct packet_ring_buffer *rb,
610 union tpacket_req_u *req_u, int tx_ring)
612 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
613 struct tpacket_block_desc *pbd;
615 memset(p1, 0x0, sizeof(*p1));
617 p1->knxt_seq_num = 1;
619 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
620 p1->pkblk_start = pg_vec[0].buffer;
621 p1->kblk_size = req_u->req3.tp_block_size;
622 p1->knum_blocks = req_u->req3.tp_block_nr;
623 p1->hdrlen = po->tp_hdrlen;
624 p1->version = po->tp_version;
625 p1->last_kactive_blk_num = 0;
626 po->stats.stats3.tp_freeze_q_cnt = 0;
627 if (req_u->req3.tp_retire_blk_tov)
628 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
630 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
631 req_u->req3.tp_block_size);
632 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
633 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
635 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
636 prb_init_ft_ops(p1, req_u);
637 prb_setup_retire_blk_timer(po, tx_ring);
638 prb_open_block(p1, pbd);
641 /* Do NOT update the last_blk_num first.
642 * Assumes sk_buff_head lock is held.
644 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
646 mod_timer(&pkc->retire_blk_timer,
647 jiffies + pkc->tov_in_jiffies);
648 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
653 * 1) We refresh the timer only when we open a block.
654 * By doing this we don't waste cycles refreshing the timer
655 * on a packet-by-packet basis.
657 * With a 1MB block-size, on a 1Gbps line, it will take
658 * i) ~8 ms to fill a block + ii) memcpy etc.
659 * In this cut we are not accounting for the memcpy time.
661 * So, if the user sets the 'tmo' to 10ms then the timer
662 * will never fire while the block is still getting filled
663 * (which is what we want). However, the user could choose
664 * to close a block early and that's fine.
666 * But when the timer does fire, we check whether or not to refresh it.
667 * Since the tmo granularity is in msecs, it is not too expensive
668 * to refresh the timer, let's say every '8' msecs.
669 * Either the user can set the 'tmo' or we can derive it based on
670 * a) line-speed and b) block-size.
671 * prb_calc_retire_blk_tmo() calculates the tmo.
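 *
 * A quick sanity check of the 1MB/1Gbps figure above (illustrative
 * arithmetic only): 1MB is roughly 8 Mbit, and at ~1000 Mbit/s that is
 * 8 Mbit / 1000 Mbit/s, i.e. about 8 ms to fill the block, which is what
 * prb_calc_retire_blk_tmo() derives from the block size and link speed.
 */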
674 static void prb_retire_rx_blk_timer_expired(unsigned long data)
676 struct packet_sock *po = (struct packet_sock *)data;
677 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
679 struct tpacket_block_desc *pbd;
681 spin_lock(&po->sk.sk_receive_queue.lock);
683 frozen = prb_queue_frozen(pkc);
684 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
686 if (unlikely(pkc->delete_blk_timer))
689 /* We only need to plug the race when the block is partially filled.
691 * lock(); increment BLOCK_NUM_PKTS; unlock()
692 * copy_bits() is in progress ...
693 * timer fires on other cpu:
694 * we can't retire the current block because copy_bits
698 if (BLOCK_NUM_PKTS(pbd)) {
699 while (atomic_read(&pkc->blk_fill_in_prog)) {
700 /* Waiting for skb_copy_bits to finish... */
705 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
707 if (!BLOCK_NUM_PKTS(pbd)) {
708 /* An empty block. Just refresh the timer. */
711 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
712 if (!prb_dispatch_next_block(pkc, po))
717 /* Case 1. Queue was frozen because user-space was
720 if (prb_curr_blk_in_use(pkc, pbd)) {
722 * Ok, user-space is still behind.
723 * So just refresh the timer.
727 /* Case 2. The queue was frozen, user-space caught up,
728 * now the link went idle && the timer fired.
729 * We don't have a block to close. So we open this
730 * block and restart the timer.
731 * Opening a block thaws the queue and restarts the timer.
732 * Thawing/timer-refresh is a side effect.
734 prb_open_block(pkc, pbd);
741 _prb_refresh_rx_retire_blk_timer(pkc);
744 spin_unlock(&po->sk.sk_receive_queue.lock);
747 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
748 struct tpacket_block_desc *pbd1, __u32 status)
750 /* Flush everything minus the block header */
752 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
757 /* Skip the block header (we know the header WILL fit in 4K) */
760 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
761 for (; start < end; start += PAGE_SIZE)
762 flush_dcache_page(pgv_to_page(start));
767 /* Now update the block status. */
769 BLOCK_STATUS(pbd1) = status;
771 /* Flush the block header */
773 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
775 flush_dcache_page(pgv_to_page(start));
785 * 2) Increment active_blk_num
787 * Note: We DON'T refresh the timer on purpose.
788 * Because almost always the next block will be opened.
790 static void prb_close_block(struct tpacket_kbdq_core *pkc1,
791 struct tpacket_block_desc *pbd1,
792 struct packet_sock *po, unsigned int stat)
794 __u32 status = TP_STATUS_USER | stat;
796 struct tpacket3_hdr *last_pkt;
797 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
798 struct sock *sk = &po->sk;
800 if (po->stats.stats3.tp_drops)
801 status |= TP_STATUS_LOSING;
803 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
804 last_pkt->tp_next_offset = 0;
806 /* Get the ts of the last pkt */
807 if (BLOCK_NUM_PKTS(pbd1)) {
808 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
809 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
811 /* Ok, we tmo'd - so get the current time.
813 * It shouldn't really happen as we don't close empty
814 * blocks. See prb_retire_rx_blk_timer_expired().
818 h1->ts_last_pkt.ts_sec = ts.tv_sec;
819 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
824 /* Flush the block */
825 prb_flush_block(pkc1, pbd1, status);
827 sk->sk_data_ready(sk);
829 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
832 static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
834 pkc->reset_pending_on_curr_blk = 0;
838 * Side effect of opening a block:
840 * 1) prb_queue is thawed.
841 * 2) retire_blk_timer is refreshed.
844 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
845 struct tpacket_block_desc *pbd1)
848 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
852 /* We could have just memset this but we would lose the
853 * flexibility of making the priv area sticky
856 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
857 BLOCK_NUM_PKTS(pbd1) = 0;
858 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
862 h1->ts_first_pkt.ts_sec = ts.tv_sec;
863 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
865 pkc1->pkblk_start = (char *)pbd1;
866 pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
868 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
869 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
871 pbd1->version = pkc1->version;
872 pkc1->prev = pkc1->nxt_offset;
873 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
875 prb_thaw_queue(pkc1);
876 _prb_refresh_rx_retire_blk_timer(pkc1);
882 * Queue freeze logic:
883 * 1) Assume tp_block_nr = 8 blocks.
884 * 2) At time 't0', user opens Rx ring.
885 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
886 * 4) user-space is either sleeping or processing block '0'.
887 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
888 * it will close block-7, loop around and try to fill block '0'.
890 * __packet_lookup_frame_in_block
891 * prb_retire_current_block()
892 * prb_dispatch_next_block()
893 * |->(BLOCK_STATUS == USER) evaluates to true
894 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
895 * 6) Now there are two cases:
896 * 6.1) Link goes idle right after the queue is frozen.
897 * But remember, the last open_block() refreshed the timer.
898 * When this timer expires, it will refresh itself so that we can
899 * re-open block-0 in near future.
900 * 6.2) Link is busy and keeps on receiving packets. This is a simple
901 * case and __packet_lookup_frame_in_block will check if block-0
902 * is free and can now be re-used.
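 *
 * From user space the same handshake looks roughly like this (illustrative
 * sketch only; assumes ring points at a PACKET_RX_RING mapped with
 * TPACKET_V3, and walk_frames() is a hypothetical helper that follows the
 * tp_next_offset chain inside the block):
 *
 *	struct tpacket_block_desc *bd =
 *		(void *)(ring + i * req.tp_block_size);
 *
 *	poll(&pfd, 1, -1);
 *	if (bd->hdr.bh1.block_status & TP_STATUS_USER) {
 *		walk_frames(bd);
 *		bd->hdr.bh1.block_status = TP_STATUS_KERNEL;
 *	}
 *
 * Holding blocks in TP_STATUS_USER for too long is exactly the
 * "user-space lagging behind" case that freezes the queue above.
 */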
904 static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
905 struct packet_sock *po)
907 pkc->reset_pending_on_curr_blk = 1;
908 po->stats.stats3.tp_freeze_q_cnt++;
911 #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
914 * If the next block is free then we will dispatch it
915 * and return a good offset.
916 * Else, we will freeze the queue.
917 * So, caller must check the return value.
919 static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
920 struct packet_sock *po)
922 struct tpacket_block_desc *pbd;
926 /* 1. Get current block num */
927 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
929 /* 2. If this block is currently in_use then freeze the queue */
930 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
931 prb_freeze_queue(pkc, po);
937 * open this block and return the offset where the first packet
938 * needs to get stored.
940 prb_open_block(pkc, pbd);
941 return (void *)pkc->nxt_offset;
944 static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
945 struct packet_sock *po, unsigned int status)
947 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
949 /* retire/close the current block */
950 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
952 * Plug the case where copy_bits() is in progress on
953 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
954 * have space to copy the pkt in the current block and
955 * called prb_retire_current_block()
957 * We don't need to worry about the TMO case because
958 * the timer-handler already handled this case.
960 if (!(status & TP_STATUS_BLK_TMO)) {
961 while (atomic_read(&pkc->blk_fill_in_prog)) {
962 /* Waiting for skb_copy_bits to finish... */
966 prb_close_block(pkc, pbd, po, status);
971 static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
972 struct tpacket_block_desc *pbd)
974 return TP_STATUS_USER & BLOCK_STATUS(pbd);
977 static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
979 return pkc->reset_pending_on_curr_blk;
982 static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
984 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
985 atomic_dec(&pkc->blk_fill_in_prog);
988 static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
989 struct tpacket3_hdr *ppd)
991 ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
994 static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
995 struct tpacket3_hdr *ppd)
997 ppd->hv1.tp_rxhash = 0;
1000 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
1001 struct tpacket3_hdr *ppd)
1003 if (skb_vlan_tag_present(pkc->skb)) {
1004 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
1005 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
1006 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
1008 ppd->hv1.tp_vlan_tci = 0;
1009 ppd->hv1.tp_vlan_tpid = 0;
1010 ppd->tp_status = TP_STATUS_AVAILABLE;
1014 static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
1015 struct tpacket3_hdr *ppd)
1017 ppd->hv1.tp_padding = 0;
1018 prb_fill_vlan_info(pkc, ppd);
1020 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
1021 prb_fill_rxhash(pkc, ppd);
1023 prb_clear_rxhash(pkc, ppd);
1026 static void prb_fill_curr_block(char *curr,
1027 struct tpacket_kbdq_core *pkc,
1028 struct tpacket_block_desc *pbd,
1031 struct tpacket3_hdr *ppd;
1033 ppd = (struct tpacket3_hdr *)curr;
1034 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1036 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1037 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1038 BLOCK_NUM_PKTS(pbd) += 1;
1039 atomic_inc(&pkc->blk_fill_in_prog);
1040 prb_run_all_ft_ops(pkc, ppd);
1043 /* Assumes caller has the sk->rx_queue.lock */
1044 static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1045 struct sk_buff *skb,
1050 struct tpacket_kbdq_core *pkc;
1051 struct tpacket_block_desc *pbd;
1054 pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
1055 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1057 /* Queue is frozen when user space is lagging behind */
1058 if (prb_queue_frozen(pkc)) {
1060 * Check if that last block, which caused the queue to freeze,
1061 * is still in_use by user-space.
1063 if (prb_curr_blk_in_use(pkc, pbd)) {
1064 /* Can't record this packet */
1068 * Ok, the block was released by user-space.
1069 * Now let's open that block.
1070 * Opening a block also thaws the queue.
1071 * Thawing is a side effect.
1073 prb_open_block(pkc, pbd);
1078 curr = pkc->nxt_offset;
1080 end = (char *)pbd + pkc->kblk_size;
1082 /* first try the current block */
1083 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1084 prb_fill_curr_block(curr, pkc, pbd, len);
1085 return (void *)curr;
1088 /* Ok, close the current block */
1089 prb_retire_current_block(pkc, po, 0);
1091 /* Now, try to dispatch the next block */
1092 curr = (char *)prb_dispatch_next_block(pkc, po);
1094 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1095 prb_fill_curr_block(curr, pkc, pbd, len);
1096 return (void *)curr;
1100 * No free blocks are available. User-space hasn't caught up yet.
1101 * The queue was just frozen and now this packet will get dropped.
1106 static void *packet_current_rx_frame(struct packet_sock *po,
1107 struct sk_buff *skb,
1108 int status, unsigned int len)
1111 switch (po->tp_version) {
1114 curr = packet_lookup_frame(po, &po->rx_ring,
1115 po->rx_ring.head, status);
1118 return __packet_lookup_frame_in_block(po, skb, status, len);
1120 WARN(1, "TPACKET version not supported\n");
1126 static void *prb_lookup_block(struct packet_sock *po,
1127 struct packet_ring_buffer *rb,
1131 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1132 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
1134 if (status != BLOCK_STATUS(pbd))
1139 static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1142 if (rb->prb_bdqc.kactive_blk_num)
1143 prev = rb->prb_bdqc.kactive_blk_num-1;
1145 prev = rb->prb_bdqc.knum_blocks-1;
1149 /* Assumes caller has held the rx_queue.lock */
1150 static void *__prb_previous_block(struct packet_sock *po,
1151 struct packet_ring_buffer *rb,
1154 unsigned int previous = prb_previous_blk_num(rb);
1155 return prb_lookup_block(po, rb, previous, status);
1158 static void *packet_previous_rx_frame(struct packet_sock *po,
1159 struct packet_ring_buffer *rb,
1162 if (po->tp_version <= TPACKET_V2)
1163 return packet_previous_frame(po, rb, status);
1165 return __prb_previous_block(po, rb, status);
1168 static void packet_increment_rx_head(struct packet_sock *po,
1169 struct packet_ring_buffer *rb)
1171 switch (po->tp_version) {
1174 return packet_increment_head(rb);
1177 WARN(1, "TPACKET version not supported.\n");
1183 static void *packet_previous_frame(struct packet_sock *po,
1184 struct packet_ring_buffer *rb,
1187 unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
1188 return packet_lookup_frame(po, rb, previous, status);
1191 static void packet_increment_head(struct packet_ring_buffer *buff)
1193 buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
1196 static void packet_inc_pending(struct packet_ring_buffer *rb)
1198 this_cpu_inc(*rb->pending_refcnt);
1201 static void packet_dec_pending(struct packet_ring_buffer *rb)
1203 this_cpu_dec(*rb->pending_refcnt);
1206 static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
1208 unsigned int refcnt = 0;
1211 /* We don't use pending refcount in rx_ring. */
1212 if (rb->pending_refcnt == NULL)
1215 for_each_possible_cpu(cpu)
1216 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
1221 static int packet_alloc_pending(struct packet_sock *po)
1223 po->rx_ring.pending_refcnt = NULL;
1225 po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
1226 if (unlikely(po->tx_ring.pending_refcnt == NULL))
1232 static void packet_free_pending(struct packet_sock *po)
1234 free_percpu(po->tx_ring.pending_refcnt);
1237 #define ROOM_POW_OFF 2
1238 #define ROOM_NONE 0x0
1239 #define ROOM_LOW 0x1
1240 #define ROOM_NORMAL 0x2
1242 static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
1246 len = po->rx_ring.frame_max + 1;
1247 idx = po->rx_ring.head;
1249 idx += len >> pow_off;
1252 return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1255 static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
1259 len = po->rx_ring.prb_bdqc.knum_blocks;
1260 idx = po->rx_ring.prb_bdqc.kactive_blk_num;
1262 idx += len >> pow_off;
1265 return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
1268 static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1270 struct sock *sk = &po->sk;
1271 int ret = ROOM_NONE;
1273 if (po->prot_hook.func != tpacket_rcv) {
1274 int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
1275 - (skb ? skb->truesize : 0);
1276 if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
1284 if (po->tp_version == TPACKET_V3) {
1285 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
1287 else if (__tpacket_v3_has_room(po, 0))
1290 if (__tpacket_has_room(po, ROOM_POW_OFF))
1292 else if (__tpacket_has_room(po, 0))
1299 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
1304 if (po->prot_hook.func == tpacket_rcv) {
1305 spin_lock(&po->sk.sk_receive_queue.lock);
1306 ret = __packet_rcv_has_room(po, skb);
1307 spin_unlock(&po->sk.sk_receive_queue.lock);
1309 ret = __packet_rcv_has_room(po, skb);
1312 has_room = ret == ROOM_NORMAL;
1313 if (po->pressure == has_room)
1314 xchg(&po->pressure, !has_room);
1319 static void packet_sock_destruct(struct sock *sk)
1321 skb_queue_purge(&sk->sk_error_queue);
1323 WARN_ON(atomic_read(&sk->sk_rmem_alloc));
1324 WARN_ON(atomic_read(&sk->sk_wmem_alloc));
1326 if (!sock_flag(sk, SOCK_DEAD)) {
1327 pr_err("Attempt to release alive packet socket: %p\n", sk);
1331 sk_refcnt_debug_dec(sk);
1334 static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
1336 int x = atomic_read(&f->rr_cur) + 1;
1344 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
1349 rxhash = skb_get_hash(skb);
1350 for (i = 0; i < ROLLOVER_HLEN; i++)
1351 if (po->rollover->history[i] == rxhash)
1354 po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
1355 return count > (ROLLOVER_HLEN >> 1);
1358 static unsigned int fanout_demux_hash(struct packet_fanout *f,
1359 struct sk_buff *skb,
1362 return reciprocal_scale(skb_get_hash(skb), num);
1365 static unsigned int fanout_demux_lb(struct packet_fanout *f,
1366 struct sk_buff *skb,
1371 cur = atomic_read(&f->rr_cur);
1372 while ((old = atomic_cmpxchg(&f->rr_cur, cur,
1373 fanout_rr_next(f, num))) != cur)
1378 static unsigned int fanout_demux_cpu(struct packet_fanout *f,
1379 struct sk_buff *skb,
1382 return smp_processor_id() % num;
1385 static unsigned int fanout_demux_rnd(struct packet_fanout *f,
1386 struct sk_buff *skb,
1389 return prandom_u32_max(num);
1392 static unsigned int fanout_demux_rollover(struct packet_fanout *f,
1393 struct sk_buff *skb,
1394 unsigned int idx, bool try_self,
1397 struct packet_sock *po, *po_next;
1398 unsigned int i, j, room;
1400 po = pkt_sk(f->arr[idx]);
1403 room = packet_rcv_has_room(po, skb);
1404 if (room == ROOM_NORMAL ||
1405 (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
1409 i = j = min_t(int, po->rollover->sock, num - 1);
1411 po_next = pkt_sk(f->arr[i]);
1412 if (po_next != po && !po_next->pressure &&
1413 packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
1415 po->rollover->sock = i;
1426 static unsigned int fanout_demux_qm(struct packet_fanout *f,
1427 struct sk_buff *skb,
1430 return skb_get_queue_mapping(skb) % num;
1433 static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
1435 return f->flags & (flag >> 8);
1438 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
1439 struct packet_type *pt, struct net_device *orig_dev)
1441 struct packet_fanout *f = pt->af_packet_priv;
1442 unsigned int num = f->num_members;
1443 struct packet_sock *po;
1446 if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
1452 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
1453 skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
1458 case PACKET_FANOUT_HASH:
1460 idx = fanout_demux_hash(f, skb, num);
1462 case PACKET_FANOUT_LB:
1463 idx = fanout_demux_lb(f, skb, num);
1465 case PACKET_FANOUT_CPU:
1466 idx = fanout_demux_cpu(f, skb, num);
1468 case PACKET_FANOUT_RND:
1469 idx = fanout_demux_rnd(f, skb, num);
1471 case PACKET_FANOUT_QM:
1472 idx = fanout_demux_qm(f, skb, num);
1474 case PACKET_FANOUT_ROLLOVER:
1475 idx = fanout_demux_rollover(f, skb, 0, false, num);
1479 if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
1480 idx = fanout_demux_rollover(f, skb, idx, true, num);
1482 po = pkt_sk(f->arr[idx]);
1483 return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
1486 DEFINE_MUTEX(fanout_mutex);
1487 EXPORT_SYMBOL_GPL(fanout_mutex);
1488 static LIST_HEAD(fanout_list);
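/* Fanout group membership is requested from user space with PACKET_FANOUT;
 * an illustrative sketch (group id and mode are arbitrary example values):
 *
 *	int val = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * Every socket bound with the same id ends up in one struct packet_fanout
 * and is demultiplexed by packet_rcv_fanout() above.
 */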
1490 static void __fanout_link(struct sock *sk, struct packet_sock *po)
1492 struct packet_fanout *f = po->fanout;
1494 spin_lock(&f->lock);
1495 f->arr[f->num_members] = sk;
1498 spin_unlock(&f->lock);
1501 static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1503 struct packet_fanout *f = po->fanout;
1506 spin_lock(&f->lock);
1507 for (i = 0; i < f->num_members; i++) {
1508 if (f->arr[i] == sk)
1511 BUG_ON(i >= f->num_members);
1512 f->arr[i] = f->arr[f->num_members - 1];
1514 spin_unlock(&f->lock);
1517 static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
1519 if (ptype->af_packet_priv == (void *)((struct packet_sock *)sk)->fanout)
1525 static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1527 struct packet_sock *po = pkt_sk(sk);
1528 struct packet_fanout *f, *match;
1529 u8 type = type_flags & 0xff;
1530 u8 flags = type_flags >> 8;
1534 case PACKET_FANOUT_ROLLOVER:
1535 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
1537 case PACKET_FANOUT_HASH:
1538 case PACKET_FANOUT_LB:
1539 case PACKET_FANOUT_CPU:
1540 case PACKET_FANOUT_RND:
1541 case PACKET_FANOUT_QM:
1553 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) {
1554 po->rollover = kzalloc(sizeof(*po->rollover), GFP_KERNEL);
1559 mutex_lock(&fanout_mutex);
1561 list_for_each_entry(f, &fanout_list, list) {
1563 read_pnet(&f->net) == sock_net(sk)) {
1569 if (match && match->flags != flags)
1573 match = kzalloc(sizeof(*match), GFP_KERNEL);
1576 write_pnet(&match->net, sock_net(sk));
1579 match->flags = flags;
1580 atomic_set(&match->rr_cur, 0);
1581 INIT_LIST_HEAD(&match->list);
1582 spin_lock_init(&match->lock);
1583 atomic_set(&match->sk_ref, 0);
1584 match->prot_hook.type = po->prot_hook.type;
1585 match->prot_hook.dev = po->prot_hook.dev;
1586 match->prot_hook.func = packet_rcv_fanout;
1587 match->prot_hook.af_packet_priv = match;
1588 match->prot_hook.id_match = match_fanout_group;
1589 dev_add_pack(&match->prot_hook);
1590 list_add(&match->list, &fanout_list);
1593 if (match->type == type &&
1594 match->prot_hook.type == po->prot_hook.type &&
1595 match->prot_hook.dev == po->prot_hook.dev) {
1597 if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
1598 __dev_remove_pack(&po->prot_hook);
1600 atomic_inc(&match->sk_ref);
1601 __fanout_link(sk, po);
1606 mutex_unlock(&fanout_mutex);
1608 kfree(po->rollover);
1609 po->rollover = NULL;
1614 static void fanout_release(struct sock *sk)
1616 struct packet_sock *po = pkt_sk(sk);
1617 struct packet_fanout *f;
1623 mutex_lock(&fanout_mutex);
1626 if (atomic_dec_and_test(&f->sk_ref)) {
1628 dev_remove_pack(&f->prot_hook);
1631 mutex_unlock(&fanout_mutex);
1633 kfree(po->rollover);
1636 static const struct proto_ops packet_ops;
1638 static const struct proto_ops packet_ops_spkt;
1640 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
1641 struct packet_type *pt, struct net_device *orig_dev)
1644 struct sockaddr_pkt *spkt;
1647 * When we registered the protocol we saved the socket in the data
1648 * field for just this event.
1651 sk = pt->af_packet_priv;
1654 * Yank back the headers [hope the device set this
1655 * right or kerboom...]
1657 * Incoming packets have ll header pulled,
1660 * For outgoing ones skb->data == skb_mac_header(skb)
1661 * so that this procedure is a no-op.
1664 if (skb->pkt_type == PACKET_LOOPBACK)
1667 if (!net_eq(dev_net(dev), sock_net(sk)))
1670 skb = skb_share_check(skb, GFP_ATOMIC);
1674 /* drop any routing info */
1677 /* drop conntrack reference */
1680 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
1682 skb_push(skb, skb->data - skb_mac_header(skb));
1685 * The SOCK_PACKET socket receives _all_ frames.
1688 spkt->spkt_family = dev->type;
1689 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
1690 spkt->spkt_protocol = skb->protocol;
1693 * Charge the memory to the socket. This is done specifically
1694 * to prevent sockets from using up all the memory.
1697 if (sock_queue_rcv_skb(sk, skb) == 0)
1708 * Output a raw packet to a device layer. This bypasses all the other
1709 * protocol layers and you must therefore supply it with a complete frame
1712 static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
1715 struct sock *sk = sock->sk;
1716 DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
1717 struct sk_buff *skb = NULL;
1718 struct net_device *dev;
1724 * Get and verify the address.
1728 if (msg->msg_namelen < sizeof(struct sockaddr))
1730 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
1731 proto = saddr->spkt_protocol;
1733 return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */
1736 * Find the device first to size check it
1739 saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
1742 dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
1748 if (!(dev->flags & IFF_UP))
1752 * You may not queue a frame bigger than the mtu. This is the lowest level
1753 * raw protocol and you must do your own fragmentation at this level.
1756 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
1757 if (!netif_supports_nofcs(dev)) {
1758 err = -EPROTONOSUPPORT;
1761 extra_len = 4; /* We're doing our own CRC */
1765 if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
1769 size_t reserved = LL_RESERVED_SPACE(dev);
1770 int tlen = dev->needed_tailroom;
1771 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
1774 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
1777 /* FIXME: Save some space for broken drivers that write a hard
1778 * header at transmission time by themselves. PPP is the notable
1779 * one here. This should really be fixed at the driver level.
1781 skb_reserve(skb, reserved);
1782 skb_reset_network_header(skb);
1784 /* Try to align data part correctly */
1789 skb_reset_network_header(skb);
1791 err = memcpy_from_msg(skb_put(skb, len), msg, len);
1797 if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
1798 /* Earlier code assumed this would be a VLAN pkt,
1799 * double-check this now that we have the actual
1802 struct ethhdr *ehdr;
1803 skb_reset_mac_header(skb);
1804 ehdr = eth_hdr(skb);
1805 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
1811 skb->protocol = proto;
1813 skb->priority = sk->sk_priority;
1814 skb->mark = sk->sk_mark;
1816 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1818 if (unlikely(extra_len == 4))
1821 skb_probe_transport_header(skb, 0);
1823 dev_queue_xmit(skb);
1834 static unsigned int run_filter(const struct sk_buff *skb,
1835 const struct sock *sk,
1838 struct sk_filter *filter;
1841 filter = rcu_dereference(sk->sk_filter);
1843 res = SK_RUN_FILTER(filter, skb);
1850 * This function makes lazy skb cloning in the hope that most packets
1851 * are discarded by BPF.
1853 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
1854 * and skb->cb are mangled. It works because (and until) packets
1855 * falling here are owned by current CPU. Output packets are cloned
1856 * by dev_queue_xmit_nit(), input packets are processed by net_bh
1857 * sequentially, so that if we return the skb to its original state on exit,
1858 * we will not harm anyone.
1861 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
1862 struct packet_type *pt, struct net_device *orig_dev)
1865 struct sockaddr_ll *sll;
1866 struct packet_sock *po;
1867 u8 *skb_head = skb->data;
1868 int skb_len = skb->len;
1869 unsigned int snaplen, res;
1871 if (skb->pkt_type == PACKET_LOOPBACK)
1874 sk = pt->af_packet_priv;
1877 if (!net_eq(dev_net(dev), sock_net(sk)))
1882 if (dev->header_ops) {
1883 /* The device has an explicit notion of ll header,
1884 * exported to higher levels.
1886 * Otherwise, the device hides details of its frame
1887 * structure, so that the corresponding packet head is
1888 * never delivered to the user.
1890 if (sk->sk_type != SOCK_DGRAM)
1891 skb_push(skb, skb->data - skb_mac_header(skb));
1892 else if (skb->pkt_type == PACKET_OUTGOING) {
1893 /* Special case: outgoing packets have ll header at head */
1894 skb_pull(skb, skb_network_offset(skb));
1900 res = run_filter(skb, sk, snaplen);
1902 goto drop_n_restore;
1906 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
1909 if (skb_shared(skb)) {
1910 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
1914 if (skb_head != skb->data) {
1915 skb->data = skb_head;
1922 sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
1924 sll = &PACKET_SKB_CB(skb)->sa.ll;
1925 sll->sll_hatype = dev->type;
1926 sll->sll_pkttype = skb->pkt_type;
1927 if (unlikely(po->origdev))
1928 sll->sll_ifindex = orig_dev->ifindex;
1930 sll->sll_ifindex = dev->ifindex;
1932 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
1934 /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
1935 * Use their space for storing the original skb length.
1937 PACKET_SKB_CB(skb)->sa.origlen = skb->len;
1939 if (pskb_trim(skb, snaplen))
1942 skb_set_owner_r(skb, sk);
1946 /* drop conntrack reference */
1949 spin_lock(&sk->sk_receive_queue.lock);
1950 po->stats.stats1.tp_packets++;
1951 sock_skb_set_dropcount(sk, skb);
1952 __skb_queue_tail(&sk->sk_receive_queue, skb);
1953 spin_unlock(&sk->sk_receive_queue.lock);
1954 sk->sk_data_ready(sk);
1958 spin_lock(&sk->sk_receive_queue.lock);
1959 po->stats.stats1.tp_drops++;
1960 atomic_inc(&sk->sk_drops);
1961 spin_unlock(&sk->sk_receive_queue.lock);
1964 if (skb_head != skb->data && skb_shared(skb)) {
1965 skb->data = skb_head;
1973 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1974 struct packet_type *pt, struct net_device *orig_dev)
1977 struct packet_sock *po;
1978 struct sockaddr_ll *sll;
1979 union tpacket_uhdr h;
1980 u8 *skb_head = skb->data;
1981 int skb_len = skb->len;
1982 unsigned int snaplen, res;
1983 unsigned long status = TP_STATUS_USER;
1984 unsigned short macoff, netoff, hdrlen;
1985 struct sk_buff *copy_skb = NULL;
1989 /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
1990 * We may add members to them up to the current aligned size without forcing
1991 * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
1993 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
1994 BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
1996 if (skb->pkt_type == PACKET_LOOPBACK)
1999 sk = pt->af_packet_priv;
2002 if (!net_eq(dev_net(dev), sock_net(sk)))
2005 if (dev->header_ops) {
2006 if (sk->sk_type != SOCK_DGRAM)
2007 skb_push(skb, skb->data - skb_mac_header(skb));
2008 else if (skb->pkt_type == PACKET_OUTGOING) {
2009 /* Special case: outgoing packets have ll header at head */
2010 skb_pull(skb, skb_network_offset(skb));
2016 res = run_filter(skb, sk, snaplen);
2018 goto drop_n_restore;
2020 if (skb->ip_summed == CHECKSUM_PARTIAL)
2021 status |= TP_STATUS_CSUMNOTREADY;
2022 else if (skb->pkt_type != PACKET_OUTGOING &&
2023 (skb->ip_summed == CHECKSUM_COMPLETE ||
2024 skb_csum_unnecessary(skb)))
2025 status |= TP_STATUS_CSUM_VALID;
2030 if (sk->sk_type == SOCK_DGRAM) {
2031 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
2034 unsigned int maclen = skb_network_offset(skb);
2035 netoff = TPACKET_ALIGN(po->tp_hdrlen +
2036 (maclen < 16 ? 16 : maclen)) +
2038 macoff = netoff - maclen;
2040 if (po->tp_version <= TPACKET_V2) {
2041 if (macoff + snaplen > po->rx_ring.frame_size) {
2042 if (po->copy_thresh &&
2043 atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
2044 if (skb_shared(skb)) {
2045 copy_skb = skb_clone(skb, GFP_ATOMIC);
2047 copy_skb = skb_get(skb);
2048 skb_head = skb->data;
2051 skb_set_owner_r(copy_skb, sk);
2053 snaplen = po->rx_ring.frame_size - macoff;
2054 if ((int)snaplen < 0)
2057 } else if (unlikely(macoff + snaplen >
2058 GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
2061 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
2062 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
2063 snaplen, nval, macoff);
2065 if (unlikely((int)snaplen < 0)) {
2067 macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
2070 spin_lock(&sk->sk_receive_queue.lock);
2071 h.raw = packet_current_rx_frame(po, skb,
2072 TP_STATUS_KERNEL, (macoff+snaplen));
2075 if (po->tp_version <= TPACKET_V2) {
2076 packet_increment_rx_head(po, &po->rx_ring);
2078 * LOSING will be reported till you read the stats,
2079 * because it's COR - Clear On Read.
2080 * Anyway, moving it for V1/V2 only as V3 doesn't need this
2083 if (po->stats.stats1.tp_drops)
2084 status |= TP_STATUS_LOSING;
2086 po->stats.stats1.tp_packets++;
2088 status |= TP_STATUS_COPY;
2089 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
2091 spin_unlock(&sk->sk_receive_queue.lock);
2093 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
2095 if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
2096 getnstimeofday(&ts);
2098 status |= ts_status;
2100 switch (po->tp_version) {
2102 h.h1->tp_len = skb->len;
2103 h.h1->tp_snaplen = snaplen;
2104 h.h1->tp_mac = macoff;
2105 h.h1->tp_net = netoff;
2106 h.h1->tp_sec = ts.tv_sec;
2107 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
2108 hdrlen = sizeof(*h.h1);
2111 h.h2->tp_len = skb->len;
2112 h.h2->tp_snaplen = snaplen;
2113 h.h2->tp_mac = macoff;
2114 h.h2->tp_net = netoff;
2115 h.h2->tp_sec = ts.tv_sec;
2116 h.h2->tp_nsec = ts.tv_nsec;
2117 if (skb_vlan_tag_present(skb)) {
2118 h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
2119 h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
2120 status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
2122 h.h2->tp_vlan_tci = 0;
2123 h.h2->tp_vlan_tpid = 0;
2125 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
2126 hdrlen = sizeof(*h.h2);
2129 /* tp_next_offset and vlan are already populated above,
2130 * so DON'T clear those fields here
2132 h.h3->tp_status |= status;
2133 h.h3->tp_len = skb->len;
2134 h.h3->tp_snaplen = snaplen;
2135 h.h3->tp_mac = macoff;
2136 h.h3->tp_net = netoff;
2137 h.h3->tp_sec = ts.tv_sec;
2138 h.h3->tp_nsec = ts.tv_nsec;
2139 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
2140 hdrlen = sizeof(*h.h3);
2146 sll = h.raw + TPACKET_ALIGN(hdrlen);
2147 sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
2148 sll->sll_family = AF_PACKET;
2149 sll->sll_hatype = dev->type;
2150 sll->sll_protocol = skb->protocol;
2151 sll->sll_pkttype = skb->pkt_type;
2152 if (unlikely(po->origdev))
2153 sll->sll_ifindex = orig_dev->ifindex;
2155 sll->sll_ifindex = dev->ifindex;
2159 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
2160 if (po->tp_version <= TPACKET_V2) {
2163 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
2166 for (start = h.raw; start < end; start += PAGE_SIZE)
2167 flush_dcache_page(pgv_to_page(start));
2172 if (po->tp_version <= TPACKET_V2) {
2173 __packet_set_status(po, h.raw, status);
2174 sk->sk_data_ready(sk);
2176 prb_clear_blk_fill_status(&po->rx_ring);
2180 if (skb_head != skb->data && skb_shared(skb)) {
2181 skb->data = skb_head;
2189 po->stats.stats1.tp_drops++;
2190 spin_unlock(&sk->sk_receive_queue.lock);
2192 sk->sk_data_ready(sk);
2193 kfree_skb(copy_skb);
2194 goto drop_n_restore;
2197 static void tpacket_destruct_skb(struct sk_buff *skb)
2199 struct packet_sock *po = pkt_sk(skb->sk);
2201 if (likely(po->tx_ring.pg_vec)) {
2205 ph = skb_shinfo(skb)->destructor_arg;
2206 packet_dec_pending(&po->tx_ring);
2208 ts = __packet_set_timestamp(po, ph, skb);
2209 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
2215 static bool ll_header_truncated(const struct net_device *dev, int len)
2217 /* net device doesn't like empty head */
2218 if (unlikely(len <= dev->hard_header_len)) {
2219 net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
2220 current->comm, len, dev->hard_header_len);
2227 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
2228 void *frame, struct net_device *dev, int size_max,
2229 __be16 proto, unsigned char *addr, int hlen)
2231 union tpacket_uhdr ph;
2232 int to_write, offset, len, tp_len, nr_frags, len_max;
2233 struct socket *sock = po->sk.sk_socket;
2240 skb->protocol = proto;
2242 skb->priority = po->sk.sk_priority;
2243 skb->mark = po->sk.sk_mark;
2244 sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
2245 skb_shinfo(skb)->destructor_arg = ph.raw;
2247 switch (po->tp_version) {
2249 tp_len = ph.h2->tp_len;
2252 tp_len = ph.h1->tp_len;
2255 if (unlikely(tp_len > size_max)) {
2256 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
2260 skb_reserve(skb, hlen);
2261 skb_reset_network_header(skb);
2263 if (!packet_use_direct_xmit(po))
2264 skb_probe_transport_header(skb, 0);
2265 if (unlikely(po->tp_tx_has_off)) {
2266 int off_min, off_max, off;
2267 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2268 off_max = po->tx_ring.frame_size - tp_len;
2269 if (sock->type == SOCK_DGRAM) {
2270 switch (po->tp_version) {
2272 off = ph.h2->tp_net;
2275 off = ph.h1->tp_net;
2279 switch (po->tp_version) {
2281 off = ph.h2->tp_mac;
2284 off = ph.h1->tp_mac;
2288 if (unlikely((off < off_min) || (off_max < off)))
2290 data = ph.raw + off;
2292 data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
2296 if (sock->type == SOCK_DGRAM) {
2297 err = dev_hard_header(skb, dev, ntohs(proto), addr,
2299 if (unlikely(err < 0))
2301 } else if (dev->hard_header_len) {
2302 if (ll_header_truncated(dev, tp_len))
2305 skb_push(skb, dev->hard_header_len);
2306 err = skb_store_bits(skb, 0, data,
2307 dev->hard_header_len);
2311 data += dev->hard_header_len;
2312 to_write -= dev->hard_header_len;
2315 offset = offset_in_page(data);
2316 len_max = PAGE_SIZE - offset;
2317 len = ((to_write > len_max) ? len_max : to_write);
2319 skb->data_len = to_write;
2320 skb->len += to_write;
2321 skb->truesize += to_write;
2322 atomic_add(to_write, &po->sk.sk_wmem_alloc);
2324 while (likely(to_write)) {
2325 nr_frags = skb_shinfo(skb)->nr_frags;
2327 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
2328 pr_err("Packet exceed the number of skb frags(%lu)\n",
2333 page = pgv_to_page(data);
2335 flush_dcache_page(page);
2337 skb_fill_page_desc(skb, nr_frags, page, offset, len);
2340 len_max = PAGE_SIZE;
2341 len = ((to_write > len_max) ? len_max : to_write);
2347 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
2349 struct sk_buff *skb;
2350 struct net_device *dev;
2352 int err, reserve = 0;
2354 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2355 bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
2356 int tp_len, size_max;
2357 unsigned char *addr;
2359 int status = TP_STATUS_AVAILABLE;
2362 mutex_lock(&po->pg_vec_lock);
2364 if (likely(saddr == NULL)) {
2365 dev = packet_cached_dev_get(po);
2370 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2372 if (msg->msg_namelen < (saddr->sll_halen
2373 + offsetof(struct sockaddr_ll,
2376 proto = saddr->sll_protocol;
2377 addr = saddr->sll_addr;
2378 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
2382 if (unlikely(dev == NULL))
2385 if (unlikely(!(dev->flags & IFF_UP)))
2388 reserve = dev->hard_header_len + VLAN_HLEN;
2389 size_max = po->tx_ring.frame_size
2390 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
2392 if (size_max > dev->mtu + reserve)
2393 size_max = dev->mtu + reserve;
2396 ph = packet_current_frame(po, &po->tx_ring,
2397 TP_STATUS_SEND_REQUEST);
2398 if (unlikely(ph == NULL)) {
2399 if (need_wait && need_resched())
2404 status = TP_STATUS_SEND_REQUEST;
2405 hlen = LL_RESERVED_SPACE(dev);
2406 tlen = dev->needed_tailroom;
2407 skb = sock_alloc_send_skb(&po->sk,
2408 hlen + tlen + sizeof(struct sockaddr_ll),
2411 if (unlikely(skb == NULL)) {
2412 /* we assume the socket was initially writeable ... */
2413 if (likely(len_sum > 0))
2417 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
2419 if (tp_len > dev->mtu + dev->hard_header_len) {
2420 struct ethhdr *ehdr;
2421 /* Earlier code assumed this would be a VLAN pkt,
2422 * double-check this now that we have the actual
2426 skb_reset_mac_header(skb);
2427 ehdr = eth_hdr(skb);
2428 if (ehdr->h_proto != htons(ETH_P_8021Q))
2431 if (unlikely(tp_len < 0)) {
2433 __packet_set_status(po, ph,
2434 TP_STATUS_AVAILABLE);
2435 packet_increment_head(&po->tx_ring);
2439 status = TP_STATUS_WRONG_FORMAT;
2445 packet_pick_tx_queue(dev, skb);
2447 skb->destructor = tpacket_destruct_skb;
2448 __packet_set_status(po, ph, TP_STATUS_SENDING);
2449 packet_inc_pending(&po->tx_ring);
2451 status = TP_STATUS_SEND_REQUEST;
2452 err = po->xmit(skb);
2453 if (unlikely(err > 0)) {
2454 err = net_xmit_errno(err);
2455 if (err && __packet_get_status(po, ph) ==
2456 TP_STATUS_AVAILABLE) {
2457 /* skb was destructed already */
2462 * skb was dropped but not destructed yet;
2463 * let's treat it like congestion or err < 0
2467 packet_increment_head(&po->tx_ring);
2469 } while (likely((ph != NULL) ||
2470 /* Note: packet_read_pending() might be slow if we have
2471 * to call it, as it's a per-cpu variable, but in the fast path
2472 * we already short-circuit the loop with the first
2473 * condition, and luckily don't have to go that path
2476 (need_wait && packet_read_pending(&po->tx_ring))));
2482 __packet_set_status(po, ph, status);
2487 mutex_unlock(&po->pg_vec_lock);
2491 static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
2492 size_t reserve, size_t len,
2493 size_t linear, int noblock,
2496 struct sk_buff *skb;
2498 /* Under a page? Don't bother with paged skb. */
2499 if (prepad + len < PAGE_SIZE || !linear)
2502 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
2507 skb_reserve(skb, reserve);
2508 skb_put(skb, linear);
2509 skb->data_len = len - linear;
2510 skb->len += len - linear;
2515 static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2517 struct sock *sk = sock->sk;
2518 DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
2519 struct sk_buff *skb;
2520 struct net_device *dev;
2522 unsigned char *addr;
2523 int err, reserve = 0;
2524 struct virtio_net_hdr vnet_hdr = { 0 };
2527 struct packet_sock *po = pkt_sk(sk);
2528 unsigned short gso_type = 0;
2534 * Get and verify the address.
2537 if (likely(saddr == NULL)) {
2538 dev = packet_cached_dev_get(po);
2543 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
2545 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
2547 proto = saddr->sll_protocol;
2548 addr = saddr->sll_addr;
2549 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
2553 if (unlikely(dev == NULL))
2556 if (unlikely(!(dev->flags & IFF_UP)))
2559 if (sock->type == SOCK_RAW)
2560 reserve = dev->hard_header_len;
2561 if (po->has_vnet_hdr) {
2562 vnet_hdr_len = sizeof(vnet_hdr);
2565 if (len < vnet_hdr_len)
2568 len -= vnet_hdr_len;
2571 n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
2572 if (n != vnet_hdr_len)
2575 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
2576 (__virtio16_to_cpu(false, vnet_hdr.csum_start) +
2577 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 >
2578 __virtio16_to_cpu(false, vnet_hdr.hdr_len)))
2579 vnet_hdr.hdr_len = __cpu_to_virtio16(false,
2580 __virtio16_to_cpu(false, vnet_hdr.csum_start) +
2581 __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2);
2584 if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len)
2587 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
2588 switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2589 case VIRTIO_NET_HDR_GSO_TCPV4:
2590 gso_type = SKB_GSO_TCPV4;
2592 case VIRTIO_NET_HDR_GSO_TCPV6:
2593 gso_type = SKB_GSO_TCPV6;
2595 case VIRTIO_NET_HDR_GSO_UDP:
2596 gso_type = SKB_GSO_UDP;
2602 if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
2603 gso_type |= SKB_GSO_TCP_ECN;
2605 if (vnet_hdr.gso_size == 0)
2611 if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
2612 if (!netif_supports_nofcs(dev)) {
2613 err = -EPROTONOSUPPORT;
2616 extra_len = 4; /* We're doing our own CRC */
2620 if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
2624 hlen = LL_RESERVED_SPACE(dev);
2625 tlen = dev->needed_tailroom;
2626 skb = packet_alloc_skb(sk, hlen + tlen, hlen, len,
2627 __virtio16_to_cpu(false, vnet_hdr.hdr_len),
2628 msg->msg_flags & MSG_DONTWAIT, &err);
2632 skb_set_network_header(skb, reserve);
2635 if (sock->type == SOCK_DGRAM) {
2636 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
2637 if (unlikely(offset < 0))
2640 if (ll_header_truncated(dev, len))
2644 /* Returns -EFAULT on error */
2645 err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
2649 sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
2651 if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
2652 /* Earlier code assumed this would be a VLAN pkt,
2653 * double-check this now that we have the actual
2656 struct ethhdr *ehdr;
2657 skb_reset_mac_header(skb);
2658 ehdr = eth_hdr(skb);
2659 if (ehdr->h_proto != htons(ETH_P_8021Q)) {
2665 skb->protocol = proto;
2667 skb->priority = sk->sk_priority;
2668 skb->mark = sk->sk_mark;
2670 packet_pick_tx_queue(dev, skb);
2672 if (po->has_vnet_hdr) {
2673 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2674 u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start);
2675 u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset);
2676 if (!skb_partial_csum_set(skb, s, o)) {
2682 skb_shinfo(skb)->gso_size =
2683 __virtio16_to_cpu(false, vnet_hdr.gso_size);
2684 skb_shinfo(skb)->gso_type = gso_type;
2686 /* Header must be checked, and gso_segs computed. */
2687 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2688 skb_shinfo(skb)->gso_segs = 0;
2690 len += vnet_hdr_len;
2693 if (!packet_use_direct_xmit(po))
2694 skb_probe_transport_header(skb, reserve);
2695 if (unlikely(extra_len == 4))
2698 err = po->xmit(skb);
2699 if (err > 0 && (err = net_xmit_errno(err)) != 0)
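/*
 * Illustrative user-space view of the non-ring path above (not part of this
 * file): a single sendto() whose msg_name carries the sockaddr_ll fields
 * packet_snd() validates. The interface name "eth0", the broadcast
 * destination and ETH_P_IP are placeholders; error handling is omitted.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static ssize_t send_one_frame(int fd, const void *frame, size_t flen)
{
	const unsigned char dst[ETH_ALEN] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	};
	struct sockaddr_ll sll = {
		.sll_family   = AF_PACKET,
		.sll_protocol = htons(ETH_P_IP),
		.sll_ifindex  = if_nametoindex("eth0"),
		.sll_halen    = ETH_ALEN,
	};

	memcpy(sll.sll_addr, dst, ETH_ALEN);

	/* On SOCK_DGRAM the kernel builds the link-layer header from
	 * sll_addr via dev_hard_header(); on SOCK_RAW the frame must
	 * already start with it.
	 */
	return sendto(fd, frame, flen, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}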
2715 static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2717 struct sock *sk = sock->sk;
2718 struct packet_sock *po = pkt_sk(sk);
2720 if (po->tx_ring.pg_vec)
2721 return tpacket_snd(po, msg);
2723 return packet_snd(sock, msg, len);
2727 * Close a PACKET socket. This is fairly simple. We immediately go
2728 * to 'closed' state and remove our protocol entry in the device list.
2731 static int packet_release(struct socket *sock)
2733 struct sock *sk = sock->sk;
2734 struct packet_sock *po;
2736 union tpacket_req_u req_u;
2744 mutex_lock(&net->packet.sklist_lock);
2745 sk_del_node_init_rcu(sk);
2746 mutex_unlock(&net->packet.sklist_lock);
2749 sock_prot_inuse_add(net, sk->sk_prot, -1);
2752 spin_lock(&po->bind_lock);
2753 unregister_prot_hook(sk, false);
2754 packet_cached_dev_reset(po);
2756 if (po->prot_hook.dev) {
2757 dev_put(po->prot_hook.dev);
2758 po->prot_hook.dev = NULL;
2760 spin_unlock(&po->bind_lock);
2762 packet_flush_mclist(sk);
2764 if (po->rx_ring.pg_vec) {
2765 memset(&req_u, 0, sizeof(req_u));
2766 packet_set_ring(sk, &req_u, 1, 0);
2769 if (po->tx_ring.pg_vec) {
2770 memset(&req_u, 0, sizeof(req_u));
2771 packet_set_ring(sk, &req_u, 1, 1);
2778 * Now the socket is dead. No more input will appear.
2785 skb_queue_purge(&sk->sk_receive_queue);
2786 packet_free_pending(po);
2787 sk_refcnt_debug_release(sk);
2794 * Attach a packet hook.
2797 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
2799 struct packet_sock *po = pkt_sk(sk);
2800 const struct net_device *dev_curr;
2812 spin_lock(&po->bind_lock);
2814 proto_curr = po->prot_hook.type;
2815 dev_curr = po->prot_hook.dev;
2817 need_rehook = proto_curr != proto || dev_curr != dev;
2820 unregister_prot_hook(sk, true);
2823 po->prot_hook.type = proto;
2825 if (po->prot_hook.dev)
2826 dev_put(po->prot_hook.dev);
2828 po->prot_hook.dev = dev;
2830 po->ifindex = dev ? dev->ifindex : 0;
2831 packet_cached_dev_assign(po, dev);
2834 if (proto == 0 || !need_rehook)
2837 if (!dev || (dev->flags & IFF_UP)) {
2838 register_prot_hook(sk);
2840 sk->sk_err = ENETDOWN;
2841 if (!sock_flag(sk, SOCK_DEAD))
2842 sk->sk_error_report(sk);
2846 spin_unlock(&po->bind_lock);
2852 * Bind a packet socket to a device
2855 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
2858 struct sock *sk = sock->sk;
2860 struct net_device *dev;
2867 if (addr_len != sizeof(struct sockaddr))
2869 strlcpy(name, uaddr->sa_data, sizeof(name));
2871 dev = dev_get_by_name(sock_net(sk), name);
2873 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
2877 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
2879 struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
2880 struct sock *sk = sock->sk;
2881 struct net_device *dev = NULL;
2889 if (addr_len < sizeof(struct sockaddr_ll))
2891 if (sll->sll_family != AF_PACKET)
2894 if (sll->sll_ifindex) {
2896 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
2900 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
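/*
 * Illustrative user-space counterpart of packet_bind() above (not part of
 * this file): binding the socket to one interface and one protocol. The
 * interface name is caller-supplied; a zero sll_protocol would keep the
 * socket's current protocol, a zero sll_ifindex would mean "all devices".
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int bind_packet_socket(int fd, const char *ifname)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex  = if_nametoindex(ifname);

	return bind(fd, (struct sockaddr *)&sll, sizeof(sll));
}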
2906 static struct proto packet_proto = {
2908 .owner = THIS_MODULE,
2909 .obj_size = sizeof(struct packet_sock),
2913 * Create a packet socket (SOCK_RAW, SOCK_DGRAM, or the obsolete SOCK_PACKET).
2916 static int packet_create(struct net *net, struct socket *sock, int protocol,
2920 struct packet_sock *po;
2921 __be16 proto = (__force __be16)protocol; /* weird, but documented */
2924 if (!ns_capable(net->user_ns, CAP_NET_RAW))
2926 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
2927 sock->type != SOCK_PACKET)
2928 return -ESOCKTNOSUPPORT;
2930 sock->state = SS_UNCONNECTED;
2933 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
2937 sock->ops = &packet_ops;
2938 if (sock->type == SOCK_PACKET)
2939 sock->ops = &packet_ops_spkt;
2941 sock_init_data(sock, sk);
2944 sk->sk_family = PF_PACKET;
2946 po->xmit = dev_queue_xmit;
2948 err = packet_alloc_pending(po);
2952 packet_cached_dev_reset(po);
2954 sk->sk_destruct = packet_sock_destruct;
2955 sk_refcnt_debug_inc(sk);
2958 * Attach a protocol block
2961 spin_lock_init(&po->bind_lock);
2962 mutex_init(&po->pg_vec_lock);
2963 po->rollover = NULL;
2964 po->prot_hook.func = packet_rcv;
2966 if (sock->type == SOCK_PACKET)
2967 po->prot_hook.func = packet_rcv_spkt;
2969 po->prot_hook.af_packet_priv = sk;
2972 po->prot_hook.type = proto;
2973 register_prot_hook(sk);
2976 mutex_lock(&net->packet.sklist_lock);
2977 sk_add_node_rcu(sk, &net->packet.sklist);
2978 mutex_unlock(&net->packet.sklist_lock);
2981 sock_prot_inuse_add(net, &packet_proto, 1);
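/*
 * Illustrative user-space counterpart of packet_create() above (not part of
 * this file): opening the socket. CAP_NET_RAW is required and the protocol
 * argument is passed in network byte order, as the comment on 'proto' notes.
 */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <sys/socket.h>

static int open_packet_socket(void)
{
	/* SOCK_RAW sees full link-layer frames; SOCK_DGRAM has the kernel
	 * add and remove the link-layer header.
	 */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}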
2992 * Pull a packet from our receive queue and hand it to the user.
2993 * If necessary we block.
2996 static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2999 struct sock *sk = sock->sk;
3000 struct sk_buff *skb;
3002 int vnet_hdr_len = 0;
3003 unsigned int origlen = 0;
3006 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
3010 /* What error should we return now? EUNATTACH? */
3011 if (pkt_sk(sk)->ifindex < 0)
3015 if (flags & MSG_ERRQUEUE) {
3016 err = sock_recv_errqueue(sk, msg, len,
3017 SOL_PACKET, PACKET_TX_TIMESTAMP);
3022 * Call the generic datagram receiver. This handles all sorts
3023 * of horrible races and re-entrancy so we can forget about it
3024 * in the protocol layers.
3026 * Now it will return ENETDOWN, if the device has just gone down,
3027 * but then it will block.
3030 skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
3033 * An error occurred, so return it. Because skb_recv_datagram()
3034 * handles the blocking, we don't see or worry about blocking
3041 if (pkt_sk(sk)->pressure)
3042 packet_rcv_has_room(pkt_sk(sk), NULL);
3044 if (pkt_sk(sk)->has_vnet_hdr) {
3045 struct virtio_net_hdr vnet_hdr = { 0 };
3048 vnet_hdr_len = sizeof(vnet_hdr);
3049 if (len < vnet_hdr_len)
3052 len -= vnet_hdr_len;
3054 if (skb_is_gso(skb)) {
3055 struct skb_shared_info *sinfo = skb_shinfo(skb);
3057 /* This is a hint as to how much should be linear. */
3059 __cpu_to_virtio16(false, skb_headlen(skb));
3061 __cpu_to_virtio16(false, sinfo->gso_size);
3062 if (sinfo->gso_type & SKB_GSO_TCPV4)
3063 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
3064 else if (sinfo->gso_type & SKB_GSO_TCPV6)
3065 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
3066 else if (sinfo->gso_type & SKB_GSO_UDP)
3067 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
3068 else if (sinfo->gso_type & SKB_GSO_FCOE)
3072 if (sinfo->gso_type & SKB_GSO_TCP_ECN)
3073 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
3075 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
3077 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3078 vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
3079 vnet_hdr.csum_start = __cpu_to_virtio16(false,
3080 skb_checksum_start_offset(skb));
3081 vnet_hdr.csum_offset = __cpu_to_virtio16(false,
3083 } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
3084 vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
3085 } /* else everything is zero */
3087 err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
3092 /* You lose any data beyond the buffer you gave. If it worries
3093 * a user program, they can ask the device for its MTU
3099 msg->msg_flags |= MSG_TRUNC;
3102 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3106 if (sock->type != SOCK_PACKET) {
3107 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3109 /* Original length was stored in sockaddr_ll fields */
3110 origlen = PACKET_SKB_CB(skb)->sa.origlen;
3111 sll->sll_family = AF_PACKET;
3112 sll->sll_protocol = skb->protocol;
3115 sock_recv_ts_and_drops(msg, sk, skb);
3117 if (msg->msg_name) {
3118 /* If the address length field is there to be filled
3119 * in, we fill it in now.
3121 if (sock->type == SOCK_PACKET) {
3122 __sockaddr_check_size(sizeof(struct sockaddr_pkt));
3123 msg->msg_namelen = sizeof(struct sockaddr_pkt);
3125 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
3127 msg->msg_namelen = sll->sll_halen +
3128 offsetof(struct sockaddr_ll, sll_addr);
3130 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
3134 if (pkt_sk(sk)->auxdata) {
3135 struct tpacket_auxdata aux;
3137 aux.tp_status = TP_STATUS_USER;
3138 if (skb->ip_summed == CHECKSUM_PARTIAL)
3139 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
3140 else if (skb->pkt_type != PACKET_OUTGOING &&
3141 (skb->ip_summed == CHECKSUM_COMPLETE ||
3142 skb_csum_unnecessary(skb)))
3143 aux.tp_status |= TP_STATUS_CSUM_VALID;
3145 aux.tp_len = origlen;
3146 aux.tp_snaplen = skb->len;
3148 aux.tp_net = skb_network_offset(skb);
3149 if (skb_vlan_tag_present(skb)) {
3150 aux.tp_vlan_tci = skb_vlan_tag_get(skb);
3151 aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
3152 aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
3154 aux.tp_vlan_tci = 0;
3155 aux.tp_vlan_tpid = 0;
3157 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
3161 * Free or return the buffer as appropriate. Again this
3162 * hides all the races and re-entrancy issues from us.
3164 err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
3167 skb_free_datagram(sk, skb);
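/*
 * Illustrative user-space counterpart of the PACKET_AUXDATA handling above
 * (not part of this file): receive one packet and walk the control message
 * that packet_recvmsg() appends. Buffer sizes are illustrative and error
 * handling is omitted.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t recv_with_auxdata(int fd, void *buf, size_t buflen)
{
	int one = 1;
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cmsg;
	ssize_t len;

	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
	len = recvmsg(fd, &msg, 0);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_PACKET &&
		    cmsg->cmsg_type == PACKET_AUXDATA) {
			struct tpacket_auxdata aux;

			memcpy(&aux, CMSG_DATA(cmsg), sizeof(aux));
			/* aux.tp_status carries TP_STATUS_CSUM_VALID etc.,
			 * aux.tp_vlan_tci the VLAN tag when present.
			 */
		}
	}
	return len;
}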
3172 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
3173 int *uaddr_len, int peer)
3175 struct net_device *dev;
3176 struct sock *sk = sock->sk;
3181 uaddr->sa_family = AF_PACKET;
3182 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
3184 dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
3186 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
3188 *uaddr_len = sizeof(*uaddr);
3193 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
3194 int *uaddr_len, int peer)
3196 struct net_device *dev;
3197 struct sock *sk = sock->sk;
3198 struct packet_sock *po = pkt_sk(sk);
3199 DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
3204 sll->sll_family = AF_PACKET;
3205 sll->sll_ifindex = po->ifindex;
3206 sll->sll_protocol = po->num;
3207 sll->sll_pkttype = 0;
3209 dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
3211 sll->sll_hatype = dev->type;
3212 sll->sll_halen = dev->addr_len;
3213 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
3215 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
3219 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
3224 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
3228 case PACKET_MR_MULTICAST:
3229 if (i->alen != dev->addr_len)
3232 return dev_mc_add(dev, i->addr);
3234 return dev_mc_del(dev, i->addr);
3236 case PACKET_MR_PROMISC:
3237 return dev_set_promiscuity(dev, what);
3238 case PACKET_MR_ALLMULTI:
3239 return dev_set_allmulti(dev, what);
3240 case PACKET_MR_UNICAST:
3241 if (i->alen != dev->addr_len)
3244 return dev_uc_add(dev, i->addr);
3246 return dev_uc_del(dev, i->addr);
3254 static void packet_dev_mclist_delete(struct net_device *dev,
3255 struct packet_mclist **mlp)
3257 struct packet_mclist *ml;
3259 while ((ml = *mlp) != NULL) {
3260 if (ml->ifindex == dev->ifindex) {
3261 packet_dev_mc(dev, ml, -1);
3269 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
3271 struct packet_sock *po = pkt_sk(sk);
3272 struct packet_mclist *ml, *i;
3273 struct net_device *dev;
3279 dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
3284 if (mreq->mr_alen > dev->addr_len)
3288 i = kmalloc(sizeof(*i), GFP_KERNEL);
3293 for (ml = po->mclist; ml; ml = ml->next) {
3294 if (ml->ifindex == mreq->mr_ifindex &&
3295 ml->type == mreq->mr_type &&
3296 ml->alen == mreq->mr_alen &&
3297 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3299 /* Free the new element ... */
3305 i->type = mreq->mr_type;
3306 i->ifindex = mreq->mr_ifindex;
3307 i->alen = mreq->mr_alen;
3308 memcpy(i->addr, mreq->mr_address, i->alen);
3310 i->next = po->mclist;
3312 err = packet_dev_mc(dev, i, 1);
3314 po->mclist = i->next;
3323 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
3325 struct packet_mclist *ml, **mlp;
3329 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
3330 if (ml->ifindex == mreq->mr_ifindex &&
3331 ml->type == mreq->mr_type &&
3332 ml->alen == mreq->mr_alen &&
3333 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
3334 if (--ml->count == 0) {
3335 struct net_device *dev;
3337 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3339 packet_dev_mc(dev, ml, -1);
3349 static void packet_flush_mclist(struct sock *sk)
3351 struct packet_sock *po = pkt_sk(sk);
3352 struct packet_mclist *ml;
3358 while ((ml = po->mclist) != NULL) {
3359 struct net_device *dev;
3361 po->mclist = ml->next;
3362 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
3364 packet_dev_mc(dev, ml, -1);
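/*
 * Illustrative user-space counterpart of packet_mc_add() above (not part of
 * this file): PACKET_ADD_MEMBERSHIP requesting promiscuous mode on one
 * interface. PACKET_MR_MULTICAST/PACKET_MR_UNICAST would instead carry an
 * address in mr_address/mr_alen; error handling is omitted.
 */
#include <linux/if_packet.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>

static int enable_promisc(int fd, const char *ifname)
{
	struct packet_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	mreq.mr_ifindex = if_nametoindex(ifname);
	mreq.mr_type    = PACKET_MR_PROMISC;

	return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
			  &mreq, sizeof(mreq));
}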
3371 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
3373 struct sock *sk = sock->sk;
3374 struct packet_sock *po = pkt_sk(sk);
3377 if (level != SOL_PACKET)
3378 return -ENOPROTOOPT;
3381 case PACKET_ADD_MEMBERSHIP:
3382 case PACKET_DROP_MEMBERSHIP:
3384 struct packet_mreq_max mreq;
3386 memset(&mreq, 0, sizeof(mreq));
3387 if (len < sizeof(struct packet_mreq))
3389 if (len > sizeof(mreq))
3391 if (copy_from_user(&mreq, optval, len))
3393 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
3395 if (optname == PACKET_ADD_MEMBERSHIP)
3396 ret = packet_mc_add(sk, &mreq);
3398 ret = packet_mc_drop(sk, &mreq);
3402 case PACKET_RX_RING:
3403 case PACKET_TX_RING:
3405 union tpacket_req_u req_u;
3408 switch (po->tp_version) {
3411 len = sizeof(req_u.req);
3415 len = sizeof(req_u.req3);
3420 if (pkt_sk(sk)->has_vnet_hdr)
3422 if (copy_from_user(&req_u.req, optval, len))
3424 return packet_set_ring(sk, &req_u, 0,
3425 optname == PACKET_TX_RING);
3427 case PACKET_COPY_THRESH:
3431 if (optlen != sizeof(val))
3433 if (copy_from_user(&val, optval, sizeof(val)))
3436 pkt_sk(sk)->copy_thresh = val;
3439 case PACKET_VERSION:
3443 if (optlen != sizeof(val))
3445 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3447 if (copy_from_user(&val, optval, sizeof(val)))
3453 po->tp_version = val;
3459 case PACKET_RESERVE:
3463 if (optlen != sizeof(val))
3465 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3467 if (copy_from_user(&val, optval, sizeof(val)))
3469 po->tp_reserve = val;
3476 if (optlen != sizeof(val))
3478 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3480 if (copy_from_user(&val, optval, sizeof(val)))
3482 po->tp_loss = !!val;
3485 case PACKET_AUXDATA:
3489 if (optlen < sizeof(val))
3491 if (copy_from_user(&val, optval, sizeof(val)))
3494 po->auxdata = !!val;
3497 case PACKET_ORIGDEV:
3501 if (optlen < sizeof(val))
3503 if (copy_from_user(&val, optval, sizeof(val)))
3506 po->origdev = !!val;
3509 case PACKET_VNET_HDR:
3513 if (sock->type != SOCK_RAW)
3515 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3517 if (optlen < sizeof(val))
3519 if (copy_from_user(&val, optval, sizeof(val)))
3522 po->has_vnet_hdr = !!val;
3525 case PACKET_TIMESTAMP:
3529 if (optlen != sizeof(val))
3531 if (copy_from_user(&val, optval, sizeof(val)))
3534 po->tp_tstamp = val;
3541 if (optlen != sizeof(val))
3543 if (copy_from_user(&val, optval, sizeof(val)))
3546 return fanout_add(sk, val & 0xffff, val >> 16);
3548 case PACKET_TX_HAS_OFF:
3552 if (optlen != sizeof(val))
3554 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
3556 if (copy_from_user(&val, optval, sizeof(val)))
3558 po->tp_tx_has_off = !!val;
3561 case PACKET_QDISC_BYPASS:
3565 if (optlen != sizeof(val))
3567 if (copy_from_user(&val, optval, sizeof(val)))
3570 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
3574 return -ENOPROTOOPT;
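/*
 * Illustrative user-space counterpart of the PACKET_VERSION/PACKET_RX_RING
 * branches above (not part of this file). The version has to be set before
 * the ring, since PACKET_VERSION is rejected once pg_vec exists. The sizes
 * and block-retire timeout below are illustrative only.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int setup_v3_rx_ring(int fd)
{
	int ver = TPACKET_V3;
	struct tpacket_req3 req;

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.tp_block_size     = 1 << 22;	/* multiple of the page size */
	req.tp_frame_size     = 1 << 11;	/* multiple of TPACKET_ALIGNMENT */
	req.tp_block_nr       = 64;
	req.tp_frame_nr       = (req.tp_block_size / req.tp_frame_size) *
				req.tp_block_nr;
	req.tp_retire_blk_tov = 60;		/* ms until a block is retired */

	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}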
3578 static int packet_getsockopt(struct socket *sock, int level, int optname,
3579 char __user *optval, int __user *optlen)
3582 int val, lv = sizeof(val);
3583 struct sock *sk = sock->sk;
3584 struct packet_sock *po = pkt_sk(sk);
3586 union tpacket_stats_u st;
3588 if (level != SOL_PACKET)
3589 return -ENOPROTOOPT;
3591 if (get_user(len, optlen))
3598 case PACKET_STATISTICS:
3599 spin_lock_bh(&sk->sk_receive_queue.lock);
3600 memcpy(&st, &po->stats, sizeof(st));
3601 memset(&po->stats, 0, sizeof(po->stats));
3602 spin_unlock_bh(&sk->sk_receive_queue.lock);
3604 if (po->tp_version == TPACKET_V3) {
3605 lv = sizeof(struct tpacket_stats_v3);
3606 st.stats3.tp_packets += st.stats3.tp_drops;
3609 lv = sizeof(struct tpacket_stats);
3610 st.stats1.tp_packets += st.stats1.tp_drops;
3615 case PACKET_AUXDATA:
3618 case PACKET_ORIGDEV:
3621 case PACKET_VNET_HDR:
3622 val = po->has_vnet_hdr;
3624 case PACKET_VERSION:
3625 val = po->tp_version;
3628 if (len > sizeof(int))
3630 if (copy_from_user(&val, optval, len))
3634 val = sizeof(struct tpacket_hdr);
3637 val = sizeof(struct tpacket2_hdr);
3640 val = sizeof(struct tpacket3_hdr);
3646 case PACKET_RESERVE:
3647 val = po->tp_reserve;
3652 case PACKET_TIMESTAMP:
3653 val = po->tp_tstamp;
3657 ((u32)po->fanout->id |
3658 ((u32)po->fanout->type << 16) |
3659 ((u32)po->fanout->flags << 24)) :
3662 case PACKET_TX_HAS_OFF:
3663 val = po->tp_tx_has_off;
3665 case PACKET_QDISC_BYPASS:
3666 val = packet_use_direct_xmit(po);
3669 return -ENOPROTOOPT;
3674 if (put_user(len, optlen))
3676 if (copy_to_user(optval, data, len))
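/*
 * Illustrative user-space counterpart of the PACKET_STATISTICS branch above
 * (not part of this file). Reading the option also clears the counters, as
 * the memset() above shows. Shown for the TPACKET_V1/V2 layout; TPACKET_V3
 * sockets return a struct tpacket_stats_v3 instead.
 */
#include <linux/if_packet.h>
#include <sys/socket.h>

static int read_drop_stats(int fd, unsigned int *packets, unsigned int *drops)
{
	struct tpacket_stats st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) < 0)
		return -1;

	*packets = st.tp_packets;	/* already includes tp_drops, see above */
	*drops   = st.tp_drops;
	return 0;
}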
3682 static int packet_notifier(struct notifier_block *this,
3683 unsigned long msg, void *ptr)
3686 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3687 struct net *net = dev_net(dev);
3690 sk_for_each_rcu(sk, &net->packet.sklist) {
3691 struct packet_sock *po = pkt_sk(sk);
3694 case NETDEV_UNREGISTER:
3696 packet_dev_mclist_delete(dev, &po->mclist);
3700 if (dev->ifindex == po->ifindex) {
3701 spin_lock(&po->bind_lock);
3703 __unregister_prot_hook(sk, false);
3704 sk->sk_err = ENETDOWN;
3705 if (!sock_flag(sk, SOCK_DEAD))
3706 sk->sk_error_report(sk);
3708 if (msg == NETDEV_UNREGISTER) {
3709 packet_cached_dev_reset(po);
3711 if (po->prot_hook.dev)
3712 dev_put(po->prot_hook.dev);
3713 po->prot_hook.dev = NULL;
3715 spin_unlock(&po->bind_lock);
3719 if (dev->ifindex == po->ifindex) {
3720 spin_lock(&po->bind_lock);
3722 register_prot_hook(sk);
3723 spin_unlock(&po->bind_lock);
3733 static int packet_ioctl(struct socket *sock, unsigned int cmd,
3736 struct sock *sk = sock->sk;
3741 int amount = sk_wmem_alloc_get(sk);
3743 return put_user(amount, (int __user *)arg);
3747 struct sk_buff *skb;
3750 spin_lock_bh(&sk->sk_receive_queue.lock);
3751 skb = skb_peek(&sk->sk_receive_queue);
3754 spin_unlock_bh(&sk->sk_receive_queue.lock);
3755 return put_user(amount, (int __user *)arg);
3758 return sock_get_timestamp(sk, (struct timeval __user *)arg);
3760 return sock_get_timestampns(sk, (struct timespec __user *)arg);
3770 case SIOCGIFBRDADDR:
3771 case SIOCSIFBRDADDR:
3772 case SIOCGIFNETMASK:
3773 case SIOCSIFNETMASK:
3774 case SIOCGIFDSTADDR:
3775 case SIOCSIFDSTADDR:
3777 return inet_dgram_ops.ioctl(sock, cmd, arg);
3781 return -ENOIOCTLCMD;
3786 static unsigned int packet_poll(struct file *file, struct socket *sock,
3789 struct sock *sk = sock->sk;
3790 struct packet_sock *po = pkt_sk(sk);
3791 unsigned int mask = datagram_poll(file, sock, wait);
3793 spin_lock_bh(&sk->sk_receive_queue.lock);
3794 if (po->rx_ring.pg_vec) {
3795 if (!packet_previous_rx_frame(po, &po->rx_ring,
3797 mask |= POLLIN | POLLRDNORM;
3799 if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
3800 xchg(&po->pressure, 0);
3801 spin_unlock_bh(&sk->sk_receive_queue.lock);
3802 spin_lock_bh(&sk->sk_write_queue.lock);
3803 if (po->tx_ring.pg_vec) {
3804 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
3805 mask |= POLLOUT | POLLWRNORM;
3807 spin_unlock_bh(&sk->sk_write_queue.lock);
3812 /* Dirty? Well, I still have not learned a better way to account
3816 static void packet_mm_open(struct vm_area_struct *vma)
3818 struct file *file = vma->vm_file;
3819 struct socket *sock = file->private_data;
3820 struct sock *sk = sock->sk;
3823 atomic_inc(&pkt_sk(sk)->mapped);
3826 static void packet_mm_close(struct vm_area_struct *vma)
3828 struct file *file = vma->vm_file;
3829 struct socket *sock = file->private_data;
3830 struct sock *sk = sock->sk;
3833 atomic_dec(&pkt_sk(sk)->mapped);
3836 static const struct vm_operations_struct packet_mmap_ops = {
3837 .open = packet_mm_open,
3838 .close = packet_mm_close,
3841 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
3846 for (i = 0; i < len; i++) {
3847 if (likely(pg_vec[i].buffer)) {
3848 if (is_vmalloc_addr(pg_vec[i].buffer))
3849 vfree(pg_vec[i].buffer);
3851 free_pages((unsigned long)pg_vec[i].buffer,
3853 pg_vec[i].buffer = NULL;
3859 static char *alloc_one_pg_vec_page(unsigned long order)
3862 gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
3863 __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
3865 buffer = (char *) __get_free_pages(gfp_flags, order);
3869 /* __get_free_pages failed, fall back to vmalloc */
3870 buffer = vzalloc((1 << order) * PAGE_SIZE);
3874 /* vmalloc failed, let's dig into swap here */
3875 gfp_flags &= ~__GFP_NORETRY;
3876 buffer = (char *) __get_free_pages(gfp_flags, order);
3880 /* complete and utter failure */
3884 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
3886 unsigned int block_nr = req->tp_block_nr;
3890 pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
3891 if (unlikely(!pg_vec))
3894 for (i = 0; i < block_nr; i++) {
3895 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
3896 if (unlikely(!pg_vec[i].buffer))
3897 goto out_free_pgvec;
3904 free_pg_vec(pg_vec, order, block_nr);
3909 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3910 int closing, int tx_ring)
3912 struct pgv *pg_vec = NULL;
3913 struct packet_sock *po = pkt_sk(sk);
3914 int was_running, order = 0;
3915 struct packet_ring_buffer *rb;
3916 struct sk_buff_head *rb_queue;
3919 /* Added to avoid minimal code churn */
3920 struct tpacket_req *req = &req_u->req;
3922 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3923 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3924 WARN(1, "Tx-ring is not supported.\n");
3928 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3929 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
3933 if (atomic_read(&po->mapped))
3935 if (packet_read_pending(rb))
3939 if (req->tp_block_nr) {
3940 /* Sanity tests and some calculations */
3942 if (unlikely(rb->pg_vec))
3945 switch (po->tp_version) {
3947 po->tp_hdrlen = TPACKET_HDRLEN;
3950 po->tp_hdrlen = TPACKET2_HDRLEN;
3953 po->tp_hdrlen = TPACKET3_HDRLEN;
3958 if (unlikely((int)req->tp_block_size <= 0))
3960 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
3962 if (po->tp_version >= TPACKET_V3 &&
3963 (int)(req->tp_block_size -
3964 BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
3966 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
3969 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
3972 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
3973 if (unlikely(rb->frames_per_block <= 0))
3975 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
3980 order = get_order(req->tp_block_size);
3981 pg_vec = alloc_pg_vec(req, order);
3982 if (unlikely(!pg_vec))
3984 switch (po->tp_version) {
3986 /* Transmit path is not supported. We checked
3987 * it above but just being paranoid
3990 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3999 if (unlikely(req->tp_frame_nr))
4005 /* Detach socket from network */
4006 spin_lock(&po->bind_lock);
4007 was_running = po->running;
4011 __unregister_prot_hook(sk, false);
4013 spin_unlock(&po->bind_lock);
4018 mutex_lock(&po->pg_vec_lock);
4019 if (closing || atomic_read(&po->mapped) == 0) {
4021 spin_lock_bh(&rb_queue->lock);
4022 swap(rb->pg_vec, pg_vec);
4023 rb->frame_max = (req->tp_frame_nr - 1);
4025 rb->frame_size = req->tp_frame_size;
4026 spin_unlock_bh(&rb_queue->lock);
4028 swap(rb->pg_vec_order, order);
4029 swap(rb->pg_vec_len, req->tp_block_nr);
4031 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
4032 po->prot_hook.func = (po->rx_ring.pg_vec) ?
4033 tpacket_rcv : packet_rcv;
4034 skb_queue_purge(rb_queue);
4035 if (atomic_read(&po->mapped))
4036 pr_err("packet_mmap: vma is busy: %d\n",
4037 atomic_read(&po->mapped));
4039 mutex_unlock(&po->pg_vec_lock);
4041 spin_lock(&po->bind_lock);
4044 register_prot_hook(sk);
4046 spin_unlock(&po->bind_lock);
4047 if (closing && (po->tp_version > TPACKET_V2)) {
4048 /* Because we don't support block-based V3 on tx-ring */
4050 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
4055 free_pg_vec(pg_vec, order, req->tp_block_nr);
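/*
 * Illustrative helper (not part of this file): building a tpacket_req that
 * satisfies the sanity checks in packet_set_ring() above -- block size a
 * multiple of the page size, frame size TPACKET_ALIGNMENT-aligned and at
 * least the header length, and frames_per_block * block_nr == frame_nr.
 * The 1500-byte payload budget is an assumption.
 */
#include <linux/if_packet.h>
#include <unistd.h>

static struct tpacket_req make_ring_req(unsigned int block_nr)
{
	struct tpacket_req req = {
		.tp_block_size = (unsigned int)sysconf(_SC_PAGESIZE),
		.tp_frame_size = TPACKET_ALIGN(TPACKET_HDRLEN + 1500),
		.tp_block_nr   = block_nr,
	};

	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) *
			  req.tp_block_nr;
	return req;
}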
4060 static int packet_mmap(struct file *file, struct socket *sock,
4061 struct vm_area_struct *vma)
4063 struct sock *sk = sock->sk;
4064 struct packet_sock *po = pkt_sk(sk);
4065 unsigned long size, expected_size;
4066 struct packet_ring_buffer *rb;
4067 unsigned long start;
4074 mutex_lock(&po->pg_vec_lock);
4077 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4079 expected_size += rb->pg_vec_len
4085 if (expected_size == 0)
4088 size = vma->vm_end - vma->vm_start;
4089 if (size != expected_size)
4092 start = vma->vm_start;
4093 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
4094 if (rb->pg_vec == NULL)
4097 for (i = 0; i < rb->pg_vec_len; i++) {
4099 void *kaddr = rb->pg_vec[i].buffer;
4102 for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
4103 page = pgv_to_page(kaddr);
4104 err = vm_insert_page(vma, start, page);
4113 atomic_inc(&po->mapped);
4114 vma->vm_ops = &packet_mmap_ops;
4118 mutex_unlock(&po->pg_vec_lock);
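/*
 * Illustrative user-space counterpart of packet_mmap() above (not part of
 * this file): map the ring and consume one frame. Assumes PACKET_VERSION
 * was set to TPACKET_V2 and an RX ring was configured with the tpacket_req
 * sizes sketched earlier; error handling and memory barriers are omitted.
 */
#include <linux/if_packet.h>
#include <poll.h>
#include <sys/mman.h>

static void rx_ring_consume_one(int fd, const struct tpacket_req *req)
{
	size_t ring_len = (size_t)req->tp_block_size * req->tp_block_nr;
	char *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);
	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)ring;	/* frame 0 */
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	char *data;

	/* Wait until tpacket_rcv() hands the frame to user space ... */
	while (!(hdr->tp_status & TP_STATUS_USER))
		poll(&pfd, 1, -1);

	/* ... the packet starts tp_mac bytes into the frame ... */
	data = (char *)hdr + hdr->tp_mac;
	(void)data;	/* parse the frame here */

	/* ... then hand the slot back to the kernel. */
	hdr->tp_status = TP_STATUS_KERNEL;
}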
4122 static const struct proto_ops packet_ops_spkt = {
4123 .family = PF_PACKET,
4124 .owner = THIS_MODULE,
4125 .release = packet_release,
4126 .bind = packet_bind_spkt,
4127 .connect = sock_no_connect,
4128 .socketpair = sock_no_socketpair,
4129 .accept = sock_no_accept,
4130 .getname = packet_getname_spkt,
4131 .poll = datagram_poll,
4132 .ioctl = packet_ioctl,
4133 .listen = sock_no_listen,
4134 .shutdown = sock_no_shutdown,
4135 .setsockopt = sock_no_setsockopt,
4136 .getsockopt = sock_no_getsockopt,
4137 .sendmsg = packet_sendmsg_spkt,
4138 .recvmsg = packet_recvmsg,
4139 .mmap = sock_no_mmap,
4140 .sendpage = sock_no_sendpage,
4143 static const struct proto_ops packet_ops = {
4144 .family = PF_PACKET,
4145 .owner = THIS_MODULE,
4146 .release = packet_release,
4147 .bind = packet_bind,
4148 .connect = sock_no_connect,
4149 .socketpair = sock_no_socketpair,
4150 .accept = sock_no_accept,
4151 .getname = packet_getname,
4152 .poll = packet_poll,
4153 .ioctl = packet_ioctl,
4154 .listen = sock_no_listen,
4155 .shutdown = sock_no_shutdown,
4156 .setsockopt = packet_setsockopt,
4157 .getsockopt = packet_getsockopt,
4158 .sendmsg = packet_sendmsg,
4159 .recvmsg = packet_recvmsg,
4160 .mmap = packet_mmap,
4161 .sendpage = sock_no_sendpage,
4164 static const struct net_proto_family packet_family_ops = {
4165 .family = PF_PACKET,
4166 .create = packet_create,
4167 .owner = THIS_MODULE,
4170 static struct notifier_block packet_netdev_notifier = {
4171 .notifier_call = packet_notifier,
4174 #ifdef CONFIG_PROC_FS
4176 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
4179 struct net *net = seq_file_net(seq);
4182 return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
4185 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4187 struct net *net = seq_file_net(seq);
4188 return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
4191 static void packet_seq_stop(struct seq_file *seq, void *v)
4197 static int packet_seq_show(struct seq_file *seq, void *v)
4199 if (v == SEQ_START_TOKEN)
4200 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
4202 struct sock *s = sk_entry(v);
4203 const struct packet_sock *po = pkt_sk(s);
4206 "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
4208 atomic_read(&s->sk_refcnt),
4213 atomic_read(&s->sk_rmem_alloc),
4214 from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
4221 static const struct seq_operations packet_seq_ops = {
4222 .start = packet_seq_start,
4223 .next = packet_seq_next,
4224 .stop = packet_seq_stop,
4225 .show = packet_seq_show,
4228 static int packet_seq_open(struct inode *inode, struct file *file)
4230 return seq_open_net(inode, file, &packet_seq_ops,
4231 sizeof(struct seq_net_private));
4234 static const struct file_operations packet_seq_fops = {
4235 .owner = THIS_MODULE,
4236 .open = packet_seq_open,
4238 .llseek = seq_lseek,
4239 .release = seq_release_net,
4244 static int __net_init packet_net_init(struct net *net)
4246 mutex_init(&net->packet.sklist_lock);
4247 INIT_HLIST_HEAD(&net->packet.sklist);
4249 if (!proc_create("packet", 0, net->proc_net, &packet_seq_fops))
4255 static void __net_exit packet_net_exit(struct net *net)
4257 remove_proc_entry("packet", net->proc_net);
4260 static struct pernet_operations packet_net_ops = {
4261 .init = packet_net_init,
4262 .exit = packet_net_exit,
4266 static void __exit packet_exit(void)
4268 unregister_netdevice_notifier(&packet_netdev_notifier);
4269 unregister_pernet_subsys(&packet_net_ops);
4270 sock_unregister(PF_PACKET);
4271 proto_unregister(&packet_proto);
4274 static int __init packet_init(void)
4276 int rc = proto_register(&packet_proto, 0);
4281 sock_register(&packet_family_ops);
4282 register_pernet_subsys(&packet_net_ops);
4283 register_netdevice_notifier(&packet_netdev_notifier);
4288 module_init(packet_init);
4289 module_exit(packet_exit);
4290 MODULE_LICENSE("GPL");
4291 MODULE_ALIAS_NETPROTO(PF_PACKET);