1 /****************************************************************************
2  * Driver for Solarflare Solarstorm network controllers and boards
3  * Copyright 2005-2006 Fen Systems Ltd.
4  * Copyright 2005-2010 Solarflare Communications Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms of the GNU General Public License version 2 as published
8  * by the Free Software Foundation, incorporated herein by reference.
9  */
10
11 #include <linux/pci.h>
12 #include <linux/tcp.h>
13 #include <linux/ip.h>
14 #include <linux/in.h>
15 #include <linux/ipv6.h>
16 #include <linux/slab.h>
17 #include <net/ipv6.h>
18 #include <linux/if_ether.h>
19 #include <linux/highmem.h>
20 #include "net_driver.h"
21 #include "efx.h"
22 #include "nic.h"
23 #include "workarounds.h"
24
25 /*
26  * TX descriptor ring full threshold
27  *
28  * The tx_queue descriptor ring fill-level must fall below this value
29  * before we restart the netif queue
30  */
31 #define EFX_TXQ_THRESHOLD(_efx) ((_efx)->txq_entries / 2u)
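
/* Worked example (illustrative): with a 1024-entry ring this evaluates to
 * 512, so efx_xmit_done() below only wakes the core TX queue once fewer
 * than 512 descriptors remain outstanding.
 */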
32
33 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
34                                struct efx_tx_buffer *buffer,
35                                unsigned int *pkts_compl,
36                                unsigned int *bytes_compl)
37 {
38         if (buffer->unmap_len) {
39                 struct device *dma_dev = &tx_queue->efx->pci_dev->dev;
40                 dma_addr_t unmap_addr = (buffer->dma_addr + buffer->len -
41                                          buffer->unmap_len);
42                 if (buffer->unmap_single)
43                         dma_unmap_single(dma_dev, unmap_addr, buffer->unmap_len,
44                                          DMA_TO_DEVICE);
45                 else
46                         dma_unmap_page(dma_dev, unmap_addr, buffer->unmap_len,
47                                        DMA_TO_DEVICE);
48                 buffer->unmap_len = 0;
49                 buffer->unmap_single = false;
50         }
51
52         if (buffer->skb) {
53                 (*pkts_compl)++;
54                 (*bytes_compl) += buffer->skb->len;
55                 dev_kfree_skb_any((struct sk_buff *) buffer->skb);
56                 buffer->skb = NULL;
57                 netif_vdbg(tx_queue->efx, tx_done, tx_queue->efx->net_dev,
58                            "TX queue %d transmission id %x complete\n",
59                            tx_queue->queue, tx_queue->read_count);
60         }
61 }
62
63 /**
64  * struct efx_tso_header - a DMA mapped buffer for packet headers
65  * @next: Linked list of free ones.
66  *      The list is protected by the TX queue lock.
67  * @unmap_len: Length to unmap for an oversize buffer, or 0.
68  * @dma_addr: The DMA address of the header below.
69  *
70  * This controls the memory used for a TSO header.  Use TSOH_BUFFER()
71  * to find the packet header data.  Use TSOH_SIZE() to calculate the
72  * total size required for a given packet header length.  TSO headers
73  * in the free list are exactly %TSOH_STD_SIZE bytes in size.
74  */
75 struct efx_tso_header {
76         union {
77                 struct efx_tso_header *next;
78                 size_t unmap_len;
79         };
80         dma_addr_t dma_addr;
81 };
82
83 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
84                                struct sk_buff *skb);
85 static void efx_fini_tso(struct efx_tx_queue *tx_queue);
86 static void efx_tsoh_heap_free(struct efx_tx_queue *tx_queue,
87                                struct efx_tso_header *tsoh);
88
89 static void efx_tsoh_free(struct efx_tx_queue *tx_queue,
90                           struct efx_tx_buffer *buffer)
91 {
92         if (buffer->tsoh) {
93                 if (likely(!buffer->tsoh->unmap_len)) {
94                         buffer->tsoh->next = tx_queue->tso_headers_free;
95                         tx_queue->tso_headers_free = buffer->tsoh;
96                 } else {
97                         efx_tsoh_heap_free(tx_queue, buffer->tsoh);
98                 }
99                 buffer->tsoh = NULL;
100         }
101 }
102
103
104 static inline unsigned
105 efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
106 {
107         /* Depending on the NIC revision, we can use descriptor
108          * lengths up to 8K or 8K-1.  However, since PCI Express
109          * devices must split read requests at 4K boundaries, there is
110          * little benefit from using descriptors that cross those
111          * boundaries and we keep things simple by not doing so.
112          */
113         unsigned len = (~dma_addr & (EFX_PAGE_SIZE - 1)) + 1;
114
115         /* Work around hardware bug for unaligned buffers. */
116         if (EFX_WORKAROUND_5391(efx) && (dma_addr & 0xf))
117                 len = min_t(unsigned, len, 512 - (dma_addr & 0xf));
118
119         return len;
120 }
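
/* Worked example for efx_max_tx_len(), assuming EFX_PAGE_SIZE is 4096:
 * a buffer mapped at a DMA address ending in 0xf00 gives
 * len = (~0xf00 & 0xfff) + 1 = 0x100, i.e. the 256 bytes remaining before
 * the next 4K boundary, while a 4K-aligned address gives the full 4096.
 */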
121
122 unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
123 {
124         /* Header and payload descriptor for each output segment, plus
125          * one for every input fragment boundary within a segment
126          */
127         unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
128
129         /* Possibly one more per segment for the alignment workaround */
130         if (EFX_WORKAROUND_5391(efx))
131                 max_descs += EFX_TSO_MAX_SEGS;
132
133         /* Possibly more for PCIe page boundaries within input fragments */
134         if (PAGE_SIZE > EFX_PAGE_SIZE)
135                 max_descs += max_t(unsigned int, MAX_SKB_FRAGS,
136                                    DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE));
137
138         return max_descs;
139 }
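
/* Illustrative numbers only: if EFX_TSO_MAX_SEGS were 100 and MAX_SKB_FRAGS
 * were 17 (typical with 4K pages), the baseline above would be
 * 100 * 2 + 17 = 217 descriptors, and the workaround for bug 5391 could add
 * up to another 100.
 */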
140
141 /*
142  * Add a socket buffer to a TX queue
143  *
144  * This maps all fragments of a socket buffer for DMA and adds them to
145  * the TX queue.  The queue's insert pointer will be incremented by
146  * the number of fragments in the socket buffer.
147  *
148  * If any DMA mapping fails, any mapped fragments will be unmapped and
149  * the queue's insert pointer will be restored to its original value.
150  *
151  * This function is split out from efx_hard_start_xmit to allow the
152  * loopback test to direct packets via specific TX queues.
153  *
154  * Returns NETDEV_TX_OK or NETDEV_TX_BUSY
155  * You must hold netif_tx_lock() to call this function.
156  */
157 netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
158 {
159         struct efx_nic *efx = tx_queue->efx;
160         struct device *dma_dev = &efx->pci_dev->dev;
161         struct efx_tx_buffer *buffer;
162         skb_frag_t *fragment;
163         unsigned int len, unmap_len = 0, fill_level, insert_ptr;
164         dma_addr_t dma_addr, unmap_addr = 0;
165         unsigned int dma_len;
166         bool unmap_single;
167         int q_space, i = 0;
168         netdev_tx_t rc = NETDEV_TX_OK;
169
170         EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
171
172         if (skb_shinfo(skb)->gso_size)
173                 return efx_enqueue_skb_tso(tx_queue, skb);
174
175         /* Get size of the initial fragment */
176         len = skb_headlen(skb);
177
178         /* Pad if necessary */
179         if (EFX_WORKAROUND_15592(efx) && skb->len <= 32) {
180                 EFX_BUG_ON_PARANOID(skb->data_len);
181                 len = 32 + 1;
182                 if (skb_pad(skb, len - skb->len))
183                         return NETDEV_TX_OK;
184         }
185
186         fill_level = tx_queue->insert_count - tx_queue->old_read_count;
187         q_space = efx->txq_entries - 1 - fill_level;
188
189         /* Map for DMA.  Use dma_map_single rather than dma_map_page
190          * since this is more efficient on machines with sparse
191          * memory.
192          */
193         unmap_single = true;
194         dma_addr = dma_map_single(dma_dev, skb->data, len, DMA_TO_DEVICE);
195
196         /* Process all fragments */
197         while (1) {
198                 if (unlikely(dma_mapping_error(dma_dev, dma_addr)))
199                         goto dma_err;
200
201                 /* Store fields for marking in the per-fragment final
202                  * descriptor */
203                 unmap_len = len;
204                 unmap_addr = dma_addr;
205
206                 /* Add to TX queue, splitting across DMA boundaries */
207                 do {
208                         if (unlikely(q_space-- <= 0)) {
209                                 /* It might be that completions have
210                                  * happened since the xmit path last
211                                  * checked.  Update the xmit path's
212                                  * copy of read_count.
213                                  */
214                                 netif_tx_stop_queue(tx_queue->core_txq);
215                                 /* This memory barrier protects the
216                                  * change of queue state from the access
217                                  * of read_count. */
218                                 smp_mb();
219                                 tx_queue->old_read_count =
220                                         ACCESS_ONCE(tx_queue->read_count);
221                                 fill_level = (tx_queue->insert_count
222                                               - tx_queue->old_read_count);
223                                 q_space = efx->txq_entries - 1 - fill_level;
224                                 if (unlikely(q_space-- <= 0)) {
225                                         rc = NETDEV_TX_BUSY;
226                                         goto unwind;
227                                 }
228                                 smp_mb();
229                                 if (likely(!efx->loopback_selftest))
230                                         netif_tx_start_queue(
231                                                 tx_queue->core_txq);
232                         }
233
234                         insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
235                         buffer = &tx_queue->buffer[insert_ptr];
236                         efx_tsoh_free(tx_queue, buffer);
237                         EFX_BUG_ON_PARANOID(buffer->tsoh);
238                         EFX_BUG_ON_PARANOID(buffer->skb);
239                         EFX_BUG_ON_PARANOID(buffer->len);
240                         EFX_BUG_ON_PARANOID(!buffer->continuation);
241                         EFX_BUG_ON_PARANOID(buffer->unmap_len);
242
243                         dma_len = efx_max_tx_len(efx, dma_addr);
244                         if (likely(dma_len >= len))
245                                 dma_len = len;
246
247                         /* Fill out per descriptor fields */
248                         buffer->len = dma_len;
249                         buffer->dma_addr = dma_addr;
250                         len -= dma_len;
251                         dma_addr += dma_len;
252                         ++tx_queue->insert_count;
253                 } while (len);
254
255                 /* Transfer ownership of the unmapping to the final buffer */
256                 buffer->unmap_single = unmap_single;
257                 buffer->unmap_len = unmap_len;
258                 unmap_len = 0;
259
260                 /* Get address and size of next fragment */
261                 if (i >= skb_shinfo(skb)->nr_frags)
262                         break;
263                 fragment = &skb_shinfo(skb)->frags[i];
264                 len = skb_frag_size(fragment);
265                 i++;
266                 /* Map for DMA */
267                 unmap_single = false;
268                 dma_addr = skb_frag_dma_map(dma_dev, fragment, 0, len,
269                                             DMA_TO_DEVICE);
270         }
271
272         /* Transfer ownership of the skb to the final buffer */
273         buffer->skb = skb;
274         buffer->continuation = false;
275
276         netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
277
278         /* Pass off to hardware */
279         efx_nic_push_buffers(tx_queue);
280
281         return NETDEV_TX_OK;
282
283  dma_err:
284         netif_err(efx, tx_err, efx->net_dev,
285                   " TX queue %d could not map skb with %d bytes %d "
286                   "fragments for DMA\n", tx_queue->queue, skb->len,
287                   skb_shinfo(skb)->nr_frags + 1);
288
289         /* Mark the packet as transmitted, and free the SKB ourselves */
290         dev_kfree_skb_any(skb);
291
292  unwind:
293         /* Work backwards until we hit the original insert pointer value */
294         while (tx_queue->insert_count != tx_queue->write_count) {
295                 unsigned int pkts_compl = 0, bytes_compl = 0;
296                 --tx_queue->insert_count;
297                 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
298                 buffer = &tx_queue->buffer[insert_ptr];
299                 efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
300                 buffer->len = 0;
301         }
302
303         /* Free the fragment we were mid-way through pushing */
304         if (unmap_len) {
305                 if (unmap_single)
306                         dma_unmap_single(dma_dev, unmap_addr, unmap_len,
307                                          DMA_TO_DEVICE);
308                 else
309                         dma_unmap_page(dma_dev, unmap_addr, unmap_len,
310                                        DMA_TO_DEVICE);
311         }
312
313         return rc;
314 }
315
316 /* Remove packets from the TX queue
317  *
318  * This removes packets from the TX queue, up to and including the
319  * specified index.
320  */
321 static void efx_dequeue_buffers(struct efx_tx_queue *tx_queue,
322                                 unsigned int index,
323                                 unsigned int *pkts_compl,
324                                 unsigned int *bytes_compl)
325 {
326         struct efx_nic *efx = tx_queue->efx;
327         unsigned int stop_index, read_ptr;
328
329         stop_index = (index + 1) & tx_queue->ptr_mask;
330         read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
331
332         while (read_ptr != stop_index) {
333                 struct efx_tx_buffer *buffer = &tx_queue->buffer[read_ptr];
334                 if (unlikely(buffer->len == 0)) {
335                         netif_err(efx, tx_err, efx->net_dev,
336                                   "TX queue %d spurious TX completion id %x\n",
337                                   tx_queue->queue, read_ptr);
338                         efx_schedule_reset(efx, RESET_TYPE_TX_SKIP);
339                         return;
340                 }
341
342                 efx_dequeue_buffer(tx_queue, buffer, pkts_compl, bytes_compl);
343                 buffer->continuation = true;
344                 buffer->len = 0;
345
346                 ++tx_queue->read_count;
347                 read_ptr = tx_queue->read_count & tx_queue->ptr_mask;
348         }
349 }
350
351 /* Initiate a packet transmission.  We use one channel per CPU
352  * (sharing when we have more CPUs than channels).  On Falcon, the TX
353  * completion events will be directed back to the CPU that transmitted
354  * the packet, which should be cache-efficient.
355  *
356  * Context: non-blocking.
357  * Note that returning anything other than NETDEV_TX_OK will cause the
358  * OS to free the skb.
359  */
360 netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
361                                 struct net_device *net_dev)
362 {
363         struct efx_nic *efx = netdev_priv(net_dev);
364         struct efx_tx_queue *tx_queue;
365         unsigned index, type;
366
367         EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));
368
369         index = skb_get_queue_mapping(skb);
370         type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;
371         if (index >= efx->n_tx_channels) {
372                 index -= efx->n_tx_channels;
373                 type |= EFX_TXQ_TYPE_HIGHPRI;
374         }
375         tx_queue = efx_get_tx_queue(efx, index, type);
376
377         return efx_enqueue_skb(tx_queue, skb);
378 }
379
380 void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
381 {
382         struct efx_nic *efx = tx_queue->efx;
383
384         /* Must be inverse of queue lookup in efx_hard_start_xmit() */
385         tx_queue->core_txq =
386                 netdev_get_tx_queue(efx->net_dev,
387                                     tx_queue->queue / EFX_TXQ_TYPES +
388                                     ((tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI) ?
389                                      efx->n_tx_channels : 0));
390 }
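
/* Example of the two mappings above, assuming four TX channels: core TX
 * queues 0-3 select the normal-priority queue of channels 0-3, while core
 * queues 4-7 (used once a second traffic class is configured) select the
 * high-priority queues of the same channels.
 */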
391
392 int efx_setup_tc(struct net_device *net_dev, u8 num_tc)
393 {
394         struct efx_nic *efx = netdev_priv(net_dev);
395         struct efx_channel *channel;
396         struct efx_tx_queue *tx_queue;
397         unsigned tc;
398         int rc;
399
400         if (efx_nic_rev(efx) < EFX_REV_FALCON_B0 || num_tc > EFX_MAX_TX_TC)
401                 return -EINVAL;
402
403         if (num_tc == net_dev->num_tc)
404                 return 0;
405
406         for (tc = 0; tc < num_tc; tc++) {
407                 net_dev->tc_to_txq[tc].offset = tc * efx->n_tx_channels;
408                 net_dev->tc_to_txq[tc].count = efx->n_tx_channels;
409         }
410
411         if (num_tc > net_dev->num_tc) {
412                 /* Initialise high-priority queues as necessary */
413                 efx_for_each_channel(channel, efx) {
414                         efx_for_each_possible_channel_tx_queue(tx_queue,
415                                                                channel) {
416                                 if (!(tx_queue->queue & EFX_TXQ_TYPE_HIGHPRI))
417                                         continue;
418                                 if (!tx_queue->buffer) {
419                                         rc = efx_probe_tx_queue(tx_queue);
420                                         if (rc)
421                                                 return rc;
422                                 }
423                                 if (!tx_queue->initialised)
424                                         efx_init_tx_queue(tx_queue);
425                                 efx_init_tx_queue_core_txq(tx_queue);
426                         }
427                 }
428         } else {
429                 /* Reduce number of classes before number of queues */
430                 net_dev->num_tc = num_tc;
431         }
432
433         rc = netif_set_real_num_tx_queues(net_dev,
434                                           max_t(int, num_tc, 1) *
435                                           efx->n_tx_channels);
436         if (rc)
437                 return rc;
438
439         /* Do not destroy high-priority queues when they become
440          * unused.  We would have to flush them first, and it is
441          * fairly difficult to flush a subset of TX queues.  Leave
442          * it to efx_fini_channels().
443          */
444
445         net_dev->num_tc = num_tc;
446         return 0;
447 }
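
/* For illustration, with four TX channels and num_tc == 2 the code above
 * sets tc_to_txq[0] = {offset 0, count 4} and tc_to_txq[1] = {offset 4,
 * count 4}, and real_num_tx_queues becomes 8, so traffic class 1 is steered
 * to the high-priority core queues 4-7.
 */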
448
449 void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
450 {
451         unsigned fill_level;
452         struct efx_nic *efx = tx_queue->efx;
453         unsigned int pkts_compl = 0, bytes_compl = 0;
454
455         EFX_BUG_ON_PARANOID(index > tx_queue->ptr_mask);
456
457         efx_dequeue_buffers(tx_queue, index, &pkts_compl, &bytes_compl);
458         netdev_tx_completed_queue(tx_queue->core_txq, pkts_compl, bytes_compl);
459
460         /* See if we need to restart the netif queue.  This barrier
461          * separates the update of read_count from the test of the
462          * queue state. */
463         smp_mb();
464         if (unlikely(netif_tx_queue_stopped(tx_queue->core_txq)) &&
465             likely(efx->port_enabled) &&
466             likely(netif_device_present(efx->net_dev))) {
467                 fill_level = tx_queue->insert_count - tx_queue->read_count;
468                 if (fill_level < EFX_TXQ_THRESHOLD(efx))
469                         netif_tx_wake_queue(tx_queue->core_txq);
470         }
471
472         /* Check whether the hardware queue is now empty */
473         if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) {
474                 tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count);
475                 if (tx_queue->read_count == tx_queue->old_write_count) {
476                         smp_mb();
477                         tx_queue->empty_read_count =
478                                 tx_queue->read_count | EFX_EMPTY_COUNT_VALID;
479                 }
480         }
481 }
482
483 int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
484 {
485         struct efx_nic *efx = tx_queue->efx;
486         unsigned int entries;
487         int i, rc;
488
489         /* Create the smallest power-of-two aligned ring */
490         entries = max(roundup_pow_of_two(efx->txq_entries), EFX_MIN_DMAQ_SIZE);
491         EFX_BUG_ON_PARANOID(entries > EFX_MAX_DMAQ_SIZE);
492         tx_queue->ptr_mask = entries - 1;
493
494         netif_dbg(efx, probe, efx->net_dev,
495                   "creating TX queue %d size %#x mask %#x\n",
496                   tx_queue->queue, efx->txq_entries, tx_queue->ptr_mask);
497
498         /* Allocate software ring */
499         tx_queue->buffer = kcalloc(entries, sizeof(*tx_queue->buffer),
500                                    GFP_KERNEL);
501         if (!tx_queue->buffer)
502                 return -ENOMEM;
503         for (i = 0; i <= tx_queue->ptr_mask; ++i)
504                 tx_queue->buffer[i].continuation = true;
505
506         /* Allocate hardware ring */
507         rc = efx_nic_probe_tx(tx_queue);
508         if (rc)
509                 goto fail;
510
511         return 0;
512
513  fail:
514         kfree(tx_queue->buffer);
515         tx_queue->buffer = NULL;
516         return rc;
517 }
518
519 void efx_init_tx_queue(struct efx_tx_queue *tx_queue)
520 {
521         netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
522                   "initialising TX queue %d\n", tx_queue->queue);
523
524         tx_queue->insert_count = 0;
525         tx_queue->write_count = 0;
526         tx_queue->old_write_count = 0;
527         tx_queue->read_count = 0;
528         tx_queue->old_read_count = 0;
529         tx_queue->empty_read_count = 0 | EFX_EMPTY_COUNT_VALID;
530
531         /* Set up TX descriptor ring */
532         efx_nic_init_tx(tx_queue);
533
534         tx_queue->initialised = true;
535 }
536
537 void efx_release_tx_buffers(struct efx_tx_queue *tx_queue)
538 {
539         struct efx_tx_buffer *buffer;
540
541         if (!tx_queue->buffer)
542                 return;
543
544         /* Free any buffers left in the ring */
545         while (tx_queue->read_count != tx_queue->write_count) {
546                 unsigned int pkts_compl = 0, bytes_compl = 0;
547                 buffer = &tx_queue->buffer[tx_queue->read_count & tx_queue->ptr_mask];
548                 efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
549                 buffer->continuation = true;
550                 buffer->len = 0;
551
552                 ++tx_queue->read_count;
553         }
554         netdev_tx_reset_queue(tx_queue->core_txq);
555 }
556
557 void efx_fini_tx_queue(struct efx_tx_queue *tx_queue)
558 {
559         if (!tx_queue->initialised)
560                 return;
561
562         netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
563                   "shutting down TX queue %d\n", tx_queue->queue);
564
565         tx_queue->initialised = false;
566
567         /* Flush TX queue, remove descriptor ring */
568         efx_nic_fini_tx(tx_queue);
569
570         efx_release_tx_buffers(tx_queue);
571
572         /* Free up TSO header cache */
573         efx_fini_tso(tx_queue);
574 }
575
576 void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
577 {
578         if (!tx_queue->buffer)
579                 return;
580
581         netif_dbg(tx_queue->efx, drv, tx_queue->efx->net_dev,
582                   "destroying TX queue %d\n", tx_queue->queue);
583         efx_nic_remove_tx(tx_queue);
584
585         kfree(tx_queue->buffer);
586         tx_queue->buffer = NULL;
587 }
588
589
590 /* Efx TCP segmentation acceleration.
591  *
592  * Why?  Because by doing it here in the driver we can go significantly
593  * faster than software GSO.
594  *
595  * Requires TX checksum offload support.
596  */
597
598 /* Number of bytes inserted at the start of a TSO header buffer,
599  * similar to NET_IP_ALIGN.
600  */
601 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
602 #define TSOH_OFFSET     0
603 #else
604 #define TSOH_OFFSET     NET_IP_ALIGN
605 #endif
606
607 #define TSOH_BUFFER(tsoh)       ((u8 *)(tsoh + 1) + TSOH_OFFSET)
608
609 /* Total size of struct efx_tso_header, buffer and padding */
610 #define TSOH_SIZE(hdr_len)                                      \
611         (sizeof(struct efx_tso_header) + TSOH_OFFSET + hdr_len)
612
613 /* Size of blocks on free list.  Larger blocks must be allocated from
614  * the heap.
615  */
616 #define TSOH_STD_SIZE           128
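
/* Worked example, assuming a 64-bit build (16-byte struct efx_tso_header)
 * and TSOH_OFFSET == 2: a standard 54-byte Ethernet + IPv4 + TCP header
 * gives TSOH_SIZE(54) = 16 + 2 + 54 = 72 <= TSOH_STD_SIZE, so it is served
 * from the free list; only unusually long headers fall back to
 * efx_tsoh_heap_alloc().
 */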
617
618 #define PTR_DIFF(p1, p2)  ((u8 *)(p1) - (u8 *)(p2))
619 #define ETH_HDR_LEN(skb)  (skb_network_header(skb) - (skb)->data)
620 #define SKB_TCP_OFF(skb)  PTR_DIFF(tcp_hdr(skb), (skb)->data)
621 #define SKB_IPV4_OFF(skb) PTR_DIFF(ip_hdr(skb), (skb)->data)
622 #define SKB_IPV6_OFF(skb) PTR_DIFF(ipv6_hdr(skb), (skb)->data)
623
624 /**
625  * struct tso_state - TSO state for an SKB
626  * @out_len: Remaining length in current segment
627  * @seqnum: Current sequence number
628  * @ipv4_id: Current IPv4 ID, host endian
629  * @packet_space: Remaining space in current packet
630  * @dma_addr: DMA address of current position
631  * @in_len: Remaining length in current SKB fragment
632  * @unmap_len: Length of SKB fragment
633  * @unmap_addr: DMA address of SKB fragment
634  * @unmap_single: DMA single vs page mapping flag
635  * @protocol: Network protocol (after any VLAN header)
636  * @header_len: Number of bytes of header
637  * @full_packet_size: Number of bytes to put in each outgoing segment
638  *
639  * The state used during segmentation.  It is put into this data structure
640  * just to make it easy to pass into inline functions.
641  */
642 struct tso_state {
643         /* Output position */
644         unsigned out_len;
645         unsigned seqnum;
646         unsigned ipv4_id;
647         unsigned packet_space;
648
649         /* Input position */
650         dma_addr_t dma_addr;
651         unsigned in_len;
652         unsigned unmap_len;
653         dma_addr_t unmap_addr;
654         bool unmap_single;
655
656         __be16 protocol;
657         unsigned header_len;
658         int full_packet_size;
659 };
660
661
662 /*
663  * Verify that our various assumptions about sk_buffs and the conditions
664  * under which TSO will be attempted hold true.  Return the protocol number.
665  */
666 static __be16 efx_tso_check_protocol(struct sk_buff *skb)
667 {
668         __be16 protocol = skb->protocol;
669
670         EFX_BUG_ON_PARANOID(((struct ethhdr *)skb->data)->h_proto !=
671                             protocol);
672         if (protocol == htons(ETH_P_8021Q)) {
673                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
674                 protocol = veh->h_vlan_encapsulated_proto;
675         }
676
677         if (protocol == htons(ETH_P_IP)) {
678                 EFX_BUG_ON_PARANOID(ip_hdr(skb)->protocol != IPPROTO_TCP);
679         } else {
680                 EFX_BUG_ON_PARANOID(protocol != htons(ETH_P_IPV6));
681                 EFX_BUG_ON_PARANOID(ipv6_hdr(skb)->nexthdr != NEXTHDR_TCP);
682         }
683         EFX_BUG_ON_PARANOID((PTR_DIFF(tcp_hdr(skb), skb->data)
684                              + (tcp_hdr(skb)->doff << 2u)) >
685                             skb_headlen(skb));
686
687         return protocol;
688 }
689
690
691 /*
692  * Allocate a page worth of efx_tso_header structures, and string them
693  * into the tx_queue->tso_headers_free linked list. Return 0 or -ENOMEM.
694  */
695 static int efx_tsoh_block_alloc(struct efx_tx_queue *tx_queue)
696 {
697         struct device *dma_dev = &tx_queue->efx->pci_dev->dev;
698         struct efx_tso_header *tsoh;
699         dma_addr_t dma_addr;
700         u8 *base_kva, *kva;
701
702         base_kva = dma_alloc_coherent(dma_dev, PAGE_SIZE, &dma_addr, GFP_ATOMIC);
703         if (base_kva == NULL) {
704                 netif_err(tx_queue->efx, tx_err, tx_queue->efx->net_dev,
705                           "Unable to allocate page for TSO headers\n");
706                 return -ENOMEM;
707         }
708
709         /* dma_alloc_coherent() allocates pages. */
710         EFX_BUG_ON_PARANOID(dma_addr & (PAGE_SIZE - 1u));
711
712         for (kva = base_kva; kva < base_kva + PAGE_SIZE; kva += TSOH_STD_SIZE) {
713                 tsoh = (struct efx_tso_header *)kva;
714                 tsoh->dma_addr = dma_addr + (TSOH_BUFFER(tsoh) - base_kva);
715                 tsoh->next = tx_queue->tso_headers_free;
716                 tx_queue->tso_headers_free = tsoh;
717         }
718
719         return 0;
720 }
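
/* With 4K pages, each coherent block allocated above is carved into
 * PAGE_SIZE / TSOH_STD_SIZE = 32 free-list headers.
 */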
721
722
723 /* Free up a TSO header, and all others in the same page. */
724 static void efx_tsoh_block_free(struct efx_tx_queue *tx_queue,
725                                 struct efx_tso_header *tsoh,
726                                 struct device *dma_dev)
727 {
728         struct efx_tso_header **p;
729         unsigned long base_kva;
730         dma_addr_t base_dma;
731
732         base_kva = (unsigned long)tsoh & PAGE_MASK;
733         base_dma = tsoh->dma_addr & PAGE_MASK;
734
735         p = &tx_queue->tso_headers_free;
736         while (*p != NULL) {
737                 if (((unsigned long)*p & PAGE_MASK) == base_kva)
738                         *p = (*p)->next;
739                 else
740                         p = &(*p)->next;
741         }
742
743         dma_free_coherent(dma_dev, PAGE_SIZE, (void *)base_kva, base_dma);
744 }
745
746 static struct efx_tso_header *
747 efx_tsoh_heap_alloc(struct efx_tx_queue *tx_queue, size_t header_len)
748 {
749         struct efx_tso_header *tsoh;
750
751         tsoh = kmalloc(TSOH_SIZE(header_len), GFP_ATOMIC | GFP_DMA);
752         if (unlikely(!tsoh))
753                 return NULL;
754
755         tsoh->dma_addr = dma_map_single(&tx_queue->efx->pci_dev->dev,
756                                         TSOH_BUFFER(tsoh), header_len,
757                                         DMA_TO_DEVICE);
758         if (unlikely(dma_mapping_error(&tx_queue->efx->pci_dev->dev,
759                                        tsoh->dma_addr))) {
760                 kfree(tsoh);
761                 return NULL;
762         }
763
764         tsoh->unmap_len = header_len;
765         return tsoh;
766 }
767
768 static void
769 efx_tsoh_heap_free(struct efx_tx_queue *tx_queue, struct efx_tso_header *tsoh)
770 {
771         dma_unmap_single(&tx_queue->efx->pci_dev->dev,
772                          tsoh->dma_addr, tsoh->unmap_len,
773                          DMA_TO_DEVICE);
774         kfree(tsoh);
775 }
776
777 /**
778  * efx_tx_queue_insert - push descriptors onto the TX queue
779  * @tx_queue:           Efx TX queue
780  * @dma_addr:           DMA address of fragment
781  * @len:                Length of fragment
782  * @final_buffer:       The final buffer inserted into the queue
783  *
784  * Push descriptors onto the TX queue.  Return 0 on success or 1 if
785  * @tx_queue is full.
786  */
787 static int efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
788                                dma_addr_t dma_addr, unsigned len,
789                                struct efx_tx_buffer **final_buffer)
790 {
791         struct efx_tx_buffer *buffer;
792         struct efx_nic *efx = tx_queue->efx;
793         unsigned dma_len, fill_level, insert_ptr;
794         int q_space;
795
796         EFX_BUG_ON_PARANOID(len <= 0);
797
798         fill_level = tx_queue->insert_count - tx_queue->old_read_count;
799         /* -1 as there is no way to represent all descriptors used */
800         q_space = efx->txq_entries - 1 - fill_level;
801
802         while (1) {
803                 if (unlikely(q_space-- <= 0)) {
804                         /* It might be that completions have happened
805                          * since the xmit path last checked.  Update
806                          * the xmit path's copy of read_count.
807                          */
808                         netif_tx_stop_queue(tx_queue->core_txq);
809                         /* This memory barrier protects the change of
810                          * queue state from the access of read_count. */
811                         smp_mb();
812                         tx_queue->old_read_count =
813                                 ACCESS_ONCE(tx_queue->read_count);
814                         fill_level = (tx_queue->insert_count
815                                       - tx_queue->old_read_count);
816                         q_space = efx->txq_entries - 1 - fill_level;
817                         if (unlikely(q_space-- <= 0)) {
818                                 *final_buffer = NULL;
819                                 return 1;
820                         }
821                         smp_mb();
822                         netif_tx_start_queue(tx_queue->core_txq);
823                 }
824
825                 insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
826                 buffer = &tx_queue->buffer[insert_ptr];
827                 ++tx_queue->insert_count;
828
829                 EFX_BUG_ON_PARANOID(tx_queue->insert_count -
830                                     tx_queue->read_count >=
831                                     efx->txq_entries);
832
833                 efx_tsoh_free(tx_queue, buffer);
834                 EFX_BUG_ON_PARANOID(buffer->len);
835                 EFX_BUG_ON_PARANOID(buffer->unmap_len);
836                 EFX_BUG_ON_PARANOID(buffer->skb);
837                 EFX_BUG_ON_PARANOID(!buffer->continuation);
838                 EFX_BUG_ON_PARANOID(buffer->tsoh);
839
840                 buffer->dma_addr = dma_addr;
841
842                 dma_len = efx_max_tx_len(efx, dma_addr);
843
844                 /* If there is enough space to send then do so */
845                 if (dma_len >= len)
846                         break;
847
848                 buffer->len = dma_len; /* Don't set the other members */
849                 dma_addr += dma_len;
850                 len -= dma_len;
851         }
852
853         EFX_BUG_ON_PARANOID(!len);
854         buffer->len = len;
855         *final_buffer = buffer;
856         return 0;
857 }
858
859
860 /*
861  * Put a TSO header into the TX queue.
862  *
863  * This is special-cased because we know that it is small enough to fit in
864  * a single fragment, and we know it doesn't cross a page boundary.  It
865  * also allows us to not worry about end-of-packet etc.
866  */
867 static void efx_tso_put_header(struct efx_tx_queue *tx_queue,
868                                struct efx_tso_header *tsoh, unsigned len)
869 {
870         struct efx_tx_buffer *buffer;
871
872         buffer = &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask];
873         efx_tsoh_free(tx_queue, buffer);
874         EFX_BUG_ON_PARANOID(buffer->len);
875         EFX_BUG_ON_PARANOID(buffer->unmap_len);
876         EFX_BUG_ON_PARANOID(buffer->skb);
877         EFX_BUG_ON_PARANOID(!buffer->continuation);
878         EFX_BUG_ON_PARANOID(buffer->tsoh);
879         buffer->len = len;
880         buffer->dma_addr = tsoh->dma_addr;
881         buffer->tsoh = tsoh;
882
883         ++tx_queue->insert_count;
884 }
885
886
887 /* Remove descriptors put into a tx_queue. */
888 static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
889 {
890         struct efx_tx_buffer *buffer;
891         dma_addr_t unmap_addr;
892
893         /* Work backwards until we hit the original insert pointer value */
894         while (tx_queue->insert_count != tx_queue->write_count) {
895                 --tx_queue->insert_count;
896                 buffer = &tx_queue->buffer[tx_queue->insert_count &
897                                            tx_queue->ptr_mask];
898                 efx_tsoh_free(tx_queue, buffer);
899                 EFX_BUG_ON_PARANOID(buffer->skb);
900                 if (buffer->unmap_len) {
901                         unmap_addr = (buffer->dma_addr + buffer->len -
902                                       buffer->unmap_len);
903                         if (buffer->unmap_single)
904                                 dma_unmap_single(&tx_queue->efx->pci_dev->dev,
905                                                  unmap_addr, buffer->unmap_len,
906                                                  DMA_TO_DEVICE);
907                         else
908                                 dma_unmap_page(&tx_queue->efx->pci_dev->dev,
909                                                unmap_addr, buffer->unmap_len,
910                                                DMA_TO_DEVICE);
911                         buffer->unmap_len = 0;
912                 }
913                 buffer->len = 0;
914                 buffer->continuation = true;
915         }
916 }
917
918
919 /* Parse the SKB header and initialise state. */
920 static void tso_start(struct tso_state *st, const struct sk_buff *skb)
921 {
922         /* All ethernet/IP/TCP headers combined size is TCP header size
923          * plus offset of TCP header relative to start of packet.
924          */
925         st->header_len = ((tcp_hdr(skb)->doff << 2u)
926                           + PTR_DIFF(tcp_hdr(skb), skb->data));
927         st->full_packet_size = st->header_len + skb_shinfo(skb)->gso_size;
928
929         if (st->protocol == htons(ETH_P_IP))
930                 st->ipv4_id = ntohs(ip_hdr(skb)->id);
931         else
932                 st->ipv4_id = 0;
933         st->seqnum = ntohl(tcp_hdr(skb)->seq);
934
935         EFX_BUG_ON_PARANOID(tcp_hdr(skb)->urg);
936         EFX_BUG_ON_PARANOID(tcp_hdr(skb)->syn);
937         EFX_BUG_ON_PARANOID(tcp_hdr(skb)->rst);
938
939         st->out_len = skb->len - st->header_len;
940         st->unmap_len = 0;
941         st->unmap_single = false;
942 }
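
/* Example with no IP or TCP options: header_len = 14 + 20 + 20 = 54, and
 * with gso_size == 1460 each full segment carries 1460 bytes of payload in
 * a 1514-byte frame (a standard 1500-byte IP packet plus the 14-byte
 * Ethernet header).
 */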
943
944 static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
945                             skb_frag_t *frag)
946 {
947         st->unmap_addr = skb_frag_dma_map(&efx->pci_dev->dev, frag, 0,
948                                           skb_frag_size(frag), DMA_TO_DEVICE);
949         if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
950                 st->unmap_single = false;
951                 st->unmap_len = skb_frag_size(frag);
952                 st->in_len = skb_frag_size(frag);
953                 st->dma_addr = st->unmap_addr;
954                 return 0;
955         }
956         return -ENOMEM;
957 }
958
959 static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
960                                  const struct sk_buff *skb)
961 {
962         int hl = st->header_len;
963         int len = skb_headlen(skb) - hl;
964
965         st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl,
966                                         len, DMA_TO_DEVICE);
967         if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
968                 st->unmap_single = true;
969                 st->unmap_len = len;
970                 st->in_len = len;
971                 st->dma_addr = st->unmap_addr;
972                 return 0;
973         }
974         return -ENOMEM;
975 }
976
977
978 /**
979  * tso_fill_packet_with_fragment - form descriptors for the current fragment
980  * @tx_queue:           Efx TX queue
981  * @skb:                Socket buffer
982  * @st:                 TSO state
983  *
984  * Form descriptors for the current fragment, until we reach the end
985  * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
986  * space in @tx_queue.
987  */
988 static int tso_fill_packet_with_fragment(struct efx_tx_queue *tx_queue,
989                                          const struct sk_buff *skb,
990                                          struct tso_state *st)
991 {
992         struct efx_tx_buffer *buffer;
993         int n, end_of_packet, rc;
994
995         if (st->in_len == 0)
996                 return 0;
997         if (st->packet_space == 0)
998                 return 0;
999
1000         EFX_BUG_ON_PARANOID(st->in_len <= 0);
1001         EFX_BUG_ON_PARANOID(st->packet_space <= 0);
1002
1003         n = min(st->in_len, st->packet_space);
1004
1005         st->packet_space -= n;
1006         st->out_len -= n;
1007         st->in_len -= n;
1008
1009         rc = efx_tx_queue_insert(tx_queue, st->dma_addr, n, &buffer);
1010         if (likely(rc == 0)) {
1011                 if (st->out_len == 0)
1012                         /* Transfer ownership of the skb */
1013                         buffer->skb = skb;
1014
1015                 end_of_packet = st->out_len == 0 || st->packet_space == 0;
1016                 buffer->continuation = !end_of_packet;
1017
1018                 if (st->in_len == 0) {
1019                         /* Transfer ownership of the DMA mapping */
1020                         buffer->unmap_len = st->unmap_len;
1021                         buffer->unmap_single = st->unmap_single;
1022                         st->unmap_len = 0;
1023                 }
1024         }
1025
1026         st->dma_addr += n;
1027         return rc;
1028 }
1029
1030
1031 /**
1032  * tso_start_new_packet - generate a new header and prepare for the new packet
1033  * @tx_queue:           Efx TX queue
1034  * @skb:                Socket buffer
1035  * @st:                 TSO state
1036  *
1037  * Generate a new header and prepare for the new packet.  Return 0 on
1038  * success, or -1 if we failed to allocate a header.
1039  */
1040 static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
1041                                 const struct sk_buff *skb,
1042                                 struct tso_state *st)
1043 {
1044         struct efx_tso_header *tsoh;
1045         struct tcphdr *tsoh_th;
1046         unsigned ip_length;
1047         u8 *header;
1048
1049         /* Allocate a DMA-mapped header buffer. */
1050         if (likely(TSOH_SIZE(st->header_len) <= TSOH_STD_SIZE)) {
1051                 if (tx_queue->tso_headers_free == NULL) {
1052                         if (efx_tsoh_block_alloc(tx_queue))
1053                                 return -1;
1054                 }
1055                 EFX_BUG_ON_PARANOID(!tx_queue->tso_headers_free);
1056                 tsoh = tx_queue->tso_headers_free;
1057                 tx_queue->tso_headers_free = tsoh->next;
1058                 tsoh->unmap_len = 0;
1059         } else {
1060                 tx_queue->tso_long_headers++;
1061                 tsoh = efx_tsoh_heap_alloc(tx_queue, st->header_len);
1062                 if (unlikely(!tsoh))
1063                         return -1;
1064         }
1065
1066         header = TSOH_BUFFER(tsoh);
1067         tsoh_th = (struct tcphdr *)(header + SKB_TCP_OFF(skb));
1068
1069         /* Copy and update the headers. */
1070         memcpy(header, skb->data, st->header_len);
1071
1072         tsoh_th->seq = htonl(st->seqnum);
1073         st->seqnum += skb_shinfo(skb)->gso_size;
1074         if (st->out_len > skb_shinfo(skb)->gso_size) {
1075                 /* This packet will not finish the TSO burst. */
1076                 ip_length = st->full_packet_size - ETH_HDR_LEN(skb);
1077                 tsoh_th->fin = 0;
1078                 tsoh_th->psh = 0;
1079         } else {
1080                 /* This packet will be the last in the TSO burst. */
1081                 ip_length = st->header_len - ETH_HDR_LEN(skb) + st->out_len;
1082                 tsoh_th->fin = tcp_hdr(skb)->fin;
1083                 tsoh_th->psh = tcp_hdr(skb)->psh;
1084         }
1085
1086         if (st->protocol == htons(ETH_P_IP)) {
1087                 struct iphdr *tsoh_iph =
1088                         (struct iphdr *)(header + SKB_IPV4_OFF(skb));
1089
1090                 tsoh_iph->tot_len = htons(ip_length);
1091
1092                 /* Linux leaves suitable gaps in the IP ID space for us to fill. */
1093                 tsoh_iph->id = htons(st->ipv4_id);
1094                 st->ipv4_id++;
1095         } else {
1096                 struct ipv6hdr *tsoh_iph =
1097                         (struct ipv6hdr *)(header + SKB_IPV6_OFF(skb));
1098
1099                 tsoh_iph->payload_len = htons(ip_length - sizeof(*tsoh_iph));
1100         }
1101
1102         st->packet_space = skb_shinfo(skb)->gso_size;
1103         ++tx_queue->tso_packets;
1104
1105         /* Form a descriptor for this header. */
1106         efx_tso_put_header(tx_queue, tsoh, st->header_len);
1107
1108         return 0;
1109 }
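
/* Illustrative numbers (54-byte Ethernet + IPv4 + TCP headers, gso_size
 * 1460): a non-final segment gets tot_len = 1514 - 14 = 1500, while the
 * final segment's IP length is 40 + out_len, i.e. just enough for the
 * remaining payload.
 */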
1110
1111
1112 /**
1113  * efx_enqueue_skb_tso - segment and transmit a TSO socket buffer
1114  * @tx_queue:           Efx TX queue
1115  * @skb:                Socket buffer
1116  *
1117  * Context: You must hold netif_tx_lock() to call this function.
1118  *
1119  * Add socket buffer @skb to @tx_queue, performing TSO.  @skb is
1120  * consumed unless %NETDEV_TX_BUSY is returned, in which case it is
1121  * left for the caller to retry.  Return %NETDEV_TX_OK or %NETDEV_TX_BUSY.
1122  */
1123 static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
1124                                struct sk_buff *skb)
1125 {
1126         struct efx_nic *efx = tx_queue->efx;
1127         int frag_i, rc, rc2 = NETDEV_TX_OK;
1128         struct tso_state state;
1129
1130         /* Find the packet protocol and sanity-check it */
1131         state.protocol = efx_tso_check_protocol(skb);
1132
1133         EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
1134
1135         tso_start(&state, skb);
1136
1137         /* Assume that skb header area contains exactly the headers, and
1138          * all payload is in the frag list.
1139          */
1140         if (skb_headlen(skb) == state.header_len) {
1141                 /* Grab the first payload fragment. */
1142                 EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags < 1);
1143                 frag_i = 0;
1144                 rc = tso_get_fragment(&state, efx,
1145                                       skb_shinfo(skb)->frags + frag_i);
1146                 if (rc)
1147                         goto mem_err;
1148         } else {
1149                 rc = tso_get_head_fragment(&state, efx, skb);
1150                 if (rc)
1151                         goto mem_err;
1152                 frag_i = -1;
1153         }
1154
1155         if (tso_start_new_packet(tx_queue, skb, &state) < 0)
1156                 goto mem_err;
1157
1158         while (1) {
1159                 rc = tso_fill_packet_with_fragment(tx_queue, skb, &state);
1160                 if (unlikely(rc)) {
1161                         rc2 = NETDEV_TX_BUSY;
1162                         goto unwind;
1163                 }
1164
1165                 /* Move onto the next fragment? */
1166                 if (state.in_len == 0) {
1167                         if (++frag_i >= skb_shinfo(skb)->nr_frags)
1168                                 /* End of payload reached. */
1169                                 break;
1170                         rc = tso_get_fragment(&state, efx,
1171                                               skb_shinfo(skb)->frags + frag_i);
1172                         if (rc)
1173                                 goto mem_err;
1174                 }
1175
1176                 /* Start at new packet? */
1177                 if (state.packet_space == 0 &&
1178                     tso_start_new_packet(tx_queue, skb, &state) < 0)
1179                         goto mem_err;
1180         }
1181
1182         netdev_tx_sent_queue(tx_queue->core_txq, skb->len);
1183
1184         /* Pass off to hardware */
1185         efx_nic_push_buffers(tx_queue);
1186
1187         tx_queue->tso_bursts++;
1188         return NETDEV_TX_OK;
1189
1190  mem_err:
1191         netif_err(efx, tx_err, efx->net_dev,
1192                   "Out of memory for TSO headers, or DMA mapping error\n");
1193         dev_kfree_skb_any(skb);
1194
1195  unwind:
1196         /* Free the DMA mapping we were in the process of writing out */
1197         if (state.unmap_len) {
1198                 if (state.unmap_single)
1199                         dma_unmap_single(&efx->pci_dev->dev, state.unmap_addr,
1200                                          state.unmap_len, DMA_TO_DEVICE);
1201                 else
1202                         dma_unmap_page(&efx->pci_dev->dev, state.unmap_addr,
1203                                        state.unmap_len, DMA_TO_DEVICE);
1204         }
1205
1206         efx_enqueue_unwind(tx_queue);
1207         return rc2;
1208 }
1209
1210
1211 /*
1212  * Free up all TSO data structures associated with tx_queue. This
1213  * routine should be called only once the tx_queue is both empty and
1214  * will no longer be used.
1215  */
1216 static void efx_fini_tso(struct efx_tx_queue *tx_queue)
1217 {
1218         unsigned i;
1219
1220         if (tx_queue->buffer) {
1221                 for (i = 0; i <= tx_queue->ptr_mask; ++i)
1222                         efx_tsoh_free(tx_queue, &tx_queue->buffer[i]);
1223         }
1224
1225         while (tx_queue->tso_headers_free != NULL)
1226                 efx_tsoh_block_free(tx_queue, tx_queue->tso_headers_free,
1227                                     &tx_queue->efx->pci_dev->dev);
1228 }