diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 8f1fdb5b6..0ede2001d 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -36,6 +36,7 @@
 #include "odp-util.h"
 #include "ofp-print.h"
 #include "ofpbuf.h"
+#include "ovs-numa.h"
 #include "ovs-thread.h"
 #include "ovs-rcu.h"
 #include "packet-dpif.h"
@@ -45,7 +46,7 @@
 #include "unaligned.h"
 #include "timeval.h"
 #include "unixctl.h"
-#include "vlog.h"
+#include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(dpdk);
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
@@ -69,8 +70,6 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
 #define MP_CACHE_SZ (256 * 2)
 #define SOCKET0 0
 
-#define NON_PMD_THREAD_TX_QUEUE 0
-
 #define NIC_PORT_RX_Q_SIZE 2048  /* Size of Physical NIC RX Queue, Max (n+32<=4096) */
 #define NIC_PORT_TX_Q_SIZE 2048  /* Size of Physical NIC TX Queue, Max (n+32<=4096) */
 
@@ -135,11 +134,11 @@ static int rte_eal_init_ret = ENODEV;
 static struct ovs_mutex dpdk_mutex = OVS_MUTEX_INITIALIZER;
 
 /* Contains all 'struct dpdk_dev's. */
-static struct list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
-    = LIST_INITIALIZER(&dpdk_list);
+static struct ovs_list dpdk_list OVS_GUARDED_BY(dpdk_mutex)
+    = OVS_LIST_INITIALIZER(&dpdk_list);
 
-static struct list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
-    = LIST_INITIALIZER(&dpdk_mp_list);
+static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
+    = OVS_LIST_INITIALIZER(&dpdk_mp_list);
 
 /* This mutex must be used by non pmd threads when allocating or freeing
  * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
@@ -151,11 +150,14 @@ struct dpdk_mp {
     int mtu;
     int socket_id;
     int refcount;
-    struct list list_node OVS_GUARDED_BY(dpdk_mutex);
+    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 };
 
+/* There should be one 'struct dpdk_tx_queue' created for
+ * each cpu core. */
 struct dpdk_tx_queue {
-    rte_spinlock_t tx_lock;
+    bool flush_tx;                 /* Set to true to flush queue every time
+                                    * packets are queued. */
     int count;
     uint64_t tsc;
     struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
@@ -165,8 +167,8 @@ struct dpdk_tx_queue {
    so we have to keep them around once they've been created */
 
-static struct list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
-    = LIST_INITIALIZER(&dpdk_ring_list);
+static struct ovs_list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex)
+    = OVS_LIST_INITIALIZER(&dpdk_ring_list);
 
 struct dpdk_ring {
     /* For the client rings */
@@ -174,7 +176,7 @@ struct dpdk_ring {
     struct rte_ring *cring_rx;
     int user_port_id; /* User given port no, parsed from port name */
     int eth_port_id; /* ethernet device port id */
-    struct list list_node OVS_GUARDED_BY(dpdk_mutex);
+    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
 };
 
 struct netdev_dpdk {
@@ -182,7 +184,7 @@ struct netdev_dpdk {
     int port_id;
     int max_packet_len;
 
-    struct dpdk_tx_queue tx_q[NR_QUEUE];
+    struct dpdk_tx_queue *tx_q;
 
     struct ovs_mutex mutex OVS_ACQ_AFTER(dpdk_mutex);
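
The change above replaces the fixed tx_q[NR_QUEUE] array and its per-queue spinlock with a dynamically sized array holding one 'struct dpdk_tx_queue' per cpu core. A minimal sketch of the resulting ownership model (illustrative code, not part of the patch; 'tx_queue' and 'my_tx_queue' are made-up names):

    #include <stdbool.h>
    #include <rte_lcore.h>

    struct tx_queue {
        bool flush_tx;            /* Flush on every enqueue? */
        int count;                /* Packets currently buffered. */
    };

    /* With one tx queue per core, a pmd thread only ever touches the
     * element indexed by its own lcore id, so no spinlock is needed. */
    static inline struct tx_queue *
    my_tx_queue(struct tx_queue *tx_q)
    {
        return &tx_q[rte_lcore_id()];
    }
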
@@ -199,7 +201,8 @@ struct netdev_dpdk {
     int link_reset_cnt;
 
     /* In dpdk_list. */
-    struct list list_node OVS_GUARDED_BY(dpdk_mutex);
+    struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
+    rte_spinlock_t dpdkr_tx_lock;
 };
 
 struct netdev_rxq_dpdk {
@@ -398,13 +401,14 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
         return ENODEV;
     }
 
-    diag = rte_eth_dev_configure(dev->port_id, NR_QUEUE, NR_QUEUE, &port_conf);
+    diag = rte_eth_dev_configure(dev->port_id, dev->up.n_rxq, dev->up.n_txq,
+                                 &port_conf);
     if (diag) {
         VLOG_ERR("eth dev config error %d",diag);
         return -diag;
     }
 
-    for (i = 0; i < NR_QUEUE; i++) {
+    for (i = 0; i < dev->up.n_txq; i++) {
         diag = rte_eth_tx_queue_setup(dev->port_id, i, NIC_PORT_TX_Q_SIZE,
                                       dev->socket_id, &tx_conf);
         if (diag) {
@@ -413,7 +417,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
         }
     }
 
-    for (i = 0; i < NR_QUEUE; i++) {
+    for (i = 0; i < dev->up.n_rxq; i++) {
         diag = rte_eth_rx_queue_setup(dev->port_id, i, NIC_PORT_RX_Q_SIZE,
                                       dev->socket_id,
                                       &rx_conf, dev->dpdk_mp->mp);
@@ -460,29 +464,46 @@ netdev_dpdk_alloc(void)
     return &netdev->up;
 }
 
+static void
+netdev_dpdk_alloc_txq(struct netdev_dpdk *netdev, unsigned int n_txqs)
+{
+    int i;
+
+    netdev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *netdev->tx_q);
+    /* Each index is considered as a cpu core id, since there should
+     * be one tx queue for each cpu core. */
+    for (i = 0; i < n_txqs; i++) {
+        int numa_id = ovs_numa_get_numa_id(i);
+
+        /* If the corresponding core is on the same numa node as
+         * 'netdev', set 'flush_tx'. */
+        netdev->tx_q[i].flush_tx = netdev->socket_id == numa_id;
+    }
+}
+
 static int
-netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no) OVS_REQUIRES(dpdk_mutex)
+netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no)
+    OVS_REQUIRES(dpdk_mutex)
 {
     struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+    int sid;
     int err = 0;
-    int i;
 
     ovs_mutex_init(&netdev->mutex);
 
     ovs_mutex_lock(&netdev->mutex);
 
-    for (i = 0; i < NR_QUEUE; i++) {
-        rte_spinlock_init(&netdev->tx_q[i].tx_lock);
-    }
-
+    /* If the 'sid' is negative, it means that the kernel failed
+     * to obtain the pci numa info. In that situation, always
+     * use 'SOCKET0'. */
+    sid = rte_eth_dev_socket_id(port_no);
+    netdev->socket_id = sid < 0 ? SOCKET0 : sid;
+    netdev_dpdk_alloc_txq(netdev, NR_QUEUE);
     netdev->port_id = port_no;
-
     netdev->flags = 0;
     netdev->mtu = ETHER_MTU;
     netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu);
-
-    /* XXX: need to discover device node at run time. */
-    netdev->socket_id = SOCKET0;
+    rte_spinlock_init(&netdev->dpdkr_tx_lock);
 
     netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
     if (!netdev->dpdk_mp) {
@@ -490,16 +511,19 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no) OVS_REQUIRES(dpdk
         goto unlock;
     }
 
+    netdev_->n_txq = NR_QUEUE;
+    netdev_->n_rxq = NR_QUEUE;
     err = dpdk_eth_dev_init(netdev);
     if (err) {
         goto unlock;
     }
-    netdev_->n_txq = NR_QUEUE;
-    netdev_->n_rxq = NR_QUEUE;
 
     list_push_back(&dpdk_list, &netdev->list_node);
 
 unlock:
+    if (err) {
+        rte_free(netdev->tx_q);
+    }
     ovs_mutex_unlock(&netdev->mutex);
     return err;
 }
@@ -551,6 +575,7 @@ netdev_dpdk_destruct(struct netdev *netdev_)
     ovs_mutex_unlock(&dev->mutex);
 
     ovs_mutex_lock(&dpdk_mutex);
+    rte_free(dev->tx_q);
     list_remove(&dev->list_node);
     dpdk_mp_put(dev->dpdk_mp);
     ovs_mutex_unlock(&dpdk_mutex);
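
netdev_dpdk_init() above now derives the socket id from the device instead of hard-coding SOCKET0. The fallback logic, restated on its own (hypothetical helper name; rte_eth_dev_socket_id() is the real DPDK call):

    #include <rte_ethdev.h>

    #define SOCKET0 0

    /* A negative return from rte_eth_dev_socket_id() means the NUMA node
     * of the port is unknown, so fall back to socket 0. */
    static int
    port_socket_id(uint8_t port_no)
    {
        int sid = rte_eth_dev_socket_id(port_no);

        return sid < 0 ? SOCKET0 : sid;
    }
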
@@ -573,9 +598,8 @@ netdev_dpdk_get_config(const struct netdev *netdev_, struct smap *args)
 
     ovs_mutex_lock(&dev->mutex);
 
-    /* XXX: Allow to configure number of queues. */
-    smap_add_format(args, "configured_rx_queues", "%u", netdev_->n_rxq);
-    smap_add_format(args, "configured_tx_queues", "%u", netdev_->n_rxq);
+    smap_add_format(args, "configured_rx_queues", "%d", netdev_->n_rxq);
+    smap_add_format(args, "configured_tx_queues", "%d", netdev_->n_txq);
     ovs_mutex_unlock(&dev->mutex);
 
     return 0;
@@ -589,6 +613,37 @@ netdev_dpdk_get_numa_id(const struct netdev *netdev_)
     return netdev->socket_id;
 }
 
+/* Sets the number of tx queues and rx queues for the dpdk interface.
+ * If the configuration fails, do not try to restore the old
+ * configuration; just return the error. */
+static int
+netdev_dpdk_set_multiq(struct netdev *netdev_, unsigned int n_txq,
+                       unsigned int n_rxq)
+{
+    struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+    int err = 0;
+
+    if (netdev->up.n_txq == n_txq && netdev->up.n_rxq == n_rxq) {
+        return err;
+    }
+
+    ovs_mutex_lock(&dpdk_mutex);
+    ovs_mutex_lock(&netdev->mutex);
+
+    rte_eth_dev_stop(netdev->port_id);
+
+    netdev->up.n_txq = n_txq;
+    netdev->up.n_rxq = n_rxq;
+    rte_free(netdev->tx_q);
+    netdev_dpdk_alloc_txq(netdev, n_txq);
+    err = dpdk_eth_dev_init(netdev);
+
+    ovs_mutex_unlock(&netdev->mutex);
+    ovs_mutex_unlock(&dpdk_mutex);
+
+    return err;
+}
+
 static struct netdev_rxq *
 netdev_dpdk_rxq_alloc(void)
 {
@@ -672,9 +727,7 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
     if (txq->count == 0) {
         return;
     }
-    rte_spinlock_lock(&txq->tx_lock);
     dpdk_queue_flush__(dev, qid);
-    rte_spinlock_unlock(&txq->tx_lock);
 }
 
 static int
@@ -686,7 +739,11 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct dpif_packet **packets,
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
     int nb_rx;
 
-    dpdk_queue_flush(dev, rxq_->queue_id);
+    /* There is only one tx queue for this core.  Do not flush other
+     * queues. */
+    if (rxq_->queue_id == rte_lcore_id()) {
+        dpdk_queue_flush(dev, rxq_->queue_id);
+    }
 
     nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id,
                              (struct rte_mbuf **) packets,
@@ -710,7 +767,6 @@ dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
 
     int i = 0;
 
-    rte_spinlock_lock(&txq->tx_lock);
     while (i < cnt) {
         int freeslots = MAX_TX_QUEUE_LEN - txq->count;
         int tocopy = MIN(freeslots, cnt-i);
@@ -721,7 +777,7 @@ dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
         txq->count += tocopy;
         i += tocopy;
 
-        if (txq->count == MAX_TX_QUEUE_LEN) {
+        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
             dpdk_queue_flush__(dev, qid);
         }
         diff_tsc = rte_get_timer_cycles() - txq->tsc;
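
With the spinlock gone, dpdk_queue_pkts() now drains a queue in three cases: the burst buffer is full, the queue has 'flush_tx' set, or the buffered packets have been sitting longer than DRAIN_TSC. A hypothetical helper (not in the patch) restating that condition:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    tx_queue_should_flush(int count, int max_len, bool flush_tx,
                          uint64_t diff_tsc, uint64_t drain_tsc)
    {
        return count == max_len          /* burst buffer is full */
               || flush_tx               /* queue flushes on every enqueue */
               || diff_tsc >= drain_tsc; /* buffered packets have aged out */
    }
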
@@ -729,12 +785,12 @@ dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
             dpdk_queue_flush__(dev, qid);
         }
     }
-    rte_spinlock_unlock(&txq->tx_lock);
 }
 
 /* Tx function. Transmit packets indefinitely */
 static void
-dpdk_do_tx_copy(struct netdev *netdev, struct dpif_packet ** pkts, int cnt)
+dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dpif_packet ** pkts,
+                int cnt)
     OVS_NO_THREAD_SAFETY_ANALYSIS
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
@@ -783,24 +839,25 @@ dpdk_do_tx_copy(struct netdev *netdev, struct dpif_packet ** pkts, int cnt)
         ovs_mutex_unlock(&dev->mutex);
     }
 
-    dpdk_queue_pkts(dev, NON_PMD_THREAD_TX_QUEUE, mbufs, newcnt);
-    dpdk_queue_flush(dev, NON_PMD_THREAD_TX_QUEUE);
+    dpdk_queue_pkts(dev, qid, mbufs, newcnt);
+    dpdk_queue_flush(dev, qid);
 
     if (!thread_is_pmd()) {
         ovs_mutex_unlock(&nonpmd_mempool_mutex);
     }
 }
 
-static int
-netdev_dpdk_send(struct netdev *netdev, int qid, struct dpif_packet **pkts,
-                 int cnt, bool may_steal)
+static inline void
+netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
+                   struct dpif_packet **pkts, int cnt, bool may_steal)
 {
-    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
-    int ret;
     int i;
 
-    if (!may_steal || pkts[0]->ofpbuf.source != OFPBUF_DPDK) {
-        dpdk_do_tx_copy(netdev, pkts, cnt);
+    if (OVS_UNLIKELY(!may_steal ||
+                     pkts[0]->ofpbuf.source != OFPBUF_DPDK)) {
+        struct netdev *netdev = &dev->up;
+
+        dpdk_do_tx_copy(netdev, qid, pkts, cnt);
 
         if (may_steal) {
             for (i = 0; i < cnt; i++) {
@@ -811,8 +868,6 @@ netdev_dpdk_send(struct netdev *netdev, int qid, struct dpif_packet **pkts,
         int next_tx_idx = 0;
         int dropped = 0;
 
-        qid = rte_lcore_id() % NR_QUEUE;
-
         for (i = 0; i < cnt; i++) {
             int size = ofpbuf_size(&pkts[i]->ofpbuf);
             if (OVS_UNLIKELY(size > dev->max_packet_len)) {
@@ -842,9 +897,16 @@ netdev_dpdk_send(struct netdev *netdev, int qid, struct dpif_packet **pkts,
             ovs_mutex_unlock(&dev->mutex);
         }
     }
-    ret = 0;
+}
+
+static int
+netdev_dpdk_eth_send(struct netdev *netdev, int qid,
+                     struct dpif_packet **pkts, int cnt, bool may_steal)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
-    return ret;
+    netdev_dpdk_send__(dev, qid, pkts, cnt, may_steal);
+    return 0;
 }
 
 static int
@@ -1239,12 +1301,15 @@ dpdk_ring_create(const char dev_name[], unsigned int port_no,
         return ENOMEM;
     }
 
+    /* XXX: Add support for multiqueue rings. */
     err = snprintf(ring_name, 10, "%s_tx", dev_name);
     if (err < 0) {
         return -err;
     }
 
-    ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0, 0);
+    /* Create single consumer/producer rings, netdev does explicit locking. */
+    ivshmem->cring_tx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
+                                        RING_F_SP_ENQ | RING_F_SC_DEQ);
     if (ivshmem->cring_tx == NULL) {
         rte_free(ivshmem);
         return ENOMEM;
     }
@@ -1255,7 +1320,9 @@ dpdk_ring_create(const char dev_name[], unsigned int port_no,
         return -err;
     }
 
-    ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0, 0);
+    /* Create single consumer/producer rings, netdev does explicit locking. */
+    ivshmem->cring_rx = rte_ring_create(ring_name, DPDK_RING_SIZE, SOCKET0,
+                                        RING_F_SP_ENQ | RING_F_SC_DEQ);
     if (ivshmem->cring_rx == NULL) {
         rte_free(ivshmem);
         return ENOMEM;
     }
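
The RING_F_SP_ENQ | RING_F_SC_DEQ flags used above create single-producer/single-consumer rings, which skip the ring's internal compare-and-swap synchronization; correctness then depends on the caller serializing access, which the patch does with 'dpdkr_tx_lock'. A sketch of the call (wrapper name is illustrative):

    #include <rte_ring.h>

    static struct rte_ring *
    create_spsc_ring(const char *name, unsigned count, int socket_id)
    {
        /* No internal locking: exactly one enqueuer and one dequeuer
         * may touch the ring at a time. */
        return rte_ring_create(name, count, socket_id,
                               RING_F_SP_ENQ | RING_F_SC_DEQ);
    }
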
@@ -1302,6 +1369,19 @@ dpdk_ring_open(const char dev_name[], unsigned int *eth_port_id) OVS_REQUIRES(dp
     return dpdk_ring_create(dev_name, port_no, eth_port_id);
 }
 
+static int
+netdev_dpdk_ring_send(struct netdev *netdev, int qid OVS_UNUSED,
+                      struct dpif_packet **pkts, int cnt, bool may_steal)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+    /* DPDK Rings have a single TX queue, therefore they need locking. */
+    rte_spinlock_lock(&dev->dpdkr_tx_lock);
+    netdev_dpdk_send__(dev, 0, pkts, cnt, may_steal);
+    rte_spinlock_unlock(&dev->dpdkr_tx_lock);
+    return 0;
+}
+
 static int
 netdev_dpdk_ring_construct(struct netdev *netdev)
 {
@@ -1326,7 +1406,7 @@ unlock_dpdk:
     return err;
 }
 
-#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT)                  \
+#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, MULTIQ, SEND)    \
 {                                                                 \
     NAME,                                                         \
     INIT,                       /* init */                        \
@@ -1340,9 +1420,13 @@ unlock_dpdk:
     netdev_dpdk_get_config,                                       \
     NULL,                       /* netdev_dpdk_set_config */      \
     NULL,                       /* get_tunnel_config */           \
+    NULL,                       /* build header */                \
+    NULL,                       /* push header */                 \
+    NULL,                       /* pop header */                  \
     netdev_dpdk_get_numa_id,    /* get_numa_id */                 \
+    MULTIQ,                     /* set_multiq */                  \
                                                                   \
-    netdev_dpdk_send,           /* send */                        \
+    SEND,                       /* send */                        \
     NULL,                       /* send_wait */                   \
                                                                   \
     netdev_dpdk_set_etheraddr,                                    \
@@ -1427,13 +1511,17 @@ const struct netdev_class dpdk_class =
     NETDEV_DPDK_CLASS(
         "dpdk",
         dpdk_class_init,
-        netdev_dpdk_construct);
+        netdev_dpdk_construct,
+        netdev_dpdk_set_multiq,
+        netdev_dpdk_eth_send);
 
 const struct netdev_class dpdk_ring_class =
     NETDEV_DPDK_CLASS(
         "dpdkr",
         NULL,
-        netdev_dpdk_ring_construct);
+        netdev_dpdk_ring_construct,
+        NULL,
+        netdev_dpdk_ring_send);
 
 void
 netdev_dpdk_register(void)
@@ -1466,7 +1554,8 @@ pmd_thread_setaffinity_cpu(int cpu)
         return err;
     }
     /* lcore_id 0 is reserved for use by non pmd threads. */
-    RTE_PER_LCORE(_lcore_id) = cpu + 1;
+    ovs_assert(cpu);
+    RTE_PER_LCORE(_lcore_id) = cpu;
 
     return 0;
 }
@@ -1474,9 +1563,6 @@ pmd_thread_setaffinity_cpu(int cpu)
 void
 thread_set_nonpmd(void)
 {
-    /* We cannot have RTE_MAX_LCORE pmd threads, because lcore_id 0 is reserved
-     * for non pmd threads */
-    BUILD_ASSERT(NR_PMD_THREADS < RTE_MAX_LCORE);
     /* We have to use 0 to allow non pmd threads to perform certain DPDK
      * operations, like rte_eth_dev_configure(). */
     RTE_PER_LCORE(_lcore_id) = 0;
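
The last two hunks fix the lcore-id convention: lcore 0 is reserved for non pmd threads, so a pmd thread pinned to core 'cpu' now uses 'cpu' itself (asserted nonzero) rather than 'cpu + 1'. A toy restatement of the convention (function and parameter names are made up):

    #include <assert.h>
    #include <stdbool.h>
    #include <rte_lcore.h>

    static void
    set_thread_lcore_id(unsigned core_id, bool is_pmd)
    {
        if (is_pmd) {
            assert(core_id != 0);    /* lcore 0 belongs to non pmd threads. */
            RTE_PER_LCORE(_lcore_id) = core_id;
        } else {
            /* Lets non pmd threads call e.g. rte_eth_dev_configure(). */
            RTE_PER_LCORE(_lcore_id) = 0;
        }
    }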