enic: Add support for adaptive interrupt coalescing
authorSujith Sankar <ssujith@cisco.com>
Mon, 19 May 2014 21:44:05 +0000 (03:14 +0530)
committerDavid S. Miller <davem@davemloft.net>
Wed, 21 May 2014 21:04:13 +0000 (17:04 -0400)
This patch adds support for adaptive interrupt coalescing.

For small pkts with low pkt rate, we can decrease the coalescing interrupt
dynamically which decreases the latency. This however increases the cpu
utilization. Based on testing with different coal intr and pkt rate we came up
with a table(mod_table) with rx_rate and coalescing interrupt value where we
get low latency without significant increase in cpu. mod_table table stores
the coalescing timer percentage value for different throughputs.

Function enic_calc_int_moderation() calculates the desired coalescing intr timer
value. This function is called in driver rx napi_poll. The actual value is set
by enic_set_int_moderation() which is called when napi_poll is complete. i.e
when we unmask the rx intr.

Adaptive coal intr is support only when driver is using msix intr. Because
intr is not shared.

Struct mod_range is used to store only the default adaptive coalescing intr
value.

Adaptive coal intr calue is calculated by

timer = range_start + ((rx_coal->range_end - range_start) *
       mod_table[index].range_percent / 100);

rx_coal->range_end is the rx-usecs-high value set using ethtool.
range_start is rx-usecs-low, set using ethtool, if rx_small_pkt_bytes_cnt is
greater than 2 * rx_large_pkt_bytes_cnt. i.e small pkts are dominant. Else its
rx-usecs-low + 3.

Cc: Christian Benvenuti <benve@cisco.com>
Cc: Neel Patel <neepatel@cisco.com>
Signed-off-by: Sujith Sankar <ssujith@cisco.com>
Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/cisco/enic/enic.h
drivers/net/ethernet/cisco/enic/enic_ethtool.c
drivers/net/ethernet/cisco/enic/enic_main.c
drivers/net/ethernet/cisco/enic/vnic_cq.h

index e35c8e0..f23ef32 100644 (file)
@@ -43,6 +43,8 @@
 #define ENIC_CQ_MAX            (ENIC_WQ_MAX + ENIC_RQ_MAX)
 #define ENIC_INTR_MAX          (ENIC_CQ_MAX + 2)
 
+#define ENIC_AIC_LARGE_PKT_DIFF        3
+
 struct enic_msix_entry {
        int requested;
        char devname[IFNAMSIZ];
@@ -50,6 +52,33 @@ struct enic_msix_entry {
        void *devid;
 };
 
+/* Store only the lower range.  Higher range is given by fw. */
+struct enic_intr_mod_range {
+       u32 small_pkt_range_start;
+       u32 large_pkt_range_start;
+};
+
+struct enic_intr_mod_table {
+       u32 rx_rate;
+       u32 range_percent;
+};
+
+#define ENIC_MAX_LINK_SPEEDS           3
+#define ENIC_LINK_SPEED_10G            10000
+#define ENIC_LINK_SPEED_4G             4000
+#define ENIC_LINK_40G_INDEX            2
+#define ENIC_LINK_10G_INDEX            1
+#define ENIC_LINK_4G_INDEX             0
+#define ENIC_RX_COALESCE_RANGE_END     125
+#define ENIC_AIC_TS_BREAK              100
+
+struct enic_rx_coal {
+       u32 small_pkt_range_start;
+       u32 large_pkt_range_start;
+       u32 range_end;
+       u32 use_adaptive_rx_coalesce;
+};
+
 /* priv_flags */
 #define ENIC_SRIOV_ENABLED             (1 << 0)
 
@@ -92,6 +121,7 @@ struct enic {
        unsigned int mc_count;
        unsigned int uc_count;
        u32 port_mtu;
+       struct enic_rx_coal rx_coalesce_setting;
        u32 rx_coalesce_usecs;
        u32 tx_coalesce_usecs;
 #ifdef CONFIG_PCI_IOV
index 58a8c67..1882db2 100644 (file)
@@ -79,6 +79,17 @@ static const struct enic_stat enic_rx_stats[] = {
 static const unsigned int enic_n_tx_stats = ARRAY_SIZE(enic_tx_stats);
 static const unsigned int enic_n_rx_stats = ARRAY_SIZE(enic_rx_stats);
 
+void enic_intr_coal_set_rx(struct enic *enic, u32 timer)
+{
+       int i;
+       int intr;
+
+       for (i = 0; i < enic->rq_count; i++) {
+               intr = enic_msix_rq_intr(enic, i);
+               vnic_intr_coalescing_timer_set(&enic->intr[intr], timer);
+       }
+}
+
 static int enic_get_settings(struct net_device *netdev,
        struct ethtool_cmd *ecmd)
 {
@@ -178,9 +189,14 @@ static int enic_get_coalesce(struct net_device *netdev,
        struct ethtool_coalesce *ecmd)
 {
        struct enic *enic = netdev_priv(netdev);
+       struct enic_rx_coal *rxcoal = &enic->rx_coalesce_setting;
 
        ecmd->tx_coalesce_usecs = enic->tx_coalesce_usecs;
        ecmd->rx_coalesce_usecs = enic->rx_coalesce_usecs;
+       if (rxcoal->use_adaptive_rx_coalesce)
+               ecmd->use_adaptive_rx_coalesce = 1;
+       ecmd->rx_coalesce_usecs_low = rxcoal->small_pkt_range_start;
+       ecmd->rx_coalesce_usecs_high = rxcoal->range_end;
 
        return 0;
 }
@@ -191,17 +207,31 @@ static int enic_set_coalesce(struct net_device *netdev,
        struct enic *enic = netdev_priv(netdev);
        u32 tx_coalesce_usecs;
        u32 rx_coalesce_usecs;
+       u32 rx_coalesce_usecs_low;
+       u32 rx_coalesce_usecs_high;
+       u32 coalesce_usecs_max;
        unsigned int i, intr;
+       struct enic_rx_coal *rxcoal = &enic->rx_coalesce_setting;
 
+       coalesce_usecs_max = vnic_dev_get_intr_coal_timer_max(enic->vdev);
        tx_coalesce_usecs = min_t(u32, ecmd->tx_coalesce_usecs,
-               vnic_dev_get_intr_coal_timer_max(enic->vdev));
+                                 coalesce_usecs_max);
        rx_coalesce_usecs = min_t(u32, ecmd->rx_coalesce_usecs,
-               vnic_dev_get_intr_coal_timer_max(enic->vdev));
+                                 coalesce_usecs_max);
+
+       rx_coalesce_usecs_low = min_t(u32, ecmd->rx_coalesce_usecs_low,
+                                     coalesce_usecs_max);
+       rx_coalesce_usecs_high = min_t(u32, ecmd->rx_coalesce_usecs_high,
+                                      coalesce_usecs_max);
 
        switch (vnic_dev_get_intr_mode(enic->vdev)) {
        case VNIC_DEV_INTR_MODE_INTX:
                if (tx_coalesce_usecs != rx_coalesce_usecs)
                        return -EINVAL;
+               if (ecmd->use_adaptive_rx_coalesce      ||
+                   ecmd->rx_coalesce_usecs_low         ||
+                   ecmd->rx_coalesce_usecs_high)
+                       return -EOPNOTSUPP;
 
                intr = enic_legacy_io_intr();
                vnic_intr_coalescing_timer_set(&enic->intr[intr],
@@ -210,6 +240,10 @@ static int enic_set_coalesce(struct net_device *netdev,
        case VNIC_DEV_INTR_MODE_MSI:
                if (tx_coalesce_usecs != rx_coalesce_usecs)
                        return -EINVAL;
+               if (ecmd->use_adaptive_rx_coalesce      ||
+                   ecmd->rx_coalesce_usecs_low         ||
+                   ecmd->rx_coalesce_usecs_high)
+                       return -EOPNOTSUPP;
 
                vnic_intr_coalescing_timer_set(&enic->intr[0],
                        tx_coalesce_usecs);
@@ -221,12 +255,27 @@ static int enic_set_coalesce(struct net_device *netdev,
                                tx_coalesce_usecs);
                }
 
-               for (i = 0; i < enic->rq_count; i++) {
-                       intr = enic_msix_rq_intr(enic, i);
-                       vnic_intr_coalescing_timer_set(&enic->intr[intr],
-                               rx_coalesce_usecs);
+               if (rxcoal->use_adaptive_rx_coalesce) {
+                       if (!ecmd->use_adaptive_rx_coalesce) {
+                               rxcoal->use_adaptive_rx_coalesce = 0;
+                               enic_intr_coal_set_rx(enic, rx_coalesce_usecs);
+                       }
+               } else {
+                       if (ecmd->use_adaptive_rx_coalesce)
+                               rxcoal->use_adaptive_rx_coalesce = 1;
+                       else
+                               enic_intr_coal_set_rx(enic, rx_coalesce_usecs);
                }
 
+               if (ecmd->rx_coalesce_usecs_high) {
+                       if (rx_coalesce_usecs_high <
+                           (rx_coalesce_usecs_low + ENIC_AIC_LARGE_PKT_DIFF))
+                               return -EINVAL;
+                       rxcoal->range_end = rx_coalesce_usecs_high;
+                       rxcoal->small_pkt_range_start = rx_coalesce_usecs_low;
+                       rxcoal->large_pkt_range_start = rx_coalesce_usecs_low +
+                                                       ENIC_AIC_LARGE_PKT_DIFF;
+               }
                break;
        default:
                break;
index 2945718..0d8995c 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/prefetch.h>
 #include <net/ip6_checksum.h>
+#include <linux/ktime.h>
 
 #include "cq_enet_desc.h"
 #include "vnic_dev.h"
@@ -72,6 +73,35 @@ MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_VERSION);
 MODULE_DEVICE_TABLE(pci, enic_id_table);
 
+#define ENIC_LARGE_PKT_THRESHOLD               1000
+#define ENIC_MAX_COALESCE_TIMERS               10
+/*  Interrupt moderation table, which will be used to decide the
+ *  coalescing timer values
+ *  {rx_rate in Mbps, mapping percentage of the range}
+ */
+struct enic_intr_mod_table mod_table[ENIC_MAX_COALESCE_TIMERS + 1] = {
+       {4000,  0},
+       {4400, 10},
+       {5060, 20},
+       {5230, 30},
+       {5540, 40},
+       {5820, 50},
+       {6120, 60},
+       {6435, 70},
+       {6745, 80},
+       {7000, 90},
+       {0xFFFFFFFF, 100}
+};
+
+/* This table helps the driver to pick different ranges for rx coalescing
+ * timer depending on the link speed.
+ */
+struct enic_intr_mod_range mod_range[ENIC_MAX_LINK_SPEEDS] = {
+       {0,  0}, /* 0  - 4  Gbps */
+       {0,  3}, /* 4  - 10 Gbps */
+       {3,  6}, /* 10 - 40 Gbps */
+};
+
 int enic_is_dynamic(struct enic *enic)
 {
        return enic->pdev->device == PCI_DEVICE_ID_CISCO_VIC_ENET_DYN;
@@ -979,6 +1009,15 @@ static int enic_rq_alloc_buf(struct vnic_rq *rq)
        return 0;
 }
 
+static void enic_intr_update_pkt_size(struct vnic_rx_bytes_counter *pkt_size,
+                                     u32 pkt_len)
+{
+       if (ENIC_LARGE_PKT_THRESHOLD <= pkt_len)
+               pkt_size->large_pkt_bytes_cnt += pkt_len;
+       else
+               pkt_size->small_pkt_bytes_cnt += pkt_len;
+}
+
 static void enic_rq_indicate_buf(struct vnic_rq *rq,
        struct cq_desc *cq_desc, struct vnic_rq_buf *buf,
        int skipped, void *opaque)
@@ -986,6 +1025,7 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq,
        struct enic *enic = vnic_dev_priv(rq->vdev);
        struct net_device *netdev = enic->netdev;
        struct sk_buff *skb;
+       struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)];
 
        u8 type, color, eop, sop, ingress_port, vlan_stripped;
        u8 fcoe, fcoe_sof, fcoe_fc_crc_ok, fcoe_enc_error, fcoe_eof;
@@ -1056,6 +1096,9 @@ static void enic_rq_indicate_buf(struct vnic_rq *rq,
                        napi_gro_receive(&enic->napi[q_number], skb);
                else
                        netif_receive_skb(skb);
+               if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce)
+                       enic_intr_update_pkt_size(&cq->pkt_size_counter,
+                                                 bytes_written);
        } else {
 
                /* Buffer overflow
@@ -1134,6 +1177,64 @@ static int enic_poll(struct napi_struct *napi, int budget)
        return rq_work_done;
 }
 
+static void enic_set_int_moderation(struct enic *enic, struct vnic_rq *rq)
+{
+       unsigned int intr = enic_msix_rq_intr(enic, rq->index);
+       struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)];
+       u32 timer = cq->tobe_rx_coal_timeval;
+
+       if (cq->tobe_rx_coal_timeval != cq->cur_rx_coal_timeval) {
+               vnic_intr_coalescing_timer_set(&enic->intr[intr], timer);
+               cq->cur_rx_coal_timeval = cq->tobe_rx_coal_timeval;
+       }
+}
+
+static void enic_calc_int_moderation(struct enic *enic, struct vnic_rq *rq)
+{
+       struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting;
+       struct vnic_cq *cq = &enic->cq[enic_cq_rq(enic, rq->index)];
+       struct vnic_rx_bytes_counter *pkt_size_counter = &cq->pkt_size_counter;
+       int index;
+       u32 timer;
+       u32 range_start;
+       u32 traffic;
+       u64 delta;
+       ktime_t now = ktime_get();
+
+       delta = ktime_us_delta(now, cq->prev_ts);
+       if (delta < ENIC_AIC_TS_BREAK)
+               return;
+       cq->prev_ts = now;
+
+       traffic = pkt_size_counter->large_pkt_bytes_cnt +
+                 pkt_size_counter->small_pkt_bytes_cnt;
+       /* The table takes Mbps
+        * traffic *= 8    => bits
+        * traffic *= (10^6 / delta)    => bps
+        * traffic /= 10^6     => Mbps
+        *
+        * Combining, traffic *= (8 / delta)
+        */
+
+       traffic <<= 3;
+       traffic /= delta;
+
+       for (index = 0; index < ENIC_MAX_COALESCE_TIMERS; index++)
+               if (traffic < mod_table[index].rx_rate)
+                       break;
+       range_start = (pkt_size_counter->small_pkt_bytes_cnt >
+                      pkt_size_counter->large_pkt_bytes_cnt << 1) ?
+                     rx_coal->small_pkt_range_start :
+                     rx_coal->large_pkt_range_start;
+       timer = range_start + ((rx_coal->range_end - range_start) *
+                              mod_table[index].range_percent / 100);
+       /* Damping */
+       cq->tobe_rx_coal_timeval = (timer + cq->tobe_rx_coal_timeval) >> 1;
+
+       pkt_size_counter->large_pkt_bytes_cnt = 0;
+       pkt_size_counter->small_pkt_bytes_cnt = 0;
+}
+
 static int enic_poll_msix(struct napi_struct *napi, int budget)
 {
        struct net_device *netdev = napi->dev;
@@ -1171,6 +1272,13 @@ static int enic_poll_msix(struct napi_struct *napi, int budget)
 
        if (err)
                work_done = work_to_do;
+       if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce)
+               /* Call the function which refreshes
+                * the intr coalescing timer value based on
+                * the traffic.  This is supported only in
+                * the case of MSI-x mode
+                */
+               enic_calc_int_moderation(enic, &enic->rq[rq]);
 
        if (work_done < work_to_do) {
 
@@ -1179,6 +1287,8 @@ static int enic_poll_msix(struct napi_struct *napi, int budget)
                 */
 
                napi_complete(napi);
+               if (enic->rx_coalesce_setting.use_adaptive_rx_coalesce)
+                       enic_set_int_moderation(enic, &enic->rq[rq]);
                vnic_intr_unmask(&enic->intr[intr]);
        }
 
@@ -1314,6 +1424,42 @@ static void enic_synchronize_irqs(struct enic *enic)
        }
 }
 
+static void enic_set_rx_coal_setting(struct enic *enic)
+{
+       unsigned int speed;
+       int index = -1;
+       struct enic_rx_coal *rx_coal = &enic->rx_coalesce_setting;
+
+       /* If intr mode is not MSIX, do not do adaptive coalescing */
+       if (VNIC_DEV_INTR_MODE_MSIX != vnic_dev_get_intr_mode(enic->vdev)) {
+               netdev_info(enic->netdev, "INTR mode is not MSIX, Not initializing adaptive coalescing");
+               return;
+       }
+
+       /* 1. Read the link speed from fw
+        * 2. Pick the default range for the speed
+        * 3. Update it in enic->rx_coalesce_setting
+        */
+       speed = vnic_dev_port_speed(enic->vdev);
+       if (ENIC_LINK_SPEED_10G < speed)
+               index = ENIC_LINK_40G_INDEX;
+       else if (ENIC_LINK_SPEED_4G < speed)
+               index = ENIC_LINK_10G_INDEX;
+       else
+               index = ENIC_LINK_4G_INDEX;
+
+       rx_coal->small_pkt_range_start = mod_range[index].small_pkt_range_start;
+       rx_coal->large_pkt_range_start = mod_range[index].large_pkt_range_start;
+       rx_coal->range_end = ENIC_RX_COALESCE_RANGE_END;
+
+       /* Start with the value provided by UCSM */
+       for (index = 0; index < enic->rq_count; index++)
+               enic->cq[index].cur_rx_coal_timeval =
+                               enic->config.intr_timer_usec;
+
+       rx_coal->use_adaptive_rx_coalesce = 1;
+}
+
 static int enic_dev_notify_set(struct enic *enic)
 {
        int err;
@@ -2231,6 +2377,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        enic->notify_timer.function = enic_notify_timer;
        enic->notify_timer.data = (unsigned long)enic;
 
+       enic_set_rx_coal_setting(enic);
        INIT_WORK(&enic->reset, enic_reset);
        INIT_WORK(&enic->change_mtu_work, enic_change_mtu_work);
 
@@ -2250,6 +2397,9 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        }
 
        enic->tx_coalesce_usecs = enic->config.intr_timer_usec;
+       /* rx coalesce time already got initialized. This gets used
+        * if adaptive coal is turned off
+        */
        enic->rx_coalesce_usecs = enic->tx_coalesce_usecs;
 
        if (enic_is_dynamic(enic) || enic_is_sriov_vf(enic))
index 579315c..4e6aa65 100644 (file)
@@ -50,6 +50,11 @@ struct vnic_cq_ctrl {
        u32 pad10;
 };
 
+struct vnic_rx_bytes_counter {
+       unsigned int small_pkt_bytes_cnt;
+       unsigned int large_pkt_bytes_cnt;
+};
+
 struct vnic_cq {
        unsigned int index;
        struct vnic_dev *vdev;
@@ -58,6 +63,10 @@ struct vnic_cq {
        unsigned int to_clean;
        unsigned int last_color;
        unsigned int interrupt_offset;
+       struct vnic_rx_bytes_counter pkt_size_counter;
+       unsigned int cur_rx_coal_timeval;
+       unsigned int tobe_rx_coal_timeval;
+       ktime_t prev_ts;
 };
 
 static inline unsigned int vnic_cq_service(struct vnic_cq *cq,