Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
author David S. Miller <davem@davemloft.net>
Mon, 3 Oct 2016 01:17:07 +0000 (21:17 -0400)
committer David S. Miller <davem@davemloft.net>
Mon, 3 Oct 2016 02:20:41 +0000 (22:20 -0400)
Three sets of overlapping changes.  Nothing serious.

Signed-off-by: David S. Miller <davem@davemloft.net>
16 files changed:
MAINTAINERS
drivers/net/ethernet/broadcom/tg3.c
drivers/net/ethernet/freescale/fec_main.c
include/net/sctp/structs.h
kernel/events/core.c
net/ipv4/route.c
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/ipv6/ip6_gre.c
net/ipv6/route.c
net/sched/act_ife.c
net/sctp/chunk.c
net/sctp/outqueue.c
net/sctp/sctp_diag.c
net/sctp/sm_make_chunk.c
net/sctp/socket.c

diff --combined MAINTAINERS
@@@ -636,15 -636,6 +636,15 @@@ F:       drivers/tty/serial/altera_jtaguart.
  F:    include/linux/altera_uart.h
  F:    include/linux/altera_jtaguart.h
  
 +AMAZON ETHERNET DRIVERS
 +M:    Netanel Belgazal <netanel@annapurnalabs.com>
 +R:    Saeed Bishara <saeed@annapurnalabs.com>
 +R:    Zorik Machulsky <zorik@annapurnalabs.com>
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    Documentation/networking/ena.txt
 +F:    drivers/net/ethernet/amazon/
 +
  AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER
  M:    Tom Lendacky <thomas.lendacky@amd.com>
  M:    Gary Hook <gary.hook@amd.com>
@@@ -5593,9 -5584,10 +5593,9 @@@ F:     Documentation/devicetree/bindings/sc
  
  HOST AP DRIVER
  M:    Jouni Malinen <j@w1.fi>
 -L:    hostap@shmoo.com (subscribers-only)
  L:    linux-wireless@vger.kernel.org
 -W:    http://hostap.epitest.fi/
 -S:    Maintained
 +W:    http://w1.fi/hostap-driver.html
 +S:    Obsolete
  F:    drivers/net/wireless/intersil/hostap/
  
  HP COMPAQ TC1100 TABLET WMI EXTRAS DRIVER
@@@ -8753,7 -8745,7 +8753,7 @@@ F:      drivers/oprofile
  F:    include/linux/oprofile.h
  
  ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
- M:    Mark Fasheh <mfasheh@suse.com>
+ M:    Mark Fasheh <mfasheh@versity.com>
  M:    Joel Becker <jlbec@evilplan.org>
  L:    ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
  W:    http://ocfs2.wiki.kernel.org
@@@ -9708,12 -9700,6 +9708,12 @@@ T:    git git://git.kernel.org/pub/scm/lin
  S:    Supported
  F:    drivers/net/wireless/ath/ath10k/
  
 +QUALCOMM EMAC GIGABIT ETHERNET DRIVER
 +M:    Timur Tabi <timur@codeaurora.org>
 +L:    netdev@vger.kernel.org
 +S:    Supported
 +F:    drivers/net/ethernet/qualcomm/emac/
 +
  QUALCOMM HEXAGON ARCHITECTURE
  M:    Richard Kuo <rkuo@codeaurora.org>
  L:    linux-hexagon@vger.kernel.org
@@@ -9969,7 -9955,6 +9969,7 @@@ F:      net/rfkill
  
  RHASHTABLE
  M:    Thomas Graf <tgraf@suug.ch>
 +M:    Herbert Xu <herbert@gondor.apana.org.au>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    lib/rhashtable.c
@@@ -11641,7 -11626,7 +11641,7 @@@ F:   Documentation/devicetree/bindings/th
  THERMAL/CPU_COOLING
  M:    Amit Daniel Kachhap <amit.kachhap@gmail.com>
  M:    Viresh Kumar <viresh.kumar@linaro.org>
- M:    Javi Merino <javi.merino@arm.com>
+ M:    Javi Merino <javi.merino@kernel.org>
  L:    linux-pm@vger.kernel.org
  S:    Supported
  F:    Documentation/thermal/cpu-cooling-api.txt
@@@ -12305,7 -12290,6 +12305,7 @@@ F:   drivers/net/usb/smsc75xx.
  
  USB SMSC95XX ETHERNET DRIVER
  M:    Steve Glendinning <steve.glendinning@shawell.net>
 +M:    Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/usb/smsc95xx.*
diff --combined drivers/net/ethernet/broadcom/tg3.c
@@@ -12079,107 -12079,95 +12079,107 @@@ static int tg3_set_eeprom(struct net_de
        return ret;
  }
  
 -static int tg3_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 +static int tg3_get_link_ksettings(struct net_device *dev,
 +                                struct ethtool_link_ksettings *cmd)
  {
        struct tg3 *tp = netdev_priv(dev);
 +      u32 supported, advertising;
  
        if (tg3_flag(tp, USE_PHYLIB)) {
                struct phy_device *phydev;
                if (!(tp->phy_flags & TG3_PHYFLG_IS_CONNECTED))
                        return -EAGAIN;
                phydev = mdiobus_get_phy(tp->mdio_bus, tp->phy_addr);
 -              return phy_ethtool_gset(phydev, cmd);
 +              return phy_ethtool_ksettings_get(phydev, cmd);
        }
  
 -      cmd->supported = (SUPPORTED_Autoneg);
 +      supported = (SUPPORTED_Autoneg);
  
        if (!(tp->phy_flags & TG3_PHYFLG_10_100_ONLY))
 -              cmd->supported |= (SUPPORTED_1000baseT_Half |
 -                                 SUPPORTED_1000baseT_Full);
 +              supported |= (SUPPORTED_1000baseT_Half |
 +                            SUPPORTED_1000baseT_Full);
  
        if (!(tp->phy_flags & TG3_PHYFLG_ANY_SERDES)) {
 -              cmd->supported |= (SUPPORTED_100baseT_Half |
 -                                SUPPORTED_100baseT_Full |
 -                                SUPPORTED_10baseT_Half |
 -                                SUPPORTED_10baseT_Full |
 -                                SUPPORTED_TP);
 -              cmd->port = PORT_TP;
 +              supported |= (SUPPORTED_100baseT_Half |
 +                            SUPPORTED_100baseT_Full |
 +                            SUPPORTED_10baseT_Half |
 +                            SUPPORTED_10baseT_Full |
 +                            SUPPORTED_TP);
 +              cmd->base.port = PORT_TP;
        } else {
 -              cmd->supported |= SUPPORTED_FIBRE;
 -              cmd->port = PORT_FIBRE;
 +              supported |= SUPPORTED_FIBRE;
 +              cmd->base.port = PORT_FIBRE;
        }
 +      ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported,
 +                                              supported);
  
 -      cmd->advertising = tp->link_config.advertising;
 +      advertising = tp->link_config.advertising;
        if (tg3_flag(tp, PAUSE_AUTONEG)) {
                if (tp->link_config.flowctrl & FLOW_CTRL_RX) {
                        if (tp->link_config.flowctrl & FLOW_CTRL_TX) {
 -                              cmd->advertising |= ADVERTISED_Pause;
 +                              advertising |= ADVERTISED_Pause;
                        } else {
 -                              cmd->advertising |= ADVERTISED_Pause |
 -                                                  ADVERTISED_Asym_Pause;
 +                              advertising |= ADVERTISED_Pause |
 +                                      ADVERTISED_Asym_Pause;
                        }
                } else if (tp->link_config.flowctrl & FLOW_CTRL_TX) {
 -                      cmd->advertising |= ADVERTISED_Asym_Pause;
 +                      advertising |= ADVERTISED_Asym_Pause;
                }
        }
 +      ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising,
 +                                              advertising);
 +
        if (netif_running(dev) && tp->link_up) {
 -              ethtool_cmd_speed_set(cmd, tp->link_config.active_speed);
 -              cmd->duplex = tp->link_config.active_duplex;
 -              cmd->lp_advertising = tp->link_config.rmt_adv;
 +              cmd->base.speed = tp->link_config.active_speed;
 +              cmd->base.duplex = tp->link_config.active_duplex;
 +              ethtool_convert_legacy_u32_to_link_mode(
 +                      cmd->link_modes.lp_advertising,
 +                      tp->link_config.rmt_adv);
 +
                if (!(tp->phy_flags & TG3_PHYFLG_ANY_SERDES)) {
                        if (tp->phy_flags & TG3_PHYFLG_MDIX_STATE)
 -                              cmd->eth_tp_mdix = ETH_TP_MDI_X;
 +                              cmd->base.eth_tp_mdix = ETH_TP_MDI_X;
                        else
 -                              cmd->eth_tp_mdix = ETH_TP_MDI;
 +                              cmd->base.eth_tp_mdix = ETH_TP_MDI;
                }
        } else {
 -              ethtool_cmd_speed_set(cmd, SPEED_UNKNOWN);
 -              cmd->duplex = DUPLEX_UNKNOWN;
 -              cmd->eth_tp_mdix = ETH_TP_MDI_INVALID;
 -      }
 -      cmd->phy_address = tp->phy_addr;
 -      cmd->transceiver = XCVR_INTERNAL;
 -      cmd->autoneg = tp->link_config.autoneg;
 -      cmd->maxtxpkt = 0;
 -      cmd->maxrxpkt = 0;
 +              cmd->base.speed = SPEED_UNKNOWN;
 +              cmd->base.duplex = DUPLEX_UNKNOWN;
 +              cmd->base.eth_tp_mdix = ETH_TP_MDI_INVALID;
 +      }
 +      cmd->base.phy_address = tp->phy_addr;
 +      cmd->base.autoneg = tp->link_config.autoneg;
        return 0;
  }
  
 -static int tg3_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 +static int tg3_set_link_ksettings(struct net_device *dev,
 +                                const struct ethtool_link_ksettings *cmd)
  {
        struct tg3 *tp = netdev_priv(dev);
 -      u32 speed = ethtool_cmd_speed(cmd);
 +      u32 speed = cmd->base.speed;
 +      u32 advertising;
  
        if (tg3_flag(tp, USE_PHYLIB)) {
                struct phy_device *phydev;
                if (!(tp->phy_flags & TG3_PHYFLG_IS_CONNECTED))
                        return -EAGAIN;
                phydev = mdiobus_get_phy(tp->mdio_bus, tp->phy_addr);
 -              return phy_ethtool_sset(phydev, cmd);
 +              return phy_ethtool_ksettings_set(phydev, cmd);
        }
  
 -      if (cmd->autoneg != AUTONEG_ENABLE &&
 -          cmd->autoneg != AUTONEG_DISABLE)
 +      if (cmd->base.autoneg != AUTONEG_ENABLE &&
 +          cmd->base.autoneg != AUTONEG_DISABLE)
                return -EINVAL;
  
 -      if (cmd->autoneg == AUTONEG_DISABLE &&
 -          cmd->duplex != DUPLEX_FULL &&
 -          cmd->duplex != DUPLEX_HALF)
 +      if (cmd->base.autoneg == AUTONEG_DISABLE &&
 +          cmd->base.duplex != DUPLEX_FULL &&
 +          cmd->base.duplex != DUPLEX_HALF)
                return -EINVAL;
  
 -      if (cmd->autoneg == AUTONEG_ENABLE) {
 +      ethtool_convert_link_mode_to_legacy_u32(&advertising,
 +                                              cmd->link_modes.advertising);
 +
 +      if (cmd->base.autoneg == AUTONEG_ENABLE) {
                u32 mask = ADVERTISED_Autoneg |
                           ADVERTISED_Pause |
                           ADVERTISED_Asym_Pause;
                else
                        mask |= ADVERTISED_FIBRE;
  
 -              if (cmd->advertising & ~mask)
 +              if (advertising & ~mask)
                        return -EINVAL;
  
                mask &= (ADVERTISED_1000baseT_Half |
                         ADVERTISED_10baseT_Half |
                         ADVERTISED_10baseT_Full);
  
 -              cmd->advertising &= mask;
 +              advertising &= mask;
        } else {
                if (tp->phy_flags & TG3_PHYFLG_ANY_SERDES) {
                        if (speed != SPEED_1000)
                                return -EINVAL;
  
 -                      if (cmd->duplex != DUPLEX_FULL)
 +                      if (cmd->base.duplex != DUPLEX_FULL)
                                return -EINVAL;
                } else {
                        if (speed != SPEED_100 &&
  
        tg3_full_lock(tp, 0);
  
 -      tp->link_config.autoneg = cmd->autoneg;
 -      if (cmd->autoneg == AUTONEG_ENABLE) {
 -              tp->link_config.advertising = (cmd->advertising |
 +      tp->link_config.autoneg = cmd->base.autoneg;
 +      if (cmd->base.autoneg == AUTONEG_ENABLE) {
 +              tp->link_config.advertising = (advertising |
                                              ADVERTISED_Autoneg);
                tp->link_config.speed = SPEED_UNKNOWN;
                tp->link_config.duplex = DUPLEX_UNKNOWN;
        } else {
                tp->link_config.advertising = 0;
                tp->link_config.speed = speed;
 -              tp->link_config.duplex = cmd->duplex;
 +              tp->link_config.duplex = cmd->base.duplex;
        }
  
        tp->phy_flags |= TG3_PHYFLG_USER_CONFIGURED;
@@@ -14106,6 -14094,8 +14106,6 @@@ static int tg3_get_eee(struct net_devic
  }
  
  static const struct ethtool_ops tg3_ethtool_ops = {
 -      .get_settings           = tg3_get_settings,
 -      .set_settings           = tg3_set_settings,
        .get_drvinfo            = tg3_get_drvinfo,
        .get_regs_len           = tg3_get_regs_len,
        .get_regs               = tg3_get_regs,
        .get_ts_info            = tg3_get_ts_info,
        .get_eee                = tg3_get_eee,
        .set_eee                = tg3_set_eee,
 +      .get_link_ksettings     = tg3_get_link_ksettings,
 +      .set_link_ksettings     = tg3_set_link_ksettings,
  };
  
  static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev,
@@@ -18134,14 -18122,14 +18134,14 @@@ static pci_ers_result_t tg3_io_error_de
  
        rtnl_lock();
  
-       /* We needn't recover from permanent error */
-       if (state == pci_channel_io_frozen)
-               tp->pcierr_recovery = true;
        /* We probably don't have netdev yet */
        if (!netdev || !netif_running(netdev))
                goto done;
  
+       /* We needn't recover from permanent error */
+       if (state == pci_channel_io_frozen)
+               tp->pcierr_recovery = true;
        tg3_phy_stop(tp);
  
        tg3_netif_stop(tp);
@@@ -18238,7 -18226,7 +18238,7 @@@ static void tg3_io_resume(struct pci_de
  
        rtnl_lock();
  
-       if (!netif_running(netdev))
+       if (!netdev || !netif_running(netdev))
                goto done;
  
        tg3_full_lock(tp, 0);
diff --combined drivers/net/ethernet/freescale/fec_main.c
@@@ -89,10 -89,10 +89,10 @@@ static struct platform_device_id fec_de
                .driver_data = 0,
        }, {
                .name = "imx25-fec",
-               .driver_data = FEC_QUIRK_USE_GASKET | FEC_QUIRK_HAS_RACC,
+               .driver_data = FEC_QUIRK_USE_GASKET,
        }, {
                .name = "imx27-fec",
-               .driver_data = FEC_QUIRK_HAS_RACC,
+               .driver_data = 0,
        }, {
                .name = "imx28-fec",
                .driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_SWAP_FRAME |
@@@ -180,6 -180,7 +180,7 @@@ MODULE_PARM_DESC(macaddr, "FEC Etherne
  /* FEC receive acceleration */
  #define FEC_RACC_IPDIS                (1 << 1)
  #define FEC_RACC_PRODIS               (1 << 2)
+ #define FEC_RACC_SHIFT16      BIT(7)
  #define FEC_RACC_OPTIONS      (FEC_RACC_IPDIS | FEC_RACC_PRODIS)
  
  /*
@@@ -945,9 -946,11 +946,11 @@@ fec_restart(struct net_device *ndev
  
  #if !defined(CONFIG_M5272)
        if (fep->quirks & FEC_QUIRK_HAS_RACC) {
-               /* set RX checksum */
                val = readl(fep->hwp + FEC_RACC);
+               /* align IP header */
+               val |= FEC_RACC_SHIFT16;
                if (fep->csum_flags & FLAG_RX_CSUM_ENABLED)
+                       /* set RX checksum */
                        val |= FEC_RACC_OPTIONS;
                else
                        val &= ~FEC_RACC_OPTIONS;
@@@ -1428,6 -1431,12 +1431,12 @@@ fec_enet_rx_queue(struct net_device *nd
                prefetch(skb->data - NET_IP_ALIGN);
                skb_put(skb, pkt_len - 4);
                data = skb->data;
+ #if !defined(CONFIG_M5272)
+               if (fep->quirks & FEC_QUIRK_HAS_RACC)
+                       data = skb_pull_inline(skb, 2);
+ #endif
                if (!is_copybreak && need_swap)
                        swap_buffer(data, pkt_len);
  
@@@ -2887,7 -2896,7 +2896,7 @@@ fec_enet_close(struct net_device *ndev
   * this kind of feature?).
   */
  
 -#define HASH_BITS     6               /* #bits in hash */
 +#define FEC_HASH_BITS 6               /* #bits in hash */
  #define CRC32_POLY    0xEDB88320
  
  static void set_multicast_list(struct net_device *ndev)
                        }
                }
  
 -              /* only upper 6 bits (HASH_BITS) are used
 +              /* only upper 6 bits (FEC_HASH_BITS) are used
                 * which point to a specific bit in the hash registers
                 */
 -              hash = (crc >> (32 - HASH_BITS)) & 0x3f;
 +              hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3f;
  
                if (hash > 31) {
                        tmp = readl(fep->hwp + FEC_GRP_HASH_TABLE_HIGH);
diff --combined include/net/sctp/structs.h
@@@ -537,7 -537,6 +537,7 @@@ struct sctp_datamsg 
  struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *,
                                            struct sctp_sndrcvinfo *,
                                            struct iov_iter *);
 +void sctp_datamsg_free(struct sctp_datamsg *);
  void sctp_datamsg_put(struct sctp_datamsg *);
  void sctp_chunk_fail(struct sctp_chunk *, int error);
  int sctp_chunk_abandoned(struct sctp_chunk *);
@@@ -555,6 -554,9 +555,9 @@@ struct sctp_chunk 
  
        atomic_t refcnt;
  
+       /* How many times this chunk have been sent, for prsctp RTX policy */
+       int sent_count;
        /* This is our link to the per-transport transmitted list.  */
        struct list_head transmitted_list;
  
        /* This needs to be recoverable for SCTP_SEND_FAILED events. */
        struct sctp_sndrcvinfo sinfo;
  
-       /* We use this field to record param for prsctp policies,
-        * for TTL policy, it is the time_to_drop of this chunk,
-        * for RTX policy, it is the max_sent_count of this chunk,
-        * for PRIO policy, it is the priority of this chunk.
-        */
-       unsigned long prsctp_param;
-       /* How many times this chunk have been sent, for prsctp RTX policy */
-       int sent_count;
        /* Which association does this belong to?  */
        struct sctp_association *asoc;
  
@@@ -1077,7 -1069,7 +1070,7 @@@ struct sctp_outq 
  void sctp_outq_init(struct sctp_association *, struct sctp_outq *);
  void sctp_outq_teardown(struct sctp_outq *);
  void sctp_outq_free(struct sctp_outq*);
 -int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t);
 +void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t);
  int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *);
  int sctp_outq_is_empty(const struct sctp_outq *);
  void sctp_outq_restart(struct sctp_outq *);
  void sctp_retransmit(struct sctp_outq *, struct sctp_transport *,
                     sctp_retransmit_reason_t);
  void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8);
 -int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
 +void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp);
  void sctp_prsctp_prune(struct sctp_association *asoc,
                       struct sctp_sndrcvinfo *sinfo, int msg_len);
  /* Uncork and flush an outqueue.  */
diff --combined kernel/events/core.c
@@@ -3929,7 -3929,7 +3929,7 @@@ static void exclusive_event_destroy(str
  
  static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
  {
-       if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+       if ((e1->pmu == e2->pmu) &&
            (e1->cpu == e2->cpu ||
             e1->cpu == -1 ||
             e2->cpu == -1))
@@@ -7049,7 -7049,7 +7049,7 @@@ static int __perf_event_overflow(struc
                irq_work_queue(&event->pending);
        }
  
 -      event->overflow_handler(event, data, regs);
 +      READ_ONCE(event->overflow_handler)(event, data, regs);
  
        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
@@@ -7664,83 -7664,11 +7664,83 @@@ static void perf_event_free_filter(stru
        ftrace_profile_free_filter(event);
  }
  
 +#ifdef CONFIG_BPF_SYSCALL
 +static void bpf_overflow_handler(struct perf_event *event,
 +                               struct perf_sample_data *data,
 +                               struct pt_regs *regs)
 +{
 +      struct bpf_perf_event_data_kern ctx = {
 +              .data = data,
 +              .regs = regs,
 +      };
 +      int ret = 0;
 +
 +      preempt_disable();
 +      if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
 +              goto out;
 +      rcu_read_lock();
 +      ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
 +      rcu_read_unlock();
 +out:
 +      __this_cpu_dec(bpf_prog_active);
 +      preempt_enable();
 +      if (!ret)
 +              return;
 +
 +      event->orig_overflow_handler(event, data, regs);
 +}
 +
 +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 +{
 +      struct bpf_prog *prog;
 +
 +      if (event->overflow_handler_context)
 +              /* hw breakpoint or kernel counter */
 +              return -EINVAL;
 +
 +      if (event->prog)
 +              return -EEXIST;
 +
 +      prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
 +      if (IS_ERR(prog))
 +              return PTR_ERR(prog);
 +
 +      event->prog = prog;
 +      event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
 +      WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
 +      return 0;
 +}
 +
 +static void perf_event_free_bpf_handler(struct perf_event *event)
 +{
 +      struct bpf_prog *prog = event->prog;
 +
 +      if (!prog)
 +              return;
 +
 +      WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
 +      event->prog = NULL;
 +      bpf_prog_put(prog);
 +}
 +#else
 +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 +{
 +      return -EOPNOTSUPP;
 +}
 +static void perf_event_free_bpf_handler(struct perf_event *event)
 +{
 +}
 +#endif
 +
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
        bool is_kprobe, is_tracepoint;
        struct bpf_prog *prog;
  
 +      if (event->attr.type == PERF_TYPE_HARDWARE ||
 +          event->attr.type == PERF_TYPE_SOFTWARE)
 +              return perf_event_set_bpf_handler(event, prog_fd);
 +
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -EINVAL;
  
@@@ -7781,8 -7709,6 +7781,8 @@@ static void perf_event_free_bpf_prog(st
  {
        struct bpf_prog *prog;
  
 +      perf_event_free_bpf_handler(event);
 +
        if (!event->tp_event)
                return;
  
@@@ -9099,19 -9025,6 +9099,19 @@@ perf_event_alloc(struct perf_event_att
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
 +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
 +              if (overflow_handler == bpf_overflow_handler) {
 +                      struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
 +
 +                      if (IS_ERR(prog)) {
 +                              err = PTR_ERR(prog);
 +                              goto err_ns;
 +                      }
 +                      event->prog = prog;
 +                      event->orig_overflow_handler =
 +                              parent_event->orig_overflow_handler;
 +              }
 +#endif
        }
  
        if (overflow_handler) {
diff --combined net/ipv4/route.c
@@@ -1252,9 -1252,7 +1252,9 @@@ static unsigned int ipv4_mtu(const stru
                        mtu = 576;
        }
  
 -      return min_t(unsigned int, mtu, IP_MAX_MTU);
 +      mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
 +
 +      return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
  }
  
  static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
@@@ -1837,7 -1835,7 +1837,7 @@@ static int ip_route_input_slow(struct s
         *      Now we are ready to route packet.
         */
        fl4.flowi4_oif = 0;
 -      fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
 +      fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
@@@ -2024,9 -2022,7 +2024,9 @@@ static struct rtable *__mkroute_output(
                return ERR_PTR(-EINVAL);
  
        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
 -              if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
 +              if (ipv4_is_loopback(fl4->saddr) &&
 +                  !(dev_out->flags & IFF_LOOPBACK) &&
 +                  !netif_is_l3_master(dev_out))
                        return ERR_PTR(-EINVAL);
  
        if (ipv4_is_lbcast(fl4->daddr))
@@@ -2156,6 -2152,7 +2156,6 @@@ struct rtable *__ip_route_output_key_ha
        unsigned int flags = 0;
        struct fib_result res;
        struct rtable *rth;
 -      int master_idx;
        int orig_oif;
        int err = -ENETUNREACH;
  
  
        orig_oif = fl4->flowi4_oif;
  
 -      master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
 -      if (master_idx)
 -              fl4->flowi4_oif = master_idx;
        fl4->flowi4_iif = LOOPBACK_IFINDEX;
        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
 -
 -              rth = l3mdev_get_rtable(dev_out, fl4);
 -              if (rth)
 -                      goto out;
        }
  
        if (!fl4->daddr) {
        if (err) {
                res.fi = NULL;
                res.table = NULL;
 -              if (fl4->flowi4_oif &&
 -                  !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
 +              if (fl4->flowi4_oif) {
                        /* Apparently, routing tables are wrong. Assume,
                           that the destination is on link.
  
                        else
                                fl4->saddr = fl4->daddr;
                }
 -              dev_out = net->loopback_dev;
 +
 +              /* L3 master device is the loopback for that domain */
 +              dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
@@@ -2500,7 -2503,8 +2500,8 @@@ static int rt_fill_info(struct net *net
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 fl4->saddr, fl4->daddr,
-                                                r, nowait);
+                                                r, nowait, portid);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
@@@ -2578,6 -2582,9 +2579,6 @@@ static int inet_rtm_getroute(struct sk_
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
  
 -      if (netif_index_is_l3_master(net, fl4.flowi4_oif))
 -              fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
 -
        if (iif) {
                struct net_device *dev;
  
diff --combined net/ipv4/tcp_input.c
@@@ -289,7 -289,6 +289,7 @@@ static bool tcp_ecn_rcv_ecn_echo(const 
  static void tcp_sndbuf_expand(struct sock *sk)
  {
        const struct tcp_sock *tp = tcp_sk(sk);
 +      const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        int sndmem, per_mss;
        u32 nr_segs;
  
         * Cubic needs 1.7 factor, rounded to 2 to include
         * extra cushion (application might react slowly to POLLOUT)
         */
 -      sndmem = 2 * nr_segs * per_mss;
 +      sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
 +      sndmem *= nr_segs * per_mss;
  
        if (sk->sk_sndbuf < sndmem)
                sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
@@@ -901,29 -899,12 +901,29 @@@ static void tcp_verify_retransmit_hint(
                tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
  }
  
 +/* Sum the number of packets on the wire we have marked as lost.
 + * There are two cases we care about here:
 + * a) Packet hasn't been marked lost (nor retransmitted),
 + *    and this is the first loss.
 + * b) Packet has been marked both lost and retransmitted,
 + *    and this means we think it was lost again.
 + */
 +static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
 +{
 +      __u8 sacked = TCP_SKB_CB(skb)->sacked;
 +
 +      if (!(sacked & TCPCB_LOST) ||
 +          ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
 +              tp->lost += tcp_skb_pcount(skb);
 +}
 +
  static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
  {
        if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                tcp_verify_retransmit_hint(tp, skb);
  
                tp->lost_out += tcp_skb_pcount(skb);
 +              tcp_sum_lost(tp, skb);
                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
        }
  }
@@@ -932,7 -913,6 +932,7 @@@ void tcp_skb_mark_lost_uncond_verify(st
  {
        tcp_verify_retransmit_hint(tp, skb);
  
 +      tcp_sum_lost(tp, skb);
        if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
                tp->lost_out += tcp_skb_pcount(skb);
                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@@ -1114,7 -1094,6 +1114,7 @@@ struct tcp_sacktag_state 
         */
        struct skb_mstamp first_sackt;
        struct skb_mstamp last_sackt;
 +      struct rate_sample *rate;
        int     flag;
  };
  
@@@ -1282,7 -1261,6 +1282,7 @@@ static bool tcp_shifted_skb(struct soc
        tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
                        start_seq, end_seq, dup_sack, pcount,
                        &skb->skb_mstamp);
 +      tcp_rate_skb_delivered(sk, skb, state->rate);
  
        if (skb == tp->lost_skb_hint)
                tp->lost_cnt_hint += pcount;
                tcp_advance_highest_sack(sk, skb);
  
        tcp_skb_collapse_tstamp(prev, skb);
 +      if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
 +              TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
 +
        tcp_unlink_write_queue(skb, sk);
        sk_wmem_free_skb(sk, skb);
  
@@@ -1565,7 -1540,6 +1565,7 @@@ static struct sk_buff *tcp_sacktag_walk
                                                dup_sack,
                                                tcp_skb_pcount(skb),
                                                &skb->skb_mstamp);
 +                      tcp_rate_skb_delivered(sk, skb, state->rate);
  
                        if (!before(TCP_SKB_CB(skb)->seq,
                                    tcp_highest_sack_seq(tp)))
@@@ -1648,10 -1622,8 +1648,10 @@@ tcp_sacktag_write_queue(struct sock *sk
  
        found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
                                         num_sacks, prior_snd_una);
 -      if (found_dup_sack)
 +      if (found_dup_sack) {
                state->flag |= FLAG_DSACKING_ACK;
 +              tp->delivered++; /* A spurious retransmission is delivered */
 +      }
  
        /* Eliminate too old ACKs, but take into
         * account more or less fresh ones, they can
@@@ -1918,7 -1890,6 +1918,7 @@@ void tcp_enter_loss(struct sock *sk
        struct sk_buff *skb;
        bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
        bool is_reneg;                  /* is receiver reneging on SACKs? */
 +      bool mark_lost;
  
        /* Reduce ssthresh if it has not yet been made inside this window. */
        if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
                if (skb == tcp_send_head(sk))
                        break;
  
 +              mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
 +                           is_reneg);
 +              if (mark_lost)
 +                      tcp_sum_lost(tp, skb);
                TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
 -              if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
 +              if (mark_lost) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
@@@ -2362,10 -2329,9 +2362,9 @@@ static void DBGUNDO(struct sock *sk, co
        }
  #if IS_ENABLED(CONFIG_IPV6)
        else if (sk->sk_family == AF_INET6) {
-               struct ipv6_pinfo *np = inet6_sk(sk);
                pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
                         msg,
-                        &np->daddr, ntohs(inet->inet_dport),
+                        &sk->sk_v6_daddr, ntohs(inet->inet_dport),
                         tp->snd_cwnd, tcp_left_out(tp),
                         tp->snd_ssthresh, tp->prior_ssthresh,
                         tp->packets_out);
@@@ -2536,9 -2502,6 +2535,9 @@@ static inline void tcp_end_cwnd_reducti
  {
        struct tcp_sock *tp = tcp_sk(sk);
  
 +      if (inet_csk(sk)->icsk_ca_ops->cong_control)
 +              return;
 +
        /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
        if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
            (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
@@@ -2915,13 -2878,67 +2914,13 @@@ static void tcp_fastretrans_alert(struc
        *rexmit = REXMIT_LOST;
  }
  
 -/* Kathleen Nichols' algorithm for tracking the minimum value of
 - * a data stream over some fixed time interval. (E.g., the minimum
 - * RTT over the past five minutes.) It uses constant space and constant
 - * time per update yet almost always delivers the same minimum as an
 - * implementation that has to keep all the data in the window.
 - *
 - * The algorithm keeps track of the best, 2nd best & 3rd best min
 - * values, maintaining an invariant that the measurement time of the
 - * n'th best >= n-1'th best. It also makes sure that the three values
 - * are widely separated in the time window since that bounds the worse
 - * case error when that data is monotonically increasing over the window.
 - *
 - * Upon getting a new min, we can forget everything earlier because it
 - * has no value - the new min is <= everything else in the window by
 - * definition and it's the most recent. So we restart fresh on every new min
 - * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd
 - * best.
 - */
  static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
  {
 -      const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
 -      struct rtt_meas *m = tcp_sk(sk)->rtt_min;
 -      struct rtt_meas rttm = {
 -              .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
 -              .ts = now,
 -      };
 -      u32 elapsed;
 -
 -      /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
 -      if (unlikely(rttm.rtt <= m[0].rtt))
 -              m[0] = m[1] = m[2] = rttm;
 -      else if (rttm.rtt <= m[1].rtt)
 -              m[1] = m[2] = rttm;
 -      else if (rttm.rtt <= m[2].rtt)
 -              m[2] = rttm;
 -
 -      elapsed = now - m[0].ts;
 -      if (unlikely(elapsed > wlen)) {
 -              /* Passed entire window without a new min so make 2nd choice
 -               * the new min & 3rd choice the new 2nd. So forth and so on.
 -               */
 -              m[0] = m[1];
 -              m[1] = m[2];
 -              m[2] = rttm;
 -              if (now - m[0].ts > wlen) {
 -                      m[0] = m[1];
 -                      m[1] = rttm;
 -                      if (now - m[0].ts > wlen)
 -                              m[0] = rttm;
 -              }
 -      } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
 -              /* Passed a quarter of the window without a new min so
 -               * take 2nd choice from the 2nd quarter of the window.
 -               */
 -              m[2] = m[1] = rttm;
 -      } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
 -              /* Passed half the window without a new min so take the 3rd
 -               * choice from the last half of the window.
 -               */
 -              m[2] = rttm;
 -      }
 +      struct tcp_sock *tp = tcp_sk(sk);
 +      u32 wlen = sysctl_tcp_min_rtt_wlen * HZ;
 +
 +      minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp,
 +                         rtt_us ? : jiffies_to_usecs(1));
  }
  
  static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
@@@ -3084,11 -3101,10 +3083,11 @@@ static void tcp_ack_tstamp(struct sock 
   */
  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                               u32 prior_snd_una, int *acked,
 -                             struct tcp_sacktag_state *sack)
 +                             struct tcp_sacktag_state *sack,
 +                             struct skb_mstamp *now)
  {
        const struct inet_connection_sock *icsk = inet_csk(sk);
 -      struct skb_mstamp first_ackt, last_ackt, now;
 +      struct skb_mstamp first_ackt, last_ackt;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prior_sacked = tp->sacked_out;
        u32 reord = tp->packets_out;
                        acked_pcount = tcp_tso_acked(sk, skb);
                        if (!acked_pcount)
                                break;
 -
                        fully_acked = false;
                } else {
                        /* Speedup tcp_unlink_write_queue() and next loop */
  
                tp->packets_out -= acked_pcount;
                pkts_acked += acked_pcount;
 +              tcp_rate_skb_delivered(sk, skb, sack->rate);
  
                /* Initial outgoing SYN's get put onto the write_queue
                 * just like anything else we transmit.  It is not
        if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                flag |= FLAG_SACK_RENEGING;
  
 -      skb_mstamp_get(&now);
        if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
 -              seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
 -              ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
 +              seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
 +              ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
        }
        if (sack->first_sackt.v64) {
 -              sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
 -              ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
 +              sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
 +              ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
        }
 -
 +      sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
        rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
                                        ca_rtt_us);
  
                tp->fackets_out -= min(pkts_acked, tp->fackets_out);
  
        } else if (skb && rtt_update && sack_rtt_us >= 0 &&
 -                 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
 +                 sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
                /* Do not re-arm RTO if the sack RTT is measured from data sent
                 * after when the head was last (re)transmitted. Otherwise the
                 * timeout may continue to extend in loss recovery.
@@@ -3315,15 -3332,8 +3314,15 @@@ static inline bool tcp_may_raise_cwnd(c
   * information. All transmission or retransmission are delayed afterwards.
   */
  static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
 -                           int flag)
 +                           int flag, const struct rate_sample *rs)
  {
 +      const struct inet_connection_sock *icsk = inet_csk(sk);
 +
 +      if (icsk->icsk_ca_ops->cong_control) {
 +              icsk->icsk_ca_ops->cong_control(sk, rs);
 +              return;
 +      }
 +
        if (tcp_in_cwnd_reduction(sk)) {
                /* Reduce cwnd if state mandates */
                tcp_cwnd_reduction(sk, acked_sacked, flag);
@@@ -3568,21 -3578,17 +3567,21 @@@ static int tcp_ack(struct sock *sk, con
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_sacktag_state sack_state;
 +      struct rate_sample rs = { .prior_delivered = 0 };
        u32 prior_snd_una = tp->snd_una;
        u32 ack_seq = TCP_SKB_CB(skb)->seq;
        u32 ack = TCP_SKB_CB(skb)->ack_seq;
        bool is_dupack = false;
        u32 prior_fackets;
        int prior_packets = tp->packets_out;
 -      u32 prior_delivered = tp->delivered;
 +      u32 delivered = tp->delivered;
 +      u32 lost = tp->lost;
        int acked = 0; /* Number of packets newly acked */
        int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
 +      struct skb_mstamp now;
  
        sack_state.first_sackt.v64 = 0;
 +      sack_state.rate = &rs;
  
        /* We very likely will need to access write queue head. */
        prefetchw(sk->sk_write_queue.next);
        if (after(ack, tp->snd_nxt))
                goto invalid_ack;
  
 +      skb_mstamp_get(&now);
 +
        if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
            icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
        }
  
        prior_fackets = tp->fackets_out;
 +      rs.prior_in_flight = tcp_packets_in_flight(tp);
  
        /* ts_recent update must be made after we are sure that the packet
         * is in window.
  
        /* See if we can take anything off of the retransmit queue. */
        flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 -                                  &sack_state);
 +                                  &sack_state, &now);
  
        if (tcp_ack_is_dubious(sk, flag)) {
                is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
  
        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                tcp_schedule_loss_probe(sk);
 -      tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
 +      delivered = tp->delivered - delivered;  /* freshly ACKed or SACKed */
 +      lost = tp->lost - lost;                 /* freshly marked lost */
 +      tcp_rate_gen(sk, delivered, lost, &now, &rs);
 +      tcp_cong_control(sk, ack, delivered, flag, &rs);
        tcp_xmit_recovery(sk, rexmit);
        return 1;
  
@@@ -4107,7 -4107,7 +4106,7 @@@ void tcp_fin(struct sock *sk
        /* It _is_ possible, that we have something out-of-order _after_ FIN.
         * Probably, we should reset in this case. For now drop them.
         */
 -      __skb_queue_purge(&tp->out_of_order_queue);
 +      skb_rbtree_purge(&tp->out_of_order_queue);
        if (tcp_is_sack(tp))
                tcp_sack_reset(&tp->rx_opt);
        sk_mem_reclaim(sk);
@@@ -4267,7 -4267,7 +4266,7 @@@ static void tcp_sack_remove(struct tcp_
        int this_sack;
  
        /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
 -      if (skb_queue_empty(&tp->out_of_order_queue)) {
 +      if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                tp->rx_opt.num_sacks = 0;
                return;
        }
@@@ -4343,13 -4343,10 +4342,13 @@@ static void tcp_ofo_queue(struct sock *
  {
        struct tcp_sock *tp = tcp_sk(sk);
        __u32 dsack_high = tp->rcv_nxt;
 +      bool fin, fragstolen, eaten;
        struct sk_buff *skb, *tail;
 -      bool fragstolen, eaten;
 +      struct rb_node *p;
  
 -      while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
 +      p = rb_first(&tp->out_of_order_queue);
 +      while (p) {
 +              skb = rb_entry(p, struct sk_buff, rbnode);
                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
                        break;
  
                                dsack_high = TCP_SKB_CB(skb)->end_seq;
                        tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
                }
 +              p = rb_next(p);
 +              rb_erase(&skb->rbnode, &tp->out_of_order_queue);
  
 -              __skb_unlink(skb, &tp->out_of_order_queue);
 -              if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 +              if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
                        SOCK_DEBUG(sk, "ofo packet was already received\n");
                        tcp_drop(sk, skb);
                        continue;
                tail = skb_peek_tail(&sk->sk_receive_queue);
                eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 +              fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
                if (!eaten)
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
 -              if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 -                      tcp_fin(sk);
 -              if (eaten)
 +              else
                        kfree_skb_partial(skb, fragstolen);
 +
 +              if (unlikely(fin)) {
 +                      tcp_fin(sk);
 +                      /* tcp_fin() purges tp->out_of_order_queue,
 +                       * so we must end this loop right now.
 +                       */
 +                      break;
 +              }
        }
  }
  
@@@ -4402,9 -4391,12 +4401,9 @@@ static int tcp_try_rmem_schedule(struc
                if (tcp_prune_queue(sk) < 0)
                        return -1;
  
 -              if (!sk_rmem_schedule(sk, skb, size)) {
 +              while (!sk_rmem_schedule(sk, skb, size)) {
                        if (!tcp_prune_ofo_queue(sk))
                                return -1;
 -
 -                      if (!sk_rmem_schedule(sk, skb, size))
 -                              return -1;
                }
        }
        return 0;
  static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
  {
        struct tcp_sock *tp = tcp_sk(sk);
 +      struct rb_node **p, *q, *parent;
        struct sk_buff *skb1;
        u32 seq, end_seq;
 +      bool fragstolen;
  
        tcp_ecn_check_ce(tp, skb);
  
        inet_csk_schedule_ack(sk);
  
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
 +      seq = TCP_SKB_CB(skb)->seq;
 +      end_seq = TCP_SKB_CB(skb)->end_seq;
        SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 -                 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 +                 tp->rcv_nxt, seq, end_seq);
  
 -      skb1 = skb_peek_tail(&tp->out_of_order_queue);
 -      if (!skb1) {
 +      p = &tp->out_of_order_queue.rb_node;
 +      if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                /* Initial out of order segment, build 1 SACK. */
                if (tcp_is_sack(tp)) {
                        tp->rx_opt.num_sacks = 1;
 -                      tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
 -                      tp->selective_acks[0].end_seq =
 -                                              TCP_SKB_CB(skb)->end_seq;
 +                      tp->selective_acks[0].start_seq = seq;
 +                      tp->selective_acks[0].end_seq = end_seq;
                }
 -              __skb_queue_head(&tp->out_of_order_queue, skb);
 +              rb_link_node(&skb->rbnode, NULL, p);
 +              rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
 +              tp->ooo_last_skb = skb;
                goto end;
        }
  
 -      seq = TCP_SKB_CB(skb)->seq;
 -      end_seq = TCP_SKB_CB(skb)->end_seq;
 -
 -      if (seq == TCP_SKB_CB(skb1)->end_seq) {
 -              bool fragstolen;
 -
 -              if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 -                      __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 -              } else {
 -                      tcp_grow_window(sk, skb);
 -                      kfree_skb_partial(skb, fragstolen);
 -                      skb = NULL;
 -              }
 -
 -              if (!tp->rx_opt.num_sacks ||
 -                  tp->selective_acks[0].end_seq != seq)
 -                      goto add_sack;
 -
 -              /* Common case: data arrive in order after hole. */
 -              tp->selective_acks[0].end_seq = end_seq;
 -              goto end;
 -      }
 -
 -      /* Find place to insert this segment. */
 -      while (1) {
 -              if (!after(TCP_SKB_CB(skb1)->seq, seq))
 -                      break;
 -              if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
 -                      skb1 = NULL;
 -                      break;
 -              }
 -              skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
 -      }
 -
 -      /* Do skb overlap to previous one? */
 -      if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
 -              if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 -                      /* All the bits are present. Drop. */
 -                      NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
 -                      tcp_drop(sk, skb);
 -                      skb = NULL;
 -                      tcp_dsack_set(sk, seq, end_seq);
 -                      goto add_sack;
 +      /* In the typical case, we are adding an skb to the end of the list.
 +       * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 +       */
 +      if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) {
 +coalesce_done:
 +              tcp_grow_window(sk, skb);
 +              kfree_skb_partial(skb, fragstolen);
 +              skb = NULL;
 +              goto add_sack;
 +      }
 +      /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */
 +      if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
 +              parent = &tp->ooo_last_skb->rbnode;
 +              p = &parent->rb_right;
 +              goto insert;
 +      }
 +
 +      /* Find place to insert this segment. Handle overlaps on the way. */
 +      parent = NULL;
 +      while (*p) {
 +              parent = *p;
 +              skb1 = rb_entry(parent, struct sk_buff, rbnode);
 +              if (before(seq, TCP_SKB_CB(skb1)->seq)) {
 +                      p = &parent->rb_left;
 +                      continue;
                }
 -              if (after(seq, TCP_SKB_CB(skb1)->seq)) {
 -                      /* Partial overlap. */
 -                      tcp_dsack_set(sk, seq,
 -                                    TCP_SKB_CB(skb1)->end_seq);
 -              } else {
 -                      if (skb_queue_is_first(&tp->out_of_order_queue,
 -                                             skb1))
 -                              skb1 = NULL;
 -                      else
 -                              skb1 = skb_queue_prev(
 -                                      &tp->out_of_order_queue,
 -                                      skb1);
 +              if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
 +                      if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
 +                              /* All the bits are present. Drop. */
 +                              NET_INC_STATS(sock_net(sk),
 +                                            LINUX_MIB_TCPOFOMERGE);
 +                              __kfree_skb(skb);
 +                              skb = NULL;
 +                              tcp_dsack_set(sk, seq, end_seq);
 +                              goto add_sack;
 +                      }
 +                      if (after(seq, TCP_SKB_CB(skb1)->seq)) {
 +                              /* Partial overlap. */
 +                              tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
 +                      } else {
 +                              /* skb's seq == skb1's seq and skb covers skb1.
 +                               * Replace skb1 with skb.
 +                               */
 +                              rb_replace_node(&skb1->rbnode, &skb->rbnode,
 +                                              &tp->out_of_order_queue);
 +                              tcp_dsack_extend(sk,
 +                                               TCP_SKB_CB(skb1)->seq,
 +                                               TCP_SKB_CB(skb1)->end_seq);
 +                              NET_INC_STATS(sock_net(sk),
 +                                            LINUX_MIB_TCPOFOMERGE);
 +                              __kfree_skb(skb1);
 +                              goto merge_right;
 +                      }
 +              } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
 +                      goto coalesce_done;
                }
 +              p = &parent->rb_right;
        }
 -      if (!skb1)
 -              __skb_queue_head(&tp->out_of_order_queue, skb);
 -      else
 -              __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 +insert:
 +      /* Insert segment into RB tree. */
 +      rb_link_node(&skb->rbnode, parent, p);
 +      rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
  
 -      /* And clean segments covered by new one as whole. */
 -      while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
 -              skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
 +merge_right:
 +      /* Remove other segments covered by skb. */
 +      while ((q = rb_next(&skb->rbnode)) != NULL) {
 +              skb1 = rb_entry(q, struct sk_buff, rbnode);
  
                if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
                        break;
                                         end_seq);
                        break;
                }
 -              __skb_unlink(skb1, &tp->out_of_order_queue);
 +              rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
                tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                                 TCP_SKB_CB(skb1)->end_seq);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
                tcp_drop(sk, skb1);
        }
 +      /* If there is no skb after us, we are the last_skb ! */
 +      if (!q)
 +              tp->ooo_last_skb = skb;
  
  add_sack:
        if (tcp_is_sack(tp))
@@@ -4670,13 -4653,13 +4669,13 @@@ queue_and_out
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        tcp_fin(sk);
  
 -              if (!skb_queue_empty(&tp->out_of_order_queue)) {
 +              if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
                        tcp_ofo_queue(sk);
  
                        /* RFC2581. 4.2. SHOULD send immediate ACK, when
                         * gap in queue is filled.
                         */
 -                      if (skb_queue_empty(&tp->out_of_order_queue))
 +                      if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
                                inet_csk(sk)->icsk_ack.pingpong = 0;
                }
  
@@@ -4730,76 -4713,48 +4729,76 @@@ drop
        tcp_data_queue_ofo(sk, skb);
  }
  
 +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list)
 +{
 +      if (list)
 +              return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 +
 +      return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
 +}
 +
  static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
 -                                      struct sk_buff_head *list)
 +                                      struct sk_buff_head *list,
 +                                      struct rb_root *root)
  {
 -      struct sk_buff *next = NULL;
 +      struct sk_buff *next = tcp_skb_next(skb, list);
  
 -      if (!skb_queue_is_last(list, skb))
 -              next = skb_queue_next(list, skb);
 +      if (list)
 +              __skb_unlink(skb, list);
 +      else
 +              rb_erase(&skb->rbnode, root);
  
 -      __skb_unlink(skb, list);
        __kfree_skb(skb);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
  
        return next;
  }
  
 +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
 +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 +{
 +      struct rb_node **p = &root->rb_node;
 +      struct rb_node *parent = NULL;
 +      struct sk_buff *skb1;
 +
 +      while (*p) {
 +              parent = *p;
 +              skb1 = rb_entry(parent, struct sk_buff, rbnode);
 +              if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
 +                      p = &parent->rb_left;
 +              else
 +                      p = &parent->rb_right;
 +      }
 +      rb_link_node(&skb->rbnode, parent, p);
 +      rb_insert_color(&skb->rbnode, root);
 +}
 +
  /* Collapse contiguous sequence of skbs head..tail with
   * sequence numbers start..end.
   *
 - * If tail is NULL, this means until the end of the list.
 + * If tail is NULL, this means until the end of the queue.
   *
   * Segments with FIN/SYN are not collapsed (only because this
   * simplifies code)
   */
  static void
 -tcp_collapse(struct sock *sk, struct sk_buff_head *list,
 -           struct sk_buff *head, struct sk_buff *tail,
 -           u32 start, u32 end)
 +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 +           struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end)
  {
 -      struct sk_buff *skb, *n;
 +      struct sk_buff *skb = head, *n;
 +      struct sk_buff_head tmp;
        bool end_of_skbs;
  
        /* First, check that queue is collapsible and find
 -       * the point where collapsing can be useful. */
 -      skb = head;
 +       * the point where collapsing can be useful.
 +       */
  restart:
 -      end_of_skbs = true;
 -      skb_queue_walk_from_safe(list, skb, n) {
 -              if (skb == tail)
 -                      break;
 +      for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
 +              n = tcp_skb_next(skb, list);
 +
                /* No new bits? It is possible on ofo queue. */
                if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 -                      skb = tcp_collapse_one(sk, skb, list);
 +                      skb = tcp_collapse_one(sk, skb, list, root);
                        if (!skb)
                                break;
                        goto restart;
                        break;
                }
  
 -              if (!skb_queue_is_last(list, skb)) {
 -                      struct sk_buff *next = skb_queue_next(list, skb);
 -                      if (next != tail &&
 -                          TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
 -                              end_of_skbs = false;
 -                              break;
 -                      }
 +              if (n && n != tail &&
 +                  TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
 +                      end_of_skbs = false;
 +                      break;
                }
  
                /* Decided to skip this, advance start seq. */
            (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
                return;
  
 +      __skb_queue_head_init(&tmp);
 +
        while (before(start, end)) {
                int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
                struct sk_buff *nskb;
  
                nskb = alloc_skb(copy, GFP_ATOMIC);
                if (!nskb)
 -                      return;
 +                      break;
  
                memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
                TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 -              __skb_queue_before(list, skb, nskb);
 +              if (list)
 +                      __skb_queue_before(list, skb, nskb);
 +              else
 +                      __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
                skb_set_owner_r(nskb, sk);
  
                /* Copy data, releasing collapsed skbs. */
                                start += size;
                        }
                        if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
 -                              skb = tcp_collapse_one(sk, skb, list);
 +                              skb = tcp_collapse_one(sk, skb, list, root);
                                if (!skb ||
                                    skb == tail ||
                                    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 -                                      return;
 +                                      goto end;
                        }
                }
        }
 +end:
 +      skb_queue_walk_safe(&tmp, skb, n)
 +              tcp_rbtree_insert(root, skb);
  }
  
  /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
  static void tcp_collapse_ofo_queue(struct sock *sk)
  {
        struct tcp_sock *tp = tcp_sk(sk);
 -      struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
 -      struct sk_buff *head;
 +      struct sk_buff *skb, *head;
 +      struct rb_node *p;
        u32 start, end;
  
 -      if (!skb)
 +      p = rb_first(&tp->out_of_order_queue);
 +      skb = rb_entry_safe(p, struct sk_buff, rbnode);
 +new_range:
 +      if (!skb) {
 +              p = rb_last(&tp->out_of_order_queue);
 +              /* Note: it is possible that p is NULL here. We do not
 +               * use rb_entry_safe(), as ooo_last_skb is valid only
 +               * if the rbtree is not empty.
 +               */
 +              tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
                return;
 -
 +      }
        start = TCP_SKB_CB(skb)->seq;
        end = TCP_SKB_CB(skb)->end_seq;
 -      head = skb;
 -
 -      for (;;) {
 -              struct sk_buff *next = NULL;
  
 -              if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
 -                      next = skb_queue_next(&tp->out_of_order_queue, skb);
 -              skb = next;
 +      for (head = skb;;) {
 +              skb = tcp_skb_next(skb, NULL);
  
 -              /* Segment is terminated when we see gap or when
 -               * we are at the end of all the queue. */
 +              /* Range is terminated when we see a gap or when
 +               * we are at the queue end.
 +               */
                if (!skb ||
                    after(TCP_SKB_CB(skb)->seq, end) ||
                    before(TCP_SKB_CB(skb)->end_seq, start)) {
 -                      tcp_collapse(sk, &tp->out_of_order_queue,
 +                      tcp_collapse(sk, NULL, &tp->out_of_order_queue,
                                     head, skb, start, end);
 -                      head = skb;
 -                      if (!skb)
 -                              break;
 -                      /* Start new segment */
 +                      goto new_range;
 +              }
 +
 +              if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
                        start = TCP_SKB_CB(skb)->seq;
 +              if (after(TCP_SKB_CB(skb)->end_seq, end))
                        end = TCP_SKB_CB(skb)->end_seq;
 -              } else {
 -                      if (before(TCP_SKB_CB(skb)->seq, start))
 -                              start = TCP_SKB_CB(skb)->seq;
 -                      if (after(TCP_SKB_CB(skb)->end_seq, end))
 -                              end = TCP_SKB_CB(skb)->end_seq;
 -              }
        }
  }
  
  /*
 - * Purge the out-of-order queue.
 - * Return true if queue was pruned.
 + * Clean the out-of-order queue to make room.
 +       * We drop packets with the highest sequence numbers in order to:
 +       * 1) give holes a chance to be filled,
 +       * 2) avoid adding too much latency if thousands of packets sit there
 +       *    (but if the application shrinks SO_RCVBUF, we could still end
 +       *     up freeing the whole queue here).
 + *
 + * Return true if queue has shrunk.
   */
  static bool tcp_prune_ofo_queue(struct sock *sk)
  {
        struct tcp_sock *tp = tcp_sk(sk);
 -      bool res = false;
 +      struct rb_node *node, *prev;
  
 -      if (!skb_queue_empty(&tp->out_of_order_queue)) {
 -              NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
 -              __skb_queue_purge(&tp->out_of_order_queue);
 +      if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 +              return false;
  
 -              /* Reset SACK state.  A conforming SACK implementation will
 -               * do the same at a timeout based retransmit.  When a connection
 -               * is in a sad state like this, we care only about integrity
 -               * of the connection not performance.
 -               */
 -              if (tp->rx_opt.sack_ok)
 -                      tcp_sack_reset(&tp->rx_opt);
 +      NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
 +      node = &tp->ooo_last_skb->rbnode;
 +      do {
 +              prev = rb_prev(node);
 +              rb_erase(node, &tp->out_of_order_queue);
 +              tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
                sk_mem_reclaim(sk);
 -              res = true;
 -      }
 -      return res;
 +              if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
 +                  !tcp_under_memory_pressure(sk))
 +                      break;
 +              node = prev;
 +      } while (node);
 +      tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
 +
 +      /* Reset SACK state.  A conforming SACK implementation will
 +       * do the same at a timeout based retransmit.  When a connection
 +       * is in a sad state like this, we care only about integrity
 +       * of the connection not performance.
 +       */
 +      if (tp->rx_opt.sack_ok)
 +              tcp_sack_reset(&tp->rx_opt);
 +      return true;
  }
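
/*
 * Illustration of the pruning policy above, outside the kernel: given
 * segments ordered by sequence number and a receive-buffer budget, drop
 * from the highest-sequence end until usage fits again.  A plain array
 * stands in for the rbtree; the names and numbers are made up for the
 * sketch, not taken from the kernel.
 */
#include <stdio.h>

int main(void)
{
	unsigned int truesize[] = { 1500, 1500, 3000, 1500, 4500 }; /* by seq */
	int n = sizeof(truesize) / sizeof(truesize[0]);
	unsigned int rmem_alloc = 0, rcvbuf = 6000;
	int i;

	for (i = 0; i < n; i++)
		rmem_alloc += truesize[i];

	/* Walk backwards from the last (highest-seq) segment, like the
	 * loop above walks rb_prev() from ooo_last_skb. */
	for (i = n - 1; i >= 0 && rmem_alloc > rcvbuf; i--) {
		rmem_alloc -= truesize[i];
		printf("dropped segment %d, rmem now %u\n", i, rmem_alloc);
	}
	printf("kept %d lowest-sequence segments\n", i + 1);
	return 0;
}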
  
  /* Reduce allocated memory if we can, trying to get
@@@ -4986,7 -4920,7 +4985,7 @@@ static int tcp_prune_queue(struct sock 
  
        tcp_collapse_ofo_queue(sk);
        if (!skb_queue_empty(&sk->sk_receive_queue))
 -              tcp_collapse(sk, &sk->sk_receive_queue,
 +              tcp_collapse(sk, &sk->sk_receive_queue, NULL,
                             skb_peek(&sk->sk_receive_queue),
                             NULL,
                             tp->copied_seq, tp->rcv_nxt);
@@@ -5091,7 -5025,7 +5090,7 @@@ static void __tcp_ack_snd_check(struct 
            /* We ACK each frame or... */
            tcp_in_quickack_mode(sk) ||
            /* We have out of order data. */
 -          (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
 +          (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
                /* Then ack it now */
                tcp_send_ack(sk);
        } else {
@@@ -5992,8 -5926,7 +5991,8 @@@ int tcp_rcv_state_process(struct sock *
                } else
                        tcp_init_metrics(sk);
  
 -              tcp_update_pacing_rate(sk);
 +              if (!inet_csk(sk)->icsk_ca_ops->cong_control)
 +                      tcp_update_pacing_rate(sk);
  
                /* Prevent spurious tcp_cwnd_restart() on first data packet */
                tp->lsndtime = tcp_time_stamp;
@@@ -6326,7 -6259,6 +6325,7 @@@ int tcp_conn_request(struct request_soc
  
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
        tcp_openreq_init(req, &tmp_opt, skb, sk);
 +      inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
  
        /* Note: tcp_v6_init_req() might override ir_iif for link locals */
        inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
diff --combined net/ipv4/tcp_output.c
@@@ -734,16 -734,9 +734,16 @@@ static void tcp_tsq_handler(struct soc
  {
        if ((1 << sk->sk_state) &
            (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
 -           TCPF_CLOSE_WAIT  | TCPF_LAST_ACK))
 -              tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
 +           TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
 +              struct tcp_sock *tp = tcp_sk(sk);
 +
 +              if (tp->lost_out > tp->retrans_out &&
 +                  tp->snd_cwnd > tcp_packets_in_flight(tp))
 +                      tcp_xmit_retransmit_queue(sk);
 +
 +              tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
                               0, GFP_ATOMIC);
 +      }
  }
  /*
   * One tasklet per cpu tries to send more skbs.
@@@ -925,7 -918,6 +925,7 @@@ static int tcp_transmit_skb(struct soc
                skb_mstamp_get(&skb->skb_mstamp);
                TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
                        - tp->snd_una;
 +              tcp_rate_skb_sent(sk, skb);
  
                if (unlikely(skb_cloned(skb)))
                        skb = pskb_copy(skb, gfp_mask);
@@@ -1221,9 -1213,6 +1221,9 @@@ int tcp_fragment(struct sock *sk, struc
        tcp_set_skb_tso_segs(skb, mss_now);
        tcp_set_skb_tso_segs(buff, mss_now);
  
 +      /* Update delivered info for the new segment */
 +      TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
 +
        /* If this packet has been sent out already, we must
         * adjust the various packet counters.
         */
@@@ -1369,7 -1358,6 +1369,7 @@@ int tcp_mss_to_mtu(struct sock *sk, in
        }
        return mtu;
  }
 +EXPORT_SYMBOL(tcp_mss_to_mtu);
  
  /* MTU probing init per socket */
  void tcp_mtup_init(struct sock *sk)
@@@ -1557,8 -1545,7 +1557,8 @@@ static bool tcp_nagle_check(bool partia
  /* Return how many segs we'd like on a TSO packet,
   * to send one TSO packet per ms
   */
 -static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
 +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 +                   int min_tso_segs)
  {
        u32 bytes, segs;
  
         * This preserves ACK clocking and is consistent
         * with tcp_tso_should_defer() heuristic.
         */
 -      segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
 +      segs = max_t(u32, bytes / mss_now, min_tso_segs);
  
        return min_t(u32, segs, sk->sk_gso_max_segs);
  }
 +EXPORT_SYMBOL(tcp_tso_autosize);
 +
 +/* Return the number of segments we want in the skb we are transmitting.
 + * See if congestion control module wants to decide; otherwise, autosize.
 + */
 +static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 +{
 +      const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
 +      u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
 +
 +      return tso_segs ? :
 +              tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs);
 +}
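
/*
 * User-space sketch of the autosizing above: size a TSO burst to roughly
 * one millisecond of data at the current pacing rate, never below a
 * floor and never above the device GSO limit.  The shift by 10 is used
 * here, as in the TSQ code below, as a cheap approximation of dividing
 * the per-second rate by 1000; the numbers in main() are made up.
 */
#include <stdio.h>

static unsigned int tso_autosize(unsigned long long pacing_rate, /* bytes/sec */
				 unsigned int mss,
				 unsigned int min_segs,
				 unsigned int gso_max_segs)
{
	unsigned long long bytes_per_ms = pacing_rate >> 10;
	unsigned long long segs = bytes_per_ms / mss;

	if (segs < min_segs)
		segs = min_segs;
	if (segs > gso_max_segs)
		segs = gso_max_segs;
	return (unsigned int)segs;
}

int main(void)
{
	/* ~10 Gbit/s pacing, 1448-byte MSS -> prints "843 segs" */
	printf("%u segs\n", tso_autosize(1250000000ULL, 1448, 2, 65535));
	return 0;
}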
  
  /* Returns the portion of skb which can be sent right away */
  static unsigned int tcp_mss_split_point(const struct sock *sk,
@@@ -1992,12 -1966,14 +1992,14 @@@ static int tcp_mtu_probe(struct sock *s
        len = 0;
        tcp_for_write_queue_from_safe(skb, next, sk) {
                copy = min_t(int, skb->len, probe_size - len);
-               if (nskb->ip_summed)
+               if (nskb->ip_summed) {
                        skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
-               else
-                       nskb->csum = skb_copy_and_csum_bits(skb, 0,
-                                                           skb_put(nskb, copy),
-                                                           copy, nskb->csum);
+               } else {
+                       __wsum csum = skb_copy_and_csum_bits(skb, 0,
+                                                            skb_put(nskb, copy),
+                                                            copy, 0);
+                       nskb->csum = csum_block_add(nskb->csum, csum, len);
+               }
  
                if (skb->len <= copy) {
                        /* We've eaten all the data from this skb.
        return -1;
  }
  
 +/* TCP Small Queues :
 + * Control the number of packets in qdisc/devices to two packets or ~1 ms
 + * worth of data (these limits are doubled for retransmits).
 + * This allows for:
 + *  - better RTT estimation and ACK scheduling
 + *  - faster recovery
 + *  - high rates
 + * Alas, some drivers / subsystems require a fair amount
 + * of queued bytes to ensure line rate.
 + * One example is wifi aggregation (802.11 AMPDU)
 + */
 +static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 +                                unsigned int factor)
 +{
 +      unsigned int limit;
 +
 +      limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
 +      limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 +      limit <<= factor;
 +
 +      if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 +              set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags);
 +              /* It is possible TX completion already happened
 +               * before we set TSQ_THROTTLED, so we must
 +               * test the condition again.
 +               */
 +              smp_mb__after_atomic();
 +              if (atomic_read(&sk->sk_wmem_alloc) > limit)
 +                      return true;
 +      }
 +      return false;
 +}
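
/*
 * User-space sketch of the TSQ limit computed above: roughly 1 ms of
 * data at the pacing rate, but at least two skbs worth, clamped by the
 * sysctl, and doubled (factor = 1) on the retransmit path.  The values
 * in main() are illustrative only.
 */
#include <stdio.h>

static unsigned long tsq_limit(unsigned long truesize,
			       unsigned long pacing_rate,	/* bytes/sec */
			       unsigned long sysctl_limit_bytes,
			       unsigned int factor)		/* 0 xmit, 1 rtx */
{
	unsigned long limit = pacing_rate >> 10;	/* ~1 ms of data */

	if (limit < 2 * truesize)
		limit = 2 * truesize;
	if (limit > sysctl_limit_bytes)
		limit = sysctl_limit_bytes;
	return limit << factor;
}

int main(void)
{
	printf("xmit limit: %lu bytes\n",
	       tsq_limit(2048, 1250000000UL, 262144, 0));
	printf("rtx  limit: %lu bytes\n",
	       tsq_limit(2048, 1250000000UL, 262144, 1));
	return 0;
}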
 +
  /* This routine writes packets to the network.  It advances the
   * send_head.  This happens as incoming acks open up the remote
   * window for us.
@@@ -2116,7 -2059,7 +2118,7 @@@ static bool tcp_write_xmit(struct sock 
                }
        }
  
 -      max_segs = tcp_tso_autosize(sk, mss_now);
 +      max_segs = tcp_tso_segs(sk, mss_now);
        while ((skb = tcp_send_head(sk))) {
                unsigned int limit;
  
                    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
                        break;
  
 -              /* TCP Small Queues :
 -               * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 -               * This allows for :
 -               *  - better RTT estimation and ACK scheduling
 -               *  - faster recovery
 -               *  - high rates
 -               * Alas, some drivers / subsystems require a fair amount
 -               * of queued bytes to ensure line rate.
 -               * One example is wifi aggregation (802.11 AMPDU)
 -               */
 -              limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
 -              limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 -
 -              if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 -                      set_bit(TSQ_THROTTLED, &tp->tsq_flags);
 -                      /* It is possible TX completion already happened
 -                       * before we set TSQ_THROTTLED, so we must
 -                       * test again the condition.
 -                       */
 -                      smp_mb__after_atomic();
 -                      if (atomic_read(&sk->sk_wmem_alloc) > limit)
 -                              break;
 -              }
 +              if (tcp_small_queue_check(sk, skb, 0))
 +                      break;
  
                if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
                        break;
@@@ -2813,9 -2777,9 +2815,9 @@@ void tcp_xmit_retransmit_queue(struct s
                last_lost = tp->snd_una;
        }
  
 -      max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
 +      max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
        tcp_for_write_queue_from(skb, sk) {
 -              __u8 sacked = TCP_SKB_CB(skb)->sacked;
 +              __u8 sacked;
                int segs;
  
                if (skb == tcp_send_head(sk))
                segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
                if (segs <= 0)
                        return;
 +              sacked = TCP_SKB_CB(skb)->sacked;
                /* In case tcp_shift_skb_data() has aggregated large skbs,
                 * we need to make sure we are not sending too big TSO packets
                 */
@@@ -2867,9 -2830,6 +2869,9 @@@ begin_fwd
                if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
                        continue;
  
 +              if (tcp_small_queue_check(sk, skb, 1))
 +                      return;
 +
                if (tcp_retransmit_skb(sk, skb, segs))
                        return;
  
diff --combined net/ipv6/ip6_gre.c
@@@ -61,12 -61,12 +61,12 @@@ static bool log_ecn_error = true
  module_param(log_ecn_error, bool, 0644);
  MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
  
 -#define HASH_SIZE_SHIFT  5
 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT)
 +#define IP6_GRE_HASH_SIZE_SHIFT  5
 +#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT)
  
  static int ip6gre_net_id __read_mostly;
  struct ip6gre_net {
 -      struct ip6_tnl __rcu *tunnels[4][HASH_SIZE];
 +      struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE];
  
        struct net_device *fb_tunnel_dev;
  };
@@@ -96,12 -96,12 +96,12 @@@ static void ip6gre_tnl_link_config(stru
     will match fallback tunnel.
   */
  
 -#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1))
 +#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1))
  static u32 HASH_ADDR(const struct in6_addr *addr)
  {
        u32 hash = ipv6_addr_hash(addr);
  
 -      return hash_32(hash, HASH_SIZE_SHIFT);
 +      return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT);
  }
  
  #define tunnels_r_l   tunnels[3]
@@@ -648,7 -648,6 +648,6 @@@ static int ip6gre_xmit_other(struct sk_
                encap_limit = t->parms.encap_limit;
  
        memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-       fl6.flowi6_proto = skb->protocol;
  
        err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
        if (err)
@@@ -1087,7 -1086,7 +1086,7 @@@ static void ip6gre_destroy_tunnels(stru
  
        for (prio = 0; prio < 4; prio++) {
                int h;
 -              for (h = 0; h < HASH_SIZE; h++) {
 +              for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
                        struct ip6_tnl *t;
  
                        t = rtnl_dereference(ign->tunnels[prio][h]);
@@@ -1239,7 -1238,7 +1238,7 @@@ static void ip6gre_netlink_parms(struc
                parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]);
  
        if (data[IFLA_GRE_FLOWINFO])
 -              parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]);
 +              parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]);
  
        if (data[IFLA_GRE_FLAGS])
                parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]);
diff --combined net/ipv6/route.c
@@@ -1147,16 -1147,15 +1147,16 @@@ static struct rt6_info *ip6_pol_route_i
        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
  }
  
 -static struct dst_entry *ip6_route_input_lookup(struct net *net,
 -                                              struct net_device *dev,
 -                                              struct flowi6 *fl6, int flags)
 +struct dst_entry *ip6_route_input_lookup(struct net *net,
 +                                       struct net_device *dev,
 +                                       struct flowi6 *fl6, int flags)
  {
        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
                flags |= RT6_LOOKUP_F_IFACE;
  
        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
  }
 +EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
  
  void ip6_route_input(struct sk_buff *skb)
  {
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
 -              .flowi6_iif = l3mdev_fib_oif(skb->dev),
 +              .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
@@@ -1189,15 -1188,12 +1189,15 @@@ static struct rt6_info *ip6_pol_route_o
  struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags)
  {
 -      struct dst_entry *dst;
        bool any_src;
  
 -      dst = l3mdev_get_rt6_dst(net, fl6);
 -      if (dst)
 -              return dst;
 +      if (rt6_need_strict(&fl6->daddr)) {
 +              struct dst_entry *dst;
 +
 +              dst = l3mdev_link_scope_lookup(net, fl6);
 +              if (dst)
 +                      return dst;
 +      }
  
        fl6->flowi6_iif = LOOPBACK_IFINDEX;
  
@@@ -1608,9 -1604,7 +1608,9 @@@ static unsigned int ip6_mtu(const struc
        rcu_read_unlock();
  
  out:
 -      return min_t(unsigned int, mtu, IP6_MAX_MTU);
 +      mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
 +
 +      return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
  }
  
  static struct dst_entry *icmp6_dst_gc_list;
@@@ -2571,16 -2565,8 +2571,16 @@@ struct rt6_info *addrconf_dst_alloc(str
  {
        u32 tb_id;
        struct net *net = dev_net(idev->dev);
 -      struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
 -                                          DST_NOCOUNT);
 +      struct net_device *dev = net->loopback_dev;
 +      struct rt6_info *rt;
 +
 +      /* use L3 Master device as loopback for host routes if device
 +       * is enslaved and address is not link local or multicast
 +       */
 +      if (!rt6_need_strict(addr))
 +              dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
 +
 +      rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
        if (!rt)
                return ERR_PTR(-ENOMEM);
  
@@@ -3216,7 -3202,9 +3216,9 @@@ static int rt6_fill_node(struct net *ne
        if (iif) {
  #ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
-                       int err = ip6mr_get_route(net, skb, rtm, nowait);
+                       int err = ip6mr_get_route(net, skb, rtm, nowait,
+                                                 portid);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
@@@ -3359,6 -3347,11 +3361,6 @@@ static int inet6_rtm_getroute(struct sk
        } else {
                fl6.flowi6_oif = oif;
  
 -              if (netif_index_is_l3_master(net, oif)) {
 -                      fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
 -                                         FLOWI_FLAG_SKIP_NH_OIF;
 -              }
 -
                rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
        }
  
diff --combined net/sched/act_ife.c
@@@ -53,7 -53,7 +53,7 @@@ int ife_tlv_meta_encode(void *skbdata, 
        u32 *tlv = (u32 *)(skbdata);
        u16 totlen = nla_total_size(dlen);      /* alignment + hdr */
        char *dptr = (char *)tlv + NLA_HDRLEN;
-       u32 htlv = attrtype << 16 | dlen;
+       u32 htlv = attrtype << 16 | (dlen + NLA_HDRLEN);
  
        *tlv = htonl(htlv);
        memset(dptr, 0, totlen - NLA_HDRLEN);
  }
  EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
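
/*
 * Sketch of the 32-bit TLV header built above, outside the kernel: the
 * attribute type sits in the upper 16 bits and the length in the lower
 * 16 bits, and (as the fix shows) the length covers the 4-byte header
 * plus the payload, netlink-style.  The 4-byte header length and the
 * type value used in main() are assumptions made for the sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_NLA_HDRLEN 4u

static uint32_t tlv_header(uint16_t attrtype, uint16_t dlen)
{
	return ((uint32_t)attrtype << 16) | (dlen + SKETCH_NLA_HDRLEN);
}

int main(void)
{
	uint32_t h = tlv_header(2 /* arbitrary type id */, 4 /* u32 payload */);

	/* prints "type=2 len=8": length includes the 4-byte header */
	printf("type=%u len=%u\n", (unsigned)(h >> 16), (unsigned)(h & 0xffff));
	return 0;
}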
  
 +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi)
 +{
 +      u16 edata = 0;
 +
 +      if (mi->metaval)
 +              edata = *(u16 *)mi->metaval;
 +      else if (metaval)
 +              edata = metaval;
 +
 +      if (!edata) /* will not encode */
 +              return 0;
 +
 +      edata = htons(edata);
 +      return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata);
 +}
 +EXPORT_SYMBOL_GPL(ife_encode_meta_u16);
 +
  int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
  {
        if (mi->metaval)
@@@ -98,15 -81,6 +98,15 @@@ int ife_check_meta_u32(u32 metaval, str
  }
  EXPORT_SYMBOL_GPL(ife_check_meta_u32);
  
 +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi)
 +{
 +      if (metaval || mi->metaval)
 +              return 8; /* T+L+(V) == 2+2+(2+2bytepad) */
 +
 +      return 0;
 +}
 +EXPORT_SYMBOL_GPL(ife_check_meta_u16);
 +
  int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
  {
        u32 edata = metaval;
@@@ -653,7 -627,7 +653,7 @@@ static int tcf_ife_decode(struct sk_buf
        struct tcf_ife_info *ife = to_ife(a);
        int action = ife->tcf_action;
        struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
-       u16 ifehdrln = ifehdr->metalen;
+       int ifehdrln = (int)ifehdr->metalen;
        struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
  
        spin_lock(&ife->tcf_lock);
@@@ -766,8 -740,6 +766,6 @@@ static int tcf_ife_encode(struct sk_buf
                return TC_ACT_SHOT;
        }
  
-       iethh = eth_hdr(skb);
        err = skb_cow_head(skb, hdrm);
        if (unlikely(err)) {
                ife->tcf_qstats.drops++;
        if (!(at & AT_EGRESS))
                skb_push(skb, skb->dev->hard_header_len);
  
+       iethh = (struct ethhdr *)skb->data;
        __skb_push(skb, hdrm);
        memcpy(skb->data, iethh, skb->mac_len);
        skb_reset_mac_header(skb);
diff --combined net/sctp/chunk.c
@@@ -70,19 -70,6 +70,19 @@@ static struct sctp_datamsg *sctp_datams
        return msg;
  }
  
 +void sctp_datamsg_free(struct sctp_datamsg *msg)
 +{
 +      struct sctp_chunk *chunk;
 +
 +      /* This doesn't have to be a _safe variant because
 +       * sctp_chunk_free() only drops the refs.
 +       */
 +      list_for_each_entry(chunk, &msg->chunks, frag_list)
 +              sctp_chunk_free(chunk);
 +
 +      sctp_datamsg_put(msg);
 +}
 +
   /* Final destruction of datamsg memory. */
  static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
  {
@@@ -192,13 -179,17 +192,18 @@@ struct sctp_datamsg *sctp_datamsg_from_
                         msg, msg->expires_at, jiffies);
        }
  
+       if (asoc->peer.prsctp_capable &&
+           SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
+               msg->expires_at =
+                       jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
        /* This is the biggest possible DATA chunk that can fit into
         * the packet
         */
 -      max_data = (asoc->pathmtu -
 -              sctp_sk(asoc->base.sk)->pf->af->net_header_len -
 -              sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3;
 +      max_data = asoc->pathmtu -
 +                 sctp_sk(asoc->base.sk)->pf->af->net_header_len -
 +                 sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk);
 +      max_data = SCTP_TRUNC4(max_data);
  
        max = asoc->frag_point;
        /* If the peer requested that we authenticate DATA chunks
                struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
  
                if (hmac_desc)
 -                      max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) +
 -                                          hmac_desc->hmac_len);
 +                      max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) +
 +                                            hmac_desc->hmac_len);
        }
  
        /* Now, check if we need to reduce our max */
            asoc->outqueue.out_qlen == 0 &&
            list_empty(&asoc->outqueue.retransmit) &&
            msg_len > max)
 -              max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t));
 +              max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t));
  
        /* Encourage Cookie-ECHO bundling. */
        if (asoc->state < SCTP_STATE_COOKIE_ECHOED)
@@@ -349,7 -340,7 +354,7 @@@ errout
  /* Check whether this message has expired. */
  int sctp_chunk_abandoned(struct sctp_chunk *chunk)
  {
-       if (!chunk->asoc->prsctp_enable ||
+       if (!chunk->asoc->peer.prsctp_capable ||
            !SCTP_PR_POLICY(chunk->sinfo.sinfo_flags)) {
                struct sctp_datamsg *msg = chunk->msg;
  
        }
  
        if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) &&
-           time_after(jiffies, chunk->prsctp_param)) {
+           time_after(jiffies, chunk->msg->expires_at)) {
                if (chunk->sent_count)
                        chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++;
                else
                        chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++;
                return 1;
        } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) &&
-                  chunk->sent_count > chunk->prsctp_param) {
+                  chunk->sent_count > chunk->sinfo.sinfo_timetolive) {
                chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++;
                return 1;
        }
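
/*
 * The WORD_ROUND -> SCTP_PAD4 / SCTP_TRUNC4 conversions above are plain
 * 4-byte alignment helpers.  A stand-alone sketch of the arithmetic,
 * with locally defined macros assuming the usual (x + 3) & ~3 rounding:
 */
#include <stdio.h>

#define PAD4(x)   (((x) + 3u) & ~3u)	/* round up to a multiple of 4 */
#define TRUNC4(x) ((x) & ~3u)		/* round down to a multiple of 4 */

int main(void)
{
	unsigned int chunklen = 21;

	/* prints "pad4(21)=24 trunc4(21)=20 padding=3" */
	printf("pad4(%u)=%u trunc4(%u)=%u padding=%u\n",
	       chunklen, PAD4(chunklen),
	       chunklen, TRUNC4(chunklen),
	       PAD4(chunklen) - chunklen);
	return 0;
}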
diff --combined net/sctp/outqueue.c
@@@ -68,7 -68,7 +68,7 @@@ static void sctp_mark_missing(struct sc
  
  static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
  
 -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
 +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
  
  /* Add data to the front of the queue. */
  static inline void sctp_outq_head_data(struct sctp_outq *q,
@@@ -285,9 -285,10 +285,9 @@@ void sctp_outq_free(struct sctp_outq *q
  }
  
  /* Put a new chunk in an sctp_outq.  */
 -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
 +void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
  {
        struct net *net = sock_net(q->asoc->base.sk);
 -      int error = 0;
  
        pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk,
                 chunk && chunk->chunk_hdr ?
         * immediately.
         */
        if (sctp_chunk_is_data(chunk)) {
 -              /* Is it OK to queue data chunks?  */
 -              /* From 9. Termination of Association
 -               *
 -               * When either endpoint performs a shutdown, the
 -               * association on each peer will stop accepting new
 -               * data from its user and only deliver data in queue
 -               * at the time of sending or receiving the SHUTDOWN
 -               * chunk.
 -               */
 -              switch (q->asoc->state) {
 -              case SCTP_STATE_CLOSED:
 -              case SCTP_STATE_SHUTDOWN_PENDING:
 -              case SCTP_STATE_SHUTDOWN_SENT:
 -              case SCTP_STATE_SHUTDOWN_RECEIVED:
 -              case SCTP_STATE_SHUTDOWN_ACK_SENT:
 -                      /* Cannot send after transport endpoint shutdown */
 -                      error = -ESHUTDOWN;
 -                      break;
 -
 -              default:
 -                      pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
 -                               __func__, q, chunk, chunk && chunk->chunk_hdr ?
 -                               sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
 -                               "illegal chunk");
 -
 -                      sctp_chunk_hold(chunk);
 -                      sctp_outq_tail_data(q, chunk);
 -                      if (chunk->asoc->peer.prsctp_capable &&
 -                          SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
 -                              chunk->asoc->sent_cnt_removable++;
 -                      if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
 -                              SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
 -                      else
 -                              SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
 -                      break;
 -              }
 +              pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n",
 +                       __func__, q, chunk, chunk && chunk->chunk_hdr ?
 +                       sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
 +                       "illegal chunk");
 +
 +              sctp_outq_tail_data(q, chunk);
-               if (chunk->asoc->prsctp_enable &&
++              if (chunk->asoc->peer.prsctp_capable &&
 +                  SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
 +                      chunk->asoc->sent_cnt_removable++;
 +              if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
 +                      SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
 +              else
 +                      SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS);
        } else {
                list_add_tail(&chunk->list, &q->control_chunk_list);
                SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
        }
  
 -      if (error < 0)
 -              return error;
 -
        if (!q->cork)
 -              error = sctp_outq_flush(q, 0, gfp);
 -
 -      return error;
 +              sctp_outq_flush(q, 0, gfp);
  }
  
  /* Insert a chunk into the sorted list based on the TSNs.  The retransmit list
@@@ -354,7 -383,7 +354,7 @@@ static int sctp_prsctp_prune_sent(struc
  
        list_for_each_entry_safe(chk, temp, queue, transmitted_list) {
                if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-                   chk->prsctp_param <= sinfo->sinfo_timetolive)
+                   chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
                        continue;
  
                list_del_init(&chk->transmitted_list);
@@@ -389,7 -418,7 +389,7 @@@ static int sctp_prsctp_prune_unsent(str
  
        list_for_each_entry_safe(chk, temp, queue, list) {
                if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) ||
-                   chk->prsctp_param <= sinfo->sinfo_timetolive)
+                   chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive)
                        continue;
  
                list_del_init(&chk->list);
@@@ -413,7 -442,7 +413,7 @@@ void sctp_prsctp_prune(struct sctp_asso
  {
        struct sctp_transport *transport;
  
-       if (!asoc->prsctp_enable || !asoc->sent_cnt_removable)
+       if (!asoc->peer.prsctp_capable || !asoc->sent_cnt_removable)
                return;
  
        msg_len = sctp_prsctp_prune_sent(asoc, sinfo,
@@@ -530,6 -559,7 +530,6 @@@ void sctp_retransmit(struct sctp_outq *
                     sctp_retransmit_reason_t reason)
  {
        struct net *net = sock_net(q->asoc->base.sk);
 -      int error = 0;
  
        switch (reason) {
        case SCTP_RTXR_T3_RTX:
         * will be flushed at the end.
         */
        if (reason != SCTP_RTXR_FAST_RTX)
 -              error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
 -
 -      if (error)
 -              q->asoc->base.sk->sk_err = -error;
 +              sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
  }
  
  /*
@@@ -745,12 -778,12 +745,12 @@@ redo
  }
  
  /* Cork the outqueue so queued chunks are really queued. */
 -int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
 +void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
  {
        if (q->cork)
                q->cork = 0;
  
 -      return sctp_outq_flush(q, 0, gfp);
 +      sctp_outq_flush(q, 0, gfp);
  }
  
  
   * locking concerns must be made.  Today we use the sock lock to protect
   * this function.
   */
 -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
 +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
  {
        struct sctp_packet *packet;
        struct sctp_packet singleton;
                        sctp_packet_config(&singleton, vtag, 0);
                        sctp_packet_append_chunk(&singleton, chunk);
                        error = sctp_packet_transmit(&singleton, gfp);
 -                      if (error < 0)
 -                              return error;
 +                      if (error < 0) {
 +                              asoc->base.sk->sk_err = -error;
 +                              return;
 +                      }
                        break;
  
                case SCTP_CID_ABORT:
                retran:
                        error = sctp_outq_flush_rtx(q, packet,
                                                    rtx_timeout, &start_timer);
 +                      if (error < 0)
 +                              asoc->base.sk->sk_err = -error;
  
                        if (start_timer) {
                                sctp_transport_reset_t3_rtx(transport);
  
                                /* Mark as failed send. */
                                sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM);
-                               if (asoc->prsctp_enable &&
+                               if (asoc->peer.prsctp_capable &&
                                    SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
                                        asoc->sent_cnt_removable--;
                                sctp_chunk_free(chunk);
@@@ -1163,15 -1192,14 +1163,15 @@@ sctp_flush_out
                                                      struct sctp_transport,
                                                      send_ready);
                packet = &t->packet;
 -              if (!sctp_packet_empty(packet))
 +              if (!sctp_packet_empty(packet)) {
                        error = sctp_packet_transmit(packet, gfp);
 +                      if (error < 0)
 +                              asoc->base.sk->sk_err = -error;
 +              }
  
                /* Clear the burst limited state, if any */
                sctp_transport_burst_reset(t);
        }
 -
 -      return error;
  }
  
  /* Update unack_data based on the incoming SACK chunk */
@@@ -1319,7 -1347,7 +1319,7 @@@ int sctp_outq_sack(struct sctp_outq *q
                tsn = ntohl(tchunk->subh.data_hdr->tsn);
                if (TSN_lte(tsn, ctsn)) {
                        list_del_init(&tchunk->transmitted_list);
-                       if (asoc->prsctp_enable &&
+                       if (asoc->peer.prsctp_capable &&
                            SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags))
                                asoc->sent_cnt_removable--;
                        sctp_chunk_free(tchunk);
@@@ -1719,7 -1747,7 +1719,7 @@@ static int sctp_acked(struct sctp_sackh
  {
        int i;
        sctp_sack_variable_t *frags;
 -      __u16 gap;
 +      __u16 tsn_offset, blocks;
        __u32 ctsn = ntohl(sack->cum_tsn_ack);
  
        if (TSN_lte(tsn, ctsn))
         */
  
        frags = sack->variable;
 -      gap = tsn - ctsn;
 -      for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) {
 -              if (TSN_lte(ntohs(frags[i].gab.start), gap) &&
 -                  TSN_lte(gap, ntohs(frags[i].gab.end)))
 +      blocks = ntohs(sack->num_gap_ack_blocks);
 +      tsn_offset = tsn - ctsn;
 +      for (i = 0; i < blocks; ++i) {
 +              if (tsn_offset >= ntohs(frags[i].gab.start) &&
 +                  tsn_offset <= ntohs(frags[i].gab.end))
                        goto pass;
        }
  
diff --combined net/sctp/sctp_diag.c
@@@ -106,8 -106,7 +106,8 @@@ static int inet_sctp_diag_fill(struct s
                               const struct inet_diag_req_v2 *req,
                               struct user_namespace *user_ns,
                               int portid, u32 seq, u16 nlmsg_flags,
 -                             const struct nlmsghdr *unlh)
 +                             const struct nlmsghdr *unlh,
 +                             bool net_admin)
  {
        struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct list_head *addr_list;
                r->idiag_retrans = 0;
        }
  
 -      if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
 +      if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
                goto errout;
  
        if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
@@@ -204,7 -203,6 +204,7 @@@ struct sctp_comm_param 
        struct netlink_callback *cb;
        const struct inet_diag_req_v2 *r;
        const struct nlmsghdr *nlh;
 +      bool net_admin;
  };
  
  static size_t inet_assoc_attr_size(struct sctp_association *asoc)
                + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
                + nla_total_size(1) /* INET_DIAG_TOS */
                + nla_total_size(1) /* INET_DIAG_TCLASS */
 +              + nla_total_size(4) /* INET_DIAG_MARK */
                + nla_total_size(addrlen * asoc->peer.transport_count)
                + nla_total_size(addrlen * addrcnt)
                + nla_total_size(sizeof(struct inet_diag_meminfo))
@@@ -259,8 -256,7 +259,8 @@@ static int sctp_tsp_dump_one(struct sct
        err = inet_sctp_diag_fill(sk, assoc, rep, req,
                                  sk_user_ns(NETLINK_CB(in_skb).sk),
                                  NETLINK_CB(in_skb).portid,
 -                                nlh->nlmsg_seq, 0, nlh);
 +                                nlh->nlmsg_seq, 0, nlh,
 +                                commp->net_admin);
        release_sock(sk);
        if (err < 0) {
                WARN_ON(err == -EMSGSIZE);
@@@ -276,28 -272,17 +276,17 @@@ out
        return err;
  }
  
- static int sctp_tsp_dump(struct sctp_transport *tsp, void *p)
+ static int sctp_sock_dump(struct sock *sk, void *p)
  {
-       struct sctp_endpoint *ep = tsp->asoc->ep;
+       struct sctp_endpoint *ep = sctp_sk(sk)->ep;
        struct sctp_comm_param *commp = p;
-       struct sock *sk = ep->base.sk;
        struct sk_buff *skb = commp->skb;
        struct netlink_callback *cb = commp->cb;
        const struct inet_diag_req_v2 *r = commp->r;
-       struct sctp_association *assoc =
-               list_entry(ep->asocs.next, struct sctp_association, asocs);
+       struct sctp_association *assoc;
        int err = 0;
  
-       /* find the ep only once through the transports by this condition */
-       if (tsp->asoc != assoc)
-               goto out;
-       if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
-               goto out;
        lock_sock(sk);
-       if (sk != assoc->base.sk)
-               goto release;
        list_for_each_entry(assoc, &ep->asocs, asocs) {
                if (cb->args[4] < cb->args[1])
                        goto next;
                                        sk_user_ns(NETLINK_CB(cb->skb).sk),
                                        NETLINK_CB(cb->skb).portid,
                                        cb->nlh->nlmsg_seq,
 -                                      NLM_F_MULTI, cb->nlh) < 0) {
 +                                      NLM_F_MULTI, cb->nlh,
 +                                      commp->net_admin) < 0) {
                        cb->args[3] = 1;
-                       err = 2;
+                       err = 1;
                        goto release;
                }
                cb->args[3] = 1;
                if (inet_sctp_diag_fill(sk, assoc, skb, r,
                                        sk_user_ns(NETLINK_CB(cb->skb).sk),
                                        NETLINK_CB(cb->skb).portid,
 -                                      cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) {
 +                                      cb->nlh->nlmsg_seq, 0, cb->nlh,
 +                                      commp->net_admin) < 0) {
-                       err = 2;
+                       err = 1;
                        goto release;
                }
  next:
        cb->args[4] = 0;
  release:
        release_sock(sk);
+       sock_put(sk);
        return err;
+ }
+ static int sctp_get_sock(struct sctp_transport *tsp, void *p)
+ {
+       struct sctp_endpoint *ep = tsp->asoc->ep;
+       struct sctp_comm_param *commp = p;
+       struct sock *sk = ep->base.sk;
+       struct netlink_callback *cb = commp->cb;
+       const struct inet_diag_req_v2 *r = commp->r;
+       struct sctp_association *assoc =
+               list_entry(ep->asocs.next, struct sctp_association, asocs);
+       /* find the ep only once through the transports by this condition */
+       if (tsp->asoc != assoc)
+               goto out;
+       if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
+               goto out;
+       sock_hold(sk);
+       cb->args[5] = (long)sk;
+       return 1;
  out:
        cb->args[2]++;
-       return err;
+       return 0;
  }
  
  static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
                                sk_user_ns(NETLINK_CB(cb->skb).sk),
                                NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, NLM_F_MULTI,
 -                              cb->nlh) < 0) {
 +                              cb->nlh, commp->net_admin) < 0) {
                err = 2;
                goto out;
        }
@@@ -418,7 -426,6 +432,7 @@@ static int sctp_diag_dump_one(struct sk
                .skb = in_skb,
                .r = req,
                .nlh = nlh,
 +              .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
        };
  
        if (req->sdiag_family == AF_INET) {
@@@ -454,7 -461,6 +468,7 @@@ static void sctp_diag_dump(struct sk_bu
                .skb = skb,
                .cb = cb,
                .r = r,
 +              .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
        };
  
        /* eps hashtable dumps
@@@ -480,10 -486,18 +494,18 @@@ skip
         * 2 : to record the transport pos of this time's traversal
         * 3 : to mark if we have dumped the ep info of the current asoc
         * 4 : to work as a temporary variable to traversal list
+        * 5 : to save the sk we get from traversing the tsp list.
         */
        if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
                goto done;
-       sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp);
+ next:
+       cb->args[5] = 0;
+       sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp);
+       if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp))
+               goto next;
  done:
        cb->args[1] = cb->args[4];
        cb->args[4] = 0;
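
/*
 * Sketch of the resumable-dump pattern introduced above, in plain C: the
 * walk callback skips items that fail a filter, stashes the first match
 * in a cursor and returns non-zero to stop the walk; the caller then
 * processes the item outside the walk and resumes from the saved
 * position.  The names and the parity filter are illustrative, not the
 * kernel API.
 */
#include <stdio.h>

struct cursor {
	int pos;	/* like cb->args[2]: where to resume the walk */
	int item;	/* like cb->args[5]: the object handed back   */
};

static int get_one(const int *items, int n, struct cursor *c)
{
	for (; c->pos < n; c->pos++) {
		if (items[c->pos] % 2)	/* filter, like the family check */
			continue;
		c->item = items[c->pos];
		return 1;		/* stop the walk, item saved */
	}
	return 0;			/* nothing left */
}

int main(void)
{
	int items[] = { 3, 1, 4, 1, 6 };
	struct cursor c = { 0, 0 };

	while (get_one(items, 5, &c)) {
		printf("dumping %d\n", c.item);	/* done outside the walk */
		c.pos++;			/* advance before resuming */
	}
	return 0;
}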
diff --combined net/sctp/sm_make_chunk.c
@@@ -253,7 -253,7 +253,7 @@@ struct sctp_chunk *sctp_make_init(cons
        num_types = sp->pf->supported_addrs(sp, types);
  
        chunksize = sizeof(init) + addrs_len;
 -      chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types));
 +      chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types));
        chunksize += sizeof(ecap_param);
  
        if (asoc->prsctp_enable)
                /* Add HMACS parameter length if any were defined */
                auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
                if (auth_hmacs->length)
 -                      chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
 +                      chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
                else
                        auth_hmacs = NULL;
  
                /* Add CHUNKS parameter length */
                auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
                if (auth_chunks->length)
 -                      chunksize += WORD_ROUND(ntohs(auth_chunks->length));
 +                      chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
                else
                        auth_chunks = NULL;
  
  
        /* If we have any extensions to report, account for that */
        if (num_ext)
 -              chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
 -                                      num_ext);
 +              chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
 +                                     num_ext);
  
        /* RFC 2960 3.3.2 Initiation (INIT) (1)
         *
@@@ -443,13 -443,13 +443,13 @@@ struct sctp_chunk *sctp_make_init_ack(c
  
                auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs;
                if (auth_hmacs->length)
 -                      chunksize += WORD_ROUND(ntohs(auth_hmacs->length));
 +                      chunksize += SCTP_PAD4(ntohs(auth_hmacs->length));
                else
                        auth_hmacs = NULL;
  
                auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks;
                if (auth_chunks->length)
 -                      chunksize += WORD_ROUND(ntohs(auth_chunks->length));
 +                      chunksize += SCTP_PAD4(ntohs(auth_chunks->length));
                else
                        auth_chunks = NULL;
  
        }
  
        if (num_ext)
 -              chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) +
 -                                      num_ext);
 +              chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) +
 +                                     num_ext);
  
        /* Now allocate and fill out the chunk.  */
        retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
@@@ -706,20 -706,6 +706,6 @@@ nodata
        return retval;
  }
  
- static void sctp_set_prsctp_policy(struct sctp_chunk *chunk,
-                                  const struct sctp_sndrcvinfo *sinfo)
- {
-       if (!chunk->asoc->prsctp_enable)
-               return;
-       if (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags))
-               chunk->prsctp_param =
-                       jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive);
-       else if (SCTP_PR_RTX_ENABLED(sinfo->sinfo_flags) ||
-                SCTP_PR_PRIO_ENABLED(sinfo->sinfo_flags))
-               chunk->prsctp_param = sinfo->sinfo_timetolive;
- }
  /* Make a DATA chunk for the given association from the provided
   * parameters.  However, do not populate the data payload.
   */
@@@ -753,7 -739,6 +739,6 @@@ struct sctp_chunk *sctp_make_datafrag_e
  
        retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp);
        memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo));
-       sctp_set_prsctp_policy(retval, sinfo);
  
  nodata:
        return retval;
@@@ -1390,7 -1375,7 +1375,7 @@@ static struct sctp_chunk *_sctp_make_ch
        struct sock *sk;
  
        /* No need to allocate LL here, as this is only a chunk. */
 -      skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
 +      skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp);
        if (!skb)
                goto nodata;
  
@@@ -1482,7 -1467,7 +1467,7 @@@ void *sctp_addto_chunk(struct sctp_chun
        void *target;
        void *padding;
        int chunklen = ntohs(chunk->chunk_hdr->length);
 -      int padlen = WORD_ROUND(chunklen) - chunklen;
 +      int padlen = SCTP_PAD4(chunklen) - chunklen;
  
        padding = skb_put(chunk->skb, padlen);
        target = skb_put(chunk->skb, len);
@@@ -1900,7 -1885,7 +1885,7 @@@ static int sctp_process_missing_param(c
        struct __sctp_missing report;
        __u16 len;
  
 -      len = WORD_ROUND(sizeof(report));
 +      len = SCTP_PAD4(sizeof(report));
  
        /* Make an ERROR chunk, preparing enough room for
         * returning multiple unknown parameters.
@@@ -2098,9 -2083,9 +2083,9 @@@ static sctp_ierror_t sctp_process_unk_p
  
                if (*errp) {
                        if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
 -                                      WORD_ROUND(ntohs(param.p->length))))
 +                                      SCTP_PAD4(ntohs(param.p->length))))
                                sctp_addto_chunk_fixed(*errp,
 -                                              WORD_ROUND(ntohs(param.p->length)),
 +                                              SCTP_PAD4(ntohs(param.p->length)),
                                                param.v);
                } else {
                        /* If there is no memory for generating the ERROR
diff --combined net/sctp/socket.c
@@@ -1958,8 -1958,6 +1958,8 @@@ static int sctp_sendmsg(struct sock *sk
  
        /* Now send the (possibly) fragmented message. */
        list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
 +              sctp_chunk_hold(chunk);
 +
                /* Do accounting for the write space.  */
                sctp_set_owner_w(chunk);
  
         * breaks.
         */
        err = sctp_primitive_SEND(net, asoc, datamsg);
 -      sctp_datamsg_put(datamsg);
        /* Did the lower layer accept the chunk? */
 -      if (err)
 +      if (err) {
 +              sctp_datamsg_free(datamsg);
                goto out_free;
 +      }
  
        pr_debug("%s: we sent primitively\n", __func__);
  
 +      sctp_datamsg_put(datamsg);
        err = msg_len;
  
        if (unlikely(wait_connect)) {
@@@ -4473,17 -4469,21 +4473,21 @@@ int sctp_transport_lookup_process(int (
                                  const union sctp_addr *paddr, void *p)
  {
        struct sctp_transport *transport;
-       int err = 0;
+       int err = -ENOENT;
  
        rcu_read_lock();
        transport = sctp_addrs_lookup_transport(net, laddr, paddr);
        if (!transport || !sctp_transport_hold(transport))
                goto out;
-       err = cb(transport, p);
+       sctp_association_hold(transport->asoc);
        sctp_transport_put(transport);
  
- out:
        rcu_read_unlock();
+       err = cb(transport, p);
+       sctp_association_put(transport->asoc);
+ out:
        return err;
  }
  EXPORT_SYMBOL_GPL(sctp_transport_lookup_process);