lib/netlink-socket.c

   1 /*
   2  * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "netlink-socket.h"
  19 #include <errno.h>
  20 #include <inttypes.h>
  21 #include <stdlib.h>
  22 #include <sys/types.h>
  23 #include <sys/uio.h>
  24 #include <unistd.h>
  25 #include "coverage.h"
  26 #include "dynamic-string.h"
  27 #include "hash.h"
  28 #include "hmap.h"
  29 #include "netlink.h"
  30 #include "netlink-protocol.h"
  31 #include "odp-netlink.h"
  32 #include "ofpbuf.h"
  33 #include "ovs-thread.h"
  34 #include "poll-loop.h"
  35 #include "seq.h"
  36 #include "socket-util.h"
  37 #include "util.h"
  38 #include "vlog.h"
  39
VLOG_DEFINE_THIS_MODULE(netlink_socket);

/* Coverage counters bumped by the send and receive paths in this file. */
COVERAGE_DEFINE(netlink_overflow);   /* A receive failed with ENOBUFS. */
COVERAGE_DEFINE(netlink_received);   /* A message was received. */
COVERAGE_DEFINE(netlink_recv_jumbo); /* A reply spilled past the buffer. */
COVERAGE_DEFINE(netlink_sent);       /* A message was sent. */
  46
  47 /* Linux header file confusion causes this to be undefined. */
  48 #ifndef SOL_NETLINK
  49 #define SOL_NETLINK 270
  50 #endif
  51
  52 #ifdef _WIN32
  53 static struct ovs_mutex portid_mutex = OVS_MUTEX_INITIALIZER;
  54 static uint32_t g_last_portid = 0;
  55
  56 /* Port IDs must be unique! */
  57 static uint32_t
  58 portid_next(void)
  59     OVS_GUARDED_BY(portid_mutex)
  60 {
  61     g_last_portid++;
  62     return g_last_portid;
  63 }
  64 #endif /* _WIN32 */
  65
  66 /* A single (bad) Netlink message can in theory dump out many, many log
  67  * messages, so the burst size is set quite high here to avoid missing useful
  68  * information.  Also, at high logging levels we log *all* Netlink messages. */
  69 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
  70
  71 static uint32_t nl_sock_allocate_seq(struct nl_sock *, unsigned int n);
  72 static void log_nlmsg(const char *function, int error,
  73                       const void *message, size_t size, int protocol);
  74 #ifdef _WIN32
  75 static int get_sock_pid_from_kernel(struct nl_sock *sock, uint32_t *pid);
  76 #endif
  77 \f
  78 /* Netlink sockets. */
  79
/* State for one Netlink socket, created by nl_sock_create(). */
struct nl_sock {
#ifdef _WIN32
    HANDLE handle;              /* Device handle from CreateFileA(). */
#else
    int fd;                     /* AF_NETLINK socket file descriptor. */
#endif
    uint32_t next_seq;          /* Set to 1 by nl_sock_create(). */
    uint32_t pid;               /* Copied into nlmsg_pid of sent messages. */
    int protocol;               /* Netlink protocol given at creation. */
    unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF). */
};
  91
  92 /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
  93  * of iovecs on the stack. */
  94 #define MAX_IOVS 128
  95
  96 /* Maximum number of iovecs that may be passed to sendmsg, capped at a
  97  * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
  98  *
  99  * Initialized by nl_sock_create(). */
 100 static int max_iovs;
 101
 102 static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
 103 static void nl_pool_release(struct nl_sock *);
 104
 105 /* Creates a new netlink socket for the given netlink 'protocol'
 106  * (NETLINK_ROUTE, NETLINK_GENERIC, ...).  Returns 0 and sets '*sockp' to the
 107  * new socket if successful, otherwise returns a positive errno value. */
 108 int
 109 nl_sock_create(int protocol, struct nl_sock **sockp)
 110 {
 111     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
 112     struct nl_sock *sock;
 113 #ifndef _WIN32
 114     struct sockaddr_nl local, remote;
 115 #endif
 116     socklen_t local_size;
 117     int rcvbuf;
 118     int retval = 0;
 119
 120     if (ovsthread_once_start(&once)) {
 121         int save_errno = errno;
 122         errno = 0;
 123
 124         max_iovs = sysconf(_SC_UIO_MAXIOV);
 125         if (max_iovs < _XOPEN_IOV_MAX) {
 126             if (max_iovs == -1 && errno) {
 127                 VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", ovs_strerror(errno));
 128             }
 129             max_iovs = _XOPEN_IOV_MAX;
 130         } else if (max_iovs > MAX_IOVS) {
 131             max_iovs = MAX_IOVS;
 132         }
 133
 134         errno = save_errno;
 135         ovsthread_once_done(&once);
 136     }
 137
 138     *sockp = NULL;
 139     sock = xmalloc(sizeof *sock);
 140
 141 #ifdef _WIN32
 142     sock->handle = CreateFileA("\\\\.\\OpenVSwitchDevice",
 143                                GENERIC_READ | GENERIC_WRITE,
 144                                FILE_SHARE_READ | FILE_SHARE_WRITE,
 145                                NULL, OPEN_EXISTING,
 146                                FILE_ATTRIBUTE_NORMAL, NULL);
 147
 148     int last_error = GetLastError();
 149
 150     if (sock->handle == INVALID_HANDLE_VALUE) {
 151         VLOG_ERR("fcntl: %s", ovs_strerror(last_error));
 152         goto error;
 153     }
 154 #else
 155     sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
 156     if (sock->fd < 0) {
 157         VLOG_ERR("fcntl: %s", ovs_strerror(errno));
 158         goto error;
 159     }
 160 #endif
 161
 162     sock->protocol = protocol;
 163     sock->next_seq = 1;
 164
 165     rcvbuf = 1024 * 1024;
 166 #ifdef _WIN32
 167     sock->rcvbuf = rcvbuf;
 168     retval = get_sock_pid_from_kernel(sock, &sock->pid);
 169     if (retval != 0) {
 170         goto error;
 171     }
 172 #else
 173     if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUFFORCE,
 174                    &rcvbuf, sizeof rcvbuf)) {
 175         /* Only root can use SO_RCVBUFFORCE.  Everyone else gets EPERM.
 176          * Warn only if the failure is therefore unexpected. */
 177         if (errno != EPERM) {
 178             VLOG_WARN_RL(&rl, "setting %d-byte socket receive buffer failed "
 179                          "(%s)", rcvbuf, ovs_strerror(errno));
 180         }
 181     }
 182
 183     retval = get_socket_rcvbuf(sock->fd);
 184     if (retval < 0) {
 185         retval = -retval;
 186         goto error;
 187     }
 188     sock->rcvbuf = retval;
 189
 190     /* Connect to kernel (pid 0) as remote address. */
 191     memset(&remote, 0, sizeof remote);
 192     remote.nl_family = AF_NETLINK;
 193     remote.nl_pid = 0;
 194     if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) {
 195         VLOG_ERR("connect(0): %s", ovs_strerror(errno));
 196         goto error;
 197     }
 198
 199     /* Obtain pid assigned by kernel. */
 200     local_size = sizeof local;
 201     if (getsockname(sock->fd, (struct sockaddr *) &local, &local_size) < 0) {
 202         VLOG_ERR("getsockname: %s", ovs_strerror(errno));
 203         goto error;
 204     }
 205     if (local_size < sizeof local || local.nl_family != AF_NETLINK) {
 206         VLOG_ERR("getsockname returned bad Netlink name");
 207         retval = EINVAL;
 208         goto error;
 209     }
 210     sock->pid = local.nl_pid;
 211 #endif
 212
 213     *sockp = sock;
 214     return 0;
 215
 216 error:
 217     if (retval == 0) {
 218         retval = errno;
 219         if (retval == 0) {
 220             retval = EINVAL;
 221         }
 222     }
 223 #ifdef _WIN32
 224     if (sock->handle != INVALID_HANDLE_VALUE) {
 225         CloseHandle(sock->handle);
 226     }
 227 #else
 228     if (sock->fd >= 0) {
 229         close(sock->fd);
 230     }
 231 #endif
 232     free(sock);
 233     return retval;
 234 }
 235
 236 /* Creates a new netlink socket for the same protocol as 'src'.  Returns 0 and
 237  * sets '*sockp' to the new socket if successful, otherwise returns a positive
 238  * errno value.  */
 239 int
 240 nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
 241 {
 242     return nl_sock_create(src->protocol, sockp);
 243 }
 244
 245 /* Destroys netlink socket 'sock'. */
 246 void
 247 nl_sock_destroy(struct nl_sock *sock)
 248 {
 249     if (sock) {
 250 #ifdef _WIN32
 251         CloseHandle(sock->handle);
 252 #else
 253         close(sock->fd);
 254 #endif
 255         free(sock);
 256     }
 257 }
 258
 259 #ifdef _WIN32
 260 /* Reads the pid for 'sock' generated in the kernel datapath. The function
 261  * follows a transaction semantic. Eventually this function should call into
 262  * nl_transact. */
 263 static int
 264 get_sock_pid_from_kernel(struct nl_sock *sock, uint32_t *pid)
 265 {
 266     struct nl_transaction txn;
 267     struct ofpbuf request;
 268     uint64_t request_stub[128];
 269     struct ofpbuf reply;
 270     uint64_t reply_stub[128];
 271     struct ovs_header *ovs_header;
 272     struct nlmsghdr *nlmsg;
 273     uint32_t seq;
 274     int retval;
 275     DWORD bytes;
 276     int ovs_msg_size = sizeof (struct nlmsghdr) + sizeof (struct genlmsghdr) +
 277                        sizeof (struct ovs_header);
 278
 279     ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
 280     txn.request = &request;
 281     ofpbuf_use_stub(&reply, reply_stub, sizeof reply_stub);
 282     txn.reply = &reply;
 283
 284     seq = nl_sock_allocate_seq(sock, 1);
 285     nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
 286                           OVS_CTRL_CMD_WIN_GET_PID, OVS_WIN_CONTROL_VERSION);
 287     nlmsg = nl_msg_nlmsghdr(txn.request);
 288     nlmsg->nlmsg_seq = seq;
 289
 290     ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
 291     ovs_header->dp_ifindex = 0;
 292     ovs_header = ofpbuf_put_uninit(&reply, ovs_msg_size);
 293
 294     if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
 295                          ofpbuf_data(txn.request), ofpbuf_size(txn.request),
 296                          ofpbuf_data(txn.reply), ofpbuf_size(txn.reply),
 297                          &bytes, NULL)) {
 298         retval = EINVAL;
 299         goto done;
 300     } else {
 301         if (bytes < ovs_msg_size) {
 302             retval = EINVAL;
 303             goto done;
 304         }
 305
 306         nlmsg = nl_msg_nlmsghdr(txn.request);
 307         if (nlmsg->nlmsg_seq != seq) {
 308             retval = EINVAL;
 309             goto done;
 310         }
 311         *pid = nlmsg->nlmsg_pid;
 312     }
 313     retval = 0;
 314
 315 done:
 316     ofpbuf_uninit(&request);
 317     ofpbuf_uninit(&reply);
 318     return retval;
 319 }
 320 #endif  /* _WIN32 */
 321
 322 /* Tries to add 'sock' as a listener for 'multicast_group'.  Returns 0 if
 323  * successful, otherwise a positive errno value.
 324  *
 325  * A socket that is subscribed to a multicast group that receives asynchronous
 326  * notifications must not be used for Netlink transactions or dumps, because
 327  * transactions and dumps can cause notifications to be lost.
 328  *
 329  * Multicast group numbers are always positive.
 330  *
 331  * It is not an error to attempt to join a multicast group to which a socket
 332  * already belongs. */
 333 int
 334 nl_sock_join_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
 335 {
 336 #ifdef _WIN32
 337 #define OVS_VPORT_MCGROUP_FALLBACK_ID 33
 338     struct ofpbuf msg_buf;
 339     struct message_multicast
 340     {
 341         struct nlmsghdr;
 342         /* if true, join; if else, leave */
 343         unsigned char join;
 344         unsigned int groupId;
 345     };
 346
 347     struct message_multicast msg = { 0 };
 348
 349     msg.nlmsg_len = sizeof(struct message_multicast);
 350     msg.nlmsg_type = OVS_VPORT_MCGROUP_FALLBACK_ID;
 351     msg.nlmsg_flags = 0;
 352     msg.nlmsg_seq = 0;
 353     msg.nlmsg_pid = sock->pid;
 354
 355     msg.join = 1;
 356     msg.groupId = multicast_group;
 357     msg_buf.base_ = &msg;
 358     msg_buf.data_ = &msg;
 359     msg_buf.size_ = msg.nlmsg_len;
 360
 361     nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
 362 #else
 363     if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
 364                    &multicast_group, sizeof multicast_group) < 0) {
 365         VLOG_WARN("could not join multicast group %u (%s)",
 366                   multicast_group, ovs_strerror(errno));
 367         return errno;
 368     }
 369 #endif
 370     return 0;
 371 }
 372
 373 /* Tries to make 'sock' stop listening to 'multicast_group'.  Returns 0 if
 374  * successful, otherwise a positive errno value.
 375  *
 376  * Multicast group numbers are always positive.
 377  *
 378  * It is not an error to attempt to leave a multicast group to which a socket
 379  * does not belong.
 380  *
 381  * On success, reading from 'sock' will still return any messages that were
 382  * received on 'multicast_group' before the group was left. */
 383 int
 384 nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
 385 {
 386 #ifdef _WIN32
 387     struct ofpbuf msg_buf;
 388     struct message_multicast
 389     {
 390         struct nlmsghdr;
 391         /* if true, join; if else, leave*/
 392         unsigned char join;
 393     };
 394
 395     struct message_multicast msg = { 0 };
 396     nl_msg_put_nlmsghdr(&msg, sizeof(struct message_multicast),
 397                         multicast_group, 0);
 398     msg.join = 0;
 399
 400     msg_buf.base_ = &msg;
 401     msg_buf.data_ = &msg;
 402     msg_buf.size_ = msg.nlmsg_len;
 403
 404     nl_sock_send__(sock, &msg_buf, msg.nlmsg_seq, 0);
 405 #else
 406     if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP,
 407                    &multicast_group, sizeof multicast_group) < 0) {
 408         VLOG_WARN("could not leave multicast group %u (%s)",
 409                   multicast_group, ovs_strerror(errno));
 410         return errno;
 411     }
 412 #endif
 413     return 0;
 414 }
 415
 416 static int
 417 nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
 418                uint32_t nlmsg_seq, bool wait)
 419 {
 420     struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
 421     int error;
 422
 423     nlmsg->nlmsg_len = ofpbuf_size(msg);
 424     nlmsg->nlmsg_seq = nlmsg_seq;
 425     nlmsg->nlmsg_pid = sock->pid;
 426     do {
 427         int retval;
 428 #ifdef _WIN32
 429         bool result;
 430         DWORD last_error = 0;
 431         result = WriteFile(sock->handle, ofpbuf_data(msg), ofpbuf_size(msg),
 432                            &retval, NULL);
 433         last_error = GetLastError();
 434         if (last_error != ERROR_SUCCESS && !result) {
 435             retval = -1;
 436             errno = EAGAIN;
 437         }
 438 #else
 439         retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
 440 #endif
 441         error = retval < 0 ? errno : 0;
 442     } while (error == EINTR);
 443     log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
 444     if (!error) {
 445         COVERAGE_INC(netlink_sent);
 446     }
 447     return error;
 448 }
 449
 450 /* Tries to send 'msg', which must contain a Netlink message, to the kernel on
 451  * 'sock'.  nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
 452  * will be set to 'sock''s pid, and nlmsg_seq will be initialized to a fresh
 453  * sequence number, before the message is sent.
 454  *
 455  * Returns 0 if successful, otherwise a positive errno value.  If
 456  * 'wait' is true, then the send will wait until buffer space is ready;
 457  * otherwise, returns EAGAIN if the 'sock' send buffer is full. */
 458 int
 459 nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
 460 {
 461     return nl_sock_send_seq(sock, msg, nl_sock_allocate_seq(sock, 1), wait);
 462 }
 463
 464 /* Tries to send 'msg', which must contain a Netlink message, to the kernel on
 465  * 'sock'.  nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
 466  * will be set to 'sock''s pid, and nlmsg_seq will be initialized to
 467  * 'nlmsg_seq', before the message is sent.
 468  *
 469  * Returns 0 if successful, otherwise a positive errno value.  If
 470  * 'wait' is true, then the send will wait until buffer space is ready;
 471  * otherwise, returns EAGAIN if the 'sock' send buffer is full.
 472  *
 473  * This function is suitable for sending a reply to a request that was received
 474  * with sequence number 'nlmsg_seq'.  Otherwise, use nl_sock_send() instead. */
 475 int
 476 nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
 477                  uint32_t nlmsg_seq, bool wait)
 478 {
 479     return nl_sock_send__(sock, msg, nlmsg_seq, wait);
 480 }
 481
 482 static int
 483 nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
 484 {
 485     /* We can't accurately predict the size of the data to be received.  The
 486      * caller is supposed to have allocated enough space in 'buf' to handle the
 487      * "typical" case.  To handle exceptions, we make available enough space in
 488      * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
 489      * figure since that's the maximum length of a Netlink attribute). */
 490     struct nlmsghdr *nlmsghdr;
 491 #ifdef _WIN32
 492 #define MAX_STACK_LENGTH 81920
 493     uint8_t tail[MAX_STACK_LENGTH];
 494 #else
 495     uint8_t tail[65536];
 496 #endif
 497     struct iovec iov[2];
 498     struct msghdr msg;
 499     ssize_t retval;
 500     int error;
 501
 502     ovs_assert(buf->allocated >= sizeof *nlmsghdr);
 503     ofpbuf_clear(buf);
 504
 505     iov[0].iov_base = ofpbuf_base(buf);
 506     iov[0].iov_len = buf->allocated;
 507     iov[1].iov_base = tail;
 508     iov[1].iov_len = sizeof tail;
 509
 510     memset(&msg, 0, sizeof msg);
 511     msg.msg_iov = iov;
 512     msg.msg_iovlen = 2;
 513
 514     /* Receive a Netlink message from the kernel.
 515      *
 516      * This works around a kernel bug in which the kernel returns an error code
 517      * as if it were the number of bytes read.  It doesn't actually modify
 518      * anything in the receive buffer in that case, so we can initialize the
 519      * Netlink header with an impossible message length and then, upon success,
 520      * check whether it changed. */
 521     nlmsghdr = ofpbuf_base(buf);
 522     do {
 523         nlmsghdr->nlmsg_len = UINT32_MAX;
 524 #ifdef _WIN32
 525         boolean result = false;
 526         DWORD last_error = 0;
 527         result = ReadFile(sock->handle, tail, MAX_STACK_LENGTH, &retval, NULL);
 528         last_error = GetLastError();
 529         if (last_error != ERROR_SUCCESS && !result) {
 530             retval = -1;
 531             errno = EAGAIN;
 532         } else {
 533             ofpbuf_put(buf, tail, retval);
 534         }
 535 #else
 536         retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
 537 #endif
 538         error = (retval < 0 ? errno
 539                  : retval == 0 ? ECONNRESET /* not possible? */
 540                  : nlmsghdr->nlmsg_len != UINT32_MAX ? 0
 541                  : retval);
 542     } while (error == EINTR);
 543     if (error) {
 544         if (error == ENOBUFS) {
 545             /* Socket receive buffer overflow dropped one or more messages that
 546              * the kernel tried to send to us. */
 547             COVERAGE_INC(netlink_overflow);
 548         }
 549         return error;
 550     }
 551
 552     if (msg.msg_flags & MSG_TRUNC) {
 553         VLOG_ERR_RL(&rl, "truncated message (longer than %"PRIuSIZE" bytes)",
 554                     sizeof tail);
 555         return E2BIG;
 556     }
 557
 558     if (retval < sizeof *nlmsghdr
 559         || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
 560         || nlmsghdr->nlmsg_len > retval) {
 561         VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE" bytes < %"PRIuSIZE")",
 562                     retval, sizeof *nlmsghdr);
 563         return EPROTO;
 564     }
 565 #ifndef _WIN32
 566     ofpbuf_set_size(buf, MIN(retval, buf->allocated));
 567     if (retval > buf->allocated) {
 568         COVERAGE_INC(netlink_recv_jumbo);
 569         ofpbuf_put(buf, tail, retval - buf->allocated);
 570     }
 571 #endif
 572
 573     log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
 574     COVERAGE_INC(netlink_received);
 575
 576     return 0;
 577 }
 578
 579 /* Tries to receive a Netlink message from the kernel on 'sock' into 'buf'.  If
 580  * 'wait' is true, waits for a message to be ready.  Otherwise, fails with
 581  * EAGAIN if the 'sock' receive buffer is empty.
 582  *
 583  * The caller must have initialized 'buf' with an allocation of at least
 584  * NLMSG_HDRLEN bytes.  For best performance, the caller should allocate enough
 585  * space for a "typical" message.
 586  *
 587  * On success, returns 0 and replaces 'buf''s previous content by the received
 588  * message.  This function expands 'buf''s allocated memory, as necessary, to
 589  * hold the actual size of the received message.
 590  *
 591  * On failure, returns a positive errno value and clears 'buf' to zero length.
 592  * 'buf' retains its previous memory allocation.
 593  *
 594  * Regardless of success or failure, this function resets 'buf''s headroom to
 595  * 0. */
 596 int
 597 nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
 598 {
 599     return nl_sock_recv__(sock, buf, wait);
 600 }
 601
 602 static void
 603 nl_sock_record_errors__(struct nl_transaction **transactions, size_t n,
 604                         int error)
 605 {
 606     size_t i;
 607
 608     for (i = 0; i < n; i++) {
 609         struct nl_transaction *txn = transactions[i];
 610
 611         txn->error = error;
 612         if (txn->reply) {
 613             ofpbuf_clear(txn->reply);
 614         }
 615     }
 616 }
 617
 618 static int
 619 nl_sock_transact_multiple__(struct nl_sock *sock,
 620                             struct nl_transaction **transactions, size_t n,
 621                             size_t *done)
 622 {
 623     uint64_t tmp_reply_stub[1024 / 8];
 624     struct nl_transaction tmp_txn;
 625     struct ofpbuf tmp_reply;
 626
 627     uint32_t base_seq;
 628     struct iovec iovs[MAX_IOVS];
 629     struct msghdr msg;
 630     int error;
 631     int i;
 632
 633     base_seq = nl_sock_allocate_seq(sock, n);
 634     *done = 0;
 635     for (i = 0; i < n; i++) {
 636         struct nl_transaction *txn = transactions[i];
 637         struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(txn->request);
 638
 639         nlmsg->nlmsg_len = ofpbuf_size(txn->request);
 640         nlmsg->nlmsg_seq = base_seq + i;
 641         nlmsg->nlmsg_pid = sock->pid;
 642
 643         iovs[i].iov_base = ofpbuf_data(txn->request);
 644         iovs[i].iov_len = ofpbuf_size(txn->request);
 645     }
 646
 647     memset(&msg, 0, sizeof msg);
 648     msg.msg_iov = iovs;
 649     msg.msg_iovlen = n;
 650     do {
 651 #ifdef _WIN32
 652     DWORD last_error = 0;
 653     bool result = FALSE;
 654     for (i = 0; i < n; i++) {
 655         result = WriteFile((HANDLE)sock->handle, iovs[i].iov_base, iovs[i].iov_len,
 656                            &error, NULL);
 657         last_error = GetLastError();
 658         if (last_error != ERROR_SUCCESS && !result) {
 659             error = EAGAIN;
 660             errno = EAGAIN;
 661         } else {
 662             error = 0;
 663         }
 664     }
 665 #else
 666         error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
 667 #endif
 668     } while (error == EINTR);
 669
 670     for (i = 0; i < n; i++) {
 671         struct nl_transaction *txn = transactions[i];
 672
 673         log_nlmsg(__func__, error, ofpbuf_data(txn->request), ofpbuf_size(txn->request),
 674                   sock->protocol);
 675     }
 676     if (!error) {
 677         COVERAGE_ADD(netlink_sent, n);
 678     }
 679
 680     if (error) {
 681         return error;
 682     }
 683
 684     ofpbuf_use_stub(&tmp_reply, tmp_reply_stub, sizeof tmp_reply_stub);
 685     tmp_txn.request = NULL;
 686     tmp_txn.reply = &tmp_reply;
 687     tmp_txn.error = 0;
 688     while (n > 0) {
 689         struct nl_transaction *buf_txn, *txn;
 690         uint32_t seq;
 691
 692         /* Find a transaction whose buffer we can use for receiving a reply.
 693          * If no such transaction is left, use tmp_txn. */
 694         buf_txn = &tmp_txn;
 695         for (i = 0; i < n; i++) {
 696             if (transactions[i]->reply) {
 697                 buf_txn = transactions[i];
 698                 break;
 699             }
 700         }
 701
 702         /* Receive a reply. */
 703         error = nl_sock_recv__(sock, buf_txn->reply, false);
 704         if (error) {
 705             if (error == EAGAIN) {
 706                 nl_sock_record_errors__(transactions, n, 0);
 707                 *done += n;
 708                 error = 0;
 709             }
 710             break;
 711         }
 712
 713         /* Match the reply up with a transaction. */
 714         seq = nl_msg_nlmsghdr(buf_txn->reply)->nlmsg_seq;
 715         if (seq < base_seq || seq >= base_seq + n) {
 716             VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, seq);
 717             continue;
 718         }
 719         i = seq - base_seq;
 720         txn = transactions[i];
 721
 722         /* Fill in the results for 'txn'. */
 723         if (nl_msg_nlmsgerr(buf_txn->reply, &txn->error)) {
 724             if (txn->reply) {
 725                 ofpbuf_clear(txn->reply);
 726             }
 727             if (txn->error) {
 728                 VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
 729                             error, ovs_strerror(txn->error));
 730             }
 731         } else {
 732             txn->error = 0;
 733             if (txn->reply && txn != buf_txn) {
 734                 /* Swap buffers. */
 735                 struct ofpbuf *reply = buf_txn->reply;
 736                 buf_txn->reply = txn->reply;
 737                 txn->reply = reply;
 738             }
 739         }
 740
 741         /* Fill in the results for transactions before 'txn'.  (We have to do
 742          * this after the results for 'txn' itself because of the buffer swap
 743          * above.) */
 744         nl_sock_record_errors__(transactions, i, 0);
 745
 746         /* Advance. */
 747         *done += i + 1;
 748         transactions += i + 1;
 749         n -= i + 1;
 750         base_seq += i + 1;
 751     }
 752     ofpbuf_uninit(&tmp_reply);
 753
 754     return error;
 755 }
 756
 757 static void
 758 nl_sock_transact_multiple(struct nl_sock *sock,
 759                           struct nl_transaction **transactions, size_t n)
 760 {
 761     int max_batch_count;
 762     int error;
 763
 764     if (!n) {
 765         return;
 766     }
 767
 768     /* In theory, every request could have a 64 kB reply.  But the default and
 769      * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
 770      * be a bit below 128 kB, so that would only allow a single message in a
 771      * "batch".  So we assume that replies average (at most) 4 kB, which allows
 772      * a good deal of batching.
 773      *
 774      * In practice, most of the requests that we batch either have no reply at
 775      * all or a brief reply. */
 776     max_batch_count = MAX(sock->rcvbuf / 4096, 1);
 777     max_batch_count = MIN(max_batch_count, max_iovs);
 778
 779     while (n > 0) {
 780         size_t count, bytes;
 781         size_t done;
 782
 783         /* Batch up to 'max_batch_count' transactions.  But cap it at about a
 784          * page of requests total because big skbuffs are expensive to
 785          * allocate in the kernel.  */
 786 #if defined(PAGESIZE)
 787         enum { MAX_BATCH_BYTES = MAX(1, PAGESIZE - 512) };
 788 #else
 789         enum { MAX_BATCH_BYTES = 4096 - 512 };
 790 #endif
 791         bytes = ofpbuf_size(transactions[0]->request);
 792         for (count = 1; count < n && count < max_batch_count; count++) {
 793             if (bytes + ofpbuf_size(transactions[count]->request) > MAX_BATCH_BYTES) {
 794                 break;
 795             }
 796             bytes += ofpbuf_size(transactions[count]->request);
 797         }
 798
 799         error = nl_sock_transact_multiple__(sock, transactions, count, &done);
 800         transactions += done;
 801         n -= done;
 802
 803         if (error == ENOBUFS) {
 804             VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
 805         } else if (error) {
 806             VLOG_ERR_RL(&rl, "transaction error (%s)", ovs_strerror(error));
 807             nl_sock_record_errors__(transactions, n, error);
 808         }
 809     }
 810 }
 811
 812 static int
 813 nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
 814                  struct ofpbuf **replyp)
 815 {
 816     struct nl_transaction *transactionp;
 817     struct nl_transaction transaction;
 818
 819     transaction.request = CONST_CAST(struct ofpbuf *, request);
 820     transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
 821     transactionp = &transaction;
 822
 823     nl_sock_transact_multiple(sock, &transactionp, 1);
 824
 825     if (replyp) {
 826         if (transaction.error) {
 827             ofpbuf_delete(transaction.reply);
 828             *replyp = NULL;
 829         } else {
 830             *replyp = transaction.reply;
 831         }
 832     }
 833
 834     return transaction.error;
 835 }
 836
 837 /* Drain all the messages currently in 'sock''s receive queue. */
 838 int
 839 nl_sock_drain(struct nl_sock *sock)
 840 {
 841 #ifdef _WIN32
 842     return 0;
 843 #else
 844     return drain_rcvbuf(sock->fd);
 845 #endif
 846 }
 847
 848 /* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
 849  * Netlink socket created with the given 'protocol', and initializes 'dump' to
 850  * reflect the state of the operation.
 851  *
 852  * 'request' must contain a Netlink message.  Before sending the message,
 853  * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
 854  * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
 855  * nlmsg_flags.
 856  *
 857  * The design of this Netlink socket library ensures that the dump is reliable.
 858  *
 859  * This function provides no status indication.  nl_dump_done() provides an
 860  * error status for the entire dump operation.
 861  *
 862  * The caller must eventually destroy 'request'.
 863  */
 864 void
 865 nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
 866 {
 867     nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
 868
 869     ovs_mutex_init(&dump->mutex);
 870     ovs_mutex_lock(&dump->mutex);
 871     dump->status = nl_pool_alloc(protocol, &dump->sock);
 872     if (!dump->status) {
 873         dump->status = nl_sock_send__(dump->sock, request,
 874                                       nl_sock_allocate_seq(dump->sock, 1),
 875                                       true);
 876     }
 877     dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
 878     ovs_mutex_unlock(&dump->mutex);
 879 }
 880
 881 static int
 882 nl_dump_refill(struct nl_dump *dump, struct ofpbuf *buffer)
 883     OVS_REQUIRES(dump->mutex)
 884 {
 885     struct nlmsghdr *nlmsghdr;
 886     int error;
 887
 888     while (!ofpbuf_size(buffer)) {
 889         error = nl_sock_recv__(dump->sock, buffer, false);
 890         if (error) {
 891             /* The kernel never blocks providing the results of a dump, so
 892              * error == EAGAIN means that we've read the whole thing, and
 893              * therefore transform it into EOF.  (The kernel always provides
 894              * NLMSG_DONE as a sentinel.  Some other thread must have received
 895              * that already but not yet signaled it in 'status'.)
 896              *
 897              * Any other error is just an error. */
 898             return error == EAGAIN ? EOF : error;
 899         }
 900
 901         nlmsghdr = nl_msg_nlmsghdr(buffer);
 902         if (dump->nl_seq != nlmsghdr->nlmsg_seq) {
 903             VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
 904                         nlmsghdr->nlmsg_seq, dump->nl_seq);
 905             ofpbuf_clear(buffer);
 906         }
 907     }
 908
 909     if (nl_msg_nlmsgerr(buffer, &error) && error) {
 910         VLOG_INFO_RL(&rl, "netlink dump request error (%s)",
 911                      ovs_strerror(error));
 912         ofpbuf_clear(buffer);
 913         return error;
 914     }
 915
 916     return 0;
 917 }
 918
 919 static int
 920 nl_dump_next__(struct ofpbuf *reply, struct ofpbuf *buffer)
 921 {
 922     struct nlmsghdr *nlmsghdr = nl_msg_next(buffer, reply);
 923     if (!nlmsghdr) {
 924         VLOG_WARN_RL(&rl, "netlink dump contains message fragment");
 925         return EPROTO;
 926     } else if (nlmsghdr->nlmsg_type == NLMSG_DONE) {
 927         return EOF;
 928     } else {
 929         return 0;
 930     }
 931 }
 932
 933 /* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
 934  * have been initialized with nl_dump_start(), and 'buffer' must have been
 935  * initialized. 'buffer' should be at least NL_DUMP_BUFSIZE bytes long.
 936  *
 937  * If successful, returns true and points 'reply->data' and
 938  * 'ofpbuf_size(reply)' to the message that was retrieved. The caller must not
 939  * modify 'reply' (because it points within 'buffer', which will be used by
 940  * future calls to this function).
 941  *
 942  * On failure, returns false and sets 'reply->data' to NULL and
 943  * 'ofpbuf_size(reply)' to 0.  Failure might indicate an actual error or merely
 944  * the end of replies.  An error status for the entire dump operation is
 945  * provided when it is completed by calling nl_dump_done().
 946  *
 947  * Multiple threads may call this function, passing the same nl_dump, however
 948  * each must provide independent buffers. This function may cache multiple
 949  * replies in the buffer, and these will be processed before more replies are
 950  * fetched. When this function returns false, other threads may continue to
 951  * process replies in their buffers, but they will not fetch more replies.
 952  */
 953 bool
 954 nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
 955 {
 956     int retval = 0;
 957
 958     /* If the buffer is empty, refill it.
 959      *
 960      * If the buffer is not empty, we don't check the dump's status.
 961      * Otherwise, we could end up skipping some of the dump results if thread A
 962      * hits EOF while thread B is in the midst of processing a batch. */
 963     if (!ofpbuf_size(buffer)) {
 964         ovs_mutex_lock(&dump->mutex);
 965         if (!dump->status) {
 966             /* Take the mutex here to avoid an in-kernel race.  If two threads
 967              * try to read from a Netlink dump socket at once, then the socket
 968              * error can be set to EINVAL, which will be encountered on the
 969              * next recv on that socket, which could be anywhere due to the way
 970              * that we pool Netlink sockets.  Serializing the recv calls avoids
 971              * the issue. */
 972             dump->status = nl_dump_refill(dump, buffer);
 973         }
 974         retval = dump->status;
 975         ovs_mutex_unlock(&dump->mutex);
 976     }
 977
 978     /* Fetch the next message from the buffer. */
 979     if (!retval) {
 980         retval = nl_dump_next__(reply, buffer);
 981         if (retval) {
 982             /* Record 'retval' as the dump status, but don't overwrite an error
 983              * with EOF.  */
 984             ovs_mutex_lock(&dump->mutex);
 985             if (dump->status <= 0) {
 986                 dump->status = retval;
 987             }
 988             ovs_mutex_unlock(&dump->mutex);
 989         }
 990     }
 991
 992     if (retval) {
 993         ofpbuf_set_data(reply, NULL);
 994         ofpbuf_set_size(reply, 0);
 995     }
 996     return !retval;
 997 }
 998
 999 /* Completes Netlink dump operation 'dump', which must have been initialized
1000  * with nl_dump_start().  Returns 0 if the dump operation was error-free,
1001  * otherwise a positive errno value describing the problem. */
1002 int
1003 nl_dump_done(struct nl_dump *dump)
1004 {
1005     int status;
1006
1007     ovs_mutex_lock(&dump->mutex);
1008     status = dump->status;
1009     ovs_mutex_unlock(&dump->mutex);
1010
1011     /* Drain any remaining messages that the client didn't read.  Otherwise the
1012      * kernel will continue to queue them up and waste buffer space.
1013      *
1014      * XXX We could just destroy and discard the socket in this case. */
1015     if (!status) {
1016         uint64_t tmp_reply_stub[NL_DUMP_BUFSIZE / 8];
1017         struct ofpbuf reply, buf;
1018
1019         ofpbuf_use_stub(&buf, tmp_reply_stub, sizeof tmp_reply_stub);
1020         while (nl_dump_next(dump, &reply, &buf)) {
1021             /* Nothing to do. */
1022         }
1023         ofpbuf_uninit(&buf);
1024
1025         ovs_mutex_lock(&dump->mutex);
1026         status = dump->status;
1027         ovs_mutex_unlock(&dump->mutex);
1028         ovs_assert(status);
1029     }
1030
1031     nl_pool_release(dump->sock);
1032     ovs_mutex_destroy(&dump->mutex);
1033
1034     return status == EOF ? 0 : status;
1035 }
1036
1037 /* Causes poll_block() to wake up when any of the specified 'events' (which is
1038  * a OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'. */
1039 void
1040 nl_sock_wait(const struct nl_sock *sock, short int events)
1041 {
1042 #ifdef _WIN32
1043     poll_fd_wait(sock->handle, events);
1044 #else
1045     poll_fd_wait(sock->fd, events);
1046 #endif
1047 }
1048
1049 /* Returns the underlying fd for 'sock', for use in "poll()"-like operations
1050  * that can't use nl_sock_wait().
1051  *
1052  * It's a little tricky to use the returned fd correctly, because nl_sock does
1053  * "copy on write" to allow a single nl_sock to be used for notifications,
1054  * transactions, and dumps.  If 'sock' is used only for notifications and
1055  * transactions (and never for dump) then the usage is safe. */
1056 int
1057 nl_sock_fd(const struct nl_sock *sock)
1058 {
1059 #ifdef _WIN32
1060     return sock->handle;
1061 #else
1062     return sock->fd;
1063 #endif
1064 }
1065
1066 /* Returns the PID associated with this socket. */
1067 uint32_t
1068 nl_sock_pid(const struct nl_sock *sock)
1069 {
1070     return sock->pid;
1071 }
1072 \f
1073 /* Miscellaneous.  */
1074
/* One cached Generic Netlink family: associates a numeric family id with the
 * family's name. */
struct genl_family {
    struct hmap_node hmap_node; /* In 'genl_families', hashed on 'id'. */
    uint16_t id;                /* Generic Netlink family number. */
    char *name;                 /* Family name; heap copy owned by the entry. */
};
1080
/* All families recorded by define_genl_family(), keyed by family id. */
static struct hmap genl_families = HMAP_INITIALIZER(&genl_families);

/* Attribute policy used to parse the reply to a CTRL_CMD_GETFAMILY request:
 * a 16-bit family id, plus a nested list of multicast groups that is marked
 * optional and therefore may be missing from the reply. */
static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = {
    [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16},
    [CTRL_ATTR_MCAST_GROUPS] = {.type = NL_A_NESTED, .optional = true},
};
1087
1088 static struct genl_family *
1089 find_genl_family_by_id(uint16_t id)
1090 {
1091     struct genl_family *family;
1092
1093     HMAP_FOR_EACH_IN_BUCKET (family, hmap_node, hash_int(id, 0),
1094                              &genl_families) {
1095         if (family->id == id) {
1096             return family;
1097         }
1098     }
1099     return NULL;
1100 }
1101
1102 static void
1103 define_genl_family(uint16_t id, const char *name)
1104 {
1105     struct genl_family *family = find_genl_family_by_id(id);
1106
1107     if (family) {
1108         if (!strcmp(family->name, name)) {
1109             return;
1110         }
1111         free(family->name);
1112     } else {
1113         family = xmalloc(sizeof *family);
1114         family->id = id;
1115         hmap_insert(&genl_families, &family->hmap_node, hash_int(id, 0));
1116     }
1117     family->name = xstrdup(name);
1118 }
1119
1120 static const char *
1121 genl_family_to_name(uint16_t id)
1122 {
1123     if (id == GENL_ID_CTRL) {
1124         return "control";
1125     } else {
1126         struct genl_family *family = find_genl_family_by_id(id);
1127         return family ? family->name : "unknown";
1128     }
1129 }
1130
1131 static int
1132 do_lookup_genl_family(const char *name, struct nlattr **attrs,
1133                       struct ofpbuf **replyp)
1134 {
1135     struct nl_sock *sock;
1136     struct ofpbuf request, *reply;
1137     int error;
1138
1139     *replyp = NULL;
1140     error = nl_sock_create(NETLINK_GENERIC, &sock);
1141     if (error) {
1142         return error;
1143     }
1144
1145     ofpbuf_init(&request, 0);
1146     nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
1147                           CTRL_CMD_GETFAMILY, 1);
1148     nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name);
1149     error = nl_sock_transact(sock, &request, &reply);
1150     ofpbuf_uninit(&request);
1151     if (error) {
1152         nl_sock_destroy(sock);
1153         return error;
1154     }
1155
1156     if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
1157                          family_policy, attrs, ARRAY_SIZE(family_policy))
1158         || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
1159         nl_sock_destroy(sock);
1160         ofpbuf_delete(reply);
1161         return EPROTO;
1162     }
1163
1164     nl_sock_destroy(sock);
1165     *replyp = reply;
1166     return 0;
1167 }
1168
1169 /* Finds the multicast group called 'group_name' in genl family 'family_name'.
1170  * When successful, writes its result to 'multicast_group' and returns 0.
1171  * Otherwise, clears 'multicast_group' and returns a positive error code.
1172  */
1173 int
1174 nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
1175                        unsigned int *multicast_group)
1176 {
1177     struct nlattr *family_attrs[ARRAY_SIZE(family_policy)];
1178     const struct nlattr *mc;
1179     struct ofpbuf *reply;
1180     unsigned int left;
1181     int error;
1182
1183     *multicast_group = 0;
1184     error = do_lookup_genl_family(family_name, family_attrs, &reply);
1185     if (error) {
1186         return error;
1187     }
1188
1189     if (!family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
1190         error = EPROTO;
1191         goto exit;
1192     }
1193
1194     NL_NESTED_FOR_EACH (mc, left, family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
1195         static const struct nl_policy mc_policy[] = {
1196             [CTRL_ATTR_MCAST_GRP_ID] = {.type = NL_A_U32},
1197             [CTRL_ATTR_MCAST_GRP_NAME] = {.type = NL_A_STRING},
1198         };
1199
1200         struct nlattr *mc_attrs[ARRAY_SIZE(mc_policy)];
1201         const char *mc_name;
1202
1203         if (!nl_parse_nested(mc, mc_policy, mc_attrs, ARRAY_SIZE(mc_policy))) {
1204             error = EPROTO;
1205             goto exit;
1206         }
1207
1208         mc_name = nl_attr_get_string(mc_attrs[CTRL_ATTR_MCAST_GRP_NAME]);
1209         if (!strcmp(group_name, mc_name)) {
1210             *multicast_group =
1211                 nl_attr_get_u32(mc_attrs[CTRL_ATTR_MCAST_GRP_ID]);
1212             error = 0;
1213             goto exit;
1214         }
1215     }
1216     error = EPROTO;
1217
1218 exit:
1219     ofpbuf_delete(reply);
1220     return error;
1221 }
1222
1223 /* If '*number' is 0, translates the given Generic Netlink family 'name' to a
1224  * number and stores it in '*number'.  If successful, returns 0 and the caller
1225  * may use '*number' as the family number.  On failure, returns a positive
1226  * errno value and '*number' caches the errno value. */
1227 int
1228 nl_lookup_genl_family(const char *name, int *number)
1229 {
1230     if (*number == 0) {
1231         struct nlattr *attrs[ARRAY_SIZE(family_policy)];
1232         struct ofpbuf *reply;
1233         int error;
1234
1235         error = do_lookup_genl_family(name, attrs, &reply);
1236         if (!error) {
1237             *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
1238             define_genl_family(*number, name);
1239         } else {
1240             *number = -error;
1241         }
1242         ofpbuf_delete(reply);
1243
1244         ovs_assert(*number != 0);
1245     }
1246     return *number > 0 ? 0 : -*number;
1247 }
1248 \f
/* A bounded stack of idle Netlink sockets that can be handed out again by
 * nl_pool_alloc() instead of creating a new socket each time. */
struct nl_pool {
    struct nl_sock *socks[16];  /* Idle sockets; elements [0, n) are in use. */
    int n;                      /* Number of sockets stored in 'socks'. */
};
1253
/* Protects 'pools'. */
static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
/* Socket pools, indexed by Netlink protocol number. */
static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);
1256
1257 static int
1258 nl_pool_alloc(int protocol, struct nl_sock **sockp)
1259 {
1260     struct nl_sock *sock = NULL;
1261     struct nl_pool *pool;
1262
1263     ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
1264
1265     ovs_mutex_lock(&pool_mutex);
1266     pool = &pools[protocol];
1267     if (pool->n > 0) {
1268         sock = pool->socks[--pool->n];
1269     }
1270     ovs_mutex_unlock(&pool_mutex);
1271
1272     if (sock) {
1273         *sockp = sock;
1274         return 0;
1275     } else {
1276         return nl_sock_create(protocol, sockp);
1277     }
1278 }
1279
1280 static void
1281 nl_pool_release(struct nl_sock *sock)
1282 {
1283     if (sock) {
1284         struct nl_pool *pool = &pools[sock->protocol];
1285
1286         ovs_mutex_lock(&pool_mutex);
1287         if (pool->n < ARRAY_SIZE(pool->socks)) {
1288             pool->socks[pool->n++] = sock;
1289             sock = NULL;
1290         }
1291         ovs_mutex_unlock(&pool_mutex);
1292
1293         nl_sock_destroy(sock);
1294     }
1295 }
1296
/* Sends 'request' to the kernel on a Netlink socket for the given 'protocol'
 * (e.g. NETLINK_ROUTE or NETLINK_GENERIC) and waits for a response.  If
 * successful, returns 0.  On failure, returns a positive errno value.
 *
 * If 'replyp' is nonnull, then on success '*replyp' is set to the kernel's
 * reply, which the caller is responsible for freeing with ofpbuf_delete(), and
 * on failure '*replyp' is set to NULL.  If 'replyp' is null, then the kernel's
 * reply, if any, is discarded.
 *
 * Before the message is sent, nlmsg_len in 'request' will be finalized to
 * match ofpbuf_size(request), nlmsg_pid will be set to the pid of the socket
 * used for sending the request, and nlmsg_seq will be initialized.
 *
 * The caller is responsible for destroying 'request'.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.
 *
 * In Netlink, sending a request to the kernel is reliable enough, because the
 * kernel will tell us if the message cannot be queued (and we will in that
 * case put it on the transmit queue and wait until it can be delivered).
 *
 * Receiving the reply is the real problem: if the socket buffer is full when
 * the kernel tries to send the reply, the reply will be dropped.  However, the
 * kernel sets a flag that a reply has been dropped.  The next call to recv
 * then returns ENOBUFS.  We can then re-send the request.
 *
 * Caveats:
 *
 *      1. Netlink depends on sequence numbers to match up requests and
 *         replies.  The sender of a request supplies a sequence number, and
 *         the reply echoes back that sequence number.
 *
 *         This is fine, but (1) some kernel netlink implementations are
 *         broken, in that they fail to echo sequence numbers and (2) this
 *         function will drop packets with non-matching sequence numbers, so
 *         that only a single request can be usefully transacted at a time.
 *
 *      2. Resending the request causes it to be re-executed, so the request
 *         needs to be idempotent.
 */
int
nl_transact(int protocol, const struct ofpbuf *request,
            struct ofpbuf **replyp)
{
    struct nl_sock *sock;
    int error;

    error = nl_pool_alloc(protocol, &sock);
    if (error) {
        /* 'replyp' is optional, so it may only be written through when the
         * caller actually supplied it. */
        if (replyp) {
            *replyp = NULL;
        }
        return error;
    }

    error = nl_sock_transact(sock, request, replyp);

    nl_pool_release(sock);
    return error;
}
1356
/* Sends, in order, the 'request' member of each of the 'n' transactions in
 * 'transactions' on a Netlink socket for the given 'protocol' (e.g.
 * NETLINK_ROUTE or NETLINK_GENERIC) and collects the responses to all of
 * them.  The 'error' member of every transaction is set to 0 if that
 * transaction succeeded and to a positive errno value otherwise.  A nonnull
 * 'reply' is filled with the reply when the message receives a detailed
 * reply; in the other cases, i.e. when the request failed or produced
 * nothing beyond an indication of success, a nonnull 'reply' is cleared.
 *
 * Each request and reply, as well as the transactions array itself, remains
 * the caller's to destroy.
 *
 * Before a message is sent, nlmsg_len in its 'request' is finalized to match
 * the ofpbuf's size, nlmsg_pid is set to the pid of the socket used for the
 * transaction, and nlmsg_seq is initialized.
 *
 * Bare Netlink is an unreliable transport protocol.  This function layers
 * reliable delivery and reply semantics on top of bare Netlink.  See
 * nl_transact() for some caveats.
 */
void
nl_transact_multiple(int protocol,
                     struct nl_transaction **transactions, size_t n)
{
    struct nl_sock *sock;
    int error = nl_pool_alloc(protocol, &sock);

    if (error) {
        /* No socket to send on: every transaction fails the same way. */
        nl_sock_record_errors__(transactions, n, error);
        return;
    }

    nl_sock_transact_multiple(sock, transactions, n);
    nl_pool_release(sock);
}
1392
1393 \f
1394 static uint32_t
1395 nl_sock_allocate_seq(struct nl_sock *sock, unsigned int n)
1396 {
1397     uint32_t seq = sock->next_seq;
1398
1399     sock->next_seq += n;
1400
1401     /* Make it impossible for the next request for sequence numbers to wrap
1402      * around to 0.  Start over with 1 to avoid ever using a sequence number of
1403      * 0, because the kernel uses sequence number 0 for notifications. */
1404     if (sock->next_seq >= UINT32_MAX / 2) {
1405         sock->next_seq = 1;
1406     }
1407
1408     return seq;
1409 }
1410
1411 static void
1412 nlmsghdr_to_string(const struct nlmsghdr *h, int protocol, struct ds *ds)
1413 {
1414     struct nlmsg_flag {
1415         unsigned int bits;
1416         const char *name;
1417     };
1418     static const struct nlmsg_flag flags[] = {
1419         { NLM_F_REQUEST, "REQUEST" },
1420         { NLM_F_MULTI, "MULTI" },
1421         { NLM_F_ACK, "ACK" },
1422         { NLM_F_ECHO, "ECHO" },
1423         { NLM_F_DUMP, "DUMP" },
1424         { NLM_F_ROOT, "ROOT" },
1425         { NLM_F_MATCH, "MATCH" },
1426         { NLM_F_ATOMIC, "ATOMIC" },
1427     };
1428     const struct nlmsg_flag *flag;
1429     uint16_t flags_left;
1430
1431     ds_put_format(ds, "nl(len:%"PRIu32", type=%"PRIu16,
1432                   h->nlmsg_len, h->nlmsg_type);
1433     if (h->nlmsg_type == NLMSG_NOOP) {
1434         ds_put_cstr(ds, "(no-op)");
1435     } else if (h->nlmsg_type == NLMSG_ERROR) {
1436         ds_put_cstr(ds, "(error)");
1437     } else if (h->nlmsg_type == NLMSG_DONE) {
1438         ds_put_cstr(ds, "(done)");
1439     } else if (h->nlmsg_type == NLMSG_OVERRUN) {
1440         ds_put_cstr(ds, "(overrun)");
1441     } else if (h->nlmsg_type < NLMSG_MIN_TYPE) {
1442         ds_put_cstr(ds, "(reserved)");
1443     } else if (protocol == NETLINK_GENERIC) {
1444         ds_put_format(ds, "(%s)", genl_family_to_name(h->nlmsg_type));
1445     } else {
1446         ds_put_cstr(ds, "(family-defined)");
1447     }
1448     ds_put_format(ds, ", flags=%"PRIx16, h->nlmsg_flags);
1449     flags_left = h->nlmsg_flags;
1450     for (flag = flags; flag < &flags[ARRAY_SIZE(flags)]; flag++) {
1451         if ((flags_left & flag->bits) == flag->bits) {
1452             ds_put_format(ds, "[%s]", flag->name);
1453             flags_left &= ~flag->bits;
1454         }
1455     }
1456     if (flags_left) {
1457         ds_put_format(ds, "[OTHER:%"PRIx16"]", flags_left);
1458     }
1459     ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32,
1460                   h->nlmsg_seq, h->nlmsg_pid);
1461 }
1462
1463 static char *
1464 nlmsg_to_string(const struct ofpbuf *buffer, int protocol)
1465 {
1466     struct ds ds = DS_EMPTY_INITIALIZER;
1467     const struct nlmsghdr *h = ofpbuf_at(buffer, 0, NLMSG_HDRLEN);
1468     if (h) {
1469         nlmsghdr_to_string(h, protocol, &ds);
1470         if (h->nlmsg_type == NLMSG_ERROR) {
1471             const struct nlmsgerr *e;
1472             e = ofpbuf_at(buffer, NLMSG_HDRLEN,
1473                           NLMSG_ALIGN(sizeof(struct nlmsgerr)));
1474             if (e) {
1475                 ds_put_format(&ds, " error(%d", e->error);
1476                 if (e->error < 0) {
1477                     ds_put_format(&ds, "(%s)", ovs_strerror(-e->error));
1478                 }
1479                 ds_put_cstr(&ds, ", in-reply-to(");
1480                 nlmsghdr_to_string(&e->msg, protocol, &ds);
1481                 ds_put_cstr(&ds, "))");
1482             } else {
1483                 ds_put_cstr(&ds, " error(truncated)");
1484             }
1485         } else if (h->nlmsg_type == NLMSG_DONE) {
1486             int *error = ofpbuf_at(buffer, NLMSG_HDRLEN, sizeof *error);
1487             if (error) {
1488                 ds_put_format(&ds, " done(%d", *error);
1489                 if (*error < 0) {
1490                     ds_put_format(&ds, "(%s)", ovs_strerror(-*error));
1491                 }
1492                 ds_put_cstr(&ds, ")");
1493             } else {
1494                 ds_put_cstr(&ds, " done(truncated)");
1495             }
1496         } else if (protocol == NETLINK_GENERIC) {
1497             struct genlmsghdr *genl = nl_msg_genlmsghdr(buffer);
1498             if (genl) {
1499                 ds_put_format(&ds, ",genl(cmd=%"PRIu8",version=%"PRIu8")",
1500                               genl->cmd, genl->version);
1501             }
1502         }
1503     } else {
1504         ds_put_cstr(&ds, "nl(truncated)");
1505     }
1506     return ds.string;
1507 }
1508
1509 static void
1510 log_nlmsg(const char *function, int error,
1511           const void *message, size_t size, int protocol)
1512 {
1513     struct ofpbuf buffer;
1514     char *nlmsg;
1515
1516     if (!VLOG_IS_DBG_ENABLED()) {
1517         return;
1518     }
1519
1520     ofpbuf_use_const(&buffer, message, size);
1521     nlmsg = nlmsg_to_string(&buffer, protocol);
1522     VLOG_DBG_RL(&rl, "%s (%s): %s", function, ovs_strerror(error), nlmsg);
1523     free(nlmsg);
1524 }