ovs-atomics: Add atomic support for Windows.
author    Gurucharan Shetty <gshetty@nicira.com>
          Thu, 21 Aug 2014 20:57:37 +0000 (13:57 -0700)
committer Gurucharan Shetty <gshetty@nicira.com>
          Thu, 4 Sep 2014 22:57:26 +0000 (15:57 -0700)
Before this change (i.e., with pthread locks for atomics on Windows),
the benchmark for cmap and hmap was as follows:

$ ./tests/ovstest.exe test-cmap benchmark 10000000 3 1
Benchmarking with n=10000000, 3 threads, 1.00% mutations:
cmap insert:  61070 ms
cmap iterate:  2750 ms
cmap search:  14238 ms
cmap destroy:  8354 ms

hmap insert:   1701 ms
hmap iterate:   985 ms
hmap search:   3755 ms
hmap destroy:  1052 ms

After this change, the benchmark is as follows:
$ ./tests/ovstest.exe test-cmap benchmark 10000000 3 1
Benchmarking with n=10000000, 3 threads, 1.00% mutations:
cmap insert:   3666 ms
cmap iterate:   365 ms
cmap search:   2016 ms
cmap destroy:  1331 ms

hmap insert:   1495 ms
hmap iterate:  1026 ms
hmap search:   4167 ms
hmap destroy:  1046 ms

So there is clearly a big improvement for cmap.

But the corresponding test on Linux (with gcc 4.6) yields the following:

./tests/ovstest test-cmap benchmark 10000000 3 1
Benchmarking with n=10000000, 3 threads, 1.00% mutations:
cmap insert:   3917 ms
cmap iterate:   355 ms
cmap search:    871 ms
cmap destroy:  1158 ms

hmap insert:   1988 ms
hmap iterate:  1005 ms
hmap search:   5428 ms
hmap destroy:   980 ms

So for this particular test, Windows and Linux have similar performance,
except for "cmap search", where Windows is around 2.5x slower than Linux.
This has to be investigated.

Signed-off-by: Gurucharan Shetty <gshetty@nicira.com>
[With a lot of inputs and help from Jarno]
Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
lib/automake.mk
lib/ovs-atomic-msvc.h [new file with mode: 0644]
lib/ovs-atomic.h

index d46613f..f371ee0 100644 (file)
@@ -152,6 +152,7 @@ lib_libopenvswitch_la_SOURCES = \
        lib/ovs-atomic-i586.h \
        lib/ovs-atomic-locked.c \
        lib/ovs-atomic-locked.h \
+       lib/ovs-atomic-msvc.h \
        lib/ovs-atomic-pthreads.h \
        lib/ovs-atomic-x86_64.h \
        lib/ovs-atomic.h \
diff --git a/lib/ovs-atomic-msvc.h b/lib/ovs-atomic-msvc.h
new file mode 100644 (file)
index 0000000..11f5ad2
--- /dev/null
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2014 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* This header implements atomic operation primitives for MSVC
+ * on i586 or greater platforms (32 bit). */
+#ifndef IN_OVS_ATOMIC_H
+#error "This header should only be included indirectly via ovs-atomic.h."
+#endif
+
+/* From the MSDN documentation: with Visual Studio 2003, volatile-to-volatile
+ * references are ordered; the compiler will not re-order volatile variable
+ * access. With Visual Studio 2005, the compiler also uses acquire semantics
+ * for read operations on volatile variables and release semantics for write
+ * operations on volatile variables (when supported by the CPU).
+ *
+ * Though there is no clear documentation stating that anything newer than
+ * VS 2005 has the same behavior as described above, looking through MSVC's
+ * C++ atomics library in VS2013 shows that the compiler still takes
+ * acquire/release semantics on volatile variables. */
+#define ATOMIC(TYPE) TYPE volatile
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
+
+#define ATOMIC_BOOL_LOCK_FREE 2
+#define ATOMIC_CHAR_LOCK_FREE 2
+#define ATOMIC_SHORT_LOCK_FREE 2
+#define ATOMIC_INT_LOCK_FREE 2
+#define ATOMIC_LONG_LOCK_FREE 2
+#define ATOMIC_LLONG_LOCK_FREE 2
+#define ATOMIC_POINTER_LOCK_FREE 2
+
+#define IS_LOCKLESS_ATOMIC(OBJECT)                      \
+    (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT)))
+
+#define ATOMIC_VAR_INIT(VALUE) (VALUE)
+#define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0)
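+
+/* Example usage (an illustrative sketch; 'counter' is a hypothetical
+ * variable, not something defined by this header):
+ *
+ *     static ATOMIC(uint32_t) counter = ATOMIC_VAR_INIT(0);
+ *
+ * or, for an object that cannot be statically initialized:
+ *
+ *     atomic_init(&counter, 0);
+ */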
+
+static inline void
+atomic_compiler_barrier(memory_order order)
+{
+    /* In case of 'memory_order_consume', it is implicitly assumed that
+     * the compiler will not move instructions that have data-dependency
+     * on the variable in question before the barrier. */
+    if (order > memory_order_consume) {
+        _ReadWriteBarrier();
+    }
+}
+
+static inline void
+atomic_thread_fence(memory_order order)
+{
+    /* x86 is strongly ordered and acquire/release semantics come
+     * automatically. */
+    atomic_compiler_barrier(order);
+    if (order == memory_order_seq_cst) {
+        MemoryBarrier();
+        atomic_compiler_barrier(order);
+    }
+}
+
+static inline void
+atomic_signal_fence(memory_order order)
+{
+    atomic_compiler_barrier(order);
+}
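+
+/* Example usage (an illustrative sketch):
+ *
+ *     atomic_thread_fence(memory_order_release);
+ *         (a compiler barrier only, via _ReadWriteBarrier(); x86's strong
+ *          ordering provides the release semantics)
+ *     atomic_thread_fence(memory_order_seq_cst);
+ *         (a compiler barrier plus a full MemoryBarrier())
+ */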
+
+/* 1, 2 and 4 byte loads and stores are atomic on aligned memory. In addition,
+ * since the compiler automatically takes acquire and release semantics on
+ * volatile variables, for any order weaker than 'memory_order_seq_cst' we
+ * can directly assign or read values. */
+
+#define atomic_store32(DST, SRC, ORDER)                                 \
+    if (ORDER == memory_order_seq_cst) {                                \
+        InterlockedExchange((int32_t volatile *) (DST),                 \
+                               (int32_t) (SRC));                        \
+    } else {                                                            \
+        *(DST) = (SRC);                                                 \
+    }
+
+/* 64 bit writes are atomic on i586 if 64 bit aligned. */
+#define atomic_store64(DST, SRC, ORDER)                                    \
+    if (((size_t) (DST) & (sizeof *(DST) - 1))                             \
+        || ORDER == memory_order_seq_cst) {                                \
+        InterlockedExchange64((int64_t volatile *) (DST),                  \
+                              (int64_t) (SRC));                            \
+    } else {                                                               \
+        *(DST) = (SRC);                                                    \
+    }
+
+/* Used for 8 and 16 bit variations. */
+#define atomic_storeX(X, DST, SRC, ORDER)                               \
+    if (ORDER == memory_order_seq_cst) {                                \
+        InterlockedExchange##X((int##X##_t volatile *) (DST),           \
+                               (int##X##_t) (SRC));                     \
+    } else {                                                            \
+        *(DST) = (SRC);                                                 \
+    }
+
+#define atomic_store(DST, SRC)                               \
+        atomic_store_explicit(DST, SRC, memory_order_seq_cst)
+
+#define atomic_store_explicit(DST, SRC, ORDER)                           \
+    if (sizeof *(DST) == 1) {                                            \
+        atomic_storeX(8, DST, SRC, ORDER)                                \
+    } else if (sizeof *(DST) == 2) {                                     \
+        atomic_storeX(16, DST, SRC, ORDER)                               \
+    } else if (sizeof *(DST) == 4) {                                     \
+        atomic_store32(DST, SRC, ORDER)                                  \
+    } else if (sizeof *(DST) == 8) {                                     \
+        atomic_store64(DST, SRC, ORDER)                                  \
+    } else {                                                             \
+        abort();                                                         \
+    }
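+
+/* Example usage (an illustrative sketch; 'flags' is a hypothetical variable):
+ *
+ *     static ATOMIC(uint32_t) flags;
+ *
+ *     atomic_store_explicit(&flags, 1, memory_order_release);
+ *         (a plain volatile store; the compiler's release semantics suffice)
+ *     atomic_store(&flags, 2);
+ *         (memory_order_seq_cst; expands to InterlockedExchange() to get the
+ *          full memory barrier)
+ */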
+
+/* On x86, for 'memory_order_seq_cst', if stores are locked, the corresponding
+ * reads don't need to be locked (based on the following in the Intel
+ * Developer's Manual:
+ * "Locked operations are atomic with respect to all other memory operations
+ * and all externally visible events. Only instruction fetch and page table
+ * accesses can pass locked instructions. Locked instructions can be used to
+ * synchronize data written by one processor and read by another processor.
+ * For the P6 family processors, locked operations serialize all outstanding
+ * load and store operations (that is, wait for them to complete). This rule
+ * is also true for the Pentium 4 and Intel Xeon processors, with one
+ * exception. Load operations that reference weakly ordered memory types
+ * (such as the WC memory type) may not be serialized."). */
+
+/* For 8, 16 and 32 bit variations. */
+#define atomic_readX(SRC, DST, ORDER)                                      \
+    *(DST) = *(SRC);
+
+/* 64 bit reads are atomic on i586 if 64 bit aligned. */
+#define atomic_read64(SRC, DST, ORDER)                                     \
+    if (((size_t) (SRC) & (sizeof *(SRC) - 1)) == 0) {                     \
+        *(DST) = *(SRC);                                                   \
+    } else {                                                               \
+       *(DST) = InterlockedOr64((int64_t volatile *) (SRC), 0);            \
+    }
+
+#define atomic_read(SRC, DST)                               \
+        atomic_read_explicit(SRC, DST, memory_order_seq_cst)
+
+#define atomic_read_explicit(SRC, DST, ORDER)                             \
+    if (sizeof *(DST) == 1 || sizeof *(DST) == 2 || sizeof *(DST) == 4) { \
+        atomic_readX(SRC, DST, ORDER)                                     \
+    } else if (sizeof *(DST) == 8) {                                      \
+        atomic_read64(SRC, DST, ORDER)                                    \
+    } else {                                                              \
+        abort();                                                          \
+    }
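+
+/* Example usage (an illustrative sketch, reusing the hypothetical 'flags'
+ * above):
+ *
+ *     uint32_t value;
+ *
+ *     atomic_read_explicit(&flags, &value, memory_order_acquire);
+ *     atomic_read(&flags, &value);
+ *         (both are plain volatile loads for a 4 byte object; even the
+ *          seq_cst read needs no lock because seq_cst stores are locked)
+ */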
+
+/* For add, sub, and logical operations on 8, 16 and 64 bit data types,
+ * functions for all the different memory orders do not exist
+ * (though documentation exists for some of them).  The MSVC C++ library that
+ * implements the C11 atomics simply calls the full memory barrier function
+ * for everything on x86 (see xatomic.h).  So do the same here. */
+
+/* For 8, 16 and 64 bit variations. */
+#define atomic_op(OP, X, RMW, ARG, ORIG, ORDER)                         \
+    atomic_##OP##_generic(X, RMW, ARG, ORIG, ORDER)
+
+/* Arithmetic addition calls. */
+
+#define atomic_add32(RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedExchangeAdd((int32_t volatile *) (RMW),   \
+                                      (int32_t) (ARG));
+
+/* For 8, 16 and 64 bit variations. */
+#define atomic_add_generic(X, RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = _InterlockedExchangeAdd##X((int##X##_t volatile *) (RMW),     \
+                                      (int##X##_t) (ARG));
+
+#define atomic_add(RMW, ARG, ORIG)                               \
+        atomic_add_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
+
+#define atomic_add_explicit(RMW, ARG, ORIG, ORDER)             \
+    if (sizeof *(RMW) == 1) {                                  \
+        atomic_op(add, 8, RMW, ARG, ORIG, ORDER)               \
+    } else if (sizeof *(RMW) == 2) {                           \
+        atomic_op(add, 16, RMW, ARG, ORIG, ORDER)              \
+    } else if (sizeof *(RMW) == 4) {                           \
+        atomic_add32(RMW, ARG, ORIG, ORDER)                    \
+    } else if (sizeof *(RMW) == 8) {                           \
+        atomic_op(add, 64, RMW, ARG, ORIG, ORDER)              \
+    } else {                                                   \
+        abort();                                               \
+    }
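+
+/* Example usage (an illustrative sketch; 'refcount' is a hypothetical
+ * variable):
+ *
+ *     static ATOMIC(uint32_t) refcount;
+ *     uint32_t orig;
+ *
+ *     atomic_add(&refcount, 1, &orig);
+ *         ('orig' receives the value 'refcount' held before the addition)
+ *
+ * atomic_sub(), atomic_and(), atomic_or() and atomic_xor() below follow the
+ * same (RMW, ARG, ORIG) calling convention.
+ */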
+
+/* Arithmetic subtraction calls. */
+
+#define atomic_sub(RMW, ARG, ORIG)                             \
+        atomic_add_explicit(RMW, (0 - (ARG)), ORIG, memory_order_seq_cst)
+
+#define atomic_sub_explicit(RMW, ARG, ORIG, ORDER)           \
+        atomic_add_explicit(RMW, (0 - (ARG)), ORIG, ORDER)
+
+/* Logical 'and' calls. */
+
+#define atomic_and32(RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedAnd((int32_t volatile *) (RMW), (int32_t) (ARG));
+
+/* For 8, 16 and 64 bit variations. */
+#define atomic_and_generic(X, RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedAnd##X((int##X##_t volatile *) (RMW),              \
+                                (int##X##_t) (ARG));
+
+#define atomic_and(RMW, ARG, ORIG)                               \
+        atomic_and_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
+
+#define atomic_and_explicit(RMW, ARG, ORIG, ORDER)             \
+    if (sizeof *(RMW) == 1) {                                  \
+        atomic_op(and, 8, RMW, ARG, ORIG, ORDER)               \
+    } else if (sizeof *(RMW) == 2) {                           \
+        atomic_op(and, 16, RMW, ARG, ORIG, ORDER)              \
+    } else if (sizeof *(RMW) == 4) {                           \
+        atomic_and32(RMW, ARG, ORIG, ORDER)                    \
+    } else if (sizeof *(RMW) == 8) {                           \
+        atomic_op(and, 64, RMW, ARG, ORIG, ORDER)              \
+    } else {                                                   \
+        abort();                                               \
+    }
+
+/* Logical 'Or' calls. */
+
+#define atomic_or32(RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedOr((int32_t volatile *) (RMW), (int32_t) (ARG));
+
+/* For 8, 16 and 64 bit variations. */
+#define atomic_or_generic(X, RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedOr##X((int##X##_t volatile *) (RMW),              \
+                               (int##X##_t) (ARG));
+
+#define atomic_or(RMW, ARG, ORIG)                               \
+        atomic_or_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
+
+#define atomic_or_explicit(RMW, ARG, ORIG, ORDER)              \
+    if (sizeof *(RMW) == 1) {                                  \
+        atomic_op(or, 8, RMW, ARG, ORIG, ORDER)                \
+    } else if (sizeof *(RMW) == 2) {                           \
+        atomic_op(or, 16, RMW, ARG, ORIG, ORDER)               \
+    } else if (sizeof *(RMW) == 4) {                           \
+        atomic_or32(RMW, ARG, ORIG, ORDER)                     \
+    } else if (sizeof *(RMW) == 8) {                           \
+        atomic_op(or, 64, RMW, ARG, ORIG, ORDER)               \
+    } else {                                                   \
+        abort();                                               \
+    }
+
+/* Logical Xor calls. */
+
+#define atomic_xor32(RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedXor((int32_t volatile *) (RMW), (int32_t) (ARG));
+
+/* For 8, 16 and 64 bit variations. */
+#define atomic_xor_generic(X, RMW, ARG, ORIG, ORDER)                        \
+    *(ORIG) = InterlockedXor##X((int##X##_t volatile *) (RMW),              \
+                                (int##X##_t) (ARG));
+
+#define atomic_xor(RMW, ARG, ORIG)                               \
+        atomic_xor_explicit(RMW, ARG, ORIG, memory_order_seq_cst)
+
+#define atomic_xor_explicit(RMW, ARG, ORIG, ORDER)             \
+    if (sizeof *(RMW) == 1) {                                  \
+        atomic_op(xor, 8, RMW, ARG, ORIG, ORDER)               \
+    } else if (sizeof *(RMW) == 2) {                           \
+        atomic_op(xor, 16, RMW, ARG, ORIG, ORDER)              \
+    } else if (sizeof *(RMW) == 4) {                           \
+        atomic_xor32(RMW, ARG, ORIG, ORDER)                    \
+    } else if (sizeof *(RMW) == 8) {                           \
+        atomic_op(xor, 64, RMW, ARG, ORIG, ORDER)              \
+    } else {                                                   \
+        abort();                                               \
+    }
+
+#define atomic_compare_exchange_strong(DST, EXP, SRC)   \
+    atomic_compare_exchange_strong_explicit(DST, EXP, SRC, \
+                                            memory_order_seq_cst, \
+                                            memory_order_seq_cst)
+
+#define atomic_compare_exchange_weak atomic_compare_exchange_strong
+#define atomic_compare_exchange_weak_explicit \
+        atomic_compare_exchange_strong_explicit
+
+/* MSVC's C++ compiler implements C11 atomics, and looking through its
+ * implementation (in xatomic.h), memory orders are ignored on the x86
+ * platform.  Do the same here. */
+static inline bool
+atomic_compare_exchange8(int8_t volatile *dst, int8_t *expected, int8_t src)
+{
+    int8_t previous = _InterlockedCompareExchange8(dst, src, *expected);
+    if (previous == *expected) {
+        return true;
+    } else {
+        *expected = previous;
+        return false;
+    }
+}
+
+static inline bool
+atomic_compare_exchange16(int16_t volatile *dst, int16_t *expected,
+                          int16_t src)
+{
+    int16_t previous = InterlockedCompareExchange16(dst, src, *expected);
+    if (previous == *expected) {
+        return true;
+    } else {
+        *expected = previous;
+        return false;
+    }
+}
+
+static inline bool
+atomic_compare_exchange32(int32_t volatile *dst, int32_t *expected,
+                          int32_t src)
+{
+    int32_t previous = InterlockedCompareExchange(dst, src, *expected);
+    if (previous == *expected) {
+        return true;
+    } else {
+        *expected = previous;
+        return false;
+    }
+}
+
+static inline bool
+atomic_compare_exchange64(int64_t volatile *dst, int64_t *expected,
+                          int64_t src)
+{
+    int64_t previous = InterlockedCompareExchange64(dst, src, *expected);
+    if (previous == *expected) {
+        return true;
+    } else {
+        *expected = previous;
+        return false;
+    }
+}
+
+static inline bool
+atomic_compare_unreachable(void)
+{
+    return true;
+}
+
+#define atomic_compare_exchange_strong_explicit(DST, EXP, SRC, ORD1, ORD2)    \
+    (sizeof *(DST) == 1                                                       \
+     ? atomic_compare_exchange8((int8_t volatile *) (DST), (int8_t *) (EXP),  \
+                                (int8_t) (SRC))                               \
+     : (sizeof *(DST) == 2                                                    \
+     ? atomic_compare_exchange16((int16_t volatile *) (DST),                  \
+                                 (int16_t *) (EXP), (int16_t) (SRC))          \
+     : (sizeof *(DST) == 4                                                    \
+     ? atomic_compare_exchange32((int32_t volatile *) (DST),                  \
+                                 (int32_t *) (EXP), (int32_t) (SRC))          \
+     : (sizeof *(DST) == 8                                                    \
+     ? atomic_compare_exchange64((int64_t volatile *) (DST),                  \
+                                 (int64_t *) (EXP), (int64_t) (SRC))          \
+     : ovs_fatal(0, "atomic operation with size greater than 8 bytes"),       \
+       atomic_compare_unreachable()))))
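+
+/* Example usage (an illustrative sketch; a hypothetical lock-free increment
+ * written as a compare-and-swap loop):
+ *
+ *     static ATOMIC(uint32_t) x;
+ *     uint32_t expected, desired;
+ *
+ *     atomic_read(&x, &expected);
+ *     do {
+ *         desired = expected + 1;
+ *     } while (!atomic_compare_exchange_strong(&x, &expected, desired));
+ *         (on failure, 'expected' is updated to the value currently in 'x',
+ *          so the loop retries with fresh data)
+ */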
+
+\f
+/* atomic_flag */
+
+typedef ATOMIC(int32_t) atomic_flag;
+#define ATOMIC_FLAG_INIT 0
+
+#define atomic_flag_test_and_set(FLAG)                 \
+    (bool) InterlockedBitTestAndSet(FLAG, 0)
+
+#define atomic_flag_test_and_set_explicit(FLAG, ORDER) \
+        atomic_flag_test_and_set(FLAG)
+
+#define atomic_flag_clear_explicit(FLAG, ORDER) \
+        atomic_flag_clear(FLAG)
+#define atomic_flag_clear(FLAG)                 \
+    InterlockedBitTestAndReset(FLAG, 0)
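+
+/* Example usage (an illustrative sketch; a hypothetical test-and-set
+ * spinlock built on atomic_flag):
+ *
+ *     static atomic_flag lock = ATOMIC_FLAG_INIT;
+ *
+ *     while (atomic_flag_test_and_set(&lock)) {
+ *         continue;               (spin while the flag was already set)
+ *     }
+ *     ...critical section...
+ *     atomic_flag_clear(&lock);
+ */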
index 0f52c5f..6a21372 100644 (file)
         #include "ovs-atomic-i586.h"
     #elif HAVE_GCC4_ATOMICS
         #include "ovs-atomic-gcc4+.h"
+    #elif _MSC_VER && _M_IX86 >= 500
+        #include "ovs-atomic-msvc.h"
     #else
         /* ovs-atomic-pthreads implementation is provided for portability.
          * It might be too slow for real use because Open vSwitch is