/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* For overall documentation, see jit/AtomicOperations.h */

#ifndef jit_shared_AtomicOperations_x86_shared_h
#define jit_shared_AtomicOperations_x86_shared_h

#include "mozilla/Assertions.h"
#include "mozilla/Types.h"

#include <string.h>             // for ::memcpy, ::memmove below

// Lock-freedom on x86 and x64:
//
// On x86 and x64 there are atomic instructions for 8-byte accesses:
//
// Load and stores:
// - Loads and stores are single-copy atomic for up to 8 bytes
//   starting with the Pentium; the store requires a post-fence for
//   sequential consistency
//
// CompareExchange:
// - On x64 CMPXCHGQ can always be used
// - On x86 CMPXCHG8B can be used starting with the first Pentium
//
// Exchange:
// - On x64 XCHGQ can always be used
// - On x86 one has to use a CompareExchange loop
//
// Observe also that the JIT will not be enabled unless we have SSE2,
// which was introduced with the Pentium 4.  Ergo the JIT will be able
// to use atomic instructions for up to 8 bytes on all x86 platforms
// for the primitives we care about.
//
// However, C++ compilers and libraries may not provide access to
// those 8-byte instructions directly.  Clang in 32-bit mode does not
// provide 8-byte atomic primitives at all (even with eg -arch i686
// specified).  On Windows 32-bit, MSVC does not provide
// _InterlockedExchange64 since it does not map directly to an
// instruction.
//
// There are thus sundry workarounds below to handle known corner
// cases.

#if defined(__clang__) || defined(__GNUC__)

// The default implementation tactic for gcc/clang is to use the newer
// __atomic intrinsics added for use in C++11 <atomic>.  Where that
// isn't available, we use GCC's older __sync functions instead.
//
// ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS is kept as a backward
// compatible option for older compilers: enable this to use GCC's old
// __sync functions instead of the newer __atomic functions.  This
// will be required for GCC 4.6.x and earlier, and probably for Clang
// 3.1, should we need to use those versions.
//
// #define ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS

// Lock-free 8-byte atomics are assumed on x86 but must be disabled in
// corner cases, see comments below and in isLockfree8().

# define LOCKFREE8

// This pertains to Clang compiling with -m32: in this case the 64-bit
// __atomic builtins are not available (observed on various Mac OS X
// versions with Apple Clang and on Linux with Clang 3.5).
//
// For now just punt: disable lock-free 8-byte data.  The JIT will
// call isLockfree8() to determine what to do and will stay in sync.
// (Bug 1146817 tracks the work to improve on this.)
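//
// By way of illustration only (this sketch is not part of this header,
// and the fallback shown is hypothetical), callers needing 8-byte
// atomicity are expected to gate on the predicate rather than assume
// lock-freedom:
//
//   if (js::jit::AtomicOperations::isLockfree8())
//       js::jit::AtomicOperations::storeSeqCst(addr64, val64);  // lock-free path
//   else
//       /* take a non-lock-free fallback path */;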
# if defined(__clang__) && defined(__i386)
#  undef LOCKFREE8
# endif

inline bool
js::jit::AtomicOperations::isLockfree8()
{
# ifndef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    MOZ_ASSERT(__atomic_always_lock_free(sizeof(int8_t), 0));
    MOZ_ASSERT(__atomic_always_lock_free(sizeof(int16_t), 0));
    MOZ_ASSERT(__atomic_always_lock_free(sizeof(int32_t), 0));
# endif
# ifdef LOCKFREE8
#  ifndef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    MOZ_ASSERT(__atomic_always_lock_free(sizeof(int64_t), 0));
#  endif
    return true;
# else
    return false;
# endif
}

inline void
js::jit::AtomicOperations::fenceSeqCst()
{
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    __sync_synchronize();
# else
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::loadSeqCst(T* addr)
{
    MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    // Inhibit compiler reordering with a volatile load.  The x86 does
    // not reorder loads with respect to subsequent loads or stores
    // and no ordering barrier is required here.  See more elaborate
    // comments in storeSeqCst.
    T v = *static_cast<T volatile*>(addr);
# else
    T v;
    __atomic_load(addr, &v, __ATOMIC_SEQ_CST);
# endif
    return v;
}

# ifndef LOCKFREE8
template<>
inline int64_t
js::jit::AtomicOperations::loadSeqCst(int64_t* addr)
{
    MOZ_CRASH();
}

template<>
inline uint64_t
js::jit::AtomicOperations::loadSeqCst(uint64_t* addr)
{
    MOZ_CRASH();
}
# endif // LOCKFREE8

template<typename T>
inline void
js::jit::AtomicOperations::storeSeqCst(T* addr, T val)
{
    MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    // Inhibit compiler reordering with a volatile store.  The x86 may
    // reorder a store with respect to a subsequent load from a
    // different location, hence there is an ordering barrier here to
    // prevent that.
    //
    // By way of background, look to eg
    // http://bartoszmilewski.com/2008/11/05/who-ordered-memory-fences-on-an-x86/
    //
    // Consider:
    //
    //   uint8_t x = 0, y = 0;  // to start
    //
    // thread1:
    //   sx: AtomicOperations::store(&x, 1);
    //   gy: uint8_t obs1 = AtomicOperations::loadSeqCst(&y);
    //
    // thread2:
    //   sy: AtomicOperations::store(&y, 1);
    //   gx: uint8_t obs2 = AtomicOperations::loadSeqCst(&x);
    //
    // Sequential consistency requires a total global ordering of
    // operations: sx-gy-sy-gx, sx-sy-gx-gy, sx-sy-gy-gx, sy-gx-sx-gy,
    // sy-sx-gy-gx, or sy-sx-gx-gy.  In every ordering at least one of
    // sx-before-gx or sy-before-gy happens, so *at least one* of
    // obs1/obs2 is 1.
    //
    // If AtomicOperations::{load,store}SeqCst were just volatile
    // {load,store}, x86 could reorder gx/gy before each thread's
    // prior store.  That would permit gx-gy-sx-sy: both loads would
    // be 0!  Thus after a volatile store we must synchronize to
    // ensure the store happens before the load.
    *static_cast<T volatile*>(addr) = val;
    __sync_synchronize();
# else
    __atomic_store(addr, &val, __ATOMIC_SEQ_CST);
# endif
}

# ifndef LOCKFREE8
template<>
inline void
js::jit::AtomicOperations::storeSeqCst(int64_t* addr, int64_t val)
{
    MOZ_CRASH();
}

template<>
inline void
js::jit::AtomicOperations::storeSeqCst(uint64_t* addr, uint64_t val)
{
    MOZ_CRASH();
}
# endif // LOCKFREE8

template<typename T>
inline T
js::jit::AtomicOperations::exchangeSeqCst(T* addr, T val)
{
    MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    T v;
    do {
        // Here I assume the compiler will not hoist the load.  It
        // shouldn't, because the CAS could affect *addr.
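        //
        // Restating the loop below (no new behavior is implied): read
        // the current value, then attempt to publish 'val' only if
        // *addr still holds that value; if another thread raced us,
        // re-read and retry.  On success, the value read is the
        // pre-exchange value and is what we return.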
        v = *addr;
    } while (!__sync_bool_compare_and_swap(addr, v, val));
    return v;
# else
    T v;
    __atomic_exchange(addr, &val, &v, __ATOMIC_SEQ_CST);
    return v;
# endif
}

# ifndef LOCKFREE8
template<>
inline int64_t
js::jit::AtomicOperations::exchangeSeqCst(int64_t* addr, int64_t val)
{
    MOZ_CRASH();
}

template<>
inline uint64_t
js::jit::AtomicOperations::exchangeSeqCst(uint64_t* addr, uint64_t val)
{
    MOZ_CRASH();
}
# endif // LOCKFREE8

template<typename T>
inline T
js::jit::AtomicOperations::compareExchangeSeqCst(T* addr, T oldval, T newval)
{
    MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    return __sync_val_compare_and_swap(addr, oldval, newval);
# else
    __atomic_compare_exchange(addr, &oldval, &newval, false,
                              __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    return oldval;
# endif
}

# ifndef LOCKFREE8
template<>
inline int64_t
js::jit::AtomicOperations::compareExchangeSeqCst(int64_t* addr, int64_t oldval, int64_t newval)
{
    MOZ_CRASH();
}

template<>
inline uint64_t
js::jit::AtomicOperations::compareExchangeSeqCst(uint64_t* addr, uint64_t oldval, uint64_t newval)
{
    MOZ_CRASH();
}
# endif // LOCKFREE8

template<typename T>
inline T
js::jit::AtomicOperations::fetchAddSeqCst(T* addr, T val)
{
    static_assert(sizeof(T) <= 4, "not available for 8-byte values yet");
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    return __sync_fetch_and_add(addr, val);
# else
    return __atomic_fetch_add(addr, val, __ATOMIC_SEQ_CST);
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::fetchSubSeqCst(T* addr, T val)
{
    static_assert(sizeof(T) <= 4, "not available for 8-byte values yet");
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    return __sync_fetch_and_sub(addr, val);
# else
    return __atomic_fetch_sub(addr, val, __ATOMIC_SEQ_CST);
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::fetchAndSeqCst(T* addr, T val)
{
    static_assert(sizeof(T) <= 4, "not available for 8-byte values yet");
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    return __sync_fetch_and_and(addr, val);
# else
    return __atomic_fetch_and(addr, val, __ATOMIC_SEQ_CST);
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::fetchOrSeqCst(T* addr, T val)
{
    static_assert(sizeof(T) <= 4, "not available for 8-byte values yet");
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    return __sync_fetch_and_or(addr, val);
# else
    return __atomic_fetch_or(addr, val, __ATOMIC_SEQ_CST);
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::fetchXorSeqCst(T* addr, T val)
{
    static_assert(sizeof(T) <= 4, "not available for 8-byte values yet");
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    return __sync_fetch_and_xor(addr, val);
# else
    return __atomic_fetch_xor(addr, val, __ATOMIC_SEQ_CST);
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::loadSafeWhenRacy(T* addr)
{
    return *addr; // FIXME (1208663): not yet safe
}

template<typename T>
inline void
js::jit::AtomicOperations::storeSafeWhenRacy(T* addr, T val)
{
    *addr = val; // FIXME (1208663): not yet safe
}

inline void
js::jit::AtomicOperations::memcpySafeWhenRacy(void* dest, const void* src, size_t nbytes)
{
    ::memcpy(dest, src, nbytes); // FIXME (1208663): not yet safe
}

inline void
js::jit::AtomicOperations::memmoveSafeWhenRacy(void* dest, const void* src, size_t nbytes)
{
    ::memmove(dest, src, nbytes); // FIXME (1208663): not yet safe
}

template<size_t nbytes>
inline void
js::jit::RegionLock::acquire(void* addr)
{
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    while (!__sync_bool_compare_and_swap(&spinlock, 0, 1))
        continue;
# else
    uint32_t zero = 0;
    uint32_t one = 1;
    while (!__atomic_compare_exchange(&spinlock, &zero, &one, false,
                                      __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE)) {
        zero = 0;
        continue;
    }
# endif
}

template<size_t nbytes>
inline void
js::jit::RegionLock::release(void* addr)
{
    MOZ_ASSERT(AtomicOperations::loadSeqCst(&spinlock) == 1, "releasing unlocked region lock");
# ifdef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
    __sync_sub_and_fetch(&spinlock, 1); // Should turn into LOCK XADD
# else
    uint32_t zero = 0;
    __atomic_store(&spinlock, &zero, __ATOMIC_SEQ_CST);
# endif
}

# undef ATOMICS_IMPLEMENTED_WITH_SYNC_INTRINSICS
# undef LOCKFREE8

#elif defined(_MSC_VER)

// On 32-bit CPUs there is no 64-bit XCHG instruction, one must
// instead use a loop with CMPXCHG8B.  Since MSVC provides
// _InterlockedExchange64 only if it maps directly to XCHG, the
// workaround must be manual.

# define HAVE_EXCHANGE64

# if !_WIN64
#  undef HAVE_EXCHANGE64
# endif

// Below, _ReadWriteBarrier is a compiler directive, preventing
// reordering of instructions and reuse of memory values across it.

inline bool
js::jit::AtomicOperations::isLockfree8()
{
    // See general comments at the start of this file.
    //
    // The MSDN docs suggest very strongly that if code is compiled for
    // Pentium or better the 64-bit primitives will be lock-free, see
    // eg the "Remarks" section of the page for _InterlockedCompareExchange64,
    // currently here:
    // https://msdn.microsoft.com/en-us/library/ttk2z1ws%28v=vs.85%29.aspx
    //
    // But I've found no way to assert that at compile time or run time,
    // there appears to be no WinAPI is_lock_free() test.
    return true;
}

inline void
js::jit::AtomicOperations::fenceSeqCst()
{
    _ReadWriteBarrier();
# if JS_BITS_PER_WORD == 32
    // If configured for SSE2+ we can use the MFENCE instruction, available
    // through the _mm_mfence intrinsic.  But for non-SSE2 systems we have
    // to do something else.  Linux uses "lock add [esp], 0", so why not?
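    //
    // (Why this works: any LOCK-prefixed read-modify-write is a full
    // barrier on x86, draining the store buffer just as MFENCE would,
    // and adding 0 to the word at [esp] leaves memory unchanged.)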
    __asm lock add [esp], 0;
# else
    _mm_mfence();
# endif
}

template<typename T>
inline T
js::jit::AtomicOperations::loadSeqCst(T* addr)
{
    MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());
    _ReadWriteBarrier();
    T v = *addr;
    _ReadWriteBarrier();
    return v;
}

template<typename T>
inline void
js::jit::AtomicOperations::storeSeqCst(T* addr, T val)
{
    MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());
    _ReadWriteBarrier();
    *addr = val;
    fenceSeqCst();
}

# define MSC_EXCHANGEOP(T, U, xchgop)                               \
    template<> inline T                                             \
    js::jit::AtomicOperations::exchangeSeqCst(T* addr, T val) {     \
        MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());                 \
        return (T)xchgop((U volatile*)addr, (U)val);                \
    }

// The CAS intrinsic returns the initial value of *addr, so retry until
// that initial value matches the value we read before the CAS.
# define MSC_EXCHANGEOP_CAS(T, U, cmpxchg)                                       \
    template<> inline T                                                          \
    js::jit::AtomicOperations::exchangeSeqCst(T* addr, T newval) {               \
        MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());                              \
        T oldval;                                                                \
        do {                                                                     \
            _ReadWriteBarrier();                                                 \
            oldval = *addr;                                                      \
        } while (cmpxchg((U volatile*)addr, (U)newval, (U)oldval) != (U)oldval); \
        return oldval;                                                           \
    }

MSC_EXCHANGEOP(int8_t, char, _InterlockedExchange8)
MSC_EXCHANGEOP(uint8_t, char, _InterlockedExchange8)
MSC_EXCHANGEOP(int16_t, short, _InterlockedExchange16)
MSC_EXCHANGEOP(uint16_t, short, _InterlockedExchange16)
MSC_EXCHANGEOP(int32_t, long, _InterlockedExchange)
MSC_EXCHANGEOP(uint32_t, long, _InterlockedExchange)
# ifdef HAVE_EXCHANGE64
MSC_EXCHANGEOP(int64_t, __int64, _InterlockedExchange64)
MSC_EXCHANGEOP(uint64_t, __int64, _InterlockedExchange64)
# else
MSC_EXCHANGEOP_CAS(int64_t, __int64, _InterlockedCompareExchange64)
MSC_EXCHANGEOP_CAS(uint64_t, __int64, _InterlockedCompareExchange64)
# endif

# undef MSC_EXCHANGEOP
# undef MSC_EXCHANGEOP_CAS

# define MSC_CAS(T, U, cmpxchg)                                                     \
    template<> inline T                                                             \
    js::jit::AtomicOperations::compareExchangeSeqCst(T* addr, T oldval, T newval) { \
        MOZ_ASSERT(sizeof(T) < 8 || isLockfree8());                                 \
        return (T)cmpxchg((U volatile*)addr, (U)newval, (U)oldval);                 \
    }

MSC_CAS(int8_t, char, _InterlockedCompareExchange8)
MSC_CAS(uint8_t, char, _InterlockedCompareExchange8)
MSC_CAS(int16_t, short, _InterlockedCompareExchange16)
MSC_CAS(uint16_t, short, _InterlockedCompareExchange16)
MSC_CAS(int32_t, long, _InterlockedCompareExchange)
MSC_CAS(uint32_t, long, _InterlockedCompareExchange)
MSC_CAS(int64_t, __int64, _InterlockedCompareExchange64)
MSC_CAS(uint64_t, __int64, _InterlockedCompareExchange64)

# undef MSC_CAS

# define MSC_FETCHADDOP(T, U, xadd)                                           \
    template<> inline T                                                       \
    js::jit::AtomicOperations::fetchAddSeqCst(T* addr, T val) {               \
        static_assert(sizeof(T) <= 4, "not available for 8-byte values yet"); \
        return (T)xadd((U volatile*)addr, (U)val);                            \
    }                                                                         \
    template<> inline T                                                       \
    js::jit::AtomicOperations::fetchSubSeqCst(T* addr, T val) {               \
        static_assert(sizeof(T) <= 4, "not available for 8-byte values yet"); \
        return (T)xadd((U volatile*)addr, -(U)val);                           \
    }

MSC_FETCHADDOP(int8_t, char, _InterlockedExchangeAdd8)
MSC_FETCHADDOP(uint8_t, char, _InterlockedExchangeAdd8)
MSC_FETCHADDOP(int16_t, short, _InterlockedExchangeAdd16)
MSC_FETCHADDOP(uint16_t, short, _InterlockedExchangeAdd16)
MSC_FETCHADDOP(int32_t, long, _InterlockedExchangeAdd)
MSC_FETCHADDOP(uint32_t, long, _InterlockedExchangeAdd)

# undef MSC_FETCHADDOP

# define MSC_FETCHBITOP(T, U, andop, orop, xorop)                             \
    template<> inline T                                                       \
    js::jit::AtomicOperations::fetchAndSeqCst(T* addr, T val) {               \
        static_assert(sizeof(T) <= 4, "not available for 8-byte values yet"); \
        return (T)andop((U volatile*)addr, (U)val);                           \
    }                                                                         \
    template<> inline T                                                       \
    js::jit::AtomicOperations::fetchOrSeqCst(T* addr, T val) {                \
        static_assert(sizeof(T) <= 4, "not available for 8-byte values yet"); \
        return (T)orop((U volatile*)addr, (U)val);                            \
    }                                                                         \
    template<> inline T                                                       \
    js::jit::AtomicOperations::fetchXorSeqCst(T* addr, T val) {               \
        static_assert(sizeof(T) <= 4, "not available for 8-byte values yet"); \
        return (T)xorop((U volatile*)addr, (U)val);                           \
    }

MSC_FETCHBITOP(int8_t, char, _InterlockedAnd8, _InterlockedOr8, _InterlockedXor8)
MSC_FETCHBITOP(uint8_t, char, _InterlockedAnd8, _InterlockedOr8, _InterlockedXor8)
MSC_FETCHBITOP(int16_t, short, _InterlockedAnd16, _InterlockedOr16, _InterlockedXor16)
MSC_FETCHBITOP(uint16_t, short, _InterlockedAnd16, _InterlockedOr16, _InterlockedXor16)
MSC_FETCHBITOP(int32_t, long, _InterlockedAnd, _InterlockedOr, _InterlockedXor)
MSC_FETCHBITOP(uint32_t, long, _InterlockedAnd, _InterlockedOr, _InterlockedXor)

# undef MSC_FETCHBITOP

template<typename T>
inline T
js::jit::AtomicOperations::loadSafeWhenRacy(T* addr)
{
    return *addr; // FIXME (1208663): not yet safe
}

template<typename T>
inline void
js::jit::AtomicOperations::storeSafeWhenRacy(T* addr, T val)
{
    *addr = val; // FIXME (1208663): not yet safe
}

inline void
js::jit::AtomicOperations::memcpySafeWhenRacy(void* dest, const void* src, size_t nbytes)
{
    ::memcpy(dest, src, nbytes); // FIXME (1208663): not yet safe
}

inline void
js::jit::AtomicOperations::memmoveSafeWhenRacy(void* dest, const void* src, size_t nbytes)
{
    ::memmove(dest, src, nbytes); // FIXME (1208663): not yet safe
}

template<size_t nbytes>
inline void
js::jit::RegionLock::acquire(void* addr)
{
    while (_InterlockedCompareExchange((long*)&spinlock, /*newval=*/1, /*oldval=*/0) == 1)
        continue;
}

template<size_t nbytes>
inline void
js::jit::RegionLock::release(void* addr)
{
    MOZ_ASSERT(AtomicOperations::loadSeqCst(&spinlock) == 1, "releasing unlocked region lock");
    _InterlockedExchange((long*)&spinlock, 0);
}

# undef HAVE_EXCHANGE64

#elif defined(ENABLE_SHARED_ARRAY_BUFFER)

# error "Either disable JS shared memory at compile time, use GCC, Clang, or MSVC, or add code here"

#endif // platform

#endif // jit_shared_AtomicOperations_x86_shared_h