/*
* Copyright (c) 2023, Alibaba Group Holding Limited;
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free
// queue. An overview, including benchmark results, is provided here:
// http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
// The full design is also described in excruciating detail at:
// http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
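// A minimal usage sketch (shown as a comment; the queue class itself is
// defined further below in namespace moodycamel):
//
//   moodycamel::ConcurrentQueue<int> q;
//   q.enqueue(25);
//   int item;
//   bool found = q.try_dequeue(item);  // on success, item == 25
//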
// Simplified BSD license:
// Copyright (c) 2013-2020, Cameron Desrochers.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// - Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
// - Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// Also dual-licensed under the Boost Software License (see LICENSE.md)
#pragma once
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
// Traits::index_t are set to < 32 bits, causing integer promotion, causing
// warnings upon assigning any computed values)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#ifdef MCDBGQ_USE_RELACY
#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
#endif
#endif
#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
// VS2019 with /W4 warns about constant conditional expressions, but without
// /std:c++17 or higher `if constexpr` is unavailable, so we have no choice
// but to simply disable the warning
#pragma warning(push)
#pragma warning(disable : 4127) // conditional expression is constant
#endif
#if defined(__APPLE__)
#include "TargetConditionals.h"
#endif
#ifdef MCDBGQ_USE_RELACY
#include "relacy/relacy_std.hpp"
#include "relacy_shims.h"
// We only use malloc/free anyway, and the delete macro messes up `= delete`
// method declarations. We'll override the default trait malloc ourselves
// without a macro.
#undef new
#undef delete
#undef malloc
#undef free
#else
#include <atomic> // Requires C++11. Sorry VS2010.
#include <cassert>
#endif
#include <algorithm>
#include <array>
#include <climits> // for CHAR_BIT
#include <cstddef> // for max_align_t
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <mutex> // used for thread exit synchronization
#include <thread> // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
#include <type_traits>
#include <utility>
// Platform-specific definitions of a numeric thread ID type and an invalid
// value
namespace moodycamel {
namespace details {
template <typename thread_id_t>
struct thread_id_converter {
typedef thread_id_t thread_id_numeric_size_t;
typedef thread_id_t thread_id_hash_t;
static thread_id_hash_t prehash(thread_id_t const& x) { return x; }
};
} // namespace details
} // namespace moodycamel
#if defined(MCDBGQ_USE_RELACY)
namespace moodycamel {
namespace details {
typedef std::uint32_t thread_id_t;
static const thread_id_t invalid_thread_id = 0xFFFFFFFFU;
static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;
static inline thread_id_t thread_id() { return rl::thread_index(); }
} // namespace details
} // namespace moodycamel
#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
// No sense pulling in windows.h in a header; we'll manually declare the
// function we use and rely on backwards compatibility for this not to break
extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(
void);
namespace moodycamel {
namespace details {
static_assert(sizeof(unsigned long) == sizeof(std::uint32_t),
"Expected size of unsigned long to be 32 bits on Windows");
typedef std::uint32_t thread_id_t;
static const thread_id_t invalid_thread_id =
0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
static const thread_id_t invalid_thread_id2 =
0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used
// in practice. Note that all Win32 thread IDs are presently
// multiples of 4.
static inline thread_id_t thread_id() {
return static_cast<thread_id_t>(::GetCurrentThreadId());
}
} // namespace details
} // namespace moodycamel
#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
(defined(__APPLE__) && TARGET_OS_IPHONE) || \
defined(MOODYCAMEL_NO_THREAD_LOCAL)
namespace moodycamel {
namespace details {
static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8,
"std::thread::id is expected to be either 4 or 8 bytes");
typedef std::thread::id thread_id_t;
static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID
// Note we don't define an invalid_thread_id2 since std::thread::id doesn't
// have one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is
// defined anyway, which it won't be.
static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
template <std::size_t>
struct thread_id_size {};
template <>
struct thread_id_size<4> {
typedef std::uint32_t numeric_t;
};
template <>
struct thread_id_size<8> {
typedef std::uint64_t numeric_t;
};
template <>
struct thread_id_converter<thread_id_t> {
typedef thread_id_size<sizeof(thread_id_t)>::numeric_t
thread_id_numeric_size_t;
#ifndef __APPLE__
typedef std::size_t thread_id_hash_t;
#else
typedef thread_id_numeric_size_t thread_id_hash_t;
#endif
static thread_id_hash_t prehash(thread_id_t const& x) {
#ifndef __APPLE__
return std::hash<std::thread::id>()(x);
#else
return *reinterpret_cast<thread_id_hash_t const*>(&x);
#endif
}
};
}  // namespace details
}  // namespace moodycamel
#else
// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
// In order to get a numeric thread ID in a platform-independent way, we use a
// thread-local static variable's address as a thread identifier :-)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define MOODYCAMEL_THREADLOCAL __thread
#elif defined(_MSC_VER)
#define MOODYCAMEL_THREADLOCAL __declspec(thread)
#else
// Assume C++11 compliant compiler
#define MOODYCAMEL_THREADLOCAL thread_local
#endif
namespace moodycamel {
namespace details {
typedef std::uintptr_t thread_id_t;
static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr
static const thread_id_t invalid_thread_id2 =
1; // Member accesses off a null pointer are also generally invalid. Plus
// it's not aligned.
inline thread_id_t thread_id() {
static MOODYCAMEL_THREADLOCAL int x;
return reinterpret_cast<thread_id_t>(&x);
}
}  // namespace details
}  // namespace moodycamel
#endif
// Constexpr if
#ifndef MOODYCAMEL_CONSTEXPR_IF
#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \
__cplusplus > 201402L
#define MOODYCAMEL_CONSTEXPR_IF if constexpr
#define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]]
#else
#define MOODYCAMEL_CONSTEXPR_IF if
#define MOODYCAMEL_MAYBE_UNUSED
#endif
#endif
// Exceptions
#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \
(defined(__GNUC__) && defined(__EXCEPTIONS)) || \
(!defined(_MSC_VER) && !defined(__GNUC__))
#define MOODYCAMEL_EXCEPTIONS_ENABLED
#endif
#endif
#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
#define MOODYCAMEL_TRY try
#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__)
#define MOODYCAMEL_RETHROW throw
#define MOODYCAMEL_THROW(expr) throw(expr)
#else
#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true)
#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false)
#define MOODYCAMEL_RETHROW
#define MOODYCAMEL_THROW(expr)
#endif
#ifndef MOODYCAMEL_NOEXCEPT
#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
#define MOODYCAMEL_NOEXCEPT
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when
// it shouldn't :-( We have to assume *all* non-trivial constructors may throw
// on VS2012!
#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \
(std::is_rvalue_reference<valueType>::value && \
std::is_move_constructible<type>::value \
? std::is_trivially_move_constructible<type>::value \
: std::is_trivially_copy_constructible<type>::value)
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \
((std::is_rvalue_reference<valueType>::value && \
std::is_move_assignable<type>::value \
? std::is_trivially_move_assignable<type>::value || \
std::is_nothrow_move_assignable<type>::value \
: std::is_trivially_copy_assignable<type>::value || \
std::is_nothrow_copy_assignable<type>::value) && \
MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \
(std::is_rvalue_reference<valueType>::value && \
std::is_move_constructible<type>::value \
? std::is_trivially_move_constructible<type>::value || \
std::is_nothrow_move_constructible<type>::value \
: std::is_trivially_copy_constructible<type>::value || \
std::is_nothrow_copy_constructible<type>::value)
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \
((std::is_rvalue_reference<valueType>::value && \
std::is_move_assignable<type>::value \
? std::is_trivially_move_assignable<type>::value || \
std::is_nothrow_move_assignable<type>::value \
: std::is_trivially_copy_assignable<type>::value || \
std::is_nothrow_copy_assignable<type>::value) && \
MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
#else
#define MOODYCAMEL_NOEXCEPT noexcept
#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
#endif
#endif
#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
#ifdef MCDBGQ_USE_RELACY
#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
#else
// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has
// a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445.
// g++ <= 4.7 doesn't support thread_local either. Finally, iOS/ARM doesn't
// have support for it either, and g++/ARM allows it to compile but it's
// unconfirmed to actually work
#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \
(!defined(__MINGW32__) && !defined(__MINGW64__) || \
!defined(__WINPTHREADS_VERSION)) && \
(!defined(__GNUC__) || __GNUC__ > 4 || \
(__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \
(!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \
!defined(_M_ARM) && !defined(__aarch64__)
// Assume `thread_local` is fully supported in all other C++11
// compilers/platforms
#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now;
// years ago several users
// reported having problems
// with it
#endif
#endif
#endif
// VS2012 doesn't support deleted functions.
// In this case, we declare the function normally but don't define it. A link
// error will be generated if the function is called.
#ifndef MOODYCAMEL_DELETE_FUNCTION
#if defined(_MSC_VER) && _MSC_VER < 1800
#define MOODYCAMEL_DELETE_FUNCTION
#else
#define MOODYCAMEL_DELETE_FUNCTION = delete
#endif
#endif
namespace moodycamel {
namespace details {
#ifndef MOODYCAMEL_ALIGNAS
// VS2013 doesn't support alignas or alignof, and align() requires a constant
// literal
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment))
#define MOODYCAMEL_ALIGNOF(obj) __alignof(obj)
#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \
typename details::Vs2013Aligned<std::alignment_of<obj>::value, T>::type
template <int Align, typename T>
struct Vs2013Aligned {}; // default, unsupported alignment
template <typename T>
struct Vs2013Aligned<1, T> {
typedef __declspec(align(1)) T type;
};
template <typename T>
struct Vs2013Aligned<2, T> {
typedef __declspec(align(2)) T type;
};
template <typename T>
struct Vs2013Aligned<4, T> {
typedef __declspec(align(4)) T type;
};
template <typename T>
struct Vs2013Aligned<8, T> {
typedef __declspec(align(8)) T type;
};
template <typename T>
struct Vs2013Aligned<16, T> {
typedef __declspec(align(16)) T type;
};
template <typename T>
struct Vs2013Aligned<32, T> {
typedef __declspec(align(32)) T type;
};
template <typename T>
struct Vs2013Aligned<64, T> {
typedef __declspec(align(64)) T type;
};
template <typename T>
struct Vs2013Aligned<128, T> {
typedef __declspec(align(128)) T type;
};
template <typename T>
struct Vs2013Aligned<256, T> {
typedef __declspec(align(256)) T type;
};
#else
template <typename T>
struct identity {
typedef T type;
};
#define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment)
#define MOODYCAMEL_ALIGNOF(obj) alignof(obj)
#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \
alignas(alignof(obj)) typename details::identity<T>::type
#endif
#endif
} // namespace details
} // namespace moodycamel
// TSAN can false report races in lock-free code. To enable TSAN to be used
// from projects that use this one, we can apply per-function compile-time
// suppression. See
// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer
#define MOODYCAMEL_NO_TSAN
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#undef MOODYCAMEL_NO_TSAN
#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread")))
#endif // TSAN
#endif // TSAN
// Compiler-specific likely/unlikely hints
namespace moodycamel {
namespace details {
#if defined(__GNUC__)
static inline bool(likely)(bool x) { return __builtin_expect((x), true); }
static inline bool(unlikely)(bool x) { return __builtin_expect((x), false); }
#else
static inline bool(likely)(bool x) { return x; }
static inline bool(unlikely)(bool x) { return x; }
#endif
} // namespace details
} // namespace moodycamel
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
#include "internal/concurrentqueue_internal_debug.h"
#endif
namespace moodycamel {
namespace details {
template <typename T>
struct const_numeric_max {
static_assert(std::is_integral<T>::value,
"const_numeric_max can only be used with integers");
static const T value =
std::numeric_limits<T>::is_signed
? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) -
static_cast<T>(1)
: static_cast<T>(-1);
};
#if defined(__GLIBCXX__)
typedef ::max_align_t
std_max_align_t; // libstdc++ forgot to add it to std:: for a while
#else
typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can
// *only* be accessed via std::
#endif
// Some platforms have incorrectly set max_align_t to a type with <8 bytes
// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit
// iOS). Work around this with our own union. See issue #64.
typedef union {
std_max_align_t x;
long long y;
void* z;
} max_align_t;
} // namespace details
// Default traits for the ConcurrentQueue. To change some of the
// traits without re-implementing all of them, inherit from this
// struct and shadow the declarations you wish to be different;
// since the traits are used as a template type parameter, the
// shadowed declarations will be used where defined, and the defaults
// otherwise.
struct ConcurrentQueueDefaultTraits {
// General-purpose size type. std::size_t is strongly recommended.
typedef std::size_t size_t;
// The type used for the enqueue and dequeue indices. Must be at least as
// large as size_t. Should be significantly larger than the number of elements
// you expect to hold at once, especially if you have a high turnover rate;
// for example, on 32-bit x86, if you expect to have over a hundred million
// elements or pump several million elements through your queue in a very
// short space of time, using a 32-bit type *may* trigger a race condition.
// A 64-bit int type is recommended in that case, and in practice will
// prevent a race condition no matter the usage of the queue. Note that
// whether the queue is lock-free with a 64-bit int type depends on whether
// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
typedef std::size_t index_t;
// Internally, all elements are enqueued and dequeued from multi-element
// blocks; this is the smallest controllable unit. If you expect few elements
// but many producers, a smaller block size should be favoured. For few
// producers and/or many elements, a larger block size is preferred. A sane
// default is provided. Must be a power of 2.
static const size_t QUEUE_BLOCK_SIZE = 32;
// For explicit producers (i.e. when using a producer token), the block is
// checked for being empty by iterating through a list of flags, one per
// element. For large block sizes, this is too inefficient, and switching to
// an atomic counter-based approach is faster. The switch is made for block
// sizes strictly larger than this threshold.
static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
// How many full blocks can be expected for a single explicit producer? This
// should reflect that number's maximum for optimal performance. Must be a
// power of 2.
static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
// How many full blocks can be expected for a single implicit producer? This
// should reflect that number's maximum for optimal performance. Must be a
// power of 2.
static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
// The initial size of the hash table mapping thread IDs to implicit
// producers. Note that the hash is resized every time it becomes half full.
// Must be a power of two, and either 0 or at least 1. If 0, implicit
// production (using the enqueue methods without an explicit producer token)
// is disabled.
static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
// Controls the number of items that an explicit consumer (i.e. one with a
// token) must consume before it causes all consumers to rotate and move on to
// the next internal queue.
static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
256;
// The maximum number of elements (inclusive) that can be enqueued to a
// sub-queue. Enqueue operations that would cause this limit to be surpassed
// will fail. Note that this limit is enforced at the block level (for
// performance reasons), i.e. it's rounded up to the nearest block size.
static const size_t MAX_SUBQUEUE_SIZE =
details::const_numeric_max<size_t>::value;
// The number of times to spin before sleeping when waiting on a semaphore.
// Recommended values are on the order of 1000-10000 unless the number of
// consumer threads exceeds the number of idle cores (in which case try
// 0-100). Only affects instances of the BlockingConcurrentQueue.
static const int MAX_SEMA_SPINS = 10000;
// Whether to recycle dynamically-allocated blocks into an internal free list
// or not. If false, only pre-allocated blocks (controlled by the constructor
// arguments) will be recycled, and all others will be `free`d back to the
// heap. Note that blocks consumed by explicit producers are only freed on
// destruction of the queue (not following destruction of the token)
// regardless of this trait.
static const bool RECYCLE_ALLOCATED_BLOCKS = false;
#ifndef MCDBGQ_USE_RELACY
// Memory allocation can be customized if needed.
// malloc should return nullptr on failure, and handle alignment like
// std::malloc.
#if defined(malloc) || defined(free)
// Gah, this is 2015, stop defining macros that break standard code already!
// Work around malloc/free being special macros:
static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
static inline void*(malloc)(size_t size) { return WORKAROUND_malloc(size); }
static inline void(free)(void* ptr) { return WORKAROUND_free(ptr); }
#else
static inline void* malloc(size_t size) { return std::malloc(size); }
static inline void free(void* ptr) { return std::free(ptr); }
#endif
#else
// Debug versions when running under the Relacy race detector (ignore
// these in user code)
static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
#endif
};
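// For example, to change only the block size while keeping the remaining
// defaults, inherit from the traits struct above and shadow the member you
// want to override (a commented sketch; `MyTraits` and the value 256 are
// illustrative only):
//
//   struct MyTraits : public ConcurrentQueueDefaultTraits {
//     static const size_t QUEUE_BLOCK_SIZE = 256;  // must be a power of 2
//   };
//   ConcurrentQueue<int, MyTraits> q;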
// When producing or consuming many elements, the most efficient way is to:
// 1) Use one of the bulk-operation methods of the queue with a token
// 2) Failing that, use the bulk-operation methods without a token
// 3) Failing that, create a token and use that with the single-item methods
// 4) Failing that, use the single-parameter methods of the queue
// Having said that, don't create tokens willy-nilly -- ideally there should be
// a maximum of one token per thread (of each kind).
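// A commented sketch of that ordering (names and sizes are illustrative):
//
//   ConcurrentQueue<int> q;
//   ProducerToken ptok(q);
//   int items[64] = {};
//   q.enqueue_bulk(ptok, items, 64);   // 1) bulk + token (preferred)
//   q.enqueue_bulk(items, 64);         // 2) bulk, no token
//   q.enqueue(ptok, 42);               // 3) single item + token
//   q.enqueue(42);                     // 4) single item, no token
//
//   ConsumerToken ctok(q);
//   int out[64];
//   std::size_t n = q.try_dequeue_bulk(ctok, out, 64);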
struct ProducerToken;
struct ConsumerToken;
template <typename T, typename Traits>
class ConcurrentQueue;
template <typename T, typename Traits>
class BlockingConcurrentQueue;
class ConcurrentQueueTests;
namespace details {
struct ConcurrentQueueProducerTypelessBase {
ConcurrentQueueProducerTypelessBase* next;
std::atomic<bool> inactive;
ProducerToken* token;
ConcurrentQueueProducerTypelessBase()
: next(nullptr), inactive(false), token(nullptr) {}
};
template <bool use32>
struct _hash_32_or_64 {
static inline std::uint32_t hash(std::uint32_t h) {
// MurmurHash3 finalizer -- see
// https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
// Since the thread ID is already unique, all we really want to do is
// propagate that uniqueness evenly across all the bits, so that we can use
// a subset of the bits while reducing collisions significantly
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
return h ^ (h >> 16);
}
};
template <>
struct _hash_32_or_64<1> {
static inline std::uint64_t hash(std::uint64_t h) {
h ^= h >> 33;
h *= 0xff51afd7ed558ccd;
h ^= h >> 33;
h *= 0xc4ceb9fe1a85ec53;
return h ^ (h >> 33);
}
};
template <std::size_t size>
struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {};
static inline size_t hash_thread_id(thread_id_t id) {
static_assert(
sizeof(thread_id_t) <= 8,
"Expected a platform where thread IDs are at most 64-bit values");
return static_cast<size_t>(
hash_32_or_64<sizeof(
thread_id_converter<thread_id_t>::thread_id_hash_t)>::
hash(thread_id_converter<thread_id_t>::prehash(id)));
}
template <typename T>
static inline bool circular_less_than(T a, T b) {
static_assert(
std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
"circular_less_than is intended to be used only with unsigned integer "
"types");
return static_cast<T>(a - b) >
static_cast<T>(static_cast<T>(1)
<< (static_cast<T>(sizeof(T) * CHAR_BIT - 1)));
// Note: extra parens around rhs of operator<< is MSVC bug:
// https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931
// silencing the bug requires #pragma warning(disable: 4554) around the
// calling code and has no effect when done here.
}
template <typename U>
static inline char* align_for(char* ptr) {
const std::size_t alignment = std::alignment_of<U>::value;
return ptr +
(alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) %
alignment;
}
template <typename T>
static inline T ceil_to_pow_2(T x) {
static_assert(
std::is_integral<T>::value && !std::numeric_limits<T>::is_signed,
"ceil_to_pow_2 is intended to be used only with unsigned integer types");
// Adapted from
// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
for (std::size_t i = 1; i < sizeof(T); i <<= 1) {
x |= x >> (i << 3);
}
++x;
return x;
}
template <typename T>
static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right) {
T temp = std::move(left.load(std::memory_order_relaxed));
left.store(std::move(right.load(std::memory_order_relaxed)),
std::memory_order_relaxed);
right.store(std::move(temp), std::memory_order_relaxed);
}
template <typename T>
static inline T const& nomove(T const& x) {
return x;
}
template <bool Enable>
struct nomove_if {
template <typename T>
static inline T const& eval(T const& x) {
return x;
}
};
template <>
struct nomove_if<false> {
template <typename U>
static inline auto eval(U&& x) -> decltype(std::forward<U>(x)) {
return std::forward<U>(x);
}
};
template <typename It>
static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) {
return *it;
}
#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \
(__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
template <typename T>
struct is_trivially_destructible : std::is_trivially_destructible<T> {};
#else
template <typename T>
struct is_trivially_destructible : std::has_trivial_destructor<T> {};
#endif
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
#ifdef MCDBGQ_USE_RELACY
typedef RelacyThreadExitListener ThreadExitListener;
typedef RelacyThreadExitNotifier ThreadExitNotifier;
#else
class ThreadExitNotifier;
struct ThreadExitListener {
typedef void (*callback_t)(void*);
callback_t callback;
void* userData;
ThreadExitListener* next; // reserved for use by the ThreadExitNotifier
ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier
};
class ThreadExitNotifier {
public:
static void subscribe(ThreadExitListener* listener) {
auto& tlsInst = instance();
std::lock_guard<std::mutex> guard(mutex());
listener->next = tlsInst.tail;
listener->chain = &tlsInst;
tlsInst.tail = listener;
}
static void unsubscribe(ThreadExitListener* listener) {
std::lock_guard<std::mutex> guard(mutex());
if (!listener->chain) {
return; // race with ~ThreadExitNotifier
}
auto& tlsInst = *listener->chain;
listener->chain = nullptr;
ThreadExitListener** prev = &tlsInst.tail;
for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
if (ptr == listener) {
*prev = ptr->next;
break;
}
prev = &ptr->next;
}
}
private:
ThreadExitNotifier() : tail(nullptr) {}
ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
ThreadExitNotifier& operator=(ThreadExitNotifier const&)
MOODYCAMEL_DELETE_FUNCTION;
~ThreadExitNotifier() {
// This thread is about to exit, let everyone know!
assert(this == &instance() &&
"If this assert fails, you likely have a buggy compiler! Change the "
"preprocessor conditions such that "
"MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
std::lock_guard<std::mutex> guard(mutex());
for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
ptr->chain = nullptr;
ptr->callback(ptr->userData);
}
}
// Thread-local
static inline ThreadExitNotifier& instance() {
static thread_local ThreadExitNotifier notifier;
return notifier;
}
static inline std::mutex& mutex() {
// Must be static because the ThreadExitNotifier could be destroyed while
// unsubscribe is called
static std::mutex mutex;
return mutex;
}
private:
ThreadExitListener* tail;
};
#endif
#endif
template <typename T>
struct static_is_lock_free_num {
enum { value = 0 };
};
template <>
struct static_is_lock_free_num<signed char> {
enum { value = ATOMIC_CHAR_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<short> {
enum { value = ATOMIC_SHORT_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<int> {
enum { value = ATOMIC_INT_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<long> {
enum { value = ATOMIC_LONG_LOCK_FREE };
};
template <>
struct static_is_lock_free_num<long long> {
enum { value = ATOMIC_LLONG_LOCK_FREE };
};
template <typename T>
struct static_is_lock_free
: static_is_lock_free_num<typename std::make_signed<T>::type> {};
template <>
struct static_is_lock_free<bool> {
enum { value = ATOMIC_BOOL_LOCK_FREE };
};
template <typename U>
struct static_is_lock_free<U*> {
enum { value = ATOMIC_POINTER_LOCK_FREE };
};
} // namespace details
struct ProducerToken {
template <typename T, typename Traits>
explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
template <typename T, typename Traits>
explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
: producer(other.producer) {
other.producer = nullptr;
if (producer != nullptr) {
producer->token = this;
}
}
inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT {
swap(other);
return *this;
}
void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT {
std::swap(producer, other.producer);
if (producer != nullptr) {
producer->token = this;
}
if (other.producer != nullptr) {
other.producer->token = &other;
}
}
// A token is always valid unless:
// 1) Memory allocation failed during construction
// 2) It was moved via the move constructor
// (Note: assignment does a swap, leaving both potentially valid)
// 3) The associated queue was destroyed
// Note that if valid() returns true, that only indicates
// that the token is valid for use with a specific queue,
// but not which one; that's up to the user to track.
inline bool valid() const { return producer != nullptr; }
~ProducerToken() {
if (producer != nullptr) {
producer->token = nullptr;
producer->inactive.store(true, std::memory_order_release);
}
}
// Disable copying and assignment
ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
private:
template <typename T, typename Traits>
friend class ConcurrentQueue;
friend class ConcurrentQueueTests;
protected:
details::ConcurrentQueueProducerTypelessBase* producer;
};
struct ConsumerToken {
template <typename T, typename Traits>
explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
template <typename T, typename Traits>
explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
: initialOffset(other.initialOffset),
lastKnownGlobalOffset(other.lastKnownGlobalOffset),
itemsConsumedFromCurrent(other.itemsConsumedFromCurrent),
currentProducer(other.currentProducer),
desiredProducer(other.desiredProducer) {}
inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT {
swap(other);
return *this;
}
void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT {
std::swap(initialOffset, other.initialOffset);
std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
std::swap(currentProducer, other.currentProducer);
std::swap(desiredProducer, other.desiredProducer);
}
// Disable copying and assignment
ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
private:
template <typename T, typename Traits>
friend class ConcurrentQueue;
friend class ConcurrentQueueTests;
private: // but shared with ConcurrentQueue
std::uint32_t initialOffset;
std::uint32_t lastKnownGlobalOffset;
std::uint32_t itemsConsumedFromCurrent;
details::ConcurrentQueueProducerTypelessBase* currentProducer;
details::ConcurrentQueueProducerTypelessBase* desiredProducer;
};
// Need to forward-declare this swap because it's in a namespace.
// See
// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
template <typename T, typename Traits>
inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
MOODYCAMEL_NOEXCEPT;
template <typename T, typename Traits = ConcurrentQueueDefaultTraits>
class ConcurrentQueue {
public:
typedef ::moodycamel::ProducerToken producer_token_t;
typedef ::moodycamel::ConsumerToken consumer_token_t;
typedef typename Traits::index_t index_t;
typedef typename Traits::size_t size_t;
static const size_t QUEUE_BLOCK_SIZE =
static_cast<size_t>(Traits::QUEUE_BLOCK_SIZE);
static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD =
static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
static const size_t EXPLICIT_INITIAL_INDEX_SIZE =
static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
static const size_t IMPLICIT_INITIAL_INDEX_SIZE =
static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE =
static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);
static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE =
static_cast<std::uint32_t>(
Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4307) // + integral constant overflow (that's what
// the ternary expression is for!)
#pragma warning(disable : 4309) // static_cast: Truncation of constant value
#endif
static const size_t MAX_SUBQUEUE_SIZE =
(details::const_numeric_max<size_t>::value -
static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) <
QUEUE_BLOCK_SIZE)
? details::const_numeric_max<size_t>::value
: ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) +
(QUEUE_BLOCK_SIZE - 1)) /
QUEUE_BLOCK_SIZE * QUEUE_BLOCK_SIZE);
#ifdef _MSC_VER
#pragma warning(pop)
#endif
static_assert(!std::numeric_limits<size_t>::is_signed &&
std::is_integral<size_t>::value,
"Traits::size_t must be an unsigned integral type");
static_assert(!std::numeric_limits<index_t>::is_signed &&
std::is_integral<index_t>::value,
"Traits::index_t must be an unsigned integral type");
static_assert(sizeof(index_t) >= sizeof(size_t),
"Traits::index_t must be at least as wide as Traits::size_t");
static_assert(
(QUEUE_BLOCK_SIZE > 1) && !(QUEUE_BLOCK_SIZE & (QUEUE_BLOCK_SIZE - 1)),
"Traits::QUEUE_BLOCK_SIZE must be a power of 2 (and at least 2)");
static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) &&
!(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD &
(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)),
"Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a "
"power of 2 (and greater than 1)");
static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) &&
!(EXPLICIT_INITIAL_INDEX_SIZE &
(EXPLICIT_INITIAL_INDEX_SIZE - 1)),
"Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and "
"greater than 1)");
static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) &&
!(IMPLICIT_INITIAL_INDEX_SIZE &
(IMPLICIT_INITIAL_INDEX_SIZE - 1)),
"Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and "
"greater than 1)");
static_assert(
(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) ||
!(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE &
(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)),
"Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 ||
INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1,
"Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least "
"1 (or 0 to disable implicit enqueueing)");
public:
// Creates a queue with at least `capacity` element slots; note that the
// actual number of elements that can be inserted without additional memory
// allocation depends on the number of producers and the block size (e.g. if
// the block size is equal to `capacity`, only a single block will be
// allocated up-front, which means only a single producer will be able to
// enqueue elements without an extra allocation -- blocks aren't shared
// between producers). This method is not thread safe -- it is up to the user
// to ensure that the queue is fully constructed before it starts being used
// by other threads (this includes making the memory effects of construction
// visible, possibly with a memory barrier).
explicit ConcurrentQueue(size_t capacity = 32 * QUEUE_BLOCK_SIZE)
: producerListTail(nullptr),
producerCount(0),
initialBlockPoolIndex(0),
nextExplicitConsumerId(0),
globalExplicitConsumerOffset(0) {
implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
populate_initial_implicit_producer_hash();
populate_initial_block_list(
capacity / QUEUE_BLOCK_SIZE +
((capacity & (QUEUE_BLOCK_SIZE - 1)) == 0 ? 0 : 1));
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
// Track all the producers using a fully-resolved typed list for
// each kind; this makes it possible to debug them starting from
// the root queue object (otherwise wacky casts are needed that
// don't compile in the debugger's expression evaluator).
explicitProducers.store(nullptr, std::memory_order_relaxed);
implicitProducers.store(nullptr, std::memory_order_relaxed);
#endif
}
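// A commented sketch of the capacity constructor (the value is illustrative):
// pre-allocate enough blocks for roughly 6 * QUEUE_BLOCK_SIZE elements:
//
//   ConcurrentQueue<int> q(6 * ConcurrentQueue<int>::QUEUE_BLOCK_SIZE);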
// Computes the correct number of pre-allocated blocks for you based
// on the minimum number of elements you want available at any given
// time, and the maximum concurrent number of each type of producer.
ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers,
size_t maxImplicitProducers)
: producerListTail(nullptr),
producerCount(0),
initialBlockPoolIndex(0),
nextExplicitConsumerId(0),
globalExplicitConsumerOffset(0) {
implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
populate_initial_implicit_producer_hash();
size_t blocks =
(((minCapacity + QUEUE_BLOCK_SIZE - 1) / QUEUE_BLOCK_SIZE) - 1) *
(maxExplicitProducers + 1) +
2 * (maxExplicitProducers + maxImplicitProducers);
populate_initial_block_list(blocks);
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
explicitProducers.store(nullptr, std::memory_order_relaxed);
implicitProducers.store(nullptr, std::memory_order_relaxed);
#endif
}
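// A commented sketch of this sizing constructor (all values illustrative):
// pre-allocate blocks for at least 4096 elements shared among up to 8
// explicit and 32 implicit producers:
//
//   ConcurrentQueue<int> q(4096, 8, 32);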
// Note: The queue should not be accessed concurrently while it's
// being deleted. It's up to the user to synchronize this.
// This method is not thread safe.
~ConcurrentQueue() {
// Destroy producers
auto ptr = producerListTail.load(std::memory_order_relaxed);
while (ptr != nullptr) {
auto next = ptr->next_prod();
if (ptr->token != nullptr) {
ptr->token->producer = nullptr;
}
destroy(ptr);
ptr = next;
}
// Destroy implicit producer hash tables
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
auto hash = implicitProducerHash.load(std::memory_order_relaxed);
while (hash != nullptr) {
auto prev = hash->prev;
if (prev != nullptr) { // The last hash is part of this object and was
// not allocated dynamically
for (size_t i = 0; i != hash->capacity; ++i) {
hash->entries[i].~ImplicitProducerKVP();
}
hash->~ImplicitProducerHash();
(Traits::free)(hash);
}
hash = prev;
}
}
// Destroy global free list
auto block = freeList.head_unsafe();
while (block != nullptr) {
auto next = block->freeListNext.load(std::memory_order_relaxed);
if (block->dynamicallyAllocated) {
destroy(block);
}
block = next;
}
// Destroy initial free list
destroy_array(initialBlockPool, initialBlockPoolSize);
}
// Disable copying and copy assignment
ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
// Moving is supported, but note that it is *not* a thread-safe operation.
// Nobody can use the queue while it's being moved, and the memory effects
// of that move must be propagated to other threads before they can use it.
// Note: When a queue is moved, its tokens are still valid but can only be
// used with the destination queue (i.e. semantically they are moved along
// with the queue itself).
ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
: producerListTail(
other.producerListTail.load(std::memory_order_relaxed)),
producerCount(other.producerCount.load(std::memory_order_relaxed)),
initialBlockPoolIndex(
other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
initialBlockPool(other.initialBlockPool),
initialBlockPoolSize(other.initialBlockPoolSize),
freeList(std::move(other.freeList)),
nextExplicitConsumerId(
other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(
std::memory_order_relaxed)) {
// Move the other one into this, and leave the other one as an empty queue
implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
populate_initial_implicit_producer_hash();
swap_implicit_producer_hashes(other);
other.producerListTail.store(nullptr, std::memory_order_relaxed);
other.producerCount.store(0, std::memory_order_relaxed);
other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
explicitProducers.store(
other.explicitProducers.load(std::memory_order_relaxed),
std::memory_order_relaxed);
other.explicitProducers.store(nullptr, std::memory_order_relaxed);
implicitProducers.store(
other.implicitProducers.load(std::memory_order_relaxed),
std::memory_order_relaxed);
other.implicitProducers.store(nullptr, std::memory_order_relaxed);
#endif
other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);
other.initialBlockPoolSize = 0;
other.initialBlockPool = nullptr;
reown_producers();
}
inline ConcurrentQueue& operator=(ConcurrentQueue&& other)
MOODYCAMEL_NOEXCEPT {
return swap_internal(other);
}
// Swaps this queue's state with the other's. Not thread-safe.
// Swapping two queues does not invalidate their tokens, however
// the tokens that were created for one queue must be used with
// only the swapped queue (i.e. the tokens are tied to the
// queue's movable state, not the object itself).
inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT {
swap_internal(other);
}
private:
ConcurrentQueue& swap_internal(ConcurrentQueue& other) {
if (this == &other) {
return *this;
}
details::swap_relaxed(producerListTail, other.producerListTail);
details::swap_relaxed(producerCount, other.producerCount);
details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
std::swap(initialBlockPool, other.initialBlockPool);
std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
freeList.swap(other.freeList);
details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
details::swap_relaxed(globalExplicitConsumerOffset,
other.globalExplicitConsumerOffset);
swap_implicit_producer_hashes(other);
reown_producers();
other.reown_producers();
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
details::swap_relaxed(explicitProducers, other.explicitProducers);
details::swap_relaxed(implicitProducers, other.implicitProducers);
#endif
return *this;
}
public:
// Enqueues a single item (by copying it).
// Allocates memory if required. Only fails if memory allocation fails (or
// implicit production is disabled because
// Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(T const& item) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
return false;
else return inner_enqueue<CanAlloc>(item);
}
// Enqueues a single item (by moving it, if possible).
// Allocates memory if required. Only fails if memory allocation fails (or
// implicit production is disabled because
// Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(T&& item) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
return false;
else return inner_enqueue<CanAlloc>(std::move(item));
}
// Enqueues a single item (by copying it) using an explicit producer token.
// Allocates memory if required. Only fails if memory allocation fails (or
// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(producer_token_t const& token, T const& item) {
return inner_enqueue<CanAlloc>(token, item);
}
// Enqueues a single item (by moving it, if possible) using an explicit
// producer token. Allocates memory if required. Only fails if memory
// allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would
// be surpassed). Thread-safe.
inline bool enqueue(producer_token_t const& token, T&& item) {
return inner_enqueue<CanAlloc>(token, std::move(item));
}
// Enqueues several items.
// Allocates memory if required. Only fails if memory allocation fails (or
// implicit production is disabled because
// Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or
// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note:
// Use std::make_move_iterator if the elements should be moved instead of
// copied. Thread-safe.
template <typename It>
bool enqueue_bulk(It itemFirst, size_t count) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
return false;
else return inner_enqueue_bulk<CanAlloc>(itemFirst, count);
}
// Enqueues several items using an explicit producer token.
// Allocates memory if required. Only fails if memory allocation fails
// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Note: Use std::make_move_iterator if the elements should be moved
// instead of copied.
// Thread-safe.
template <typename It>
bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) {
return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);
}
// Enqueues a single item (by copying it).
// Does not allocate memory. Fails if not enough room to enqueue (or implicit
// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
// is 0).
// Thread-safe.
inline bool try_enqueue(T const& item) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
return false;
else return inner_enqueue<CannotAlloc>(item);
}
// Enqueues a single item (by moving it, if possible).
// Does not allocate memory (except for one-time implicit producer).
// Fails if not enough room to enqueue (or implicit production is
// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
// Thread-safe.
inline bool try_enqueue(T&& item) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
return false;
else return inner_enqueue<CannotAlloc>(std::move(item));
}
// Enqueues a single item (by copying it) using an explicit producer token.
// Does not allocate memory. Fails if not enough room to enqueue.
// Thread-safe.
inline bool try_enqueue(producer_token_t const& token, T const& item) {
return inner_enqueue<CannotAlloc>(token, item);
}
// Enqueues a single item (by moving it, if possible) using an explicit
// producer token. Does not allocate memory. Fails if not enough room to
// enqueue. Thread-safe.
inline bool try_enqueue(producer_token_t const& token, T&& item) {
return inner_enqueue<CannotAlloc>(token, std::move(item));
}
// Enqueues several items.
// Does not allocate memory (except for one-time implicit producer).
// Fails if not enough room to enqueue (or implicit production is
// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
// Note: Use std::make_move_iterator if the elements should be moved
// instead of copied.
// Thread-safe.
template <typename It>
bool try_enqueue_bulk(It itemFirst, size_t count) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0)
return false;
else return inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
}
// Enqueues several items using an explicit producer token.
// Does not allocate memory. Fails if not enough room to enqueue.
// Note: Use std::make_move_iterator if the elements should be moved
// instead of copied.
// Thread-safe.
template <typename It>
bool try_enqueue_bulk(producer_token_t const& token, It itemFirst,
size_t count) {
return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);
}
// Attempts to dequeue from the queue.
// Returns false if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template <typename U>
bool try_dequeue(U& item) {
// Instead of simply trying each producer in turn (which could cause
// needless contention on the first producer), we score them heuristically.
size_t nonEmptyCount = 0;
ProducerBase* best = nullptr;
size_t bestSize = 0;
for (auto ptr = producerListTail.load(std::memory_order_acquire);
nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) {
auto size = ptr->size_approx();
if (size > 0) {
if (size > bestSize) {
bestSize = size;
best = ptr;
}
++nonEmptyCount;
}
}
// If there was at least one non-empty queue but it appears empty at the
// time we try to dequeue from it, we need to make sure every queue's been
// tried
if (nonEmptyCount > 0) {
if ((details::likely)(best->dequeue(item))) {
return true;
}
for (auto ptr = producerListTail.load(std::memory_order_acquire);
ptr != nullptr; ptr = ptr->next_prod()) {
if (ptr != best && ptr->dequeue(item)) {
return true;
}
}
}
return false;
}
// Attempts to dequeue from the queue.
// Returns false if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// This differs from the try_dequeue(item) method in that this one does
// not attempt to reduce contention by interleaving the order that producer
// streams are dequeued from. So, using this method can reduce overall
// throughput under contention, but will give more predictable results in
// single-threaded consumer scenarios. This is mostly only useful for internal
// unit tests. Never allocates. Thread-safe.
template <typename U>
bool try_dequeue_non_interleaved(U& item) {
for (auto ptr = producerListTail.load(std::memory_order_acquire);
ptr != nullptr; ptr = ptr->next_prod()) {
if (ptr->dequeue(item)) {
return true;
}
}
return false;
}
// Attempts to dequeue from the queue using an explicit consumer token.
// Returns false if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template <typename U>
bool try_dequeue(consumer_token_t& token, U& item) {
// The idea is roughly as follows:
// Every 256 items from one producer, make everyone rotate (increase the
// global offset); this means the highest-efficiency consumer dictates the
// rotation speed of everyone else, more or less.
// If you see that the global offset has changed, you must reset your
// consumption counter and move to your designated place.
// If there are no items where you're supposed to be, keep moving until you
// find a producer with some items.
// If the global offset has not changed but you've run out of items to
// consume, move over from your current position until you find a producer
// with something in it.
if (token.desiredProducer == nullptr ||
token.lastKnownGlobalOffset !=
globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
if (!update_current_producer_after_rotation(token)) {
return false;
}
}
// If there was at least one non-empty queue but it appears empty at the
// time we try to dequeue from it, we need to make sure every queue's been
// tried
if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
if (++token.itemsConsumedFromCurrent ==
EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
}
return true;
}
auto tail = producerListTail.load(std::memory_order_acquire);
auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
if (ptr == nullptr) {
ptr = tail;
}
while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
if (ptr->dequeue(item)) {
token.currentProducer = ptr;
token.itemsConsumedFromCurrent = 1;
return true;
}
ptr = ptr->next_prod();
if (ptr == nullptr) {
ptr = tail;
}
}
return false;
}
// Attempts to dequeue several elements from the queue.
// Returns the number of items actually dequeued.
// Returns 0 if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template <typename It>
size_t try_dequeue_bulk(It itemFirst, size_t max) {
size_t count = 0;
for (auto ptr = producerListTail.load(std::memory_order_acquire);
ptr != nullptr; ptr = ptr->next_prod()) {
count += ptr->dequeue_bulk(itemFirst, max - count);
if (count == max) {
break;
}
}
return count;
}
// Attempts to dequeue several elements from the queue using an explicit
// consumer token. Returns the number of items actually dequeued. Returns 0 if
// all producer streams appeared empty at the time they were checked (so, the
// queue is likely but not guaranteed to be empty). Never allocates.
// Thread-safe.
template <typename It>
size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) {
if (token.desiredProducer == nullptr ||
token.lastKnownGlobalOffset !=
globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
if (!update_current_producer_after_rotation(token)) {
return 0;
}
}
size_t count = static_cast<ProducerBase*>(token.currentProducer)
->dequeue_bulk(itemFirst, max);
if (count == max) {
if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >=
EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
}
return max;
}
token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
max -= count;
auto tail = producerListTail.load(std::memory_order_acquire);
auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
if (ptr == nullptr) {
ptr = tail;
}
while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
auto dequeued = ptr->dequeue_bulk(itemFirst, max);
count += dequeued;
if (dequeued != 0) {
token.currentProducer = ptr;
token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
}
if (dequeued == max) {
break;
}
max -= dequeued;
ptr = ptr->next_prod();
if (ptr == nullptr) {
ptr = tail;
}
}
return count;
}
// Attempts to dequeue from a specific producer's inner queue.
// If you happen to know which producer you want to dequeue from, this
// is significantly faster than using the general-case try_dequeue methods.
// Returns false if the producer's queue appeared empty at the time it
// was checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template <typename U>
inline bool try_dequeue_from_producer(producer_token_t const& producer,
U& item) {
return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);
}
// Attempts to dequeue several elements from a specific producer's inner
// queue. Returns the number of items actually dequeued. If you happen to know
// which producer you want to dequeue from, this is significantly faster than
// using the general-case try_dequeue methods. Returns 0 if the producer's
// queue appeared empty at the time it was checked (so, the queue is likely
// but not guaranteed to be empty). Never allocates. Thread-safe.
template <typename It>
inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer,
It itemFirst, size_t max) {
return static_cast<ExplicitProducer*>(producer.producer)
->dequeue_bulk(itemFirst, max);
}
// Returns an estimate of the total number of elements currently in the queue.
// This estimate is only accurate if the queue has completely stabilized
// before it is called (i.e. all enqueue and dequeue operations have completed
// and their memory effects are visible on the calling thread, and no further
// operations start while this method is being called). Thread-safe.
size_t size_approx() const {
size_t size = 0;
for (auto ptr = producerListTail.load(std::memory_order_acquire);
ptr != nullptr; ptr = ptr->next_prod()) {
size += ptr->size_approx();
}
return size;
}
// Returns true if the underlying atomic variables used by
// the queue are lock-free (they should be on most platforms).
// Thread-safe.
static constexpr bool is_lock_free() {
return details::static_is_lock_free<bool>::value == 2 &&
details::static_is_lock_free<size_t>::value == 2 &&
details::static_is_lock_free<std::uint32_t>::value == 2 &&
details::static_is_lock_free<index_t>::value == 2 &&
details::static_is_lock_free<void*>::value == 2 &&
details::static_is_lock_free<typename details::thread_id_converter<
details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
}
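// Since is_lock_free() is constexpr, it can be checked at compile time if
// desired, e.g.:
//
//   static_assert(ConcurrentQueue<int>::is_lock_free(),
//                 "expected a lock-free queue on this platform");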
private:
friend struct ProducerToken;
friend struct ConsumerToken;
struct ExplicitProducer;
friend struct ExplicitProducer;
struct ImplicitProducer;
friend struct ImplicitProducer;
friend class ConcurrentQueueTests;
enum AllocationMode { CanAlloc, CannotAlloc };
///////////////////////////////
// Queue methods
///////////////////////////////
template <AllocationMode canAlloc, typename U>
inline bool inner_enqueue(producer_token_t const& token, U&& element) {
return static_cast<ExplicitProducer*>(token.producer)
->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(
std::forward<U>(element));
}
template <AllocationMode canAlloc, typename U>
inline bool inner_enqueue(U&& element) {
auto producer = get_or_add_implicit_producer();
return producer == nullptr
? false
: producer->ConcurrentQueue::ImplicitProducer::template enqueue<
canAlloc>(std::forward<U>(element));
}
template <AllocationMode canAlloc, typename It>
inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst,
size_t count) {
return static_cast<ExplicitProducer*>(token.producer)
->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(
itemFirst, count);
}
template <AllocationMode canAlloc, typename It>
inline bool inner_enqueue_bulk(It itemFirst, size_t count) {
auto producer = get_or_add_implicit_producer();
return producer == nullptr
? false
: producer->ConcurrentQueue::ImplicitProducer::
template enqueue_bulk<canAlloc>(itemFirst, count);
}
inline bool update_current_producer_after_rotation(consumer_token_t& token) {
// Ah, there's been a rotation, figure out where we should be!
auto tail = producerListTail.load(std::memory_order_acquire);
if (token.desiredProducer == nullptr && tail == nullptr) {
return false;
}
auto prodCount = producerCount.load(std::memory_order_relaxed);
auto globalOffset =
globalExplicitConsumerOffset.load(std::memory_order_relaxed);
if ((details::unlikely)(token.desiredProducer == nullptr)) {
// Aha, first time we're dequeueing anything.
// Figure out our local position
// Note: offset is from start, not end, but we're traversing from end --
// subtract from count first
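// For example (illustrative numbers): with prodCount == 3 and
// initialOffset == 4, offset == 3 - 1 - (4 % 3) == 1, so we take one
// next_prod() step starting from the tail of the producer list.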
std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount);
token.desiredProducer = tail;
for (std::uint32_t i = 0; i != offset; ++i) {
token.desiredProducer =
static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
if (token.desiredProducer == nullptr) {
token.desiredProducer = tail;
}
}
}
std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset;
if (delta >= prodCount) {
delta = delta % prodCount;
}
for (std::uint32_t i = 0; i != delta; ++i) {
token.desiredProducer =
static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
if (token.desiredProducer == nullptr) {
token.desiredProducer = tail;
}
}
token.lastKnownGlobalOffset = globalOffset;
token.currentProducer = token.desiredProducer;
token.itemsConsumedFromCurrent = 0;
return true;
}
///////////////////////////
// Free list
///////////////////////////
template <typename N>
struct FreeListNode {
FreeListNode() : freeListRefs(0), freeListNext(nullptr) {}
std::atomic<std::uint32_t> freeListRefs;
std::atomic<N*> freeListNext;
};
// A simple CAS-based lock-free free list. Not the fastest thing in the world
// under heavy contention, but simple and correct (assuming nodes are never
// freed until after the free list is destroyed), and fairly speedy under low
// contention.
template <typename N> // N must inherit FreeListNode or have the same fields
// (and initialization of them)
struct FreeList {
FreeList() : freeListHead(nullptr) {}
FreeList(FreeList&& other)
: freeListHead(other.freeListHead.load(std::memory_order_relaxed)) {
other.freeListHead.store(nullptr, std::memory_order_relaxed);
}
void swap(FreeList& other) {
details::swap_relaxed(freeListHead, other.freeListHead);
}
FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
inline void add(N* node) {
#ifdef MCDBGQ_NOLOCKFREE_FREELIST
debug::DebugLock lock(mutex);
#endif
// We know that the should-be-on-freelist bit is 0 at this point, so it's
// safe to set it using a fetch_add
if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST,
std::memory_order_acq_rel) == 0) {
// Oh look! We were the last ones referencing this node, and we know
// we want to add it to the free list, so let's do it!
add_knowing_refcount_is_zero(node);
}
}
inline N* try_get() {
#ifdef MCDBGQ_NOLOCKFREE_FREELIST
debug::DebugLock lock(mutex);
#endif
auto head = freeListHead.load(std::memory_order_acquire);
while (head != nullptr) {
auto prevHead = head;
auto refs = head->freeListRefs.load(std::memory_order_relaxed);
if ((refs & REFS_MASK) == 0 ||
!head->freeListRefs.compare_exchange_strong(
refs, refs + 1, std::memory_order_acquire,
std::memory_order_relaxed)) {
head = freeListHead.load(std::memory_order_acquire);
continue;
}
// Good, reference count has been incremented (it wasn't at zero), which
// means we can read the next and not worry about it changing between
// now and the time we do the CAS
auto next = head->freeListNext.load(std::memory_order_relaxed);
if (freeListHead.compare_exchange_strong(head, next,
std::memory_order_acquire,
std::memory_order_relaxed)) {
// Yay, got the node. This means it was on the list, which means
// shouldBeOnFreeList must be false no matter the refcount (because
// nobody else knows it's been taken off yet, it can't have been put
// back on).
assert((head->freeListRefs.load(std::memory_order_relaxed) &
SHOULD_BE_ON_FREELIST) == 0);
// Decrease refcount twice, once for our ref, and once for the list's
// ref
head->freeListRefs.fetch_sub(2, std::memory_order_release);
return head;
}
// OK, the head must have changed on us, but we still need to decrease
// the refcount we increased. Note that we don't need to release any
// memory effects, but we do need to ensure that the reference count
// decrement happens-after the CAS on the head.
refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
if (refs == SHOULD_BE_ON_FREELIST + 1) {
add_knowing_refcount_is_zero(prevHead);
}
}
return nullptr;
}
// Useful for traversing the list when there's no contention (e.g. to
// destroy remaining nodes)
N* head_unsafe() const {
return freeListHead.load(std::memory_order_relaxed);
}
private:
inline void add_knowing_refcount_is_zero(N* node) {
// Since the refcount is zero, and nobody can increase it once it's zero
// (except us, and we run only one copy of this method per node at a time,
// i.e. the single thread case), then we know we can safely change the
// next pointer of the node; however, once the refcount is back above
// zero, then other threads could increase it (happens under heavy
// contention, when the refcount goes to zero in between a load and a
// refcount increment of a node in try_get, then back up to something
// non-zero, then the refcount increment is done by the other thread) --
// so, if the CAS to add the node to the actual list fails, decrease the
// refcount and leave the add operation to the next thread who puts the
// refcount back at zero (which could be us, hence the loop).
auto head = freeListHead.load(std::memory_order_relaxed);
while (true) {
node->freeListNext.store(head, std::memory_order_relaxed);
node->freeListRefs.store(1, std::memory_order_release);
if (!freeListHead.compare_exchange_strong(head, node,
std::memory_order_release,
std::memory_order_relaxed)) {
// Hmm, the add failed, but we can only try again when the refcount
// goes back to zero
if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1,
std::memory_order_release) == 1) {
continue;
}
}
return;
}
}
private:
// Implemented like a stack, but where node order doesn't matter (nodes are
// inserted out of order under contention)
std::atomic<N*> freeListHead;
static const std::uint32_t REFS_MASK = 0x7FFFFFFF;
static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;
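// Encoding note: the low 31 bits of freeListRefs (REFS_MASK) hold the node's
// reference count, while the high bit (SHOULD_BE_ON_FREELIST) records that
// the node should be (re-)added to the free list once that count reaches
// zero; add() and add_knowing_refcount_is_zero() above rely on this split.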
#ifdef MCDBGQ_NOLOCKFREE_FREELIST
debug::DebugMutex mutex;
#endif
};
///////////////////////////
// Block
///////////////////////////
enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };
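// A Block tracks slot consumption in one of two ways, selected at compile
// time: in the explicit context with small blocks (QUEUE_BLOCK_SIZE <=
// EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) it keeps one atomic flag per slot
// (emptyFlags); otherwise it keeps a single atomic counter
// (elementsCompletelyDequeued). The is_empty/set_empty/set_many_empty/
// set_all_empty/reset_empty members below dispatch between the two.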
struct Block {
Block()
: next(nullptr),
elementsCompletelyDequeued(0),
freeListRefs(0),
freeListNext(nullptr),
dynamicallyAllocated(true) {
#ifdef MCDBGQ_TRACKMEM
owner = nullptr;
#endif
}
template <InnerQueueContext context>
inline bool is_empty() const {
MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
QUEUE_BLOCK_SIZE <=
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
// Check flags
for (size_t i = 0; i < QUEUE_BLOCK_SIZE; ++i) {
if (!emptyFlags[i].load(std::memory_order_relaxed)) {
return false;
}
}
// Aha, empty; make sure we have all other memory effects that happened
// before the empty flags were set
std::atomic_thread_fence(std::memory_order_acquire);
return true;
}
else {
// Check counter
if (elementsCompletelyDequeued.load(std::memory_order_relaxed) ==
QUEUE_BLOCK_SIZE) {
std::atomic_thread_fence(std::memory_order_acquire);
return true;
}
assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <=
QUEUE_BLOCK_SIZE);
return false;
}
}
// Returns true if the block is now empty (does not apply in explicit
// context)
template <InnerQueueContext context>
inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) {
MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
QUEUE_BLOCK_SIZE <=
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
// Set flag
assert(!emptyFlags[QUEUE_BLOCK_SIZE - 1 -
static_cast<size_t>(
i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1))]
.load(std::memory_order_relaxed));
emptyFlags[QUEUE_BLOCK_SIZE - 1 -
static_cast<size_t>(
i & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1))]
.store(true, std::memory_order_release);
return false;
}
else {
// Increment counter
auto prevVal =
elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
assert(prevVal < QUEUE_BLOCK_SIZE);
return prevVal == QUEUE_BLOCK_SIZE - 1;
}
}
// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping
// and count > 0). Returns true if the block is now empty (does not apply in
// explicit context).
template <InnerQueueContext context>
inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i,
size_t count) {
MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
QUEUE_BLOCK_SIZE <=
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
// Set flags
std::atomic_thread_fence(std::memory_order_release);
i = QUEUE_BLOCK_SIZE - 1 -
static_cast<size_t>(i &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) -
count + 1;
for (size_t j = 0; j != count; ++j) {
assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
emptyFlags[i + j].store(true, std::memory_order_relaxed);
}
return false;
}
else {
// Increment counter
auto prevVal = elementsCompletelyDequeued.fetch_add(
count, std::memory_order_release);
assert(prevVal + count <= QUEUE_BLOCK_SIZE);
return prevVal + count == QUEUE_BLOCK_SIZE;
}
}
template <InnerQueueContext context>
inline void set_all_empty() {
MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
QUEUE_BLOCK_SIZE <=
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
// Set all flags
for (size_t i = 0; i != QUEUE_BLOCK_SIZE; ++i) {
emptyFlags[i].store(true, std::memory_order_relaxed);
}
}
else {
// Reset counter
elementsCompletelyDequeued.store(QUEUE_BLOCK_SIZE,
std::memory_order_relaxed);
}
}
template <InnerQueueContext context>
inline void reset_empty() {
MOODYCAMEL_CONSTEXPR_IF(context == explicit_context &&
QUEUE_BLOCK_SIZE <=
EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
// Reset flags
for (size_t i = 0; i != QUEUE_BLOCK_SIZE; ++i) {
emptyFlags[i].store(false, std::memory_order_relaxed);
}
}
else {
// Reset counter
elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
}
}
inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT {
return static_cast<T*>(static_cast<void*>(elements)) +
static_cast<size_t>(idx &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
}
inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT {
return static_cast<T const*>(static_cast<void const*>(elements)) +
static_cast<size_t>(idx &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
}
private:
static_assert(std::alignment_of<T>::value <= sizeof(T),
"The queue does not support types with an alignment greater "
"than their size at this time");
MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * QUEUE_BLOCK_SIZE], T)
elements;
public:
Block* next;
std::atomic<size_t> elementsCompletelyDequeued;
std::atomic<bool>
emptyFlags[QUEUE_BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD
? QUEUE_BLOCK_SIZE
: 1];
public:
std::atomic<std::uint32_t> freeListRefs;
std::atomic<Block*> freeListNext;
bool dynamicallyAllocated; // Perhaps a better name for this would be
// 'isNotPartOfInitialBlockPool'
#ifdef MCDBGQ_TRACKMEM
void* owner;
#endif
};
static_assert(std::alignment_of<Block>::value >= std::alignment_of<T>::value,
"Internal error: Blocks must be at least as aligned as the "
"type they are wrapping");
#ifdef MCDBGQ_TRACKMEM
public:
struct MemStats;
private:
#endif
///////////////////////////
// Producer base
///////////////////////////
struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase {
ProducerBase(ConcurrentQueue* parent_, bool isExplicit_)
: tailIndex(0),
headIndex(0),
dequeueOptimisticCount(0),
dequeueOvercommit(0),
tailBlock(nullptr),
isExplicit(isExplicit_),
parent(parent_) {}
virtual ~ProducerBase() {}
template <typename U>
inline bool dequeue(U& element) {
if (isExplicit) {
return static_cast<ExplicitProducer*>(this)->dequeue(element);
}
else {
return static_cast<ImplicitProducer*>(this)->dequeue(element);
}
}
template <typename It>
inline size_t dequeue_bulk(It& itemFirst, size_t max) {
if (isExplicit) {
return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst,
max);
}
else {
return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst,
max);
}
}
inline ProducerBase* next_prod() const {
return static_cast<ProducerBase*>(next);
}
inline size_t size_approx() const {
auto tail = tailIndex.load(std::memory_order_relaxed);
auto head = headIndex.load(std::memory_order_relaxed);
return details::circular_less_than(head, tail)
? static_cast<size_t>(tail - head)
: 0;
}
inline index_t getTail() const {
return tailIndex.load(std::memory_order_relaxed);
}
protected:
std::atomic<index_t> tailIndex; // Where to enqueue to next
std::atomic<index_t> headIndex; // Where to dequeue from next
std::atomic<index_t> dequeueOptimisticCount;
std::atomic<index_t> dequeueOvercommit;
Block* tailBlock;
public:
bool isExplicit;
ConcurrentQueue* parent;
protected:
#ifdef MCDBGQ_TRACKMEM
friend struct MemStats;
#endif
};
///////////////////////////
// Explicit queue
///////////////////////////
struct ExplicitProducer : public ProducerBase {
explicit ExplicitProducer(ConcurrentQueue* parent_)
: ProducerBase(parent_, true),
blockIndex(nullptr),
pr_blockIndexSlotsUsed(0),
pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1),
pr_blockIndexFront(0),
pr_blockIndexEntries(nullptr),
pr_blockIndexRaw(nullptr) {
size_t poolBasedIndexSize =
details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1;
if (poolBasedIndexSize > pr_blockIndexSize) {
pr_blockIndexSize = poolBasedIndexSize;
}
new_block_index(0); // This creates an index with double the number of
// current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
}
~ExplicitProducer() {
// Destruct any elements not yet dequeued.
// Since we're in the destructor, we can assume all elements
// are either completely dequeued or completely not (no halfways).
if (this->tailBlock !=
nullptr) { // Note this means there must be a block index too
// First find the block that's partially dequeued, if any
Block* halfDequeuedBlock = nullptr;
if ((this->headIndex.load(std::memory_order_relaxed) &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0) {
// The head's not on a block boundary, meaning a block somewhere is
// partially dequeued (or the head block is the tail block and was
// fully dequeued, but the head/tail are still not on a boundary)
size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) &
(pr_blockIndexSize - 1);
while (details::circular_less_than<index_t>(
pr_blockIndexEntries[i].base + QUEUE_BLOCK_SIZE,
this->headIndex.load(std::memory_order_relaxed))) {
i = (i + 1) & (pr_blockIndexSize - 1);
}
assert(details::circular_less_than<index_t>(
pr_blockIndexEntries[i].base,
this->headIndex.load(std::memory_order_relaxed)));
halfDequeuedBlock = pr_blockIndexEntries[i].block;
}
// Start at the head block (note the first line in the loop gives us the
// head from the tail on the first iteration)
auto block = this->tailBlock;
do {
block = block->next;
if (block->ConcurrentQueue::Block::template is_empty<
explicit_context>()) {
continue;
}
size_t i = 0; // Offset into block
if (block == halfDequeuedBlock) {
i = static_cast<size_t>(
this->headIndex.load(std::memory_order_relaxed) &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
}
// Walk through all the items in the block; if this is the tail block,
// we need to stop when we reach the tail index
auto lastValidIndex =
(this->tailIndex.load(std::memory_order_relaxed) &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0
? QUEUE_BLOCK_SIZE
: static_cast<size_t>(
this->tailIndex.load(std::memory_order_relaxed) &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
while (i != QUEUE_BLOCK_SIZE &&
(block != this->tailBlock || i != lastValidIndex)) {
(*block)[i++]->~T();
}
} while (block != this->tailBlock);
}
// Destroy all blocks that we own
if (this->tailBlock != nullptr) {
auto block = this->tailBlock;
do {
auto nextBlock = block->next;
this->parent->add_block_to_free_list(block);
block = nextBlock;
} while (block != this->tailBlock);
}
// Destroy the block indices
auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
while (header != nullptr) {
auto prev = static_cast<BlockIndexHeader*>(header->prev);
header->~BlockIndexHeader();
(Traits::free)(header);
header = prev;
}
}
template <AllocationMode allocMode, typename U>
inline bool enqueue(U&& element) {
index_t currentTailIndex =
this->tailIndex.load(std::memory_order_relaxed);
index_t newTailIndex = 1 + currentTailIndex;
if ((currentTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) ==
0) {
// We reached the end of a block, start a new one
auto startBlock = this->tailBlock;
auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
if (this->tailBlock != nullptr &&
this->tailBlock->next->ConcurrentQueue::Block::template is_empty<
explicit_context>()) {
// We can re-use the block ahead of us, it's empty!
this->tailBlock = this->tailBlock->next;
this->tailBlock->ConcurrentQueue::Block::template reset_empty<
explicit_context>();
// We'll put the block on the block index (guaranteed there's room since
// we're conceptually removing the last block from it first -- except
// instead of removing then adding, we can just overwrite). Note that
// there must be a valid block index here, since even if allocation
// failed in the ctor, it would have been re-attempted when adding the
// first block to the queue; since there is such a block, a block
// index must have been successfully allocated.
}
else {
// Whatever head value we see here is >= the last value we saw here
// (relatively), and <= its current value. Since we have the most
// recent tail, the head must be <= it.
auto head = this->headIndex.load(std::memory_order_relaxed);
assert(!details::circular_less_than<index_t>(currentTailIndex, head));
if (!details::circular_less_than<index_t>(
head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
(MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
(MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
currentTailIndex - head))) {
// We can't enqueue in another block because there's not enough
// leeway -- the tail could surpass the head by the time the block
// fills up! (Or we'll exceed the size limit, if the second part of
// the condition was true.)
return false;
}
// We're going to need a new block; check that the block index has
// room
if (pr_blockIndexRaw == nullptr ||
pr_blockIndexSlotsUsed == pr_blockIndexSize) {
// Hmm, the circular block index is already full -- we'll need
// to allocate a new index. Note pr_blockIndexRaw can only be
// nullptr if the initial allocation failed in the constructor.
MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; }
else if (!new_block_index(pr_blockIndexSlotsUsed)) {
return false;
}
}
// Insert a new block in the circular linked list
auto newBlock =
this->parent
->ConcurrentQueue::template requisition_block<allocMode>();
if (newBlock == nullptr) {
return false;
}
#ifdef MCDBGQ_TRACKMEM
newBlock->owner = this;
#endif
newBlock->ConcurrentQueue::Block::template reset_empty<
explicit_context>();
if (this->tailBlock == nullptr) {
newBlock->next = newBlock;
}
else {
newBlock->next = this->tailBlock->next;
this->tailBlock->next = newBlock;
}
this->tailBlock = newBlock;
++pr_blockIndexSlotsUsed;
}
MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
// The constructor may throw. We want the element not to appear in the
// queue in that case (without corrupting the queue):
MOODYCAMEL_TRY {
new ((*this->tailBlock)[currentTailIndex])
T(std::forward<U>(element));
}
MOODYCAMEL_CATCH(...) {
// Revert change to the current block, but leave the new block
// available for next time
pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
this->tailBlock =
startBlock == nullptr ? this->tailBlock : startBlock;
MOODYCAMEL_RETHROW;
}
}
else {
(void)startBlock;
(void)originalBlockIndexSlotsUsed;
}
// Add block to block index
auto& entry = blockIndex.load(std::memory_order_relaxed)
->entries[pr_blockIndexFront];
entry.base = currentTailIndex;
entry.block = this->tailBlock;
blockIndex.load(std::memory_order_relaxed)
->front.store(pr_blockIndexFront, std::memory_order_release);
pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
this->tailIndex.store(newTailIndex, std::memory_order_release);
return true;
}
}
// Enqueue
new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
this->tailIndex.store(newTailIndex, std::memory_order_release);
return true;
}
template <typename U>
bool dequeue(U& element) {
auto tail = this->tailIndex.load(std::memory_order_relaxed);
auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
if (details::circular_less_than<index_t>(
this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
overcommit,
tail)) {
// Might be something to dequeue, let's give it a try
// Note that this if is purely for performance purposes in the common
// case when the queue is empty and the values are eventually consistent
// -- we may enter here spuriously.
// Note that whatever the values of overcommit and tail are, they are
// not going to change (unless we change them) and must be the same
// value at this point (inside the if) as when the if condition was
// evaluated.
// We insert an acquire fence here to synchronize-with the release upon
// incrementing dequeueOvercommit below. This ensures that whatever value
// got loaded into overcommit, the load of dequeueOptimisticCount in the
// fetch_add below will result in a value at least as recent as
// that (and therefore at least as large). Note that I believe a
// compiler (signal) fence here would be sufficient due to the nature of
// fetch_add (all read-modify-write operations are guaranteed to work on
// the latest value in the modification order), but unfortunately that
// can't be shown to be correct using only the C++11 standard. See
// http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
std::atomic_thread_fence(std::memory_order_acquire);
// Increment optimistic counter, then check if it went over the boundary
auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
1, std::memory_order_relaxed);
// Note that since dequeueOvercommit must be <= dequeueOptimisticCount
// (because dequeueOvercommit is only ever incremented after
// dequeueOptimisticCount -- this is enforced in the `else` block
// below), and since we now have a version of dequeueOptimisticCount
// that is at least as recent as overcommit (due to the release upon
// incrementing dequeueOvercommit and the acquire above that
// synchronizes with it), overcommit <= myDequeueCount. However, we
// can't assert this since both dequeueOptimisticCount and
// dequeueOvercommit may (independently) overflow; in such a case,
// though, the logic still holds since the difference between the two is
// maintained.
// Note that we reload tail here in case it changed; it will be the same
// value as before or greater, since this load is sequenced after
// (happens after) the earlier load above. This is supported by
// read-read coherency (as defined in the standard), explained here:
// http://en.cppreference.com/w/cpp/atomic/memory_order
tail = this->tailIndex.load(std::memory_order_acquire);
if ((details::likely)(details::circular_less_than<index_t>(
myDequeueCount - overcommit, tail))) {
// Guaranteed to be at least one element to dequeue!
// Get the index. Note that since there's guaranteed to be at least
// one element, this will never exceed tail. We need to do an
// acquire-release fence here since it's possible that whatever
// condition got us to this point was for an earlier enqueued element
// (that we already see the memory effects for), but that by the time
// we increment it, somebody else has already incremented it, and we need
// to see the memory effects for *that* element, which in such a case is
// necessarily visible on the thread that incremented it in the first
// place with the more current condition (they must have acquired a
// tail that is at least as recent).
auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
// Determine which block the element is in
auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
auto localBlockIndexHead =
localBlockIndex->front.load(std::memory_order_acquire);
// We need to be careful here about subtracting and dividing because
// of index wrap-around. When an index wraps, we need to preserve the
// sign of the offset when dividing it by the block size (in order to
// get a correct signed block count offset in all cases):
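// For example (illustrative numbers, assuming a 32-bit index_t and
// QUEUE_BLOCK_SIZE == 32): if headBase == 0xFFFFFFE0 and index == 5,
// then blockBaseIndex == 0, the unsigned difference wraps to 32, and
// the signed division yields offset == +1 (one block past the head).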
auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
auto blockBaseIndex =
index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
auto offset = static_cast<size_t>(
static_cast<typename std::make_signed<index_t>::type>(
blockBaseIndex - headBase) /
static_cast<typename std::make_signed<index_t>::type>(
QUEUE_BLOCK_SIZE));
auto block = localBlockIndex
->entries[(localBlockIndexHead + offset) &
(localBlockIndex->size - 1)]
.block;
// Dequeue
auto& el = *((*block)[index]);
if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
// Make sure the element is still fully dequeued and destroyed even
// if the assignment throws
struct Guard {
Block* block;
index_t index;
~Guard() {
(*block)[index]->~T();
block->ConcurrentQueue::Block::template set_empty<
explicit_context>(index);
}
} guard = {block, index};
element = std::move(el); // NOLINT
}
else {
element = std::move(el); // NOLINT
el.~T(); // NOLINT
block->ConcurrentQueue::Block::template set_empty<explicit_context>(
index);
}
return true;
}
else {
// Wasn't anything to dequeue after all; make the effective dequeue
// count eventually consistent
this->dequeueOvercommit.fetch_add(
1, std::memory_order_release); // Release so that the fetch_add
// on dequeueOptimisticCount is
// guaranteed to happen before
// this write
}
}
return false;
}
template <AllocationMode allocMode, typename It>
bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) {
// First, we need to make sure we have enough room to enqueue all of the
// elements; this means pre-allocating blocks and putting them in the
// block index (but only if all the allocations succeeded).
index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
auto startBlock = this->tailBlock;
auto originalBlockIndexFront = pr_blockIndexFront;
auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
Block* firstAllocatedBlock = nullptr;
// Figure out how many blocks we'll need to allocate, and do so
size_t blockBaseDiff =
((startTailIndex + count - 1) &
~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) -
((startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
index_t currentTailIndex =
(startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
if (blockBaseDiff > 0) {
// Allocate as many blocks as possible from ahead
while (blockBaseDiff > 0 && this->tailBlock != nullptr &&
this->tailBlock->next != firstAllocatedBlock &&
this->tailBlock->next->ConcurrentQueue::Block::template is_empty<
explicit_context>()) {
blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE);
currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
this->tailBlock = this->tailBlock->next;
firstAllocatedBlock = firstAllocatedBlock == nullptr
? this->tailBlock
: firstAllocatedBlock;
auto& entry = blockIndex.load(std::memory_order_relaxed)
->entries[pr_blockIndexFront];
entry.base = currentTailIndex;
entry.block = this->tailBlock;
pr_blockIndexFront =
(pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
}
// Now allocate as many blocks as necessary from the block pool
while (blockBaseDiff > 0) {
blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE);
currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
auto head = this->headIndex.load(std::memory_order_relaxed);
assert(!details::circular_less_than<index_t>(currentTailIndex, head));
bool full =
!details::circular_less_than<index_t>(
head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
(MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
(MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
currentTailIndex - head));
if (pr_blockIndexRaw == nullptr ||
pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) {
// Failed to allocate, undo changes (but keep injected blocks)
pr_blockIndexFront = originalBlockIndexFront;
pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
this->tailBlock =
startBlock == nullptr ? firstAllocatedBlock : startBlock;
return false;
}
else if (full || !new_block_index(originalBlockIndexSlotsUsed)) {
// Failed to allocate, undo changes (but keep injected blocks)
pr_blockIndexFront = originalBlockIndexFront;
pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
this->tailBlock =
startBlock == nullptr ? firstAllocatedBlock : startBlock;
return false;
}
// pr_blockIndexFront is updated inside new_block_index, so we need
// to update our fallback value too (since we keep the new index
// even if we later fail)
originalBlockIndexFront = originalBlockIndexSlotsUsed;
}
// Insert a new block in the circular linked list
auto newBlock =
this->parent
->ConcurrentQueue::template requisition_block<allocMode>();
if (newBlock == nullptr) {
pr_blockIndexFront = originalBlockIndexFront;
pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
this->tailBlock =
startBlock == nullptr ? firstAllocatedBlock : startBlock;
return false;
}
#ifdef MCDBGQ_TRACKMEM
newBlock->owner = this;
#endif
newBlock->ConcurrentQueue::Block::template set_all_empty<
explicit_context>();
if (this->tailBlock == nullptr) {
newBlock->next = newBlock;
}
else {
newBlock->next = this->tailBlock->next;
this->tailBlock->next = newBlock;
}
this->tailBlock = newBlock;
firstAllocatedBlock = firstAllocatedBlock == nullptr
? this->tailBlock
: firstAllocatedBlock;
++pr_blockIndexSlotsUsed;
auto& entry = blockIndex.load(std::memory_order_relaxed)
->entries[pr_blockIndexFront];
entry.base = currentTailIndex;
entry.block = this->tailBlock;
pr_blockIndexFront =
(pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
}
// Excellent, all allocations succeeded. Reset each block's emptiness
// before we fill them up, and publish the new block index front
auto block = firstAllocatedBlock;
while (true) {
block->ConcurrentQueue::Block::template reset_empty<
explicit_context>();
if (block == this->tailBlock) {
break;
}
block = block->next;
}
MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
T, decltype(*itemFirst),
new (static_cast<T*>(nullptr))
T(details::deref_noexcept(itemFirst)))) {
blockIndex.load(std::memory_order_relaxed)
->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
std::memory_order_release);
}
}
// Enqueue, one block at a time
index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
currentTailIndex = startTailIndex;
auto endBlock = this->tailBlock;
this->tailBlock = startBlock;
assert((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) !=
0 ||
firstAllocatedBlock != nullptr || count == 0);
if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 &&
firstAllocatedBlock != nullptr) {
this->tailBlock = firstAllocatedBlock;
}
while (true) {
index_t stopIndex =
(currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
stopIndex = newTailIndex;
}
MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
T, decltype(*itemFirst),
new (static_cast<T*>(nullptr))
T(details::deref_noexcept(itemFirst)))) {
while (currentTailIndex != stopIndex) {
new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
}
}
else {
MOODYCAMEL_TRY {
while (currentTailIndex != stopIndex) {
// Must use copy constructor even if move constructor is available
// because we may have to revert if there's an exception.
// Sorry about the horrible templated next line, but it was the
// only way to disable moving *at compile time*, which is
// important because a type may only define a (noexcept) move
// constructor, and so calls to the cctor will not compile, even
// if they are in an if branch that will never be executed
new ((*this->tailBlock)[currentTailIndex]) T(
details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(
T, decltype(*itemFirst),
new (static_cast<T*>(nullptr)) T(details::deref_noexcept(
itemFirst)))>::eval(*itemFirst));
++currentTailIndex;
++itemFirst;
}
}
MOODYCAMEL_CATCH(...) {
// Oh dear, an exception's been thrown -- destroy the elements that
// were enqueued so far and revert the entire bulk operation (we'll
// keep any allocated blocks in our linked list for later, though).
auto constructedStopIndex = currentTailIndex;
auto lastBlockEnqueued = this->tailBlock;
pr_blockIndexFront = originalBlockIndexFront;
pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
this->tailBlock =
startBlock == nullptr ? firstAllocatedBlock : startBlock;
if (!details::is_trivially_destructible<T>::value) {
auto block = startBlock;
if ((startTailIndex &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) {
block = firstAllocatedBlock;
}
currentTailIndex = startTailIndex;
while (true) {
stopIndex = (currentTailIndex &
~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
if (details::circular_less_than<index_t>(constructedStopIndex,
stopIndex)) {
stopIndex = constructedStopIndex;
}
while (currentTailIndex != stopIndex) {
(*block)[currentTailIndex++]->~T();
}
if (block == lastBlockEnqueued) {
break;
}
block = block->next;
}
}
MOODYCAMEL_RETHROW;
}
}
if (this->tailBlock == endBlock) {
assert(currentTailIndex == newTailIndex);
break;
}
this->tailBlock = this->tailBlock->next;
}
MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
T, decltype(*itemFirst),
new (static_cast<T*>(nullptr))
T(details::deref_noexcept(itemFirst)))) {
if (firstAllocatedBlock != nullptr)
blockIndex.load(std::memory_order_relaxed)
->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1),
std::memory_order_release);
}
this->tailIndex.store(newTailIndex, std::memory_order_release);
return true;
}
template <typename It>
size_t dequeue_bulk(It& itemFirst, size_t max) {
auto tail = this->tailIndex.load(std::memory_order_relaxed);
auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
auto desiredCount = static_cast<size_t>(
tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
overcommit));
if (details::circular_less_than<size_t>(0, desiredCount)) {
desiredCount = desiredCount < max ? desiredCount : max;
std::atomic_thread_fence(std::memory_order_acquire);
auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
desiredCount, std::memory_order_relaxed);
tail = this->tailIndex.load(std::memory_order_acquire);
auto actualCount =
static_cast<size_t>(tail - (myDequeueCount - overcommit));
if (details::circular_less_than<size_t>(0, actualCount)) {
actualCount = desiredCount < actualCount ? desiredCount : actualCount;
if (actualCount < desiredCount) {
this->dequeueOvercommit.fetch_add(desiredCount - actualCount,
std::memory_order_release);
}
// Get the first index. Note that since there's guaranteed to be at
// least actualCount elements, this will never exceed tail.
auto firstIndex =
this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
// Determine which block the first element is in
auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
auto localBlockIndexHead =
localBlockIndex->front.load(std::memory_order_acquire);
auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
auto firstBlockBaseIndex =
firstIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
auto offset = static_cast<size_t>(
static_cast<typename std::make_signed<index_t>::type>(
firstBlockBaseIndex - headBase) /
static_cast<typename std::make_signed<index_t>::type>(
QUEUE_BLOCK_SIZE));
auto indexIndex =
(localBlockIndexHead + offset) & (localBlockIndex->size - 1);
// Iterate the blocks and dequeue
auto index = firstIndex;
do {
auto firstIndexInBlock = index;
index_t endIndex =
(index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
endIndex =
details::circular_less_than<index_t>(
firstIndex + static_cast<index_t>(actualCount), endIndex)
? firstIndex + static_cast<index_t>(actualCount)
: endIndex;
auto block = localBlockIndex->entries[indexIndex].block;
if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&,
details::deref_noexcept(itemFirst) =
std::move((*(*block)[index])))) {
while (index != endIndex) {
auto& el = *((*block)[index]);
*itemFirst++ = std::move(el);
el.~T();
++index;
}
}
else {
MOODYCAMEL_TRY {
while (index != endIndex) {
auto& el = *((*block)[index]);
*itemFirst = std::move(el);
++itemFirst;
el.~T();
++index;
}
}
MOODYCAMEL_CATCH(...) {
// It's too late to revert the dequeue, but we can make sure
// that all the dequeued objects are properly destroyed and the
// block index (and empty count) are properly updated before we
// propagate the exception
do {
block = localBlockIndex->entries[indexIndex].block;
while (index != endIndex) {
(*block)[index++]->~T();
}
block->ConcurrentQueue::Block::template set_many_empty<
explicit_context>(
firstIndexInBlock,
static_cast<size_t>(endIndex - firstIndexInBlock));
indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
firstIndexInBlock = index;
endIndex =
(index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
endIndex =
details::circular_less_than<index_t>(
firstIndex + static_cast<index_t>(actualCount),
endIndex)
? firstIndex + static_cast<index_t>(actualCount)
: endIndex;
} while (index != firstIndex + actualCount);
MOODYCAMEL_RETHROW;
}
}
block->ConcurrentQueue::Block::template set_many_empty<
explicit_context>(
firstIndexInBlock,
static_cast<size_t>(endIndex - firstIndexInBlock));
indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
} while (index != firstIndex + actualCount);
return actualCount;
}
else {
// Wasn't anything to dequeue after all; make the effective dequeue
// count eventually consistent
this->dequeueOvercommit.fetch_add(desiredCount,
std::memory_order_release);
}
}
return 0;
}
private:
struct BlockIndexEntry {
index_t base;
Block* block;
};
struct BlockIndexHeader {
size_t size;
std::atomic<size_t>
front;  // Current slot (not the next one, unlike pr_blockIndexFront)
BlockIndexEntry* entries;
void* prev;
};
bool new_block_index(size_t numberOfFilledSlotsToExpose) {
auto prevBlockSizeMask = pr_blockIndexSize - 1;
// Create the new block
pr_blockIndexSize <<= 1;
auto newRawPtr = static_cast<char*>((Traits::malloc)(
sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value -
1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
if (newRawPtr == nullptr) {
pr_blockIndexSize >>= 1; // Reset to allow graceful retry
return false;
}
auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(
details::align_for<BlockIndexEntry>(newRawPtr +
sizeof(BlockIndexHeader)));
// Copy in all the old indices, if any
size_t j = 0;
if (pr_blockIndexSlotsUsed != 0) {
auto i =
(pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;
do {
newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
i = (i + 1) & prevBlockSizeMask;
} while (i != pr_blockIndexFront);
}
// Update everything
auto header = new (newRawPtr) BlockIndexHeader;
header->size = pr_blockIndexSize;
header->front.store(numberOfFilledSlotsToExpose - 1,
std::memory_order_relaxed);
header->entries = newBlockIndexEntries;
header->prev = pr_blockIndexRaw; // we link the new block to the old one
// so we can free it later
pr_blockIndexFront = j;
pr_blockIndexEntries = newBlockIndexEntries;
pr_blockIndexRaw = newRawPtr;
blockIndex.store(header, std::memory_order_release);
return true;
}
private:
std::atomic<BlockIndexHeader*> blockIndex;
// To be used by the producer only -- consumers must use the ones referenced
// by blockIndex
size_t pr_blockIndexSlotsUsed;
size_t pr_blockIndexSize;
size_t pr_blockIndexFront; // Next slot (not current)
BlockIndexEntry* pr_blockIndexEntries;
void* pr_blockIndexRaw;
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
public:
ExplicitProducer* nextExplicitProducer;
private:
#endif
#ifdef MCDBGQ_TRACKMEM
friend struct MemStats;
#endif
};
//////////////////////////////////
// Implicit queue
//////////////////////////////////
struct ImplicitProducer : public ProducerBase {
ImplicitProducer(ConcurrentQueue* parent_)
: ProducerBase(parent_, false),
nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
blockIndex(nullptr) {
new_block_index();
}
~ImplicitProducer() {
// Note that since we're in the destructor we can assume that all
// enqueue/dequeue operations completed already; this means that all
// undequeued elements are placed contiguously across contiguous blocks,
// and that only the first and last remaining blocks can be partially
// empty (all other remaining blocks must be completely full).
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
// Unregister ourselves for thread termination notification
if (!this->inactive.load(std::memory_order_relaxed)) {
details::ThreadExitNotifier::unsubscribe(&threadExitListener);
}
#endif
// Destroy all remaining elements!
auto tail = this->tailIndex.load(std::memory_order_relaxed);
auto index = this->headIndex.load(std::memory_order_relaxed);
Block* block = nullptr;
assert(index == tail || details::circular_less_than(index, tail));
bool forceFreeLastBlock =
index != tail; // If we enter the loop, then the last (tail) block
// will not be freed
while (index != tail) {
if ((index & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 ||
block == nullptr) {
if (block != nullptr) {
// Free the old block
this->parent->add_block_to_free_list(block);
}
block = get_block_index_entry_for_index(index)->value.load(
std::memory_order_relaxed);
}
((*block)[index])->~T();
++index;
}
// Even if the queue is empty, there's still one block that's not on the
// free list (unless the head index reached the end of it, in which case
// the tail will be poised to create a new block).
if (this->tailBlock != nullptr &&
(forceFreeLastBlock ||
(tail & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) != 0)) {
this->parent->add_block_to_free_list(this->tailBlock);
}
// Destroy block index
auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
if (localBlockIndex != nullptr) {
for (size_t i = 0; i != localBlockIndex->capacity; ++i) {
localBlockIndex->index[i]->~BlockIndexEntry();
}
do {
auto prev = localBlockIndex->prev;
localBlockIndex->~BlockIndexHeader();
(Traits::free)(localBlockIndex);
localBlockIndex = prev;
} while (localBlockIndex != nullptr);
}
}
template <AllocationMode allocMode, typename U>
inline bool enqueue(U&& element) {
index_t currentTailIndex =
this->tailIndex.load(std::memory_order_relaxed);
index_t newTailIndex = 1 + currentTailIndex;
if ((currentTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) ==
0) {
// We reached the end of a block, start a new one
auto head = this->headIndex.load(std::memory_order_relaxed);
assert(!details::circular_less_than<index_t>(currentTailIndex, head));
if (!details::circular_less_than<index_t>(
head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
(MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
(MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
currentTailIndex - head))) {
return false;
}
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
debug::DebugLock lock(mutex);
#endif
// Find out where we'll be inserting this block in the block index
BlockIndexEntry* idxEntry;
if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
return false;
}
// Get ahold of a new block
auto newBlock =
this->parent
->ConcurrentQueue::template requisition_block<allocMode>();
if (newBlock == nullptr) {
rewind_block_index_tail();
idxEntry->value.store(nullptr, std::memory_order_relaxed);
return false;
}
#ifdef MCDBGQ_TRACKMEM
newBlock->owner = this;
#endif
newBlock
->ConcurrentQueue::Block::template reset_empty<implicit_context>();
MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
// May throw, try to insert now before we publish the fact that we
// have this new block
MOODYCAMEL_TRY {
new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
}
MOODYCAMEL_CATCH(...) {
rewind_block_index_tail();
idxEntry->value.store(nullptr, std::memory_order_relaxed);
this->parent->add_block_to_free_list(newBlock);
MOODYCAMEL_RETHROW;
}
}
// Insert the new block into the index
idxEntry->value.store(newBlock, std::memory_order_relaxed);
this->tailBlock = newBlock;
MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR(
T, U, new (static_cast<T*>(nullptr)) T(std::forward<U>(element)))) {
this->tailIndex.store(newTailIndex, std::memory_order_release);
return true;
}
}
// Enqueue
new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
this->tailIndex.store(newTailIndex, std::memory_order_release);
return true;
}
template <typename U>
bool dequeue(U& element) {
// See ExplicitProducer::dequeue for rationale and explanation
index_t tail = this->tailIndex.load(std::memory_order_relaxed);
index_t overcommit =
this->dequeueOvercommit.load(std::memory_order_relaxed);
if (details::circular_less_than<index_t>(
this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
overcommit,
tail)) {
std::atomic_thread_fence(std::memory_order_acquire);
index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(
1, std::memory_order_relaxed);
tail = this->tailIndex.load(std::memory_order_acquire);
if ((details::likely)(details::circular_less_than<index_t>(
myDequeueCount - overcommit, tail))) {
index_t index =
this->headIndex.fetch_add(1, std::memory_order_acq_rel);
// Determine which block the element is in
auto entry = get_block_index_entry_for_index(index);
// Dequeue
auto block = entry->value.load(std::memory_order_relaxed);
auto& el = *((*block)[index]);
if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
// Note: Acquiring the mutex with every dequeue instead of only when
// a block is released is very sub-optimal, but it is, after all,
// purely debug code.
debug::DebugLock lock(producer->mutex);
#endif
struct Guard {
Block* block;
index_t index;
BlockIndexEntry* entry;
ConcurrentQueue* parent;
~Guard() {
(*block)[index]->~T();
if (block->ConcurrentQueue::Block::template set_empty<
implicit_context>(index)) {
entry->value.store(nullptr, std::memory_order_relaxed);
parent->add_block_to_free_list(block);
}
}
} guard = {block, index, entry, this->parent};
element = std::move(el); // NOLINT
}
else {
element = std::move(el); // NOLINT
el.~T(); // NOLINT
if (block->ConcurrentQueue::Block::template set_empty<
implicit_context>(index)) {
{
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
debug::DebugLock lock(mutex);
#endif
// Add the block back into the global free pool (and remove from
// block index)
entry->value.store(nullptr, std::memory_order_relaxed);
}
this->parent->add_block_to_free_list(
block); // releases the above store
}
}
return true;
}
else {
this->dequeueOvercommit.fetch_add(1, std::memory_order_release);
}
}
return false;
}
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4706) // assignment within conditional expression
#endif
template <AllocationMode allocMode, typename It>
bool enqueue_bulk(It itemFirst, size_t count) {
// First, we need to make sure we have enough room to enqueue all of the
// elements; this means pre-allocating blocks and putting them in the
// block index (but only if all the allocations succeeded).
// Note that the tailBlock we start off with may not be owned by us any
// more; this happens if it was filled up exactly to the top (setting
// tailIndex to the first index of the next block which is not yet
// allocated), then dequeued completely (putting it on the free list)
// before we enqueue again.
index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
auto startBlock = this->tailBlock;
Block* firstAllocatedBlock = nullptr;
auto endBlock = this->tailBlock;
// Figure out how many blocks we'll need to allocate, and do so
size_t blockBaseDiff =
((startTailIndex + count - 1) &
~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) -
((startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1));
index_t currentTailIndex =
(startTailIndex - 1) & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
if (blockBaseDiff > 0) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
debug::DebugLock lock(mutex);
#endif
do {
blockBaseDiff -= static_cast<index_t>(QUEUE_BLOCK_SIZE);
currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
// Find out where we'll be inserting this block in the block index
BlockIndexEntry* idxEntry =
nullptr; // initialization here unnecessary but compiler can't
// always tell
Block* newBlock;
bool indexInserted = false;
auto head = this->headIndex.load(std::memory_order_relaxed);
assert(!details::circular_less_than<index_t>(currentTailIndex, head));
bool full =
!details::circular_less_than<index_t>(
head, currentTailIndex + QUEUE_BLOCK_SIZE) ||
(MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value &&
(MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - QUEUE_BLOCK_SIZE <
currentTailIndex - head));
if (full ||
!(indexInserted = insert_block_index_entry<allocMode>(
idxEntry, currentTailIndex)) ||
(newBlock =
this->parent->ConcurrentQueue::template requisition_block<
allocMode>()) == nullptr) {
// Index allocation or block allocation failed; revert any other
// allocations and index insertions done so far for this operation
if (indexInserted) {
rewind_block_index_tail();
idxEntry->value.store(nullptr, std::memory_order_relaxed);
}
currentTailIndex = (startTailIndex - 1) &
~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
for (auto block = firstAllocatedBlock; block != nullptr;
block = block->next) {
currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
idxEntry = get_block_index_entry_for_index(currentTailIndex);
idxEntry->value.store(nullptr, std::memory_order_relaxed);
rewind_block_index_tail();
}
this->parent->add_blocks_to_free_list(firstAllocatedBlock);
this->tailBlock = startBlock;
return false;
}
#ifdef MCDBGQ_TRACKMEM
newBlock->owner = this;
#endif
newBlock->ConcurrentQueue::Block::template reset_empty<
implicit_context>();
newBlock->next = nullptr;
// Insert the new block into the index
idxEntry->value.store(newBlock, std::memory_order_relaxed);
// Store the chain of blocks so that we can undo if later allocations
// fail, and so that we can find the blocks when we do the actual
// enqueueing
if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) !=
0 ||
firstAllocatedBlock != nullptr) {
assert(this->tailBlock != nullptr);
this->tailBlock->next = newBlock;
}
this->tailBlock = newBlock;
endBlock = newBlock;
firstAllocatedBlock =
firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
} while (blockBaseDiff > 0);
}
// Enqueue, one block at a time
index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
currentTailIndex = startTailIndex;
this->tailBlock = startBlock;
assert((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) !=
0 ||
firstAllocatedBlock != nullptr || count == 0);
if ((startTailIndex & static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0 &&
firstAllocatedBlock != nullptr) {
this->tailBlock = firstAllocatedBlock;
}
while (true) {
index_t stopIndex =
(currentTailIndex & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
stopIndex = newTailIndex;
}
MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR(
T, decltype(*itemFirst),
new (static_cast<T*>(nullptr))
T(details::deref_noexcept(itemFirst)))) {
while (currentTailIndex != stopIndex) {
new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
}
}
else {
MOODYCAMEL_TRY {
while (currentTailIndex != stopIndex) {
new ((*this->tailBlock)[currentTailIndex]) T(
details::nomove_if<!MOODYCAMEL_NOEXCEPT_CTOR(
T, decltype(*itemFirst),
new (static_cast<T*>(nullptr)) T(details::deref_noexcept(
itemFirst)))>::eval(*itemFirst));
++currentTailIndex;
++itemFirst;
}
}
MOODYCAMEL_CATCH(...) {
auto constructedStopIndex = currentTailIndex;
auto lastBlockEnqueued = this->tailBlock;
if (!details::is_trivially_destructible<T>::value) {
auto block = startBlock;
if ((startTailIndex &
static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) == 0) {
block = firstAllocatedBlock;
}
currentTailIndex = startTailIndex;
while (true) {
stopIndex = (currentTailIndex &
~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
if (details::circular_less_than<index_t>(constructedStopIndex,
stopIndex)) {
stopIndex = constructedStopIndex;
}
while (currentTailIndex != stopIndex) {
(*block)[currentTailIndex++]->~T();
}
if (block == lastBlockEnqueued) {
break;
}
block = block->next;
}
}
currentTailIndex = (startTailIndex - 1) &
~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
for (auto block = firstAllocatedBlock; block != nullptr;
block = block->next) {
currentTailIndex += static_cast<index_t>(QUEUE_BLOCK_SIZE);
auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
idxEntry->value.store(nullptr, std::memory_order_relaxed);
rewind_block_index_tail();
}
this->parent->add_blocks_to_free_list(firstAllocatedBlock);
this->tailBlock = startBlock;
MOODYCAMEL_RETHROW;
}
}
if (this->tailBlock == endBlock) {
assert(currentTailIndex == newTailIndex);
break;
}
this->tailBlock = this->tailBlock->next;
}
this->tailIndex.store(newTailIndex, std::memory_order_release);
return true;
}
#ifdef _MSC_VER
#pragma warning(pop)
#endif
template <typename It>
size_t dequeue_bulk(It& itemFirst, size_t max) {
auto tail = this->tailIndex.load(std::memory_order_relaxed);
auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
auto desiredCount = static_cast<size_t>(
tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) -
overcommit));
if (details::circular_less_than<size_t>(0, desiredCount)) {
desiredCount = desiredCount < max ? desiredCount : max;
std::atomic_thread_fence(std::memory_order_acquire);
auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(
desiredCount, std::memory_order_relaxed);
tail = this->tailIndex.load(std::memory_order_acquire);
auto actualCount =
static_cast<size_t>(tail - (myDequeueCount - overcommit));
if (details::circular_less_than<size_t>(0, actualCount)) {
actualCount = desiredCount < actualCount ? desiredCount : actualCount;
if (actualCount < desiredCount) {
this->dequeueOvercommit.fetch_add(desiredCount - actualCount,
std::memory_order_release);
}
// Get the first index. Note that since there's guaranteed to be at
// least actualCount elements, this will never exceed tail.
auto firstIndex =
this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
// Iterate the blocks and dequeue
auto index = firstIndex;
BlockIndexHeader* localBlockIndex;
auto indexIndex =
get_block_index_index_for_index(index, localBlockIndex);
do {
auto blockStartIndex = index;
index_t endIndex =
(index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
endIndex =
details::circular_less_than<index_t>(
firstIndex + static_cast<index_t>(actualCount), endIndex)
? firstIndex + static_cast<index_t>(actualCount)
: endIndex;
auto entry = localBlockIndex->index[indexIndex];
auto block = entry->value.load(std::memory_order_relaxed);
if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&,
details::deref_noexcept(itemFirst) =
std::move((*(*block)[index])))) {
while (index != endIndex) {
auto& el = *((*block)[index]);
*itemFirst++ = std::move(el);
el.~T();
++index;
}
}
else {
MOODYCAMEL_TRY {
while (index != endIndex) {
auto& el = *((*block)[index]);
*itemFirst = std::move(el);
++itemFirst;
el.~T();
++index;
}
}
MOODYCAMEL_CATCH(...) {
do {
entry = localBlockIndex->index[indexIndex];
block = entry->value.load(std::memory_order_relaxed);
while (index != endIndex) {
(*block)[index++]->~T();
}
if (block->ConcurrentQueue::Block::template set_many_empty<
implicit_context>(
blockStartIndex,
static_cast<size_t>(endIndex - blockStartIndex))) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
debug::DebugLock lock(mutex);
#endif
entry->value.store(nullptr, std::memory_order_relaxed);
this->parent->add_block_to_free_list(block);
}
indexIndex =
(indexIndex + 1) & (localBlockIndex->capacity - 1);
blockStartIndex = index;
endIndex =
(index & ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1)) +
static_cast<index_t>(QUEUE_BLOCK_SIZE);
endIndex =
details::circular_less_than<index_t>(
firstIndex + static_cast<index_t>(actualCount),
endIndex)
? firstIndex + static_cast<index_t>(actualCount)
: endIndex;
} while (index != firstIndex + actualCount);
MOODYCAMEL_RETHROW;
}
}
if (block->ConcurrentQueue::Block::template set_many_empty<
implicit_context>(
blockStartIndex,
static_cast<size_t>(endIndex - blockStartIndex))) {
{
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
debug::DebugLock lock(mutex);
#endif
// Note that the set_many_empty above did a release, meaning
// that anybody who acquires the block we're about to free can
// use it safely since our writes (and reads!) will have
// happened-before then.
entry->value.store(nullptr, std::memory_order_relaxed);
}
this->parent->add_block_to_free_list(
block); // releases the above store
}
indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
} while (index != firstIndex + actualCount);
return actualCount;
}
else {
this->dequeueOvercommit.fetch_add(desiredCount,
std::memory_order_release);
}
}
return 0;
}
private:
// The block size must be > 1, so any number with the low bit set is an
// invalid block base index
static const index_t INVALID_BLOCK_BASE = 1;
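// The implicit producer's block index is a circular array of entries that
// map a block's base index (key) to the block itself (value). When it
// fills up, a larger index is allocated and the existing entry pointers
// are carried over (see new_block_index below).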
struct BlockIndexEntry {
std::atomic<index_t> key;
std::atomic<Block*> value;
};
struct BlockIndexHeader {
size_t capacity;
std::atomic<size_t> tail;
BlockIndexEntry* entries;
BlockIndexEntry** index;
BlockIndexHeader* prev;
};
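// Registers a newly acquired block's base index in the block index.
// Fails if the index is full and either allocation is disallowed
// (CannotAlloc) or allocating a larger index fails.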
template <AllocationMode allocMode>
inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry,
index_t blockStartIndex) {
// We're the only writer thread, so a relaxed load is fine here
auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
if (localBlockIndex == nullptr) {
return false; // this can happen if new_block_index failed in the
// constructor
}
size_t newTail =
(localBlockIndex->tail.load(std::memory_order_relaxed) + 1) &
(localBlockIndex->capacity - 1);
idxEntry = localBlockIndex->index[newTail];
if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
idxEntry->value.load(std::memory_order_relaxed) == nullptr) {
idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
localBlockIndex->tail.store(newTail, std::memory_order_release);
return true;
}
// No room in the old block index, try to allocate another one!
MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; }
else if (!new_block_index()) {
return false;
}
else {
localBlockIndex = blockIndex.load(std::memory_order_relaxed);
newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) &
(localBlockIndex->capacity - 1);
idxEntry = localBlockIndex->index[newTail];
assert(idxEntry->key.load(std::memory_order_relaxed) ==
INVALID_BLOCK_BASE);
idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
localBlockIndex->tail.store(newTail, std::memory_order_release);
return true;
}
}
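// Steps the block index tail back by one entry; used to roll back
// insertions when an enqueue throws after the entry was registered.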
inline void rewind_block_index_tail() {
auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
localBlockIndex->tail.store(
(localBlockIndex->tail.load(std::memory_order_relaxed) - 1) &
(localBlockIndex->capacity - 1),
std::memory_order_relaxed);
}
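// Maps a global element index to its slot in the block index: rounds the
// index down to its block base, computes the signed block offset from the
// entry at the current tail, and wraps within the index capacity.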
inline BlockIndexEntry* get_block_index_entry_for_index(
index_t index) const {
BlockIndexHeader* localBlockIndex;
auto idx = get_block_index_index_for_index(index, localBlockIndex);
return localBlockIndex->index[idx];
}
inline size_t get_block_index_index_for_index(
index_t index, BlockIndexHeader*& localBlockIndex) const {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
debug::DebugLock lock(mutex);
#endif
index &= ~static_cast<index_t>(QUEUE_BLOCK_SIZE - 1);
localBlockIndex = blockIndex.load(std::memory_order_acquire);
auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
auto tailBase =
localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
assert(tailBase != INVALID_BLOCK_BASE);
// Note: Must use division instead of shift because the index may wrap
// around, causing a negative offset, whose negativity we want to preserve
auto offset = static_cast<size_t>(
static_cast<typename std::make_signed<index_t>::type>(index -
tailBase) /
static_cast<typename std::make_signed<index_t>::type>(
QUEUE_BLOCK_SIZE));
size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);
assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) ==
index &&
localBlockIndex->index[idx]->value.load(
std::memory_order_relaxed) != nullptr);
return idx;
}
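// Allocates a block index with double the previous capacity in a single
// raw allocation (header, new entries, and the pointer array), carries
// over the entry pointers from the old index, and publishes the new
// header with a release store. Old headers stay chained via `prev` so the
// entries they own (which the new index still points to) remain valid.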
bool new_block_index() {
auto prev = blockIndex.load(std::memory_order_relaxed);
size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;
auto raw = static_cast<char*>((Traits::malloc)(
sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value -
1 + sizeof(BlockIndexEntry) * entryCount +
std::alignment_of<BlockIndexEntry*>::value - 1 +
sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
if (raw == nullptr) {
return false;
}
auto header = new (raw) BlockIndexHeader;
auto entries = reinterpret_cast<BlockIndexEntry*>(
details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
auto index = reinterpret_cast<BlockIndexEntry**>(
details::align_for<BlockIndexEntry*>(
reinterpret_cast<char*>(entries) +
sizeof(BlockIndexEntry) * entryCount));
if (prev != nullptr) {
auto prevTail = prev->tail.load(std::memory_order_relaxed);
auto prevPos = prevTail;
size_t i = 0;
do {
prevPos = (prevPos + 1) & (prev->capacity - 1);
index[i++] = prev->index[prevPos];
} while (prevPos != prevTail);
assert(i == prevCapacity);
}
for (size_t i = 0; i != entryCount; ++i) {
new (entries + i) BlockIndexEntry;
entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);
index[prevCapacity + i] = entries + i;
}
header->prev = prev;
header->entries = entries;
header->index = index;
header->capacity = nextBlockIndexCapacity;
header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1),
std::memory_order_relaxed);
blockIndex.store(header, std::memory_order_release);
nextBlockIndexCapacity <<= 1;
return true;
}
private:
size_t nextBlockIndexCapacity;
std::atomic<BlockIndexHeader*> blockIndex;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
public:
details::ThreadExitListener threadExitListener;
private:
#endif
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
public:
ImplicitProducer* nextImplicitProducer;
private:
#endif
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
mutable debug::DebugMutex mutex;
#endif
#ifdef MCDBGQ_TRACKMEM
friend struct MemStats;
#endif
};
//////////////////////////////////
// Block pool manipulation
//////////////////////////////////
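// Pre-allocates a contiguous array of blocks for the initial block pool.
// These blocks are flagged as not dynamically allocated so
// add_block_to_free_list recycles them instead of freeing them
// individually.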
void populate_initial_block_list(size_t blockCount) {
initialBlockPoolSize = blockCount;
if (initialBlockPoolSize == 0) {
initialBlockPool = nullptr;
return;
}
initialBlockPool = create_array<Block>(blockCount);
if (initialBlockPool == nullptr) {
initialBlockPoolSize = 0;
}
for (size_t i = 0; i < initialBlockPoolSize; ++i) {
initialBlockPool[i].dynamicallyAllocated = false;
}
}
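// Hands out blocks from the initial pool with a simple atomic bump of
// initialBlockPoolIndex; returns nullptr once the pool is exhausted.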
inline Block* try_get_block_from_initial_pool() {
if (initialBlockPoolIndex.load(std::memory_order_relaxed) >=
initialBlockPoolSize) {
return nullptr;
}
auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr;
}
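// Returns a block to the shared free list, or destroys it outright if it
// was heap-allocated and the traits disable block recycling.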
inline void add_block_to_free_list(Block* block) {
#ifdef MCDBGQ_TRACKMEM
block->owner = nullptr;
#endif
if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) {
destroy(block);
}
else {
freeList.add(block);
}
}
inline void add_blocks_to_free_list(Block* block) {
while (block != nullptr) {
auto next = block->next;
add_block_to_free_list(block);
block = next;
}
}
inline Block* try_get_block_from_free_list() { return freeList.try_get(); }
// Gets a free block from one of the memory pools, or allocates a new one (if
// applicable)
template <AllocationMode canAlloc>
Block* requisition_block() {
auto block = try_get_block_from_initial_pool();
if (block != nullptr) {
return block;
}
block = try_get_block_from_free_list();
if (block != nullptr) {
return block;
}
MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { return create<Block>(); }
else {
return nullptr;
}
}
#ifdef MCDBGQ_TRACKMEM
public:
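// Rough memory statistics gathered by walking the free list and every
// producer; only available in MCDBGQ_TRACKMEM builds and not safe to
// collect concurrently with other queue operations.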
struct MemStats {
size_t allocatedBlocks;
size_t usedBlocks;
size_t freeBlocks;
size_t ownedBlocksExplicit;
size_t ownedBlocksImplicit;
size_t implicitProducers;
size_t explicitProducers;
size_t elementsEnqueued;
size_t blockClassBytes;
size_t queueClassBytes;
size_t implicitBlockIndexBytes;
size_t explicitBlockIndexBytes;
friend class ConcurrentQueue;
private:
static MemStats getFor(ConcurrentQueue* q) {
MemStats stats = {0};
stats.elementsEnqueued = q->size_approx();
auto block = q->freeList.head_unsafe();
while (block != nullptr) {
++stats.allocatedBlocks;
++stats.freeBlocks;
block = block->freeListNext.load(std::memory_order_relaxed);
}
for (auto ptr = q->producerListTail.load(std::memory_order_acquire);
ptr != nullptr; ptr = ptr->next_prod()) {
bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
stats.implicitProducers += implicit ? 1 : 0;
stats.explicitProducers += implicit ? 0 : 1;
if (implicit) {
auto prod = static_cast<ImplicitProducer*>(ptr);
stats.queueClassBytes += sizeof(ImplicitProducer);
auto head = prod->headIndex.load(std::memory_order_relaxed);
auto tail = prod->tailIndex.load(std::memory_order_relaxed);
auto hash = prod->blockIndex.load(std::memory_order_relaxed);
if (hash != nullptr) {
for (size_t i = 0; i != hash->capacity; ++i) {
if (hash->index[i]->key.load(std::memory_order_relaxed) !=
ImplicitProducer::INVALID_BLOCK_BASE &&
hash->index[i]->value.load(std::memory_order_relaxed) !=
nullptr) {
++stats.allocatedBlocks;
++stats.ownedBlocksImplicit;
}
}
stats.implicitBlockIndexBytes +=
hash->capacity *
sizeof(typename ImplicitProducer::BlockIndexEntry);
for (; hash != nullptr; hash = hash->prev) {
stats.implicitBlockIndexBytes +=
sizeof(typename ImplicitProducer::BlockIndexHeader) +
hash->capacity *
sizeof(typename ImplicitProducer::BlockIndexEntry*);
}
}
for (; details::circular_less_than<index_t>(head, tail);
head += QUEUE_BLOCK_SIZE) {
// auto block = prod->get_block_index_entry_for_index(head);
++stats.usedBlocks;
}
}
else {
auto prod = static_cast<ExplicitProducer*>(ptr);
stats.queueClassBytes += sizeof(ExplicitProducer);
auto tailBlock = prod->tailBlock;
bool wasNonEmpty = false;
if (tailBlock != nullptr) {
auto block = tailBlock;
do {
++stats.allocatedBlocks;
if (!block->ConcurrentQueue::Block::template is_empty<
explicit_context>() ||
wasNonEmpty) {
++stats.usedBlocks;
wasNonEmpty = wasNonEmpty || block != tailBlock;
}
++stats.ownedBlocksExplicit;
block = block->next;
} while (block != tailBlock);
}
auto index = prod->blockIndex.load(std::memory_order_relaxed);
while (index != nullptr) {
stats.explicitBlockIndexBytes +=
sizeof(typename ExplicitProducer::BlockIndexHeader) +
index->size *
sizeof(typename ExplicitProducer::BlockIndexEntry);
index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(
index->prev);
}
}
}
auto freeOnInitialPool =
q->initialBlockPoolIndex.load(std::memory_order_relaxed) >=
q->initialBlockPoolSize
? 0
: q->initialBlockPoolSize -
q->initialBlockPoolIndex.load(std::memory_order_relaxed);
stats.allocatedBlocks += freeOnInitialPool;
stats.freeBlocks += freeOnInitialPool;
stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
stats.queueClassBytes += sizeof(ConcurrentQueue);
return stats;
}
};
// For debugging only. Not thread-safe.
MemStats getMemStats() { return MemStats::getFor(this); }
private:
friend struct MemStats;
#endif
//////////////////////////////////
// Producer list manipulation
//////////////////////////////////
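// Scans the producer list for an inactive producer of the requested kind
// (explicit vs. implicit) and atomically claims it with a CAS on its
// inactive flag; if none is found, creates a new producer and links it
// into the lock-free producer list via add_producer.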
ProducerBase* recycle_or_create_producer(bool isExplicit) {
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
debug::DebugLock lock(implicitProdMutex);
#endif
// Try to re-use one first
for (auto ptr = producerListTail.load(std::memory_order_acquire);
ptr != nullptr; ptr = ptr->next_prod()) {
if (ptr->inactive.load(std::memory_order_relaxed) &&
ptr->isExplicit == isExplicit) {
bool expected = true;
if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false,
std::memory_order_acquire,
std::memory_order_relaxed)) {
// We caught one! It's been marked as activated; the caller can have
// it
return ptr;
}
}
}
return add_producer(
isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this))
: create<ImplicitProducer>(this));
}
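// Links a freshly created producer into the lock-free, singly-linked
// producer list with a CAS loop on producerListTail; returns nullptr if
// the producer itself failed to allocate.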
ProducerBase* add_producer(ProducerBase* producer) {
// Handle failed memory allocation
if (producer == nullptr) {
return nullptr;
}
producerCount.fetch_add(1, std::memory_order_relaxed);
// Add it to the lock-free list
auto prevTail = producerListTail.load(std::memory_order_relaxed);
do {
producer->next = prevTail;
} while (!producerListTail.compare_exchange_weak(
prevTail, producer, std::memory_order_release,
std::memory_order_relaxed));
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
if (producer->isExplicit) {
auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
do {
static_cast<ExplicitProducer*>(producer)->nextExplicitProducer =
prevTailExplicit;
} while (!explicitProducers.compare_exchange_weak(
prevTailExplicit, static_cast<ExplicitProducer*>(producer),
std::memory_order_release, std::memory_order_relaxed));
}
else {
auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
do {
static_cast<ImplicitProducer*>(producer)->nextImplicitProducer =
prevTailImplicit;
} while (!implicitProducers.compare_exchange_weak(
prevTailImplicit, static_cast<ImplicitProducer*>(producer),
std::memory_order_release, std::memory_order_relaxed));
}
#endif
return producer;
}
void reown_producers() {
// After another instance is moved-into/swapped-with this one, all the
// producers we stole still think their parents are the other queue.
// So fix them up!
for (auto ptr = producerListTail.load(std::memory_order_relaxed);
ptr != nullptr; ptr = ptr->next_prod()) {
ptr->parent = this;
}
}
//////////////////////////////////
// Implicit producer hash
//////////////////////////////////
struct ImplicitProducerKVP {
std::atomic<details::thread_id_t> key;
ImplicitProducer* value; // No need for atomicity since it's only read by
// the thread that sets it in the first place
ImplicitProducerKVP() : value(nullptr) {}
ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT {
key.store(other.key.load(std::memory_order_relaxed),
std::memory_order_relaxed);
value = other.value;
}
inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other)
MOODYCAMEL_NOEXCEPT {
swap(other);
return *this;
}
inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT {
if (this != &other) {
details::swap_relaxed(key, other.key);
std::swap(value, other.value);
}
}
};
template <typename XT, typename XTraits>
friend void moodycamel::swap(
typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&,
typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&)
MOODYCAMEL_NOEXCEPT;
struct ImplicitProducerHash {
size_t capacity;
ImplicitProducerKVP* entries;
ImplicitProducerHash* prev;
};
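// Initializes the statically-sized first-level hash that maps thread IDs
// to their implicit producers; every entry starts out holding the invalid
// thread ID so lookups can tell the slot is empty.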
inline void populate_initial_implicit_producer_hash() {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
return;
}
else {
implicitProducerHashCount.store(0, std::memory_order_relaxed);
auto hash = &initialImplicitProducerHash;
hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
hash->entries = &initialImplicitProducerHashEntries[0];
for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) {
initialImplicitProducerHashEntries[i].key.store(
details::invalid_thread_id, std::memory_order_relaxed);
}
hash->prev = nullptr;
implicitProducerHash.store(hash, std::memory_order_relaxed);
}
}
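// Swaps the implicit-producer hash state with another queue. Because each
// queue embeds its initial hash table by value, any hash pointer or prev
// link that still refers to the other queue's embedded table has to be
// re-pointed at our own, and vice versa.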
void swap_implicit_producer_hashes(ConcurrentQueue& other) {
MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) {
return;
}
else {
// Swap (assumes our implicit producer hash is initialized)
initialImplicitProducerHashEntries.swap(
other.initialImplicitProducerHashEntries);
initialImplicitProducerHash.entries =
&initialImplicitProducerHashEntries[0];
other.initialImplicitProducerHash.entries =
&other.initialImplicitProducerHashEntries[0];
details::swap_relaxed(implicitProducerHashCount,
other.implicitProducerHashCount);
details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
if (implicitProducerHash.load(std::memory_order_relaxed) ==
&other.initialImplicitProducerHash) {
implicitProducerHash.store(&initialImplicitProducerHash,
std::memory_order_relaxed);
}
else {
ImplicitProducerHash* hash;
for (hash = implicitProducerHash.load(std::memory_order_relaxed);
hash->prev != &other.initialImplicitProducerHash;
hash = hash->prev) {
continue;
}
hash->prev = &initialImplicitProducerHash;
}
if (other.implicitProducerHash.load(std::memory_order_relaxed) ==
&initialImplicitProducerHash) {
other.implicitProducerHash.store(&other.initialImplicitProducerHash,
std::memory_order_relaxed);
}
else {
ImplicitProducerHash* hash;
for (hash = other.implicitProducerHash.load(std::memory_order_relaxed);
hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
continue;
}
hash->prev = &other.initialImplicitProducerHash;
}
}
}
// Only fails (returns nullptr) if memory allocation fails
ImplicitProducer* get_or_add_implicit_producer() {
// Note that since the data is essentially thread-local (key is thread ID),
// there's a reduced need for fences (memory ordering is already consistent
// for any individual thread), except for the current table itself.
// Start by looking for the thread ID in the current and all previous hash
// tables. If it's not found, it must not be in there yet, since this same
// thread would have added it previously to one of the tables that we
// traversed.
// Code and algorithm adapted from
// http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
debug::DebugLock lock(implicitProdMutex);
#endif
auto id = details::thread_id();
auto hashedId = details::hash_thread_id(id);
auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
assert(
mainHash !=
nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null)
for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
// Look for the id in this hash
auto index = hashedId;
while (true) { // Not an infinite loop because at least one slot is free
// in the hash table
index &= hash->capacity - 1u;
auto probedKey =
hash->entries[index].key.load(std::memory_order_relaxed);
if (probedKey == id) {
// Found it! If we had to search several hashes deep, though, we
// should lazily add it to the current main hash table to avoid the
// extended search next time. Note there's guaranteed to be room in
// the current hash table since every subsequent table implicitly
// reserves space for all previous tables (there's only one
// implicitProducerHashCount).
auto value = hash->entries[index].value;
if (hash != mainHash) {
index = hashedId;
while (true) {
index &= mainHash->capacity - 1u;
auto empty = details::invalid_thread_id;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
auto reusable = details::invalid_thread_id2;
if (mainHash->entries[index].key.compare_exchange_strong(
empty, id, std::memory_order_seq_cst,
std::memory_order_relaxed) ||
mainHash->entries[index].key.compare_exchange_strong(
reusable, id, std::memory_order_seq_cst,
std::memory_order_relaxed)) {
#else
if (mainHash->entries[index].key.compare_exchange_strong(
empty, id, std::memory_order_seq_cst,
std::memory_order_relaxed)) {
#endif
mainHash->entries[index].value = value;
break;
}
++index;
}
}
return value;
}
if (probedKey == details::invalid_thread_id) {
break; // Not in this hash table
}
++index;
}
}
// Insert!
auto newCount =
1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);
while (true) {
// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
if (newCount >= (mainHash->capacity >> 1) &&
!implicitProducerHashResizeInProgress.test_and_set(
std::memory_order_acquire)) {
// We've acquired the resize lock, try to allocate a bigger hash table.
// Note the acquire fence synchronizes with the release fence at the end
// of this block, and hence when we reload implicitProducerHash it must
// be the most recent version (it only gets changed within this locked
// block).
mainHash = implicitProducerHash.load(std::memory_order_acquire);
if (newCount >= (mainHash->capacity >> 1)) {
size_t newCapacity = mainHash->capacity << 1;
while (newCount >= (newCapacity >> 1)) {
newCapacity <<= 1;
}
auto raw = static_cast<char*>(
(Traits::malloc)(sizeof(ImplicitProducerHash) +
std::alignment_of<ImplicitProducerKVP>::value -
1 + sizeof(ImplicitProducerKVP) * newCapacity));
if (raw == nullptr) {
// Allocation failed
implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
implicitProducerHashResizeInProgress.clear(
std::memory_order_relaxed);
return nullptr;
}
auto newHash = new (raw) ImplicitProducerHash;
newHash->capacity = static_cast<size_t>(newCapacity);
newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(
details::align_for<ImplicitProducerKVP>(
raw + sizeof(ImplicitProducerHash)));
for (size_t i = 0; i != newCapacity; ++i) {
new (newHash->entries + i) ImplicitProducerKVP;
newHash->entries[i].key.store(details::invalid_thread_id,
std::memory_order_relaxed);
}
newHash->prev = mainHash;
implicitProducerHash.store(newHash, std::memory_order_release);
implicitProducerHashResizeInProgress.clear(std::memory_order_release);
mainHash = newHash;
}
else {
implicitProducerHashResizeInProgress.clear(std::memory_order_release);
}
}
// If it's < three-quarters full, add to the old one anyway so that we
// don't have to wait for the next table to finish being allocated by
// another thread (and if we just finished allocating above, the condition
// will always be true)
if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
auto producer =
static_cast<ImplicitProducer*>(recycle_or_create_producer(false));
if (producer == nullptr) {
implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
return nullptr;
}
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
producer->threadExitListener.callback =
&ConcurrentQueue::implicit_producer_thread_exited_callback;
producer->threadExitListener.userData = producer;
details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
#endif
auto index = hashedId;
while (true) {
index &= mainHash->capacity - 1u;
auto empty = details::invalid_thread_id;
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
auto reusable = details::invalid_thread_id2;
if (mainHash->entries[index].key.compare_exchange_strong(
reusable, id, std::memory_order_seq_cst,
std::memory_order_relaxed)) {
implicitProducerHashCount.fetch_sub(
1,
std::memory_order_relaxed); // already counted as a used slot
mainHash->entries[index].value = producer;
break;
}
#endif
if (mainHash->entries[index].key.compare_exchange_strong(
empty, id, std::memory_order_seq_cst,
std::memory_order_relaxed)) {
mainHash->entries[index].value = producer;
break;
}
++index;
}
return producer;
}
// Hmm, the old hash is quite full and somebody else is busy allocating a
// new one. We need to wait for the allocating thread to finish (if it
// succeeds, we add to the new table; if not, we try to allocate one
// ourselves).
mainHash = implicitProducerHash.load(std::memory_order_acquire);
}
}
#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
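// Invoked via the thread-exit listener when a thread that owns an
// implicit producer terminates: its hash slots are marked as reusable
// (invalid_thread_id2) in every hash level that might contain them, and
// the producer is flagged inactive so another thread can recycle it.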
void implicit_producer_thread_exited(ImplicitProducer* producer) {
// Remove from hash
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
debug::DebugLock lock(implicitProdMutex);
#endif
auto hash = implicitProducerHash.load(std::memory_order_acquire);
assert(hash != nullptr); // The thread exit listener is only registered if
// we were added to a hash in the first place
auto id = details::thread_id();
auto hashedId = details::hash_thread_id(id);
details::thread_id_t probedKey;
// We need to traverse all the hashes just in case other threads aren't on
// the current one yet and are trying to add an entry thinking there's a
// free slot (because they reused a producer)
for (; hash != nullptr; hash = hash->prev) {
auto index = hashedId;
do {
index &= hash->capacity - 1u;
probedKey = id;
if (hash->entries[index].key.compare_exchange_strong(
probedKey, details::invalid_thread_id2,
std::memory_order_seq_cst, std::memory_order_relaxed)) {
break;
}
++index;
} while (
probedKey !=
details::invalid_thread_id); // Can happen if the hash has changed
// but we weren't put back in it yet, or
// if we weren't added to this hash in
// the first place
}
// Mark the queue as being recyclable
producer->inactive.store(true, std::memory_order_release);
}
static void implicit_producer_thread_exited_callback(void* userData) {
auto producer = static_cast<ImplicitProducer*>(userData);
auto queue = producer->parent;
queue->implicit_producer_thread_exited(producer);
}
#endif
//////////////////////////////////
// Utility functions
//////////////////////////////////
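// When TAlign requires stricter alignment than max_align_t, we
// over-allocate, align the user pointer manually, and stash the raw
// pointer just before it so aligned_free can recover and free it.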
template <typename TAlign>
static inline void* aligned_malloc(size_t size) {
MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <=
std::alignment_of<details::max_align_t>::value)
return (Traits::malloc)(size);
else {
size_t alignment = std::alignment_of<TAlign>::value;
void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*));
if (!raw)
return nullptr;
char* ptr = details::align_for<TAlign>(reinterpret_cast<char*>(raw) +
sizeof(void*));
*(reinterpret_cast<void**>(ptr) - 1) = raw;
return ptr;
}
}
template <typename TAlign>
static inline void aligned_free(void* ptr) {
MOODYCAMEL_CONSTEXPR_IF(std::alignment_of<TAlign>::value <=
std::alignment_of<details::max_align_t>::value)
return (Traits::free)(ptr);
else
(Traits::free)(ptr ? *(reinterpret_cast<void**>(ptr) - 1) : nullptr);
}
template <typename U>
static inline U* create_array(size_t count) {
assert(count > 0);
U* p = static_cast<U*>(aligned_malloc<U>(sizeof(U) * count));
if (p == nullptr)
return nullptr;
for (size_t i = 0; i != count; ++i) new (p + i) U();
return p;
}
template <typename U>
static inline void destroy_array(U* p, size_t count) {
if (p != nullptr) {
assert(count > 0);
for (size_t i = count; i != 0;) (p + --i)->~U();
}
aligned_free<U>(p);
}
template <typename U>
static inline U* create() {
void* p = aligned_malloc<U>(sizeof(U));
return p != nullptr ? new (p) U : nullptr;
}
template <typename U, typename A1>
static inline U* create(A1&& a1) {
void* p = aligned_malloc<U>(sizeof(U));
return p != nullptr ? new (p) U(std::forward<A1>(a1)) : nullptr;
}
template <typename U>
static inline void destroy(U* p) {
if (p != nullptr)
p->~U();
aligned_free<U>(p);
}
private:
std::atomic<ProducerBase*> producerListTail;
std::atomic<std::uint32_t> producerCount;
std::atomic<size_t> initialBlockPoolIndex;
Block* initialBlockPool;
size_t initialBlockPoolSize;
#ifndef MCDBGQ_USEDEBUGFREELIST
FreeList<Block> freeList;
#else
debug::DebugFreeList<Block> freeList;
#endif
std::atomic<ImplicitProducerHash*> implicitProducerHash;
std::atomic<size_t>
implicitProducerHashCount; // Number of slots logically used
ImplicitProducerHash initialImplicitProducerHash;
std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE>
initialImplicitProducerHashEntries;
std::atomic_flag implicitProducerHashResizeInProgress;
std::atomic<std::uint32_t> nextExplicitConsumerId;
std::atomic<std::uint32_t> globalExplicitConsumerOffset;
#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
debug::DebugMutex implicitProdMutex;
#endif
#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
std::atomic<ExplicitProducer*> explicitProducers;
std::atomic<ImplicitProducer*> implicitProducers;
#endif
};
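// Out-of-line token constructors. A ProducerToken claims (or creates) an
// explicit producer up front so later enqueues skip the per-thread
// producer lookup; a ConsumerToken grabs a rotation offset used to spread
// consumers across producers. Illustrative usage, e.g.:
//
//   moodycamel::ConcurrentQueue<int> q;
//   moodycamel::ProducerToken ptok(q);
//   moodycamel::ConsumerToken ctok(q);
//   q.enqueue(ptok, 42);
//   int v;
//   bool ok = q.try_dequeue(ctok, v);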
template <typename T, typename Traits>
ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
: producer(queue.recycle_or_create_producer(true)) {
if (producer != nullptr) {
producer->token = this;
}
}
template <typename T, typename Traits>
ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
->recycle_or_create_producer(true)) {
if (producer != nullptr) {
producer->token = this;
}
}
template <typename T, typename Traits>
ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
: itemsConsumedFromCurrent(0),
currentProducer(nullptr),
desiredProducer(nullptr) {
initialOffset =
queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
}
template <typename T, typename Traits>
ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
: itemsConsumedFromCurrent(0),
currentProducer(nullptr),
desiredProducer(nullptr) {
initialOffset =
reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)
->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);
lastKnownGlobalOffset = static_cast<std::uint32_t>(-1);
}
template <typename T, typename Traits>
inline void swap(ConcurrentQueue<T, Traits>& a,
ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT {
a.swap(b);
}
inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT {
a.swap(b);
}
inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT {
a.swap(b);
}
template <typename T, typename Traits>
inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a,
typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b)
MOODYCAMEL_NOEXCEPT {
a.swap(b);
}
} // namespace moodycamel
#if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17)
#pragma warning(pop)
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic pop
#endif