opentelemetry-cpp/api/test/common/spinlock_benchmark.cc

// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

#include "opentelemetry/common/spin_lock_mutex.h"

#include <benchmark/benchmark.h>
#include <mutex>

namespace
{
using opentelemetry::common::SpinLockMutex;

constexpr int TightLoopLocks = 10000;

// Runs a thrash-test where we spin up N threads, each of which will
// attempt to lock-mutate-unlock a total of `TightLoopLocks` times.
//
// lock: A lambda denoting how to lock.   Accepts a reference to `SpinLockType`.
// unlock: A lambda denoting how to unlock.   Accepts a reference to `SpinLockType`.
template <typename SpinLockType, typename LockF, typename UnlockF>
inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
{
  auto num_threads = s.range(0);
  // Value we will increment, fighting over a spinlock.
  // The contention is meant to be brief, as close to our expected
  // use cases of "updating pointers" or "pushing an event onto a buffer".
  std::int64_t value = 0;

  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  // Timing loop
  for (auto _ : s)
  {
    for (auto i = 0; i < num_threads; i++)
    {
      threads.emplace_back([&] {
        // Increment value once each time the lock is acquired.  Spin a few times
        // to ensure maximum thread contention.
        for (int i = 0; i < TightLoopLocks; i++)
        {
          lock(spinlock);
          value++;
          unlock(spinlock);
        }
      });
    }
    // Join threads
    for (auto &thread : threads)
      thread.join();
    threads.clear();
  }
}

// Benchmark of full spin-lock implementation.
static void BM_SpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash(
      s, spinlock, [](SpinLockMutex &m) { m.lock(); }, [](SpinLockMutex &m) { m.unlock(); });
}

// Naive `while(try_lock()) {}` implementation of lock.
static void BM_NaiveSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
          // Left this comment to keep the same format on old and new versions of clang-format
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}

// Simple `while(try_lock()) { yield-processor }`
static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
{
  SpinLockMutex spinlock;
  SpinThrash<SpinLockMutex>(
      s, spinlock,
      [](SpinLockMutex &m) {
        while (!m.try_lock())
        {
#if defined(_MSC_VER)
          YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
#  if defined(__clang__)
          _mm_pause();
#  else
          __builtin_ia32_pause();
#  endif
#elif defined(__arm__)
          __yield();
#endif
        }
      },
      [](SpinLockMutex &m) { m.unlock(); });
}

// SpinLock thrashing with thread::yield().
static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
{
  std::atomic_flag mutex = ATOMIC_FLAG_INIT;
  SpinThrash<std::atomic_flag>(
      s, mutex,
      [](std::atomic_flag &l) {
        uint32_t try_count = 0;
        while (l.test_and_set(std::memory_order_acq_rel))
        {
          ++try_count;
          if (try_count % 32)
          {
            std::this_thread::yield();
          }
        }
        std::this_thread::yield();
      },
      [](std::atomic_flag &l) { l.clear(std::memory_order_release); });
}

// Run the benchmarks at 2x thread/core and measure the amount of time to thrash around.
BENCHMARK(BM_SpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ProcYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_NaiveSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ThreadYieldSpinLockThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

}  // namespace

BENCHMARK_MAIN();