New upstream version 8.1.0

This commit is contained in:
geos_one
2025-08-10 01:34:16 +02:00
commit c891bb7105
4398 changed files with 838833 additions and 0 deletions

2500
meta/source/pmq/pmq.cpp Normal file

File diff suppressed because it is too large Load Diff

243
meta/source/pmq/pmq.hpp Normal file
View File

@@ -0,0 +1,243 @@
#pragma once
#include <stdint.h> // uint64_t etc.
#include <stddef.h> // size_t
// Statistics counters maintained by the enqueuing (producer) side.
struct PMQ_Enqueuer_Stats
{
    // how many times was the buffer filled up (the flusher couldn't keep up)?
    uint64_t buffer_full_count;
    uint64_t total_messages_enqueued; // messages accepted so far
    uint64_t total_bytes_enqueued;    // payload bytes accepted so far
};
// Statistics counters maintained by the persisting (flusher/WAL) side.
struct PMQ_Persister_Stats
{
    uint64_t num_async_flushes; // calls to pmq_sync()
    uint64_t wakeups;           // persister wakeups -- exact trigger not visible here; confirm in pmq.cpp
    uint64_t fsync_calls;       // presumably fsync() invocations -- confirm in pmq.cpp
    uint64_t wal_flushes;       // number of write-ahead-log flushes
    uint64_t wal_flush_bytes;   // total bytes written by WAL flushes
};
// Aggregate of all queue statistics; filled in by pmq_get_stats().
struct PMQ_Stats
{
    PMQ_Enqueuer_Stats enqueuer;
    PMQ_Persister_Stats persister;
};
struct PMQ;
/* Parameters for creating a new queue object (see pmq_create()).
 * If basedir_path exists, try to load existing queue data structures from disk.
 * Otherwise, create the directory and initialize a new queue there.
 * A queue uses approximately the number of bytes that were specified in
 * create_size at the time of creation. (Something like 2 GiB is not
 * unreasonable).
 */
struct PMQ_Init_Params
{
    const char *basedir_path; // queue directory: loaded if it exists, created and initialized otherwise
    uint64_t create_size;     // approximate on-disk size in bytes, fixed at creation time
};
PMQ *pmq_create(const PMQ_Init_Params *params);
/* Destroy queue object. This will first flush the remaining buffered messages to disk.
*/
void pmq_destroy(PMQ *q);
bool pmq_enqueue_msg(PMQ *q, const void *data, size_t size);
bool pmq_sync(PMQ *q);
void pmq_get_stats(PMQ *q, PMQ_Stats *stats);
/* Information about persisted data */
// (CSN = chunk sequence number, MSN = message sequence number)
struct PMQ_Persist_Info
{
    uint64_t cks_discard_csn; // oldest CSN in the chunk store (next chunk to be discarded)
    uint64_t cks_msn; // next MSN to hit the chunk store
    uint64_t wal_msn; // next MSN to hit the WAL
};
PMQ_Persist_Info pmq_get_persist_info(PMQ *q);
/*
* Get an updated value of the byte range of the underlying data store.
* The returned range will be chunk-aligned, but what size chunks are is
* currently not exposed in this API.
*/
/* */
/* Result codes for read operations on a PMQ reader. */
enum PMQ_Read_Result
{
    // The message was successfully read back.
    PMQ_Read_Result_Success,
    // The caller's buffer has insufficient size. (The required size is
    // still reported back.)
    PMQ_Read_Result_Buffer_Too_Small,
    // The requested data is at the current end of the storage area/window --
    // i.e. it is the next data that will be written. Try again later.
    // TODO: We might want to introduce mechanisms to block until new data
    // arrives at every level. Currently this has to be implemented in the
    // integration code.
    PMQ_Read_Result_EOF,
    // The requested data is not present. Maybe it was discarded
    // concurrently? It is safe to re-position the cursor and retry.
    PMQ_Read_Result_Out_Of_Bounds,
    // An error was detected by the storage layer.
    PMQ_Read_Result_IO_Error,
    // The data read back from the storage layer failed validation.
    PMQ_Read_Result_Integrity_Error,
};

/* Map a PMQ_Read_Result to a short human-readable name (for logging). */
static inline const char *pmq_read_result_string(PMQ_Read_Result readres)
{
    // Table indexed by enumerator value. The enumerators above are
    // implicitly numbered 0..N-1 in declaration order.
    static const char *const names[] = {
        "Success",
        "Buffer_Too_Small",
        "EOF",
        "Out_Of_Bounds",
        "IO_Error",
        "Integrity_Error",
    };
    size_t index = (size_t) readres;
    if (index >= sizeof names / sizeof names[0])
        return "(invalid value)";
    return names[index];
}
struct PMQ_Reader;
PMQ_Reader *pmq_reader_create(PMQ *q);
void pmq_reader_destroy(PMQ_Reader *reader);
PMQ *pmq_reader_get_pmq(PMQ_Reader *reader);
/* Position cursor at the next incoming message -- or, in other words, at the
* current write end of the queue. */
PMQ_Read_Result pmq_reader_seek_to_current(PMQ_Reader *reader);
/* Position cursor at the oldest message (the first message in the chunk
* cks_discard). Note that this is rarely a good idea since this message is
* likely to be discarded concurrently, so it runs risk of losing sync
* immediately or shortly. */
PMQ_Read_Result pmq_reader_seek_to_oldest(PMQ_Reader *reader);
/* Position cursor to given msn. MSNs cannot be directly addressed. The
* implementation will have to load multiple chunks to find it.
* This also means that the call can fail -- I/O errors etc. can be returned.
*/
PMQ_Read_Result pmq_reader_seek_to_msg(PMQ_Reader *reader, uint64_t msn);
/* Read the current message and advance. On success, returns the size of the
* message that was read in @out_size and advances to the next message
* internally.
*/
PMQ_Read_Result pmq_read_msg(PMQ_Reader *reader,
void *data, size_t size, size_t *out_size);
uint64_t pmq_reader_get_current_msn(PMQ_Reader *reader);
/* Attempt to find the MSN of the oldest persisted message.
*
* Note that the MSN that ends up being returned might already be discarded
* once the caller tries to read that message. So calling this function might
* not be a good idea.
*
 * Another difficulty, at the implementation level, is that the implementation
* needs to read the oldest chunk to know the oldest MSN in that chunk. But the
* oldest chunk may be discarded concurrently, so reading it might fail. In
* case of a concurrent discard, the implementation will update its
* oldest-chunk information and then skip ahead some chunks, trying to read a
* slightly newer chunk. This makes the operation more likely to succeed next
* time. This continues until either a chunk was read successfully, or we run
* out of persisted chunks. In the latter case, the implementation returns the
* current "next" MSN. The PMQ always keeps track of this information, so we
* can know it without reading a chunk from disk.
*/
uint64_t pmq_reader_find_old_msn(PMQ_Reader *reader);
/* Equivalent to pmq_get_persist_info(pmq_reader_get_pmq(reader)); */
PMQ_Persist_Info pmq_reader_get_persist_info(PMQ_Reader *reader);
/* pmq_reader_eof() -- Inexpensive check if there are messages available
* currently.
* This allows a concurrent reader procedure synchronize with writers without
* having to actually read a message while holding a lock -- which could block
* writers for a long time if we have to do actual I/O.
*/
bool pmq_reader_eof(PMQ_Reader *reader);
// C++ RAII wrappers
// unique_ptr is maybe not precisely what we're looking for. So we're using some boilerplate instead.
//#include <memory>
//using PMQ_Handle = std::unique_ptr<PMQ, decltype(pmq_destroy)>;
//using PMQ_Reader_Handle = std::unique_ptr<PMQ_Reader, decltype(pmq_reader_destroy)>;
// RAII holder for a C-style handle (PMQ * / PMQ_Reader *).
//
// Owns a single pointer and calls Deleter() on it when the wrapper is
// destroyed, or when a new pointer is assigned. The wrapper is move-only:
// copying would make two owners run Deleter on the same pointer.
template<typename T, void Deleter(T *)>
class PMQ_Handle_Wrapper
{
    T *m_ptr = nullptr;
public:
    // Raw pointer access (may be null).
    T *get() const
    {
        return m_ptr;
    }
    // Release the held object (if any) by calling Deleter on it.
    void drop()
    {
        if (m_ptr)
        {
            Deleter(m_ptr);
            m_ptr = nullptr;
        }
    }
    operator T *() const // automatic implicit cast to T *
    {
        return m_ptr;
    }
    explicit operator bool() const
    {
        return m_ptr != nullptr;
    }
    // Move assignment: drop our current object and steal the other's pointer.
    // Implemented without std::swap so this header does not depend on
    // <utility> being transitively included (only stdint.h/stddef.h are
    // included above).
    void operator=(PMQ_Handle_Wrapper&& other)
    {
        if (this != &other)
        {
            drop();
            m_ptr = other.m_ptr;
            other.m_ptr = nullptr;
        }
    }
    // Take ownership of a raw pointer, dropping any previously held object.
    void operator=(T *ptr)
    {
        drop();
        m_ptr = ptr;
    }
    // Copying is forbidden: two wrappers owning the same pointer would both
    // run Deleter on it. (The copy constructor was already implicitly
    // deleted by the user-declared move assignment; spelling it out makes
    // the intent explicit.)
    PMQ_Handle_Wrapper(PMQ_Handle_Wrapper const& other) = delete;
    void operator=(PMQ_Handle_Wrapper const& other) = delete;
    // Move constructor (previously missing): without it the wrapper could
    // not be move-constructed or returned by value even though move
    // assignment existed.
    PMQ_Handle_Wrapper(PMQ_Handle_Wrapper&& other)
        : m_ptr(other.m_ptr)
    {
        other.m_ptr = nullptr;
    }
    explicit PMQ_Handle_Wrapper(T *ptr = nullptr)
        : m_ptr(ptr)
    {
    }
    ~PMQ_Handle_Wrapper()
    {
        drop();
    }
};
using PMQ_Handle = PMQ_Handle_Wrapper<PMQ, pmq_destroy>;
using PMQ_Reader_Handle = PMQ_Handle_Wrapper<PMQ_Reader, pmq_reader_destroy>;

View File

@@ -0,0 +1,350 @@
#pragma once
#include <cassert>
#include <errno.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdint.h>
#include <cstdlib>
#include <cstring>
#include <unistd.h>
#include <new>
// macro to align variables to cache line size
// There is C++ standardized value of std::hardware_destructive_interference_size.
// However that currently produces a warning, probably because of concerns about ABI stability.
// So instead I just hardcode a cache line size of 64 bytes for now.
// The worst that could happen would be bad performance.
//#define __pmq_cache_aligned alignas(std::hardware_destructive_interference_size)
#define __pmq_cache_aligned alignas(64)
// These #define's work for GCC and possibly other compilers. To guarantee
// that these definitions are active wherever they could potentially work, I
// will define them unconditionally for now, instead of guarding them with
// #ifdef __GNUC__.
// TODO: try on more compilers and improve compatibility logic!
#if PMQ_WITH_PROFILING
// noinline keeps profiled functions visible as distinct frames to profilers.
#define __pmq_profiled __attribute__((noinline)) // could consider attribute "noipa" instead of "noinline"
#else
// Profiling disabled: __pmq_profiled expands to nothing.
#define __pmq_profiled
#endif
// "artificial" is used for small inlined wrapper methods, such as operator[].
// In theory (and to some extent in practice) the effect should be that the
// code that gets inlined to a call site gets attributed to the _call site_
// instead of to the definition site of the inlined function -- reducing the
// effect of jumping around like wild between files when debugging.
#define __pmq_artificial_method inline __attribute__((always_inline, artificial))
#define __pmq_artificial_func static inline __attribute__((always_inline, artificial))
// Attribute used for logging functions and other printf-style functions. If
// these functions are properly annotated, the compiler can check matching
// arguments in usage places.
// fmt_index is the 1-based position of the format-string parameter;
// first_arg_index is the 1-based position of the first variadic argument.
#define __pmq_formatter(fmt_index, first_arg_index) \
__attribute__((format(printf, (fmt_index), (first_arg_index))))
// treat format warnings as errors for the PMQ
// This could be a build system flag but for now I want the change just for
// this module in the larger system
#pragma GCC diagnostic error "-Wformat"
#ifdef NDEBUG
// Release builds: pmq_assert compiles to nothing.
#define pmq_assert(expr)
#else
// Assertion-failure handler: delay briefly, then abort via glibc's
// __assert_fail (the routine behind the standard assert() macro).
static inline void __pmq_assert_fail(const char *expr, const char *file, int line, const char *func)
{
    // this hopefully gives the logger a chance to save the logs.
    // If there was time, we should probably implement the logger in a separate component,
    // communicating using a shared memory mapping.
    sleep(3);
    __assert_fail(expr, file, line, func);
}
// Debug builds: like assert(), but routed through __pmq_assert_fail above.
#define pmq_assert(expr) do { if (! (expr)) { __pmq_assert_fail(#expr, __FILE__, __LINE__, __func__); } } while (0)
#endif
// Debug-check that ptr is aligned to a multiple of `size` bytes.
__pmq_artificial_func
void __pmq_assert_aligned(const void *ptr, size_t size)
{
    uintptr_t address = (uintptr_t) ptr;
    assert(address % size == 0);
}
// Tell the optimizer that ptr is aligned to `size` bytes (checked with an
// assertion in debug builds) and return the same pointer, cast away const.
// The aligned(size) attribute on the return type is a GCC extension.
template<size_t size, typename T>
__pmq_artificial_func
T __attribute__((aligned(size))) *__pmq_assume_aligned(const T *ptr)
{
    __pmq_assert_aligned(ptr, size);
    return (T *) __builtin_assume_aligned(ptr, size);
}
// True iff value has exactly one bit set. value must be non-zero
// (zero would otherwise be misreported as a power of two).
static inline bool pmq_is_power_of_2(uint64_t value)
{
    assert(value != 0);
    // A power of two shares no bits with its predecessor.
    return ! (value & (value - 1));
}
// For a power-of-2 value, return the mask covering all lower bits (value - 1).
static inline uint64_t pmq_mask_power_of_2(uint64_t value)
{
    uint64_t mask = value - 1;
    assert(value != 0);
    assert((value & mask) == 0); // must be a power of 2
    return mask;
}
// Byte-size helpers: a count in the given binary unit, converted to bytes.
static inline constexpr uint64_t PMQ_Kilobytes(uint64_t n) { return n << 10; }
static inline constexpr uint64_t PMQ_Megabytes(uint64_t n) { return n << 20; }
static inline constexpr uint64_t PMQ_Gigabytes(uint64_t n) { return n << 30; }
static inline constexpr uint64_t PMQ_Terabytes(uint64_t n) { return n << 40; }
static inline constexpr uint64_t PMQ_Petabytes(uint64_t n) { return n << 50; }
/* Untyped slice: a bare (pointer, byte-count) view of a memory range.
 * Mainly used for slice-copy operations, covering both memory and disk I/O.
 * It saves boilerplate and is a little safer than passing raw pointer/size
 * pairs. Carries no ownership or lifetime semantics.
 *
 * A standard C++ type might replace this eventually; meanwhile the
 * __pmq_artificial_method annotation improves the debugging experience.
 */
class Untyped_Slice
{
    void *m_base;
    size_t m_num_bytes;
public:
    __pmq_artificial_method void *data() const { return m_base; }
    __pmq_artificial_method size_t size() const { return m_num_bytes; }
    // View with the first `offset` bytes dropped.
    __pmq_artificial_method
    Untyped_Slice offset_bytes(size_t offset) const
    {
        assert(offset <= m_num_bytes);
        return Untyped_Slice((char *) m_base + offset, m_num_bytes - offset);
    }
    // Prefix view restricted to the first `size` bytes.
    __pmq_artificial_method
    Untyped_Slice limit_size_bytes(size_t size) const
    {
        assert(size <= m_num_bytes);
        return Untyped_Slice(m_base, size);
    }
    // View of `size` bytes starting at `offset`.
    __pmq_artificial_method
    Untyped_Slice sub_slice_bytes(size_t offset, size_t size) const
    {
        return offset_bytes(offset).limit_size_bytes(size);
    }
    __pmq_artificial_method
    Untyped_Slice() : m_base(nullptr), m_num_bytes(0)
    {
    }
    __pmq_artificial_method
    Untyped_Slice(void *data, size_t size) : m_base(data), m_num_bytes(size)
    {
    }
};
// Fill the whole slice with zero bytes.
__pmq_artificial_func
void zero_out_slice(Untyped_Slice slice)
{
    memset(slice.data(), 0, slice.size());
}
// Copy between two slices of exactly equal size.
__pmq_artificial_func
void copy_slice(Untyped_Slice dst, Untyped_Slice src)
{
    size_t num_bytes = src.size();
    assert(dst.size() == num_bytes);
    memcpy(dst.data(), src.data(), num_bytes);
}
// Copy size_bytes bytes from the front of src to the front of dst.
// Both slices must be at least size_bytes long.
__pmq_artificial_func
void copy_slice_bytes(Untyped_Slice dst, Untyped_Slice src, size_t size_bytes)
{
    assert(dst.size() >= size_bytes);
    assert(src.size() >= size_bytes);
    memcpy(dst.data(), src.data(), size_bytes);
}
// Copy `size` bytes from `data` into the front of `slice` (must fit).
__pmq_artificial_func
void copy_to_slice(Untyped_Slice slice, const void *data, size_t size)
{
    assert(size <= slice.size());
    memcpy(slice.data(), data, size);
}
// Copy `size` bytes out of the front of `slice` into `data` (slice must be
// at least that long).
__pmq_artificial_func
void copy_from_slice(void *data, Untyped_Slice slice, size_t size)
{
    assert(size <= slice.size());
    memcpy(data, slice.data(), size);
}
/*
 * Typed slice: a non-owning view of `count()` elements of type T.
 *
 * Note, we should check if we can replace this using std::span (C++20).
 */
template<typename T>
class Slice
{
    T *m_elems;
    size_t m_num;
public:
    __pmq_artificial_method
    T *data() const { return m_elems; }
    __pmq_artificial_method
    size_t count() const { return m_num; }
    // Total size of the viewed elements, in bytes.
    __pmq_artificial_method
    size_t size_in_bytes() const { return m_num * sizeof (T); }
    // Bounds-checked read, by value.
    __pmq_artificial_method
    T get(size_t index) const
    {
        assert(index < m_num);
        return m_elems[index];
    }
    // Bounds-checked element reference.
    __pmq_artificial_method
    T& at(size_t index)
    {
        assert(index < m_num);
        return m_elems[index];
    }
    __pmq_artificial_method
    T const& at(size_t index) const
    {
        assert(index < m_num);
        return m_elems[index];
    }
    // Reinterpret as an untyped byte slice over the same memory.
    __pmq_artificial_method
    Untyped_Slice untyped() const
    {
        return Untyped_Slice(m_elems, m_num * sizeof (T));
    }
    // Suffix sub-slice starting at start_index.
    __pmq_artificial_method
    Slice<T> slice_from(size_t start_index)
    {
        assert(start_index <= m_num);
        return Slice<T>(m_elems + start_index, m_num - start_index);
    }
    // Prefix sub-slice of the first `count` elements.
    __pmq_artificial_method
    Slice<T> slice_to(size_t count)
    {
        assert(count <= m_num);
        return Slice<T>(m_elems, count);
    }
    // Sub-slice of `count` elements starting at start_index.
    __pmq_artificial_method
    Slice<T> sub_slice(size_t start_index, size_t count)
    {
        return slice_from(start_index).slice_to(count);
    }
    __pmq_artificial_method
    Slice() : m_elems(nullptr), m_num(0)
    {
    }
    __pmq_artificial_method
    Slice(T *data, size_t count) : m_elems(data), m_num(count)
    {
    }
};
// Copy `size` raw bytes from `data` into the slice's storage (must fit).
template<typename T>
__pmq_artificial_func
void copy_to_slice(Slice<T> slice, const void *data, size_t size)
{
    assert(size <= slice.size_in_bytes());
    memcpy(slice.data(), data, size);
}
// Copy `size` raw bytes out of the slice's storage into `data`.
template<typename T>
__pmq_artificial_func
void copy_from_slice(void *data, Slice<T> slice, size_t size)
{
    assert(size <= slice.size_in_bytes());
    memcpy(data, slice.data(), size);
}
// Wrapper around a bare pointer to a single object. Semantics are the same
// as for a pointer, except that indexing is not provided: the point of this
// class is to make clear that it refers to one object, never an array.
// Unlike a C++ reference (T& value), there are no surprises: value syntax,
// pointer semantics.
template<typename T>
class Pointer
{
    T *m_ptr;
public:
    __pmq_artificial_method
    T *ptr() const { return m_ptr; }
    __pmq_artificial_method
    const T *const_ptr() const { return m_ptr; }
    // Const-qualified view of the same pointee.
    __pmq_artificial_method
    Pointer<const T> as_const() const { return Pointer<const T>(m_ptr); }
    __pmq_artificial_method
    T *operator->() { return m_ptr; }
    // Construction requires a non-null pointer (checked in debug builds).
    __pmq_artificial_method
    Pointer(T *ptr) : m_ptr(ptr)
    {
        assert(ptr);
    }
};

View File

@@ -0,0 +1,851 @@
#pragma once
#include <new> // std::bad_alloc
#include "pmq_base.hpp"
#include "pmq_logging.hpp"
#include "pmq_posix_io.hpp"
#include "pmq_profiling.hpp"
#include <sys/fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <dirent.h>
//
// Simple allocating slice class with delayed allocation.
//
// Why don't I just use std::vector or similar? After all, std::vector is a
// well known standard solution that allocates a contiguous buffer of memory.
//
// I understand this concern, but I've written several simple classes anyway.
// Let me try and defend this case of "NIH". (It may or may not convince the
// reader).
//
// This code is much more straightforward and simple compared to STL headers.
// It is basically "new" and "delete" wrapped in a simple package together with
// operator[] and a way to get a slice to the memory without attached lifetime
// semantics.
//
// Most of the STL classes try to be very generic solutions applicable in a
// wide variety of use cases. While using with standardized solutions has the
// advantages of familiarity, this flexibility and wide applicability comes
// with a complexity cost that brings a disadvantage to anyone working with the
// codebase.
//
// Beyond having a fill level separate from allocation size (size() vs
// capacity()), std::vector has all sorts of methods and functionality to
// support pushing, popping, emplacing, iterators, constructors, destructors,
// and so on. It is highly flexible, which shows whenever an actual
// instanciated vector type is printed on the terminal, including template
// allocator parameter amongst other things.
//
// All this is ill-fitting for our simple use case. For a queue we just need a
// few preallocated buffers. Just for convenience and to get a little safety,
// the Alloc_Slice wrapper class was created -- so we can do bounds checking
// and get to automatically deallocate the buffers in the destructor.
//
// The size() field and associated semantics that come with std::vector are
// baggage that we can't make use of (we have multiple cursors that wrap around
// our buffers in circular fashion). These semantics are not just available,
// but are understood by programmers as how std::vector gets used.
//
// From a mere functionality standpoint this shouldn't be an issue -- We could
// make sure that we call .resize(N) only once in the beginning and never call
// e.g. push_back(), emplace_back(), reserve(), or similar. This way we'd
// essentially be considering the size() as a constant i.e. ignore it.
//
// However, again, this usage of the type is not guaranteed. The sight of a
// std::vector normally suggests pushing (maybe popping), resizing and
// reserving, buffer reallocation, pointer/iterator invalidation, and runtime
// exceptions.
//
// With the Alloc_Slice class on the other hand, there is no reallocation and
// consequently no iterator invalidation. Exceptions might or might not happen
// depending on compile settings -- but only at construction time, i.e. program
// startup. Because no reallocations are possible, no pointer invalidation /
// iterator invalidation is possible.
//
// Compared to std::vector and other STL headers, significantly less header
// code gets included, so the code compiles quicker. How much quicker? In a
// simple test with a single file, adding any of vector, string, map etc.
// added around 100ms of compilation time (each). I believe I've seen much worse,
// but just multiply 100-400ms by the number of files in a large project and
// there may be a good argument for avoiding to include STL headers based on
// build time. (TODO: refer to example program).
//
// In fairness, this problem may be partially solved with precompiled headers,
// but those come with some issues too. (build setup, pollution, still have to
// compile on each rebuild or precompiled header change).
//
// With the Alloc_Slice class, methods like operator[] have been marked as
// "artificial", meaning it's easier to debug code without jumping all over the
// place. With std::vector and similar classes, I believe there is no way, or
// no standardized way, to build such that we don't jump around files like wild
// when debugging.
//
// If these arguments haven't been convincing, I'll end it now anyway -- the
// text is already much bigger than the actual code.
// Simple allocating slice class with delayed allocation. Owns a heap array
// of fixed capacity; see the discussion above for why this is preferred over
// std::vector here.
template<typename T>
class Alloc_Slice
{
    T *m_ptr = nullptr;
    size_t m_capacity = 0;
public:
    // Unchecked element access.
    __pmq_artificial_method
    T& operator[](size_t i) const
    {
        return m_ptr[i];
    }
    __pmq_artificial_method
    T *data() const
    {
        return m_ptr;
    }
    __pmq_artificial_method
    size_t capacity() const
    {
        return m_capacity;
    }
    // Non-owning typed view of the whole buffer.
    __pmq_artificial_method
    Slice<T> slice() const
    {
        return Slice<T>(m_ptr, m_capacity);
    }
    __pmq_artificial_method
    Untyped_Slice untyped_slice() const
    {
        return slice().untyped();
    }
    // One-shot allocation; must not already be allocated.
    void allocate(size_t capacity)
    {
        assert(! m_ptr);
        m_ptr = new T[capacity];
        m_capacity = capacity;
    }
    // Copying is deleted: previously the implicit copy operations were
    // available, so copying an Alloc_Slice made two owners of the same
    // buffer and a double delete[] in the destructors.
    Alloc_Slice(Alloc_Slice const&) = delete;
    Alloc_Slice& operator=(Alloc_Slice const&) = delete;
    // Deleting the copy constructor suppresses the implicit default
    // constructor, so restore it explicitly.
    Alloc_Slice() = default;
    ~Alloc_Slice()
    {
        delete[] m_ptr;
    }
};
// Posix_FD
//
// Simple file-descriptor holder. The only important purpose is automatically
// closing the fd in the destructor. Setting the fd can happen in the
// constructor or be delayed. A new fd can be set after closing the old one.
// The fd can be retrieved using the .get() method. There are no other methods
// defined, the point here is not to make an abstraction over FDs but just to
// auto-close it.
//
// There is not much more to say. A concern was brought up that it would be
// better to use an existing class. Again, it's important to note that we're
// not trying to add some (probably ill-defined) abstraction. The fact that
// this class stores fds is not hidden and there isn't any I/O functionality
// contained.
//
// Given this, I wasn't sure what existing class to use that does the same
// thing. This Posix_FD class was quick and easy to write and I hope it is easy
// to read too.
//
// Another concern was that we shouldn't use close() directly here, but instead
// use an abstraction (from an existing library) that papers over platform
// differences such that the code can work on e.g. Windows too. (Windows has a
// Posix FS layer as well but the code probably wouldn't work without extra
// work and handling of subtle differences).
//
// I can understand this concern, however BeeGFS can not be easily ported to
// e.g. Windows anyway, and this has never been a declared goal of the project.
// BeeGFS currently can't build on Windows and probably never will.
//
// The usage code currently makes non-trivial use of advanced POSIX and Linux
// functions, such as openat(), fsync(), mmap(), pread(), pwrite(). sendfile()
// was used earlier, and might come back. We rely on Posix file permissions
// too, and on certain semantics like for example O_CREAT | O_EXCL during file
// creation.
//
// I'm not aware of a better API that is more portable while providing the same
// functionality.
//
// Also, papering over platform differences may be harder than it initially
// sounds as soon as good performance and thus good control and good error
// handling is a requirement. To be portable, special handling of platform
// idiosyncracies might be required, and the architecture would have to change
// anyway: away from synchronous function calls which would make the
// abstraction leak into the core code, and towards a more asynchronous model
// that is better decoupled from the core code.
//
// It was proposed that std::ifstream / std::ofstream (or similar standardized
// class) could be used instead. std::ifstream in particular would be a bad fit
// since it is a very generic class that comes with buffering and formatting by
// default. I can't easily see how to replace the calls I listed above using
// std::ifstream. Even if it's possible, the result may be more complicated /
// require use of the underlying Posix FD anyway / be less clear / be more code
// / require to give up some control over syscalls etc. ifstream uses
// exceptions and has facilities such as formatting that aren't needed, but the
// presence of this attached functionality would make the purpose less clear
// IMO.
//
// Posix_FD: simple file-descriptor holder. Its only purpose is to
// automatically close() the fd in the destructor; the fd can be set at
// construction or later, and a new fd can be set after closing the old one.
// See the discussion above for the design rationale.
class Posix_FD
{
    int m_fd = -1; // -1 means "no fd held"
public:
    // Raw fd (-1 if none held).
    __pmq_artificial_method
    int get()
    {
        return m_fd;
    }
    __pmq_artificial_method
    bool valid()
    {
        return m_fd != -1;
    }
    // Close the held fd (if any). Returns close()'s return value, or 0 if
    // nothing was held.
    int close_fd()
    {
        int ret = 0;
        if (m_fd != -1)
        {
            ret = close(m_fd);
            m_fd = -1;
        }
        return ret;
    }
    // Take ownership of fd; must not already hold one.
    __pmq_artificial_method
    void set(int fd)
    {
        assert(m_fd == -1);
        m_fd = fd;
    }
    __pmq_artificial_method
    void operator=(int fd)
    {
        set(fd);
    }
    // Copying is deleted: previously the implicit copy operations were
    // available, so copying made two holders close() the same fd -- a
    // double close that can even hit an unrelated, reused descriptor.
    Posix_FD(Posix_FD const&) = delete;
    Posix_FD& operator=(Posix_FD const&) = delete;
    __pmq_artificial_method
    Posix_FD()
    {
    }
    __pmq_artificial_method
    Posix_FD(int fd)
    {
        set(fd);
    }
    __pmq_artificial_method
    ~Posix_FD()
    {
        close_fd();
    }
};
//
// Libc_DIR
//
// Similar to Posix_FD, but for libc DIR * handles. Same rationale for why I've
// written this applies as for Posix_FD.
//
// This class is currently not used so could be removed.
//
// Libc_DIR: holder for a libc DIR * handle; closedir()s it in the
// destructor. Same rationale as for Posix_FD.
//
// This class is currently not used so could be removed.
class Libc_DIR
{
    DIR *m_dir = nullptr;
public:
    __pmq_artificial_method
    bool valid()
    {
        return m_dir != nullptr;
    }
    __pmq_artificial_method
    DIR *get()
    {
        return m_dir;
    }
    // Take ownership of dir; must not already hold one.
    __pmq_artificial_method
    void set(DIR *dir)
    {
        assert(m_dir == nullptr);
        m_dir = dir;
    }
    // Close the held handle (if any).
    void close_dir()
    {
        if (m_dir)
        {
            closedir(m_dir);
            m_dir = nullptr;
        }
    }
    __pmq_artificial_method
    void operator=(DIR *dir)
    {
        set(dir);
    }
    // Copying is deleted: previously the implicit copy operations were
    // available, so copying made two holders closedir() the same handle.
    Libc_DIR(Libc_DIR const&) = delete;
    Libc_DIR& operator=(Libc_DIR const&) = delete;
    __pmq_artificial_method
    Libc_DIR()
    {
    }
    __pmq_artificial_method
    Libc_DIR(DIR *dir)
    {
        // Route through set() so its not-already-set assertion applies here
        // too (the previous version assigned m_dir directly, bypassing it).
        set(dir);
    }
    __pmq_artificial_method
    ~Libc_DIR()
    {
        close_dir();
    }
};
//
// Mmap_Region
//
// Similar to Posix_FD, but for memory mappings.
//
// On destruction, unmaps the mapped region using munmap().
//
// MMap_Region
//
// Similar to Posix_FD, but for memory mappings. On destruction, unmaps the
// mapped region using munmap().
class MMap_Region
{
    void *m_ptr = MAP_FAILED; // MAP_FAILED means "no mapping held"
    size_t m_length = 0;
public:
    __pmq_artificial_method
    void *get() const
    {
        return m_ptr;
    }
    __pmq_artificial_method
    Untyped_Slice untyped_slice() const
    {
        return Untyped_Slice(m_ptr, m_length);
    }
    __pmq_artificial_method
    bool valid()
    {
        return m_ptr != MAP_FAILED;
    }
    // Unmap the held region (if any).
    void close_mapping()
    {
        if (m_ptr != MAP_FAILED)
        {
            if (munmap(m_ptr, m_length) == -1)
            {
                // should not happen. Simply printing the error for now
                pmq_perr_ef(errno, "WARNING: munmap() failed");
            }
            m_ptr = MAP_FAILED;
            m_length = 0;
        }
    }
    // like mmap but returns whether successful; must not already hold a mapping.
    bool create(void *addr, size_t newlength, int prot, int flags,
        int fd, off_t offset)
    {
        assert(m_ptr == MAP_FAILED);
        void *newptr = mmap(addr, newlength, prot, flags, fd, offset);
        if (newptr == MAP_FAILED)
            return false;
        m_ptr = newptr;
        m_length = newlength;
        return true;
    }
    // Copying is deleted: previously the implicit copy operations were
    // available, so copying made two holders munmap() the same region.
    MMap_Region(MMap_Region const&) = delete;
    MMap_Region& operator=(MMap_Region const&) = delete;
    // Deleting the copies suppresses the implicit default constructor;
    // restore it explicitly.
    MMap_Region() = default;
    __pmq_artificial_method
    ~MMap_Region()
    {
        close_mapping();
    }
};
// Mutex_Protected
//
// Simple wrapper class that protects a data item with a mutex.
// The load() and store() mutex implement thread-synchronized read and write
// access to the data item by locking the resource with a mutex during the
// operation.
//
// A class like Folly::Synchronized might replace this. But again, this was
// very easy to write and is extremely small. Pulling in a large dependency
// just for that might not be justified. Also, having our own class allows
// choosing the mutex type. For example, if we want to profile mutexes using
// the Tracy frame profiler, we need to use Tracy's mutex wrappers (here,
// hidden in the PMQ_PROFILED_MUTEX wrapper). While Folly::Synchronized supports
// custom mutexes, one would need to understand and implement "the extended
// protocol implemented in folly/synchronized/Lock.h".
//
// Upon quick browsing of the 1000 lines in Lock.h, it isn't immediately clear
// what that protocol entails and how much work it would be (if any) to wrap
// our own mutex type (which is potentially a wrap of std::mutex already) to
// conform to that protocol.
//
// Maybe there is something in the C++ standard that is suited as a
// replacement?
//
// Maybe there is, but I consider it much easier to just write 2 methods
// totalling 4 straightforward lines of code...
//
template<typename T>
class Mutex_Protected
{
PMQ_PROFILED_MUTEX(m_mutex);
T m_value;
public:
void store(T value)
{
PMQ_PROFILED_LOCK(lock_, m_mutex);
m_value = value;
}
T load()
{
PMQ_PROFILED_LOCK(lock_, m_mutex);
return m_value;
}
};
/*
* String "slice" that can be passed around. No lifetime semantics or
* unexpected copying etc.
*
* We could use std::string_view instead, but that is a templated type. The
* idea of PMQ_String is to wrap just a char-pointer with a size, and nothing
* more, to have a package that one can ship around. We mostly use strings for
* printf-style formatting and to open files, and we don't need or want any
* more complicated semantics than that.
*/
struct PMQ_String
{
    const char *buffer; // borrowed pointer; no ownership or lifetime semantics
    size_t size;        // length in bytes -- excludes the terminating NUL when one exists (see PMQ_Owned_String::set)
};
/*
* Simple string "holder" class that allocates and frees its buffer. The
* contained string is immutable once constructed. But a new one can be
* "swapped" in by dropping the old string and creating a new one.
*
* Is this a case of NIH when there is std::string? Maybe, but basically the
* same arguments as for Alloc_Slice and the other classes above apply.
*
* std::string
*
* - is somewhat slow to compile
* - Unexpected allocations / copies (and thus exceptions as well) can happen
* very easily, without anyone noticing -- For example, it's as easy as
* writing "auto x = y" instead of "auto& x = y".
* - Apart from exceptions and copies / resizes, appending, there is more
* complexity that we don't need and don't want and that would actually be a
* misfit for our project. Ugly error messages with huge types (...
* std::basic_char ... etc.) is only a small symptom of this.
*/
// Owned, immutable, heap-allocated string (see the rationale above). A new
// value can be swapped in by drop()ping the old string and set()ting a new
// one.
class PMQ_Owned_String
{
    PMQ_String m_string = {};
public:
    // True if a string is currently held.
    bool valid() const
    {
        return m_string.buffer != nullptr;
    }
    // Borrowed view of the held string (buffer is null when empty).
    __pmq_artificial_method
    PMQ_String get() const
    {
        return m_string;
    }
    // Free the held buffer (if any) and reset to the empty state.
    void drop()
    {
        // Checking only for clarity. free() and the rest of the code would work
        // with a null buffer too.
        if (m_string.buffer != nullptr)
        {
            free((void *) m_string.buffer);
            m_string.buffer = nullptr;
            m_string.size = 0;
        }
    }
    // Copy the given NUL-terminated string into a freshly allocated buffer.
    // Must not already hold a string (drop() first).
    void set(const char *buffer)
    {
        assert(! m_string.buffer);
        char *copy = strdup(buffer);
        if (copy == nullptr)
        {
            // is an exception what we want / need?
            throw std::bad_alloc();
        }
        m_string.buffer = copy;
        m_string.size = strlen(buffer);
    }
    // Copying is deleted: previously the implicit copy operations were
    // available, so copying made two owners free() the same buffer.
    PMQ_Owned_String(PMQ_Owned_String const&) = delete;
    PMQ_Owned_String& operator=(PMQ_Owned_String const&) = delete;
    __pmq_artificial_method
    PMQ_Owned_String()
    {
        m_string.buffer = nullptr;
        m_string.size = 0;
    }
    ~PMQ_Owned_String()
    {
        drop();
    }
};
/*
* SNs (sequence numbers)
*
* Sequence numbers, and the ringbuffers that build on them, are a core concept
* of how the PMQ works.
*
* I believe they are pretty much what is elsewhere known as "LMAX Disruptor"
* (google it).
*
* Sequence numbers are 64-bit unsigned integers that can wraparound (but this
* is only theoretical -- wraparound is probably completely untested since
* 64-bit numbers don't overflow easily in practice).
*
* Ringbuffers have a number of slots that is 2^N for some N. SN's are mapped
* to slots with wrap-around in the ringbuffer's 2^N slots by using the lowest
* N bits of the SN to index into the slots array.
*
* The SN templated class provides some type safety -- the Tag type is a
* "phantom tag" (can be implemented by making a new "empty" class) that
* prevents indexing into a ringbuffer using a mismatching sequence number. For
* example, we have a ringbuffer of input-slots that should be indexed by *slot
* sequence numbers* (SSNs). And we have a ringbuffer of chunks that should be
* indexed by *chunk sequence numbers (CSNs). The on-disk chunk store is
* another kind of ringbuffer that works with the same principle of wrapping
* around automatically.
*
* We also track *message sequence numbers* (MSNs) but we don't use them for
* indexing, only for binary search.
*
* Mathematically, SNs form an affine space. This is like a vector space but
* without a designated origin (pls forgive me if what I write here is slightly
* incorrect as far as mathematics is concerned. Only the idea matters). There
* is a 0 value, but it is not meaningfully different compared to any other
* value.
*
* One can subtract two sequence numbers to get a distance (represented as bare
* uint64_t), and one can add a distance to a sequence number to get a new
* sequence number. However, unlike a vector space with designated 0, one can
* not add two sequence numbers meaningfully (SN<T> has operator+(uint64_t d)
* but no operator+(SN<T>& other).
*/
template<typename Tag>
class SN
{
    uint64_t m_value;
public:
    // Construct from a raw sequence value. explicit to avoid accidental
    // conversions from bare integers (the whole point of the phantom Tag).
    explicit SN(uint64_t value)
    {
        m_value = value;
    }
    // Some C++ trivia following. In most cases you can ignore this and just use
    // the class similar to primitive integers.
    //
    // Here we specify an *explicitly-defaulted default-constructor*. This will
    // allow us to initialize the object with undefined (garbage) value if we
    // want so.
    //
    // Explanation: Since we have explicitly specified the constructor with 1
    // argument already, there wouldn't be an implicit default constructor (a
    // constructor with no arguments). To get a default constructor, we need to
    // explicitly specify one. We need a default constructor (no constructor
    // arguments) if we want to write
    //
    //    SN sn;
    //
    // For simple data types (like SN), we typically want the above line to
    // leave the object's members uninitialized (garbage values). While this is
    // in some ways dangerous, it can be simpler especially for objects where
    // zero-initialization isn't very convenient or meaningful. Leaving values
    // uninitialized in the default constructor also allows the compiler to
    // catch bugs in some situations when the user unintentionally forgot to
    // specify an explicit value.
    //
    // Note a gotcha: There is a difference between an empty default constructor
    //
    //    SN() {}
    //
    // and an (explicitly or implicitly) defaulted default constructor:
    //
    //    SN() = default;
    //
    // If we use the class like this:
    //
    //    SN x {};
    //    SN y = SN();  // or like this
    //    SN z = {};  // or like this...
    //
    // then x will contain garbage with the empty default constructor, but will
    // be zero-initialized with the (explicitly-) defaulted default constructor.
    // We'd typically want zero initialization with this syntax.
    SN() = default;
    // Raw sequence value (e.g. for ringbuffer slot masking or printing).
    __pmq_artificial_method
    uint64_t value() const
    {
        return m_value;
    }
    __pmq_artificial_method
    void operator++()
    {
        m_value++;
    }
    __pmq_artificial_method
    void operator++(int)
    {
        m_value++;
    }
    // BUGFIX/consistency: previously returned SN by value while operator-=
    // returned SN&. Both compound assignments now return a reference,
    // following the usual C++ convention and avoiding a needless copy.
    __pmq_artificial_method
    SN& operator+=(uint64_t d)
    {
        m_value += d;
        return *this;
    }
    __pmq_artificial_method
    SN& operator-=(uint64_t d)
    {
        m_value -= d;
        return *this;
    }
    // SN + distance = SN. (There is deliberately no SN + SN; see the affine
    // space discussion above.)
    __pmq_artificial_method
    SN operator+(uint64_t d) const
    {
        return SN(m_value + d);
    }
    __pmq_artificial_method
    SN operator-(uint64_t d) const
    {
        return SN(m_value - d);
    }
    // SN - SN = distance (bare uint64_t, wraps modulo 2^64).
    __pmq_artificial_method
    uint64_t operator-(SN other) const
    {
        return m_value - other.m_value;
    }
    __pmq_artificial_method
    bool operator==(SN other) const
    {
        return m_value == other.m_value;
    }
    __pmq_artificial_method
    bool operator!=(SN other) const
    {
        return m_value != other.m_value;
    }
};
/*
* COMPARING SEQUENCE NUMBERS
* ==========================
*
* Since sequence numbers wrap around (in theory, when 64 bits overflow) they
* have no natural ordering.
*
* However, in practice, sequence numbers are used to index in much smaller
* buffer, and at any given time there is only a small window of sequence
* numbers. It's a sliding window, but a window still.
*
* So, admitting that the sequence numbers in a given window may wraparound,
* back to 0, we can still assume that they never "overtake" each other.
* We can subtract two numbers using unsigned arithmetic and determine their
* relative ordering from the result. Centering our worldview at a number x, we
* divide the space of uint64_t numbers into those that are less than x (x -
* 2^63 to x) and those that are greater than x (x to 2^63).
*
* Note that this relation is not transitive (x <= y && y <= z does not imply x
* <= z), and not antisymmetric -- (x + 2^63) is both greater and less than x.
* So it's not a true ordering relation, but in practice we can use it to
* reliably compare items by "age".
*
 * The value 1 should be considered greater than UINT64_MAX, since 1 -
 * UINT64_MAX == 2 (a small distance). Conversely, UINT64_MAX is less than 1,
 * since UINT64_MAX - 1 equals (UINT64_MAX - 1), which is a huge distance,
 * larger than UINT64_MAX / 2.
*
*/
// Comparing bare uint64_t sequence values.
// True if a is strictly "older" than b: the forward distance from a to b is
// in [1, 2^63]. Note b - (a + 1) == b - a - 1 in modular arithmetic.
__pmq_artificial_func
bool _sn64_lt(uint64_t a, uint64_t b)
{
    return b - a - 1 <= UINT64_MAX / 2;
}
// True if a is "older than or equal to" b: the forward distance from a to b
// is in [0, 2^63 - 1].
__pmq_artificial_func
bool _sn64_le(uint64_t a, uint64_t b)
{
    return b - a <= UINT64_MAX / 2;
}
// True if a is "newer than or equal to" b: the forward distance from b to a
// is in [0, 2^63 - 1]. Mirror image of _sn64_le().
__pmq_artificial_func
bool _sn64_ge(uint64_t a, uint64_t b)
{
    return a - b <= UINT64_MAX / 2;
}
// True if a is strictly "newer" than b: the forward distance from b to a is
// in [1, 2^63]. Note a - (b + 1) == a - b - 1 in modular arithmetic.
__pmq_artificial_func
bool _sn64_gt(uint64_t a, uint64_t b)
{
    return a - b - 1 <= UINT64_MAX / 2;
}
// Comparing type-safe "tagged" SN values
// Type-safe variant of _sn64_lt(): a strictly "older" than b.
template<typename Tag>
__pmq_artificial_func
bool sn64_lt(SN<Tag> a, SN<Tag> b)
{
    return b - (a + 1) <= UINT64_MAX / 2;
}
// Type-safe variant of _sn64_le(): a "older than or equal to" b.
template<typename Tag>
__pmq_artificial_func
bool sn64_le(SN<Tag> a, SN<Tag> b)
{
    return b - a <= UINT64_MAX / 2;
}
// Type-safe variant of _sn64_ge(): a "newer than or equal to" b.
template<typename Tag>
__pmq_artificial_func
bool sn64_ge(SN<Tag> a, SN<Tag> b)
{
    return a - b <= UINT64_MAX / 2;
}
// Type-safe variant of _sn64_gt(): a strictly "newer" than b.
template<typename Tag>
__pmq_artificial_func
bool sn64_gt(SN<Tag> a, SN<Tag> b)
{
    return a - (b + 1) <= UINT64_MAX / 2;
}
// True if sn lies within the closed window [lo, hi], comparing forward
// distances from lo. Assumes lo..hi describes a valid (wrap-aware) window,
// i.e. hi is "ahead of" lo.
template<typename Tag>
__pmq_artificial_func
bool sn64_inrange(SN<Tag> sn, SN<Tag> lo, SN<Tag> hi)
{
    return sn - lo <= hi - lo;
}
// Ringbuffer containing a buffer (power-of-2 size) of elements of type V. It
// can be "indexed" using SN's of matching Tag type; the SN is mapped to a
// slot by masking with (slot_count - 1).
template<typename Tag, typename V>
class Ringbuffer
{
    using K = SN<Tag>;
    V *m_ptr = nullptr;
    size_t m_count = 0;
public:
    __pmq_artificial_method
    uint64_t slot_count() const
    {
        return m_count;
    }
    // Point the ringbuffer at the given backing storage. The slot count must
    // be a power of 2 since indexing masks with (m_count - 1).
    __pmq_artificial_method
    void reset(Slice<V> slice)
    {
        assert(pmq_is_power_of_2(slice.count()));
        m_ptr = slice.data();
        m_count = slice.count();
    }
    __pmq_artificial_method
    Slice<V> as_slice() const
    {
        return Slice<V>(m_ptr, m_count);
    }
    // Map a sequence number to its slot using the low log2(m_count) bits.
    __pmq_artificial_method
    const V *get_slot_for(K k) const
    {
        return &m_ptr[k.value() & (m_count - 1)];
    }
    __pmq_artificial_method
    V *get_slot_for(K k)
    {
        return &m_ptr[k.value() & (m_count - 1)];
    }
    __pmq_artificial_method
    Ringbuffer()
    {
    }
    __pmq_artificial_method
    Ringbuffer(V *ptr, uint64_t size)
    {
        // BUGFIX: reset() takes a Slice<V>. The old code called
        // reset(ptr, size), which matches no overload and failed to compile
        // as soon as this constructor was instantiated.
        reset(Slice<V>(ptr, size));
    }
};

View File

@@ -0,0 +1,241 @@
#include "pmq_logging.hpp"
#include "pmq_common.hpp"
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
// The logging module can either print to stderr or use the BeeGFS metadata
// server's logging backend.
#ifdef PMQ_TEST
# ifndef PMQ_LOG_LEVEL
# error PMQ_LOG_LEVEL must be defined when compiling test case
# endif
#define INTEGRATE_WITH_METADATA_SERVER 0
#else
#define INTEGRATE_WITH_METADATA_SERVER 1
#endif
#if INTEGRATE_WITH_METADATA_SERVER
// Integrate into metadata server
#include <common/app/log/Logger.h>
#endif
// Bounded ring buffer of Log_Message used to hand messages from log
// producers to a consumer. writepos/readpos are free-running counters; the
// slot index is (pos & (capacity - 1)), so capacity must be a power of 2.
struct Log_Buffer
{
    PMQ_PROFILED_MUTEX(mutex);  // guards all fields below
    PMQ_PROFILED_CONDVAR(writeable); // reader => writer: a slot became free
    PMQ_PROFILED_CONDVAR(readable); // writer => reader: a message became available
    Alloc_Slice<Log_Message> msgs;  // backing storage, 'capacity' slots
    size_t capacity = 0;  // number of slots; must be a power of 2 (index masking)
    size_t writepos = 0;  // total messages written so far (not wrapped)
    size_t readpos = 0;   // total messages read so far (not wrapped)
    Log_Buffer()
    {
        // TODO: this costs a lot of memory. However, a previous setting of 64
        // wasn't enough for high-frequency logging. We need more dynamic and
        // judicious memory allocation.
        capacity = 1024;
        msgs.allocate(capacity);
    }
};
// Copy *input into the ring, blocking while the buffer is full.
static void log_buffer_write(Log_Buffer *logbuf, Log_Message const *input)
{
    PMQ_PROFILED_UNIQUE_LOCK(lock, logbuf->mutex);
    // Full when the write counter is a whole capacity ahead of the reader.
    while (logbuf->writepos - logbuf->readpos == logbuf->capacity)
        logbuf->writeable.wait(lock);
    Log_Message *dst = &logbuf->msgs[logbuf->writepos & (logbuf->capacity - 1)];
    dst->size = input->size;
    memcpy(dst->data, input->data, input->size);
    logbuf->writepos += 1;
    // hoping that this is cheap: otherwise we should track the number of
    // readers and check it before calling notify_one()
    logbuf->readable.notify_one();
}
// Internal: copy the oldest message out of the ring and advance readpos.
// Caller must hold logbuf->mutex and must have checked the buffer is
// non-empty.
static void _log_buffer_read(Log_Buffer *logbuf, Log_Message *output)
{
    Log_Message *src = &logbuf->msgs[logbuf->readpos & (logbuf->capacity - 1)];
    output->size = src->size;
    memcpy(output->data, src->data, src->size);
    logbuf->readpos += 1;
    // hoping that this is cheap: otherwise we should track the number of
    // writers and check it before calling notify_one()
    logbuf->writeable.notify_one();
}
// Blocking read: wait until a message is available, then copy it out.
static void log_buffer_read(Log_Buffer *logbuf, Log_Message *output)
{
    PMQ_PROFILED_UNIQUE_LOCK(lock, logbuf->mutex);
    for (;;)
    {
        if (logbuf->writepos != logbuf->readpos)
            break;
        logbuf->readable.wait(lock);
    }
    _log_buffer_read(logbuf, output);
}
// Non-blocking read: returns false immediately if the buffer is empty.
static bool log_buffer_try_read(Log_Buffer *logbuf, Log_Message *output)
{
    PMQ_PROFILED_LOCK(lock, logbuf->mutex);
    const bool have_msg = (logbuf->writepos != logbuf->readpos);
    if (have_msg)
        _log_buffer_read(logbuf, output);
    return have_msg;
}
// Read with timeout: wait up to 'millis' milliseconds for a message.
// Returns false if the deadline passes while the buffer is still empty.
static bool log_buffer_try_read_timeout_millis(Log_Buffer *logbuf, Log_Message *output, int millis)
{
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(millis);
    PMQ_PROFILED_UNIQUE_LOCK(lock, logbuf->mutex);
    while (logbuf->readpos == logbuf->writepos)
    {
        const auto status = logbuf->readable.wait_until(lock, deadline);
        if (status == std::cv_status::timeout)
            return false;
    }
    _log_buffer_read(logbuf, output);
    return true;
}
static Log_Buffer global_log_buffer;
// Enqueue a message into the global log buffer; blocks while it is full.
void pmq_write_log_message(Log_Message const *input)
{
    log_buffer_write(&global_log_buffer, input);
}
// Dequeue a message from the global log buffer; blocks while it is empty.
void pmq_read_log_message(Log_Message *output)
{
    log_buffer_read(&global_log_buffer, output);
}
// Non-blocking dequeue; returns false if no message is available.
bool pmq_try_read_log_message(Log_Message *output)
{
    return log_buffer_try_read(&global_log_buffer, output);
}
// Dequeue with timeout; returns false if no message arrived within 'millis'.
bool pmq_try_read_log_message_timeout_millis(Log_Message *output, int millis)
{
    return log_buffer_try_read_timeout_millis(&global_log_buffer, output, millis);
}
// Append printf-formatted text at msg->data + msg->size, truncating if it
// does not fit, and keep the buffer NUL-terminated. vsnprintf() is given one
// byte less than the remaining space so that the final terminator store
// below stays in bounds even after the clamp.
void log_msg_printfv(Log_Message *msg, const char *fmt, va_list ap)
{
    int ret = vsnprintf(msg->data + msg->size, sizeof msg->data - 1 - msg->size, fmt, ap);
    assert(ret >= 0);
    // vsnprintf() returns the would-be length, which can exceed the space
    // that was actually available -- clamp size back to the buffer bounds.
    msg->size += (size_t) ret;
    if (msg->size > sizeof msg->data - 1)
        msg->size = sizeof msg->data - 1;
    msg->data[msg->size] = 0;
}
// printf-style wrapper around log_msg_printfv(). We use __pmq_formatter(2, 3)
// because the format string is parameter 2 and the variadic arguments start
// at parameter 3 (msg is parameter 1).
void __pmq_formatter(2, 3) log_msg_printf(Log_Message *msg, const char *fmt, ...) // NOLINT this is safe because of __pmq_formatter() annotation
{
    va_list ap;
    va_start(ap, fmt);
    log_msg_printfv(msg, fmt, ap);
    va_end(ap);
}
/* Core implementation behind all pmq logging macros. Formats the message
 * (prefixing a level string in stand-alone builds), optionally appends the
 * system error text for opt.errnum (PMQ_MSG_OPT_ERRNO), then hands the
 * result to the metadata server logger or to the internal log buffer.
 */
void pmq_msg_ofv(const PMQ_Msg_Options& opt, const char *fmt, va_list ap)
{
    bool print_errno = (bool) (opt.flags & PMQ_MSG_OPT_ERRNO);
    uint32_t priority = opt.flags & PMQ_MSG_OPT_LVL_MASK;
    Log_Message log_msg;
    log_msg.size = 0;
#if INTEGRATE_WITH_METADATA_SERVER
    // Map the PMQ level bits to the metadata server's log levels.
    int metadata_priority = 0;
    switch (priority)
    {
        case PMQ_MSG_OPT_LVL_DEBUG: metadata_priority = Log_DEBUG; break;
        case PMQ_MSG_OPT_LVL_INFO: metadata_priority = Log_NOTICE; break;
        case PMQ_MSG_OPT_LVL_WARN: metadata_priority = Log_WARNING; break;
        case PMQ_MSG_OPT_LVL_ERR: metadata_priority = Log_ERR; break;
        default: assert(0); // can't happen at least currently where log mask has 2 bits.
    }
#else
    // Early return, avoiding most of the work if the message has less priority
    // than the log level.
    // TODO: we should have something like this for metadata server integration
    // too.
    if (PMQ_LOG_LEVEL > priority)
        return;
    switch (priority)
    {
        case PMQ_MSG_OPT_LVL_DEBUG: log_msg_printf(&log_msg, "DEBUG: "); break;
        case PMQ_MSG_OPT_LVL_INFO: log_msg_printf(&log_msg, "INFO: "); break;
        case PMQ_MSG_OPT_LVL_WARN: log_msg_printf(&log_msg, "WARNING: "); break;
        case PMQ_MSG_OPT_LVL_ERR: log_msg_printf(&log_msg, "ERROR: "); break;
        default: assert(0); // can't happen at least currently where log mask has 2 bits.
    }
#endif
    log_msg_printfv(&log_msg, fmt, ap);
    if (print_errno)
    {
        char errbuf[64];
        // BUGFIX: must be initialized. In the XSI strerror_r() branch below,
        // errstr was left uninitialized when strerror_r() failed, and the
        // "if (! errstr)" fallback then read an indeterminate value (UB).
        const char *errstr = nullptr;
#if (_POSIX_C_SOURCE >= 200112L) && ! _GNU_SOURCE
        {
            // XSI compliant strerror_r()
            int ret = strerror_r(opt.errnum, errbuf, sizeof errbuf);
            if (ret == 0)
                errstr = errbuf;
        }
#else
        {
            // GNU version of strerror_r()
            errstr = strerror_r(opt.errnum, errbuf, sizeof errbuf);
        }
#endif
        if (! errstr)
        {
            // Fallback: no error string available, print the raw code.
            snprintf(errbuf, sizeof errbuf, "(errno=%d)", opt.errnum);
            errstr = errbuf;
        }
        log_msg_printf(&log_msg, ": %s", errstr);
    }
#if INTEGRATE_WITH_METADATA_SERVER
    // Integration into metadata server
    Logger *logger = Logger::getLogger();
    logger->log(LogTopic_EVENTLOGGER, metadata_priority, opt.loc.file, opt.loc.line, log_msg.data);
#else
    log_msg_printf(&log_msg, "\n");
    //fwrite(log_msg.data, log_msg.size, 1, stderr);
    pmq_write_log_message(&log_msg);
#endif
}
// Var-args entry point; forwards to pmq_msg_ofv(). __pmq_formatter(2, 3):
// the format string is parameter 2, variadic arguments start at parameter 3.
void __pmq_formatter(2, 3) pmq_msg_of(const PMQ_Msg_Options& opt, const char *fmt, ...) // NOLINT this is safe because of use of __pmq_formatter() annotation
{
    va_list ap;
    va_start(ap, fmt);
    pmq_msg_ofv(opt, fmt, ap);
    va_end(ap);
}

View File

@@ -0,0 +1,112 @@
#pragma once
#include "pmq_base.hpp"
// Flag bits for PMQ_Msg_Options::flags.
enum
{
    PMQ_MSG_OPT_DEFAULT = 0,
    PMQ_MSG_OPT_ERRNO = (1 << 0),       // append strerror() text for PMQ_Msg_Options::errnum
    PMQ_MSG_HAS_SOURCE_LOC = (1 << 1),  // presumably marks loc as populated -- TODO confirm usage
    PMQ_MSG_OPT_LVL_MASK = (3 << 2), // bits 2 and 3 (0-based) hold the log level.
    PMQ_MSG_OPT_LVL_DEBUG = (0 << 2),
    PMQ_MSG_OPT_LVL_INFO = (1 << 2),
    PMQ_MSG_OPT_LVL_WARN = (2 << 2),
    PMQ_MSG_OPT_LVL_ERR = (3 << 2),
};
// Source-code location (__FILE__ / __LINE__) captured at the log call site.
struct PMQ_Source_Loc
{
    const char *file;  // filename string; must outlive the message (string literal)
    uint32_t line;
};
// Bundled options for the pmq_msg_of*() logging entry points.
struct PMQ_Msg_Options
{
    PMQ_Source_Loc loc;  // call site; filled in by the PMQ_MSG_OPTIONS() macro
    uint32_t flags; // PMQ_MSG_OPT_* bit flags, including the level bits
    int errnum;     // system error code; consulted when PMQ_MSG_OPT_ERRNO is set
};
// Logging functions / macros.
//
// The following functions are typically used in the client code.
//
// pmq_msg_f(fmt, ...): Submit a log message with default log level and format string + var-args
// pmq_perr_f(fmt, ...): Submit a log message with error log level and format string + var-args
// pmq_perr_ef(errno, fmt, ...): like pmq_perr_f() but also add text for given system error code ("errno")
//
// To explain all functions available here: they are made according to a pattern
//
// pmq_{LVL}_{MNEMONICS}
//
// LVL: logging level, possible options
// - msg: Default level or use specified level ('l' mnemnonic)
// - debug: Debug level
// - warn: Warning level
// - perr: Error level ("print-error")
//
// MNEMONICS: combination of 1-letter chars
// - l: means a logging level is specified (only available with 'msg' category)
// - e: add a text for specified system error code ("errno")
// - f: "format", like in the stdio function printf().
// - v: in combination with f (so 'fv'), means the arguments come as a va_list, like in stdio function vfprintf().
// - o: Use a single options struct holding level, errno explicitly, as well as source code location info.
void pmq_msg_ofv(const PMQ_Msg_Options& opt, const char *fmt, va_list ap);
void __pmq_formatter(2, 3) pmq_msg_of(const PMQ_Msg_Options& opt, const char *fmt, ...);
// Capture the current source location (compound-literal style cast).
#define PMQ_SOURCE_LOC ((PMQ_Source_Loc) { __FILE__, __LINE__ })
// Build a PMQ_Msg_Options; relies on member order: loc, flags, errnum.
#define PMQ_MSG_OPTIONS(...) (PMQ_Msg_Options { PMQ_SOURCE_LOC, ##__VA_ARGS__ })
// Generic forms: explicit level ('l'), optionally with errno ('e').
#define pmq_msg_lf(lvl, fmt, ...) \
    pmq_msg_of(PMQ_MSG_OPTIONS((lvl), 0), fmt, ##__VA_ARGS__)
#define pmq_msg_lef(lvl, e, fmt, ...) \
    pmq_msg_of(PMQ_MSG_OPTIONS(PMQ_MSG_OPT_ERRNO | (lvl), e), fmt, ##__VA_ARGS__)
// INFO level
#define pmq_msg_f(fmt, ...) \
    pmq_msg_lf(PMQ_MSG_OPT_LVL_INFO, fmt, ##__VA_ARGS__)
#define pmq_msg_ef(e, fmt, ...) \
    pmq_msg_lef(PMQ_MSG_OPT_LVL_INFO, (e), fmt, ##__VA_ARGS__)
// DEBUG level
#define pmq_debug_f(fmt, ...) \
    pmq_msg_lf(PMQ_MSG_OPT_LVL_DEBUG, fmt, ##__VA_ARGS__)
#define pmq_debug_ef(e, fmt, ...) \
    pmq_msg_lef(PMQ_MSG_OPT_LVL_DEBUG, (e), fmt, ##__VA_ARGS__)
// WARNING level
#define pmq_warn_f(fmt, ...) \
    pmq_msg_lf(PMQ_MSG_OPT_LVL_WARN, fmt, ##__VA_ARGS__)
#define pmq_warn_ef(e, fmt, ...) \
    pmq_msg_lef(PMQ_MSG_OPT_LVL_WARN, (e), fmt, ##__VA_ARGS__)
// ERROR level
#define pmq_perr_f(fmt, ...) \
    pmq_msg_lf(PMQ_MSG_OPT_LVL_ERR, fmt, ##__VA_ARGS__)
#define pmq_perr_ef(e, fmt, ...) \
    pmq_msg_lef(PMQ_MSG_OPT_LVL_ERR, (e), fmt, ##__VA_ARGS__)
// Low-level Logging I/O interface
// A single log message: size-prefixed, NUL-terminated text. The data array
// is sized so the whole struct occupies 256 bytes.
struct Log_Message
{
    size_t size;  // number of valid bytes in data, excluding the NUL terminator
    char data[256 - sizeof (size_t)]; // for simplicity
};
void pmq_write_log_message(Log_Message const *input);
void pmq_read_log_message(Log_Message *output);
bool pmq_try_read_log_message_timeout_millis(Log_Message *output, int millis);
bool pmq_try_read_log_message(Log_Message *output);

View File

@@ -0,0 +1,212 @@
#pragma once
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "pmq_logging.hpp"
/* Wrapper around open() to open directories to put this awkward code in a
* central place. It is counter-intuitive but this is apparently how you're
* supposed to open directories on Unix, both the reading and "writing" (i.e.
* create, unlink, rename).
* The O_DIRECTORY flag is optional but the O_RDONLY is not; opening with
* O_RDWR | O_DIRECTORY fails with "is a directory" (weird!).
*
* Returns: fd to open directory or -1, in which case the errno variable must
* be handled as usual.
*/
// See the explanatory comment above: directories must be opened O_RDONLY
// even when used for "write-like" *at() operations (create, unlink, rename);
// O_RDWR fails on a directory. Returns fd >= 0 or -1 with errno set.
static inline int pmq_open_dir(const char *path)
{
    return open(path, O_RDONLY | O_DIRECTORY);
}
/* Verify that the already-open fd refers to a regular file. what_file is the
 * file's name, used for the log message only.
 * Returns true if fd is a regular file; false on fstat() failure or if fd
 * refers to some other file type.
 * (Previously declared as returning int while always returning true/false;
 * bool now states the contract explicitly. Callers only test truthiness,
 * so this is backward-compatible.)
 */
static inline bool pmq_check_regular_file(int fd, const char *what_file)
{
    struct stat st;
    if (fstat(fd, &st) == -1)
    {
        pmq_perr_ef(errno, "Failed to fstat() the fd we opened");
        return false;
    }
    if (! S_ISREG(st.st_mode))
    {
        pmq_perr_f("We opened the file '%s' expecting a regular file but it's not",
                what_file);
        return false;
    }
    return true;
}
// Open an existing regular file relative to basedir_fd.
// Note: returns an fd >= 0 if successful.
// On failure, -1 is returned and
// - if the file failed to open, errno indicates why the file failed to open.
// - if the file was opened successfully but then closed again because it was not
//   a regular file, errno is set to 0.
static inline int pmq_openat_regular_existing(
    int basedir_fd, const char *relpath, int flags)
{
    // only access mode may be specified -- no other flags
    // In particular, O_CREAT would break the logic.
    assert(flags == O_RDWR || flags == O_RDONLY || flags == O_WRONLY);
    int fd = openat(basedir_fd, relpath, flags, 0);
    if (fd == -1)
    {
        // Logging may clobber errno; save and restore it so the caller sees
        // the original openat() error.
        int e = errno;
        pmq_perr_ef(errno, "Failed to openat() existing file='%s', flags=%x",
                relpath, flags);
        errno = e;
        return fd;
    }
    /* The case where fd refers to something other than a regular file _may_
     * have been caught by the kernel already above. For example, opening a
     * directory using O_RDWR will fail. On the other hand, opening a directory
     * using O_RDONLY will succeed.
     * In any case, doing an explicit check here.
     */
    if (! pmq_check_regular_file(fd, relpath))
    {
        close(fd);
        errno = 0;
        return -1;
    }
    return fd;
}
// Create a new regular file relative to basedir_fd (O_CREAT | O_EXCL are
// added, so an already-existing file is an error).
// Returns fd >= 0 on success, -1 with errno set on failure.
static inline int pmq_openat_regular_create(
    int basedir_fd, const char *relpath, int flags, mode_t mode)
{
    // only access mode may be specified -- no other flags
    assert(flags == O_RDWR || flags == O_RDONLY || flags == O_WRONLY);
    // But this func makes sure that the creation-flags are specified
    flags |= O_CREAT | O_EXCL;
    int fd = openat(basedir_fd, relpath, flags, mode);
    if (fd == -1)
    {
        // Logging may clobber errno; save and restore for the caller.
        int e = errno;
        pmq_perr_ef(errno, "Failed to openat() file='%s', flags=%x, mode=%o",
                relpath, flags, (unsigned) mode);
        errno = e;
        return fd;
    }
    // all necessarily error handling should be done by the OS. (note O_EXCL)
    return fd;
}
// Assert that an I/O size round-trips through ssize_t, i.e. does not exceed
// SSIZE_MAX.
__pmq_artificial_func
void assert_sane_size(size_t size)
{
    // check that size is representable as a ssize_t too.
    // It is implementation defined how syscalls like write() handle write I/O sizes larger than SSIZE_T.
    // So better don't even try to.
    assert((size_t) (ssize_t) size == size);
}
/* Write the whole slice to fd, looping over partial writes. 'what' names the
 * destination for the error log.
 * Returns true on success; on error, logs, preserves errno, returns false.
 */
static inline bool pmq_write_all(int fd, Untyped_Slice slice, const char *what)
{
    assert_sane_size(slice.size());
    while (slice.size() != 0)
    {
        const ssize_t res = write(fd, slice.data(), slice.size());
        if (res < 0)
        {
            // Logging may clobber errno; preserve it for the caller.
            const int saved_errno = errno;
            pmq_perr_ef(errno, "Failed to write %zu bytes to %s",
                    slice.size(), what);
            errno = saved_errno;
            return false;
        }
        slice = slice.offset_bytes((size_t) res);
    }
    return true;
}
/* Positional variant of pmq_write_all(): write the whole slice at the given
 * file offset, looping over partial writes and advancing the offset.
 * Returns true on success; on error, logs, preserves errno, returns false.
 */
static inline bool pmq_pwrite_all(int fd, Untyped_Slice slice, off_t offset, const char *what)
{
    assert_sane_size(slice.size());
    while (slice.size() != 0)
    {
        const ssize_t res = pwrite(fd, slice.data(), slice.size(), offset);
        if (res < 0)
        {
            // Logging may clobber errno; preserve it for the caller.
            const int saved_errno = errno;
            pmq_perr_ef(errno, "Failed to pwrite() %zu bytes at offset %jd to %s",
                    slice.size(), (intmax_t) offset, what);
            errno = saved_errno;
            return false;
        }
        slice = slice.offset_bytes((size_t) res);
        offset += (size_t) res;
    }
    return true;
}
static inline bool pmq_read_all(int fd, Untyped_Slice slice, const char *what)
{
assert_sane_size(slice.size());
while (slice.size())
{
ssize_t nw = read(fd, slice.data(), slice.size());
if (nw == -1)
{
int e = errno;
pmq_perr_ef(errno, "Failed to read %zu bytes from %s",
slice.size(), what);
errno = e;
return false;
}
slice = slice.offset_bytes((size_t) nw);
}
return true;
}
static inline bool pmq_pread_all(int fd, Untyped_Slice slice, off_t offset, const char *what)
{
assert_sane_size(slice.size());
while (slice.size())
{
ssize_t nw = pread(fd, slice.data(), slice.size(), offset);
if (nw == -1)
{
int e = errno;
pmq_perr_ef(errno, "Failed to pread() %zu bytes at offset %jd to %s",
slice.size(), (intmax_t) offset, what);
errno = e;
return false;
}
slice = slice.offset_bytes((size_t) nw);
offset += (size_t) nw;
}
return true;
}

View File

@@ -0,0 +1,46 @@
#pragma once
#include <unistd.h>
#include <sys/syscall.h>
#include <mutex>
#include <condition_variable>
#if PMQ_WITH_PROFILING
// NOTE: this requires building with matching headers in the include-path for
// tracy. Tracy is a "frame profiler": https://github.com/wolfpld/tracy
// If PMQ is built as part of a bigger project (e.g. BeeGFS metadata server)
// building with the proper settings may not be supported (yet).
// A build setup that supports tracy currently exists as part of the flex-docs
// repository (ask the BeeGFS team).
# include <Tracy.hpp>
# define PMQ_PROFILING_CTX FrameMark
# define PMQ_PROFILED_SCOPE(name) ZoneScopedN(name)
# define PMQ_PROFILED_FUNCTION ZoneScoped
# define PMQ_PROFILED_MUTEX(name) TracyLockable(std::mutex, name)
# define PMQ_PROFILED_CONDVAR(name) std::condition_variable_any name
# define PMQ_PROFILED_LOCK(name, themutex) \
    auto& __ref__##name(themutex); \
    std::lock_guard<LockableBase(std::mutex)> name(__ref__##name); \
    LockMark(__ref__##name)
// BUGFIX: this definition used to end with a stray trailing backslash after
// LockMark(...). Backslash-newline splicing happens before preprocessing
// directives are parsed, so the following "#else" line was absorbed into the
// macro definition, unbalancing the #if/#else/#endif block (with profiling
// disabled, the fallback definitions below were then never seen).
# define PMQ_PROFILED_UNIQUE_LOCK(name, themutex) \
    auto& __ref__##name(themutex); \
    std::unique_lock<LockableBase(std::mutex)> name(__ref__##name); \
    LockMark(__ref__##name)
#else
// Without profiling these degrade to plain std::mutex /
// std::condition_variable and the standard lock guards.
# define PMQ_PROFILING_CTX
# define PMQ_PROFILED_SCOPE(name)
# define PMQ_PROFILED_FUNCTION
# define PMQ_PROFILED_MUTEX(name) std::mutex name
# define PMQ_PROFILED_CONDVAR(name) std::condition_variable name
# define PMQ_PROFILED_LOCK(name, themutex) \
    std::lock_guard<std::mutex> name(themutex)
# define PMQ_PROFILED_UNIQUE_LOCK(name, themutex) \
    std::unique_lock<std::mutex> name(themutex)
#endif