beegfs/meta/source/pmq/pmq_common.hpp
2025-08-10 01:34:16 +02:00

852 lines
23 KiB
C++

#pragma once
#include <new> // std::bad_alloc
#include "pmq_base.hpp"
#include "pmq_logging.hpp"
#include "pmq_posix_io.hpp"
#include "pmq_profiling.hpp"
#include <sys/fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <dirent.h>
//
// Simple allocating slice class with delayed allocation.
//
// Why don't I just use std::vector or similar? After all, std::vector is a
// well known standard solution that allocates a contiguous buffer of memory.
//
// I understand this concern, but I've written several simple classes anyway.
// Let me try and defend this case of "NIH". (It may or may not convince the
// reader).
//
// This code is much more straightforward and simple compared to STL headers.
// It is basically "new" and "delete" wrapped in a simple package together with
// operator[] and a way to get a slice to the memory without attached lifetime
// semantics.
//
// Most of the STL classes try to be very generic solutions applicable in a
// wide variety of use cases. While using with standardized solutions has the
// advantages of familiarity, this flexibility and wide applicability comes
// with a complexity cost that brings a disadvantage to anyone working with the
// codebase.
//
// Beyond having a fill level separate from allocation size (size() vs
// capacity()), std::vector has all sorts of methods and functionality to
// support pushing, popping, emplacing, iterators, constructors, destructors,
// and so on. It is highly flexible, which shows whenever an actual
// instantiated vector type is printed on the terminal, including the template
// allocator parameter among other things.
//
// All this is ill-fitting for our simple use case. For a queue we just need a
// few preallocated buffers. Just for convenience and to get a little safety,
// the Alloc_Slice wrapper class was created -- so we can do bounds checking
// and get to automatically deallocate the buffers in the destructor.
//
// The size() field and associated semantics that come with std::vector are
// baggage that we can't make use of (we have multiple cursors that wrap around
// our buffers in circular fashion). These semantics are not just available,
// but are understood by programmers as how std::vector gets used.
//
// From a mere functionality standpoint this shouldn't be an issue -- We could
// make sure that we call .resize(N) only once in the beginning and never call
// e.g. push_back(), emplace_back(), reserve(), or similar. This way we'd
// essentially be considering the size() as a constant i.e. ignore it.
//
// However, again, this usage of the type is not guaranteed. The sight of a
// std::vector normally suggest pushing (maybe popping), resizing and
// reserving, buffer reallocation, pointer/iterator invalidation, and runtime
// exceptions.
//
// With the Alloc_Slice class on the other hand, there is no reallocation and
// consequently no iterator invalidation. Exceptions might or might not happen
// depending on compile settings -- but only at construction time, i.e. program
// startup. Because no reallocations are possible, no pointer invalidation /
// iterator invalidation is possible.
//
// Compared to std::vector and other STL headers, significantly less header
// code gets included, so the code compiles quicker. How much quicker? In a
// simple test with a single file, adding any of vector, string, map etc.
// added around 100ms of compilation time (each). I believe I've seen much worse,
// but just multiply 100-400ms by the number of files in a large project and
// there may be a good argument for avoiding to include STL headers based on
// build time. (TODO: refer to example program).
//
// In fairness, this problem may be partially solved with precompiled headers,
// but those come with some issues too. (build setup, pollution, still have to
// compile on each rebuild or precompiled header change).
//
// With the Alloc_Slice class, methods like operator[] have been marked as
// "artificial", meaning it's easier to debug code without jumping all over the
// place. With std::vector and similar classes, I believe there is no way, or
// no standardized way, to build such that we don't jump around files like wild
// when debugging.
//
// If these arguments haven't been convincing, I'll end it now anyway -- the
// text is already much bigger than the actual code.
// Alloc_Slice: owning, fixed-capacity heap buffer of T (see rationale above).
// The buffer is allocated once via allocate() and freed in the destructor.
// There is no reallocation, so pointers and slices into the buffer stay valid
// for the lifetime of the object.
template<typename T>
class Alloc_Slice
{
   T *m_ptr = nullptr;
   size_t m_capacity = 0;
public:
   Alloc_Slice() = default;
   // Non-copyable: a copy would double-delete[] the buffer in the destructor.
   Alloc_Slice(const Alloc_Slice&) = delete;
   Alloc_Slice& operator=(const Alloc_Slice&) = delete;
   __pmq_artificial_method
   T& operator[](size_t i) const
   {
      // Bounds check (debug builds) -- one of the stated reasons this wrapper
      // exists instead of a bare pointer.
      assert(i < m_capacity);
      return m_ptr[i];
   }
   __pmq_artificial_method
   T *data() const
   {
      return m_ptr;
   }
   __pmq_artificial_method
   size_t capacity() const
   {
      return m_capacity;
   }
   // Non-owning view of the whole buffer.
   __pmq_artificial_method
   Slice<T> slice() const
   {
      return Slice<T>(m_ptr, m_capacity);
   }
   __pmq_artificial_method
   Untyped_Slice untyped_slice() const
   {
      return slice().untyped();
   }
   // Delayed allocation; must be called at most once. May throw
   // std::bad_alloc (i.e. typically only at program startup).
   void allocate(size_t capacity)
   {
      assert(! m_ptr);
      m_ptr = new T[capacity];
      m_capacity = capacity;
   }
   ~Alloc_Slice()
   {
      delete[] m_ptr;
   }
};
// Posix_FD
//
// Simple file-descriptor holder. The only important purpose is automatically
// closing the fd in the destructor. Setting the fd can happen in the
// constructor or be delayed. A new fd can be set after closing the old one.
// The fd can be retrieved using the .get() method. There are no other methods
// defined, the point here is not to make an abstraction over FDs but just to
// auto-close it.
//
// There is not much more to say. A concern that was brought up was that it would
// better to use an existing class. Again, it's important to note that we're
// not trying to add some (probably ill-defined) abstraction. The fact that
// this class stores fds is not hidden and there isn't any I/O functionality
// contained.
//
// Given this, I wasn't sure what existing class to use that does the same
// thing. This Posix_FD class was quick and easy to write and I hope it is easy
// to read too.
//
// Another concern was that we shouldn't use close() directly here, but instead
// use an abstraction (from an existing library) that papers over platform
// differences such that the code can work on e.g. Windows too. (Windows has a
// Posix FS layer as well but the code probably wouldn't work without extra
// work and handling of subtle differences).
//
// I can understand this concern, however BeeGFS can not be easily ported to
// e.g. Windows anyway, and this has never been a declared goal of the project.
// BeeGFS currently can't build on Windows and probably never will.
//
// The usage code currently makes non-trivial use of advanced POSIX and Linux
// functions, such as openat(), fsync(), mmap(), pread(), pwrite(). sendfile()
// was used earlier, and might come back. We rely on Posix file permissions
// too, and on certain semantics like for example O_CREAT | O_EXCL during file
// creation.
//
// I'm not aware of a better API that is more portable while providing the same
// functionality.
//
// Also, papering over platform differences may be harder than it initially
// sounds as soon as good performance and thus good control and good error
// handling is a requirement. To be portable, special handling of platform
// idiosyncrasies might be required, and the architecture would have to change
// anyway: away from synchronous function calls which would make the
// abstraction leak into the core code, and towards a more asynchronous model
// that is better decoupled from the core code.
//
// It was proposed that std::ifstream / std::ofstream (or similar standardized
// class) could be used instead. std::ifstream in particular would be a bad fit
// since it is a very generic class that comes with buffering and formatting by
// default. I can't easily see how to replace the calls I listed above using
// std::ifstream. Even if it's possible, the result may be more complicated /
// require use of the underlying Posix FD anyway / be less clear / be more code
// / require to give up some control over syscalls etc. ifstream uses
// exceptions and has facilities such as formatting that aren't needed, but the
// presence of this attached functionality would make the purpose less clear
// IMO.
//
// Posix_FD: holds a POSIX file descriptor and closes it on destruction.
// See the rationale above -- this is not an I/O abstraction, just auto-close.
class Posix_FD
{
   int m_fd = -1;
public:
   __pmq_artificial_method
   Posix_FD()
   {
   }
   __pmq_artificial_method
   Posix_FD(int fd)
   {
      set(fd);
   }
   // Non-copyable: two owners of the same fd would lead to a double close().
   Posix_FD(const Posix_FD&) = delete;
   Posix_FD& operator=(const Posix_FD&) = delete;
   __pmq_artificial_method
   int get() const
   {
      return m_fd;
   }
   __pmq_artificial_method
   bool valid() const
   {
      return m_fd != -1;
   }
   // Close the fd if one is held. Returns close()'s return value (0 if no fd
   // was held), so callers can detect close() errors if they care.
   int close_fd()
   {
      int ret = 0;
      if (m_fd != -1)
      {
         ret = close(m_fd);
         m_fd = -1;
      }
      return ret;
   }
   // Adopt a new fd. Only valid when no fd is currently held
   // (close_fd() first to replace).
   __pmq_artificial_method
   void set(int fd)
   {
      assert(m_fd == -1);
      m_fd = fd;
   }
   __pmq_artificial_method
   void operator=(int fd)
   {
      set(fd);
   }
   __pmq_artificial_method
   ~Posix_FD()
   {
      close_fd();
   }
};
//
// Libc_DIR
//
// Similar to Posix_FD, but for libc DIR * handles. Same rationale for why I've
// written this applies as for Posix_FD.
//
// This class is currently not used so could be removed.
//
class Libc_DIR
{
DIR *m_dir = nullptr;
public:
__pmq_artificial_method
bool valid()
{
return m_dir != nullptr;
}
__pmq_artificial_method
DIR *get()
{
return m_dir;
}
__pmq_artificial_method
void set(DIR *dir)
{
assert(m_dir == nullptr);
m_dir = dir;
}
void close_dir()
{
if (m_dir)
{
closedir(m_dir);
m_dir = nullptr;
}
}
__pmq_artificial_method
void operator=(DIR *dir)
{
set(dir);
}
__pmq_artificial_method
Libc_DIR()
{
}
__pmq_artificial_method
Libc_DIR(DIR *dir)
{
m_dir = dir;
}
__pmq_artificial_method
~Libc_DIR()
{
close_dir();
}
};
//
// MMap_Region
//
// Similar to Posix_FD, but for memory mappings.
//
// On destruction, unmaps the mapped region using munmap().
//
// MMap_Region: holds an mmap()ed region and munmap()s it on destruction.
class MMap_Region
{
   void *m_ptr = MAP_FAILED;
   size_t m_length = 0;
public:
   MMap_Region() = default;
   // Non-copyable: two owners would lead to a double munmap().
   MMap_Region(const MMap_Region&) = delete;
   MMap_Region& operator=(const MMap_Region&) = delete;
   __pmq_artificial_method
   void *get() const
   {
      return m_ptr;
   }
   __pmq_artificial_method
   Untyped_Slice untyped_slice() const
   {
      return Untyped_Slice(m_ptr, m_length);
   }
   __pmq_artificial_method
   bool valid() const
   {
      return m_ptr != MAP_FAILED;
   }
   // Unmap the region if one is held; safe to call repeatedly.
   void close_mapping()
   {
      if (m_ptr != MAP_FAILED)
      {
         if (munmap(m_ptr, m_length) == -1)
         {
            // should not happen. Simply printing the error for now
            pmq_perr_ef(errno, "WARNING: munmap() failed");
         }
         m_ptr = MAP_FAILED;
         m_length = 0;
      }
   }
   // like mmap but returns whether successful. Only valid when no mapping is
   // currently held (close_mapping() first to replace).
   bool create(void *addr, size_t newlength, int prot, int flags,
         int fd, off_t offset)
   {
      assert(m_ptr == MAP_FAILED);
      void *newptr = mmap(addr, newlength, prot, flags, fd, offset);
      if (newptr == MAP_FAILED)
         return false;
      m_ptr = newptr;
      m_length = newlength;
      return true;
   }
   __pmq_artificial_method
   ~MMap_Region()
   {
      close_mapping();
   }
};
// Mutex_Protected
//
// Simple wrapper class that protects a data item with a mutex.
// The load() and store() methods implement thread-synchronized read and write
// access to the data item by locking the resource with a mutex during the
// operation.
//
// A class like Folly::Synchronized might replace this. But again, this was
// very easy to write and is extremely small. Pulling in a large dependency
// just for that might not be justified. Also, having our own class allows
// choosing the mutex type. For example, if we want to profile mutexes using
// the Tracy frame profiler, we need to use Tracy's mutex wrappers (here,
// hidden in the PMQ_PROFILED_MUTEX wrapper). While Folly::Synchronized supports
// custom mutexes, one would need to understand and implement "the extended
// protocol implemented in folly/synchronized/Lock.h".
//
// Upon quick browsing of the 1000 lines in Lock.h, it isn't immediately clear
// what that protocol entails and how much work it would be (if any) to wrap
// our own mutex type (which is potentially a wrap of std::mutex already) to
// conform to that protocol.
//
// Maybe there is something in the C++ standard that is suited as a
// replacement?
//
// Maybe there is, but I consider it much easier to just write 2 methods
// totalling 4 straightforward lines of code...
//
template<typename T>
class Mutex_Protected
{
PMQ_PROFILED_MUTEX(m_mutex);
T m_value;
public:
void store(T value)
{
PMQ_PROFILED_LOCK(lock_, m_mutex);
m_value = value;
}
T load()
{
PMQ_PROFILED_LOCK(lock_, m_mutex);
return m_value;
}
};
/*
* String "slice" that can be passed around. No lifetime semantics or
* unexpected copying etc.
*
* We could use std::string_view instead, but that is a templated type. The
* idea of PMQ_String is to wrap just a char-pointer with a size, and nothing
* more, to have a package that one can ship around. We mostly use strings for
* printf-style formatting and to open files, and we don't need or want any
* more complicated semantics than that.
*/
struct PMQ_String
{
// Pointer to the character data. Non-owning -- lifetime is managed by whoever
// created the string (e.g. PMQ_Owned_String below). Presumably
// null-terminated, since strings are used for printf-style formatting and
// opening files -- TODO confirm at all call sites.
const char *buffer;
// Number of characters in buffer (excluding any terminator).
size_t size;
};
/*
* Simple string "holder" class that allocates and frees its buffer. The
* contained string is immutable once constructed. But a new one can be
* "swapped" in by dropping the old string and creating a new one.
*
* Is this a case of NIH when there is std::string? Maybe, but basically the
* same arguments as for Alloc_Slice and the other classes above apply.
*
* std::string
*
* - is somewhat slow to compile
* - Unexpected allocations / copies (and thus exceptions as well) can happen
* very easily, without anyone noticing -- For example, it's as easy as
* writing "auto x = y" instead of "auto& x = y".
* - Apart from exceptions and copies / resizes, appending, there is more
* complexity that we don't need and don't want and that would actually be a
* misfit for our project. Ugly error messages with huge types (...
* std::basic_char ... etc.) is only a small symptom of this.
*/
// PMQ_Owned_String: owns a heap-allocated, immutable string buffer and frees
// it on destruction. A new string can be swapped in via drop() + set().
class PMQ_Owned_String
{
   PMQ_String m_string = {};
public:
   __pmq_artificial_method
   PMQ_Owned_String()
   {
      // m_string is already zeroed by its member initializer.
   }
   // Non-copyable: a copy would double-free the buffer in drop()/destructor.
   PMQ_Owned_String(const PMQ_Owned_String&) = delete;
   PMQ_Owned_String& operator=(const PMQ_Owned_String&) = delete;
   // Whether a string is currently held.
   bool valid() const
   {
      return m_string.buffer != nullptr;
   }
   // Non-owning view of the held string (empty/null if none is held).
   __pmq_artificial_method
   PMQ_String get() const
   {
      return m_string;
   }
   // Free the held string (if any) and return to the empty state.
   void drop()
   {
      // Checking only for clarity. free() and the rest of the code would work
      // with a null buffer too.
      if (m_string.buffer != nullptr)
      {
         free((void *) m_string.buffer);
         m_string.buffer = nullptr;
         m_string.size = 0;
      }
   }
   // Copy the given null-terminated string into a freshly allocated buffer.
   // Only valid in the empty state (call drop() first to replace).
   // Throws std::bad_alloc on allocation failure.
   void set(const char *buffer)
   {
      assert(! m_string.buffer);
      // Compute the length once and reuse it below (previously the string was
      // walked twice: once by strdup() and once more by strlen()).
      const size_t size = strlen(buffer);
      char *copy = strdup(buffer);
      if (copy == nullptr)
      {
         // is an exception what we want / need?
         throw std::bad_alloc();
      }
      m_string.buffer = copy;
      m_string.size = size;
   }
   ~PMQ_Owned_String()
   {
      drop();
   }
};
/*
* SNs (sequence numbers)
*
* Sequence numbers, and the ringbuffers that build on them, are a core concept
* of how the PMQ works.
*
* I believe they are pretty much what is elsewhere known as "LMAX Disruptor"
* (google it).
*
* Sequence numbers are 64-bit unsigned integers that can wraparound (but this
* is only theoretical -- wraparound is probably completely untested since
* 64-bit numbers don't overflow easily in practice).
*
* Ringbuffers have a number of slots that is 2^N for some N. SN's are mapped
* to slots with wrap-around in the ringbuffer's 2^N slots by using the lowest
* N bits of the SN to index into the slots array.
*
* The SN templated class provides some type safety -- the Tag type is a
* "phantom tag" (can be implemented by making a new "empty" class) that
* prevents indexing into a ringbuffer using a mismatching sequence number. For
* example, we have a ringbuffer of input-slots that should be indexed by *slot
* sequence numbers* (SSNs). And we have a ringbuffer of chunks that should be
* indexed by *chunk sequence numbers (CSNs). The on-disk chunk store is
* another kind of ringbuffer that works with the same principle of wrapping
* around automatically.
*
* We also track *message sequence numbers* (MSNs) but we don't use them for
* indexing, only for binary search.
*
* Mathematically, SNs form an affine space. This is like a vector space but
* without a designated origin (pls forgive me if what I write here is slightly
* incorrect as far as mathematics is concerned. Only the idea matters). There
* is a 0 value, but it is not meaningfully different compared to any other
* value.
*
* One can subtract two sequence numbers to get a distance (represented as bare
* uint64_t), and one can add a distance to a sequence number to get a new
* sequence number. However, unlike a vector space with designated 0, one can
* not add two sequence numbers meaningfully (SN<T> has operator+(uint64_t d)
* but no operator+(SN<T>& other).
*/
// SN<Tag>: a 64-bit sequence number with a phantom tag for type safety (see
// the explanation above). Supports distance arithmetic (affine space): you can
// add/subtract a uint64_t distance, or subtract two SNs to get a distance, but
// you cannot add two SNs.
template<typename Tag>
class SN
{
   uint64_t m_value;
public:
   // explicit: prevents silent conversion from bare integers, which would
   // defeat the phantom-tag type safety.
   explicit SN(uint64_t value)
   {
      m_value = value;
   }
   // Some C++ trivia following. In most cases you can ignore this and just use
   // the class similar to primitive integers.
   //
   // Here we specify an *explicitly-defaulted default-constructor*. This will
   // allow us to initialize the object with undefined (garbage) value if we
   // want so.
   //
   // Explanation: Since we have explicitly specified the constructor with 1
   // argument already, there wouldn't be an implicit default constructor (a
   // constructor with no arguments). To get a default constructor, we need to
   // explicitly specify one. We need a default constructor (no constructor
   // arguments) if we want to write
   //
   //    SN sn;
   //
   // For simple data types (like SN), we typically want the above line to
   // leave the object's members uninitialized (garbage values). While this is
   // in some ways dangerous, it can be simpler especially for objects where
   // zero-initialization isn't very convenient or meaningful. Leaving values
   // uninitialized in the default constructor also allows the compiler to
   // catch bugs in some situations when the user unintentionally forgot to
   // specify an explicit value.
   //
   // Note a gotcha: There is a difference between an empty default constructor
   //
   //    SN() {}
   //
   // and an (explicitly or implicitly) defaulted default constructor:
   //
   //    SN() = default;
   //
   // If we use the class like this:
   //
   //    SN x {};
   //    SN y = SN();  // or like this
   //    SN z = {};    // or like this...
   //
   // then x will contain garbage with the empty default constructor, but will
   // be zero-initialized with the (explicitly-) defaulted default constructor.
   // We'd typically want zero initialization with this syntax.
   SN() = default;
   __pmq_artificial_method
   uint64_t value() const
   {
      return m_value;
   }
   // Pre-increment. Returns *this per C++ convention (previously returned
   // void; adding the return value is backward compatible).
   __pmq_artificial_method
   SN& operator++()
   {
      m_value++;
      return *this;
   }
   // Post-increment. Returns the old value per C++ convention.
   __pmq_artificial_method
   SN operator++(int)
   {
      SN old = *this;
      m_value++;
      return old;
   }
   // Returns *this by reference (previously returned a copy by value, which
   // was inconsistent with operator-=).
   __pmq_artificial_method
   SN& operator+=(uint64_t d)
   {
      m_value += d;
      return *this;
   }
   __pmq_artificial_method
   SN& operator-=(uint64_t d)
   {
      m_value -= d;
      return *this;
   }
   __pmq_artificial_method
   SN operator+(uint64_t d) const
   {
      return SN(m_value + d);
   }
   __pmq_artificial_method
   SN operator-(uint64_t d) const
   {
      return SN(m_value - d);
   }
   // Distance between two sequence numbers (wrap-around subtraction).
   __pmq_artificial_method
   uint64_t operator-(SN other) const
   {
      return m_value - other.m_value;
   }
   __pmq_artificial_method
   bool operator==(SN other) const
   {
      return m_value == other.m_value;
   }
   __pmq_artificial_method
   bool operator!=(SN other) const
   {
      return m_value != other.m_value;
   }
};
/*
* COMPARING SEQUENCE NUMBERS
* ==========================
*
* Since sequence numbers wrap around (in theory, when 64 bits overflow) they
* have no natural ordering.
*
* However, in practice, sequence numbers are used to index in much smaller
* buffer, and at any given time there is only a small window of sequence
* numbers. It's a sliding window, but a window still.
*
* So, admitting that the sequence numbers in a given window may wraparound,
* back to 0, we can still assume that they never "overtake" each other.
* We can subtract two numbers using unsigned arithmetic and determine their
* relative ordering from the result. Centering our worldview at a number x, we
* divide the space of uint64_t numbers into those that are less than x (x -
* 2^63 to x) and those that are greater than x (x to 2^63).
*
* Note that this relation is not transitive (x <= y && y <= z does not imply x
* <= z), and not antisymmetric -- (x + 2^63) is both greater and less than x.
* So it's not a true ordering relation, but in practice we can use it to
* reliably compare items by "age".
*
 * The value 1 should be considered greater than UINT64_MAX, since the wrapped
 * distance 1 - UINT64_MAX == 2 is small. Conversely, UINT64_MAX is less than
 * 1, since the wrapped distance UINT64_MAX - 1 is huge (greater than 2^63),
 * which places UINT64_MAX in the "less than" half of the window around 1.
*
*/
// Comparing bare uint64_t sequence values.
__pmq_artificial_func
bool _sn64_lt(uint64_t a, uint64_t b)
{
   // a < b iff b lies in the forward half-window starting just after a.
   const uint64_t dist = b - (a + 1);
   return dist <= UINT64_MAX / 2;
}
__pmq_artificial_func
bool _sn64_le(uint64_t a, uint64_t b)
{
   // a <= b iff the wrapped distance from a to b is in the forward half.
   const uint64_t dist = b - a;
   return dist <= UINT64_MAX / 2;
}
__pmq_artificial_func
bool _sn64_ge(uint64_t a, uint64_t b)
{
   // a >= b iff the wrapped distance from b to a is in the forward half.
   const uint64_t dist = a - b;
   return dist <= UINT64_MAX / 2;
}
__pmq_artificial_func
bool _sn64_gt(uint64_t a, uint64_t b)
{
   // a > b iff a lies in the forward half-window starting just after b.
   const uint64_t dist = a - (b + 1);
   return dist <= UINT64_MAX / 2;
}
// Comparing type-safe "tagged" SN values
template<typename Tag>
__pmq_artificial_func
bool sn64_lt(SN<Tag> a, SN<Tag> b)
{
   // Same arithmetic as the bare-integer comparison; delegate to it.
   return _sn64_lt(a.value(), b.value());
}
template<typename Tag>
__pmq_artificial_func
bool sn64_le(SN<Tag> a, SN<Tag> b)
{
   // Same arithmetic as the bare-integer comparison; delegate to it.
   return _sn64_le(a.value(), b.value());
}
template<typename Tag>
__pmq_artificial_func
bool sn64_ge(SN<Tag> a, SN<Tag> b)
{
   // Same arithmetic as the bare-integer comparison; delegate to it.
   return _sn64_ge(a.value(), b.value());
}
template<typename Tag>
__pmq_artificial_func
bool sn64_gt(SN<Tag> a, SN<Tag> b)
{
   // Same arithmetic as the bare-integer comparison; delegate to it.
   return _sn64_gt(a.value(), b.value());
}
template<typename Tag>
__pmq_artificial_func
bool sn64_inrange(SN<Tag> sn, SN<Tag> lo, SN<Tag> hi)
{
   // sn is within [lo, hi] iff its offset from lo does not exceed the window
   // width. Both quantities use wrap-around uint64_t subtraction.
   const uint64_t offset = sn - lo;
   const uint64_t width = hi - lo;
   return offset <= width;
}
// Ringbuffer containing a buffer (power-of-2 size) of elements of type V. It
// can be "indexed" using SN's of matching type: the low bits of the SN select
// the slot, so cursors wrap around automatically.
template<typename Tag, typename V>
class Ringbuffer
{
   using K = SN<Tag>;
   V *m_ptr = nullptr;
   size_t m_count = 0;
public:
   __pmq_artificial_method
   Ringbuffer()
   {
   }
   // Convenience constructor. BUG FIX: previously called reset(ptr, size),
   // which does not compile when instantiated -- reset() takes a Slice<V>.
   __pmq_artificial_method
   Ringbuffer(V *ptr, uint64_t size)
   {
      reset(Slice<V>(ptr, size));
   }
   __pmq_artificial_method
   uint64_t slot_count() const
   {
      return m_count;
   }
   // Point the ringbuffer at externally-owned storage. The slot count must be
   // a power of 2 so SNs can be mapped to slots by masking.
   __pmq_artificial_method
   void reset(Slice<V> slice)
   {
      assert(pmq_is_power_of_2(slice.count()));
      m_ptr = slice.data();
      m_count = slice.count();
   }
   __pmq_artificial_method
   Slice<V> as_slice() const
   {
      return Slice<V>(m_ptr, m_count);
   }
   // Map a sequence number to its slot via the low bits. Must not be called
   // before reset() -- with m_count == 0 the mask would wrap around.
   __pmq_artificial_method
   const V *get_slot_for(K k) const
   {
      return &m_ptr[k.value() & (m_count - 1)];
   }
   __pmq_artificial_method
   V *get_slot_for(K k)
   {
      return &m_ptr[k.value() & (m_count - 1)];
   }
};