#include "pmq_common.hpp"
#include "pmq.hpp"

#include <new> // std::nothrow (used in pmq_create())

static constexpr uint64_t PMQ_SLOT_SIZE = 128;
static constexpr uint64_t PMQ_SLOT_HEADER_SIZE = 16;
static constexpr uint64_t PMQ_SLOT_SPACE = PMQ_SLOT_SIZE - PMQ_SLOT_HEADER_SIZE;
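
// Illustration only: the slot math used throughout this file. A message is
// split into PMQ_SLOT_SPACE-sized pieces, so the number of slots it needs is
// a ceiling division. The helper below is hypothetical (added for this
// example, not referenced by the queue code).
static constexpr uint64_t pmq_example_slots_for(uint64_t msgsize)
{
    return (msgsize + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE;
}
static_assert(pmq_example_slots_for(300) == 3, "a 300-byte message spans 3 slots");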

// A bit in the slot header that marks a slot as the first ("leader") slot of
// the sequence of slots that hold a message.
static constexpr uint64_t PMQ_SLOT_LEADER_MASK = 1;

static constexpr uint64_t PMQ_CHUNK_SHIFT = 16;
static constexpr uint64_t PMQ_CHUNK_SIZE = (uint64_t) 1 << PMQ_CHUNK_SHIFT;
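
// Illustration only: with a power-of-2 chunk count, a chunk sequence number
// (CSN) maps to a byte offset in the chunk file as
//     index  = csn & (chunk_count - 1);
//     offset = index << PMQ_CHUNK_SHIFT;   // index * 64 KiB
// This is exactly the indexing used by the persister and reader code below.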

class CSN_Tag{};
class SSN_Tag{};
class MSN_Tag{};

using CSN = SN<CSN_Tag>;
using SSN = SN<SSN_Tag>;
using MSN = SN<MSN_Tag>;
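
// The distinct tag types make CSN (chunk), SSN (slot) and MSN (message)
// sequence numbers incompatible types, so accidentally mixing them (e.g.
// comparing an SSN against a CSN) fails to compile.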

struct PMQ_Chunk_Hdr
{
    // msn: msn of the first message stored in this chunk.
    // msgoffsets_off: offset in the chunk to an array of (msgcount + 1) offsets.
    MSN msn;
    uint16_t msgcount;
    uint16_t msgoffsets_off;
    uint32_t pad; // pad to 16 bytes for now
};
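
// Resulting chunk layout (sketch, derived from how the fields are used in
// pmq_finalize_current_chunk_buffer() below):
//
//   +---------------+----------------------+--------+--------------------+
//   | PMQ_Chunk_Hdr | message payloads ... | zeroes | uint16_t offsets[] |
//   | (16 bytes)    | (back to back)       | (gap)  | (msgcount + 1)     |
//   +---------------+----------------------+--------+--------------------+
//   0               16                          msgoffsets_off      64 KiB
//
// offsets[i] is the byte offset of message i in the chunk; offsets[msgcount]
// marks the end of the last message.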

struct PMQ_Slot
{
    uint32_t flags;
    uint32_t msgsize;
    uint32_t pad0;
    uint32_t pad1;
    char payload[PMQ_SLOT_SPACE];

    __pmq_artificial_method
    Untyped_Slice payload_untyped_slice()
    {
        return Untyped_Slice(payload, sizeof payload);
    }
};
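
// Sanity checks (added for illustration; they follow from the constants
// above): the four uint32_t fields form the 16-byte slot header, and header
// plus payload fill a slot exactly.
static_assert(sizeof (PMQ_Slot) == PMQ_SLOT_SIZE, "slot is exactly 128 bytes");
static_assert(sizeof (PMQ_Slot::payload) == PMQ_SLOT_SPACE, "payload is 112 bytes");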

/* In-memory enqueue buffer. This is where incoming messages get written
 * first. It consists of a ringbuffer of fixed-size slots.
 * A slot has a header and a payload. The size of each slot is PMQ_SLOT_SIZE,
 * and the payload can be up to PMQ_SLOT_SPACE bytes.
 * In the future, we might even support concurrent writes.
 * This structure needs no locking; its contents are static except during
 * initialization and destruction.
 * Accessing the cursors, though, needs a mutex lock.
 */
struct In_Queue
{
    uint64_t slot_count = 0;
    uint64_t size_bytes = 0; // slot_count * PMQ_SLOT_SIZE

    // When reaching this fill level (in slots) we persist unconditionally.
    uint64_t slots_persist_watermark = 0;

    // A shared memory file store that survives application restarts (but not
    // OS restarts).
    Posix_FD shm_fd;

    // Memory mapping of @shm_fd.
    MMap_Region mapping;

    Ringbuffer<SSN_Tag, PMQ_Slot> slots; // the buffer from the mapping.
};

/* Cursors published by the enqueuer. The ssn_* members index into the
 * In_Queue. They are consumed by the persister and by reader threads.
 * They get written by enqueuer threads only.
 *
 * NOTE: The *_disk cursors are treated as "tail" cursors, i.e. they mark the
 * end of the valid region of the queue. They are a copy of the persister's
 * cursors. They get copied only from time to time, as necessary when the
 * In_Queue runs out of space to store new messages. We have ssn_disk <=
 * ssn_mem and msn_disk <= msn. The cursors indicate how far the persister
 * has come in persisting chunks by compacting messages (stored in slots)
 * from the In_Queue.
 */
struct In_Queue_Cursors
{
    MSN msn;
    SSN ssn_mem;
    MSN msn_disk;
    SSN ssn_disk;
};
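
// Illustration only: the fill level of the In_Queue ring follows directly
// from these cursors, e.g.
//     uint64_t slots_fill = ic.ssn_mem - ic.ssn_disk; // slots not yet persisted
//     uint64_t slots_free = in_queue.slot_count - slots_fill;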

/* An in-memory representation of a chunk page that is about to be written to
 * disk.
 * Contains an untyped buffer and minimal book-keeping information.
 */
struct Chunk_Buffer
{
    void *data;

    // Tracking msn and ssn of the first message so we know how many pages
    // to write() or fsync().
    MSN msn;
    SSN ssn;

    // last_msn and last_ssn: These fields indicate the one-past-last msn and
    // ssn. They get set only when the chunk buffer gets finalized.
    // The purpose is to allow the persister thread to update the persister
    // cursors after persisting the chunk buffer.
    // Because msn's and ssn's are contiguous (i.e. last_msn is equal to the
    // next buffer's msn field), these data fields may seem redundant -- the
    // persister could use the msn and ssn from the following chunk buffer
    // instead.
    // However, that approach has in the past caused a bug where the persister
    // used the cursors from the following buffer before that buffer was even
    // initialized -- effectively decreasing the cursors to an earlier state
    // instead of advancing them.
    // After finding the bug, it was clear that we should introduce a little
    // bit of redundancy in order to keep things simple: the persister will
    // access only finalized buffers, and those will always have the last_msn
    // and last_ssn fields set correctly.

    MSN last_msn;
    SSN last_ssn;

    __pmq_artificial_method
    Untyped_Slice untyped_slice() const
    {
        return Untyped_Slice(data, PMQ_CHUNK_SIZE);
    }

    __pmq_artificial_method
    PMQ_Chunk_Hdr *get_chunk_header() const
    {
        return (PMQ_Chunk_Hdr *) data;
    }
};

/* A queue of in-memory chunks.
 * We might write() only one chunk at a time, but we need to hold each chunk
 * in memory until it is fsynced, so we need a bunch of chunk buffers.
 */
struct Chunk_Queue
{
    // The only purpose of this is to be a "holder" for the data buffers
    // contained in the "chunks" Chunk_Buffer's.
    // It's currently a single contiguous mmap allocation, i.e. (PMQ_CHUNK_SIZE
    // * chunks.slot_count()).
    MMap_Region chunk_buffer_mapping;

    // The only purpose of this is to be a "holder" for the Chunk_Buffer
    // structures (the "chunks" Ringbuffer is non-owning).
    Alloc_Slice<Chunk_Buffer> chunks_alloc_slice;

    Ringbuffer<CSN_Tag, Chunk_Buffer> chunks;

    /* CSN, MSN, SSN of the "current" next message that will be compacted. */
    MSN cq_msn;
    CSN cq_csn;
    SSN cq_ssn;

    // Construction data for the currently built chunk

    // buffer of the current chunk (the current chunk is identified by cq_csn)
    Chunk_Buffer *ck_buffer;

    uint64_t msg_count; // number of messages compacted into the current chunk

    /* Array of message offsets. msg_count + 1 elements are always valid. The
     * next message will be appended (if it fits) at an offset of
     * offsets[msg_count] bytes in the current chunk's page. When the chunk is
     * finalized, the position of the array is set.
     */
    Alloc_Slice<uint16_t> offsets;
};

/* Persistent chunk store -- storing chunks on the file system.
 */
struct Chunk_Store
{
    Posix_FD chunk_fd;

    uint64_t capacity_bytes = 0;

    // After persisting to the chunk store, when at least high_watermark many
    // chunks are filled, we may discard some to lower the chunk fill count to
    // low_watermark.
    // Discarding may also be required in the middle of persisting when all
    // chunks are full. But normally this shouldn't happen because the
    // In_Queue's capacity should be smaller than (chunk_count -
    // high_watermark) chunks.
    // Since discarding chunks involves an fsync() (really a write barrier
    // would be enough, but in practice we only have fsync() currently),
    // we discard many chunks at once to hide the overhead coming from disk
    // latency.
    uint64_t chunks_high_watermark = 0;
    uint64_t chunks_low_watermark = 0;

    __pmq_artificial_method
    uint64_t chunk_count() const
    {
        return capacity_bytes >> PMQ_CHUNK_SHIFT;
    }
};

/* Cursors published by the persister. They index into the chunk store. They
 * are consumed by message readers. Sometimes enqueuer threads read these
 * cursors as well, to be able to skip persisting slots from the In_Queue.
 */
struct Persist_Cursors
{
    // The wal_ssn member indicates the tentative next slot to be written to
    // the In_Queue's persist file.
    // Most slots never end up going to that persist file but are compacted
    // directly to the chunk store. At each sync to disk, only the slots that
    // can't form a complete chunk buffer go to the In_Queue's persist file.
    // In that file, only the slots from cks_ssn to wal_ssn (exclusive) are
    // valid.
    SSN wal_ssn;

    // We also store the MSN corresponding to the wal_ssn.
    MSN wal_msn;

    // We might want to also introduce a cursor to indicate the oldest valid
    // chunk buffer in the (in-memory) queue. But currently only the flusher
    // is reading from the queue -- always at position cks_csn.

    // The next chunk that will be written (and fsync'ed) to disk.
    CSN cks_csn;

    // The msn of the first message that is stored in the chunk indicated by
    // cks_csn. That msn is also stored in that chunk's header.
    MSN cks_msn;

    // The ssn of the leader slot where the first message of the chunk
    // indicated by cks_csn was stored. Note, this ssn is _not_ stored in the
    // chunk's header -- its only use is in coordinating with the In_Queue.
    SSN cks_ssn;

    // The next chunk that will be discarded from the chunk store.
    CSN cks_discard_csn;
};

/* This structure is persisted in a state file.
 */
struct Commit_Record
{
    // Number of slots of the in-queue.
    // Must currently be a power of 2.
    uint64_t inqueue_slotcount;

    uint64_t slotsfile_size_bytes;

    // Next ssn that will hit the slots-persist file.
    SSN wal_ssn;
    MSN wal_msn;

    uint64_t chunkfile_size_bytes;

    // CSN, msn, and ssn of the next chunk that will be persisted to the chunk
    // store.
    CSN cks_csn;
    MSN cks_msn;
    SSN cks_ssn;

    // The next chunk that will be discarded from the chunk store.
    CSN cks_discard_csn;
};


// Data owned by the enqueuer functionality. There can only be 1 enqueuer
// thread at a time.
struct Enqueuer
{
    PMQ_Enqueuer_Stats enqueuer_stats;
    In_Queue_Cursors in_queue_cursors;
};

// Data owned by the persister functionality. There can only be 1 persister
// thread at a time.
struct Persister
{
    PMQ_Persister_Stats stats;
    Persist_Cursors persist_cursors;
};


struct PMQ
{
    PMQ_Owned_String basedir_path;

    Posix_FD basedir_fd;

    Posix_FD slotsfile_fd;
    uint64_t slotsfile_size_bytes = 0;

    In_Queue in_queue;

    // Chunks that get compacted from the in_queue. They will be persisted
    // to the chunk_store.
    Chunk_Queue chunk_queue;

    Chunk_Store chunk_store;

    // Cursors published by enqueuer threads, consumable by persister and
    // reader threads.
    In_Queue_Cursors pub_in_queue_cursors;

    PMQ_PROFILED_MUTEX(pub_in_queue_mutex);
    PMQ_PROFILED_CONDVAR(pub_in_queue_cond);

    // Cursors published by persister threads, consumable by enqueuer and
    // reader threads.
    Mutex_Protected<Persist_Cursors> pub_persist_cursors;
    Mutex_Protected<PMQ_Persister_Stats> pub_persister_stats;


    // Must be held to guarantee only 1 enqueuer at a time (protects only the
    // In_Queue currently).
    PMQ_PROFILED_MUTEX(enqueue_mutex);

    Enqueuer enqueuer;


    // Must be held to guarantee only 1 persister at a time. Flushing may
    // happen by a dedicated persister thread that checks regularly, or by an
    // enqueuer thread when the In_Queue is full.

    PMQ_PROFILED_MUTEX(persist_mutex);

    Persister persister;

    Posix_FD statefile_fd;
};
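
// Summary of the synchronization scheme above (illustration only):
//   enqueue_mutex      -- serializes enqueuers (writes to the In_Queue)
//   persist_mutex      -- serializes persisters (compaction + disk writes)
//   pub_in_queue_mutex -- protects pub_in_queue_cursors and its condvar
//   pub_*              -- snapshots consumed by the other side and by readers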

void pmq_get_stats(PMQ *q, PMQ_Stats *out_stats)
{
    PMQ_Stats stats = {};
    stats.persister = q->pub_persister_stats.load();
    {
        PMQ_PROFILED_LOCK(lock_, q->enqueue_mutex);
        stats.enqueuer = q->enqueuer.enqueuer_stats;
    }

    *out_stats = stats;
}

PMQ_Persist_Info pmq_get_persist_info(PMQ *q)
{
    Persist_Cursors persist_cursors = q->pub_persist_cursors.load();
    PMQ_Persist_Info out;
    out.wal_msn = persist_cursors.wal_msn.value();
    out.cks_msn = persist_cursors.cks_msn.value();
    out.cks_discard_csn = persist_cursors.cks_discard_csn.value();
    return out;
}

static bool pmq_persist_finished_chunk_buffers(PMQ *q);

// Called only on initialization and then subsequently by
// pmq_switch_to_next_chunk_buffer().
// Note: must be called from a persister context (with persist_mutex locked).
static bool pmq_begin_current_chunk_buffer(PMQ *q)
{
    PMQ_PROFILED_FUNCTION;

    Persist_Cursors *pc = &q->persister.persist_cursors;
    Chunk_Queue *cq = &q->chunk_queue;

    uint64_t num_chunks = cq->chunks_alloc_slice.capacity();

    if (cq->cq_csn - pc->cks_csn == num_chunks)
    {
        if (! pmq_persist_finished_chunk_buffers(q))
            return false;
    }

    Chunk_Buffer *ck_buffer = cq->chunks.get_slot_for(cq->cq_csn);
    ck_buffer->msn = cq->cq_msn;
    ck_buffer->ssn = cq->cq_ssn;
    ck_buffer->last_msn = MSN(0); // only set when chunk buffer gets finalized
    ck_buffer->last_ssn = SSN(0); // only set when chunk buffer gets finalized

    cq->ck_buffer = ck_buffer;
    cq->msg_count = 0;
    cq->offsets[0] = 16; // = sizeof (PMQ_Chunk_Hdr); messages start right after the header

    return true;
}

static void pmq_finalize_current_chunk_buffer(PMQ *q)
{
    Chunk_Queue *cq = &q->chunk_queue;

    Chunk_Buffer *cb = cq->ck_buffer;
    cb->last_msn = cq->cq_msn;
    cb->last_ssn = cq->cq_ssn;
    pmq_assert(cq->cq_msn == cb->msn + cq->msg_count);

    pmq_debug_f("Finalize chunk %" PRIu64 ": MSN %" PRIu64 " - %" PRIu64
                ", %" PRIu64 " messages. Last msg ends at %" PRIu64,
                cq->cq_csn.value(),
                cb->msn.value(),
                cb->last_msn.value(),
                cq->msg_count, (uint64_t) cq->offsets[cq->msg_count]);

    // msn: msn of the first message stored in this chunk.
    // msgoffsets_off: offset in the chunk to an array of (msgcount + 1) offsets.

    Untyped_Slice chunk_slice = cq->ck_buffer->untyped_slice();
    Slice<uint16_t> offsets_slice = cq->offsets.slice().sub_slice(0, cq->msg_count + 1);

    PMQ_Chunk_Hdr *hdr = (PMQ_Chunk_Hdr *) chunk_slice.data();
    hdr->msn = cq->ck_buffer->msn;
    hdr->msgcount = cq->msg_count;
    hdr->msgoffsets_off = PMQ_CHUNK_SIZE - offsets_slice.size_in_bytes();

    pmq_debug_f("Place msgoffsets array in chunk at byte offset %" PRIu64,
                (uint64_t) hdr->msgoffsets_off);

    // Zero out the gap between the last message and the message-offsets array.
    zero_out_slice(chunk_slice
                   .limit_size_bytes(hdr->msgoffsets_off)
                   .offset_bytes(offsets_slice.at(cq->msg_count)));

    Untyped_Slice chunk_offsets_slice = chunk_slice.offset_bytes(hdr->msgoffsets_off);
    copy_slice(chunk_offsets_slice, offsets_slice.untyped());
}

// Finalize the current chunk buffer and start the next one.
static bool pmq_switch_to_next_chunk_buffer(PMQ *q)
{
    Chunk_Queue *cq = &q->chunk_queue;

    pmq_finalize_current_chunk_buffer(q);

    // "ship"
    cq->cq_csn += 1;

    if (! pmq_begin_current_chunk_buffer(q))
        return false;

    pmq_assert(cq->ck_buffer->msn.value() == cq->cq_msn.value());

    return true;
}

static bool pmq_msg_fits_current_chunk_buffer(PMQ *q, uint64_t msgsize)
{
    Chunk_Queue *cq = &q->chunk_queue;

    // Compute the start of the offsets array given the number of messages in
    // the chunk.
    // Why (msg_count + 2)? Because (msg_count + 2) = (next_count + 1), where
    // next_count is the count after appending the current message.
    // Add 1 to that because the offsets array holds one more slot to include
    // the final size.
    uint64_t offsets_off = PMQ_CHUNK_SIZE - (cq->msg_count + 2) * (uint64_t) sizeof cq->offsets[0];

    uint64_t msgs_end = cq->offsets[cq->msg_count] + msgsize;

    return msgs_end <= offsets_off;
}
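
// Worked example (illustration only): in an empty chunk (msg_count == 0,
// offsets[0] == 16), a 100-byte message fits because
//     msgs_end    = 16 + 100      = 116
//     offsets_off = 65536 - 2 * 2 = 65532
// and 116 <= 65532. As messages are appended, msgs_end grows from the front
// while offsets_off shrinks from the back, until the two meet.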

// Helper function for pmq_persist().
// NOTE: persist_mutex must be locked.
static bool pmq_compact(PMQ *q, SSN compact_ssn, SSN max_ssn)
{
    if (false) // NOLINT
    {
        pmq_debug_f("pmq_persist(ssn=%" PRIu64 ", max_ssn=%" PRIu64 ")",
                    compact_ssn.value(), max_ssn.value());
    }

    PMQ_PROFILED_FUNCTION;

    Chunk_Queue *cq = &q->chunk_queue;

    for (;;)
    {
        if (sn64_ge(cq->cq_ssn, max_ssn))
        {
            return true;
        }

        // Extract the message size from the slot header.
        uint64_t msgsize;
        {
            SSN ssn = cq->cq_ssn;
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            if ((slot->flags & PMQ_SLOT_LEADER_MASK) == 0)
            {
                // Earlier there was an assert() here instead of an integrity
                // check, assuming that RAM should never be corrupted. However,
                // the RAM might be filled from disk, and we currently don't
                // validate the data after loading. Thus we now consider slot
                // memory just as corruptible as disk data.
                pmq_perr_f("slot %" PRIu64 " is not a leader slot.", ssn.value());
                return false;
            }
            msgsize = slot->msgsize;
        }

        // Check if there is enough room for the message in the current chunk
        // buffer. If necessary, start a new chunk buffer. This in turn may
        // require flushing another chunk buffer to disk (we may flush
        // multiple, weighing throughput vs. latency).
        if (! pmq_msg_fits_current_chunk_buffer(q, msgsize))
        {
            if (! pmq_switch_to_next_chunk_buffer(q))
            {
                return false;
            }

            // Actually, let's always try to compact up to max_ssn,
            // since we should avoid writing small batches.
            // So this early return is disabled.
            if (false) // NOLINT
                if (sn64_ge(cq->cq_ssn, compact_ssn))
                {
                    return true;
                }
        }

        // Compute the number of slots that we need to read and copy to the
        // out-message.
        uint64_t nslots_req = (msgsize + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE;

        uint64_t nslots_avail = max_ssn - cq->cq_ssn;
        if (nslots_req > nslots_avail)
        {
            pmq_perr_f("Internal error: Invalid msgsize field in the slots file!");
            pmq_perr_f("Internal error: msgsize is %" PRIu64 ", needs %" PRIu64
                       " slots but I believe only %" PRIu64 " are available!",
                       msgsize, nslots_req, nslots_avail);
            return false; // can't we do a little more to handle the issue?
        }

        // Copy one message.
        uint64_t remain = msgsize;
        uint64_t dst_offset = cq->offsets[cq->msg_count];

        for (uint64_t i = 0; i < nslots_req; i++)
        {
            // copy slots to chunks
            SSN ssn = cq->cq_ssn + i;
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);

            void *dst = (char *) cq->ck_buffer->data + dst_offset;
            const void *src = __pmq_assume_aligned<16>(slot->payload);
            uint64_t n = remain < PMQ_SLOT_SPACE ? remain : PMQ_SLOT_SPACE;
            memcpy(dst, src, n);
            dst_offset += n;
            remain -= n; // bug fix: without this, the last slot of a multi-slot
                         // message copied a full PMQ_SLOT_SPACE bytes and could
                         // overrun the chunk buffer
        }

        cq->cq_ssn += nslots_req;
        cq->cq_msn += 1;

        cq->offsets[cq->msg_count + 1] = cq->offsets[cq->msg_count] + msgsize;
        cq->msg_count += 1;
    }

    return true;
}
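
// Worked example for the copy loop above (illustration only): a 300-byte
// message occupies ceil(300 / 112) = 3 slots. The loop copies 112 + 112 + 76
// payload bytes (slot headers are stripped), so the message ends up stored
// contiguously in the chunk and the per-slot overhead disappears on disk.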

static bool pmq_commit(PMQ *q); // forward decl. We should get rid of this and fix the order.

// Helper function for pmq_persist().
// NOTE: persister lock must be taken!
// Persists all chunk buffers from cks_csn up to (but excluding) cq_csn.
static bool pmq_persist_finished_chunk_buffers(PMQ *q)
{
    Chunk_Queue *cq = &q->chunk_queue;
    Persist_Cursors *pc = &q->persister.persist_cursors;

    pmq_assert(cq->cq_csn - pc->cks_csn <= q->chunk_queue.chunks.slot_count());

    CSN cq_csn = cq->cq_csn;

    uint64_t chunk_slot_mask = pmq_mask_power_of_2(q->chunk_store.chunk_count());

    for (CSN csn = pc->cks_csn;
         csn != cq_csn;)
    {
        pmq_assert(csn - pc->cks_discard_csn <= q->chunk_store.chunk_count());
        if (csn - pc->cks_discard_csn == q->chunk_store.chunk_count())
        {
            // All chunks are filled. This should rarely happen since chunks
            // are normally discarded when the high-watermark chunk fill count
            // is reached. Typically, the chunk store is much larger than the
            // In_Queue, and we should not get here.
            // We discard some chunks manually by calling pmq_commit().
            if (! pmq_commit(q))
                return false;
        }
        pmq_assert(csn - pc->cks_discard_csn < q->chunk_store.chunk_count());

        Chunk_Buffer *cb = q->chunk_queue.chunks.get_slot_for(csn);

        Pointer<PMQ_Chunk_Hdr> hdr = cb->get_chunk_header();

        pmq_debug_f("Persist chunk buffer csn=%" PRIu64 ", pointer is %p, msn is %" PRIu64,
                    csn.value(), cb, hdr->msn.value());

        // Write the chunk buffer to disk.
        Untyped_Slice slice = cb->untyped_slice();
        pmq_assert(slice.size() == PMQ_CHUNK_SIZE);

        int fd = q->chunk_store.chunk_fd.get();
        uint64_t chunk_slot_index = csn.value() & chunk_slot_mask;
        off_t offset_bytes = chunk_slot_index << PMQ_CHUNK_SHIFT;

        if (! pmq_pwrite_all(fd, slice, offset_bytes, "chunk buffer"))
        {
            // should we publish the new cursors anyway? (see exit code below)
            return false;
        }

        csn += 1;

        // Advance the chunk store cursors.
        pc->cks_csn = csn;
        pc->cks_ssn = cb->last_ssn;
        pc->cks_msn = cb->last_msn;

        pmq_debug_f("persisted. pc->cks_msn=%" PRIu64 ", pc->cks_ssn=%" PRIu64,
                    pc->cks_msn.value(), pc->cks_ssn.value());
    }

    return true;
}

// Helper function for pmq_persist_unpersisted_slots().
// Persist a contiguous sub-range of all slots.
// start_index and count index into the slots ringbuffer;
// start_index + count may not exceed the slots size.
static bool pmq_persist_slots_slice(PMQ *q, uint64_t start_index, uint64_t count)
{
    Slice<PMQ_Slot> slots = q->in_queue.slots.as_slice();

    uint64_t offset_bytes = start_index * sizeof (PMQ_Slot);

    Untyped_Slice slice = slots.sub_slice(start_index, count).untyped();

    pmq_debug_f("Persist %" PRIu64 " slots (%zu bytes) starting from %" PRIu64,
                count, slice.size(), start_index);

    bool ret = pmq_pwrite_all(q->slotsfile_fd.get(), slice, offset_bytes, "slots-file");

    if (ret)
    {
        q->persister.stats.wal_flushes += 1;
        q->persister.stats.wal_flush_bytes += slice.size();
    }

    return ret;
}

// Helper function for pmq_persist().
// NOTE: persister lock must be taken!
// This function writes all unpersisted slots to the slots-file. The point of
// not writing them as a chunk is that we only want to write complete chunk
// buffers. The reason is that each chunk gets written once only (append-only),
// and chunks are fixed-size (PMQ_CHUNK_SIZE). Syncing a partially-filled
// chunk would mean spending a whole chunk regardless of the number of
// messages in it.
static bool pmq_persist_unpersisted_slots(PMQ *q, SSN persist_ssn)
{
    PMQ_PROFILED_FUNCTION;

    Persist_Cursors *pc = &q->persister.persist_cursors;

    if (sn64_lt(pc->wal_ssn, pc->cks_ssn))
    {
        // The chunk store contains more recent data than the slots file.
        // Effectively clear the slots file by setting the valid range from
        // cks_ssn to cks_ssn.
        pc->wal_ssn = pc->cks_ssn;
        pc->wal_msn = pc->cks_msn;
    }

    if (sn64_ge(pc->wal_ssn, persist_ssn))
    {
        return true;
    }

    pmq_debug_f("Persist unpersisted slots from %" PRIu64 " to %" PRIu64,
                pc->wal_ssn.value(), persist_ssn.value());

    SSN ssn_lo = pc->wal_ssn;
    SSN ssn_hi = persist_ssn;

    uint64_t count = q->in_queue.slots.slot_count();
    uint64_t mask = pmq_mask_power_of_2(count);
    uint64_t i_lo = ssn_lo.value() & mask;
    uint64_t i_hi = ssn_hi.value() & mask;

    bool ret;
    if (i_lo <= i_hi)
    {
        ret = pmq_persist_slots_slice(q, i_lo, i_hi - i_lo);
    }
    else
    {
        // The range wraps around the end of the ringbuffer: write it as two
        // contiguous pieces.
        ret = pmq_persist_slots_slice(q, i_lo, count - i_lo);
        if (ret)
            ret = pmq_persist_slots_slice(q, 0, i_hi);
    }

    if (! ret)
    {
        pmq_perr_f("Failed to persist to slots-file!");
        return false;
    }

    q->persister.stats.fsync_calls += 1;
    if (fsync(q->slotsfile_fd.get()) < 0)
    {
        pmq_perr_ef(errno, "fsync() of slots file failed");
        return false;
    }

    // Only now do we update the cursor.
    pc->wal_ssn = ssn_hi;

    // We also need to update the MSN accordingly. Because the MSN is
    // currently not stored in the slots, we find it by counting the leader
    // slots in the persisted range.
    // This assumes that all the follower slots have been atomically enqueued
    // with each leader.

    for (SSN ssn = ssn_lo; ssn != ssn_hi; ssn ++)
    {
        PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
        if ((slot->flags & PMQ_SLOT_LEADER_MASK))
        {
            pc->wal_msn ++;
        }
    }

    return true;
}
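
// Illustration only: with slot_count = 8 (mask = 7), persisting the SSN range
// [6, 10) maps to ring indices 6, 7, 0, 1, so the wrap-around branch above
// issues two writes: (start_index=6, count=2) and (start_index=0, count=2).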

// Helper function, only used by pmq_commit().
// Compute the next value of cks_discard_csn.
// NOTE: persister lock must be taken.
static CSN pmq_compute_next_discard_csn(PMQ *q)
{
    Persist_Cursors *pc = &q->persister.persist_cursors;

    uint64_t chunks_count = pc->cks_csn - pc->cks_discard_csn;
    uint64_t low_mark = q->chunk_store.chunks_low_watermark;
    uint64_t high_mark = q->chunk_store.chunks_high_watermark;

    if (chunks_count < high_mark)
        return pc->cks_discard_csn;

    CSN old_discard_csn = pc->cks_discard_csn;
    CSN new_discard_csn = pc->cks_csn - low_mark;
    pmq_debug_f("Discarding chunks from %" PRIu64 " to %" PRIu64,
                old_discard_csn.value(), new_discard_csn.value());

    return new_discard_csn;
}
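
// Worked example (illustration only, using the defaults chosen in
// pmq_init_chunk_store_size() below): a 1 GiB store holds 16384 chunks, so
// high_mark = 16128 and low_mark = 15872. Once 16128 chunks are filled, the
// discard cursor jumps forward so that only 15872 remain filled, releasing
// 256 chunks (16 MiB) per committed discard.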

// persister lock must be taken
bool __pmq_profiled pmq_commit(PMQ *q)
{
    PMQ_PROFILED_FUNCTION;

    {
        q->persister.stats.fsync_calls += 1;
        if (fsync(q->chunk_store.chunk_fd.get()) < 0)
        {
            pmq_perr_ef(errno, "fsync() of chunks file failed");
            return false;
        }
    }

    Persist_Cursors *pc = &q->persister.persist_cursors;

    Commit_Record commit_record;
    commit_record.inqueue_slotcount = q->in_queue.slot_count;
    commit_record.slotsfile_size_bytes = q->slotsfile_size_bytes;
    commit_record.wal_ssn = pc->wal_ssn;
    commit_record.wal_msn = pc->wal_msn;
    commit_record.chunkfile_size_bytes = q->chunk_store.capacity_bytes;
    commit_record.cks_csn = pc->cks_csn;
    commit_record.cks_msn = pc->cks_msn;
    commit_record.cks_ssn = pc->cks_ssn;
    commit_record.cks_discard_csn = pmq_compute_next_discard_csn(q);

    {
        Untyped_Slice slice = Untyped_Slice(&commit_record, sizeof commit_record);

        if (! pmq_pwrite_all(q->statefile_fd.get(), slice, 0, "state.dat file"))
        {
            return false;
        }
    }

    {
        if (fsync(q->statefile_fd.get()) < 0)
        {
            pmq_perr_ef(errno, "fsync() of statefile failed");
            return false;
        }
        q->persister.stats.fsync_calls += 1;
    }

    // Successfully committed the next discard cursor, now we can recycle the
    // released chunks internally.
    pc->cks_discard_csn = commit_record.cks_discard_csn;

    q->pub_persister_stats.store(q->persister.stats);
    q->pub_persist_cursors.store(q->persister.persist_cursors);

    return true;
}
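
// The durability protocol implemented above, in order (illustration only):
//   1. fsync() the chunk file, making all previously written chunks stable.
//   2. pwrite() + fsync() the commit record (cursors and file sizes) to
//      state.dat.
//   3. Only then recycle discarded chunks and publish the new cursors.
// A crash between any two steps leaves the previous commit record intact, so
// recovery always sees a consistent cursor set.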

// Persist messages from the In_Queue to the Chunk_Queue, at least until
// reaching ssn. The given max_ssn is the hard stop; a good choice here is the
// In_Queue's ssn_mem. The function isn't able to determine max_ssn as ssn_mem
// on its own, since it may or may not be used from within the enqueuer
// context.
// NOTE: This function tries to fill the current Chunk_Buffer once it reaches
// compact_ssn.
static bool pmq_persist(PMQ *q, SSN ssn, SSN max_ssn)
{
    if (! pmq_compact(q, ssn, max_ssn))
        goto error;

    if (! pmq_persist_finished_chunk_buffers(q))
        goto error;

    if (! pmq_persist_unpersisted_slots(q, ssn))
        goto error;

    if (! pmq_commit(q))
        goto error;

    return true;

error:
    pmq_perr_f("Failed to persist slots");
    return false;
}

// Only meant to be called by pmq_sync().
static bool _pmq_sync(PMQ *q)
{
    In_Queue_Cursors ic;
    PMQ_PROFILED_LOCK(lock_, q->persist_mutex);

    {
        PMQ_PROFILED_SCOPE("wait-fill");
        PMQ_PROFILED_UNIQUE_LOCK(lock_, q->pub_in_queue_mutex);
        for (;;)
        {
            ic = q->pub_in_queue_cursors;
            pmq_assert(sn64_le(ic.ssn_disk, ic.ssn_mem));
            uint64_t slots_fill = ic.ssn_mem - ic.ssn_disk;
            if (slots_fill >= q->in_queue.slots_persist_watermark)
                break;
            auto max_wait_time = std::chrono::milliseconds(50);
            auto wait_result = q->pub_in_queue_cond.wait_for(lock_, max_wait_time);
            if (wait_result == std::cv_status::timeout)
                break;
            q->persister.stats.wakeups += 1;
        }
    }

    if (false) // NOLINT
    {
        pmq_debug_f("ic.ssn_mem is now %" PRIu64, ic.ssn_mem.value());
        uint64_t slots_fill = ic.ssn_mem - ic.ssn_disk;
        pmq_debug_f("slots_fill is now %" PRIu64, slots_fill);
        pmq_debug_f("slots_persist_watermark is %" PRIu64, q->in_queue.slots_persist_watermark);
    }

    if (! pmq_persist(q, ic.ssn_mem, ic.ssn_mem))
        return false;

    q->persister.stats.num_async_flushes += 1;
    return true;
}

// Entry point to persist all messages that have been successfully enqueued
// so far. Concurrent operations (e.g. pmq_enqueue_msg()) are possible, but
// their messages may not be persisted this time.
bool pmq_sync(PMQ *q)
{
    PMQ_PROFILED_FUNCTION;

    bool ret = _pmq_sync(q);

    if (! ret)
        pmq_perr_f("Failed to pmq_sync()!");

    return ret;
}

// Helper function for pmq_enqueue_msg().
// Attempts to make enough room in the In_Queue.
// enqueue_mutex must be locked.

static bool __pmq_profiled pmq_prepare_input_slots(PMQ *q, uint64_t nslots_req)
{
    PMQ_PROFILED_FUNCTION;

    In_Queue_Cursors *ic = &q->enqueuer.in_queue_cursors;

    uint64_t slot_count = q->in_queue.slot_count;
    SSN next_ssn_mem = ic->ssn_mem + nslots_req;

    pmq_assert(ic->ssn_mem - ic->ssn_disk <= slot_count);

    if (next_ssn_mem - ic->ssn_disk <= slot_count)
        return true;

    // Update the ssn_disk cursor from the pub_persist_cursors. Those hold the
    // same values as q->persister.persist_cursors, just ever so slightly
    // outdated. This information lets us detect if we can jump out early,
    // without having to lock the persister context, which can take a lot of
    // time.
    {
        Persist_Cursors pc = q->pub_persist_cursors.load();
        ic->msn_disk = pc.cks_msn;
        ic->ssn_disk = pc.cks_ssn;
    }

    if (next_ssn_mem - ic->ssn_disk <= slot_count)
        return true;

    // Still not enough room; we need to switch to the persister context (lock
    // it) and flush some more messages.

    q->enqueuer.enqueuer_stats.buffer_full_count += 1;

    PMQ_PROFILED_LOCK(lock_, q->persist_mutex);

    if (! pmq_persist(q, next_ssn_mem - slot_count, ic->ssn_mem))
    {
        return false;
    }

    if (false) // NOLINT
    {
        Chunk_Queue *cq = &q->chunk_queue;
        Persist_Cursors *pc = &q->persister.persist_cursors;
        pmq_assert(sn64_ge(cq->cq_ssn, ic->ssn_mem));
        pmq_debug_f("ic->ssn_mem: %" PRIu64 ", cq_ssn - cks_ssn: %" PRIu64,
                    ic->ssn_mem.value(), cq->cq_ssn.value() - pc->cks_ssn.value());
    }

    if (false) // NOLINT
    {
        SSN old_ssn_disk = ic->ssn_disk;
        SSN new_ssn_disk = q->persister.persist_cursors.cks_ssn;

        pmq_debug_f("Flushed %" PRIu64 " ssns", ic->ssn_disk - old_ssn_disk);

        if (sn64_le(new_ssn_disk, old_ssn_disk))
        {
            pmq_perr_f("Something is wrong: %" PRIu64 ", %" PRIu64,
                       old_ssn_disk.value(), ic->ssn_disk.value());
        }
        pmq_assert(slot_count >= (ic->ssn_mem - ic->ssn_disk));
    }

    // Update the ssn_disk cursor from the (locked) persister context.
    {
        ic->msn_disk = q->persister.persist_cursors.cks_msn;
        ic->ssn_disk = q->persister.persist_cursors.cks_ssn;
    }

    pmq_assert(next_ssn_mem - ic->ssn_disk <= slot_count);
    return true;
}

// Helper function for pmq_enqueue_msg().
// Serialize the message to the In_Queue's memory buffer.
// Expects enqueue_mutex to be taken.
// Expects that there is enough room to serialize the message
// (pmq_prepare_input_slots()).
static void pmq_serialize_msg(PMQ *q, const void *data, size_t size)
{
    PMQ_PROFILED_FUNCTION;

    In_Queue_Cursors *ic = &q->enqueuer.in_queue_cursors;

    SSN ssn_mem = ic->ssn_mem;
    SSN old_ssn_mem = ic->ssn_mem;

    uint64_t slot_count = q->in_queue.slot_count;
    pmq_assert(pmq_is_power_of_2(slot_count));
    uint32_t slot_flags = PMQ_SLOT_LEADER_MASK;

    // write full slots
    size_t i = 0;
    while (i + PMQ_SLOT_SPACE <= size)
    {
        PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn_mem);
        slot->flags = slot_flags;
        slot->msgsize = size - i;

        memcpy(__pmq_assume_aligned<16>(slot->payload), (const char *) data + i, PMQ_SLOT_SPACE);

        ssn_mem += 1;
        i += PMQ_SLOT_SPACE;
        slot_flags &= ~PMQ_SLOT_LEADER_MASK;
    }

    // write the last (partial) slot
    if (i < size)
    {
        PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn_mem);
        slot->flags = slot_flags;
        slot->msgsize = size - i;
        memcpy(__pmq_assume_aligned<16>(slot->payload), (const char *) data + i, size - i);

        ssn_mem += 1;
    }

    // We can bump the ssn_mem cursor, publish the new cursors, and release
    // the lock now.

    ic->ssn_mem = ssn_mem;
    ic->msn += 1;

    q->enqueuer.enqueuer_stats.total_messages_enqueued += 1;
    q->enqueuer.enqueuer_stats.total_bytes_enqueued += size;

    {
        uint64_t new_slot_count = ic->ssn_mem - ic->ssn_disk;
        uint64_t old_slot_count = old_ssn_mem - ic->ssn_disk;

        // Notify the persister exactly when the fill level crosses the
        // watermark.
        bool notify =
            old_slot_count < q->in_queue.slots_persist_watermark &&
            new_slot_count >= q->in_queue.slots_persist_watermark;

        {
            PMQ_PROFILED_UNIQUE_LOCK(lock_, q->pub_in_queue_mutex);
            q->pub_in_queue_cursors = *ic;
            if (notify)
                q->pub_in_queue_cond.notify_one();
        }
    }

    pmq_assert(ic->ssn_mem - ic->ssn_disk <= slot_count);
}

bool pmq_enqueue_msg(PMQ *q, const void *data, size_t size)
{
    PMQ_PROFILED_FUNCTION;

    pmq_assert(size > 0);
    uint64_t nslots_req = (size + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE;

    PMQ_PROFILED_LOCK(lock_, q->enqueue_mutex);
    if (! pmq_prepare_input_slots(q, nslots_req))
        return false;
    pmq_serialize_msg(q, data, size);
    return true;
}
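
// Minimal usage sketch of the enqueue API above (illustration only; the
// PMQ_Init_Params field assignments are assumptions about its layout, and
// error handling is shortened):
//
//     PMQ_Init_Params params = {};
//     params.basedir_path = "/tmp/myqueue";
//     PMQ *q = pmq_create(&params);
//     const char msg[] = "hello";
//     if (q && pmq_enqueue_msg(q, msg, sizeof msg))
//         pmq_sync(q);    // force durability now
//     if (q)
//         pmq_destroy(q); // syncs again and frees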




static void pmq_init_chunk_store_size(Chunk_Store *cks, uint64_t capacity_bytes)
{
    pmq_assert(pmq_is_power_of_2(capacity_bytes));
    pmq_assert(capacity_bytes >= PMQ_Megabytes(64));

    cks->capacity_bytes = capacity_bytes;

    uint64_t chunks_count = cks->capacity_bytes >> PMQ_CHUNK_SHIFT;

    // What is a reasonable watermark at which we should start discarding
    // chunks? Note that while discarding a chunk is logically only advancing
    // a CSN cursor, it's very expensive because we have to fsync() that
    // updated cursor to disk.
    // For now, I'm deciding to set them to chunks_count minus 512 (low) resp.
    // minus 256 (high).
    // On each discard we'll be discarding between (high_mark - low_mark) and
    // (chunks_count - low_mark) chunks, i.e. between 16 and 32 MiB of data.
    // These values should be fair when targeting a reasonable throughput of
    // 2 GB/sec and an fsync() latency of ~5 ms.
    cks->chunks_low_watermark = chunks_count - 512;
    cks->chunks_high_watermark = chunks_count - 256;

    pmq_assert(cks->chunks_low_watermark < chunks_count);
    pmq_assert(cks->chunks_high_watermark < chunks_count);
    pmq_assert(cks->chunks_low_watermark < cks->chunks_high_watermark);
}

static bool pmq_init_createnew(PMQ *q, const PMQ_Init_Params *params)
{
    const char *basedir_path = q->basedir_path.get().buffer;

    if (mkdir(basedir_path, 0750) == -1)
    {
        pmq_perr_ef(errno, "Failed to create queue directory %s", basedir_path);
        return false;
    }

    q->basedir_fd = pmq_open_dir(basedir_path);

    if (! q->basedir_fd.valid())
    {
        pmq_perr_ef(errno,
                    "Failed to open the directory we created: %s", basedir_path);
        return false;
    }

    // Initialize In_Queue_Cursors to all 0.
    {
        q->enqueuer.in_queue_cursors = In_Queue_Cursors {};
    }

    // Initialize persister cursors to all 0.
    {
        q->persister.persist_cursors = Persist_Cursors {};
    }

    // Create the slots-file.
    // The slots-file is called "wal.dat" but it's not really a WAL -- only a
    // buffer to store the remaining slots that didn't make a complete chunk
    // page.
    {
        q->slotsfile_fd = pmq_openat_regular_create(q->basedir_fd.get(),
                                                    "wal.dat", O_RDWR, 0644);

        if (! q->slotsfile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to create slots file (wal.dat)");
            return false;
        }

        // TODO: currently this must be the same size as the in-memory slots
        // buffer. Fix this; we only need a tiny file on disk to persist the
        // remaining slots that didn't fill a complete chunk page.
        q->slotsfile_size_bytes = q->in_queue.size_bytes;

        if (fallocate(q->slotsfile_fd.get(), FALLOC_FL_ZERO_RANGE,
                      0, q->slotsfile_size_bytes) == -1)
        {
            pmq_perr_ef(errno, "Failed to fallocate() slots file");
            return false;
        }
    }

    // Create the chunk store.
    {
        Chunk_Store *cks = &q->chunk_store;

        uint64_t create_size = params->create_size;

        if (create_size == 0)
            create_size = PMQ_Gigabytes(1); // default to 1 GiB

        if (create_size < PMQ_Megabytes(64))
        {
            pmq_perr_f("PMQ_Init_Params::create_size is invalid: "
                       "Must be at least 64 MiB. Requested: %" PRIu64, create_size);
            return false;
        }

        if (! pmq_is_power_of_2(create_size))
        {
            pmq_warn_f("PMQ_Init_Params::create_size is not a power of 2: %" PRIu64, create_size);
            // Round up to the next power of 2: double the size, then clear
            // low bits until only the top bit remains.
            create_size *= 2;
            while (! pmq_is_power_of_2(create_size))
                create_size = create_size & (create_size - 1);
            pmq_warn_f("PMQ_Init_Params::create_size was rounded up to the next power of 2: %" PRIu64, create_size);
        }

        pmq_init_chunk_store_size(cks, create_size);

        cks->chunk_fd = pmq_openat_regular_create(q->basedir_fd.get(),
                                                  "chunks.dat", O_RDWR, 0644);

        if (! cks->chunk_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to create chunks file");
            return false;
        }

        if (fallocate(cks->chunk_fd.get(), FALLOC_FL_ZERO_RANGE,
                      0, cks->capacity_bytes) == -1)
        {
            pmq_perr_ef(errno, "Failed to fallocate() chunks file"
                        " to size %" PRIu64, cks->capacity_bytes);
            return false;
        }
    }

    // Create the state.dat file.
    {
        q->statefile_fd = pmq_openat_regular_create(q->basedir_fd.get(),
                                                    "state.dat", O_RDWR, 0644);

        if (! q->statefile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open state.dat file");
            return false;
        }

        // Is it ok to try and reuse the pmq_commit() function to initialize
        // the file?
        if (! pmq_commit(q))
            return false;
    }

    // Sync the basedir to make sure the new files are persisted.
    {
        if (fsync(q->basedir_fd.get()) == -1)
        {
            pmq_perr_ef(errno, "Error from fsync() on base directory");
            return false;
        }
    }

    return true;
}
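
// Resulting on-disk layout (for reference):
//
//   <basedir>/
//     wal.dat    -- slots file: leftover slots that don't fill a whole chunk
//     chunks.dat -- chunk store: fixed-size 64 KiB chunks, ring-indexed
//     state.dat  -- commit record: cursors and file sizes (see Commit_Record)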

static bool __pmq_validate_commit_record_weak_ordering(
        uint64_t sn_lo, uint64_t sn_hi, const char *name_lo, const char *name_hi)
{
    if (! _sn64_le(sn_lo, sn_hi))
    {
        pmq_perr_f("Integrity error in state.dat file: We expected %s <= %s"
                   " but their values are %" PRIu64 " > %" PRIu64,
                   name_lo, name_hi, sn_lo, sn_hi);
        return false;
    }
    return true;
}

template<typename T>
static bool _pmq_validate_commit_record_weak_ordering(
        T sn_lo, T sn_hi, const char *name_lo, const char *name_hi)
{
    return __pmq_validate_commit_record_weak_ordering(
            sn_lo.value(), sn_hi.value(), name_lo, name_hi);
}

#define pmq_validate_commit_record_weak_ordering(cr, lo, hi) \
    _pmq_validate_commit_record_weak_ordering((cr).lo, (cr).hi, #lo, #hi)

static bool pmq_inithelper_check_file_size(
        int fd, uint64_t expected_file_size, const char *what_file)
{
    pmq_assert(fd >= 0);

    struct stat st;

    if (fstat(fd, &st) == -1)
    {
        pmq_perr_ef(errno, "Failed to fstat() %s", what_file);
        return false;
    }

    if (! S_ISREG(st.st_mode))
    {
        pmq_perr_f("Internal error: Expected regular file");
        return false;
    }

    uint64_t actual_file_size = (uint64_t) st.st_size;

    if (actual_file_size != expected_file_size)
    {
        pmq_perr_f("%s has wrong size. Expected: %" PRIu64 ", got: %" PRIu64,
                   what_file, expected_file_size, actual_file_size);
        return false;
    }

    return true;
}

static bool pmq_init_loadexisting(PMQ *q)
{
    // Open the state file.
    {
        q->statefile_fd = pmq_openat_regular_existing(q->basedir_fd.get(),
                                                      "state.dat", O_RDWR);

        if (! q->statefile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open state.dat file");
            return false;
        }
    }

    Commit_Record commit_record;

    // Load the commit record and store it in commit_record.
    {
        if (! pmq_pread_all(q->statefile_fd.get(),
                            Untyped_Slice(&commit_record, sizeof commit_record),
                            0, "state.dat"))
        {
            return false;
        }

        if (! pmq_validate_commit_record_weak_ordering(commit_record, cks_discard_csn, cks_csn))
            return false;

        if (! pmq_validate_commit_record_weak_ordering(commit_record, cks_ssn, wal_ssn))
            return false;

        if (! pmq_validate_commit_record_weak_ordering(commit_record, cks_msn, wal_msn))
            return false;

        {
            uint64_t file_size = commit_record.chunkfile_size_bytes;

            if ((file_size % PMQ_CHUNK_SIZE) != 0)
            {
                pmq_perr_f(
                        "state.dat file contains invalid chunkfile size: "
                        "%" PRIu64 " which is not a multiple of the chunk size "
                        "(%" PRIu64 ")", file_size, PMQ_CHUNK_SIZE);
                return false;
            }

            uint64_t chunks_count = file_size / PMQ_CHUNK_SIZE;
            CSN csn_lo = commit_record.cks_discard_csn;
            CSN csn_hi = commit_record.cks_csn;

            if (csn_hi - csn_lo > chunks_count)
            {
                pmq_perr_f("state.dat cks_discard_csn=%" PRIu64 ", cks_csn=%" PRIu64,
                           csn_lo.value(), csn_hi.value());
                pmq_perr_f(
                        "state.dat file contains invalid chunk cursor positions: "
                        "Their distance exceeds the size of the chunks-file "
                        "(%" PRIu64 " > %" PRIu64 ").", csn_hi - csn_lo, chunks_count);
                return false;
            }
        }

        {
            uint64_t file_size = commit_record.slotsfile_size_bytes;

            if ((file_size % PMQ_SLOT_SIZE) != 0)
            {
                pmq_perr_f(
                        "state.dat file contains invalid slots-file size: "
                        "%" PRIu64 " which is not a multiple of the slot size "
                        "(%" PRIu64 ")", file_size, PMQ_SLOT_SIZE);
                return false;
            }

            uint64_t slots_count = file_size / PMQ_SLOT_SIZE;
            SSN ssn_lo = commit_record.cks_ssn;
            SSN ssn_hi = commit_record.wal_ssn;

            if (ssn_hi - ssn_lo > slots_count)
            {
                pmq_perr_f(
                        "state.dat file contains invalid slot cursor positions: "
                        "Their distance exceeds the size of the slots-file.");
                return false;
            }
        }
    }

    // TODO: Currently the slots-file and the in-memory slots-ringbuffer are
    // the same size. Later, make the slots-file smaller (just because it
    // doesn't need to be very big) and be very careful how to load it to
    // memory.
    {
        q->slotsfile_size_bytes = commit_record.slotsfile_size_bytes;
        q->slotsfile_fd = pmq_openat_regular_existing(q->basedir_fd.get(),
                                                      "wal.dat", O_RDWR);

        if (! q->slotsfile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open slots file (wal.dat)");
            return false;
        }

        if (! pmq_inithelper_check_file_size(q->slotsfile_fd.get(),
                                             q->slotsfile_size_bytes, "slots-file (wal.dat)"))
        {
            return false;
        }

        if (! pmq_pread_all(
                    q->slotsfile_fd.get(),
                    q->in_queue.slots.as_slice().untyped(),
                    0, "slots-file (wal.dat)"))
        {
            pmq_perr_f("Failed to read from slots file to in-memory slots ringbuffer");
            return false;
        }
    }

    // Load the chunk store.
    {
        Chunk_Store *cks = &q->chunk_store;

        pmq_init_chunk_store_size(cks, commit_record.chunkfile_size_bytes);

        cks->chunk_fd = pmq_openat_regular_existing(q->basedir_fd.get(),
                                                    "chunks.dat", O_RDWR);

        if (! cks->chunk_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open chunks.dat file");
            return false;
        }

        if (! pmq_inithelper_check_file_size(cks->chunk_fd.get(),
                                             cks->capacity_bytes, "chunk file (chunks.dat)"))
        {
            return false;
        }
    }

    // Initialize In_Queue_Cursors.
    {
        In_Queue_Cursors ic;
        ic.msn = commit_record.wal_msn;
        ic.ssn_mem = commit_record.wal_ssn;
        ic.msn_disk = commit_record.cks_msn;
        ic.ssn_disk = commit_record.cks_ssn;

        q->pub_in_queue_cursors = ic;
    }

    // Initialize persister cursors.
    {
        Persist_Cursors pc;
        pc.wal_ssn = commit_record.wal_ssn;
        pc.wal_msn = commit_record.wal_msn;
        pc.cks_csn = commit_record.cks_csn;
        pc.cks_msn = commit_record.cks_msn;
        pc.cks_ssn = commit_record.cks_ssn;
        pc.cks_discard_csn = commit_record.cks_discard_csn;

        q->pub_persist_cursors.store(pc);
    }

    return true;
}

static bool pmq_init(PMQ *q, const PMQ_Init_Params *params)
{
    q->basedir_path.set(params->basedir_path);

    const char *basedir_path = q->basedir_path.get().buffer;

    // Set up the In_Queue.
    // This is currently independent of any database state, so we can do it
    // first.
    {
        // TODO: how to find a proper size (slot count) for the In_Queue
        // buffer? For most use cases, we don't need extremely high bandwidth,
        // but we should think about making it tunable and come up with
        // recommendations. Or even allow it to be sized dynamically.

        q->in_queue.slot_count = 512 * 1024; // each slot is 128 bytes
        q->in_queue.size_bytes = q->in_queue.slot_count * PMQ_SLOT_SIZE;
        q->in_queue.slots_persist_watermark = q->in_queue.slot_count / 2;

        pmq_debug_f("in-queue size: %" PRIu64 " (%" PRIu64 " slots)",
                    q->in_queue.size_bytes, q->in_queue.slot_count);

        // We could consider making an SHM file here to back the In_Queue
        // memory, making the In_Queue persist across application restarts.
        // This would allow recovering any message that was successfully
        // enqueued to the In_Queue (unless the machine was also restarted or
        // crashed before recovery). On the other hand, it would require
        // elaborate recovery code.

        if (! q->in_queue.mapping.create(NULL, q->in_queue.size_bytes,
                PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0))
        {
            pmq_perr_ef(errno, "Failed to mmap() queue memory");
            return false;
        }

        PMQ_Slot *slots = (PMQ_Slot *) q->in_queue.mapping.get();
        pmq_assert(slots);
        __pmq_assert_aligned(slots, 16);

        q->in_queue.slots.reset(Slice<PMQ_Slot>(slots, q->in_queue.slot_count));
    }


    // Create or load the on-disk database.

    q->basedir_fd = pmq_open_dir(basedir_path);

    if (! q->basedir_fd.valid())
    {
        if (! (errno == ENOTDIR || errno == ENOENT))
        {
            pmq_perr_ef(errno, "Failed to open queue directory at %s",
                        basedir_path);
            return false;
        }

        pmq_msg_f("No queue directory present at %s", basedir_path);
        pmq_msg_f("Creating new queue directory at %s", basedir_path);

        if (! pmq_init_createnew(q, params))
        {
            pmq_perr_f("Failed to create queue directory at %s", basedir_path);
            return false;
        }
    }
    else
    {
        pmq_msg_f("Loading existing queue from %s", basedir_path);

        if (! pmq_init_loadexisting(q))
        {
            pmq_perr_f("Failed to load queue directory at %s", basedir_path);
            return false;
        }

        if (params->create_size != 0 &&
            params->create_size != q->chunk_store.capacity_bytes)
        {
            pmq_warn_f("NOTE: Configured chunk store size is %" PRIu64
                       " bytes, which is different from the size of the existing"
                       " chunk store: %" PRIu64 " bytes."
                       " The chunk store size configuration is currently only"
                       " considered when creating a new chunk store.",
                       params->create_size,
                       q->chunk_store.capacity_bytes);
        }
    }

    // Set up cursors.
    q->enqueuer.in_queue_cursors = q->pub_in_queue_cursors;
    q->persister.persist_cursors = q->pub_persist_cursors.load();

    // Initialize the Chunk_Queue.
    {
        Chunk_Queue *cq = &q->chunk_queue;
        cq->cq_csn = q->persister.persist_cursors.cks_csn;
        cq->cq_ssn = q->persister.persist_cursors.cks_ssn;
        cq->cq_msn = q->persister.persist_cursors.cks_msn;

        cq->chunks_alloc_slice.allocate(2); // only 2 chunk buffers

        {
            uint64_t map_size_bytes = cq->chunks_alloc_slice.capacity() * PMQ_CHUNK_SIZE;

            if (! cq->chunk_buffer_mapping.create(NULL, map_size_bytes,
                    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0))
            {
                pmq_perr_ef(errno,
                            "Failed to mmap() %" PRIu64 " bytes for chunk buffer",
                            map_size_bytes);
                return false;
            }
        }

        cq->chunks.reset(cq->chunks_alloc_slice.slice());
        for (uint64_t i = 0; i < cq->chunks.slot_count(); i++)
        {
            Chunk_Buffer *cb = cq->chunks.get_slot_for(CSN(i));
            cb->data = (char *) cq->chunk_buffer_mapping.get() + (i * PMQ_CHUNK_SIZE);
            // initialized later, anyway...
            cb->msn = MSN(0);
            cb->ssn = SSN(0);
        }

        // The current chunk page buffer starts out empty.
        cq->msg_count = 0;

        // Since each message is at least 1 byte large, and requires a 2-byte
        // offset stored as well, we can have no more than this number of
        // messages (and thus offsets) in each chunk.
        cq->offsets.allocate(PMQ_CHUNK_SIZE / 3);

        // Set up for submitting messages to the first chunk buffer in the
        // Chunk_Queue.
        // NOTE: I think this will use the Chunk_Buffer identified by the
        // value of cks_csn after the queue was loaded.
        if (! pmq_begin_current_chunk_buffer(q))
            return false;
    }

    {
        In_Queue_Cursors ic = q->enqueuer.in_queue_cursors;
        pmq_debug_f("in_queue_cursors.msn: %" PRIu64, ic.msn.value());
        pmq_debug_f("in_queue_cursors.ssn_mem: %" PRIu64, ic.ssn_mem.value());
        pmq_debug_f("in_queue_cursors.msn_disk: %" PRIu64, ic.msn_disk.value());
        pmq_debug_f("in_queue_cursors.ssn_disk: %" PRIu64, ic.ssn_disk.value());
    }

    {
        Chunk_Queue *cq = &q->chunk_queue;
        pmq_debug_f("chunk_queue.cq_csn: %" PRIu64, cq->cq_csn.value());
        pmq_debug_f("chunk_queue.cq_msn: %" PRIu64, cq->cq_msn.value());
        pmq_debug_f("chunk_queue.cq_ssn: %" PRIu64, cq->cq_ssn.value());
    }

    {
        Persist_Cursors pc = q->persister.persist_cursors;
        pmq_debug_f("persister.wal_ssn: %" PRIu64, pc.wal_ssn.value());
        pmq_debug_f("persister.wal_msn: %" PRIu64, pc.wal_msn.value());
        pmq_debug_f("persister.cks_csn: %" PRIu64, pc.cks_csn.value());
        pmq_debug_f("persister.cks_msn: %" PRIu64, pc.cks_msn.value());
        pmq_debug_f("persister.cks_ssn: %" PRIu64, pc.cks_ssn.value());
        pmq_debug_f("persister.cks_discard_csn: %" PRIu64, pc.cks_discard_csn.value());
    }

    return true;
}

PMQ *pmq_create(const PMQ_Init_Params *params)
{
    // Plain new would throw on allocation failure, making the null check
    // below dead code; use the nothrow form so the check is meaningful.
    PMQ *q = new (std::nothrow) PMQ();
    if (! q)
        return nullptr;
    if (! pmq_init(q, params))
    {
        delete q;
        return nullptr;
    }
    return q;
}

// Makes sure that all messages written so far are synced to disk, then frees
// the queue. When calling this function, all concurrent accesses (e.g.
// pmq_enqueue_msg()) must have returned, and no new ones may be started.
void pmq_destroy(PMQ *q)
{
    if (! pmq_sync(q))
    {
        pmq_warn_f("Failed to sync the queue before shutting it down");
    }
    delete q;
}




/* PMQ_Reader */

// To be able to read the newest messages, which may not be persisted to the
// chunk store but only to the slots-file / in_queue, we need multiple read
// modes. We keep track of where we're reading explicitly because we need to
// reset the appropriate cursors whenever we switch between the modes.
enum PMQ_Read_Mode
{
    PMQ_Read_Mode_Chunkstore,
    PMQ_Read_Mode_Slotsfile,
};

// Read state for reading from the slots file.
struct PMQ_Slots_Readstate
{
    SSN ssn;
};

// Read state for reading from the chunk store.
struct PMQ_Chunks_Readstate
{
    // tracks the current csn
    CSN cnk_csn;

    // tracks whether the chunk indicated by cnk_csn is loaded.
    bool cnk_loaded;

    // This data is extracted from the chunk page (only valid if cnk_loaded).
    MSN cnk_msn;
    uint64_t cnk_msgcount;
    Alloc_Slice<unsigned char> cnk_buffer;
    Slice<uint16_t> cnk_msgoffsets; // msg-offsets (a subrange inside the cnk_buffer)
};

struct PMQ_Reader
{
    PMQ *q;

    // To prevent races, the reader has its own copy of the Persist_Cursors.
    // They get updated only before reading a message.
    Persist_Cursors persist_cursors;

    // The MSN of the next message we're going to read.
    // Gets incremented each time pmq_read_msg() is called.
    MSN msn;

    PMQ_Read_Mode read_mode;

    // Place to store an error. This will prevent reading after an error.
    // Subsequent seeking (if successful) will clear the error.
    PMQ_Read_Result last_result;

    PMQ_Slots_Readstate slots_readstate;
    PMQ_Chunks_Readstate chunks_readstate;
};

struct PMQ_Msg_Output
{
    void *data;
    // size of the data buffer
    size_t data_size;
    // where the caller wants the size of the message to be written.
    size_t *size_out;

    PMQ_Msg_Output(void *data, size_t data_size, size_t *size_out)
        : data(data), data_size(data_size), size_out(size_out)
    {}
};

static void pmq_reader_update_persist_cursors(PMQ_Reader *reader)
{
    reader->persist_cursors = reader->q->pub_persist_cursors.load();
}

static bool pmq_reader_validate_chunk_hdr(PMQ_Chunks_Readstate *ckread, Pointer<const PMQ_Chunk_Hdr> hdr)
{
    if (hdr->msgcount == 0)
    {
        pmq_perr_f("Read invalid chunk %" PRIu64 ": msgcount is 0.",
                   ckread->cnk_csn.value());
        return false;
    }

    uint64_t off_end = (uint64_t) hdr->msgoffsets_off + (hdr->msgcount + 1) * sizeof (uint16_t);
    if (off_end > PMQ_CHUNK_SIZE)
    {
        pmq_perr_f("Read invalid chunk %" PRIu64 ": msg-offsets array exceeds chunk size."
                   " msgcount: %" PRIu64 ", msgoffsets_off: %" PRIu64,
                   ckread->cnk_csn.value(), (uint64_t) hdr->msgcount, (uint64_t) hdr->msgoffsets_off);
        return false;
    }

    return true;
}

static PMQ_Read_Result pmq_load_chunk(PMQ_Reader *reader)
{
    pmq_assert(reader->read_mode == PMQ_Read_Mode_Chunkstore);

    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;

    pmq_assert(!ckread->cnk_loaded);

    PMQ *q = reader->q;

    if (sn64_le(reader->persist_cursors.cks_csn, ckread->cnk_csn))
    {
        return PMQ_Read_Result_EOF;
    }

    Chunk_Store *cks = &q->chunk_store;

    // Load the chunk.
    uint64_t mask = pmq_mask_power_of_2(q->chunk_store.chunk_count());
    uint64_t index = ckread->cnk_csn.value() & mask;
    uint64_t offset = index << PMQ_CHUNK_SHIFT;

    Untyped_Slice buffer_slice = ckread->cnk_buffer.untyped_slice();
    pmq_assert(buffer_slice.size() == PMQ_CHUNK_SIZE);

    if (! pmq_pread_all(
            cks->chunk_fd.get(), buffer_slice, offset, "chunk in chunk file"))
    {
        return PMQ_Read_Result_IO_Error;
    }

    Pointer<const PMQ_Chunk_Hdr> hdr = (PMQ_Chunk_Hdr *) buffer_slice.data();

    // First check if the chunk is still supposed to be there -- it might have
    // been overwritten by a newer chunk.
    {
        pmq_reader_update_persist_cursors(reader);
        if (sn64_lt(ckread->cnk_csn, reader->persist_cursors.cks_discard_csn))
        {
            pmq_debug_f("LOST SYNC: ckread->cnk_csn=%" PRIu64 ", reader->persist_cursors.cks_discard_csn=%" PRIu64,
                        ckread->cnk_csn.value(), reader->persist_cursors.cks_discard_csn.value());
            return PMQ_Read_Result_Out_Of_Bounds;
        }
    }

    if (! pmq_reader_validate_chunk_hdr(ckread, hdr))
    {
        return PMQ_Read_Result_Integrity_Error;
    }

    // Initial validation of the loaded chunk completed.
    // Set variables and return success.

    ckread->cnk_msn = hdr->msn;
    ckread->cnk_msgcount = hdr->msgcount;
    ckread->cnk_msgoffsets = Slice<uint16_t>(
            (uint16_t *) ((char *) ckread->cnk_buffer.data() + hdr->msgoffsets_off),
            hdr->msgcount + 1);
    ckread->cnk_loaded = true;

    // Set the reader's MSN to this chunk's msn too.
    reader->msn = hdr->msn;

    return PMQ_Read_Result_Success;
}
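
// Design note (illustration only): pmq_load_chunk() deliberately reads the
// chunk first and re-checks cks_discard_csn afterwards. If the chunk was
// recycled while it was being read, the cursor check catches the torn read
// and reports Out_Of_Bounds instead of handing back mixed data.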
|
|
|
|
static void pmq_reset_to_specific_chunk(PMQ_Reader *reader, CSN csn)
{
    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;
    ckread->cnk_loaded = false;
    ckread->cnk_csn = csn;
}

static PMQ_Read_Result pmq_reset_to_specific_chunk_and_load(
        PMQ_Reader *reader, CSN csn)
{
    pmq_reset_to_specific_chunk(reader, csn);
    return pmq_load_chunk(reader);
}

static void pmq_reader_copy_chunk_header(PMQ_Chunks_Readstate *ckread, PMQ_Chunk_Hdr *out)
{
    *out = *(PMQ_Chunk_Hdr *) ckread->cnk_buffer.data();
}

static bool pmq_check_chunk_msns(CSN csn_lo, CSN csn_hi,
        const PMQ_Chunk_Hdr *hdr_lo, const PMQ_Chunk_Hdr *hdr_hi)
{
    if (sn64_lt(csn_hi, csn_lo))
        return pmq_check_chunk_msns(csn_hi, csn_lo, hdr_hi, hdr_lo);

    MSN cnk_lo_last_msn = hdr_lo->msn + hdr_lo->msgcount;
    MSN cnk_hi_first_msn = hdr_hi->msn;

    if (csn_lo + 1 == csn_hi)
    {
        if (cnk_lo_last_msn != cnk_hi_first_msn)
        {
            pmq_perr_f("Integrity error while reading chunks: MSN %" PRIu64
                       " was expected in the chunk following chunk %" PRIu64
                       " but found %" PRIu64,
                       cnk_lo_last_msn.value(),
                       csn_lo.value(),
                       cnk_hi_first_msn.value());
            return false;
        }
    }
    else
    {
        // Maybe we should check sn64_lt() instead of sn64_le(), because
        // chunks must contain at least 1 message, at least currently.
        if (! sn64_le(cnk_lo_last_msn, cnk_hi_first_msn))
        {
            pmq_perr_f("Integrity error while reading chunks: Chunk SN %" PRIu64
                       " < %" PRIu64 " but these chunks have low / high MSNs "
                       "%" PRIu64 " > %" PRIu64,
                       csn_lo.value(),
                       csn_hi.value(),
                       cnk_lo_last_msn.value(),
                       cnk_hi_first_msn.value());
            return false;
        }
    }
    return true;
}

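/* Invariant enforced above: chunks tile the MSN sequence without gaps or
 * overlaps. For example, if chunk 41 starts at MSN 1000 with msgcount 17,
 * then chunk 42 must start exactly at MSN 1017, and any later chunk must
 * start at an MSN >= 1017.
 */
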
static PMQ_Read_Result pmq_bsearch_msg(PMQ_Reader *reader, MSN msn, CSN csn_lo, CSN csn_hi)
{
    if (reader->read_mode != PMQ_Read_Mode_Chunkstore)
    {
        reader->read_mode = PMQ_Read_Mode_Chunkstore;
        reader->chunks_readstate.cnk_loaded = false;
    }

    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;

    bool hdr_valid = false;
    PMQ_Chunk_Hdr hdr;
    CSN hdr_csn;

    for (;;)
    {
        CSN csn = csn_lo + (csn_hi - csn_lo) / 2;

        PMQ_Read_Result readres = pmq_reset_to_specific_chunk_and_load(reader, csn);

        if (readres == PMQ_Read_Result_Out_Of_Bounds)
        {
            // Assuming that csn_lo was valid when we were called,
            // we now have a situation where the chunk was concurrently discarded.
            if (csn == csn_lo)
            {
                // Already at the final iteration (csn_lo + 1 == csn_hi). The
                // search space is now empty.
                return PMQ_Read_Result_Out_Of_Bounds;
            }
            // Shrink the search space: adapt the lower boundary to account
            // for the concurrently discarded data.
            csn_lo = csn + 1;
        }
        else if (readres == PMQ_Read_Result_EOF)
        {
            // Could this happen? I believe not. We're assuming that at the start,
            // csn_lo == csn_hi or csn_hi - 1 was valid.
            assert(0);
            // Defensive: with assertions disabled, retrying with the same
            // bounds would loop forever, so bail out instead.
            return PMQ_Read_Result_Out_Of_Bounds;
        }
        else if (readres != PMQ_Read_Result_Success)
        {
            return readres;
        }
        else
        {
            if (hdr_valid)
            {
                PMQ_Chunk_Hdr old_hdr = hdr;
                CSN old_csn = hdr_csn;

                pmq_reader_copy_chunk_header(ckread, &hdr);
                hdr_csn = csn;

                if (! pmq_check_chunk_msns(csn, old_csn, &hdr, &old_hdr))
                {
                    return PMQ_Read_Result_Integrity_Error;
                }
            }
            else
            {
                pmq_reader_copy_chunk_header(ckread, &hdr);
                hdr_csn = csn;
                hdr_valid = true;
            }

            if (sn64_lt(msn, ckread->cnk_msn))
            {
                if (csn == csn_lo)
                    // already final iteration
                    return PMQ_Read_Result_Out_Of_Bounds;
                csn_hi = csn;
            }
            else if (sn64_ge(msn, ckread->cnk_msn + ckread->cnk_msgcount))
            {
                if (csn == csn_lo)
                    // already final iteration
                    return PMQ_Read_Result_Out_Of_Bounds;
                csn_lo = csn + 1;
            }
            else
            {
                // Message is inside this chunk.
                return PMQ_Read_Result_Success;
            }
        }
    }
}

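/* Worked example of the search, with assumed values: suppose chunks 10..13
 * cover MSN ranges [100,120), [120,125), [125,160), [160,161) and we look
 * for MSN 126 with csn_lo=10, csn_hi=13. The first midpoint is CSN 11, whose
 * range [120,125) is too low, so csn_lo becomes 12; the next midpoint is
 * CSN 12, whose range [125,160) contains 126, and the loop returns Success
 * with chunk 12 loaded.
 */
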
static PMQ_Read_Result pmq_reader_seek_to_msg_chunkstore(PMQ_Reader *reader, MSN msn)
{
    Persist_Cursors pc = reader->persist_cursors;

    pmq_assert(sn64_le(pc.cks_discard_csn, pc.cks_csn));
    if (pc.cks_discard_csn == pc.cks_csn)
    {
        // The store is empty.
        // Since we already detected that msn is older than pc.cks_msn, we
        // return Out_Of_Bounds, not EOF.
        return PMQ_Read_Result_Out_Of_Bounds;
    }

    CSN csn_lo = pc.cks_discard_csn;
    CSN csn_hi = pc.cks_csn - 1;

    PMQ_Read_Result result = pmq_bsearch_msg(reader, msn, csn_lo, csn_hi);

    if (result != PMQ_Read_Result_Success)
        return result;

    // Currently setting the msn only after the appropriate chunk was found
    // and loaded successfully. We might want to change this later.
    reader->msn = msn;
    return PMQ_Read_Result_Success;
}

struct PMQ_Slot_Header_Read_Result
{
    bool is_leader_slot;
    uint16_t msgsize;
    uint16_t nslots_req;
};

// Helper for functions that read slots.
// NOTE: Enqueuer lock must be held!
static PMQ_Read_Result pmq_read_slot_header(PMQ *q, SSN ssn, PMQ_Slot_Header_Read_Result *out)
{
    //XXX this code is copied and adapted from pmq_compact()

    const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);

    // Extract message size from slot header. A message occupies
    // ceil(msgsize / PMQ_SLOT_SPACE) consecutive slots.
    out->is_leader_slot = (slot->flags & PMQ_SLOT_LEADER_MASK) != 0;
    out->msgsize = slot->msgsize;
    out->nslots_req = (slot->msgsize + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE;

    // TODO validate msgsize field, does it make sense?

    return PMQ_Read_Result_Success;
}

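/* Slot math example, using the constants at the top of this file
 * (PMQ_SLOT_SIZE = 128 with a 16-byte header, so PMQ_SLOT_SPACE = 112):
 * a 300-byte message occupies (300 + 111) / 112 = 3 consecutive slots,
 * carrying 112 + 112 + 76 payload bytes; only the first slot has
 * PMQ_SLOT_LEADER_MASK set in its flags.
 */
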
// Seek a message in the slots file.
// Currently this requires locking the enqueuer and a linear scan.
// We should look for ways to improve this.
static PMQ_Read_Result pmq_reader_seek_to_msg_slotsfile(PMQ_Reader *reader, MSN msn)
{
    Persist_Cursors pc = reader->persist_cursors;
    assert(sn64_inrange(msn, pc.cks_msn, pc.wal_msn)); // checked in caller

    std::lock_guard<std::mutex> lock(reader->q->enqueue_mutex);

    // To prevent races, we need to check again, using the enqueuer's cursors,
    // that the MSN that we're looking for is still in the In_Queue.

    In_Queue_Cursors *ic = &reader->q->enqueuer.in_queue_cursors;
    if (sn64_inrange(msn, ic->msn_disk, ic->msn))
    {
        if (sn64_inrange(pc.wal_msn, msn, ic->msn))
        // This is almost guaranteed, but there is a race that should be
        // impossible in practice (it requires ic->msn to wrap around between
        // msn and pc.wal_msn).
        {
            MSN msn_cur = ic->msn_disk;
            SSN ssn_cur = ic->ssn_disk;

            while (msn_cur != msn)
            {
                if (sn64_ge(ssn_cur, pc.wal_ssn))
                {
                    pmq_perr_f("Integrity Error: Reached end of persisted region in slotsfile "
                               "but did not encounter msn=%" PRIu64, msn.value());
                    return PMQ_Read_Result_Integrity_Error;
                }

                PMQ_Slot_Header_Read_Result slot_read_result;

                if (PMQ_Read_Result readres = pmq_read_slot_header(reader->q, ssn_cur, &slot_read_result);
                    readres != PMQ_Read_Result_Success)
                {
                    return readres;
                }

                if (! slot_read_result.is_leader_slot)
                {
                    // Earlier there was an assert() here instead of an integrity check,
                    // assuming that RAM should never be corrupted. However, the RAM might
                    // be filled from disk, and we currently don't validate the data after
                    // loading. Thus we now consider slot memory just as corruptible as
                    // disk data.
                    pmq_perr_f("Integrity Error: slot %" PRIu64 " is not a leader slot.", ssn_cur.value());
                    return PMQ_Read_Result_Integrity_Error;
                }

                if (pc.wal_ssn - ssn_cur < slot_read_result.nslots_req)
                {
                    pmq_perr_f("Integrity Error: forwarding %d slots through the slots file"
                               " would skip over the persisted region", (int) slot_read_result.nslots_req);
                    pmq_perr_f("current msn=%" PRIu64 ", ssn=%" PRIu64 ", last valid slot is %" PRIu64,
                               msn_cur.value(), ssn_cur.value(), pc.wal_ssn.value());
                    return PMQ_Read_Result_Integrity_Error;
                }

                ssn_cur += slot_read_result.nslots_req;
                msn_cur += 1;
            }

            reader->read_mode = PMQ_Read_Mode_Slotsfile;
            reader->slots_readstate.ssn = ssn_cur;
            return PMQ_Read_Result_Success;
        }
    }

    // If we missed the window (race condition), we can expect to find the
    // message in the chunk store.
    return pmq_reader_seek_to_msg_chunkstore(reader, msn);
}

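/* The scan above hops from leader slot to leader slot: starting at
 * (msn_disk, ssn_disk), each step advances ssn_cur by the current message's
 * nslots_req and msn_cur by one, so on loop exit ssn_cur is the leader slot
 * of `msn`. With assumed sizes, messages of 50, 300 and 10 bytes starting at
 * SSN 7 have their leader slots at SSNs 7, 8 and 11 respectively.
 */
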
static PMQ_Read_Result pmq_reader_seek_to_msg_impl_real(PMQ_Reader *reader, MSN msn)
{
    pmq_reader_update_persist_cursors(reader);

    Persist_Cursors pc = reader->persist_cursors;

    if (sn64_ge(msn, pc.cks_msn))
    {
        if (sn64_gt(msn, pc.wal_msn))
        {
            return PMQ_Read_Result_Out_Of_Bounds;
        }
        return pmq_reader_seek_to_msg_slotsfile(reader, msn);
    }

    return pmq_reader_seek_to_msg_chunkstore(reader, msn);
}

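/* The dispatch above partitions the MSN space by the persist cursors:
 *
 *     msn <  cks_msn             -> already compacted into the chunk store
 *     cks_msn <= msn <= wal_msn  -> still available from the slots file
 *     msn >  wal_msn             -> not yet persisted (Out_Of_Bounds)
 */
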
static PMQ_Read_Result pmq_reader_seek_to_msg_impl(PMQ_Reader *reader, MSN msn)
{
    PMQ_Read_Result result = pmq_reader_seek_to_msg_impl_real(reader, msn);
    reader->last_result = result;

    if (result == PMQ_Read_Result_Success)
    {
        reader->msn = msn;
    }
    else
    {
        pmq_assert(result != PMQ_Read_Result_EOF); // seeking shouldn't return EOF
    }
    return result;
}

PMQ_Read_Result pmq_reader_seek_to_msg(PMQ_Reader *reader, uint64_t msn_value)
{
    MSN msn = MSN(msn_value);
    return pmq_reader_seek_to_msg_impl(reader, msn);
}

PMQ_Read_Result pmq_reader_seek_to_current(PMQ_Reader *reader)
{
    pmq_reader_update_persist_cursors(reader);

    MSN msn = reader->persist_cursors.wal_msn;
    pmq_debug_f("Try seeking to MSN %" PRIu64, msn.value());

    return pmq_reader_seek_to_msg_impl(reader, msn);
}

PMQ_Read_Result pmq_reader_seek_to_csn_impl(PMQ_Reader *reader, CSN csn)
{
    Persist_Cursors *pc = &reader->persist_cursors;

    if (uint64_t chunks_in_store = pc->cks_csn - pc->cks_discard_csn;
        csn - pc->cks_discard_csn >= chunks_in_store)
    {
        if (csn == pc->cks_csn)
        {
            // While we cannot know the msn from a chunk (there are no
            // chunks), we can take the cks_msn instead.
            // This should return EOF, but the reader should be positioned
            // correctly.
            return pmq_reader_seek_to_msg_impl(reader, pc->cks_msn);
        }
        return PMQ_Read_Result_Out_Of_Bounds;
    }

    // Otherwise, let's load a chunk and read the oldest msn from there.
    // The reader state management should be cleaned up. It's not very clear
    // what all the members mean and how they need to be mutated.

    reader->read_mode = PMQ_Read_Mode_Chunkstore;

    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;

    PMQ_Read_Result result = pmq_reset_to_specific_chunk_and_load(reader, csn);

    if (result != PMQ_Read_Result_Success)
    {
        // EOF should not happen because of our prior checks.
        // I would like to use an assert, but at least in theory there is the
        // chance of a wraparound happening concurrently.
        if (result == PMQ_Read_Result_EOF)
        {
            // EOF would be misleading since we are not "positioned". Not sure
            // what to do currently.
            result = PMQ_Read_Result_Out_Of_Bounds;
        }
        return result;
    }

    reader->msn = ckread->cnk_msn;
    return PMQ_Read_Result_Success;
}

PMQ_Read_Result pmq_reader_seek_to_oldest(PMQ_Reader *reader)
{
    pmq_reader_update_persist_cursors(reader);

    CSN csn = reader->persist_cursors.cks_discard_csn;
    pmq_debug_f("Try seeking to CSN %" PRIu64, csn.value());

    reader->last_result = pmq_reader_seek_to_csn_impl(reader, csn);
    if (reader->last_result == PMQ_Read_Result_Success)
    {
        pmq_debug_f("Succeeded in seeking to CSN %" PRIu64 ". MSN is %" PRIu64,
                    csn.value(), reader->msn.value());
    }
    else
    {
        pmq_debug_f("Seeking to CSN failed");
    }
    return reader->last_result;
}

static PMQ_Read_Result pmq_read_msg_slotsfile(PMQ_Reader *reader, PMQ_Msg_Output output);

// Attempt to read the message given by reader->msn from the chunk store.
// We may have to switch to reading from the slots file if we detect an EOF.
static PMQ_Read_Result pmq_read_msg_chunkstore(PMQ_Reader *reader, PMQ_Msg_Output output)
{
    pmq_assert(reader->read_mode == PMQ_Read_Mode_Chunkstore);

    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;

    if (! ckread->cnk_loaded)
    {
        PMQ_Read_Result readres = pmq_load_chunk(reader);

        if (readres == PMQ_Read_Result_EOF)
        {
            pmq_debug_f("Reader switches to slots file");
            // Switch to slot-file read mode
            reader->read_mode = PMQ_Read_Mode_Slotsfile;
            reader->slots_readstate.ssn = reader->persist_cursors.cks_ssn;
            return pmq_read_msg_slotsfile(reader, output);
        }

        if (readres != PMQ_Read_Result_Success)
        {
            return readres;
        }

        pmq_assert(ckread->cnk_loaded);
    }
    else if (reader->msn - ckread->cnk_msn == ckread->cnk_msgcount)
    {
        // Load the next chunk.
        CSN csn_old = ckread->cnk_csn;
        PMQ_Chunk_Hdr hdr_old;
        pmq_reader_copy_chunk_header(ckread, &hdr_old);

        PMQ_Read_Result readres =
            pmq_reset_to_specific_chunk_and_load(reader, ckread->cnk_csn + 1);

        if (readres != PMQ_Read_Result_Success)
            return readres;

        CSN csn_new = ckread->cnk_csn;
        PMQ_Chunk_Hdr hdr_new;
        pmq_reader_copy_chunk_header(ckread, &hdr_new);

        if (! pmq_check_chunk_msns(csn_old, csn_new, &hdr_old, &hdr_new))
        {
            return PMQ_Read_Result_Integrity_Error;
        }
    }

    // Chunk is present.
    pmq_assert(sn64_le(ckread->cnk_msn, reader->msn));
    pmq_assert(sn64_lt(reader->msn, ckread->cnk_msn + ckread->cnk_msgcount));
    uint64_t msgindex = reader->msn - ckread->cnk_msn;
    uint64_t msgoff = ckread->cnk_msgoffsets.at(msgindex);
    uint64_t nextoff = ckread->cnk_msgoffsets.at(msgindex + 1);

    if (msgoff >= nextoff)
    {
        pmq_perr_f("Invalid offsets in chunk %" PRIu64
                   ": Offset #%u and #%u are %u >= %u",
                   ckread->cnk_csn.value(),
                   (unsigned) msgindex, (unsigned) msgindex + 1,
                   (unsigned) msgoff, (unsigned) nextoff);
        return PMQ_Read_Result_Integrity_Error;
    }

    if (nextoff > PMQ_CHUNK_SIZE)
    {
        pmq_perr_f("Invalid offset in chunk %" PRIu64 ": "
                   "Offset #%u = %u exceeds chunk size",
                   ckread->cnk_csn.value(),
                   (unsigned) msgindex + 1, (unsigned) nextoff);
        return PMQ_Read_Result_Integrity_Error;
    }

    uint64_t msgsize = nextoff - msgoff;
    *output.size_out = msgsize;

    // Mirror the slots-file path: report a too-small buffer without
    // consuming the message, so the caller can retry with *size_out bytes.
    if (msgsize > output.data_size)
        return PMQ_Read_Result_Buffer_Too_Small;

    Untyped_Slice slice = ckread->cnk_buffer.untyped_slice().offset_bytes(msgoff);
    copy_from_slice(output.data, slice, msgsize);

    reader->msn += 1;

    return PMQ_Read_Result_Success;
}

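/* Offset arithmetic example (values assumed): with msgoffsets = {16, 40, 40},
 * message #0 spans bytes [16, 40) of the chunk (24 bytes), while message #1
 * would trip the msgoff >= nextoff check above -- equal adjacent offsets
 * (a zero-length message) are treated as corruption here.
 */
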
// Attempt to read the message given by reader->msn from the slots file.
// We may have to switch to reading from the chunk store if we detect that
// we've lost sync -- this may happen if the message we want to read has
// already disappeared (was overwritten) from the slots file.
static PMQ_Read_Result pmq_read_msg_slotsfile(PMQ_Reader *reader, PMQ_Msg_Output output)
{
    pmq_assert(reader->read_mode == PMQ_Read_Mode_Slotsfile);

    PMQ *q = reader->q;
    PMQ_Slots_Readstate *slread = &reader->slots_readstate;
    SSN ssn = slread->ssn;

    if (ssn == reader->persist_cursors.wal_ssn)
    {
        return PMQ_Read_Result_EOF;
    }

    if (sn64_lt(reader->persist_cursors.wal_ssn, ssn))
    {
        pmq_debug_f("sn64_lt(reader->persist_cursors.wal_ssn, ssn): wal_ssn=%" PRIu64 ", ssn=%" PRIu64,
                    reader->persist_cursors.wal_ssn.value(), ssn.value());
        // Should we even allow this to happen?
        return PMQ_Read_Result_Out_Of_Bounds;
    }

    // NOTE: We need to be careful that the ringbuffer slots we read from do
    // not get overwritten concurrently by newly enqueued messages.
    // For now we simply lock the in_queue. We may try to optimize this later.
    // One possible approach could be to check that the slots that we read
    // from are valid -- check it both before and after we read the slots.

    // !!! IDEA !!! Instead of locking the in-queue, we could lock the
    // persister. The reason why this should work is that data from the
    // in-queue only gets overwritten after having been persisted.
    // On the other hand, locking the persister might block for an
    // unreasonable amount of time.

    std::lock_guard<std::mutex> lock(q->enqueue_mutex);

    // Check that the message we're looking for is still there.
    if (sn64_gt(q->enqueuer.in_queue_cursors.ssn_disk, ssn))
    {
        // The slot was already overwritten before we took the lock.
        // pmq_reader_seek_to_msg() should find the message in the chunk store.
        PMQ_Read_Result readres = pmq_reader_seek_to_msg_impl(reader, reader->msn);

        if (readres != PMQ_Read_Result_Success)
            return readres;

        return pmq_read_msg_chunkstore(reader, output);
    }

    PMQ_Slot_Header_Read_Result slot_read_result;

    {
        PMQ_Read_Result readres = pmq_read_slot_header(q, ssn, &slot_read_result);
        if (readres != PMQ_Read_Result_Success)
            return readres;
    }

    if (! slot_read_result.is_leader_slot)
    {
        // Earlier there was an assert() here instead of an integrity check,
        // assuming that RAM should never be corrupted. However, the RAM might
        // be filled from disk, and we currently don't validate the data after
        // loading. Thus we now consider slot memory just as corruptible as
        // disk data.
        pmq_perr_f("slot %" PRIu64 " is not a leader slot.", ssn.value());
        return PMQ_Read_Result_Integrity_Error;
    }

    *output.size_out = slot_read_result.msgsize;

    if (slot_read_result.msgsize > output.data_size)
        return PMQ_Read_Result_Buffer_Too_Small;

    if (reader->persist_cursors.wal_ssn - ssn < slot_read_result.nslots_req)
    {
        pmq_perr_f("Integrity error: Read inconsistent msgsize from slot");
        return PMQ_Read_Result_Integrity_Error;
    }

    // Copy one message, slot by slot.
    {
        char *dst = (char *) output.data;
        uint64_t remain = slot_read_result.msgsize;
        for (; remain >= PMQ_SLOT_SPACE;)
        {
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            const char *src = __pmq_assume_aligned<16>(slot->payload);
            memcpy(dst, src, PMQ_SLOT_SPACE);
            ++ ssn;
            dst += PMQ_SLOT_SPACE;
            remain -= PMQ_SLOT_SPACE;
        }
        if (remain)
        {
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            const char *src = __pmq_assume_aligned<16>(slot->payload);
            memcpy(dst, src, remain);
            ++ ssn;
        }
    }

    slread->ssn = ssn;
    reader->msn += 1;

    return PMQ_Read_Result_Success;
}

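/* Copy-loop example (sizes assumed): a 250-byte message spans 3 slots, so
 * the loop above copies 112 + 112 bytes from the first two slots and the
 * remaining 26 bytes from the third, advancing ssn once per slot consumed.
 */
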
PMQ_Read_Result pmq_read_msg(PMQ_Reader *reader,
        void *data, size_t size, size_t *out_size)
{
    if (reader->last_result != PMQ_Read_Result_Success
        && reader->last_result != PMQ_Read_Result_EOF)
    {
        return reader->last_result; // need to seek to clear the error!
    }

    PMQ_Msg_Output output(data, size, out_size);

    pmq_reader_update_persist_cursors(reader);

    if (sn64_ge(reader->msn, reader->persist_cursors.wal_msn))
    {
        if (reader->msn == reader->persist_cursors.wal_msn)
        {
            //pmq_debug_f("Reader reaches EOF at msn=%" PRIu64, reader->msn.value());
            return PMQ_Read_Result_EOF;
        }
        return PMQ_Read_Result_Out_Of_Bounds;
    }

    switch (reader->read_mode)
    {
    case PMQ_Read_Mode_Chunkstore:
        pmq_debug_f("Read message %" PRIu64 " from chunk store.", reader->msn.value());
        return pmq_read_msg_chunkstore(reader, output);
    case PMQ_Read_Mode_Slotsfile:
        pmq_debug_f("Read message %" PRIu64 " from slots file.", reader->msn.value());
        return pmq_read_msg_slotsfile(reader, output);
    default:
        // shouldn't happen.
        pmq_assert(0);
        abort();
    }
}

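/* Read-loop sketch (hypothetical caller code): pmq_read_msg() advances
 * reader->msn on success, and EOF just means that no newer message has been
 * persisted yet, so a poller can simply retry later:
 *
 *     char buf[4096];
 *     size_t n = 0;
 *     while (pmq_read_msg(reader, buf, sizeof buf, &n) == PMQ_Read_Result_Success)
 *         handle_message(buf, n); // hypothetical consumer
 */
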
PMQ_Reader *pmq_reader_create(PMQ *q)
{
    // Plain `new` would throw instead of returning null; use the nothrow
    // form so the check below actually triggers on allocation failure.
    PMQ_Reader *reader = new (std::nothrow) PMQ_Reader;

    if (! reader)
    {
        pmq_perr_f("Failed to allocate reader!");
        return nullptr;
    }

    reader->q = q;
    reader->msn = MSN(0);
    reader->read_mode = PMQ_Read_Mode_Chunkstore;
    reader->last_result = PMQ_Read_Result_Success;
    reader->chunks_readstate.cnk_csn = CSN(0); // for now
    reader->chunks_readstate.cnk_buffer.allocate(PMQ_CHUNK_SIZE);
    reader->chunks_readstate.cnk_loaded = false;
    reader->chunks_readstate.cnk_msn = MSN(0);
    reader->chunks_readstate.cnk_msgcount = 0;
    reader->slots_readstate.ssn = SSN(0);
    return reader;
}

void pmq_reader_destroy(PMQ_Reader *reader)
{
    // TODO?
    delete reader;
}

PMQ *pmq_reader_get_pmq(PMQ_Reader *reader)
{
    return reader->q;
}

uint64_t pmq_reader_get_current_msn(PMQ_Reader *reader)
{
    return reader->msn.value();
}

uint64_t pmq_reader_find_old_msn(PMQ_Reader *reader)
{
    // Exponential probing; the `distance ? ... : 1` guards against the
    // doubling wrapping around to 0.
    for (uint64_t distance = 1; ; distance = (distance ? 2 * distance : 1))
    {
        pmq_reader_update_persist_cursors(reader);
        Persist_Cursors persist_cursors = reader->persist_cursors;
        CSN csn = persist_cursors.cks_discard_csn + distance;
        if (sn64_ge(csn, persist_cursors.cks_csn))
        {
            return persist_cursors.cks_msn.value();
        }
        // Possible optimization: don't load the whole chunk but only the header.
        PMQ_Read_Result readres = pmq_reset_to_specific_chunk_and_load(reader, csn);
        if (readres == PMQ_Read_Result_Success)
        {
            return reader->msn.value();
        }
    }
}

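/* Probe-sequence note: starting just past cks_discard_csn, the loop above
 * tries distances 1, 2, 4, 8, ... The point of backing off exponentially is
 * that a probe close to the discard cursor may be discarded concurrently
 * (the load fails), while a probe further in is more likely to stay valid;
 * if the probes run past cks_csn, the oldest available MSN is cks_msn itself.
 */
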
PMQ_Persist_Info pmq_reader_get_persist_info(PMQ_Reader *reader)
{
    return pmq_get_persist_info(reader->q);
}

bool pmq_reader_eof(PMQ_Reader *reader)
{
    // This is a bit wacky -- we read the pub_persist_cursors, which requires
    // a mutex lock, because we do not know from the current context whether
    // we could just access the Persister State's private persist_cursors.

    // NOTE: We expect that wal_msn is always kept "in front" of cks_msn (the
    // chunk-store MSN). Even when the wal does not have any additional slots
    // -- in that case, we expect wal_msn == cks_msn.

    pmq_reader_update_persist_cursors(reader);

    MSN wal_msn = reader->persist_cursors.wal_msn;
    return sn64_ge(reader->msn, wal_msn);
}