#include "pmq_common.hpp" #include "pmq.hpp" static constexpr uint64_t PMQ_SLOT_SIZE = 128; static constexpr uint64_t PMQ_SLOT_HEADER_SIZE = 16; static constexpr uint64_t PMQ_SLOT_SPACE = PMQ_SLOT_SIZE - PMQ_SLOT_HEADER_SIZE; // A bit somewhere in the slot header that indicates that a given slot is the // first slot of a sequence of slots that hold a message. static constexpr uint64_t PMQ_SLOT_LEADER_MASK = 1; static constexpr uint64_t PMQ_CHUNK_SHIFT = 16; static constexpr uint64_t PMQ_CHUNK_SIZE = (uint64_t) 1 << PMQ_CHUNK_SHIFT; class CSN_Tag{}; class SSN_Tag{}; class MSN_Tag{}; using CSN = SN; using SSN = SN; using MSN = SN; struct PMQ_Chunk_Hdr { // msn: msn of first message stored in this chunk. // msgoffsets_off: offset in the chunk to an array of (msgcount + 1) offsets. MSN msn; uint16_t msgcount; uint16_t msgoffsets_off; uint32_t pad; // pad to 16 bytes for now }; struct PMQ_Slot { uint32_t flags; uint32_t msgsize; uint32_t pad0; uint32_t pad1; char payload[PMQ_SLOT_SPACE]; __pmq_artificial_method Untyped_Slice payload_untyped_slice() { return Untyped_Slice(payload, sizeof payload); } }; /* In-memory enqueue buffer. This is where incoming message get written first. * It consists of a ringbuffer of fixed-size slots. * A slot has a header and a payload. The size of each slot is PMQ_SLOT_SIZE, * and the payload can be up to up to PMQ_SLOT_SPACE bytes. * In the future, we might support even concurrent writes. * This structure needs no locking; its contents are static except when * initialization and destroying. * Accessing the cursors though needs a mutex lock. */ struct In_Queue { uint64_t slot_count = 0; uint64_t size_bytes = 0; // slot_count * PMQ_SLOT_SIZE // When reaching this fill level (in slots) we persist unconditionally. uint64_t slots_persist_watermark = 0; // A shared memory file store that survives application restarts (but not // OS restarts). Posix_FD shm_fd; // Memory mapping of @shm_fd. MMap_Region mapping; Ringbuffer slots; // the buffer from the mapping. }; /* Cursors published by the enqueuer. The ssn_* members index into the * In_Queue. They are consumed by the persister and by reader threads. * They get written by enqueuer threads only. * * NOTE: The *_disk cursors are treated as "tail" cursors i.e. mark the end of * the valid region of the queue. They are a copy from the persister's cursors. * They get copied only from time to time as necessary if the In_Queue ran out * of space to store new messages: We have ssn_disk <= ssn_mem and msn_disk <= * msn. The cursors indicate how far the persister has come in persisting * chunks by compacting messages (stored in slots) from the In_Queue. */ struct In_Queue_Cursors { MSN msn; SSN ssn_mem; MSN msn_disk; SSN ssn_disk; }; /* An in-memory representation for a chunk page that is about to be written to * disk. * Contains an untyped buffer and minimal book-keeping information. */ struct Chunk_Buffer { void *data; // Tracking msn and ssn of first message so we can know how many pages // to write() or fsync(). MSN msn; SSN ssn; // last_msn and last_ssn: These fields indicate the one-past-last msn and // ssn. They get set only when the chunk buffer gets finalized. // The purpose is to allow the persister thread to update the persister // cursors after persisting the chunk buffer. // Because msn's and ssn's are continuous (i.e. last_msn is equal to the // next buffer's msn field), these data fields may seem redundant -- the // persister could use the msn and ssn from the following chunk buffer // instead. 
// However, that approach has in the past caused a bug where the persister // used the cursors from the following buffer before that buffer was even // initialized -- effectively decreasing the cursors to an earlier state // instead of advancing them. // After finding the bug, it was clear that we should introduce a little bit // of redundancy in order to keep things simple: the persister will access // only finalized buffers, and those will always have last_msn and last_ssn // fields set correctly. MSN last_msn; SSN last_ssn; __pmq_artificial_method Untyped_Slice untyped_slice() const { return Untyped_Slice(data, PMQ_CHUNK_SIZE); } __pmq_artificial_method PMQ_Chunk_Hdr *get_chunk_header() const { return (PMQ_Chunk_Hdr *) data; } }; /* A queue of in-memory chunks. * We might write() only one chunk at a time, but we need to hold each chunk in * memory until it is fsynced, so we need a bunch of chunk buffers. * */ struct Chunk_Queue { // The only purpose of this is to be a "holder" for the data buffers // contained in the "chunks" Chunk_Buffer's. // It's currently a single contiguous mmap allocation, i.e. (PMQ_CHUNK_SIZE // * chunks.slot_count()) MMap_Region chunk_buffer_mapping; // The only purpose of this is to be a "holder" for the Chunk_Buffer // structures (the "chunks" Ringbuffer is non-owning). Alloc_Slice chunks_alloc_slice; Ringbuffer chunks; /* CSN, MSN, SSN of the "current" next message that will be compacted. */ MSN cq_msn; CSN cq_csn; SSN cq_ssn; // Construction data for currently built chunk // buffer of current chunk (current chunk is identified Persist_Cursors::cq_csn) Chunk_Buffer *ck_buffer; uint64_t msg_count; // number of messages compacted in current chunk /* Array of message offsets. msg_count + 1 elements are always valid. The * next message will be appended (if it fits) at an offset of * offsets[msg_count] bytes in the current chunks page. When the chunk is * finalized, the position of the array is set. */ Alloc_Slice offsets; }; /* Persistent chunk store -- storing chunks on the file system. */ struct Chunk_Store { Posix_FD chunk_fd; uint64_t capacity_bytes = 0; // After persisting to the chunk store, when at least high_watermark many // chunks are filled, we may discard some to lower the chunk fill count to // low_watermark. // Discarding may also be required in the middle of persisting when all // chunks are full. But normally this shouldn't happen because the // In_Queue's capacity should be smaller than (chunk_count - // high_watermark) chunks. // Since discarding chunks involves an fsync() (really a write barrrier // would be enough but in practice we only have fsync() currently), // we discard many chunks at once to hide the overhead coming from disk // latency. uint64_t chunks_high_watermark = 0; uint64_t chunks_low_watermark = 0; __pmq_artificial_method uint64_t chunk_count() const { return capacity_bytes >> PMQ_CHUNK_SHIFT; } }; /* Cursors published by the persister. They index into the chunk store. They * are consumed by message readers. Sometimes enqueuer threads read these * cursors as well, to be able to skip persisting slots from the In_Queue. */ struct Persist_Cursors { // The wal_ssn member indicates the tentative next slot to be written to the // In_Queue's persist file. // Most slots never end up going to that persist file but are compacted // directly to the chunk store. At each sync to disk, only the slots that // can't form a complete chunk buffer go to the In_Queue's persist file. 
// In that file, only the slots from cks_ssn to wal_ssn (exclusive) are valid.
    SSN wal_ssn;
    // We also store the MSN corresponding to the wal_ssn.
    MSN wal_msn;

    // We might want to also introduce a cursor to indicate the oldest valid
    // chunk buffer in the (in-memory) queue. But currently only the flusher is
    // reading from the queue -- always at position cks_csn.

    // The next chunk that will be written (and fsync'ed) to disk.
    CSN cks_csn;
    // The msn of the first message that is stored in the chunk indicated by
    // cks_csn. That msn is also stored in that chunk's header.
    MSN cks_msn;
    // The ssn of the leader slot where the first message of the chunk indicated
    // by cks_csn was stored. Note, this ssn is _not_ stored in the chunk's
    // header -- it is only used for coordinating with the In_Queue.
    SSN cks_ssn;

    // The next chunk that will be discarded from the chunk store.
    CSN cks_discard_csn;
};

/* This structure is persisted in a state file. */
struct Commit_Record {
    // Number of slots of the in-queue.
    // Must be a power of 2 currently.
    uint64_t inqueue_slotcount;

    uint64_t slotsfile_size_bytes;
    // Next ssn that will hit the slots-persist file.
    SSN wal_ssn;
    MSN wal_msn;

    uint64_t chunkfile_size_bytes;
    // Csn, msn, and ssn of the next chunk that will be persisted to the chunk
    // store.
    CSN cks_csn;
    MSN cks_msn;
    SSN cks_ssn;
    // The next chunk that will be discarded from the chunk store.
    CSN cks_discard_csn;
};

// Data owned by the enqueuer functionality. There can only be 1 enqueuer
// thread at a time.
struct Enqueuer {
    PMQ_Enqueuer_Stats enqueuer_stats;
    In_Queue_Cursors in_queue_cursors;
};

// Data owned by the persister functionality. There can only be 1 persister
// thread at a time.
struct Persister {
    PMQ_Persister_Stats stats;
    Persist_Cursors persist_cursors;
};

struct PMQ {
    PMQ_Owned_String basedir_path;
    Posix_FD basedir_fd;

    Posix_FD slotsfile_fd;
    uint64_t slotsfile_size_bytes = 0;

    In_Queue in_queue;

    // Chunks that get compacted from the in_queue. They will be persisted
    // to the chunk_store.
    Chunk_Queue chunk_queue;

    Chunk_Store chunk_store;

    // Cursors published by enqueuer threads, consumable by persister and reader
    // threads.
    In_Queue_Cursors pub_in_queue_cursors;
    PMQ_PROFILED_MUTEX(pub_in_queue_mutex);
    PMQ_PROFILED_CONDVAR(pub_in_queue_cond);

    // Cursors published by persister threads, consumable by enqueuer and reader
    // threads.
    Mutex_Protected<Persist_Cursors> pub_persist_cursors;
    Mutex_Protected<PMQ_Persister_Stats> pub_persister_stats;

    // Must be held to guarantee only 1 enqueuer at a time (protects only
    // In_Queue currently).
    PMQ_PROFILED_MUTEX(enqueue_mutex);
    Enqueuer enqueuer;

    // Must be held to guarantee only 1 persister at a time. Flushing may
    // happen by a dedicated persister thread that checks regularly, or by an
    // enqueuer thread when the In_Queue is full.
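    // A minimal sketch of such a dedicated persister thread (hypothetical
    // driver code -- the real loop lives outside this file; pmq_sync() is the
    // only entry point it needs, and it already blocks on the fill watermark
    // with a timeout):
    //
    //     void persister_thread_main(PMQ *q, std::atomic<bool> *stop)
    //     {
    //         while (! stop->load())
    //         {
    //             if (! pmq_sync(q))
    //                 break; // persistent I/O error
    //         }
    //     }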
PMQ_PROFILED_MUTEX(persist_mutex); Persister persister; Posix_FD statefile_fd; }; void pmq_get_stats(PMQ *q, PMQ_Stats *out_stats) { PMQ_Stats stats = {}; stats.persister = q->pub_persister_stats.load(); { PMQ_PROFILED_LOCK(lock_, q->enqueue_mutex); stats.enqueuer = q->enqueuer.enqueuer_stats; } *out_stats = stats; } PMQ_Persist_Info pmq_get_persist_info(PMQ *q) { Persist_Cursors persist_cursors = q->pub_persist_cursors.load(); PMQ_Persist_Info out; out.wal_msn = persist_cursors.wal_msn.value(); out.cks_msn = persist_cursors.cks_msn.value(); out.cks_discard_csn = persist_cursors.cks_discard_csn.value(); return out; } static bool pmq_persist_finished_chunk_buffers(PMQ *q); // Called only on initialization and then subsequently by pmq_switch_to_next_chunk_buffer() // Note: must be called from a persister context (with persist_mutex locked) static bool pmq_begin_current_chunk_buffer(PMQ *q) { PMQ_PROFILED_FUNCTION; Persist_Cursors *pc = &q->persister.persist_cursors; Chunk_Queue *cq = &q->chunk_queue; uint64_t num_chunks = cq->chunks_alloc_slice.capacity(); if (cq->cq_csn - pc->cks_csn == num_chunks) { if (! pmq_persist_finished_chunk_buffers(q)) return false; } Chunk_Buffer *ck_buffer = cq->chunks.get_slot_for(cq->cq_csn); ck_buffer->msn = cq->cq_msn; ck_buffer->ssn = cq->cq_ssn; ck_buffer->last_msn = MSN(0); // only set when chunk buffer gets finalized ck_buffer->last_ssn = SSN(0); // only set when chunk buffer gets finalized cq->ck_buffer = ck_buffer; cq->msg_count = 0; cq->offsets[0] = 16; // XXX ??? return true; } static void pmq_finalize_current_chunk_buffer(PMQ *q) { Chunk_Queue *cq = &q->chunk_queue; Chunk_Buffer *cb = cq->ck_buffer; cb->last_msn = cq->cq_msn; cb->last_ssn = cq->cq_ssn; pmq_assert(cq->cq_msn == cb->msn + cq->msg_count); pmq_debug_f("Finalize chunk %" PRIu64 ": MSN %" PRIu64 " - %" PRIu64 ", %" PRIu64 " messages. Last msg ends at %" PRIu64, cq->cq_csn.value(), cb->msn.value(), cb->last_msn.value(), cq->msg_count, (uint64_t) cq->offsets[cq->msg_count]); // msn: msn of first message stored in this chunk. // msgoffsets_off: offset in the chunk to an array of (msgcount + 1) offsets. Untyped_Slice chunk_slice = cq->ck_buffer->untyped_slice(); Slice offsets_slice = cq->offsets.slice().sub_slice(0, cq->msg_count + 1); PMQ_Chunk_Hdr *hdr = (PMQ_Chunk_Hdr *) chunk_slice.data(); hdr->msn = cq->ck_buffer->msn; hdr->msgcount = cq->msg_count; hdr->msgoffsets_off = PMQ_CHUNK_SIZE - offsets_slice.size_in_bytes(); pmq_debug_f("Place msgoffsets array in chunk at bytes offset %" PRIu64, (uint64_t) hdr->msgoffsets_off); // zero out gap between last message and message-offsets-array zero_out_slice(chunk_slice .limit_size_bytes(hdr->msgoffsets_off) .offset_bytes(offsets_slice.at(cq->msg_count))); Untyped_Slice chunk_offsets_slice = chunk_slice.offset_bytes(hdr->msgoffsets_off); copy_slice(chunk_offsets_slice, offsets_slice.untyped()); } // Finalize the current chunk buffer and start the next one. static bool pmq_switch_to_next_chunk_buffer(PMQ *q) { Chunk_Queue *cq = &q->chunk_queue; pmq_finalize_current_chunk_buffer(q); // "ship" cq->cq_csn += 1; if (! pmq_begin_current_chunk_buffer(q)) return false; pmq_assert(cq->ck_buffer->msn.value() == cq->cq_msn.value()); return true; } static bool pmq_msg_fits_current_chunk_buffer(PMQ *q, uint64_t msgsize) { Chunk_Queue *cq = &q->chunk_queue; // Compute start of offsets-array given the number of messages in the chunk. // Why (msg_count + 2)? Here is why: (msg_count + 2) = (next_count + 1). // next_count = the count, after appending current message. 
// Add 1 to that because the offsets array holds one more slot to include
    // the final size.
    uint64_t offsets_off = PMQ_CHUNK_SIZE
        - (cq->msg_count + 2) * (uint64_t) sizeof cq->offsets[0];
    uint64_t msgs_end = cq->offsets[cq->msg_count] + msgsize;
    return msgs_end <= offsets_off;
}

// Helper function for pmq_persist()
// NOTE: persist_mutex must be locked
static bool pmq_compact(PMQ *q, SSN compact_ssn, SSN max_ssn)
{
    if (false) // NOLINT
    {
        pmq_debug_f("pmq_persist(ssn=%" PRIu64 ", max_ssn=%" PRIu64 ")",
                compact_ssn.value(), max_ssn.value());
    }

    PMQ_PROFILED_FUNCTION;

    Chunk_Queue *cq = &q->chunk_queue;

    for (;;)
    {
        if (sn64_ge(cq->cq_ssn, max_ssn))
        {
            return true;
        }

        // Extract message size from slot header.
        uint64_t msgsize;
        {
            SSN ssn = cq->cq_ssn;
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            if ((slot->flags & PMQ_SLOT_LEADER_MASK) == 0)
            {
                // Earlier there was an assert() here instead of an integrity check,
                // assuming that RAM should never be corrupted. However, the RAM might
                // be filled from disk, and we currently don't validate the data after
                // loading. Thus we now consider slot memory just as corruptible as
                // disk data.
                pmq_perr_f("slot %" PRIu64 " is not a leader slot.", ssn.value());
                return false;
            }
            msgsize = slot->msgsize;
        }

        // Check if there is enough room for the message in the current chunk buffer.
        // If necessary, start a new chunk buffer. This in turn may require
        // flushing another chunk buffer to disk (we may flush multiple,
        // considering throughput vs latency).
        if (! pmq_msg_fits_current_chunk_buffer(q, msgsize))
        {
            if (! pmq_switch_to_next_chunk_buffer(q))
            {
                return false;
            }

            // Actually let's always try to compact up to max_ssn
            // since we should avoid writing small batches.
            // So disabling this early return.
            if (false) // NOLINT
            if (sn64_ge(cq->cq_ssn, compact_ssn))
            {
                return true;
            }
        }

        // Compute the number of slots that we need to read and copy to the out-message.
        uint64_t nslots_req = (msgsize + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE;
        uint64_t nslots_avail = max_ssn - cq->cq_ssn;

        if (nslots_req > nslots_avail)
        {
            pmq_perr_f("Internal error: Invalid msgsize field in the slots file!");
            pmq_perr_f("Internal error: msgsize is %" PRIu64 ", needs %" PRIu64
                    " slots but I believe only %" PRIu64 " are available!",
                    msgsize, nslots_req, nslots_avail);
            return false;  // can't we do a little more to handle the issue?
        }

        // copy one message
        uint64_t remain = msgsize;
        uint64_t dst_offset = cq->offsets[cq->msg_count];
        for (uint64_t i = 0; i < nslots_req; i++)
        {
            // copy slots to chunks
            SSN ssn = cq->cq_ssn + i;
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            void *dst = (char *) cq->ck_buffer->data + dst_offset;
            const void *src = __pmq_assume_aligned<16>(slot->payload);
            uint64_t n = remain < PMQ_SLOT_SPACE ? remain : PMQ_SLOT_SPACE;
            memcpy(dst, src, n);
            dst_offset += n;
            remain -= n; // only the last slot of a message carries less than PMQ_SLOT_SPACE bytes
        }

        cq->cq_ssn += nslots_req;
        cq->cq_msn += 1;
        cq->offsets[cq->msg_count + 1] = cq->offsets[cq->msg_count] + msgsize;
        cq->msg_count += 1;
    }
    return true;
}

static bool pmq_commit(PMQ *q); // forward decl. We should get rid of this and fix the order.

// Helper function for pmq_persist()
// NOTE: persister lock must be taken!
// Persists all chunk buffers from cks_csn up to (but excluding) cq_csn.
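// Chunk placement in the chunk store is a simple power-of-2 modulo mapping.
// A worked example (assuming the default 1 GiB chunk store, i.e. 16384 chunks
// of 64 KiB each):
//     csn 16385  ->  slot index 16385 & (16384 - 1) = 1
//                ->  byte offset 1 << PMQ_CHUNK_SHIFT = 65536 in chunks.dat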
static bool pmq_persist_finished_chunk_buffers(PMQ *q) { Chunk_Queue *cq = &q->chunk_queue; Persist_Cursors *pc = &q->persister.persist_cursors; pmq_assert(cq->cq_csn - pc->cks_csn <= q->chunk_queue.chunks.slot_count()); CSN cq_csn = cq->cq_csn; uint64_t chunk_slot_mask = pmq_mask_power_of_2(q->chunk_store.chunk_count()); for (CSN csn = pc->cks_csn; csn != cq_csn;) { pmq_assert(csn - pc->cks_discard_csn <= q->chunk_store.chunk_count()); if (csn - pc->cks_discard_csn == q->chunk_store.chunk_count()) { // All chunks are filled. This should rarely happen since chunks are // normally discarded when the high-watermark chunk fill count is // reached. Typically, the chunk store is much larger than the // In_Queue, and we should not get here. // We discard some chunks manually by calling pmq_commit(). if (! pmq_commit(q)) return false; } pmq_assert(csn - pc->cks_discard_csn < q->chunk_store.chunk_count()); Chunk_Buffer *cb = q->chunk_queue.chunks.get_slot_for(csn); Pointer hdr = cb->get_chunk_header(); pmq_debug_f("Persist chunk buffer csn=%" PRIu64 ", pointer is %p, msn is %" PRIu64, csn.value(), cb, hdr->msn.value()); // write chunk buffer to disk. Untyped_Slice slice = cb->untyped_slice(); pmq_assert(slice.size() == PMQ_CHUNK_SIZE); int fd = q->chunk_store.chunk_fd.get(); uint64_t chunk_slot_index = csn.value() & chunk_slot_mask; off_t offset_bytes = chunk_slot_index << PMQ_CHUNK_SHIFT; if (! pmq_pwrite_all(fd, slice, offset_bytes, "chunk buffer")) { // should we publish the new cursors anyway? (see exit code below) return false; } csn += 1; // Advance chunk store pointers pc->cks_csn = csn; pc->cks_ssn = cb->last_ssn; pc->cks_msn = cb->last_msn; pmq_debug_f("persisted. pc->cks_msn=%" PRIu64 ", pc->cks_ssn=%" PRIu64, pc->cks_msn.value(), pc->cks_ssn.value()); } return true; } // Helper function for pmq_persist_unpersisted_slots() // Persist a contiguous sub-range of all slots. // The *slots* argument must correspond to the slots ringbuffer. // start_index + count may not exceed the slots size. static bool pmq_persist_slots_slice(PMQ *q, uint64_t start_index, uint64_t count) { Slice slots = q->in_queue.slots.as_slice(); uint64_t offset_bytes = start_index * sizeof (PMQ_Slot); Untyped_Slice slice = slots.sub_slice(start_index, count).untyped(); pmq_debug_f("Persist %" PRIu64 " slots (%zu bytes) starting from %" PRIu64, count, slice.size(), start_index); bool ret = pmq_pwrite_all(q->slotsfile_fd.get(), slice, offset_bytes, "slots-file"); if (ret) { q->persister.stats.wal_flushes += 1; q->persister.stats.wal_flush_bytes += slice.size(); } return ret; } // Helper function for pmq_persist() // NOTE: persister lock must be taken! // This function writes all unpersisted slots to the slots-file. The point of // not writing them as a chunk is that we only want to write complete chunk // buffers. The reason is that each chunk gets written once only (append-only), // and chunks are fixed-size (PMQ_CHUNK_SIZE). Syncing a complete chunk would // mean spending at least a whole chunk regardless of the number of messages in // it. static bool pmq_persist_unpersisted_slots(PMQ *q, SSN persist_ssn) { PMQ_PROFILED_FUNCTION; Persist_Cursors *pc = &q->persister.persist_cursors; if (sn64_lt(pc->wal_ssn, pc->cks_ssn)) { // The chunk store contains more recent data than the slots file. 
// Effectively clear slotsfile, by setting the valid range from cks_ssn to cks_ssn pc->wal_ssn = pc->cks_ssn; pc->wal_msn = pc->cks_msn; } if (sn64_ge(pc->wal_ssn, persist_ssn)) { return true; } pmq_debug_f("Persist unpersisted slots from %" PRIu64 " to %" PRIu64, pc->wal_ssn.value(), persist_ssn.value()); SSN ssn_lo = pc->wal_ssn; SSN ssn_hi = persist_ssn; uint64_t count = q->in_queue.slots.slot_count(); uint64_t mask = pmq_mask_power_of_2(count); uint64_t i_lo = ssn_lo.value() & mask; uint64_t i_hi = ssn_hi.value() & mask; bool ret; if (i_lo <= i_hi) { ret = pmq_persist_slots_slice(q, i_lo, i_hi - i_lo); } else { ret = pmq_persist_slots_slice(q, i_lo, count - i_lo); if (ret) ret = pmq_persist_slots_slice(q, 0, i_hi); } if (! ret) { pmq_perr_f("Failed to persist to slots-file!"); return false; } q->persister.stats.fsync_calls += 1; if (fsync(q->slotsfile_fd.get()) < 0) { pmq_perr_ef(errno, "fsync() of slots file failed"); return false; } // Only now we update the cursor. pc->wal_ssn = ssn_hi; // We also need to update the MSN accordingly. To find the MSN, because the // MSN is currently not stored in the slots, instead we count the number of // slot leaders. // This assumes that all the follower slots have been atomically enqueued // with each leader. for (SSN ssn = ssn_lo; ssn != ssn_hi; ssn ++) { PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn); if ((slot->flags & PMQ_SLOT_LEADER_MASK)) { pc->wal_msn ++; } } return true; } // Helper function, only used by pmq_commit() // Compute the next value of cks_discard_csn. // NOTE: persister lock must be taken static CSN pmq_compute_next_discard_csn(PMQ *q) { Persist_Cursors *pc = &q->persister.persist_cursors; uint64_t chunks_count = pc->cks_csn - pc->cks_discard_csn; uint64_t low_mark = q->chunk_store.chunks_low_watermark; uint64_t high_mark = q->chunk_store.chunks_high_watermark; if (chunks_count < high_mark) return pc->cks_discard_csn; CSN old_discard_csn = pc->cks_discard_csn; CSN new_discard_csn = pc->cks_csn - low_mark; pmq_debug_f("Discarding chunks from %" PRIu64 " to %" PRIu64, old_discard_csn.value(), new_discard_csn.value()); return new_discard_csn; } // persister lock must be taken bool __pmq_profiled pmq_commit(PMQ *q) { PMQ_PROFILED_FUNCTION; { q->persister.stats.fsync_calls += 1; if (fsync(q->chunk_store.chunk_fd.get()) < 0) { pmq_perr_ef(errno, "fsync() of chunks file failed"); return false; } } Persist_Cursors *pc = &q->persister.persist_cursors; Commit_Record commit_record; commit_record.inqueue_slotcount = q->in_queue.slot_count; commit_record.slotsfile_size_bytes = q->slotsfile_size_bytes; commit_record.wal_ssn = pc->wal_ssn; commit_record.wal_msn = pc->wal_msn; commit_record.chunkfile_size_bytes = q->chunk_store.capacity_bytes; commit_record.cks_csn = pc->cks_csn; commit_record.cks_msn = pc->cks_msn; commit_record.cks_ssn = pc->cks_ssn; commit_record.cks_discard_csn = pmq_compute_next_discard_csn(q); { Untyped_Slice slice = Untyped_Slice(&commit_record, sizeof commit_record); if (! pmq_pwrite_all(q->statefile_fd.get(), slice, 0, "state.dat file")) { return false; } } { if (fsync(q->statefile_fd.get()) < 0) { pmq_perr_ef(errno, "fsync() of statefile failed"); return false; } q->persister.stats.fsync_calls += 1; } // Successfully committed the next discard cursor, now we can recycle the // released chunks internally. 
pc->cks_discard_csn = commit_record.cks_discard_csn; q->pub_persister_stats.store(q->persister.stats); q->pub_persist_cursors.store(q->persister.persist_cursors); return true; } // Persist messages from the In_Queue to the Chunk_Queue, at least until reaching ssn. // The given max_ssn is the hard stop, a good choice here is the In_Queue's ssn_mem. // The function isn't able to determine max_ssn as ssn_mem on its own, since it // may or may not be used from within the enqueuer context. // NOTE: This function tries to fill the current Chunk_Buffer once it reaches compact_ssn. static bool pmq_persist(PMQ *q, SSN ssn, SSN max_ssn) { if (! pmq_compact(q, ssn, max_ssn)) goto error; if (! pmq_persist_finished_chunk_buffers(q)) goto error; if (! pmq_persist_unpersisted_slots(q, ssn)) goto error; if (! pmq_commit(q)) goto error; return true; error: pmq_perr_f("Failed to persist slots"); return false; } // Only meant to be called by pmq_sync() static bool _pmq_sync(PMQ *q) { In_Queue_Cursors ic; PMQ_PROFILED_LOCK(lock_, q->persist_mutex); { PMQ_PROFILED_SCOPE("wait-fill"); PMQ_PROFILED_UNIQUE_LOCK(lock_, q->pub_in_queue_mutex); for (;;) { ic = q->pub_in_queue_cursors; pmq_assert(sn64_le(ic.ssn_disk, ic.ssn_mem)); uint64_t slots_fill = ic.ssn_mem - ic.ssn_disk; if (slots_fill >= q->in_queue.slots_persist_watermark) break; auto max_wait_time = std::chrono::milliseconds(50); auto wait_result = q->pub_in_queue_cond.wait_for(lock_, max_wait_time); if (wait_result == std::cv_status::timeout) break; q->persister.stats.wakeups += 1; } } if (false) // NOLINT { pmq_debug_f("ic.ssn_mem is now %" PRIu64, ic.ssn_mem.value()); uint64_t slots_fill = ic.ssn_mem - ic.ssn_disk; pmq_debug_f("slots_fill is now %" PRIu64, slots_fill); pmq_debug_f("slots_persist_watermark is %" PRIu64, q->in_queue.slots_persist_watermark); } if (! pmq_persist(q, ic.ssn_mem, ic.ssn_mem)) return false; q->persister.stats.num_async_flushes += 1; return true; } // Entry point to persisted all messages that have been successfully enqueued // so far. Concurrent operations (e.g. pmq_enqueue_msg()) are possible, but may // not be persisted this time. bool pmq_sync(PMQ *q) { PMQ_PROFILED_FUNCTION; bool ret = _pmq_sync(q); if (! ret) pmq_perr_f("Failed to pmq_sync()!"); return ret; } // Helper function for pmq_enqueue_msg // Attempts to make enough room in the In_Queue // enqueue_mutex must be locked. static bool __pmq_profiled pmq_prepare_input_slots(PMQ *q, uint64_t nslots_req) { PMQ_PROFILED_FUNCTION; In_Queue_Cursors *ic = &q->enqueuer.in_queue_cursors; uint64_t slot_count = q->in_queue.slot_count; SSN next_ssn_mem = ic->ssn_mem + nslots_req; pmq_assert(ic->ssn_mem - ic->ssn_disk <= slot_count); if (next_ssn_mem - ic->ssn_disk <= slot_count) return true; // Update the ssn_disk cursor from the pub_persist_cursors. Those hold the // same values as q->persister.persist_cursors, just ever so slightly // outdated. This information lets us detect if we can jump out early, // without requiring to lock the persister context, which can take a lot of // time. { Persist_Cursors pc = q->pub_persist_cursors.load(); ic->msn_disk = pc.cks_msn; ic->ssn_disk = pc.cks_ssn; } if (next_ssn_mem - ic->ssn_disk <= slot_count) return true; // Still not enough room, need to switch to persister context (lock // it) and flush some more messages. q->enqueuer.enqueuer_stats.buffer_full_count += 1; PMQ_PROFILED_LOCK(lock_, q->persist_mutex); if (! 
pmq_persist(q, next_ssn_mem - slot_count, ic->ssn_mem)) { return false; } if (false) // NOLINT { Chunk_Queue *cq = &q->chunk_queue; Persist_Cursors *pc = &q->persister.persist_cursors; pmq_assert(sn64_ge(cq->cq_ssn, ic->ssn_mem)); pmq_debug_f("ic->ssn_mem: %" PRIu64 ", cq_ssn - cks_ssn: %" PRIu64, ic->ssn_mem.value(), cq->cq_ssn.value() - pc->cks_ssn.value()); } if (false) // NOLINT { SSN old_ssn_disk = ic->ssn_disk; SSN new_ssn_disk = q->persister.persist_cursors.cks_ssn; pmq_debug_f("Flushed %" PRIu64 " ssns", ic->ssn_disk - old_ssn_disk); if (sn64_le(new_ssn_disk, old_ssn_disk)) { pmq_perr_f("Something is wrong: %" PRIu64 ", %" PRIu64, old_ssn_disk.value(), ic->ssn_disk.value()); } pmq_assert(slot_count >= (ic->ssn_mem - ic->ssn_disk)); } // Update the ssn_disk cursor from the (locked) persister context. { ic->msn_disk = q->persister.persist_cursors.cks_msn; ic->ssn_disk = q->persister.persist_cursors.cks_ssn; } pmq_assert(next_ssn_mem - ic->ssn_disk <= slot_count); return true; } // Helper function for pmq_enqueue_msg(). // Serialize message to In_Queue's memory buffer. // Expects enqueue_mutex to be taken. // Expects that there is enough room to serialize the message (pmq_prepare_input_slots()) static void pmq_serialize_msg(PMQ *q, const void *data, size_t size) { PMQ_PROFILED_FUNCTION; In_Queue_Cursors *ic = &q->enqueuer.in_queue_cursors; SSN ssn_mem = ic->ssn_mem; SSN old_ssn_mem = ic->ssn_mem; uint64_t slot_count = q->in_queue.slot_count; pmq_assert(pmq_is_power_of_2(slot_count)); uint32_t slot_flags = PMQ_SLOT_LEADER_MASK; // write full slots size_t i = 0; while (i + PMQ_SLOT_SPACE <= size) { PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn_mem); slot->flags = slot_flags; slot->msgsize = size - i; memcpy(__pmq_assume_aligned<16>(slot->payload), (const char *) data + i, PMQ_SLOT_SPACE); ssn_mem += 1; i += PMQ_SLOT_SPACE; slot_flags &= ~PMQ_SLOT_LEADER_MASK; } // write last slot if (i < size) { PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn_mem); slot->flags = slot_flags; slot->msgsize = size - i; memcpy(__pmq_assume_aligned<16>(slot->payload), (const char *) data + i, size - i); ssn_mem += 1; } // can bump ssn_mem cursor, publish new cursors field, and release lock now ic->ssn_mem = ssn_mem; ic->msn += 1; q->enqueuer.enqueuer_stats.total_messages_enqueued += 1; q->enqueuer.enqueuer_stats.total_bytes_enqueued += size; { uint64_t new_slot_count = ic->ssn_mem - ic->ssn_disk; uint64_t old_slot_count = old_ssn_mem - ic->ssn_disk; bool notify = old_slot_count < q->in_queue.slots_persist_watermark && new_slot_count >= q->in_queue.slots_persist_watermark; { PMQ_PROFILED_UNIQUE_LOCK(lock_, q->pub_in_queue_mutex); q->pub_in_queue_cursors = *ic; if (notify) q->pub_in_queue_cond.notify_one(); } } pmq_assert(ic->ssn_mem - ic->ssn_disk <= slot_count); } bool pmq_enqueue_msg(PMQ *q, const void *data, size_t size) { PMQ_PROFILED_FUNCTION; pmq_assert(size > 0); uint64_t nslots_req = (size + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE; PMQ_PROFILED_LOCK(lock_, q->enqueue_mutex); if (! pmq_prepare_input_slots(q, nslots_req)) return false; pmq_serialize_msg(q, data, size); return true; } static void pmq_init_chunk_store_size(Chunk_Store *cks, uint64_t capacity_bytes) { pmq_assert(pmq_is_power_of_2(capacity_bytes)); pmq_assert(capacity_bytes >= PMQ_Megabytes(64)); cks->capacity_bytes = capacity_bytes; uint64_t chunks_count = cks->capacity_bytes >> PMQ_CHUNK_SHIFT; // What is a reasonable watermark at which we should start discarding chunks? 
// Note that while discarding a chunk is logically only advancing a CSN cursor, // it's very expensive because we have to fsync() that updated cursor to disk. // For now, I'm deciding to set them to chunks_count minus 256 resp. 512. // On each discard we'll be discarding between (hi_mark - low_mark) and // (chunks_count - low_mark) chunks, i.e. between 16 and 32 MiB of data. // These values should be fair when targetting a reasonable throughput of // 2GB/sec and an fsync() latency of ~5ms. cks->chunks_low_watermark = chunks_count - 512; cks->chunks_high_watermark = chunks_count - 256; pmq_assert(cks->chunks_low_watermark < chunks_count); pmq_assert(cks->chunks_high_watermark < chunks_count); pmq_assert(cks->chunks_low_watermark < cks->chunks_high_watermark); } static bool pmq_init_createnew(PMQ *q, const PMQ_Init_Params *params) { const char *basedir_path = q->basedir_path.get().buffer; if (mkdir(basedir_path, 0750) == -1) { pmq_perr_ef(errno, "Failed to create queue directory %s", basedir_path); return false; } q->basedir_fd = pmq_open_dir(basedir_path); if (! q->basedir_fd.valid()) { pmq_perr_ef(errno, "Failed to open the directory we created: %s", basedir_path); return false; } // Initialize In_Queue_Cursors to all 0. { q->enqueuer.in_queue_cursors = In_Queue_Cursors {}; } // Initialize persister cursors to all 0. { q->persister.persist_cursors = Persist_Cursors {}; } // Create slots-file. // The slots-file is called "wal.dat" but it's not really a WAL -- only a // buffer to store the rest slots that didn't make a complete chunk page. { q->slotsfile_fd = pmq_openat_regular_create(q->basedir_fd.get(), "wal.dat", O_RDWR, 0644); if (! q->slotsfile_fd.valid()) { pmq_perr_ef(errno, "Failed to create slots file (wal.dat)"); return false; } //TODO: currently this must be the same size as the in-memory slots buffer. Fix this, we only need a tiny file on disk to persist the remaining slots that //didn't fill a complete chunk page. q->slotsfile_size_bytes = q->in_queue.size_bytes; if (fallocate(q->slotsfile_fd.get(), FALLOC_FL_ZERO_RANGE, 0, q->slotsfile_size_bytes) == -1) { pmq_perr_ef(errno, "Failed to fallocate() slots file"); return false; } } // Create chunk store { Chunk_Store *cks = &q->chunk_store; uint64_t create_size = params->create_size; if (create_size == 0) create_size = PMQ_Gigabytes(1); // default to 1 GiB if (create_size < PMQ_Megabytes(64)) { pmq_perr_f("PMQ_Init_Params::create_size is invalid: " "Must be at least 64 MiB. Requested: %" PRIu64, create_size); return false; } if (! pmq_is_power_of_2(create_size)) { pmq_warn_f("PMQ_Init_Params::create_size is not a power of 2: %" PRIu64, create_size); create_size *= 2; while (! pmq_is_power_of_2(create_size)) create_size = create_size & (create_size - 1); pmq_warn_f("PMQ_Init_Params::create_size is not a power of 2: rounded up to %" PRIu64, create_size); } pmq_init_chunk_store_size(cks, create_size); cks->chunk_fd = pmq_openat_regular_create(q->basedir_fd.get(), "chunks.dat", O_RDWR, 0644); if (! cks->chunk_fd.valid()) { pmq_perr_ef(errno, "Failed to create chunks file"); return false; } if (fallocate(cks->chunk_fd.get(), FALLOC_FL_ZERO_RANGE, 0, cks->capacity_bytes) == -1) { pmq_perr_ef(errno, "Failed to fallocate() chunks file" " to size %" PRIu64, cks->capacity_bytes); return false; } } // Create state.dat file { q->statefile_fd = pmq_openat_regular_create(q->basedir_fd.get(), "state.dat", O_RDWR, 0644); if (! 
q->statefile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open state.dat file");
            return false;
        }

        // Is it ok to try and reuse the pmq_commit() function to initialize the file?
        if (! pmq_commit(q))
            return false;
    }

    // Sync basedir to make sure the new files are persisted.
    {
        if (fsync(q->basedir_fd.get()) == -1)
        {
            pmq_perr_ef(errno, "Error from fsync() on base directory");
            return false;
        }
    }

    return true;
}

static bool __pmq_validate_commit_record_weak_ordering(
        uint64_t sn_lo, uint64_t sn_hi, const char *name_lo, const char *name_hi)
{
    if (! _sn64_le(sn_lo, sn_hi))
    {
        pmq_perr_f("Integrity error in state.dat file: We expected %s <= %s"
                " but their values are %" PRIu64 " > %" PRIu64,
                name_lo, name_hi, sn_lo, sn_hi);
        return false;
    }
    return true;
}

template <typename T>
static bool _pmq_validate_commit_record_weak_ordering(
        T sn_lo, T sn_hi, const char *name_lo, const char *name_hi)
{
    return __pmq_validate_commit_record_weak_ordering(
            sn_lo.value(), sn_hi.value(), name_lo, name_hi);
}

#define pmq_validate_commit_record_weak_ordering(cr, lo, hi) \
    _pmq_validate_commit_record_weak_ordering((cr).lo, (cr).hi, #lo, #hi)

static bool pmq_inithelper_check_file_size(
        int fd, uint64_t expected_file_size, const char *what_file)
{
    pmq_assert(fd >= 0);
    struct stat st;
    if (fstat(fd, &st) == -1)
    {
        pmq_perr_ef(errno, "Failed to fstat() %s", what_file);
        return false;
    }
    if (! S_ISREG(st.st_mode))
    {
        pmq_perr_f("Internal error: Expected regular file");
        return false;
    }
    uint64_t actual_file_size = (uint64_t) st.st_size;
    if (actual_file_size != expected_file_size)
    {
        pmq_perr_f("%s has wrong size. Expected: %" PRIu64 ", got: %" PRIu64,
                what_file, expected_file_size, actual_file_size);
        return false;
    }
    return true;
}

static bool pmq_init_loadexisting(PMQ *q)
{
    // Open State File
    {
        q->statefile_fd = pmq_openat_regular_existing(q->basedir_fd.get(),
                "state.dat", O_RDWR);
        if (! q->statefile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open state.dat file");
            return false;
        }
    }

    Commit_Record commit_record;

    // Load commit record and store in commit_record
    {
        if (! pmq_pread_all(q->statefile_fd.get(),
                    Untyped_Slice(&commit_record, sizeof commit_record), 0, "state.dat"))
        {
            return false;
        }

        if (! pmq_validate_commit_record_weak_ordering(commit_record, cks_discard_csn, cks_csn))
            return false;
        if (! pmq_validate_commit_record_weak_ordering(commit_record, cks_ssn, wal_ssn))
            return false;
        if (!
pmq_validate_commit_record_weak_ordering(commit_record, cks_msn, wal_msn))
            return false;

        {
            uint64_t file_size = commit_record.chunkfile_size_bytes;
            if ((file_size % PMQ_CHUNK_SIZE) != 0)
            {
                pmq_perr_f(
                        "state.dat file contains invalid chunkfile size: "
                        "%" PRIu64 " which is not a multiple of the chunk size "
                        "(%" PRIu64 ")", file_size, PMQ_CHUNK_SIZE);
                return false;
            }
            uint64_t chunks_count = file_size / PMQ_CHUNK_SIZE;
            CSN csn_lo = commit_record.cks_discard_csn;
            CSN csn_hi = commit_record.cks_csn;
            if (csn_hi - csn_lo > chunks_count)
            {
                pmq_perr_f("state.dat cks_discard_csn=%" PRIu64 ", cks_csn=%" PRIu64,
                        csn_lo.value(), csn_hi.value());
                pmq_perr_f(
                        "state.dat file contains invalid chunk cursor positions: "
                        " Their distance exceeds the size of the chunks-file "
                        "(%" PRIu64 " > %" PRIu64 ").", csn_hi - csn_lo, chunks_count);
                return false;
            }
        }

        {
            uint64_t file_size = commit_record.slotsfile_size_bytes;
            if ((file_size % PMQ_SLOT_SIZE) != 0)
            {
                pmq_perr_f(
                        "state.dat file contains invalid slots-file size: "
                        "%" PRIu64 " which is not a multiple of the slot size "
                        "(%" PRIu64 ")", file_size, PMQ_SLOT_SIZE);
                return false;
            }
            uint64_t slots_count = file_size / PMQ_SLOT_SIZE;
            SSN ssn_lo = commit_record.cks_ssn;
            SSN ssn_hi = commit_record.wal_ssn;
            if (ssn_hi - ssn_lo > slots_count)
            {
                pmq_perr_f(
                        "state.dat file contains invalid slot cursor positions: "
                        " Their distance exceeds the size of the slots-file.");
                return false;
            }
        }
    }

    // TODO: Currently the slots-file and the in-memory slots-ringbuffer are the same size.
    // Later, make the slots-file smaller (just because it doesn't need to be very big)
    // and be very careful how to load it to memory.
    {
        q->slotsfile_size_bytes = commit_record.slotsfile_size_bytes;
        q->slotsfile_fd = pmq_openat_regular_existing(q->basedir_fd.get(),
                "wal.dat", O_RDWR);
        if (! q->slotsfile_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open slots file (wal.dat)");
            return false;
        }
        if (! pmq_inithelper_check_file_size(q->slotsfile_fd.get(),
                    q->slotsfile_size_bytes, "slots-file (wal.dat)"))
        {
            return false;
        }
        if (! pmq_pread_all(
                    q->slotsfile_fd.get(),
                    q->in_queue.slots.as_slice().untyped(), 0, "slots-file (wal.dat)"))
        {
            pmq_perr_f("Failed to read from slots file to in-memory slots ringbuffer");
            return false;
        }
    }

    // Load chunk store
    {
        Chunk_Store *cks = &q->chunk_store;
        pmq_init_chunk_store_size(cks, commit_record.chunkfile_size_bytes);
        cks->chunk_fd = pmq_openat_regular_existing(q->basedir_fd.get(),
                "chunks.dat", O_RDWR);
        if (! cks->chunk_fd.valid())
        {
            pmq_perr_ef(errno, "Failed to open chunks.dat file");
            return false;
        }
        if (! pmq_inithelper_check_file_size(cks->chunk_fd.get(),
                    cks->capacity_bytes, "chunk file (chunks.dat)"))
        {
            return false;
        }
    }

    // Initialize In_Queue_Cursors
    {
        In_Queue_Cursors ic;
        ic.msn = commit_record.wal_msn;
        ic.ssn_mem = commit_record.wal_ssn;
        ic.msn_disk = commit_record.cks_msn;
        ic.ssn_disk = commit_record.cks_ssn;
        q->pub_in_queue_cursors = ic;
    }

    // Initialize persister cursors
    {
        Persist_Cursors pc;
        pc.wal_ssn = commit_record.wal_ssn;
        pc.wal_msn = commit_record.wal_msn;
        pc.cks_csn = commit_record.cks_csn;
        pc.cks_msn = commit_record.cks_msn;
        pc.cks_ssn = commit_record.cks_ssn;
        pc.cks_discard_csn = commit_record.cks_discard_csn;
        q->pub_persist_cursors.store(pc);
    }

    return true;
}

static bool pmq_init(PMQ *q, const PMQ_Init_Params *params)
{
    q->basedir_path.set(params->basedir_path);
    const char *basedir_path = q->basedir_path.get().buffer;

    // Set up In_Queue
    // This is currently independent of any database state, so we can do it first.
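    // For illustration, the defaults chosen below amount to:
    //     512 Ki slots * 128 bytes/slot             = 64 MiB of ring buffer,
    //     watermark = slot_count / 2 = 256 Ki slots = 32 MiB of buffered data
    // before an enqueuer wakes up the persister.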
{ // TODO how to find proper size (slot count) for the In_Queue buffer? // For most use cases, we don't need extremely high bandwidth, but we // should think about making it tunable and come up with recommendations. // Or even allow it to be sized dynamically. q->in_queue.slot_count = 512 * 1024; // each slot is 128 bytes q->in_queue.size_bytes = q->in_queue.slot_count * PMQ_SLOT_SIZE; q->in_queue.slots_persist_watermark = q->in_queue.slot_count / 2; pmq_debug_f("in-queue size: %" PRIu64 " (%" PRIu64 " slots)", q->in_queue.size_bytes, q->in_queue.slot_count); // We could consider making an SHM file here to back the In_Queue memory, // making the In_Queue persist across application restarts. // This would allow to recover any message that was successfully enqueued // to the In_Queue (unless the machine was also restarted or crashed // before recovery). On the other hand, it would require elaborate // recovery code. if (! q->in_queue.mapping.create(NULL, q->in_queue.size_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) { pmq_perr_ef(errno, "Failed to mmap() queue memory"); return false; } PMQ_Slot *slots = (PMQ_Slot *) q->in_queue.mapping.get(); pmq_assert(slots); __pmq_assert_aligned(slots, 16); q->in_queue.slots.reset(Slice(slots, q->in_queue.slot_count)); } // Create or load the on-disk database q->basedir_fd = pmq_open_dir(basedir_path); if (! q->basedir_fd.valid()) { if (! (errno == ENOTDIR || errno == ENOENT)) { pmq_perr_ef(errno, "Failed to open queue directory at %s", basedir_path); return false; } pmq_msg_f("No queue directory present at %s", basedir_path); pmq_msg_f("Creating new queue directory at %s", basedir_path); if (! pmq_init_createnew(q, params)) { pmq_perr_f("Failed to create queue directory at %s", basedir_path); return false; } } else { pmq_msg_f("Loading existing queue from %s", basedir_path); if (! pmq_init_loadexisting(q)) { pmq_perr_f("Failed to load queue directory at %s", basedir_path); return false; } if (params->create_size != 0 && params->create_size != q->chunk_store.capacity_bytes) { pmq_warn_f("NOTE: Configured chunk store size is %" PRIu64 " bytes, which is different from the size of the existing" " chunk store: %" PRIu64 " bytes." " The chunk store size configuration is currently only" " considered when creating a new chunk store.", params->create_size, q->chunk_store.capacity_bytes); } } // Set up cursors q->enqueuer.in_queue_cursors = q->pub_in_queue_cursors; q->persister.persist_cursors = q->pub_persist_cursors.load(); // Initialize Chunk_Queue { Chunk_Queue *cq = &q->chunk_queue; cq->cq_csn = q->persister.persist_cursors.cks_csn; cq->cq_ssn = q->persister.persist_cursors.cks_ssn; cq->cq_msn = q->persister.persist_cursors.cks_msn; cq->chunks_alloc_slice.allocate(2); // only 2 chunk buffers { uint64_t map_size_bytes = cq->chunks_alloc_slice.capacity() * PMQ_CHUNK_SIZE; if (! cq->chunk_buffer_mapping.create(NULL, map_size_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) { pmq_perr_ef(errno, "Failed to mmap() %" PRIu64 " bytes for chunk buffer", map_size_bytes); return false; } } cq->chunks.reset(cq->chunks_alloc_slice.slice()); for (uint64_t i = 0; i < cq->chunks.slot_count(); i++) { Chunk_Buffer *cb = cq->chunks.get_slot_for(CSN(i)); cb->data = (char *) cq->chunk_buffer_mapping.get() + (i * PMQ_CHUNK_SIZE); // initialized later, anyway... cb->msn = MSN(0); cb->ssn = SSN(0); } // current chunk page buffer starts out empty. 
cq->msg_count = 0; // Since each message is at least 1 byte large, and requires a 2 byte // offset stored as well, we can have no more than this number of // messages (and thus offsets) in each chunk. cq->offsets.allocate(PMQ_CHUNK_SIZE / 3); // Set up for submitting messages to first chunk buffer in the Chunk_Queue // NOTE I think this will use the Chunk_Buffer identified by the value of cks_csn // after the queue was loaded. if (! pmq_begin_current_chunk_buffer(q)) return false; } { In_Queue_Cursors ic = q->enqueuer.in_queue_cursors; pmq_debug_f("in_queue_cursors.msn: %" PRIu64, ic.msn.value()); pmq_debug_f("in_queue_cursors.ssn_mem: %" PRIu64, ic.ssn_mem.value()); pmq_debug_f("in_queue_cursors.msn_disk: %" PRIu64, ic.msn_disk.value()); pmq_debug_f("in_queue_cursors.ssn_disk: %" PRIu64, ic.ssn_disk.value()); } { Chunk_Queue *cq = &q->chunk_queue; pmq_debug_f("chunk_queue.cq_csn: %" PRIu64, cq->cq_csn.value()); pmq_debug_f("chunk_queue.cq_msn: %" PRIu64, cq->cq_msn.value()); pmq_debug_f("chunk_queue.cq_ssn: %" PRIu64, cq->cq_ssn.value()); } { Persist_Cursors pc = q->persister.persist_cursors; pmq_debug_f("persister.wal_ssn: %" PRIu64, pc.wal_ssn.value()); pmq_debug_f("persister.wal_msn: %" PRIu64, pc.wal_msn.value()); pmq_debug_f("persister.cks_csn: %" PRIu64, pc.cks_csn.value()); pmq_debug_f("persister.cks_msn: %" PRIu64, pc.cks_msn.value()); pmq_debug_f("persister.cks_ssn: %" PRIu64, pc.cks_ssn.value()); pmq_debug_f("persister.cks_discard_csn: %" PRIu64, pc.cks_discard_csn.value()); } return true; } PMQ *pmq_create(const PMQ_Init_Params *params) { PMQ *q = new PMQ(); if (! q) return nullptr; if (! pmq_init(q, params)) { delete q; return nullptr; } return q; } // This function makes sure that all messages currently written are synced to disk. // When calling this function, all concurrent access (e.g. pmq_enqueue_msg()) // must have returned and no new ones may be done. void pmq_destroy(PMQ *q) { if (! pmq_sync(q)) { pmq_warn_f("Failed to sync the queue before shutting it down"); } delete q; } /* PMQ_Reader */ // To be able to read the newest messages, which may not be persisted to the // chunk store but only to the slot-file / in_queue, we need multiple read modes. // We keep track of where we're reading explicitly because we need to reset the // appropriate cursors whenever we're switching between the modes. enum PMQ_Read_Mode { PMQ_Read_Mode_Chunkstore, PMQ_Read_Mode_Slotsfile, }; // read state for reading from the slots file struct PMQ_Slots_Readstate { SSN ssn; }; // read state for reading from the chunk store struct PMQ_Chunks_Readstate { // tracks the current csn CSN cnk_csn; // tracks whether the chunk indicated by cnk_csn is loaded. bool cnk_loaded; // This data is extracted from chunk_page (only valid if cnk_loaded) MSN cnk_msn; uint64_t cnk_msgcount; Alloc_Slice cnk_buffer; Slice cnk_msgoffsets; // msg-offsets (a subrange inside the cnk_buffer) }; struct PMQ_Reader { PMQ *q; // to prevent races, the reader has its own copy of the Persist_Cursors. // They get updated only before reading a message. Persist_Cursors persist_cursors; // the MSN of the next message we're going to read. // Gets incremented each time pmq_read_msg() is called. MSN msn; PMQ_Read_Mode read_mode; // Place to store an error. This will prevent reading after an error. // Subsequent seeking (if successful) will clear the error. 
PMQ_Read_Result last_result; PMQ_Slots_Readstate slots_readstate; PMQ_Chunks_Readstate chunks_readstate; }; struct PMQ_Msg_Output { void *data; // size of the data buffer size_t data_size; // where the caller wants the size of the message to be written. size_t *size_out; PMQ_Msg_Output(void *data, size_t data_size, size_t *size_out) : data(data), data_size(data_size), size_out(size_out) {} }; static void pmq_reader_update_persist_cursors(PMQ_Reader *reader) { reader->persist_cursors = reader->q->pub_persist_cursors.load(); } static bool pmq_reader_validate_chunk_hdr(PMQ_Chunks_Readstate *ckread, Pointer hdr) { if (hdr->msgcount == 0) { pmq_perr_f("Read invalid chunk %" PRIu64 ": msgcount is 0.", ckread->cnk_csn.value()); return false; } uint64_t off_end = (uint64_t) hdr->msgoffsets_off + (hdr->msgcount + 1) * sizeof (uint16_t); if (off_end > PMQ_CHUNK_SIZE) { pmq_perr_f("Read invalid chunk %" PRIu64 ": msg-offsets array exceeds chunk size." " msgcount: %" PRIu64 ", msgoffsets_off: %" PRIu64, ckread->cnk_csn.value(), (uint64_t) hdr->msgcount, (uint64_t) hdr->msgoffsets_off); return false; } return true; } static PMQ_Read_Result pmq_load_chunk(PMQ_Reader *reader) { pmq_assert(reader->read_mode == PMQ_Read_Mode_Chunkstore); PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate; pmq_assert(!ckread->cnk_loaded); PMQ *q = reader->q; if (sn64_le(reader->persist_cursors.cks_csn, ckread->cnk_csn)) { return PMQ_Read_Result_EOF; } Chunk_Store *cks = &q->chunk_store; // load chunk uint64_t mask = pmq_mask_power_of_2(q->chunk_store.chunk_count()); uint64_t index = ckread->cnk_csn.value() & mask; uint64_t offset = index << PMQ_CHUNK_SHIFT; Untyped_Slice buffer_slice = ckread->cnk_buffer.untyped_slice(); pmq_assert(buffer_slice.size() == PMQ_CHUNK_SIZE); if (! pmq_pread_all( cks->chunk_fd.get(), buffer_slice, offset, "chunk in chunk file")) { return PMQ_Read_Result_IO_Error; } Pointer hdr = (PMQ_Chunk_Hdr *) buffer_slice.data(); // first check if the chunk is still supposed to be there -- it might have // been overwritten by the next chunk { pmq_reader_update_persist_cursors(reader); if (sn64_lt(ckread->cnk_csn, reader->persist_cursors.cks_discard_csn)) { pmq_debug_f("LOST SYNC: ckread->cnk_csn=%" PRIu64 ", reader->persist_cursors.cks_discard_csn=%" PRIu64, ckread->cnk_csn.value(), reader->persist_cursors.cks_discard_csn.value()); return PMQ_Read_Result_Out_Of_Bounds; } } if (! pmq_reader_validate_chunk_hdr(ckread, hdr)) { return PMQ_Read_Result_Integrity_Error; } // Initial validation of the loaded chunk completed. // Set variables and return success. ckread->cnk_msn = hdr->msn; ckread->cnk_msgcount = hdr->msgcount; ckread->cnk_msgoffsets = Slice( (uint16_t *) ((char *) ckread->cnk_buffer.data() + hdr->msgoffsets_off), hdr->msgcount + 1); ckread->cnk_loaded = true; // Set the MSN to this chunk's msn too. 
reader->msn = hdr->msn; return PMQ_Read_Result_Success; } static void pmq_reset_to_specific_chunk(PMQ_Reader *reader, CSN csn) { PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate; ckread->cnk_loaded = false; ckread->cnk_csn = csn; } static PMQ_Read_Result pmq_reset_to_specific_chunk_and_load( PMQ_Reader *reader, CSN csn) { pmq_reset_to_specific_chunk(reader, csn); return pmq_load_chunk(reader); } static void pmq_reader_copy_chunk_header(PMQ_Chunks_Readstate *ckread, PMQ_Chunk_Hdr *out) { *out = *(PMQ_Chunk_Hdr *) ckread->cnk_buffer.data(); } static bool pmq_check_chunk_msns(CSN csn_lo, CSN csn_hi, const PMQ_Chunk_Hdr *hdr_lo, const PMQ_Chunk_Hdr *hdr_hi) { if (sn64_lt(csn_hi, csn_lo)) return pmq_check_chunk_msns(csn_hi, csn_lo, hdr_hi, hdr_lo); MSN cnk_lo_last_msn = hdr_lo->msn + hdr_lo->msgcount; MSN cnk_hi_first_msn = hdr_hi->msn; if (csn_lo + 1 == csn_hi) { if (cnk_lo_last_msn != cnk_hi_first_msn) { pmq_perr_f("Integrity error while reading chunks: MSN %" PRIu64 " was expected in the chunk following chunk %" PRIu64 " but found %" PRIu64, cnk_lo_last_msn.value(), csn_lo.value(), cnk_hi_first_msn.value()); return false; } } else { // maybe we should check sn64_lt() instead of sn64_le(), because // chunks must contain at least 1 message, at least currently. if (! sn64_le(cnk_lo_last_msn, cnk_hi_first_msn)) { pmq_perr_f("Integrity error while reading chunks: Chunk SN %" PRIu64 " < %" PRIu64 " but these chunks have low / high MSNs " "%" PRIu64 " >= %" PRIu64, csn_lo.value(), csn_hi.value(), cnk_lo_last_msn.value(), cnk_hi_first_msn.value()); return false; } } return true; } static PMQ_Read_Result pmq_bsearch_msg(PMQ_Reader *reader, MSN msn, CSN csn_lo, CSN csn_hi) { if (reader->read_mode != PMQ_Read_Mode_Chunkstore) { reader->read_mode = PMQ_Read_Mode_Chunkstore; reader->chunks_readstate.cnk_loaded = false; } PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate; bool hdr_valid = false; PMQ_Chunk_Hdr hdr; CSN hdr_csn; for (;;) { CSN csn = csn_lo + (csn_hi - csn_lo) / 2; PMQ_Read_Result readres = pmq_reset_to_specific_chunk_and_load(reader, csn); if (readres == PMQ_Read_Result_Out_Of_Bounds) { // Assuming that csn_lo was valid when we were called, // we now have a situation where the chunk was concurrently discarded. if (csn == csn_lo) { // Already at the final recursion (csn_lo + 1 == csn_hi). Search // space is now empty. return PMQ_Read_Result_Out_Of_Bounds; } // shrink the search space, adapt lower boundary to account for the concurrently discarded data. csn_lo = csn + 1; } else if (readres == PMQ_Read_Result_EOF) { // Could this happen? I believe not. We're assuming that at the start, // csn_lo == csn_hi or csn_hi - 1 was valid. assert(0); } else if (readres != PMQ_Read_Result_Success) { return readres; } else { if (hdr_valid) { PMQ_Chunk_Hdr old_hdr = hdr; CSN old_csn = hdr_csn; pmq_reader_copy_chunk_header(ckread, &hdr); hdr_csn = csn; if (! pmq_check_chunk_msns(csn, old_csn, &hdr, &old_hdr)) { return PMQ_Read_Result_Integrity_Error; } } else { pmq_reader_copy_chunk_header(ckread, &hdr); hdr_csn = csn; hdr_valid = true; } PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate; if (sn64_lt(msn, ckread->cnk_msn)) { if (csn == csn_lo) // already final iteration return PMQ_Read_Result_Out_Of_Bounds; csn_hi = csn; } else if (sn64_ge(msn, ckread->cnk_msn + ckread->cnk_msgcount)) { if (csn == csn_lo) // already final iteration return PMQ_Read_Result_Out_Of_Bounds; csn_lo = csn + 1; } else { // message inside this block. 
return PMQ_Read_Result_Success; } } } } static PMQ_Read_Result pmq_reader_seek_to_msg_chunkstore(PMQ_Reader *reader, MSN msn) { Persist_Cursors pc = reader->persist_cursors; pmq_assert(sn64_le(pc.cks_discard_csn, pc.cks_csn)); if (pc.cks_discard_csn == pc.cks_csn) { // The store is empty. // Since we already detected that msn is older than pc.cks_msn, we return // Out_Of_Bounds, not EOF. return PMQ_Read_Result_Out_Of_Bounds; } CSN csn_lo = pc.cks_discard_csn; CSN csn_hi = pc.cks_csn - 1; PMQ_Read_Result result = pmq_bsearch_msg(reader, msn, csn_lo, csn_hi); if (result != PMQ_Read_Result_Success) return result; // Currently setting the msn only after the appropriate chunk was found and // loaded successfully. We might want to change this later. reader->msn = msn; return PMQ_Read_Result_Success; } struct PMQ_Slot_Header_Read_Result { bool is_leader_slot; uint16_t msgsize; uint16_t nslots_req; }; // Helper for function that read slots. // NOTE: Enqueuer lock must be held! static PMQ_Read_Result pmq_read_slot_header(PMQ *q, SSN ssn, PMQ_Slot_Header_Read_Result *out) { //XXX this code is copied and adapted from pmq_compact() const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn); // Extract message size from slot header. out->is_leader_slot = (slot->flags & PMQ_SLOT_LEADER_MASK) != 0; out->msgsize = slot->msgsize; out->nslots_req = (slot->msgsize + PMQ_SLOT_SPACE - 1) / PMQ_SLOT_SPACE; // TODO validate msgsize field, does it make sense? return PMQ_Read_Result_Success; } // Seek message in the slots file. // Currently this requires locking the enqueuer and a linear scan. // We should look for ways to improve. static PMQ_Read_Result pmq_reader_seek_to_msg_slotsfile(PMQ_Reader *reader, MSN msn) { Persist_Cursors pc = reader->persist_cursors; assert(sn64_inrange(msn, pc.cks_msn, pc.wal_msn)); // checked in caller std::lock_guard lock(reader->q->enqueue_mutex); // To prevent races, we need to check again using the enqueuer's cursors // that the MSN that we're looking for is still in the In_Queue. In_Queue_Cursors *ic = &reader->q->enqueuer.in_queue_cursors; if (sn64_inrange(msn, ic->msn_disk, ic->msn)) { if (sn64_inrange(pc.wal_msn, msn, ic->msn)) // this is almost guaranteed but there is a race that should be // impossible in practice (requires ic->msn to wrap around between // msn and pc.wal_msn). { MSN msn_cur = ic->msn_disk; SSN ssn_cur = ic->ssn_disk; while (msn_cur != msn) { if (sn64_ge(ssn_cur, pc.wal_ssn)) { pmq_perr_f("Integrity Error: Reached end of persisted region in slotsfile " "but did not encounter msn=%" PRIu64, msn.value()); return PMQ_Read_Result_Integrity_Error; } PMQ_Slot_Header_Read_Result slot_read_result; if (PMQ_Read_Result readres = pmq_read_slot_header(reader->q, ssn_cur, &slot_read_result); readres != PMQ_Read_Result_Success) { return readres; } if (! slot_read_result.is_leader_slot) { // Earlier there was an assert() here instead of an integrity check, // assuming that RAM should never be corrupted. However, the RAM might // be filled from disk, and we currently don't validate the data after // loading. Thus we now consider slot memory just as corruptible as // disk data. 
pmq_perr_f("Integrity Error: slot %" PRIu64 " is not a leader slot.", ssn_cur.value()); return PMQ_Read_Result_Integrity_Error; } if (pc.wal_ssn - ssn_cur < slot_read_result.nslots_req) { pmq_perr_f("Integrity Error: forwarding %d slots through the slots file" " would skip over persisted region", (int) slot_read_result.nslots_req); pmq_perr_f("current msn=%" PRIu64 ", ssn=%" PRIu64 ", last valid slot is %" PRIu64, msn_cur.value(), ssn_cur.value(), pc.wal_msn.value()); return PMQ_Read_Result_Integrity_Error; } ssn_cur += slot_read_result.nslots_req; msn_cur += 1; } reader->read_mode = PMQ_Read_Mode_Slotsfile; reader->slots_readstate.ssn = ssn_cur; return PMQ_Read_Result_Success; } } // if we missed the window (race condition) we can expect to find the message in the chunk store. return pmq_reader_seek_to_msg_chunkstore(reader, msn); } static PMQ_Read_Result pmq_reader_seek_to_msg_impl_real(PMQ_Reader *reader, MSN msn) { pmq_reader_update_persist_cursors(reader); Persist_Cursors pc = reader->persist_cursors; if (sn64_ge(msn, pc.cks_msn)) { if (sn64_gt(msn, pc.wal_msn)) { return PMQ_Read_Result_Out_Of_Bounds; } return pmq_reader_seek_to_msg_slotsfile(reader, msn); } return pmq_reader_seek_to_msg_chunkstore(reader, msn); } static PMQ_Read_Result pmq_reader_seek_to_msg_impl(PMQ_Reader *reader, MSN msn) { PMQ_Read_Result result = pmq_reader_seek_to_msg_impl_real(reader, msn); reader->last_result = result; if (result == PMQ_Read_Result_Success) { reader->msn = msn; } else { pmq_assert(result != PMQ_Read_Result_EOF); // seeking shouldn't return EOF } return result; } PMQ_Read_Result pmq_reader_seek_to_msg(PMQ_Reader *reader, uint64_t msn_value) { MSN msn = MSN(msn_value); return pmq_reader_seek_to_msg_impl(reader, msn); } PMQ_Read_Result pmq_reader_seek_to_current(PMQ_Reader *reader) { pmq_reader_update_persist_cursors(reader); MSN msn = reader->persist_cursors.wal_msn; pmq_debug_f("Try seeking to MSN %" PRIu64, msn.value()); return pmq_reader_seek_to_msg_impl(reader, msn); } PMQ_Read_Result pmq_reader_seek_to_csn_impl(PMQ_Reader *reader, CSN csn) { Persist_Cursors *pc = &reader->persist_cursors; if (uint64_t chunks_in_store = pc->cks_csn - pc->cks_discard_csn; csn - pc->cks_discard_csn >= chunks_in_store) { if (csn == pc->cks_csn) { // While we cannot know the msn from a chunk (there are no chunks) we // can take the cks_msn instead. // This should return EOF but the reader should positioned correctly. return pmq_reader_seek_to_msg_impl(reader, pc->cks_msn); } return PMQ_Read_Result_Out_Of_Bounds; } // Otherwise, let's load a chunk and read the oldest msn from there. // The reader state management should be cleaned up. It's not very clear // what all the members mean and how they need to be mutated. reader->read_mode = PMQ_Read_Mode_Chunkstore; PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate; PMQ_Read_Result result = pmq_reset_to_specific_chunk_and_load(reader, csn); if (result != PMQ_Read_Result_Success) { // EOF should not happen because of our prior checks. // I would like to use an assert but at least in theory there is the // chance of a wraparound happening concurrently. if (result == PMQ_Read_Result_EOF) { // EOF would be misleading since we are not "positioned". Not sure what to do currently. 
PMQ_Read_Result pmq_reader_seek_to_csn_impl(PMQ_Reader *reader, CSN csn)
{
    Persist_Cursors *pc = &reader->persist_cursors;
    if (uint64_t chunks_in_store = pc->cks_csn - pc->cks_discard_csn;
        csn - pc->cks_discard_csn >= chunks_in_store)
    {
        if (csn == pc->cks_csn) {
            // We cannot read an msn from a chunk here (there are no chunks),
            // so we take cks_msn instead.
            // Strictly speaking this is an EOF situation, but the reader
            // should still be positioned correctly.
            return pmq_reader_seek_to_msg_impl(reader, pc->cks_msn);
        }
        return PMQ_Read_Result_Out_Of_Bounds;
    }
    // Otherwise, let's load a chunk and read the oldest msn from there.
    // The reader state management should be cleaned up. It's not very clear
    // what all the members mean and how they need to be mutated.
    reader->read_mode = PMQ_Read_Mode_Chunkstore;
    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;
    PMQ_Read_Result result = pmq_reset_to_specific_chunk_and_load(reader, csn);
    if (result != PMQ_Read_Result_Success) {
        // EOF should not happen because of our prior checks.
        // I would like to use an assert, but at least in theory there is the
        // chance of a wraparound happening concurrently.
        if (result == PMQ_Read_Result_EOF) {
            // EOF would be misleading since we are not "positioned". Not sure
            // what to do currently.
            result = PMQ_Read_Result_Out_Of_Bounds;
        }
        return result;
    }
    reader->msn = ckread->cnk_msn;
    return PMQ_Read_Result_Success;
}

PMQ_Read_Result pmq_reader_seek_to_oldest(PMQ_Reader *reader)
{
    pmq_reader_update_persist_cursors(reader);
    CSN csn = reader->persist_cursors.cks_discard_csn;
    pmq_debug_f("Try seeking to CSN %" PRIu64, csn.value());
    reader->last_result = pmq_reader_seek_to_csn_impl(reader, csn);
    if (reader->last_result == PMQ_Read_Result_Success) {
        pmq_debug_f("Succeeded in seeking to CSN %" PRIu64 ". MSN is %" PRIu64,
                    csn.value(), reader->msn.value());
    } else {
        pmq_debug_f("Seeking to CSN failed");
    }
    return reader->last_result;
}

static PMQ_Read_Result pmq_read_msg_slotsfile(PMQ_Reader *reader, PMQ_Msg_Output output);

// Attempt to read the message given by reader->msn from the chunk store.
// We may have to switch to reading from the slotsfile if we detect an EOF.
static PMQ_Read_Result pmq_read_msg_chunkstore(PMQ_Reader *reader, PMQ_Msg_Output output)
{
    pmq_assert(reader->read_mode == PMQ_Read_Mode_Chunkstore);
    PMQ_Chunks_Readstate *ckread = &reader->chunks_readstate;
    if (! ckread->cnk_loaded) {
        PMQ_Read_Result readres = pmq_load_chunk(reader);
        if (readres == PMQ_Read_Result_EOF) {
            pmq_debug_f("Reader switches to slots file");
            // Switch to slot-file read mode
            reader->read_mode = PMQ_Read_Mode_Slotsfile;
            reader->slots_readstate.ssn = reader->persist_cursors.cks_ssn;
            return pmq_read_msg_slotsfile(reader, output);
        }
        if (readres != PMQ_Read_Result_Success) {
            return readres;
        }
        pmq_assert(ckread->cnk_loaded);
    } else if (reader->msn - ckread->cnk_msn == ckread->cnk_msgcount) {
        // Load next chunk
        CSN csn_old = ckread->cnk_csn;
        PMQ_Chunk_Hdr hdr_old;
        pmq_reader_copy_chunk_header(ckread, &hdr_old);
        PMQ_Read_Result readres = pmq_reset_to_specific_chunk_and_load(reader, ckread->cnk_csn + 1);
        if (readres != PMQ_Read_Result_Success)
            return readres;
        CSN csn_new = ckread->cnk_csn;
        PMQ_Chunk_Hdr hdr_new;
        pmq_reader_copy_chunk_header(ckread, &hdr_new);
        if (! pmq_check_chunk_msns(csn_old, csn_new, &hdr_old, &hdr_new)) {
            return PMQ_Read_Result_Integrity_Error;
        }
    }
    // Chunk is present
    pmq_assert(sn64_le(ckread->cnk_msn, reader->msn));
    pmq_assert(sn64_lt(reader->msn, ckread->cnk_msn + ckread->cnk_msgcount));
    uint64_t msgindex = reader->msn - ckread->cnk_msn;
    uint64_t msgoff = ckread->cnk_msgoffsets.at(msgindex);
    uint64_t nextoff = ckread->cnk_msgoffsets.at(msgindex + 1);
    uint64_t msgsize = nextoff - msgoff;
    if (msgoff >= nextoff) {
        pmq_perr_f("Invalid offsets in chunk %" PRIu64 ": Offset #%u and #%u are %u >= %u",
                   ckread->cnk_csn.value(), (unsigned) msgindex, (unsigned) msgindex + 1,
                   (unsigned) msgoff, (unsigned) nextoff);
        return PMQ_Read_Result_Integrity_Error;
    }
    if (nextoff > PMQ_CHUNK_SIZE) {
        pmq_perr_f("Invalid offset in chunk %" PRIu64 ": "
                   "Offset #%u = %u exceeds chunk size",
                   ckread->cnk_csn.value(), (unsigned) msgindex + 1, (unsigned) nextoff);
        return PMQ_Read_Result_Integrity_Error;
    }
    *output.size_out = msgsize;
    if (msgsize > output.data_size) {
        // Keep the behavior consistent with pmq_read_msg_slotsfile(): report
        // the required size via size_out and do not consume the message.
        return PMQ_Read_Result_Buffer_Too_Small;
    }
    Untyped_Slice slice = ckread->cnk_buffer.untyped_slice().offset_bytes(msgoff);
    copy_from_slice(output.data, slice, msgsize);
    reader->msn += 1;
    return PMQ_Read_Result_Success;
}
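// Worked example for the offset lookup above (all numbers are made up for
// illustration): suppose the loaded chunk has cnk_msn = 1000, cnk_msgcount = 3
// and a message-offsets array of cnk_msgcount + 1 = 4 entries, say
// {64, 150, 300, 451}. Reading msn = 1001 then gives
//
//     msgindex = 1001 - 1000 = 1
//     msgoff   = offsets[1]  = 150
//     nextoff  = offsets[2]  = 300
//     msgsize  = 300 - 150   = 150
//
// i.e. message 1001 occupies bytes [150, 300) of the chunk buffer. Every
// offset must stay within PMQ_CHUNK_SIZE, which is what the integrity checks
// above enforce.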
// Attempt to read the message given by reader->msn from the slots file.
// We may have to switch to reading from the chunk store if we detect that
// we've lost sync -- this may happen if the message we want to read has
// already disappeared (was overwritten) from the slotsfile.
static PMQ_Read_Result pmq_read_msg_slotsfile(PMQ_Reader *reader, PMQ_Msg_Output output)
{
    pmq_assert(reader->read_mode == PMQ_Read_Mode_Slotsfile);
    PMQ *q = reader->q;
    PMQ_Slots_Readstate *slread = &reader->slots_readstate;
    SSN ssn = slread->ssn;
    if (ssn == reader->persist_cursors.wal_ssn) {
        return PMQ_Read_Result_EOF;
    }
    if (sn64_lt(reader->persist_cursors.wal_ssn, ssn)) {
        pmq_debug_f("sn64_lt(reader->persist_cursors.wal_ssn, ssn): wal_ssn=%" PRIu64 ", ssn=%" PRIu64,
                    reader->persist_cursors.wal_ssn.value(), ssn.value());
        // Should we even allow this to happen?
        return PMQ_Read_Result_Out_Of_Bounds;
    }
    // NOTE: We need to make sure that the ringbuffer slots we read from do not
    // get overwritten concurrently by newly enqueued messages.
    // For now we simply lock the in_queue. We may try to optimize this later.
    // One possible approach could be to check that the slots that we read from
    // are valid -- check it both before and after we read the slots.
    // !!! IDEA !!! instead of locking the in-queue, we could lock the persister.
    // The reason why this should work is that data from the in-queue only gets
    // overwritten after having been persisted.
    // On the other hand, locking the persister might block for an unreasonable
    // amount of time.
    std::lock_guard lock(q->enqueue_mutex);
    // Check that the message we're looking for is still there.
    if (sn64_gt(q->enqueuer.in_queue_cursors.ssn_disk, ssn)) {
        // The slot was already overwritten before we took the lock.
        // pmq_reader_seek_to_msg() should find the message in the chunk store.
        PMQ_Read_Result readres = pmq_reader_seek_to_msg_impl(reader, reader->msn);
        if (readres != PMQ_Read_Result_Success)
            return readres;
        return pmq_read_msg_chunkstore(reader, output);
    }
    PMQ_Slot_Header_Read_Result slot_read_result;
    {
        PMQ_Read_Result readres = pmq_read_slot_header(q, ssn, &slot_read_result);
        if (readres != PMQ_Read_Result_Success)
            return readres;
    }
    if (! slot_read_result.is_leader_slot) {
        // Earlier there was an assert() here instead of an integrity check,
        // assuming that RAM should never be corrupted. However, the RAM might
        // be filled from disk, and we currently don't validate the data after
        // loading. Thus we now consider slot memory just as corruptible as
        // disk data.
        pmq_perr_f("Integrity Error: slot %" PRIu64 " is not a leader slot.", ssn.value());
        return PMQ_Read_Result_Integrity_Error;
    }
    *output.size_out = slot_read_result.msgsize;
    if (slot_read_result.msgsize > output.data_size)
        return PMQ_Read_Result_Buffer_Too_Small;
    if (reader->persist_cursors.wal_ssn - ssn < slot_read_result.nslots_req) {
        pmq_perr_f("Integrity error: Read inconsistent msgsize from slot");
        return PMQ_Read_Result_Integrity_Error;
    }
    // Copy one message.
    {
        char *dst = (char *) output.data;
        uint64_t remain = slot_read_result.msgsize;
        while (remain >= PMQ_SLOT_SPACE) {
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            const char *src = __pmq_assume_aligned<16>(slot->payload);
            memcpy(dst, src, PMQ_SLOT_SPACE);
            ++ssn;
            dst += PMQ_SLOT_SPACE;
            remain -= PMQ_SLOT_SPACE;
        }
        if (remain) {
            const PMQ_Slot *slot = q->in_queue.slots.get_slot_for(ssn);
            const char *src = __pmq_assume_aligned<16>(slot->payload);
            memcpy(dst, src, remain);
            ++ssn;
        }
    }
    slread->ssn = ssn;
    reader->msn += 1;
    return PMQ_Read_Result_Success;
}
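// Worked example for the copy loop above (the message size is made up for
// illustration): a 250-byte message was stored across
// nslots_req = (250 + 112 - 1) / 112 = 3 consecutive slots. The loop copies
// 112 bytes from the leader slot and 112 bytes from the second slot, the
// trailing branch copies the remaining 26 bytes from the third slot, and
// afterwards slread->ssn has advanced by 3 and reader->msn by 1.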
PMQ_Read_Result pmq_read_msg(PMQ_Reader *reader, void *data, size_t size, size_t *out_size)
{
    if (reader->last_result != PMQ_Read_Result_Success &&
        reader->last_result != PMQ_Read_Result_EOF)
    {
        return reader->last_result; // need to seek to clear the error!
    }
    PMQ_Msg_Output output(data, size, out_size);
    pmq_reader_update_persist_cursors(reader);
    if (sn64_ge(reader->msn, reader->persist_cursors.wal_msn)) {
        if (reader->msn == reader->persist_cursors.wal_msn) {
            //pmq_debug_f("Reader reaches EOF at msn=%" PRIu64, reader->msn.value());
            return PMQ_Read_Result_EOF;
        }
        return PMQ_Read_Result_Out_Of_Bounds;
    }
    switch (reader->read_mode) {
    case PMQ_Read_Mode_Chunkstore:
        pmq_debug_f("Read message %" PRIu64 " from chunk store.", reader->msn.value());
        return pmq_read_msg_chunkstore(reader, output);
    case PMQ_Read_Mode_Slotsfile:
        pmq_debug_f("Read message %" PRIu64 " from slots file.", reader->msn.value());
        return pmq_read_msg_slotsfile(reader, output);
    default:
        // shouldn't happen.
        pmq_assert(0);
        abort();
    }
}

PMQ_Reader *pmq_reader_create(PMQ *q)
{
    // Use nothrow new so that an allocation failure actually reaches the
    // check below instead of throwing.
    PMQ_Reader *reader = new (std::nothrow) PMQ_Reader;
    if (! reader) {
        pmq_perr_f("Failed to allocate reader!");
        return nullptr;
    }
    reader->q = q;
    reader->msn = MSN(0);
    reader->read_mode = PMQ_Read_Mode_Chunkstore;
    reader->last_result = PMQ_Read_Result_Success;
    reader->chunks_readstate.cnk_csn = CSN(0); // for now
    reader->chunks_readstate.cnk_buffer.allocate(PMQ_CHUNK_SIZE);
    reader->chunks_readstate.cnk_loaded = false;
    reader->chunks_readstate.cnk_msn = MSN(0);
    reader->chunks_readstate.cnk_msgcount = 0;
    reader->slots_readstate.ssn = SSN(0);
    return reader;
}

void pmq_reader_destroy(PMQ_Reader *reader)
{
    // TODO?
    delete reader;
}

PMQ *pmq_reader_get_pmq(PMQ_Reader *reader)
{
    return reader->q;
}

uint64_t pmq_reader_get_current_msn(PMQ_Reader *reader)
{
    return reader->msn.value();
}

uint64_t pmq_reader_find_old_msn(PMQ_Reader *reader)
{
    for (uint64_t distance = 1; ; distance = (distance ? 2 * distance : 1)) {
        pmq_reader_update_persist_cursors(reader);
        Persist_Cursors persist_cursors = reader->persist_cursors;
        CSN csn = persist_cursors.cks_discard_csn + distance;
        if (sn64_ge(csn, persist_cursors.cks_csn)) {
            return persist_cursors.cks_msn.value();
        }
        // possible optimization: don't load the whole chunk but only the header
        PMQ_Read_Result readres = pmq_reset_to_specific_chunk_and_load(reader, csn);
        if (readres == PMQ_Read_Result_Success) {
            return reader->msn.value();
        }
    }
}

PMQ_Persist_Info pmq_reader_get_persist_info(PMQ_Reader *reader)
{
    return pmq_get_persist_info(reader->q);
}

bool pmq_reader_eof(PMQ_Reader *reader)
{
    // This is a bit wacky -- we read the pub_persist_cursors, which requires a
    // mutex lock, because we do not know from the current context whether we
    // could just access the Persister State's private persist_cursors.
    // NOTE: We expect that wal_msn is always kept "in front" of cks_msn (the
    // chunk-store MSN). Even when the wal does not have any additional slots
    // -- in that case, we expect wal_msn == cks_msn.
    pmq_reader_update_persist_cursors(reader);
    MSN wal_msn = reader->persist_cursors.wal_msn;
    return sn64_ge(reader->msn, wal_msn);
}
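/* Illustrative end-to-end usage sketch (not compiled as part of the queue; the
 * function name pmq_example_drain, the 4096-byte buffer and the
 * handle_message() callback are assumptions made up for this example). It
 * shows how the reader API above is intended to fit together:
 *
 *     static void pmq_example_drain(PMQ *q)
 *     {
 *         PMQ_Reader *reader = pmq_reader_create(q);
 *         if (! reader)
 *             return;
 *         if (pmq_reader_seek_to_oldest(reader) != PMQ_Read_Result_Success) {
 *             pmq_reader_destroy(reader);
 *             return;
 *         }
 *         char buf[4096];
 *         for (;;) {
 *             size_t msgsize = 0;
 *             PMQ_Read_Result res = pmq_read_msg(reader, buf, sizeof buf, &msgsize);
 *             if (res == PMQ_Read_Result_EOF)
 *                 break; // caught up with the enqueuers
 *             if (res == PMQ_Read_Result_Buffer_Too_Small)
 *                 break; // msgsize holds the required size; retry with a larger buffer
 *             if (res != PMQ_Read_Result_Success)
 *                 break; // on errors, the reader may need a new seek before further reads
 *             handle_message(buf, msgsize); // hypothetical consumer callback
 *         }
 *         pmq_reader_destroy(reader);
 *     }
 */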