diff --git a/crates/ironcache-store/src/kvobj.rs b/crates/ironcache-store/src/kvobj.rs index e5faa63..c312223 100644 --- a/crates/ironcache-store/src/kvobj.rs +++ b/crates/ironcache-store/src/kvobj.rs @@ -1994,6 +1994,622 @@ impl KvObj { } } +// =========================================================================== +// The SINGLE-ALLOCATION table entry (memory Round 3, OBJECT_LAYOUT.md #111). +// +// The per-shard table now stores ONE `Entry` per key, NOT a `HashMap` key/value +// pair. The previous representation was `hashbrown::HashMap<Box<[u8]>, KvObj>`: +// THREE allocations per string key (the map's owned key `Box`, the `KvObj`'s own +// duplicate key `Box`, and the value `Box`), carried in an 80-byte map slot. +// +// `Entry` collapses a STRING key to ONE allocation (`Entry::Str`): a single blob +// that holds, contiguously, a small packed header, the optional TTL deadline, the +// key length, the key bytes, and the value bytes (the value is INLINE in the same +// blob, tighter than redis's two-allocation kvobj). The table itself stores ONLY +// the `Entry` and derives the key from inside it (the `Entry::key` accessor), so there +// is no separate map key allocation and no key duplication. The slot is two machine +// words, 16 bytes on a 64-bit target (the `Box<[u8]>` fat pointer is two words and the +// enum is sized to its larger arm, see the slot-size note on [`Entry`]). +// +// A COLLECTION key (`Entry::Coll`) is a `Box<CollEntry>`: a small header + the key +// + the existing boxed collection value. Collections already heap-allocate their +// contents, so one extra small box is negligible and keeps the `Entry` enum small. +// +// The blob is parsed with SAFE slicing only (`get(..)` / `try_into` / +// `u64::from_le_bytes`); there is NO `unsafe` (the crate is `#![forbid(unsafe_code)]`).
+// =========================================================================== + +/// The byte offset of the key-length field is computed from the header + optional +/// TTL; these constants name the fixed-size pieces of the [`Entry::Str`] blob. +mod blob { + /// Header: `[data_type:u8][encoding:u8][flags:u8]`. + pub const HEADER_LEN: usize = 3; + /// The TTL deadline (u64 little-endian milliseconds), present iff the + /// [`FLAG_HAS_TTL`] bit is set in the flags byte. + pub const TTL_LEN: usize = 8; + /// The key-length prefix (u32 little-endian), always present. + pub const KEYLEN_LEN: usize = 4; + /// The flags byte bit: a TTL deadline u64 follows the header. + pub const FLAG_HAS_TTL: u8 = 0b0000_0001; +} + +/// Encode a [`DataType`] as the blob header's first byte. +fn data_type_to_u8(t: DataType) -> u8 { + match t { + DataType::String => 0, + DataType::List => 1, + DataType::Set => 2, + DataType::Hash => 3, + DataType::ZSet => 4, + DataType::Stream => 5, + } +} + +/// Decode the blob header's first byte back into a [`DataType`]. +fn data_type_from_u8(b: u8) -> DataType { + match b { + 1 => DataType::List, + 2 => DataType::Set, + 3 => DataType::Hash, + 4 => DataType::ZSet, + 5 => DataType::Stream, + // 0 (and any unexpected byte, defensively) is String. + _ => DataType::String, + } +} + +/// Encode an [`Encoding`] as the blob header's second byte. +fn encoding_to_u8(e: Encoding) -> u8 { + match e { + Encoding::Int => 0, + Encoding::EmbStr => 1, + Encoding::Raw => 2, + Encoding::ListPack => 3, + Encoding::QuickList => 4, + Encoding::IntSet => 5, + Encoding::HashTable => 6, + Encoding::SkipList => 7, + } +} + +/// Decode the blob header's second byte back into an [`Encoding`]. +fn encoding_from_u8(b: u8) -> Encoding { + match b { + 0 => Encoding::Int, + 2 => Encoding::Raw, + 3 => Encoding::ListPack, + 4 => Encoding::QuickList, + 5 => Encoding::IntSet, + 6 => Encoding::HashTable, + 7 => Encoding::SkipList, + // 1 (and any unexpected byte, defensively) is EmbStr. 
+ _ => Encoding::EmbStr, + } +} + +/// The collection value held inside a [`CollEntry`]. The four collection structs +/// (`ListVal`/`HashVal`/`SetVal`/`ZSetVal`) are owned here UNCHANGED; the `CollEntry` +/// `Box` provides the single indirection (memory Round 1 boxed the values inside the +/// old `ValueRepr`; Round 3 moves that box up to the whole collection entry). +#[derive(Debug, Clone)] +pub enum CollVal { + /// A LIST value (PR-5). + List(ListVal), + /// A HASH value (PR-6). + Hash(HashVal), + /// A SET value (PR-7). + Set(SetVal), + /// A ZSET value (PR-8). + ZSet(ZSetVal), +} + +impl CollVal { + /// The data type this collection reports. + #[must_use] + pub fn data_type(&self) -> DataType { + match self { + CollVal::List(_) => DataType::List, + CollVal::Hash(_) => DataType::Hash, + CollVal::Set(_) => DataType::Set, + CollVal::ZSet(_) => DataType::ZSet, + } + } + + /// The encoding this collection reports (a pure function of its active repr). + #[must_use] + pub fn encoding(&self) -> Encoding { + match self { + CollVal::List(l) => l.encoding(), + CollVal::Hash(h) => h.encoding(), + CollVal::Set(s) => s.encoding(), + CollVal::ZSet(z) => z.encoding(), + } + } + + /// The sum of element byte lengths (the value side of accounting). + #[must_use] + pub fn element_bytes(&self) -> usize { + match self { + CollVal::List(l) => l.element_bytes(), + CollVal::Hash(h) => h.element_bytes(), + CollVal::Set(s) => s.element_bytes(), + CollVal::ZSet(z) => z.element_bytes(), + } + } + + /// The element count. + #[must_use] + pub fn len(&self) -> usize { + match self { + CollVal::List(l) => l.len(), + CollVal::Hash(h) => h.len(), + CollVal::Set(s) => s.len(), + CollVal::ZSet(z) => z.len(), + } + } + + /// Whether the collection holds zero elements. + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +/// A COLLECTION table entry (memory Round 3): a small header, the key bytes, the +/// optional TTL deadline, and the boxed-up collection value. 
One `Box` +/// behind the [`Entry::Coll`] arm. The key is stored INSIDE so the table's eq/hash +/// closures read it (no separate map key allocation). +#[derive(Debug, Clone)] +pub struct CollEntry { + /// The eviction rank (2-bit S3-FIFO counter; RESERVED, mirrors the old + /// `Header::eviction_rank`). Not load-bearing today (the policy owns the rank). + pub eviction_rank: u8, + /// The forkless-snapshot version stamp (#60; RESERVED, mirrors the old + /// `Header::snapshot_version`). + pub snapshot_version: u32, + /// The absolute TTL deadline, if any. + pub expire_at: Option, + /// The key bytes (stored here so the table eq/hash closures can read them). + pub key: Box<[u8]>, + /// The collection value. + pub value: CollVal, +} + +/// One key/value table entry as a SINGLE allocation behind a small slot (memory +/// Round 3, OBJECT_LAYOUT.md #111). The per-shard `hashbrown::HashTable` stores +/// `Entry` directly and derives the key from inside it. +/// +/// ## Slot size +/// +/// The enum is two arms, each one pointer wide (`Box<[u8]>` is a fat pointer, two +/// words; `Box` is one word), so `size_of::()` is 16 bytes on a +/// 64-bit target (the `Box<[u8]>` fat-pointer arm dominates; the discriminant folds +/// into the niche/padding). The hashbrown `HashTable` slot is therefore 16 +/// bytes plus the 1-byte control tag, versus the old `HashMap, KvObj>` +/// whose value alone (`KvObj`) was 64 bytes carried in an 80-byte slot, AND which +/// owned a separate key `Box`. This is the Round 3 win: a small slot, one allocation +/// per string key, and no key duplication. +#[derive(Debug, Clone)] +pub enum Entry { + /// A STRING-family value (int/embstr/raw): ONE blob holding the packed header, + /// the optional TTL, the key length, the key bytes, then the value bytes. 
For an + /// int-encoded value the value bytes are the CANONICAL DECIMAL bytes (so a read + /// borrows them directly and `OBJECT ENCODING` still reports `int` from the + /// header), realizing the int materialization with no per-read allocation. + Str(Box<[u8]>), + /// A COLLECTION value (list/hash/set/zset): the boxed [`CollEntry`]. + Coll(Box), +} + +impl Entry { + /// Assemble a [`Entry::Str`] blob from its parts (the single write site for the + /// blob layout). `value_bytes` are the bytes stored after the key: for int the + /// canonical decimal digits, for embstr/raw the string bytes. + fn build_str_blob( + data_type: DataType, + encoding: Encoding, + expire_at: Option, + key: &[u8], + value_bytes: &[u8], + ) -> Box<[u8]> { + let has_ttl = expire_at.is_some(); + let ttl_len = if has_ttl { blob::TTL_LEN } else { 0 }; + let total = blob::HEADER_LEN + ttl_len + blob::KEYLEN_LEN + key.len() + value_bytes.len(); + let mut buf = Vec::with_capacity(total); + // Header: type, encoding, flags. + buf.push(data_type_to_u8(data_type)); + buf.push(encoding_to_u8(encoding)); + buf.push(if has_ttl { blob::FLAG_HAS_TTL } else { 0 }); + // Optional TTL deadline (u64 LE). + if let Some(UnixMillis(deadline)) = expire_at { + buf.extend_from_slice(&deadline.to_le_bytes()); + } + // Key length (u32 LE) + key bytes. A key longer than u32::MAX is not + // representable; Redis keys are bounded far below this, and the command + // layer rejects oversize keys, so the cast is safe in practice. Saturate + // defensively rather than wrap. + let key_len = u32::try_from(key.len()).unwrap_or(u32::MAX); + buf.extend_from_slice(&key_len.to_le_bytes()); + buf.extend_from_slice(key); + // Value bytes (the rest of the blob). + buf.extend_from_slice(value_bytes); + buf.into_boxed_slice() + } + + /// Build a STRING entry from a key and an already-[`classify`]d value. 
+ #[must_use] + pub fn str_from_classified( + key: &[u8], + classified: Classified, + bytes: &[u8], + expire_at: Option, + ) -> Self { + match classified { + Classified::Int(n) => Entry::str_from_int(key, n, expire_at), + Classified::EmbStr => Entry::Str(Entry::build_str_blob( + DataType::String, + Encoding::EmbStr, + expire_at, + key, + bytes, + )), + Classified::Raw => Entry::Str(Entry::build_str_blob( + DataType::String, + Encoding::Raw, + expire_at, + key, + bytes, + )), + } + } + + /// Build a STRING entry from a key and raw value bytes, classifying the encoding. + #[must_use] + pub fn str_from_bytes(key: &[u8], bytes: &[u8], expire_at: Option) -> Self { + Entry::str_from_classified(key, classify(bytes), bytes, expire_at) + } + + /// Build an int-encoded STRING entry: the value bytes are the canonical decimal + /// digits (so the read path borrows them; `OBJECT ENCODING` reports `int`). + #[must_use] + pub fn str_from_int(key: &[u8], n: i64, expire_at: Option) -> Self { + let mut buf = [0u8; 20]; + let digits = format_i64(n, &mut buf); + Entry::Str(Entry::build_str_blob( + DataType::String, + Encoding::Int, + expire_at, + key, + digits, + )) + } + + /// Build a COLLECTION entry from a key and a [`CollVal`]. + #[must_use] + pub fn coll(key: &[u8], value: CollVal, expire_at: Option) -> Self { + Entry::Coll(Box::new(CollEntry { + eviction_rank: 0, + snapshot_version: 0, + expire_at, + key: key.to_vec().into_boxed_slice(), + value, + })) + } + + /// Build an entry from a key and an owned RMW write value ([`NewValueOwned`]). 
+ #[must_use] + pub fn from_new_owned(key: &[u8], value: NewValueOwned, expire_at: Option) -> Self { + match value { + NewValueOwned::Int(n) => Entry::str_from_int(key, n, expire_at), + NewValueOwned::Bytes(b) => Entry::str_from_bytes(key, &b, expire_at), + NewValueOwned::List(elems) => { + let mut list = ListVal::new(); + for e in &elems { + list.push_back(e); + } + Entry::coll(key, CollVal::List(list), expire_at) + } + NewValueOwned::Hash(pairs) => { + let mut hash = HashVal::new(); + for (f, v) in &pairs { + hash.set(f, v); + } + Entry::coll(key, CollVal::Hash(hash), expire_at) + } + NewValueOwned::Set(members) => { + Entry::coll(key, CollVal::Set(SetVal::from_members(&members)), expire_at) + } + NewValueOwned::ZSet(pairs) => { + Entry::coll(key, CollVal::ZSet(ZSetVal::from_pairs(&pairs)), expire_at) + } + } + } + + /// Build an `Entry` from a fully-formed [`KvObj`] (the `insert_object` / move + /// paths; the `KvObj` is the public builder/transfer type the tests construct). + #[must_use] + pub fn from_kvobj(obj: KvObj) -> Self { + let KvObj { + header, + key, + value, + expire_at, + } = obj; + match value { + ValueRepr::Int(n) => Entry::str_from_int(&key, n, expire_at), + // The embstr-vs-raw distinction lives in the HEADER encoding, not the + // variant, so honor `header.encoding` when laying down the blob. + ValueRepr::Inline(b) | ValueRepr::Raw(b) => Entry::Str(Entry::build_str_blob( + header.data_type, + header.encoding, + expire_at, + &key, + &b, + )), + ValueRepr::List(l) => Entry::coll(&key, CollVal::List(*l), expire_at), + ValueRepr::Hash(h) => Entry::coll(&key, CollVal::Hash(*h), expire_at), + ValueRepr::Set(s) => Entry::coll(&key, CollVal::Set(*s), expire_at), + ValueRepr::ZSet(z) => Entry::coll(&key, CollVal::ZSet(*z), expire_at), + } + } + + // -- STRING blob parsing (SAFE slicing only) -- + + /// The flags byte of a Str blob. 
+ fn str_flags(blob: &[u8]) -> u8 { + blob.get(2).copied().unwrap_or(0) + } + + /// The byte offset where the key-length field begins (after the header and the + /// optional TTL). + fn str_keylen_offset(blob: &[u8]) -> usize { + if Entry::str_flags(blob) & blob::FLAG_HAS_TTL != 0 { + blob::HEADER_LEN + blob::TTL_LEN + } else { + blob::HEADER_LEN + } + } + + /// The key length stored in a Str blob. + fn str_key_len(blob: &[u8]) -> usize { + let off = Entry::str_keylen_offset(blob); + blob.get(off..off + blob::KEYLEN_LEN) + .and_then(|s| s.try_into().ok()) + .map_or(0, |a: [u8; 4]| u32::from_le_bytes(a) as usize) + } + + /// The byte offset where the key bytes begin in a Str blob. + fn str_key_offset(blob: &[u8]) -> usize { + Entry::str_keylen_offset(blob) + blob::KEYLEN_LEN + } + + /// The KEY bytes of this entry (used by the table's eq/hash closures and the + /// eviction/SCAN hooks). For a `Coll` this is the stored `CollEntry.key`. + #[must_use] + pub fn key(&self) -> &[u8] { + match self { + Entry::Str(blob) => { + let off = Entry::str_key_offset(blob); + let klen = Entry::str_key_len(blob); + blob.get(off..off + klen).unwrap_or(&[]) + } + Entry::Coll(c) => &c.key, + } + } + + /// The VALUE bytes of a STRING entry (decimal digits for int, the string bytes + /// for embstr/raw). Returns an empty slice for a collection entry (which has no + /// byte-readable value, matching the old `view_of` for a collection). + #[must_use] + pub fn str_value_bytes(&self) -> &[u8] { + match self { + Entry::Str(blob) => { + let val_off = Entry::str_key_offset(blob) + Entry::str_key_len(blob); + blob.get(val_off..).unwrap_or(&[]) + } + Entry::Coll(_) => &[], + } + } + + /// The logical data type (for TYPE / WRONGTYPE / the SCAN type filter). 
+ #[must_use] + pub fn data_type(&self) -> DataType { + match self { + Entry::Str(blob) => data_type_from_u8(blob.first().copied().unwrap_or(0)), + Entry::Coll(c) => c.value.data_type(), + } + } + + /// The internal encoding (for `OBJECT ENCODING`). For a Str entry it is read + /// straight from the header byte; for a Coll it is the live pure-function repr. + #[must_use] + pub fn encoding(&self) -> Encoding { + match self { + Entry::Str(blob) => encoding_from_u8(blob.get(1).copied().unwrap_or(1)), + Entry::Coll(c) => c.value.encoding(), + } + } + + /// The absolute TTL deadline, if any. + #[must_use] + pub fn expire_at(&self) -> Option { + match self { + Entry::Str(blob) => { + if Entry::str_flags(blob) & blob::FLAG_HAS_TTL != 0 { + blob.get(blob::HEADER_LEN..blob::HEADER_LEN + blob::TTL_LEN) + .and_then(|s| s.try_into().ok()) + .map(|a: [u8; 8]| UnixMillis(u64::from_le_bytes(a))) + } else { + None + } + } + Entry::Coll(c) => c.expire_at, + } + } + + /// Overwrite this entry's TTL deadline in place (EXPIRE/PERSIST/KEEPTTL on an + /// otherwise-untouched value). For a Str entry a TTL add/remove changes the blob + /// LENGTH (the 8-byte deadline field appears/disappears), so the blob is rebuilt; + /// a deadline-only change to an already-TTL'd blob is patched in place. For a + /// Coll entry it is a plain field write. + pub fn set_expire_at(&mut self, expire_at: Option) { + match self { + Entry::Str(blob) => { + let had_ttl = Entry::str_flags(blob) & blob::FLAG_HAS_TTL != 0; + match (had_ttl, expire_at) { + // Patch the existing TTL field in place (same blob length). + (true, Some(UnixMillis(deadline))) => { + let bytes = deadline.to_le_bytes(); + for (i, b) in bytes.iter().enumerate() { + blob[blob::HEADER_LEN + i] = *b; + } + } + // No change (no TTL before, none after). + (false, None) => {} + // Add or remove the TTL field: the blob length changes, so rebuild. 
+ _ => { + let data_type = self.data_type(); + let encoding = self.encoding(); + // Re-extract key + value from the OLD blob before rebuilding. + let key = self.key().to_vec(); + let value = self.str_value_bytes().to_vec(); + *self = Entry::Str(Entry::build_str_blob( + data_type, encoding, expire_at, &key, &value, + )); + } + } + } + Entry::Coll(c) => c.expire_at = expire_at, + } + } + + /// Whether this entry's TTL deadline has strictly passed at `now` (the lazy + /// backstop predicate; `now > deadline`, alive at `now == deadline`). + #[must_use] + pub fn is_expired(&self, now: UnixMillis) -> bool { + match self.expire_at() { + Some(deadline) => now > deadline, + None => false, + } + } + + /// The logical byte length of the value (STRLEN basis). + #[must_use] + pub fn logical_len(&self) -> usize { + match self { + Entry::Str(_) => self.str_value_bytes().len(), + Entry::Coll(c) => c.value.element_bytes(), + } + } + + /// The accounting weight: key bytes + value logical bytes. Identical model to the + /// old `KvObj::accounted_bytes` (the `used_memory` counter and the accounting + /// tests rely on this exact figure). + #[must_use] + pub fn accounted_bytes(&self) -> usize { + self.key().len() + self.logical_len() + } + + /// Whether this entry is a COLLECTION (list/hash/set/zset). + #[must_use] + pub fn is_collection(&self) -> bool { + matches!(self, Entry::Coll(_)) + } + + /// The element count IF this is a collection, else `None`. + #[must_use] + pub fn collection_len(&self) -> Option { + match self { + Entry::Coll(c) => Some(c.value.len()), + Entry::Str(_) => None, + } + } + + /// Whether this is a COLLECTION holding zero elements (the empty-collection + /// -deletes-key backstop check, by element count). + #[must_use] + pub fn is_empty_collection(&self) -> bool { + self.collection_len() == Some(0) + } + + /// Recompute and store the encoding for a COLLECTION entry from its current repr + /// (after an in-place edit a list may cross listpack->quicklist, etc.). 
A no-op + /// for a Str entry (its encoding is fixed at write time and patched by + /// `set_value_bytes`). Mirrors the old `KvObj::recompute_encoding`. + pub fn recompute_encoding(&mut self) { + if let Entry::Coll(_c) = self { + // The Coll encoding is a PURE FUNCTION of the live value (read via + // `c.value.encoding()`), so there is nothing cached to update: `encoding()` + // already reflects the post-edit repr. Kept as an explicit no-op so the + // store's call site reads the same as the old KvObj path. + } + } + + /// A mutable borrow of the stored collection value as a `&mut dyn ListValue`, or + /// `None` if this entry is not a list. + pub fn as_list_mut(&mut self) -> Option<&mut ListVal> { + match self { + Entry::Coll(c) => match &mut c.value { + CollVal::List(l) => Some(l), + _ => None, + }, + Entry::Str(_) => None, + } + } + + /// A mutable borrow of the stored HASH value, or `None`. + pub fn as_hash_mut(&mut self) -> Option<&mut HashVal> { + match self { + Entry::Coll(c) => match &mut c.value { + CollVal::Hash(h) => Some(h), + _ => None, + }, + Entry::Str(_) => None, + } + } + + /// A mutable borrow of the stored SET value, or `None`. + pub fn as_set_mut(&mut self) -> Option<&mut SetVal> { + match self { + Entry::Coll(c) => match &mut c.value { + CollVal::Set(s) => Some(s), + _ => None, + }, + Entry::Str(_) => None, + } + } + + /// A mutable borrow of the stored ZSET value, or `None`. + pub fn as_zset_mut(&mut self) -> Option<&mut ZSetVal> { + match self { + Entry::Coll(c) => match &mut c.value { + CollVal::ZSet(z) => Some(z), + _ => None, + }, + Entry::Str(_) => None, + } + } + + /// Re-key this entry to `new_key` (the RENAME/MOVE/COPY relocation; the value + /// object is preserved INTACT with its encoding + remaining TTL). For a Str entry + /// the key is INSIDE the blob, so the blob is rebuilt with the new key; for a Coll + /// entry it is a field write. 
+ pub fn rekey(&mut self, new_key: &[u8]) { + match self { + Entry::Str(_) => { + let data_type = self.data_type(); + let encoding = self.encoding(); + let expire_at = self.expire_at(); + let value = self.str_value_bytes().to_vec(); + *self = Entry::Str(Entry::build_str_blob( + data_type, encoding, expire_at, new_key, &value, + )); + } + Entry::Coll(c) => c.key = new_key.to_vec().into_boxed_slice(), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/ironcache-store/src/lib.rs b/crates/ironcache-store/src/lib.rs index 25feef3..cd6c593 100644 --- a/crates/ironcache-store/src/lib.rs +++ b/crates/ironcache-store/src/lib.rs @@ -32,16 +32,16 @@ pub mod encoding; pub mod kvobj; use bytes::Bytes; -use hashbrown::HashMap; -use hashbrown::HashSet; -use hashbrown::hash_map::Entry; +use hashbrown::hash_map::Entry as WatchMapEntry; +use hashbrown::{DefaultHashBuilder, HashMap, HashSet, HashTable}; use ironcache_eviction::{EvictionPolicy, Policy, map_policy_name}; use ironcache_storage::{ AccountingHook, CountingAccounting, DataType, EvictionHook, ExpireWrite, Keyspace, MoveMode, MoveOutcome, NewValue, NullEviction, OccupiedEntry, OccupiedEntryMut, RmwAction, RmwEntry, RmwStep, ScanCursor, Store, UnixMillis, ValueRef, WatchEntry, }; -use kvobj::{KvObj, int_decimal_bytes}; +use kvobj::{Entry, KvObj}; +use std::hash::BuildHasher; /// The FIXED-SEED stable key hash that the SCAN cursor iterates in ascending order /// (KEYSPACE.md "the full hash recomputed from the embedded key"). It is a small @@ -199,8 +199,21 @@ fn scan_plan<'a>( /// [`NullEviction`] and [`CountingAccounting`]. #[derive(Debug)] pub struct ShardStore { - /// One key->kvobj map per database. `dbs[db]` is the keyspace for `SELECT db`. - dbs: Vec, KvObj>>, + /// One per-database SwissTable, each storing a single-allocation [`Entry`] per key + /// (memory Round 3). `dbs[db]` is the keyspace for `SELECT db`. 
Unlike a + /// `HashMap, _>`, the low-level [`HashTable`] stores ONLY the entry and + /// derives the key from inside it ([`Entry::key`]), so there is no separate map key + /// allocation and no key duplication. Lookups hash the probe key with [`Self::hasher`] + /// and pass the hash + an eq closure to `find`/`find_entry`/`entry`. + dbs: Vec>, + /// The fixed per-store hasher used for EVERY key hash fed to the [`HashTable`] + /// explicit-hash API (the table stores no hasher of its own). One `RandomState` + /// instance shared across all dbs so a key hashes identically regardless of which db + /// it lands in; constructed once at boot. This is `hashbrown`'s default + /// (`foldhash`), the same hasher the prior `HashMap` used internally. It is NOT the + /// SCAN order hash ([`scan_hash`], a fixed-seed stable hash); this one only needs to + /// be a good table hash and may vary run-to-run. + hasher: DefaultHashBuilder, /// The eviction policy hook (no-op in PR-2a). eviction: E, /// The accounting hook (logical-byte counter in PR-2a). It is fed the same @@ -275,10 +288,11 @@ impl ShardStore { let n = databases.max(1) as usize; let mut dbs = Vec::with_capacity(n); for _ in 0..n { - dbs.push(HashMap::new()); + dbs.push(HashTable::new()); } ShardStore { dbs, + hasher: DefaultHashBuilder::default(), eviction, accounting, used: 0, @@ -322,11 +336,21 @@ impl ShardStore { /// signature and changes no observable command behavior, only the table's /// pre-allocated capacity. pub fn reserve(&mut self, db: u32, additional: usize) { - if let Some(map) = self.dbs.get_mut(db as usize) { - map.reserve(additional); + let hasher = self.hasher.clone(); + if let Some(table) = self.dbs.get_mut(db as usize) { + // The explicit-hash table's `reserve` needs a hasher closure to re-place + // entries on a grow: hash each entry's embedded key. 
+ table.reserve(additional, |e| hasher.hash_one(e.key())); } } + /// Hash a probe key with the store's fixed table hasher (the value fed to the + /// [`HashTable`] explicit-hash API). NOT the SCAN order hash. + #[inline] + fn key_hash(&self, key: &[u8]) -> u64 { + self.hasher.hash_one(key) + } + /// The WATCH write-funnel NOTIFY (TRANSACTIONS.md per-key dirty-CAS, PR-10b). Called /// from the store-internal write funnel ([`Self::put_object`], [`Self::remove_object`], /// [`Self::remove_object_crediting`]) so EVERY create/overwrite/delete/expiry of a @@ -403,13 +427,13 @@ impl ShardStore { /// waist method). #[must_use] pub fn len(&self) -> usize { - self.dbs.iter().map(HashMap::len).sum() + self.dbs.iter().map(HashTable::len).sum() } /// Whether the store holds no entries. #[must_use] pub fn is_empty(&self) -> bool { - self.dbs.iter().all(HashMap::is_empty) + self.dbs.iter().all(HashTable::is_empty) } /// The map index for the validated logical `db`. The command layer validates the @@ -442,10 +466,11 @@ impl ShardStore { /// single hash probe with the Entry API (or a get-once handle threaded to the /// caller) once the read path is restructured around it. No change now. fn expire_if_due(&mut self, db: u32, db_idx: usize, key: &[u8], now: UnixMillis) -> bool { + let h = self.key_hash(key); let due = self .dbs .get(db_idx) - .and_then(|m| m.get(key)) + .and_then(|t| t.find(h, |e| e.key() == key)) .is_some_and(|o| o.is_expired(now)); if due { // Route the removal through the WRITE FUNNEL (`remove_object`): it fires @@ -464,7 +489,9 @@ impl ShardStore { return false; } // Present-and-live iff it exists (it did not expire above). 
- self.dbs.get(db_idx).is_some_and(|m| m.contains_key(key)) + self.dbs + .get(db_idx) + .is_some_and(|t| t.find(h, |e| e.key() == key).is_some()) } /// Insert or replace `key`'s object, adjusting the accounting/eviction hooks for @@ -479,7 +506,7 @@ impl ShardStore { /// /// `db` is the validated logical DB id passed to the hooks; `db_idx` is the /// (possibly clamped) Vec index for the map (see [`Self::db_index`]). - fn put_object(&mut self, db: u32, db_idx: usize, key: &[u8], obj: KvObj) -> bool { + fn put_object(&mut self, db: u32, db_idx: usize, key: &[u8], obj: Entry) -> bool { // WATCH notify (PR-10b): a create or overwrite of a watched key bumps its // version (gated behind the watched_count fast path inside touch_watch). This // fires for a create on a watched-ABSENT key too (a watched-absent key now @@ -487,20 +514,24 @@ impl ShardStore { // bytes (any write touches the version, matching Redis). self.touch_watch(db, key); let new_bytes = obj.accounted_bytes(); - let boxed: Box<[u8]> = key.to_vec().into_boxed_slice(); + let h = self.key_hash(key); + let hasher = self.hasher.clone(); // Replace inside the entry scope, capturing any old weight, then update the - // hooks AFTER the table borrow ends (the hooks borrow `self` mutably). - let old_bytes = match self.dbs[db_idx].entry(boxed) { - Entry::Occupied(mut e) => { - let old = e.get().accounted_bytes(); - *e.get_mut() = obj; - Some(old) - } - Entry::Vacant(e) => { - e.insert(obj); - None - } - }; + // hooks AFTER the table borrow ends (the hooks borrow `self` mutably). The + // explicit-hash `entry` takes the probe hash, an eq closure (compare embedded + // keys), and a hasher closure (re-place entries on a grow). 
+ let old_bytes = + match self.dbs[db_idx].entry(h, |e| e.key() == key, |e| hasher.hash_one(e.key())) { + hashbrown::hash_table::Entry::Occupied(mut e) => { + let old = e.get().accounted_bytes(); + *e.get_mut() = obj; + Some(old) + } + hashbrown::hash_table::Entry::Vacant(e) => { + e.insert(obj); + None + } + }; if let Some(old) = old_bytes { self.account_sub(old); self.eviction.on_remove(db, key, old); @@ -527,8 +558,15 @@ impl ShardStore { // A watched-but-ABSENT key flush is handled in flush_db (it iterates the watch // slots), since remove_object only fires for a key that was actually resident. self.touch_watch(db, key); - if let Some(obj) = self.dbs[db_idx].remove(key) { - let bytes = obj.accounted_bytes(); + let h = self.key_hash(key); + let removed = match self.dbs[db_idx].find_entry(h, |e| e.key() == key) { + Ok(occ) => { + let (obj, _) = occ.remove(); + Some(obj.accounted_bytes()) + } + Err(_absent) => None, + }; + if let Some(bytes) = removed { self.account_sub(bytes); self.eviction.on_remove(db, key, bytes); true @@ -555,7 +593,15 @@ impl ShardStore { // touches a watched key's version (a collection drained to empty by an edit is a // modification, like any delete). self.touch_watch(db, key); - if self.dbs[db_idx].remove(key).is_some() { + let h = self.key_hash(key); + let existed = match self.dbs[db_idx].find_entry(h, |e| e.key() == key) { + Ok(occ) => { + occ.remove(); + true + } + Err(_absent) => false, + }; + if existed { self.account_sub(bytes); self.eviction.on_remove(db, key, bytes); true @@ -564,69 +610,47 @@ impl ShardStore { } } - /// Build the read-borrow view for an object. An int materializes its decimal - /// bytes (owned); a string borrows the stored buffer. - /// - /// FOLLOW-UP (#8/Efficient): the int branch allocates a fresh `Bytes` per read - /// via `int_decimal_bytes`. 
When the FAM object-layout work lands, format the - /// decimal digits into an inline/borrowable buffer carried by the view (or by - /// the object) so an int read does no per-read heap allocation. No change now. - fn view_of(obj: &KvObj) -> ValueRef<'_> { - match &obj.value { - kvobj::ValueRepr::Int(n) => { - ValueRef::from_int_bytes(obj.header.data_type, obj.expire_at, int_decimal_bytes(*n)) - } - // Embstr and raw both borrow their bytes the same way; the embstr-vs-raw - // distinction is carried by `obj.header.encoding`, not the variant. - kvobj::ValueRepr::Inline(b) | kvobj::ValueRepr::Raw(b) => { - ValueRef::borrowed(obj.header.data_type, obj.header.encoding, obj.expire_at, b) - } - // A LIST/HASH/SET is not byte-readable as a string: the command layer only - // reads its data_type / encoding from the view (GET checks String; OBJECT - // ENCODING reads the encoding, e.g. listpack/quicklist/hashtable/intset). The - // bytes are empty so a misrouted as_bytes() yields nothing rather than leaking - // a representation. - kvobj::ValueRepr::List(_) - | kvobj::ValueRepr::Hash(_) - | kvobj::ValueRepr::Set(_) - | kvobj::ValueRepr::ZSet(_) => ValueRef::borrowed( - obj.header.data_type, - obj.header.encoding, - obj.expire_at, - &[], - ), + /// Set the TTL deadline of the entry for `key` in table `db_idx` (the TTL-only + /// write path shared by the `rmw`/`rmw_mut` Keep/Mutated arms). For a Str entry an + /// add/remove of the deadline rebuilds the blob (the 8-byte field appears/ + /// disappears); a deadline-only change patches in place. A no-op if the key is gone. + fn set_entry_expire(&mut self, db_idx: usize, key: &[u8], deadline: Option) { + let h = self.key_hash(key); + if let Some(obj) = self.dbs[db_idx].find_mut(h, |e| e.key() == key) { + obj.set_expire_at(deadline); } } - /// Build the rmw observation handle for an object (same int-materialization as - /// [`Self::view_of`]). 
Returns the handle plus the int decimal `Bytes` keeper so - /// the borrow stays valid for the closure. - fn occupied_of(obj: &KvObj) -> OccupiedEntry<'_> { - match &obj.value { - kvobj::ValueRepr::Int(n) => OccupiedEntry::from_int_bytes( - obj.header.data_type, - obj.expire_at, - int_decimal_bytes(*n), - ), - // Embstr and raw both borrow their bytes the same way; the embstr-vs-raw - // distinction is carried by `obj.header.encoding`, not the variant. - kvobj::ValueRepr::Inline(b) | kvobj::ValueRepr::Raw(b) => { - OccupiedEntry::borrowed(obj.header.data_type, obj.header.encoding, obj.expire_at, b) - } - // A LIST/HASH/SET observed through the READ-ONLY rmw arm (e.g. a numeric RMW - // that hits a collection key) exposes empty bytes; the closure sees the - // collection data_type and returns WRONGTYPE. In-place collection edits use the - // MUTABLE arm (`rmw_mut` -> OccupiedEntryMut), not this read-only handle. - kvobj::ValueRepr::List(_) - | kvobj::ValueRepr::Hash(_) - | kvobj::ValueRepr::Set(_) - | kvobj::ValueRepr::ZSet(_) => OccupiedEntry::borrowed( - obj.header.data_type, - obj.header.encoding, - obj.expire_at, - &[], - ), - } + /// Build the read-borrow view for an entry. Memory Round 3: an int-encoded value + /// stores its CANONICAL DECIMAL bytes INLINE in the blob (encoding reported as + /// `int` from the header), so the view borrows them directly with NO per-read + /// allocation (the prior `int_decimal_bytes` allocation the FOLLOW-UP note flagged + /// is now eliminated). A string borrows its stored bytes; a collection borrows an + /// empty slice (only its data_type / encoding are read for GET / OBJECT ENCODING). + fn view_of(obj: &Entry) -> ValueRef<'_> { + // `str_value_bytes()` is the int decimal digits / embstr / raw bytes for a Str + // entry and an empty slice for a Coll entry (which is not byte-readable as a + // string: GET checks the String data_type; OBJECT ENCODING reads the encoding). 
+ ValueRef::borrowed( + obj.data_type(), + obj.encoding(), + obj.expire_at(), + obj.str_value_bytes(), + ) + } + + /// Build the rmw observation handle for an entry (same borrow rule as + /// [`Self::view_of`]: int decimal bytes are inline, so no per-read allocation). + /// A collection observed through the READ-ONLY rmw arm exposes empty bytes; the + /// closure sees the collection data_type and returns WRONGTYPE. In-place collection + /// edits use the MUTABLE arm (`rmw_mut` -> OccupiedEntryMut), not this handle. + fn occupied_of(obj: &Entry) -> OccupiedEntry<'_> { + OccupiedEntry::borrowed( + obj.data_type(), + obj.encoding(), + obj.expire_at(), + obj.str_value_bytes(), + ) } } @@ -641,7 +665,10 @@ impl Store for ShardStore { // eventual single-source migration (see the eviction crate docs) and is not // written on the access path, since nothing reads it today. self.eviction.on_access(db, key); - self.dbs[db_idx].get(key).map(Self::view_of) + let h = self.key_hash(key); + self.dbs[db_idx] + .find(h, |e| e.key() == key) + .map(Self::view_of) } fn upsert( @@ -657,14 +684,17 @@ impl Store for ShardStore { // its old deadline (for ExpireWrite::Keep). let existed = self.expire_if_due(db, db_idx, key, now); let old_deadline = if existed { - self.dbs[db_idx].get(key).and_then(|o| o.expire_at) + let h = self.key_hash(key); + self.dbs[db_idx] + .find(h, |e| e.key() == key) + .and_then(Entry::expire_at) } else { None }; let new_deadline = resolve_expire(expire, old_deadline); let obj = match value { - NewValue::Bytes(b) => KvObj::from_bytes(key, b, new_deadline), - NewValue::Int(n) => KvObj::from_int(key, n, new_deadline), + NewValue::Bytes(b) => Entry::str_from_bytes(key, b, new_deadline), + NewValue::Int(n) => Entry::str_from_int(key, n, new_deadline), }; self.put_object(db, db_idx, key, obj); existed @@ -695,7 +725,10 @@ impl Store for ShardStore { // `on_access`); the kvobj `eviction_rank` header field is RESERVED, not // written here (nothing reads it). 
See the eviction crate docs. self.eviction.on_access(db, key); - let obj = self.dbs[db_idx].get(key).expect("live entry present"); + let h = self.key_hash(key); + let obj = self.dbs[db_idx] + .find(h, |e| e.key() == key) + .expect("live entry present"); let entry = RmwEntry::Occupied(Self::occupied_of(obj)); f(entry) } else { @@ -704,7 +737,10 @@ impl Store for ShardStore { // The current (pre-write) deadline, for ExpireWrite::Keep/Unchanged. let old_deadline = if live { - self.dbs[db_idx].get(key).and_then(|o| o.expire_at) + let h = self.key_hash(key); + self.dbs[db_idx] + .find(h, |e| e.key() == key) + .and_then(Entry::expire_at) } else { None }; @@ -724,10 +760,7 @@ impl Store for ShardStore { // real-change branch: a no-op TTL write (bare GETEX, an EXPIRE that // does not move the deadline) keeps the key CLEAN, matching Redis. self.touch_watch(db, key); - if let Some(obj) = self.dbs[db_idx].get_mut(key) { - obj.expire_at = new_deadline; - obj.header.ttl_present = new_deadline.is_some(); - } + self.set_entry_expire(db_idx, key, new_deadline); } } } @@ -736,7 +769,7 @@ impl Store for ShardStore { ExpireWrite::Unchanged => old_deadline, other => resolve_expire(other, old_deadline), }; - let obj = KvObj::from_new_owned(key, v, new_deadline); + let obj = Entry::from_new_owned(key, v, new_deadline); self.put_object(db, db_idx, key, obj); } RmwAction::Delete => { @@ -754,10 +787,7 @@ impl Store for ShardStore { other => resolve_expire(other, old_deadline), }; if new_deadline != old_deadline { - if let Some(obj) = self.dbs[db_idx].get_mut(key) { - obj.expire_at = new_deadline; - obj.header.ttl_present = new_deadline.is_some(); - } + self.set_entry_expire(db_idx, key, new_deadline); } } } @@ -784,57 +814,55 @@ impl Store for ShardStore { // For the OccupiedMut path the store MEASURES the accounting delta itself (it // does not trust the handler): record the pre-edit weight, hand out a typed // mutable handle, run the closure, then measure the post-edit weight. 
+ let key_h = self.key_hash(key); let old_bytes = if live { self.eviction.on_access(db, key); - self.dbs[db_idx].get(key).map_or(0, KvObj::accounted_bytes) + self.dbs[db_idx] + .find(key_h, |e| e.key() == key) + .map_or(0, Entry::accounted_bytes) } else { 0 }; let step = if live { - let obj = self.dbs[db_idx].get_mut(key).expect("live entry present"); - // Read the REAL pre-edit metadata off the header BEFORE taking the typed - // mutable borrow (these are Copy scalars; `as_list_mut` then borrows the - // value mutably). The mutable handle carries the same type/encoding/TTL the - // read-only `occupied_of()` path exposes, so PR-6/7/8 can read accurate - // metadata off the mutable arm. The store still recomputes the POST-edit + let obj = self.dbs[db_idx] + .find_mut(key_h, |e| e.key() == key) + .expect("live entry present"); + // Read the REAL pre-edit metadata BEFORE taking the typed mutable borrow + // (these are Copy scalars; `as_*_mut` then borrows the value mutably). The + // mutable handle carries the same type/encoding/TTL the read-only + // `occupied_of()` path exposes. The store still recomputes the POST-edit // encoding after a `Mutated` return; this is the PRE-edit snapshot. - let data_type = obj.header.data_type; - let encoding = obj.header.encoding; - let expire_at = obj.expire_at; - // Build the typed mutable view: a list yields the list arm, a hash the hash - // arm, anything else the non-collection arm (the handler's `as_*_mut` then - // returns None -> WRONGTYPE). The borrow of `obj` lives only for the closure - // call. The collection arms are selected per repr; the empty-collection check - // after a Mutated return uses `KvObj::is_empty_collection`, which is defined - // over the SAME `collection_len` mapping (kvobj.rs), so the two sites cannot - // drift (the PR-5 review's consolidation; new collection types add an arm - // here AND a `collection_len` arm there in lockstep). 
+ let data_type = obj.data_type(); + let encoding = obj.encoding(); + let expire_at = obj.expire_at(); + // Build the typed mutable view from the entry's collection arm: a list yields + // the list arm, etc.; a Str entry yields the non-collection arm (the handler's + // `as_*_mut` then returns None -> WRONGTYPE). The empty-collection check after + // a Mutated return uses `Entry::is_empty_collection`, defined over the SAME + // `collection_len` mapping (kvobj.rs), so the two sites cannot drift. // - // The repr is matched ONCE (not via sequential `as_*_mut` borrows, which - // would each take and drop a fresh `&mut` and obscure the dispatch) so each + // The repr is matched ONCE (not via sequential `as_*_mut` borrows, which would + // each take and drop a fresh `&mut` and obscure the dispatch) so each // collection type maps to exactly one arm. - let entry = match &mut obj.value { - // The collection variants are boxed (memory Round 1); deref through the - // `Box` (`&mut **`) to the concrete `&mut *Val`, which then coerces to the - // `&mut dyn *Value` trait object the typed view constructors take. 
- kvobj::ValueRepr::List(l) => { - RmwEntry::OccupiedMut(OccupiedEntryMut::list(encoding, expire_at, &mut **l)) - } - kvobj::ValueRepr::Hash(h) => { - RmwEntry::OccupiedMut(OccupiedEntryMut::hash(encoding, expire_at, &mut **h)) - } - kvobj::ValueRepr::Set(s) => { - RmwEntry::OccupiedMut(OccupiedEntryMut::set(encoding, expire_at, &mut **s)) - } - kvobj::ValueRepr::ZSet(z) => { - RmwEntry::OccupiedMut(OccupiedEntryMut::zset(encoding, expire_at, &mut **z)) - } - kvobj::ValueRepr::Int(_) - | kvobj::ValueRepr::Inline(_) - | kvobj::ValueRepr::Raw(_) => RmwEntry::OccupiedMut( - OccupiedEntryMut::non_collection(data_type, encoding, expire_at), - ), + let entry = match obj { + Entry::Coll(c) => match &mut c.value { + kvobj::CollVal::List(l) => { + RmwEntry::OccupiedMut(OccupiedEntryMut::list(encoding, expire_at, l)) + } + kvobj::CollVal::Hash(h) => { + RmwEntry::OccupiedMut(OccupiedEntryMut::hash(encoding, expire_at, h)) + } + kvobj::CollVal::Set(s) => { + RmwEntry::OccupiedMut(OccupiedEntryMut::set(encoding, expire_at, s)) + } + kvobj::CollVal::ZSet(z) => { + RmwEntry::OccupiedMut(OccupiedEntryMut::zset(encoding, expire_at, z)) + } + }, + Entry::Str(_) => RmwEntry::OccupiedMut(OccupiedEntryMut::non_collection( + data_type, encoding, expire_at, + )), }; f(entry) } else { @@ -842,7 +870,9 @@ impl Store for ShardStore { }; let old_deadline = if live { - self.dbs[db_idx].get(key).and_then(|o| o.expire_at) + self.dbs[db_idx] + .find(key_h, |e| e.key() == key) + .and_then(Entry::expire_at) } else { None }; @@ -859,10 +889,7 @@ impl Store for ShardStore { // TTL change on a watched key is a write, scoped to the real-change // branch so a no-op TTL write stays clean (matches Redis). 
self.touch_watch(db, key); - if let Some(obj) = self.dbs[db_idx].get_mut(key) { - obj.expire_at = new_deadline; - obj.header.ttl_present = new_deadline.is_some(); - } + self.set_entry_expire(db_idx, key, new_deadline); } } } @@ -871,7 +898,7 @@ impl Store for ShardStore { ExpireWrite::Unchanged => old_deadline, other => resolve_expire(other, old_deadline), }; - let obj = KvObj::from_new_owned(key, v, new_deadline); + let obj = Entry::from_new_owned(key, v, new_deadline); self.put_object(db, db_idx, key, obj); } RmwAction::Delete => { @@ -894,8 +921,8 @@ impl Store for ShardStore { RmwAction::Mutated => { if live { let emptied = self.dbs[db_idx] - .get(key) - .is_some_and(KvObj::is_empty_collection); + .find(key_h, |e| e.key() == key) + .is_some_and(Entry::is_empty_collection); if emptied { // Same pre-edit-weight credit as the Delete arm: the edit // already shrank the in-memory object, so credit `old_bytes`. @@ -913,7 +940,9 @@ impl Store for ShardStore { // same-size in-place path, so the notify must fire here. (The emptied // branch above already notifies via remove_object_crediting.) self.touch_watch(db, key); - let new_bytes = self.dbs[db_idx].get(key).map_or(0, KvObj::accounted_bytes); + let new_bytes = self.dbs[db_idx] + .find(key_h, |e| e.key() == key) + .map_or(0, Entry::accounted_bytes); // Re-account the signed delta and re-fire the eviction sizing // so the policy's per-key byte estimate tracks the edit. 
if new_bytes != old_bytes { @@ -928,11 +957,10 @@ impl Store for ShardStore { ExpireWrite::Unchanged => old_deadline, other => resolve_expire(other, old_deadline), }; - if let Some(obj) = self.dbs[db_idx].get_mut(key) { + if let Some(obj) = self.dbs[db_idx].find_mut(key_h, |e| e.key() == key) { obj.recompute_encoding(); if new_deadline != old_deadline { - obj.expire_at = new_deadline; - obj.header.ttl_present = new_deadline.is_some(); + obj.set_expire_at(new_deadline); } } } @@ -952,7 +980,10 @@ impl Store for ShardStore { if !self.expire_if_due(db, db_idx, key, now) { return None; } - self.dbs[db_idx].get(key).map(|o| o.header.data_type) + let h = self.key_hash(key); + self.dbs[db_idx] + .find(h, |e| e.key() == key) + .map(Entry::data_type) } fn used_memory(&self) -> u64 { @@ -1062,10 +1093,12 @@ impl ironcache_storage::Admit for ShardSto let db_idx = self.db_index(db); // Inspect the candidate (immutable borrow), extract the state, then drop // the borrow before any mutating call (the hooks borrow self mut). - let (present, is_expired, lacks_ttl) = match self.dbs[db_idx].get(&*key) { - Some(obj) => (true, obj.is_expired(now), obj.expire_at.is_none()), - None => (false, false, true), - }; + let kh = self.key_hash(&key); + let (present, is_expired, lacks_ttl) = + match self.dbs[db_idx].find(kh, |e| e.key() == &key[..]) { + Some(obj) => (true, obj.is_expired(now), obj.expire_at().is_none()), + None => (false, false, true), + }; // A STALE victim (the policy offered a key the store no longer holds, e.g. // a Random roster entry the store did not actually delete on a prior skip): // prune it from the policy so it is not re-offered, then ask for the next. 
@@ -1157,13 +1190,17 @@ impl ironcache_storage::PolicySwap for ShardStore .dbs .iter() .enumerate() - .flat_map(|(db_idx, map)| { + .flat_map(|(db_idx, table)| { let db = db_idx as u32; - map.iter().filter_map(move |(key, obj)| { + table.iter().filter_map(move |obj| { if obj.is_expired(now) { None } else { - Some((db, key.clone(), obj.accounted_bytes())) + Some(( + db, + obj.key().to_vec().into_boxed_slice(), + obj.accounted_bytes(), + )) } }) }) @@ -1197,10 +1234,11 @@ impl ironcache_storage::ActiveExpiry for Sha /// count. fn reap_if_expired(&mut self, db: u32, key: &[u8], now: UnixMillis) -> bool { let db_idx = self.db_index(db); + let h = self.key_hash(key); let expired = self .dbs .get(db_idx) - .and_then(|m| m.get(key)) + .and_then(|t| t.find(h, |e| e.key() == key)) .is_some_and(|o| o.is_expired(now)); if !expired { return false; @@ -1238,19 +1276,23 @@ impl Keyspace for ShardStore { mut keep: impl FnMut(&[u8], DataType) -> bool, ) -> (ScanCursor, Vec>) { let db_idx = self.db_index(db); - let Some(map) = self.dbs.get(db_idx) else { + let Some(table) = self.dbs.get(db_idx) else { return (ScanCursor::START, Vec::new()); }; - if map.is_empty() { + if table.is_empty() { // Empty db -> complete immediately (cursor 0). return (ScanCursor::START, Vec::new()); } // The sorted (scan_hash, key_bytes) view. `scan_hash` is recomputed from the - // key bytes here, NOT read from the table's internal hasher, so the order is - // stable across calls and across a resize (KEYSPACE.md). Sorting by (hash, - // bytes) gives a total order even for equal-hash keys. - let mut order: Vec<(u64, &[u8])> = map.keys().map(|k| (scan_hash(k), k.as_ref())).collect(); + // key bytes (read out of each entry), NOT from the table's internal hasher, so + // the order is stable across calls and across a resize (KEYSPACE.md). Sorting by + // (hash, bytes) gives a total order even for equal-hash keys. Each `&[u8]` + // borrows the key INSIDE its entry (no separate key allocation). 
+ let mut order: Vec<(u64, &[u8])> = table + .iter() + .map(|e| (scan_hash(e.key()), e.key())) + .collect(); order.sort_unstable(); // Walk the sorted order, choosing which keys to EXAMINE this batch and what the @@ -1264,11 +1306,15 @@ impl Keyspace for ShardStore { // MATCH/TYPE `keep` filter BEFORE cloning the key into the result. let mut kept: Vec> = Vec::with_capacity(plan.examined.len()); for &key in &plan.examined { - if let Some(obj) = map.get(key) { + // Re-find the entry by its embedded key (the `order`/`plan.examined` + // slices borrow the keys inside the entries; `find` reaches the entry to + // read its metadata). `self.hasher` and `table` are disjoint fields, so + // both immutable borrows coexist. + if let Some(obj) = table.find(self.hasher.hash_one(key), |e| e.key() == key) { if obj.is_expired(now) { continue; } - if keep(key, obj.header.data_type) { + if keep(key, obj.data_type()) { kept.push(key.to_vec().into_boxed_slice()); } } @@ -1280,30 +1326,30 @@ impl Keyspace for ShardStore { let db_idx = self.db_index(db); // RAW table length (Redis does not active-expire on DBSIZE): the dict size, // including not-yet-reaped expired keys. No lazy backstop here. - self.dbs.get(db_idx).map_or(0, HashMap::len) + self.dbs.get(db_idx).map_or(0, HashTable::len) } fn random_key(&mut self, db: u32, pick: u64, now: UnixMillis) -> Option> { let db_idx = self.db_index(db); - let map = self.dbs.get(db_idx)?; - let n = map.len(); + let table = self.dbs.get(db_idx)?; + let n = table.len(); if n == 0 { return None; } // The caller drew `pick` from the Env RNG (ADR-0003: the store reads no RNG). // Map it to a starting index, then probe forward DETERMINISTICALLY in the // sorted scan order, skipping expired keys, so an expired key at the picked - // position does not yield `None` while live keys remain. 
- let mut order: Vec<&[u8]> = map.keys().map(std::convert::AsRef::as_ref).collect(); - order.sort_unstable_by(|a, b| scan_hash(a).cmp(&scan_hash(b)).then(a.cmp(b))); + // position does not yield `None` while live keys remain. The order carries the + // key + its live/expired flag so no re-lookup is needed. + let mut order: Vec<(&[u8], bool)> = + table.iter().map(|e| (e.key(), e.is_expired(now))).collect(); + order.sort_unstable_by(|a, b| scan_hash(a.0).cmp(&scan_hash(b.0)).then(a.0.cmp(b.0))); let start = (pick % n as u64) as usize; for off in 0..n { let idx = (start + off) % n; - let key = order[idx]; - if let Some(obj) = map.get(key) { - if !obj.is_expired(now) { - return Some(key.to_vec().into_boxed_slice()); - } + let (key, expired) = order[idx]; + if !expired { + return Some(key.to_vec().into_boxed_slice()); } } None @@ -1312,7 +1358,10 @@ impl Keyspace for ShardStore { fn flush_db(&mut self, db: u32) -> u64 { let db_idx = self.db_index(db); let keys: Vec> = match self.dbs.get(db_idx) { - Some(map) => map.keys().cloned().collect(), + Some(table) => table + .iter() + .map(|e| e.key().to_vec().into_boxed_slice()) + .collect(), None => return 0, }; let mut removed = 0u64; @@ -1375,11 +1424,14 @@ impl Keyspace for ShardStore { // Take the source object INTACT (preserving encoding + remaining TTL). Re-key it // to the destination key bytes; the value representation and `expire_at` are - // carried unchanged (KEYSPACE.md "moves the value object INTACT"). - let Some(mut obj) = self.dbs[src_idx].get(src).cloned() else { + // carried unchanged (KEYSPACE.md "moves the value object INTACT"). For a Str + // entry `rekey` rebuilds the blob with the new embedded key; for a Coll it is a + // field write. 
+ let src_h = self.key_hash(src); + let Some(mut obj) = self.dbs[src_idx].find(src_h, |e| e.key() == src).cloned() else { return MoveOutcome::NoSource; }; - obj.key = dst.to_vec().into_boxed_slice(); + obj.rekey(dst); // Write the destination through the funnel (fires insert hooks, accounts bytes; // a replaced live destination is credited inside put_object). @@ -1435,13 +1487,13 @@ impl ironcache_storage::Watch for ShardStore let present = self.expire_if_due(db, db_idx, key, now); let probe = (db, key.to_vec().into_boxed_slice()); let version = match self.watch_versions.entry(probe) { - Entry::Occupied(mut e) => { + WatchMapEntry::Occupied(mut e) => { let slot = e.get_mut(); slot.watchers += 1; self.watched_count += 1; slot.version } - Entry::Vacant(e) => { + WatchMapEntry::Vacant(e) => { let version = self.version_clock; e.insert(WatchSlot { version, @@ -1495,7 +1547,7 @@ impl ironcache_storage::Watch for ShardStore fn unwatch(&mut self, entries: &[WatchEntry]) { for entry in entries { let probe = (entry.db, entry.key.clone()); - if let Entry::Occupied(mut e) = self.watch_versions.entry(probe) { + if let WatchMapEntry::Occupied(mut e) = self.watch_versions.entry(probe) { let slot = e.get_mut(); slot.watchers = slot.watchers.saturating_sub(1); // Each entry corresponds to exactly one watcher increment from @@ -1558,7 +1610,11 @@ impl ShardStore { pub fn insert_object(&mut self, db: u32, obj: KvObj) { let db_idx = self.db_index(db); let key = obj.key.clone(); - self.put_object(db, db_idx, &key, obj); + // The public builder/transfer type is `KvObj` (tests construct it and set its + // fields directly); convert it to the single-allocation table `Entry` at the + // funnel boundary. 
+ let entry = Entry::from_kvobj(obj); + self.put_object(db, db_idx, &key, entry); } } @@ -1577,7 +1633,7 @@ fn resolve_expire(expire: ExpireWrite, old: Option) -> Option Bytes { - int_decimal_bytes(n) + kvobj::int_decimal_bytes(n) } // --------------------------------------------------------------------------- diff --git a/docs/bench/OPTIMIZATION_LOG.md b/docs/bench/OPTIMIZATION_LOG.md index 7ce774f..4bdf65e 100644 --- a/docs/bench/OPTIMIZATION_LOG.md +++ b/docs/bench/OPTIMIZATION_LOG.md @@ -89,7 +89,21 @@ stay boxed structs (not flat blobs). Scoped as Round 3 (the big one). Micro-twea (u64 TTL sentinel, inline short keys) are deliberately SKIPPED because the single-alloc rewrite subsumes them (no tunnel vision on soon-replaced changes). -### Round 3 (next, the big one): single-allocation blob entry - VALIDATED design +| 3 | L-FAM (v1): single-allocation blob `Entry` in `hashbrown::HashTable` (no key dup) | 3 allocs->1, slot 80->16, approach redis | bytes/key (128B) 386.85 -> **221.5** (gap 1.77x -> **1.01x, near parity**); 32B 291 -> **121** (2.88x -> 1.20x); memmodel table slack 125.8 -> 26.2, int 155.8 -> 57.8 | qps ~71k (within noise/budget of round 2's 78k; blob parse on access) | **KEPT (pending review)** - collapsed the 2.4x memory gap to ~parity; all 840 tests green, waist unchanged | + +### Round 3 detail +Per-db table is now `hashbrown::HashTable` (low-level explicit-hash API, +no key duplication) with `Entry = Str(Box<[u8]>)` single blob +`[type|enc|flags|ttl?|key_len|key|value]` or `Coll(Box)`; key+value+ttl +in ONE allocation for strings (3 allocs -> 1), slot 80 -> 16. Confined to +ironcache-store internals; the Store waist + ValueRef/RmwEntry/side-traits +unchanged; SCAN keeps the deterministic scan_hash cursor; TTL in the blob header. +SAFE (no unsafe; safe blob slicing). Near-parity with redis 8.8.0 on memory. 
+NEXT to CLEARLY win memory: shrink the slot 16->8 with a thin pointer (ThinVec/ThinArc) and cache +the hash in the header (also helps lookup). Separately, the throughput gap (still ~2x, +not yet proven by a clean run on macOS) needs hot-path work and a pinned-Linux run. + +### Round 3 (built; was: next, the big one): single-allocation blob entry - VALIDATED design Research (redis 8.2 kvobj, valkey 8.0/8.1, Dragonfly Dashtable, hashbrown, SwissTable/Dash/MemC3/F14 papers) confirms the lever and a SAFE Rust path: