//! Implementation of B tree pages for Sized types, i.e. types whose
//! representation has a size known at compile time (and the same as
//! `core::mem::size_of()`).
//!
//! The details of the implementation are as follows:
//!
//! - The page starts with a 16-byte header of the following form
//! (where all the fields are encoded in Little-Endian):
//!
//! ```
//! #[repr(C)]
//! pub struct Header {
//! /// Offset to the first entry in the page, shifted 3 bits
//! /// to the right to allow for the dirty bit (plus two
//! /// extra bits, zero for now), as explained in the
//! /// documentation of CowPage, at the root of this
//! /// crate. This is 4096 for empty pages, and it is unused
//! /// for leaves. Moreover, this field can't be increased:
//! /// when it reaches the bottom, the page must be cloned.
//! data: u16,
//! /// Number of entries on the page.
//! n: u16,
//! /// CRC (if used).
//! crc: u32,
//! /// The 52 most significant bits are the left child of
//! /// this page (0 for leaves), while the 12 LSBs represent
//! /// the space this page would take when cloned from scratch,
//! /// minus the header. The reason for this is that entries
//! /// in internal nodes aren't really removed when deleted,
//! /// they're only "unlinked" from the array of offsets (see
//! /// below). Therefore, we must have a way to tell when a
//! /// page can be "compacted" to reclaim space.
//! left_page: u64,
//! }
//! ```
//! - For leaves, the rest of the page has the same representation as
//! an array of length `n`, of elements of type `Tuple<K, V>`:
//! ```
//! #[repr(C)]
//! struct Tuple<K, V> {
//! k: K,
//! v: V,
//! }
//! ```
//! If the alignment of that structure is more than 16 bytes, we
//! need to add some padding between the header and that array.
//! - For internal nodes, the rest of the page starts with an array of
//! length `n` of Little-Endian-encoded `u64`, where the 12 least
//! significant bits of each `u64` are an offset to a `Tuple<K, V>` in
//! the page, and the 52 other bits are an offset in the file to the
//! right child of the entry.
//!
//! Moreover, the offset represented by the 12 LSBs is after (or at)
//! `header.data`.
//! We say we can "allocate" in the page if the `data` of the header
//! is greater than or equal to the position of the last "offset",
//! plus the size we want to allocate (note that since we allocate
//! from the end of the page, `data` is always a multiple of the
//! alignment of `Tuple<K, V>`).
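// A minimal sketch (not part of the original source) that makes the bit
// layout described above concrete: an internal-node entry word packs the
// in-page offset of its `Tuple<K, V>` into the 12 least significant bits,
// and the file offset of its right child into the 52 most significant
// bits. The helper name is hypothetical.
#[allow(dead_code)]
fn decode_entry_word(word: u64) -> (u64, usize) {
    let right_child = word & !0xfff; // 52 MSBs: file offset of the right child page.
    let tuple_offset = (word & 0xfff) as usize; // 12 LSBs: offset of the Tuple<K, V> within the page.
    (right_child, tuple_offset)
}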
mod alloc; // The "alloc" trait, to provide common methods for leaves and internal nodes.
mod put; // Inserting a new value onto a page (possibly splitting the page).
mod rebalance; // Rebalance two sibling pages to the left or to the right.
use super::page_unsized::{header, PageCursor};
// Cursors are quite straightforward. One non-trivial thing is
// that they represent both a position on a page and the interval
// from that position to the end of the page, as is apparent in
// their `split_at` method.
//
// Another thing to note is that -1 and `c.total` are valid
// positions for a cursor `c`. The reason for this is that
// position `-1` has a right child (which is the left child of the
// first element).
fn split_at(c: &Self::Cursor) -> (Self::Cursor, Self::Cursor) {
(
PageCursor {
cur: 0,
total: c.cur.max(0) as usize,
is_leaf: c.is_leaf,
},
*c,
)
}
}
// This is the first non-trivial method, so let's explain it.
fn current<'a, T: LoadPage>(
_txn: &T,
page: crate::Page<'a>,
c: &Self::Cursor,
) -> Option<(&'a K, &'a V, u64)> {
// First, there's no current entry if the cursor is outside
// the range of entries.
if c.cur < 0 || c.cur >= c.total as isize {
None
} else if c.is_leaf {
// Else, if this is a leaf, the elements are packed
// together in a contiguous array.
//
// This means that the header may be followed by
// padding. These are constants known at compile-time, by
// the way, so `al` and `hdr` are optimised away by the
// compiler.
let al = core::mem::align_of::<Tuple<K, V>>();
// The following is a way to compute the first multiple of
// `al` after `HDR`, assuming `al` is a power of 2 (which
// is always the case since we get it from `align_of`).
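// For example, with HDR = 16: al = 8 gives (16 + 7) & !7 = 16 (no
// padding), while al = 32 gives (16 + 31) & !31 = 32 (16 bytes of
// padding between the header and the first entry).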
let hdr = (HDR + al - 1) & !(al - 1);
// The position of the `Tuple<K, V>` we're looking for is
// `f * cur` bytes after the padded header:
let f = core::mem::size_of::<Tuple<K, V>>();
let kv = unsafe {
&*(page.data.as_ptr().add(hdr + c.cur as usize * f) as *const Tuple<K, V>)
};
Some((&kv.k, &kv.v, 0))
} else {
// Internal nodes have an extra level of indirection: we
// first need to find `off`, the offset in the page, in
// the initial array of offsets. Since these offsets are
// `u64`, and the header is of size 16 bytes, the array is
// already aligned.
unsafe {
let off =
u64::from_le(*(page.data.as_ptr().add(HDR + 8 * c.cur as usize) as *const u64));
// Check that we aren't reading outside of the page
// (for example because of a malformed offset).
assert!((off as usize & 0xfff) + core::mem::size_of::<Tuple<K, V>>() <= 4096);
// Once we have the offset, cast its 12 LSBs to a
// position in the page, and read the `Tuple<K, V>` at
// that position.
let kv = &*(page.data.as_ptr().add((off as usize) & 0xfff) as *const Tuple<K, V>);
Some((&kv.k, &kv.v, off & !0xfff))
}
// The left and right child methods aren't really surprising. One
// thing to note is that cursors are always in positions between
// `-1` and `c.total` (bounds included), so we only have to check
// one side of the bound in the assertions.
//
// We also check, before entering the `unsafe` sections, that
// we're only reading data that is on a page.
// `lookup` has the same semantics as
// `core::slice::binary_search`, i.e. it returns either
// `Ok(n)`, where `n` is the position where `(k0, v0)` was
// found, or `Err(n)` where `n` is the position where
// `(k0, v0)` can be inserted to preserve the order.
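// For illustration, the standard-library analogue behaves like this:
//     let s = [1, 3, 5];
//     assert_eq!(s.binary_search(&3), Ok(1));  // found at position 1
//     assert_eq!(s.binary_search(&4), Err(2)); // would be inserted at position 2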
let off = {
let al = core::mem::align_of::<Tuple<K, V>>();
let hdr_size = (HDR + al - 1) & !(al - 1);
(0, (hdr_size + f * n) as u16)
};
Ok(Leaf::kv(txn, page, off))
let al = core::mem::align_of::<Tuple<K, V>>();
let hdr_size = (HDR + al - 1) & !(al - 1);
let tup =
&*(page.data.as_ptr().add(hdr_size + f * n) as *const Tuple<K, V>);
Ok((&tup.k, &tup.v, 0))
// When deleting from internal nodes, we take a replacement from
// one of the leaves (in our current implementation, the leftmost
// entry of the right subtree). This method copies an entry from
// the leaf onto the program stack, which is necessary since
// deletions in leaves overwrite entries.
//
// Another design choice would have been to do the same as for the
// unsized implementation, but in this case this would have meant
// copying the saved value to the end of the leaf, potentially
// preventing merges, and not even saving a memory copy.
let mut n = 0;
clone::<K, V, Internal>(page.as_page(), &mut new, s, &mut n);
// Mark the former version of the page as free.
unsafe {
let f = core::mem::size_of::<Tuple<K, V>>();
if c.is_leaf {
let n = hdr.n() as usize;
let f = core::mem::size_of::<Tuple<K, V>>();
if c.is_leaf {
// In leaves, we need to move the n - c - 1 elements
// that are strictly after the cursor, by `f` (the
// size of an entry).
//
// Here's the reasoning to avoid off-by-one errors: if
// `c` is 0 (i.e. we're deleting the first element on
// the page), we remove one entry, so there are n - 1
// remaining entries.
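// For example, with n = 4 and c.cur = 1, the two entries at
// positions 2 and 3 (i.e. n - c - 1 = 2 of them) are shifted left
// by `f` bytes.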
let n = hdr.n() as usize;
let hdr_size = {
// As usual, header + padding
let hdr_size = (HDR + al - 1) & !(al - 1);
let off = c.cur as usize * f;
let kv_ptr = p.add(hdr_size + off);
core::ptr::copy(kv_ptr.add(f), kv_ptr, f * (n - c.cur as usize - 1));
hdr.decr(f);
} else {
(HDR + al - 1) & !(al - 1)
};
let off = hdr_size + c.cur as usize * f;
unsafe {
core::ptr::copy(p.add(off + f), p.add(off), f * (n - c.cur as usize - 1));
}
// Removing `f` bytes from the page.
hdr.decr(f);
} else {
// Internal nodes are easier to deal with, as we just
// have to move the offsets.
unsafe {
unsafe {
let off = (p.add(HDR) as *mut u64).offset(c.cur as isize - 1);
*off = (l | (u64::from_le(*off) & 0xfff)).to_le();
if l > 0 {
unsafe {
let off = (p.add(HDR) as *mut u64).offset(c.cur as isize - 1);
*off = (l | (u64::from_le(*off) & 0xfff)).to_le();
}
unsafe {
let mut new = txn.alloc_page()?;
<Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
if c.is_leaf {
let s = Leaf::offset_slice::<T, K, V>(page.as_page());
debug!("s = {:?}", s);
let (s0, s1) = s.split_at(c.cur as usize);
let (_, s1) = s1.split_at(1);
debug!("leaf s0 {:?} s1 {:?}", s0, s1);
let mut n = 0;
clone::<K, V, Leaf>(page.as_page(), &mut new, s0, &mut n);
clone::<K, V, Leaf>(page.as_page(), &mut new, s1, &mut n);
} else {
let s = Internal::offset_slice::<T, K, V>(page.as_page());
debug!("s = {:?}", s);
let (s0, s1) = s.split_at(c.cur as usize);
let (_, s1) = s1.split_at(1);
debug!("internal s0 {:?} s1 {:?}", s0, s1);
let mut n = 0;
clone::<K, V, Internal>(page.as_page(), &mut new, s0, &mut n);
debug!("offset {:?}", n);
if l > 0 {
let off = (new.0.data.add(HDR) as *mut u64).offset(n - 1);
// Immutable pages need to be cloned. The strategy is the
// same in both cases: get an "offset slice", split it at
// the cursor, remove the first entry of the second half,
// and clone.
let mut new = txn.alloc_page()?;
<Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
if c.is_leaf {
let s = Leaf::offset_slice::<T, K, V>(page.as_page());
let (s0, s1) = s.split_at(c.cur as usize);
let (_, s1) = s1.split_at(1);
let mut n = 0;
clone::<K, V, Leaf>(page.as_page(), &mut new, s0, &mut n);
clone::<K, V, Leaf>(page.as_page(), &mut new, s1, &mut n);
} else {
// Internal nodes are a bit trickier, since the left child
// is not counted in the "offset slice", so we need to
// clone it separately. Also, the `l` argument to this
// function might be non-zero in this case.
let s = Internal::offset_slice::<T, K, V>(page.as_page());
let (s0, s1) = s.split_at(c.cur as usize);
let (_, s1) = s1.split_at(1);
// First, clone the left child of the page.
let hdr = header(page.as_page());
let left = hdr.left_page() & !0xfff;
unsafe { *(new.0.data.add(HDR - 8) as *mut u64) = left.to_le() };
// Then, clone the entries strictly before the cursor
// (i.e. clone `s0`).
let mut n = 0;
clone::<K, V, Internal>(page.as_page(), &mut new, s0, &mut n);
// If we need to update the left child of the deleted
// item, do it.
if l > 0 {
unsafe {
let off = new.0.data.offset(HDR as isize + (n - 1) * 8) as *mut u64;
let mut hdr_size = HDR;
let mid_size = if m.modified.c0.is_leaf {
let f = core::mem::size_of::<Tuple<K, V>>();
// First evaluate the size of the middle element on a page.
let (hdr_size, mid_size) = if m.modified.c0.is_leaf {
let size = hdr_size + mod_size + mid_size + occupied;
if size <= PAGE_SIZE {
// merge
let mut new = txn.alloc_page()?;
<Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
let freed = {
let b0 = if m.modified.page.is_dirty() { 1 } else { 0 };
let b1 = if m.other.is_dirty() { 1 } else { 0 };
[m.modified.page.offset | b0, m.other.offset | b1]
// One surprising observation here is that adding the sizes
// works. This is surprising because of alignment and
// padding. It works because we can split the sizes into an
// offset part (always 8 bytes) and a data part (of a constant
// alignment), and thus we never need any padding anywhere on
// the page.
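// For instance (assuming K = V = u64), every internal-node entry accounts
// for exactly 8 + 16 = 24 bytes no matter where its `Tuple` lands on the
// page, so summing the sizes of the two halves and the middle entry is
// exact.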
if mod_size + mid_size + occupied <= PAGE_SIZE {
// If the concatenation fits on a page, merge.
let (page, freed) = if m.modified.c0.is_leaf {
merge::<_, _, _, Leaf>(txn, m)?
} else {
merge::<_, _, _, Internal>(txn, m)?
if m.modified.c0.is_leaf {
merge::<_, _, _, Leaf>(txn, &mut new, m)
} else {
merge::<_, _, _, Internal>(txn, &mut new, m)
}
let rc = PageCursor::new(&m.other, 0);
let first_size = <Page<K, V>>::current_size(m.other.as_page(), &rc);
// If we can't rebalance, modify and return.
if mod_size >= PAGE_SIZE / 2 || hdr_size + occupied - first_size < PAGE_SIZE / 2 {
// If the modified page is large enough, or if the other page
// has only just barely the minimum number of elements to be
// at least half-full, return.
//
// The situation where the other page isn't full enough might
// happen for example if elements occupy exactly 1/5th of a
// page + 1 byte, and the modified page has 2 of them after
// the deletion, while the other page has 3.
if mod_size >= PAGE_SIZE / 2 || hdr_size + occupied - mid_size < PAGE_SIZE / 2 {
unsafe fn cmp<T: LoadPage, K: Storable, V: Storable>(
/// An equivalent of `Ord::cmp` on `Tuple<K, V>` but using
/// `Storable::compare` instead of `Ord::cmp` on `K` and `V`.
fn cmp<T: LoadPage, K: Storable, V: Storable>(
let mut a = 0;
let mut b = s.len();
for _ in 0..4 {
let mid = (a + b) / 2;
match cmp(txn, k0, v0, p, s[mid]) {
Ordering::Less => a = mid,
Ordering::Greater => b = mid,
Ordering::Equal => b = mid + 1,
}
}
let mut n = a;
for &off in &s[a..b] {
match cmp(txn, k0, v0, p, off) {
Ordering::Less => n += 1,
for (n, off) in s.iter().enumerate() {
match cmp(txn, k0, v0, p, *off) {
Ordering::Less => {}
let mut is_first = m.skip_first;
while let Some((k, v, r)) = <Page<K, V>>::next(txn, m.page.as_page(), &mut m.c1) {
if is_first {
is_first = false;
continue;
}
// Then, clone `m.c1`, possibly skipping the first element.
let mut c1 = m.c1.clone();
if m.skip_first {
<Page<K, V>>::move_next(&mut c1);
}
while let Some((k, v, r)) = <Page<K, V>>::next(txn, m.page.as_page(), &mut c1) {
fn merge<T: LoadPage, K: Storable + core::fmt::Debug, V: Storable + core::fmt::Debug, L: Alloc>(
txn: &T,
new: &mut MutPage,
/// The very unsurprising `merge` function
fn merge<
T: AllocPage + LoadPage,
K: Storable + core::fmt::Debug,
V: Storable + core::fmt::Debug,
L: Alloc,
>(
txn: &mut T,
) {
) -> Result<(MutPage, [u64; 2]), T::Error> {
// This algorithm is probably a bit naive, especially for leaves,
// where if the left page is mutable, we can copy the other page
// onto it.
// Indeed, we first allocate a page, then clone both pages onto
// it, in a different order depending on whether the modified page
// is the left or the right child.
let mut new = txn.alloc_page()?;
<Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
alloc::<_, _, L>(new, m.mid.0, m.mid.1, 0, 0, &mut n);
modify::<_, _, _, L>(txn, new, &mut m.modified, &mut n);
alloc::<_, _, L>(&mut new, m.mid.0, m.mid.1, 0, 0, &mut n);
modify::<_, _, _, L>(txn, &mut new, &mut m.modified, &mut n);
/// Clone a slice of offsets onto a page. This essentially does what
/// it says. Note that the leftmost child of a page is not included in
/// the offset slices, so we don't have to handle it here.
///
/// This should really be in the `Alloc` trait, but Rust doesn't have
/// associated type constructors, so we can't have an associated type
/// that's sometimes a slice and sometimes a "Range".
let off_new = L::alloc(hdr, size, core::mem::align_of::<Tuple<K, V>>());
core::ptr::copy_nonoverlapping(ptr, new.0.data.offset(off_new as isize), size);
L::set_offset(new, *n, r, off_new);
let data = hdr.data() as u16;
let off_new = data - size as u16;
hdr.set_data(off_new);
hdr.set_n(hdr.n() + 1);
hdr.incr(8 + size);
// Copy the entry from the original page to its
// position on the new page.
core::ptr::copy_nonoverlapping(ptr, new.0.data.add(off_new as usize), size);
// Set the offset to this new entry in the offset
// array, along with the right child page.
let ptr = new.0.data.offset(HDR as isize + *n * 8) as *mut u64;
*ptr = (r | off_new as u64).to_le();
let (new_right, k, v) = if r == 0 && right_hdr.is_dirty() {
// Rewrite the deleted element at the end of the page, so that
// the pointer is still valid.
let (new_right, k, v) = if r == 0 && m.other_is_mutable && right_hdr.is_dirty() {
// Since `r == 0`, we know this is a leaf. Therefore, we need
// to rewrite the deleted element to the end of the page, so
// that the pointer to the new middle entry is valid when this
// function returns.
// If this isn't a leaf, or isn't mutable, use the standard
// deletion function, because we know this will leave the
// pointer to the current entry valid.
// We extend the pointer's lifetime here, because we know the
// current deletion (we only rebalance during deletions) won't
// touch this page anymore after this.
// Surprisingly, the `rebalance_right` function is simpler,
// since:
//
// - if we call it to rebalance two internal nodes, we're in the easy
// case of rebalance_left.
//
// - Else, the middle element is the last one on the left page, and
// isn't erased by leaf deletions, because these just move entries to
// the left.
) -> Result<Put<'a, K, V>, T::Error> {
let size = core::mem::size_of::<Tuple<K, V>>()
+ if k1v1.is_some() {
core::mem::size_of::<Tuple<K, V>>()
} else {
0
};
) -> Result<crate::btree::put::Put<'a, K, V>, T::Error> {
let size = if k1v1.is_some() { 2 } else { 1 };
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool;
fn can_compact<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool;
fn alloc(hdr: &mut Header, size: usize, align: usize) -> u16;
/// Is there enough room to add one entry into this page (without cloning)?
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool;
// n = number of items to remove
/// If we clone the page, will there be enough room for one entry?
fn can_compact<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool;
/// Remove the leftmost `n` elements from the page. On leaves,
/// this moves the last truncated element to the end of the page
/// and returns a pointer to that element (this function is only
/// used in `crate::btree::put`, and the pointer becomes invalid
/// at the end of `crate::btree::put`).
///
/// Returning the last deleted element isn't required for internal
/// nodes, since the entries are still valid there.
fn set_offset(new: &mut MutPage, n: isize, r: u64, off: u16);
/// Set the right child of the `n`th entry to `r`. If `n == -1`,
/// this method can be used to set the leftmost child of a page.
) -> Offsets<'a, Self::Offset>;
fn kv<'a, T: LoadPage, K: Storable, V: Storable>(
txn: &T,
page: crate::Page,
off: (u64, u16),
) -> (&'a K, &'a V, u64);
) -> Offsets<'a>;
/// First element of an offset slice. For the sake of code
/// simplicity, we return a `u64` even for leaves. In internal
/// nodes, the 52 MSBs encode a child page, and the 12 LSBs encode
/// an offset in the page.
fn first<'a, K, V>(off: &Offsets<'a>) -> u64;
/// The tuple and right child at offset `off`.
fn kv<'a, K: Storable, V: Storable>(page: crate::Page, off: u64) -> (&'a K, &'a V, u64);
impl<'a, A: Into<(u64, u16)> + Copy> Offsets<'a, A> {
impl<'a> Offsets<'a> {
/// Splitting an offset slice, with the same semantics as
/// `split_at` for slices, except if `mid` is equal to the length,
/// in which case we return `(self, &[])`.
pub fn first<T, K: Storable, V: Storable>(&self) -> (u64, u16) {
match self {
Offsets::Slice(s) => s[0].into(),
}
pub struct Leaf {}
pub struct Internal {}
impl Alloc for Leaf {
// Checking if there's enough room is just bookkeeping.
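// For example, assuming K = V = u64 (so f = 16 and al = 8): header_size
// is 16, and a 4096-byte leaf can therefore hold at most
// (4096 - 16) / 16 = 255 entries.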
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
let f = core::mem::size_of::<Tuple<K, V>>();
let al = core::mem::align_of::<Tuple<K, V>>();
let header_size = (HDR + al - 1) & !(al - 1);
header_size + (hdr.n() as usize + n) * f <= PAGE_SIZE
}
/// There's no possible compaction on leaves; it's the same as allocating.
fn can_compact<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
Self::can_alloc::<K, V>(hdr, n)
}
fn set_right_child(_: &mut MutPage, _: isize, _: u64) {}
fn offset_slice<'a, T: LoadPage, K: Storable, V: Storable>(
page: crate::Page<'a>,
) -> Offsets<'a> {
let hdr = header(page);
Offsets::Range(0..(hdr.n() as usize))
}
/// This returns an offset on the page, so we need to compute that.
fn first<'a, K, V>(off: &Offsets<'a>) -> u64 {
match off {
Offsets::Slice(_) => unreachable!(),
impl Alloc for Leaf {
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
/// Here, we just move `total - *n` elements to the right, and
/// increase the bookkeeping values (occupied bytes and number of
/// elements).
fn alloc_insert<K: Storable, V: Storable>(new: &mut MutPage, n: &mut isize, _: u64) -> usize {
let al = core::mem::align_of::<Tuple<K, V>>();
assert_eq!(size % al, 0);
let header_size = (HDR + al - 1) & !(al - 1);
header_size + (hdr.n() as usize) * f + size <= PAGE_SIZE
}
fn can_compact<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
assert_eq!(size % al, 0);
let header_size = (HDR + al - 1) & !(al - 1);
debug!(
"can_compact, leaf: {:?} {:?} {:?}",
header_size,
hdr.left_page() & 0xfff,
size
);
header_size + ((hdr.left_page() & 0xfff) as usize) + size <= PAGE_SIZE
let hdr_size = (HDR + al - 1) & !(al - 1);
let hdr_n = header_mut(new).n();
unsafe {
core::ptr::copy(
new.0.data.add(hdr_size + (*n as usize) * f),
new.0.data.add(hdr_size + (*n as usize) * f + f),
(hdr_n as usize - (*n as usize)) * f,
);
}
let hdr = header_mut(new);
hdr.set_n(hdr.n() + 1);
hdr.incr(f);
hdr_size + (*n as usize) * f
// Now, this is a bit trickier. This performs a rotation by 1
// without all the rotation machinery from the Rust core
// library.
//
// Indeed, since we're in a leaf, we are effectively erasing
// the split entry. There is a choice to be made here, between
// passing the entry by value or by reference. Because splits
// might cascade with the same middle entry, passing it by
// value may be significantly worse if the tree is deep, and
// is never significantly better (we'll end up copying this
// entry twice anyway).
fn alloc(hdr: &mut Header, size: usize, align: usize) -> u16 {
assert_eq!(size % align, 0);
hdr.set_n(hdr.n() + 1);
hdr.incr(size);
0
}
impl Alloc for Internal {
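// For example, assuming K = V = u64: each entry needs 8 bytes in the
// offset array plus 16 bytes for its `Tuple`, i.e. 24 bytes after the
// 16-byte header, so roughly (4096 - 16) / 24 = 170 entries fit on an
// internal node.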
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
(HDR as usize) + (hdr.n() as usize) * 8 + n * (8 + core::mem::size_of::<Tuple<K, V>>())
< hdr.data() as usize
}
fn can_compact<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
(HDR as usize)
+ ((hdr.left_page() & 0xfff) as usize)
+ n * (8 + core::mem::size_of::<Tuple<K, V>>())
<= PAGE_SIZE
fn alloc_insert<K: Storable, V: Storable>(new: &mut MutPage, n: &mut isize, _: u64) -> usize {
let f = core::mem::size_of::<Tuple<K, V>>();
let al = core::mem::align_of::<Tuple<K, V>>();
let hdr_size = (HDR + al - 1) & !(al - 1);
let hdr_n = header_mut(new).n();
/// Set the right-hand child in the offset array, preserving the
/// current offset.
fn set_right_child(page: &mut MutPage, n: isize, r: u64) {
core::ptr::copy(
new.0.data.add(hdr_size + (*n as usize) * f),
new.0.data.add(hdr_size + (*n as usize) * f + f),
(hdr_n as usize - (*n as usize)) * f,
);
debug_assert_eq!(r & 0xfff, 0);
let ptr = page.0.data.offset(HDR as isize + n * 8) as *mut u64;
let off = u64::from_le(*ptr) & 0xfff;
*ptr = (r | off as u64).to_le();
fn offset_slice<'a, T: LoadPage, K: Storable, V: Storable>(
page: crate::Page<'a>,
) -> Offsets<'a, Self::Offset> {
let hdr = header(page);
Offsets::Range(0..(hdr.n() as usize))
fn first<'a, K, V>(off: &Offsets<'a>) -> u64 {
match off {
Offsets::Slice(s) => s[0],
Offsets::Range(_) => unreachable!(),
}
fn kv<'a, T: LoadPage, K: Storable, V: Storable>(
_txn: &T,
page: crate::Page,
(_, off): (u64, u16),
) -> (&'a K, &'a V, u64) {
fn kv<'a, K: Storable, V: Storable>(page: crate::Page, off: u64) -> (&'a K, &'a V, u64) {
let tup = &*(page.data.as_ptr().add(off as usize) as *const Tuple<K, V>);
debug!(">>>>>>>> kv {:?} {:?}", off, tup);
(&tup.k, &tup.v, 0)
let tup = &*(page.data.as_ptr().add((off & 0xfff) as usize) as *const Tuple<K, V>);
(&tup.k, &tup.v, off & !0xfff)
impl Alloc for Internal {
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
debug!("can_alloc {:?}", hdr.data());
(HDR as usize) + (hdr.n() as usize) * 8 + 8 + size < hdr.data() as usize
}
fn can_compact<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
debug!(
"can_compact, internal: {:?} {:?} {:?}",
HDR,
hdr.left_page() & 0xfff,
size
);
(HDR as usize) + (hdr.n() as usize) * 8 + ((hdr.left_page() & 0xfff) as usize) + 8 + size
<= PAGE_SIZE
}
// Much simpler than in leaves, because we just need to move the
// offsets. There's a little bit of bookkeeping involved though.
let f = core::mem::size_of::<Tuple<K, V>>();
let size = { ((8 + f) * (hdr_n as usize - n)) as u64 };
// Remaining occupied size on the page (minus the header).
let size = (8 + core::mem::size_of::<Tuple<K, V>>()) * (hdr_n as usize - n);
debug!("truncate_left {:?} {:?}", hdr.left_page(), size);
hdr.set_occupied(size);
// Because the leftmost child has been copied from an entry
// containing an offset, its 12 LSBs are still encoding an
// offset in the page, whereas they should encode the occupied
// bytes on the page. Reset them.
hdr.set_occupied(size as u64);
fn alloc(hdr: &mut Header, size: usize, align: usize) -> u16 {
assert_eq!(size % align, 0);
let data = hdr.data();
hdr.set_data(data - size as u16);
hdr.set_n(hdr.n() + 1);
hdr.incr(size);
data - size as u16
}
let size = core::mem::size_of::<Tuple<K, V>>();
debug!("alloc internal {:?} {:?}", n, size);
let (hdr_n, off_new) = {
let hdr = header_mut(new);
(
hdr.n(),
Self::alloc(&mut *hdr, size, core::mem::align_of::<Tuple<K, V>>()),
)
};
debug!("off_new = {:?}", off_new);
let hdr = header_mut(new);
// Move the `data` field one entry to the left.
let data = hdr.data() as u16;
let off_new = data - core::mem::size_of::<Tuple<K, V>>() as u16;
hdr.set_data(off_new);
// Increment the number of entries, add the newly occupied size.
hdr.set_n(hdr.n() + 1);
hdr.incr(8 + core::mem::size_of::<Tuple<K, V>>());
let hdr_n = hdr.n();
}
fn set_offset(new: &mut MutPage, n: isize, r: u64, off: u16) {
unsafe {
let ptr = new.0.data.offset(HDR as isize + n * 8) as *mut u64;
*ptr = (r | off as u64).to_le();
}
}
fn set_right_child(page: &mut MutPage, n: isize, r: u64) {
unsafe {
let ptr = page.0.data.offset(HDR as isize + n * 8) as *mut u64;
let off = u64::from_le(*ptr) & 0xfff;
*ptr = (r | off as u64).to_le();
}
type Offset = InternalOffset;
fn offset_slice<'a, T, K: Storable, V: Storable>(
page: crate::Page<'a>,
) -> Offsets<'a, Self::Offset> {
let hdr = header(page);
unsafe {
Offsets::Slice(core::slice::from_raw_parts(
page.data.as_ptr().add(HDR) as *const InternalOffset,
hdr.n() as usize,
))
}
}
fn kv<'a, T: LoadPage, K: Storable, V: Storable>(
_txn: &T,
page: crate::Page,
(r, off): (u64, u16),
) -> (&'a K, &'a V, u64) {
unsafe {
let tup = &*(page.data.as_ptr().add(off as usize) as *const Tuple<K, V>);
(&tup.k, &tup.v, r)
}
}
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct LeafOffset(u16);
impl Into<(u64, u16)> for LeafOffset {
fn into(self) -> (u64, u16) {
(0, self.0)
}
}
impl Into<usize> for LeafOffset {
fn into(self) -> usize {
self.0 as usize
}
}
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct InternalOffset(u64);
impl Into<(u64, u16)> for InternalOffset {
fn into(self) -> (u64, u16) {
(self.0 & !0xfff, (self.0 & 0xfff) as u16)
}
}
impl Into<usize> for InternalOffset {
fn into(self) -> usize {
self.0 as usize
}
}
//! An implementation of B trees. The core operations on B trees
//! (lookup, iterate, put and del) are generic in the actual
//! implementation of nodes, via the [`BTreePage`] and
//! [`BTreeMutPage`] traits. This allows for simpler code in the
//! high-level functions, as well as specialised, high-performance
//! implementations for the nodes.
//!
//! Two different implementations are supplied: one in [`page`] for
//! types with a size known at compile-time, which yields denser
//! leaves, and hence shallower trees (if the values are really using
//! the space). The other one, in [`page_unsized`], is for
//! dynamic-sized types, or types whose representation is dynamic, for
//! example enums that are `Sized` in Rust, but whose cases vary
//! widely in size.
#[derive(Debug)]
pub struct Ok {
page: MutPage,
freed: u64,
}
#[derive(Debug)]
pub enum Put<'a, K: ?Sized, V: ?Sized> {
Ok(Ok),
Split {
split_key: &'a K,
split_value: &'a V,
left: MutPage,
right: MutPage,
freed: u64,
},
}
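// A hedged sketch (hypothetical caller, not from the original source) of
// how a `Put` result is consumed:
//
//     match result {
//         Put::Ok(Ok { page, freed }) => {
//             // The entry fit: replace the child with `page` and free
//             // `freed` if it is non-zero.
//         }
//         Put::Split { split_key, split_value, left, right, freed } => {
//             // The page split: insert (split_key, split_value) one level
//             // up, with `left` and `right` as its children.
//         }
//     }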
m: Concat<'a, K, V, Self>,
) -> Result<Op<'a, T, K, V>, T::Error>;
}
m: del::Concat<'a, K, V, Self>,
) -> Result<del::Op<'a, T, K, V>, T::Error>;
/// Represents the result of a merge or rebalance, without touching
/// the parent of the merge/rebalance.
#[derive(Debug)]
pub enum Op<'a, T, K: ?Sized, V: ?Sized> {
Merged {
// New merged page.
page: MutPage,
// Pages freed by the merge (0 meaning "no page").
freed: [u64; 2],
marker: core::marker::PhantomData<T>,
},
Rebalanced {
// New middle key.
k: &'a K,
// New middle value.
v: &'a V,
// Do `k` and `v` come from a page shared with another tree?
incr_kv_rc: bool,
// New left page.
l: u64,
// New right page.
r: u64,
// Pages freed by the rebalance (0 meaning "no page").
freed: [u64; 2],
},
Put(crate::btree::put::Put<'a, K, V>),
}
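// A hedged sketch (hypothetical caller, from the parent's point of view)
// of how an `Op` is handled after attempting a merge or rebalance on a
// child:
//
//     match op {
//         Op::Merged { page, freed, .. } => {
//             // A single child remains: point the parent at `page` and
//             // free the pages listed in `freed`.
//         }
//         Op::Rebalanced { k, v, l, r, freed, .. } => {
//             // Replace the middle entry with (k, v), the children with
//             // `l` and `r`, and free the pages listed in `freed`.
//         }
//         Op::Put(put) => {
//             // No merge or rebalance happened: the child was only
//             // modified (and possibly split), handled like an insertion.
//         }
//     }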
/// Represents a page with modifications from a merge.
#[derive(Debug)]
pub struct ModifiedPage<'a, K: ?Sized, V: ?Sized, P: BTreePage<K, V>> {
pub page: CowPage,
// Whether the page is mutable, i.e. not shared with another tree.
pub mutable: bool,
// Start copying c0 (keep `page`'s left child).
pub c0: P::Cursor,
// If > 0, replace the right child of c0's last element with `l`.
pub l: u64,
// If > 0, replace the left child of c1's last element with `r`.
pub r: u64,
// Possibly insert a new binding.
pub ins: Option<(&'a K, &'a V)>,
// If a rebalance causes a split, we might need to insert an entry
// after the replacement.
pub ins2: Option<(&'a K, &'a V)>,
// The first element of c1 is to be deleted, the others must be copied.
pub c1: P::Cursor,
// Whether to skip `c1`'s first element.
pub skip_first: bool,
}
/// Represents the concatenation of a modified page and one of its
/// siblings (left or right).
#[derive(Debug)]
pub struct Concat<'a, K: ?Sized, V: ?Sized, P: BTreePage<K, V>> {
/// Modified page.
pub modified: ModifiedPage<'a, K, V, P>,
/// Middle element.
pub mid: (&'a K, &'a V),
/// Sibling of the modified page.
pub other: CowPage,
/// Is the modified page on the left or on the right of the
/// concatenation?
pub mod_is_left: bool,
/// Is the other page owned by this tree? If not, it means `other`
/// is shared with another tree, and hence we need to increase the
/// reference count of entries coming from `other`.
pub other_is_mutable: bool,
}
let mut left_page = P::right_child(cur.page.as_page(), &cur.cursor);
while left_page > 0 {
if cursor.first_rc_len >= N_CURSORS && txn.rc(left_page)? >= 2 {
cursor.first_rc_len = cursor.len()
}
let page = txn.load_page(left_page)?;
let curs = P::cursor_first(&page);
left_page = P::left_child(page.as_page(), &curs);
cursor.push(PageCursor { page, cursor: curs });
}
let leaf_is_shared = cursor.len() >= cursor.first_rc_len;
let right_subtree = P::right_child(cur.page.as_page(), &cur.cursor);
cursor.set_first_leaf(txn, right_subtree)?;
let leaf_is_shared = cursor.len() >= cursor.first_rc_len();
pub first_rc_len: usize,
/// The length of the stack at the position of the first page
/// shared with another tree. This definition is a little bit
/// convoluted, but allows for efficient comparisons between
/// `first_rc_len` and `len` to check whether a page is shared
/// with another tree.
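/// For example, `self.len >= self.first_rc_len` means that the page
/// currently at the top of the stack is shared, which is how
/// `leaf_is_shared` is computed above.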
first_rc_len: usize,
/// Number of initialised items on the stack.
pub(super) fn set_first_leaf<T: LoadPage>(
&mut self,
txn: &T,
mut left_page: u64,
) -> Result<(), T::Error> {
while left_page > 0 {
if self.first_rc_len >= N_CURSORS && txn.rc(left_page)? >= 2 {
self.first_rc_len = self.len
}
let page = txn.load_page(left_page)?;
let curs = P::cursor_first(&page);
left_page = P::left_child(page.as_page(), &curs);
self.push(PageCursor { page, cursor: curs });
}
Ok(())
}
/// Reset the cursor to the first element of the database.
pub fn reset<'a, T: LoadPage>(&mut self) {
self.len = 1;
let init = unsafe { &mut *self.stack[0].as_mut_ptr() };
init.cursor = P::cursor_before(&init.page);
}
// An invariant of cursors, fundamental to understanding the `next`
// and `prev` functions below, is that the lower levels (in the
// tree, upper levels on the stack) are the left children of the
// cursor's position on a page.
/// Set the cursor to the first position larger than or equal to
/// the specified key and value.
debug!("{:?}", cursor);
if let Some((k, v)) = k {
if let Ok((kk, vv, _)) = P::set_cursor(txn, current.page.as_page(), cursor, k, v) {
if v.is_some() {
return Ok(Some((kk, vv)));
}
last_match = Some((kk, vv));
last_matching_page = self.len
} else {
debug!("not found on page {:?}", current.page)
impl<
'a,
K: Storable + ?Sized + core::fmt::Debug,
V: Storable + ?Sized + core::fmt::Debug,
P: BTreePage<K, V> + 'a,
> Cursor<K, V, P>
{
// Cursor invariant: the lower levels (in the tree, upper levels
// on the stack) are the left children of the cursor's position on
// a page.
pub fn next<T: LoadPage>(&mut self, txn: &'a T) -> Result<Option<(&'a K, &'a V)>, T::Error> {
debug!("=== next");
/// Return the current position of the cursor, and move the cursor
/// to the next position.
pub fn next<'a, T: LoadPage>(
&mut self,
txn: &'a T,
) -> Result<Option<(&'a K, &'a V)>, T::Error> {
pub fn prev<T: LoadPage>(&mut self, txn: &'a T) -> Result<Option<(&'a K, &'a V)>, T::Error> {
debug!("======== prev");
/// Move the cursor to the previous entry, and return that
/// entry.
pub fn prev<'a, T: LoadPage>(
&mut self,
txn: &'a T,
) -> Result<Option<(&'a K, &'a V)>, T::Error> {