//! Implementation of B tree pages for Sized types, i.e. types whose
//! representation has a size known at compile time (and the same as
//! `core::mem::size_of()`).
//!
//! The details of the implementation are as follows:
//!
//! - The page starts with a 16 bytes header of the following form
//!   (where all the fields are encoded in Little-Endian):
//!
//! ```
//! #[repr(C)]
//! pub struct Header {
//!     /// Offset to the first entry in the page, shifted 3 bits
//!     /// to the right to allow for the dirty bit (plus two
//!     /// extra bits, zero for now), as explained in the
//!     /// documentation of CowPage, at the root of this
//!     /// crate. This is 4096 for empty pages, and it is unused
//!     /// for leaves. Moreover, this field can't be increased:
//!     /// when it reaches the bottom, the page must be cloned.
//!     data: u16,
//!     /// Number of entries on the page.
//!     n: u16,
//!     /// CRC (if used).
//!     crc: u32,
//!     /// The 52 most significant bits are the left child of
//!     /// this page (0 for leaves), while the 12 LSBs represent
//!     /// the space this page would take when cloned from scratch,
//!     /// minus the header. The reason for this is that entries
//!     /// in internal nodes aren't really removed when deleted,
//!     /// they're only "unlinked" from the array of offsets (see
//!     /// below). Therefore, we must have a way to tell when a
//!     /// page can be "compacted" to reclaim space.
//!     left_page: u64,
//! }
//! ```
//!
//! - For leaves, the rest of the page has the same representation as
//!   an array of length `n`, of elements of type `Tuple<K, V>`:
//!
//! ```
//! #[repr(C)]
//! struct Tuple<K, V> {
//!     k: K,
//!     v: V,
//! }
//! ```
//!
//!   If the alignment of that structure is more than 16 bytes, we
//!   need to add some padding between the header and that array.
//!
//! - For internal nodes, the rest of the page starts with an array of
//!   length `n` of Little-Endian-encoded `u64`, where the 12 least
//!   significant bits of each `u64` are an offset to a `Tuple<K, V>` in
//!   the page, and the 52 other bits are an offset in the file to the
//!   right child of the entry.
//!
//!   Moreover, the offset represented by the 12 LSBs is after (or at)
//!   `header.data`. We say we can "allocate" in the page if the `data`
//!   field of the header is greater than or equal to the position of
//!   the last "offset", plus the size we want to allocate (note that
//!   since we allocate from the end of the page, `data` is always a
//!   multiple of the alignment of `Tuple<K, V>`).
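// A minimal sketch of the 52/12-bit packing described above. The
// constants are illustrative, not taken from the crate: the 12 LSBs
// index into the 4096-byte page, and the 52 MSBs point to the right
// child in the file (file offsets are page-aligned, so their 12 LSBs
// are always zero).
fn pack(child: u64, in_page: u16) -> u64 {
    debug_assert_eq!(child & 0xfff, 0); // page-aligned file offset
    debug_assert!(in_page < 4096);
    child | in_page as u64
}

fn unpack(entry: u64) -> (u64, u16) {
    (entry & !0xfff, (entry & 0xfff) as u16)
}

fn main() {
    // Child at file offset 0x2000, tuple at byte 40 of this page.
    let e = pack(0x2000, 40);
    assert_eq!(unpack(e), (0x2000, 40));
}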
mod alloc;
mod put;
mod alloc; // The "alloc" trait, to provide common methods for leaves and internal nodes.
mod put; // Inserting a new value onto a page (possibly splitting the page).
mod rebalance; // Rebalance two sibling pages to the left or to the right.

use super::page_unsized::{header, PageCursor};
// Cursors are quite straightforward. One non-trivial thing is
// that they represent both a position on a page and the interval
// from that position to the end of the page, as is apparent in
// their `split_at` method.
//
// Another thing to note is that -1 and `c.total` are valid
// positions for a cursor `c`. The reason for this is that
// position `-1` has a right child (which is the left child of the
// first element).
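// A sketch of the cursor shape these comments refer to, consistent
// with how `PageCursor` is used below (the real definition lives in
// `page_unsized`): `cur` ranges over -1 ..= total.
pub struct PageCursor {
    pub cur: isize,    // current position, -1 and `total` included
    pub total: usize,  // number of entries on the page
    pub is_leaf: bool, // leaves and internal nodes are read differently
}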
fn current<'a, T: LoadPage>(
    _txn: &T,
    page: crate::Page<'a>,
    c: &Self::Cursor,
) -> Option<(&'a K, &'a V, u64)> {
    if c.cur < 0 || c.cur >= c.total as isize {
        None
    } else if c.is_leaf {
        let f = core::mem::size_of::<Tuple<K, V>>();
        let off = {
            let al = core::mem::align_of::<Tuple<K, V>>();
            let hdr = (HDR + al - 1) & !(al - 1);
            hdr + c.cur as usize * f
        };
        unsafe {
            let kv = &*(page.data.as_ptr().add(off) as *const Tuple<K, V>);
            Some((&kv.k, &kv.v, 0))
        }
    } else {
        unsafe {
            let off =
                u64::from_le(*(page.data.as_ptr().add(HDR) as *const u64).add(c.cur as usize));
            let kv = &*(page.data.as_ptr().add((off as usize) & 0xfff) as *const Tuple<K, V>);
            Some((&kv.k, &kv.v, off & !0xfff))
        }
    }
fn split_at(c: &Self::Cursor) -> (Self::Cursor, Self::Cursor) {
    (
        PageCursor {
            cur: 0,
            total: c.cur.max(0) as usize,
            is_leaf: c.is_leaf,
        },
        *c,
    )
    }
}

// This is the first non-trivial method, let's explain it.
fn current<'a, T: LoadPage>(
    _txn: &T,
    page: crate::Page<'a>,
    c: &Self::Cursor,
) -> Option<(&'a K, &'a V, u64)> {
    // First, there's no current entry if the cursor is outside
    // the range of entries.
    if c.cur < 0 || c.cur >= c.total as isize {
        None
    } else if c.is_leaf {
        // Else, if this is a leaf, the elements are packed
        // together in a contiguous array.
        //
        // This means that the header may be followed by
        // padding. These are constants known at compile-time, by
        // the way, so `al` and `hdr` are optimised away by the
        // compiler.
        let al = core::mem::align_of::<Tuple<K, V>>();
        // The following is a way to compute the first multiple of
        // `al` after `HDR`, assuming `al` is a power of 2 (which
        // is always the case since we get it from `align_of`).
        let hdr = (HDR + al - 1) & !(al - 1);
        // The position of the `Tuple<K, V>` we're looking for is
        // `f * cur` bytes after the padded header:
        let f = core::mem::size_of::<Tuple<K, V>>();
        let kv = unsafe {
            &*(page.data.as_ptr().add(hdr + c.cur as usize * f) as *const Tuple<K, V>)
        };
        Some((&kv.k, &kv.v, 0))
    } else {
        // Internal nodes have an extra level of indirection: we
        // first need to find `off`, the offset in the page, in
        // the initial array of offsets. Since these offsets are
        // `u64`, and the header is of size 16 bytes, the array is
        // already aligned.
        unsafe {
            let off =
                u64::from_le(*(page.data.as_ptr().add(HDR + 8 * c.cur as usize) as *const u64));
            // Check that we aren't reading outside of the page
            // (for example because of a malformed offset).
            assert!((off as usize & 0xfff) + core::mem::size_of::<Tuple<K, V>>() <= 4096);
            // Once we have the offset, cast its 12 LSBs to a
            // position in the page, and read the `Tuple<K, V>` at
            // that position.
            let kv = &*(page.data.as_ptr().add((off as usize) & 0xfff) as *const Tuple<K, V>);
            Some((&kv.k, &kv.v, off & !0xfff))
        }
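// A standalone check of the align-up trick used above:
// `(HDR + al - 1) & !(al - 1)` is the first multiple of `al` at or
// after `HDR`, provided `al` is a power of two. The numbers below are
// illustrative, not taken from the crate.
fn align_up(offset: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    (offset + align - 1) & !(align - 1)
}

fn main() {
    assert_eq!(align_up(16, 8), 16); // already aligned
    assert_eq!(align_up(16, 32), 32); // padded up to the next multiple
    assert_eq!(align_up(17, 8), 24);
}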
// The left and right child methods aren't really surprising. One
// thing to note is that cursors are always in positions between
// `-1` and `c.total` (bounds included), so we only have to check
// one side of the bound in the assertions.
//
// We also check, before entering the `unsafe` sections, that
// we're only reading data that is on a page.
// `lookup` has the same semantics as
// `core::slice::binary_search`, i.e. it returns either
// `Ok(n)`, where `n` is the position where `(k0, v0)` was
// found, or `Err(n)`, where `n` is the position where
// `(k0, v0)` can be inserted to preserve the order.
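// The same contract, demonstrated on a plain slice with
// `core::slice::binary_search`: `Ok(n)` is the position of a match,
// `Err(n)` the insertion point that preserves the order.
fn main() {
    let s = [1, 3, 5, 7];
    assert_eq!(s.binary_search(&5), Ok(2));
    assert_eq!(s.binary_search(&4), Err(2)); // insert 4 at index 2
}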
let off = {
    let al = core::mem::align_of::<Tuple<K, V>>();
    let hdr_size = (HDR + al - 1) & !(al - 1);
    (0, (hdr_size + f * n) as u16)
};
Ok(Leaf::kv(txn, page, off))
let al = core::mem::align_of::<Tuple<K, V>>();
let hdr_size = (HDR + al - 1) & !(al - 1);
let tup = &*(page.data.as_ptr().add(hdr_size + f * n) as *const Tuple<K, V>);
Ok((&tup.k, &tup.v, 0))
// When deleting from internal nodes, we take a replacement from
// one of the leaves (in our current implementation, the leftmost
// entry of the right subtree). This method copies an entry from
// the leaf onto the program stack, which is necessary since
// deletions in leaves overwrite entries.
//
// Another design choice would have been to do the same as for the
// unsized implementation, but in this case this would have meant
// copying the saved value to the end of the leaf, potentially
// preventing merges, and not even saving a memory copy.
let mut n = 0;
clone::<K, V, Internal>(page.as_page(), &mut new, s, &mut n);
clone::<K, V, Internal>(page.as_page(), &mut new, s, &mut 0);
// Mark the former version of the page as free.
unsafe {
    let f = core::mem::size_of::<Tuple<K, V>>();
    if c.is_leaf {
        let n = hdr.n() as usize;
let f = core::mem::size_of::<Tuple<K, V>>();
if c.is_leaf {
    // In leaves, we need to move the n - c - 1 elements
    // that are strictly after the cursor, by `f` (the
    // size of an entry).
    //
    // Here's the reasoning to avoid off-by-one errors: if
    // `c` is 0 (i.e. we're deleting the first element on
    // the page), we remove one entry, so there are n - 1
    // remaining entries.
    let n = hdr.n() as usize;
    let hdr_size = {
        // As usual, header + padding
let hdr_size = (HDR + al - 1) & !(al - 1);
let off = c.cur as usize * f;
let kv_ptr = p.add(hdr_size + off);
core::ptr::copy(kv_ptr.add(f), kv_ptr, f * (n - c.cur as usize - 1));
hdr.decr(f);
} else {
        (HDR + al - 1) & !(al - 1)
    };
    let off = hdr_size + c.cur as usize * f;
    unsafe {
        core::ptr::copy(p.add(off + f), p.add(off), f * (n - c.cur as usize - 1));
    }
    // Removing `f` bytes from the page.
    hdr.decr(f);
} else {
    // Internal nodes are easier to deal with, as we just
    // have to move the offsets.
    unsafe {
unsafe {
    let off = (p.add(HDR) as *mut u64).offset(c.cur as isize - 1);
    *off = (l | (u64::from_le(*off) & 0xfff)).to_le();
if l > 0 {
    unsafe {
        let off = (p.add(HDR) as *mut u64).offset(c.cur as isize - 1);
        *off = (l | (u64::from_le(*off) & 0xfff)).to_le();
    }
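// The leaf-case shift above, rephrased in safe Rust on a Vec to make
// the off-by-one reasoning concrete: deleting index `c` moves the
// n - c - 1 following elements one slot to the left.
fn main() {
    let mut v = vec![10, 20, 30, 40];
    let c = 1;
    v.copy_within(c + 1.., c); // moves n - c - 1 = 2 elements
    v.pop();
    assert_eq!(v, vec![10, 30, 40]);
}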
unsafe {
    let mut new = txn.alloc_page()?;
    <Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
    if c.is_leaf {
        let s = Leaf::offset_slice::<T, K, V>(page.as_page());
        debug!("s = {:?}", s);
        let (s0, s1) = s.split_at(c.cur as usize);
        let (_, s1) = s1.split_at(1);
        debug!("leaf s0 {:?} s1 {:?}", s0, s1);
        let mut n = 0;
        clone::<K, V, Leaf>(page.as_page(), &mut new, s0, &mut n);
        clone::<K, V, Leaf>(page.as_page(), &mut new, s1, &mut n);
    } else {
        let s = Internal::offset_slice::<T, K, V>(page.as_page());
        debug!("s = {:?}", s);
        let (s0, s1) = s.split_at(c.cur as usize);
        let (_, s1) = s1.split_at(1);
        debug!("internal s0 {:?} s1 {:?}", s0, s1);
        let mut n = 0;
        clone::<K, V, Internal>(page.as_page(), &mut new, s0, &mut n);
        debug!("offset {:?}", n);
        if l > 0 {
            let off = (new.0.data.add(HDR) as *mut u64).offset(n - 1);
// Immutable pages need to be cloned. The strategy is the
// same in both cases: get an "offset slice", split it at
// the cursor, remove the first entry of the second half,
// and clone.
let mut new = txn.alloc_page()?;
<Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
if c.is_leaf {
    let s = Leaf::offset_slice::<T, K, V>(page.as_page());
    let (s0, s1) = s.split_at(c.cur as usize);
    let (_, s1) = s1.split_at(1);
    let mut n = 0;
    clone::<K, V, Leaf>(page.as_page(), &mut new, s0, &mut n);
    clone::<K, V, Leaf>(page.as_page(), &mut new, s1, &mut n);
} else {
    // Internal nodes are a bit trickier, since the left child
    // is not counted in the "offset slice", so we need to
    // clone it separately. Also, the `l` argument to this
    // function might be non-zero in this case.
    let s = Internal::offset_slice::<T, K, V>(page.as_page());
    let (s0, s1) = s.split_at(c.cur as usize);
    let (_, s1) = s1.split_at(1);
    // First, clone the left child of the page.
    let hdr = header(page.as_page());
    let left = hdr.left_page() & !0xfff;
    unsafe { *(new.0.data.add(HDR - 8) as *mut u64) = left.to_le() };
    // Then, clone the entries strictly before the cursor
    // (i.e. clone `s0`).
    let mut n = 0;
    clone::<K, V, Internal>(page.as_page(), &mut new, s0, &mut n);
    // If we need to update the left child of the deleted
    // item, do it.
    if l > 0 {
        unsafe {
            let off = new.0.data.offset(HDR as isize + (n - 1) * 8) as *mut u64;
let mut hdr_size = HDR;
let mid_size = if m.modified.c0.is_leaf {
    let f = core::mem::size_of::<Tuple<K, V>>();
// First evaluate the size of the middle element on a page.
let (hdr_size, mid_size) = if m.modified.c0.is_leaf {
let size = hdr_size + mod_size + mid_size + occupied;
if size <= PAGE_SIZE {
    // merge
    let mut new = txn.alloc_page()?;
    <Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
    let freed = {
        let b0 = if m.modified.page.is_dirty() { 1 } else { 0 };
        let b1 = if m.other.is_dirty() { 1 } else { 0 };
        [m.modified.page.offset | b0, m.other.offset | b1]
// One surprising observation here is that adding the sizes
// works. This is surprising because of alignment and
// padding. It works because we can split the sizes into an
// offset part (always 8 bytes) and a data part (of a constant
// alignment), and thus we never need any padding anywhere on
// the page.
if mod_size + mid_size + occupied <= PAGE_SIZE {
    // If the concatenation fits on a page, merge.
    let (page, freed) = if m.modified.c0.is_leaf {
        merge::<_, _, _, Leaf>(txn, m)?
    } else {
        merge::<_, _, _, Internal>(txn, m)?
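// A worked instance of that observation, with illustrative sizes (24
// bytes is a hypothetical size_of::<Tuple<K, V>>(), not a value from
// the crate): every entry costs exactly 8 bytes of offset plus the
// tuple size, so the occupied sizes of two pages and their middle
// entry can simply be added, with no padding between entries.
fn main() {
    let f = 24; // hypothetical size_of::<Tuple<K, V>>()
    let (left_entries, right_entries) = (40, 50);
    let mod_size = left_entries * (8 + f);
    let occupied = right_entries * (8 + f);
    let mid_size = 8 + f;
    // 1280 + 32 + 1600 = 2912 <= 4096: the concatenation fits.
    assert!(mod_size + mid_size + occupied <= 4096);
}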
if m.modified.c0.is_leaf {
    merge::<_, _, _, Leaf>(txn, &mut new, m)
} else {
    merge::<_, _, _, Internal>(txn, &mut new, m)
}
let rc = PageCursor::new(&m.other, 0);
let first_size = <Page<K, V>>::current_size(m.other.as_page(), &rc);
// If we can't rebalance, modify and return.
if mod_size >= PAGE_SIZE / 2 || hdr_size + occupied - first_size < PAGE_SIZE / 2 {
// If the modified page is large enough, or if the other page
// has just barely the minimum number of elements to be at
// least half-full, return.
//
// The situation where the other page isn't full enough might
// happen for example if elements occupy exactly 1/5th of a
// page + 1 byte, and the modified page has 2 of them after
// the deletion, while the other page has 3.
if mod_size >= PAGE_SIZE / 2 || hdr_size + occupied - mid_size < PAGE_SIZE / 2 {
unsafe fn cmp<T: LoadPage, K: Storable, V: Storable>(
/// An equivalent of `Ord::cmp` on `Tuple<K, V>`, but using
/// `Storable::compare` instead of `Ord::cmp` on `K` and `V`.
fn cmp<T: LoadPage, K: Storable, V: Storable>(
let mut a = 0;
let mut b = s.len();
for _ in 0..4 {
    let mid = (a + b) / 2;
    match cmp(txn, k0, v0, p, s[mid]) {
        Ordering::Less => a = mid,
        Ordering::Greater => b = mid,
        Ordering::Equal => b = mid + 1,
    }
}
let mut n = a;
for &off in &s[a..b] {
    match cmp(txn, k0, v0, p, off) {
        Ordering::Less => n += 1,
for (n, off) in s.iter().enumerate() {
    match cmp(txn, k0, v0, p, *off) {
        Ordering::Less => {}
let mut is_first = m.skip_first;
while let Some((k, v, r)) = <Page<K, V>>::next(txn, m.page.as_page(), &mut m.c1) {
    if is_first {
        is_first = false;
        continue;
    }
// Then, clone `m.c1`, possibly skipping the first element.
let mut c1 = m.c1.clone();
if m.skip_first {
    <Page<K, V>>::move_next(&mut c1);
}
while let Some((k, v, r)) = <Page<K, V>>::next(txn, m.page.as_page(), &mut c1) {
fn merge<T: LoadPage, K: Storable + core::fmt::Debug, V: Storable + core::fmt::Debug, L: Alloc>(
    txn: &T,
    new: &mut MutPage,
/// The very unsurprising `merge` function
fn merge<
    T: AllocPage + LoadPage,
    K: Storable + core::fmt::Debug,
    V: Storable + core::fmt::Debug,
    L: Alloc,
>(
    txn: &mut T,
) {
) -> Result<(MutPage, [u64; 2]), T::Error> {
    // This algorithm is probably a bit naive, especially for leaves,
    // where if the left page is mutable, we can copy the other page
    // onto it.
    //
    // Indeed, we first allocate a page, then clone both pages onto
    // it, in a different order depending on whether the modified page
    // is the left or the right child.
    let mut new = txn.alloc_page()?;
    <Page<K, V> as BTreeMutPage<K, V>>::init(&mut new);
alloc::<_, _, L>(new, m.mid.0, m.mid.1, 0, 0, &mut n);
modify::<_, _, _, L>(txn, new, &mut m.modified, &mut n);
alloc::<_, _, L>(&mut new, m.mid.0, m.mid.1, 0, 0, &mut n);
modify::<_, _, _, L>(txn, &mut new, &mut m.modified, &mut n);
/// Clone a slice of offsets onto a page. This essentially does what
/// it says. Note that the leftmost child of a page is not included in
/// the offset slices, so we don't have to handle it here.
///
/// This should really be in the `Alloc` trait, but Rust doesn't have
/// associated type constructors, so we can't have an associated type
/// that's sometimes a slice and sometimes a "Range".
let off_new = L::alloc(hdr, size, core::mem::align_of::<Tuple<K, V>>());
core::ptr::copy_nonoverlapping(ptr, new.0.data.offset(off_new as isize), size);
L::set_offset(new, *n, r, off_new);
let data = hdr.data() as u16;
let off_new = data - size as u16;
hdr.set_data(off_new);
hdr.set_n(hdr.n() + 1);
hdr.incr(8 + size);
// Copy the entry from the original page to its
// position on the new page.
core::ptr::copy_nonoverlapping(ptr, new.0.data.add(off_new as usize), size);
// Set the offset to this new entry in the offset
// array, along with the right child page.
let ptr = new.0.data.offset(HDR as isize + *n * 8) as *mut u64;
*ptr = (r | off_new as u64).to_le();
let (new_right, k, v) = if r == 0 && right_hdr.is_dirty() {
    // Rewrite the deleted element at the end of the page, so that
    // the pointer is still valid.
let (new_right, k, v) = if r == 0 && m.other_is_mutable && right_hdr.is_dirty() {
    // Since `r == 0`, we know this is a leaf. Therefore, we need
    // to rewrite the deleted element to the end of the page, so
    // that the pointer to the new middle entry is valid when this
    // function returns.
// If this isn't a leaf, or isn't mutable, use the standard
// deletion function, because we know this will leave the
// pointer to the current entry valid.
//
// We extend the pointer's lifetime here, because we know the
// current deletion (we only rebalance during deletions) won't
// touch this page anymore after this.
// Surprisingly, the `rebalance_right` function is simpler,
// since:
//
// - if we call it to rebalance two internal nodes, we're in the
// easy case of `rebalance_left`.
//
// - else, the middle element is the last one on the left page, and
// isn't erased by leaf deletions, because these just move entries
// to the left.
) -> Result<Put<'a, K, V>, T::Error> {
    let size = core::mem::size_of::<Tuple<K, V>>()
        + if k1v1.is_some() {
            core::mem::size_of::<Tuple<K, V>>()
        } else {
            0
        };
) -> Result<crate::btree::put::Put<'a, K, V>, T::Error> {
    let size = if k1v1.is_some() { 2 } else { 1 };
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool;
fn can_compact<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool;
fn alloc(hdr: &mut Header, size: usize, align: usize) -> u16;
/// Is there enough room to add one entry into this page (without cloning)?
fn can_alloc<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool;
// n = number of items to remove
/// If we clone the page, will there be enough room for one entry?
fn can_compact<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool;

/// Remove the leftmost `n` elements from the page. On leaves,
/// this moves the last truncated element to the end of the page
/// and returns a pointer to that element (this function is only
/// used in `crate::btree::put`, and the pointer becomes invalid
/// at the end of `crate::btree::put`).
///
/// Returning the last deleted element isn't required for internal
/// nodes, since the entries are still valid there.
fn set_offset(new: &mut MutPage, n: isize, r: u64, off: u16);
/// Set the right child of the `n`th entry to `r`. If `n == -1`,
/// this method can be used to set the leftmost child of a page.
) -> Offsets<'a, Self::Offset>;

fn kv<'a, T: LoadPage, K: Storable, V: Storable>(
    txn: &T,
    page: crate::Page,
    off: (u64, u16),
) -> (&'a K, &'a V, u64);
) -> Offsets<'a>;

/// First element of an offset slice. For the sake of code
/// simplicity, we return a `u64` even for leaves. In internal
/// nodes, the 52 MSBs encode a child page, and the 12 LSBs encode
/// an offset in the page.
fn first<'a, K, V>(off: &Offsets<'a>) -> u64;

/// The tuple and right child at offset `off`.
fn kv<'a, K: Storable, V: Storable>(page: crate::Page, off: u64) -> (&'a K, &'a V, u64);
impl<'a, A: Into<(u64, u16)> + Copy> Offsets<'a, A> {
impl<'a> Offsets<'a> {
    /// Split an offset slice, with the same semantics as
    /// `split_at` for slices, except that `mid` may be equal to
    /// the length, in which case we return `(self, &[])`.
pub fn first<T, K: Storable, V: Storable>(&self) -> (u64, u16) {
    match self {
        Offsets::Slice(s) => s[0].into(),
}

pub struct Leaf {}
pub struct Internal {}

impl Alloc for Leaf {
    // Checking if there's enough room is just bookkeeping.
    fn can_alloc<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
        let f = core::mem::size_of::<Tuple<K, V>>();
        let al = core::mem::align_of::<Tuple<K, V>>();
        let header_size = (HDR + al - 1) & !(al - 1);
        header_size + (hdr.n() as usize + n) * f <= PAGE_SIZE
    }

    /// There's no possible compaction on leaves, it's the same as allocating.
    fn can_compact<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
        Self::can_alloc::<K, V>(hdr, n)
    }

    fn set_right_child(_: &mut MutPage, _: isize, _: u64) {}

    fn offset_slice<'a, T: LoadPage, K: Storable, V: Storable>(
        page: crate::Page<'a>,
    ) -> Offsets<'a> {
        let hdr = header(page);
        Offsets::Range(0..(hdr.n() as usize))
    }

    /// This returns an offset on the page, so we need to compute that.
    fn first<'a, K, V>(off: &Offsets<'a>) -> u64 {
        match off {
            Offsets::Slice(_) => unreachable!(),
impl Alloc for Leaf {
    fn can_alloc<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
/// Here, we just move `total - *n` elements to the right, and
/// increase the bookkeeping values (occupied bytes and number of
/// elements).
fn alloc_insert<K: Storable, V: Storable>(new: &mut MutPage, n: &mut isize, _: u64) -> usize {
let al = core::mem::align_of::<Tuple<K, V>>();
assert_eq!(size % al, 0);
let header_size = (HDR + al - 1) & !(al - 1);
header_size + (hdr.n() as usize) * f + size <= PAGE_SIZE
}

fn can_compact<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
assert_eq!(size % al, 0);
let header_size = (HDR + al - 1) & !(al - 1);
debug!(
    "can_compact, leaf: {:?} {:?} {:?}",
    header_size,
    hdr.left_page() & 0xfff,
    size
);
header_size + ((hdr.left_page() & 0xfff) as usize) + size <= PAGE_SIZE
let hdr_size = (HDR + al - 1) & !(al - 1);
let hdr_n = header_mut(new).n();
unsafe {
    core::ptr::copy(
        new.0.data.add(hdr_size + (*n as usize) * f),
        new.0.data.add(hdr_size + (*n as usize) * f + f),
        (hdr_n as usize - (*n as usize)) * f,
    );
}
let hdr = header_mut(new);
hdr.set_n(hdr.n() + 1);
hdr.incr(f);
hdr_size + (*n as usize) * f
// Now, this is a bit trickier. This performs a rotation by 1
// without all the rotation machinery from the Rust core
// library.
//
// Indeed, since we're in a leaf, we are effectively erasing
// the split entry. There is a choice to be made here, between
// passing the entry by value or by reference. Because splits
// might cascade with the same middle entry, passing it by
// value may be significantly worse if the tree is deep, and
// is never significantly better (we'll end up copying this
// entry twice anyway).
fn alloc(hdr: &mut Header, size: usize, align: usize) -> u16 {
    assert_eq!(size % align, 0);
    hdr.set_n(hdr.n() + 1);
    hdr.incr(size);
    0
}

impl Alloc for Internal {
    fn can_alloc<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
        (HDR as usize) + (hdr.n() as usize) * 8 + n * (8 + core::mem::size_of::<Tuple<K, V>>())
            < hdr.data() as usize
    }

    fn can_compact<K: Storable, V: Storable>(hdr: &Header, n: usize) -> bool {
        (HDR as usize)
            + ((hdr.left_page() & 0xfff) as usize)
            + n * (8 + core::mem::size_of::<Tuple<K, V>>())
            <= PAGE_SIZE
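// A back-of-the-envelope check of the `can_alloc` arithmetic above,
// with illustrative numbers (16-byte header, 24-byte tuples, 4096-byte
// pages; none of these are taken from a real instantiation): in an
// internal node, each entry costs 8 bytes in the offset array, growing
// up from the header, plus the tuple size, growing down from
// `hdr.data()`. Allocation succeeds while the two regions don't meet.
fn main() {
    const HDR: usize = 16;
    const PAGE_SIZE: usize = 4096;
    let tuple_size = 24; // hypothetical size_of::<Tuple<K, V>>()
    let n_entries = 100;
    let data = PAGE_SIZE - n_entries * tuple_size; // data grows down: 1696
    let offsets_end = HDR + n_entries * 8; // offsets grow up: 816
    // `can_alloc` for one more entry:
    assert!(offsets_end + (8 + tuple_size) < data);
}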
fn alloc_insert<K: Storable, V: Storable>(new: &mut MutPage, n: &mut isize, _: u64) -> usize {
    let f = core::mem::size_of::<Tuple<K, V>>();
    let al = core::mem::align_of::<Tuple<K, V>>();
    let hdr_size = (HDR + al - 1) & !(al - 1);
    let hdr_n = header_mut(new).n();
/// Set the right-hand child in the offset array, preserving the
/// current offset.
fn set_right_child(page: &mut MutPage, n: isize, r: u64) {
core::ptr::copy(
    new.0.data.add(hdr_size + (*n as usize) * f),
    new.0.data.add(hdr_size + (*n as usize) * f + f),
    (hdr_n as usize - (*n as usize)) * f,
);
debug_assert_eq!(r & 0xfff, 0);
let ptr = page.0.data.offset(HDR as isize + n * 8) as *mut u64;
let off = u64::from_le(*ptr) & 0xfff;
*ptr = (r | off).to_le();
fn offset_slice<'a, T: LoadPage, K: Storable, V: Storable>(
    page: crate::Page<'a>,
) -> Offsets<'a, Self::Offset> {
    let hdr = header(page);
    Offsets::Range(0..(hdr.n() as usize))
fn first<'a, K, V>(off: &Offsets<'a>) -> u64 {
    match off {
        Offsets::Slice(s) => s[0],
        Offsets::Range(_) => unreachable!(),
    }
fn kv<'a, T: LoadPage, K: Storable, V: Storable>(
    _txn: &T,
    page: crate::Page,
    (_, off): (u64, u16),
) -> (&'a K, &'a V, u64) {
fn kv<'a, K: Storable, V: Storable>(page: crate::Page, off: u64) -> (&'a K, &'a V, u64) {
let tup = &*(page.data.as_ptr().add(off as usize) as *const Tuple<K, V>);
debug!(">>>>>>>> kv {:?} {:?}", off, tup);
(&tup.k, &tup.v, 0)
let tup = &*(page.data.as_ptr().add((off & 0xfff) as usize) as *const Tuple<K, V>);
(&tup.k, &tup.v, off & !0xfff)
impl Alloc for Internal {
    fn can_alloc<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
        debug!("can_alloc {:?}", hdr.data());
        (HDR as usize) + (hdr.n() as usize) * 8 + 8 + size < hdr.data() as usize
    }

    fn can_compact<K: Storable, V: Storable>(hdr: &Header, size: usize) -> bool {
        debug!(
            "can_compact, internal: {:?} {:?} {:?}",
            HDR,
            hdr.left_page() & 0xfff,
            size
        );
        (HDR as usize) + (hdr.n() as usize) * 8 + ((hdr.left_page() & 0xfff) as usize) + 8 + size
            <= PAGE_SIZE
    }
// Much simpler than in leaves, because we just need to move the
// offsets. There's a little bit of bookkeeping involved though.
let f = core::mem::size_of::<Tuple<K, V>>();
let size = ((8 + f) * (hdr_n as usize - n)) as u64;
// Remaining occupied size on the page (minus the header).
let size = (8 + core::mem::size_of::<Tuple<K, V>>()) * (hdr_n as usize - n);
debug!("truncate_left {:?} {:?}", hdr.left_page(), size);
hdr.set_occupied(size);
// Because the leftmost child has been copied from an entry
// containing an offset, its 12 LSBs are still encoding an
// offset on the page, whereas they should encode the occupied
// bytes on the page. Reset it.
hdr.set_occupied(size as u64);
fn alloc(hdr: &mut Header, size: usize, align: usize) -> u16 {
    assert_eq!(size % align, 0);
    let data = hdr.data();
    hdr.set_data(data - size as u16);
    hdr.set_n(hdr.n() + 1);
    hdr.incr(size);
    data - size as u16
}
let size = core::mem::size_of::<Tuple<K, V>>();
debug!("alloc internal {:?} {:?}", n, size);
let (hdr_n, off_new) = {
    let hdr = header_mut(new);
    (
        hdr.n(),
        Self::alloc(&mut *hdr, size, core::mem::align_of::<Tuple<K, V>>()),
    )
};
debug!("off_new = {:?}", off_new);
let hdr = header_mut(new);
// Move the `data` field one entry to the left.
let data = hdr.data() as u16;
let off_new = data - core::mem::size_of::<Tuple<K, V>>() as u16;
hdr.set_data(off_new);
// Increment the number of entries, add the newly occupied size.
hdr.set_n(hdr.n() + 1);
hdr.incr(8 + core::mem::size_of::<Tuple<K, V>>());
let hdr_n = hdr.n();
}

fn set_offset(new: &mut MutPage, n: isize, r: u64, off: u16) {
    unsafe {
        let ptr = new.0.data.offset(HDR as isize + n * 8) as *mut u64;
        *ptr = (r | off as u64).to_le();
    }
}

fn set_right_child(page: &mut MutPage, n: isize, r: u64) {
    unsafe {
        let ptr = page.0.data.offset(HDR as isize + n * 8) as *mut u64;
        let off = u64::from_le(*ptr) & 0xfff;
        *ptr = (r | off).to_le();
    }
type Offset = InternalOffset;

fn offset_slice<'a, T, K: Storable, V: Storable>(
    page: crate::Page<'a>,
) -> Offsets<'a, Self::Offset> {
    let hdr = header(page);
    unsafe {
        Offsets::Slice(core::slice::from_raw_parts(
            page.data.as_ptr().add(HDR) as *const InternalOffset,
            hdr.n() as usize,
        ))
    }
}

fn kv<'a, T: LoadPage, K: Storable, V: Storable>(
    _txn: &T,
    page: crate::Page,
    (r, off): (u64, u16),
) -> (&'a K, &'a V, u64) {
    unsafe {
        let tup = &*(page.data.as_ptr().add(off as usize) as *const Tuple<K, V>);
        (&tup.k, &tup.v, r)
    }
}
#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct LeafOffset(u16);

impl Into<(u64, u16)> for LeafOffset {
    fn into(self) -> (u64, u16) {
        (0, self.0)
    }
}

impl Into<usize> for LeafOffset {
    fn into(self) -> usize {
        self.0 as usize
    }
}

#[derive(Debug, Clone, Copy)]
#[repr(C)]
pub struct InternalOffset(u64);

impl Into<(u64, u16)> for InternalOffset {
    fn into(self) -> (u64, u16) {
        (self.0 & !0xfff, (self.0 & 0xfff) as u16)
    }
}

impl Into<usize> for InternalOffset {
    fn into(self) -> usize {
        self.0 as usize
    }
}
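// An illustration of the two offset encodings above (the values are
// made up): a `LeafOffset` is just a byte offset in the page, while an
// `InternalOffset` packs a child-page pointer in its 52 MSBs and an
// in-page offset in its 12 LSBs.
fn main() {
    let off: (u64, u16) = InternalOffset(0x5000 | 0x123).into();
    assert_eq!(off, (0x5000, 0x123));
    let off: (u64, u16) = LeafOffset(64).into();
    assert_eq!(off, (0, 64));
}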
//! An implementation of B trees.
//! An implementation of B trees. The core operations on B trees
//! (lookup, iterate, put and del) are generic in the actual
//! implementation of nodes, via the [`BTreePage`] and
//! [`BTreeMutPage`] traits. This allows for simpler code in the
//! high-level functions, as well as specialised, high-performance
//! implementations for the nodes.
//!
//! Two different implementations are supplied: one in [`page`] for
//! types with a size known at compile-time, which yields denser
//! leaves, and hence shallower trees (if the values are really using
//! the space). The other one, in [`page_unsized`], is for
//! dynamically-sized types, or types whose representation is dynamic,
//! for example enums that are `Sized` in Rust, but whose cases vary
//! widely in size.
#[derive(Debug)]
pub struct Ok {
    page: MutPage,
    freed: u64,
}

#[derive(Debug)]
pub enum Put<'a, K: ?Sized, V: ?Sized> {
    Ok(Ok),
    Split {
        split_key: &'a K,
        split_value: &'a V,
        left: MutPage,
        right: MutPage,
        freed: u64,
    },
}
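// A sketch of how callers react to a `Put` (the handler name is
// hypothetical; the real call sites live in `crate::btree::put`): a
// `Split` bubbles its middle entry up, to be inserted into the parent.
fn handle<K: ?Sized, V: ?Sized>(result: Put<K, V>) {
    match result {
        Put::Ok(_) => {
            // The entry fit on the page: replace the child page
            // with its new version in the parent, and free the
            // page recorded in `freed` (if non-zero).
        }
        Put::Split { .. } => {
            // The page split: insert (split_key, split_value) into
            // the parent, between the new left and right children.
        }
    }
}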
    m: Concat<'a, K, V, Self>,
) -> Result<Op<'a, T, K, V>, T::Error>;
}

/// Represents the result of a merge or rebalance, without touching
/// the parent of the merge/rebalance.
#[derive(Debug)]
pub enum Op<'a, T, K: ?Sized, V: ?Sized> {
    Merged {
        // New merged page.
        page: MutPage,
        // Pages freed by the merge (0 meaning "no page").
        freed: [u64; 2],
        marker: core::marker::PhantomData<T>,
    },
    Rebalanced {
        // New middle key.
        k: &'a K,
        // New middle value.
        v: &'a V,
        // Do `k` and `v` come from a page shared with another tree?
        incr_kv_rc: bool,
        // New left page.
        l: u64,
        // New right page.
        r: u64,
        // Pages freed by the rebalance (0 meaning "no page").
        freed: [u64; 2],
    },
    Put(Put<'a, K, V>),
}

/// Represents a page with modifications from a merge.
#[derive(Debug)]
pub struct ModifiedPage<'a, K: ?Sized, V: ?Sized, P: BTreePage<K, V>> {
    pub page: CowPage,
    // Whether the page is mutable with another tree.
    pub mutable: bool,
    // Start copying c0 (keep `page`'s left child).
    pub c0: P::Cursor,
    // If > 0, replace the right child of c0's last element with `l`.
    pub l: u64,
    // If > 0, replace the left child of c1's last element with `r`.
    pub r: u64,
    // Possibly insert a new binding.
    pub ins: Option<(&'a K, &'a V)>,
    // If a rebalance causes a split, we might need to insert an entry
    // after the replacement.
    pub ins2: Option<(&'a K, &'a V)>,
    // The first element of c1 is to be deleted, the others must be copied.
    pub c1: P::Cursor,
    // Whether to skip `c1`'s first element.
    pub skip_first: bool,
    m: del::Concat<'a, K, V, Self>,
) -> Result<del::Op<'a, T, K, V>, T::Error>;
/// Represents the concatenation of a modified page and one of its
/// siblings (left or right).
#[derive(Debug)]
pub struct Concat<'a, K: ?Sized, V: ?Sized, P: BTreePage<K, V>> {
    /// Modified page.
    pub modified: ModifiedPage<'a, K, V, P>,
    /// Middle element.
    pub mid: (&'a K, &'a V),
    /// Sibling of the modified page.
    pub other: CowPage,
    /// Is the modified field on the left or on the right of the
    /// concatenation?
    pub mod_is_left: bool,
    /// Is the other page owned by this tree? If not, it means `other`
    /// is shared with another tree, and hence we need to increase the
    /// reference count of entries coming from `other`.
    pub other_is_mutable: bool,
}
/// Represents the result of a merge or rebalance, without touching
/// the parent of the merge/rebalance.
#[derive(Debug)]
pub enum Op<'a, T, K: ?Sized, V: ?Sized> {
    Merged {
        // New merged page.
        page: MutPage,
        // Pages freed by the merge (0 meaning "no page").
        freed: [u64; 2],
        marker: core::marker::PhantomData<T>,
    },
    Rebalanced {
        // New middle key.
        k: &'a K,
        // New middle value.
        v: &'a V,
        // Do `k` and `v` come from a page shared with another tree?
        incr_kv_rc: bool,
        // New left page.
        l: u64,
        // New right page.
        r: u64,
        // Pages freed by the rebalance (0 meaning "no page").
        freed: [u64; 2],
    },
    Put(crate::btree::put::Put<'a, K, V>),
}

/// Represents a page with modifications from a merge.
#[derive(Debug)]
pub struct ModifiedPage<'a, K: ?Sized, V: ?Sized, P: BTreePage<K, V>> {
    pub page: CowPage,
    // Whether the page is mutable with another tree.
    pub mutable: bool,
    // Start copying c0 (keep `page`'s left child).
    pub c0: P::Cursor,
    // If > 0, replace the right child of c0's last element with `l`.
    pub l: u64,
    // If > 0, replace the left child of c1's last element with `r`.
    pub r: u64,
    // Possibly insert a new binding.
    pub ins: Option<(&'a K, &'a V)>,
    // If a rebalance causes a split, we might need to insert an entry
    // after the replacement.
    pub ins2: Option<(&'a K, &'a V)>,
    // The first element of c1 is to be deleted, the others must be copied.
    pub c1: P::Cursor,
    // Whether to skip `c1`'s first element.
    pub skip_first: bool,
}

/// Represents the concatenation of a modified page and one of its
/// siblings (left or right).
#[derive(Debug)]
pub struct Concat<'a, K: ?Sized, V: ?Sized, P: BTreePage<K, V>> {
    /// Modified page.
    pub modified: ModifiedPage<'a, K, V, P>,
    /// Middle element.
    pub mid: (&'a K, &'a V),
    /// Sibling of the modified page.
    pub other: CowPage,
    /// Is the modified field on the left or on the right of the
    /// concatenation?
    pub mod_is_left: bool,
    /// Is the other page owned by this tree? If not, it means `other`
    /// is shared with another tree, and hence we need to increase the
    /// reference count of entries coming from `other`.
    pub other_is_mutable: bool,
}
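// A sketch of how the level above consumes an `Op` (the handler name
// is hypothetical; the real call sites live in `crate::btree::del`):
fn handle<T, K: ?Sized, V: ?Sized>(op: Op<T, K, V>) {
    match op {
        Op::Merged { .. } => {
            // Both children were concatenated onto one page: delete
            // the middle entry from the parent, and free the pages
            // in `freed` (0 meaning "no page").
        }
        Op::Rebalanced { .. } => {
            // Entries moved between the two children: replace the
            // parent's middle entry with (k, v), whose children are
            // now `l` and `r`.
        }
        Op::Put(_) => {
            // The child was modified in place (and possibly split),
            // without touching its sibling.
        }
    }
}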
let mut left_page = P::right_child(cur.page.as_page(), &cur.cursor);
while left_page > 0 {
    if cursor.first_rc_len >= N_CURSORS && txn.rc(left_page)? >= 2 {
        cursor.first_rc_len = cursor.len()
    }
    let page = txn.load_page(left_page)?;
    let curs = P::cursor_first(&page);
    left_page = P::left_child(page.as_page(), &curs);
    cursor.push(PageCursor { page, cursor: curs });
}
let leaf_is_shared = cursor.len() >= cursor.first_rc_len;
let right_subtree = P::right_child(cur.page.as_page(), &cur.cursor);
cursor.set_first_leaf(txn, right_subtree)?;
let leaf_is_shared = cursor.len() >= cursor.first_rc_len();
pub first_rc_len: usize,
/// The length of the stack at the position of the first page
/// shared with another tree. This definition is a little bit
/// convoluted, but allows for efficient comparisons between
/// `first_rc_len` and `len` to check whether a page is shared
/// with another tree.
first_rc_len: usize,
/// Number of initialised items on the stack.
pub(super) fn set_first_leaf<T: LoadPage>(
    &mut self,
    txn: &T,
    mut left_page: u64,
) -> Result<(), T::Error> {
    while left_page > 0 {
        if self.first_rc_len >= N_CURSORS && txn.rc(left_page)? >= 2 {
            self.first_rc_len = self.len
        }
        let page = txn.load_page(left_page)?;
        let curs = P::cursor_first(&page);
        left_page = P::left_child(page.as_page(), &curs);
        self.push(PageCursor { page, cursor: curs });
    }
    Ok(())
}

/// Reset the cursor to the first element of the database.
pub fn reset<'a, T: LoadPage>(&mut self) {
    self.len = 1;
    let init = unsafe { &mut *self.stack[0].as_mut_ptr() };
    init.cursor = P::cursor_before(&init.page);
}

// An invariant of cursors, fundamental to understanding the `next`
// and `prev` functions below, is that the lower levels (in the
// tree, upper levels on the stack) are the left children of the
// cursor's position on a page.

/// Set the cursor to the first position larger than or equal to
/// the specified key and value.
debug!("{:?}", cursor);
if let Some((k, v)) = k {
    if let Ok((kk, vv, _)) = P::set_cursor(txn, current.page.as_page(), cursor, k, v) {
        if v.is_some() {
            return Ok(Some((kk, vv)));
        }
        last_match = Some((kk, vv));
        last_matching_page = self.len
    } else {
        debug!("not found on page {:?}", current.page)
if let Ok((kk, vv, _)) = P::set_cursor(txn, current.page.as_page(), cursor, k, v) {
    if v.is_some() {
        return Ok(Some((kk, vv)));
}

impl<
        'a,
        K: Storable + ?Sized + core::fmt::Debug,
        V: Storable + ?Sized + core::fmt::Debug,
        P: BTreePage<K, V> + 'a,
    > Cursor<K, V, P>
{
    // Cursor invariant: the lower levels (in the tree, upper levels
    // on the stack) are the left children of the cursor's position on
    // a page.
pub fn next<T: LoadPage>(&mut self, txn: &'a T) -> Result<Option<(&'a K, &'a V)>, T::Error> {
    debug!("=== next");
/// Return the current position of the cursor, and move the cursor
/// to the next position.
pub fn next<'a, T: LoadPage>(
    &mut self,
    txn: &'a T,
) -> Result<Option<(&'a K, &'a V)>, T::Error> {
pub fn prev<T: LoadPage>(&mut self, txn: &'a T) -> Result<Option<(&'a K, &'a V)>, T::Error> {
    debug!("======== prev");
/// Move the cursor to the previous entry, and return that
/// entry.
pub fn prev<'a, T: LoadPage>(
    &mut self,
    txn: &'a T,
) -> Result<Option<(&'a K, &'a V)>, T::Error> {