2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
7 * @(#)db_page.h 10.11 (Sleepycat) 9/3/97
16 * This implementation requires that values within the following structures
17 * NOT be padded -- note, ANSI C permits random padding within structures.
18 * If your compiler pads randomly you can just forget ever making DB run on
19 * your system. In addition, no data type can require larger alignment than
20 * its own size, e.g., a 4-byte data element may not require 8-byte alignment.
22 * Note that key/data lengths are often stored in db_indx_t's -- this is
23 * not accidental, nor does it limit the key/data size. If the key/data
24 * item fits on a page, it's guaranteed to be small enough to fit into a
25 * db_indx_t, and storing it in one saves space.
28 #define PGNO_METADATA 0 /* Metadata page number. */
29 #define PGNO_INVALID 0 /* Metadata page number, therefore illegal. */
30 #define PGNO_ROOT 1 /* Root is page #1. */
32 /************************************************************************
33 BTREE METADATA PAGE LAYOUT
34 ************************************************************************/
37 * Btree metadata page layout:
39 * +-----------------------------------+
40 * | lsn | pgno | magic |
41 * +-----------------------------------+
42 * | version | pagesize | free |
43 * +-----------------------------------+
44 * | flags | unused ... |
45 * +-----------------------------------+
47 typedef struct _btmeta {
48 DB_LSN lsn; /* 00-07: LSN. */
49 db_pgno_t pgno; /* 08-11: Current page number. */
50 u_int32_t magic; /* 12-15: Magic number. */
51 u_int32_t version; /* 16-19: Version. */
52 u_int32_t pagesize; /* 20-23: Pagesize. */
53 u_int32_t maxkey; /* 24-27: Btree: Maxkey. */
54 u_int32_t minkey; /* 28-31: Btree: Minkey. */
55 u_int32_t free; /* 32-35: Free list page number. */
56 #define BTM_DUP 0x001 /* Duplicates. */
57 #define BTM_RECNO 0x002 /* Recno tree. */
58 #define BTM_RECNUM 0x004 /* Btree: maintain record count. */
59 #define BTM_FIXEDLEN 0x008 /* Recno: fixed length records. */
60 #define BTM_RENUMBER 0x010 /* Recno: renumber on insert/delete. */
61 #define BTM_MASK 0x01f
62 u_int32_t flags; /* 36-39: Flags. */
63 u_int32_t re_len; /* 40-43: Recno: fixed-length record length. */
64 u_int32_t re_pad; /* 44-47: Recno: fixed-length record pad. */
65 /* 48-67: Unique file ID. */
66 u_int8_t uid[DB_FILE_ID_LEN];
68 u_int32_t spare[13]; /* 68-123: Save some room for growth. */
70 DB_BTREE_LSTAT stat; /* 124-163: Statistics. */
73 /************************************************************************
74 HASH METADATA PAGE LAYOUT
75 ************************************************************************/
78 * Hash metadata page layout:
80 * +-----------------------------------+
81 * | lsn | magic | version |
82 * +-----------------------------------+
83 * | pagesize | ovfl_point| last_freed|
84 * +-----------------------------------+
85 * | max_bucket| high_mask | low_mask |
86 * +-----------------------------------+
87 * | ffactor | nelem | charkey |
88 * +-----------------------------------+
89 * | spares[32]| flags | unused |
90 * +-----------------------------------+
92 /* Hash Table Information */
93 typedef struct hashhdr { /* Disk resident portion */
94 DB_LSN lsn; /* 00-07: LSN of the header page */
95 db_pgno_t pgno; /* 08-11: Page number (btree compatibility). */
96 u_int32_t magic; /* 12-15: Magic NO for hash tables */
97 u_int32_t version; /* 16-19: Version ID */
98 u_int32_t pagesize; /* 20-23: Bucket/Page Size */
99 u_int32_t ovfl_point; /* 24-27: Overflow page allocation location */
100 u_int32_t last_freed; /* 28-31: Last freed overflow page pgno */
101 u_int32_t max_bucket; /* 32-35: ID of Maximum bucket in use */
102 u_int32_t high_mask; /* 36-39: Modulo mask into table */
103 u_int32_t low_mask; /* 40-43: Modulo mask into table lower half */
104 u_int32_t ffactor; /* 44-47: Fill factor */
105 u_int32_t nelem; /* 48-51: Number of keys in hash table */
106 u_int32_t h_charkey; /* 52-55: Value of hash(CHARKEY) */
107 #define DB_HASH_DUP 0x01
108 u_int32_t flags; /* 56-59: Allow duplicates. */
109 #define NCACHED 32 /* number of spare points */
110 /* 60-187: Spare pages for overflow */
111 u_int32_t spares[NCACHED];
112 /* 188-207: Unique file ID. */
113 u_int8_t uid[DB_FILE_ID_LEN];
116 * Minimum page size is 256.
120 /************************************************************************
122 ************************************************************************/
125 * +-----------------------------------+
126 * | lsn | pgno | prev pgno |
127 * +-----------------------------------+
128 * | next pgno | entries | hf offset |
129 * +-----------------------------------+
130 * | level | type | index |
131 * +-----------------------------------+
132 * | index | free --> |
133 * +-----------+-----------------------+
134 * | F R E E A R E A |
135 * +-----------------------------------+
136 * | <-- free | item |
137 * +-----------------------------------+
138 * | item | item | item |
139 * +-----------------------------------+
141 * sizeof(PAGE) == 26 bytes, and the following indices are guaranteed to be
144 * For hash and btree leaf pages, index items are paired, e.g., inp[0] is the
145 * key for inp[1]'s data. All other types of pages only contain single items.
147 typedef struct _db_page {
148 DB_LSN lsn; /* 00-07: Log sequence number. */
149 db_pgno_t pgno; /* 08-11: Current page number. */
150 db_pgno_t prev_pgno; /* 12-15: Previous page number. */
151 db_pgno_t next_pgno; /* 16-19: Next page number. */
152 db_indx_t entries; /* 20-21: Number of item pairs on the page. */
153 db_indx_t hf_offset; /* 22-23: High free byte page offset. */
156 * The btree levels are numbered from the leaf to the root, starting
157 * with 1, so the leaf is level 1, its parent is level 2, and so on.
158 * We maintain this level on all btree pages, but the only place that
159 * we actually need it is on the root page. It would not be difficult
160 * to hide the byte on the root page once it becomes an internal page,
161 * so we could get this byte back if we needed it for something else.
164 #define MAXBTREELEVEL 255
165 u_int8_t level; /* 24: Btree tree level. */
167 #define P_INVALID 0 /* Invalid page type. */
168 #define P_DUPLICATE 1 /* Duplicate. */
169 #define P_HASH 2 /* Hash. */
170 #define P_IBTREE 3 /* Btree internal. */
171 #define P_IRECNO 4 /* Recno internal. */
172 #define P_LBTREE 5 /* Btree leaf. */
173 #define P_LRECNO 6 /* Recno leaf. */
174 #define P_OVERFLOW 7 /* Overflow. */
175 u_int8_t type; /* 25: Page type. */
176 db_indx_t inp[1]; /* Variable length index of items. */
179 /* Element macros. */
180 #define LSN(p) (((PAGE *)p)->lsn)
181 #define PGNO(p) (((PAGE *)p)->pgno)
182 #define PREV_PGNO(p) (((PAGE *)p)->prev_pgno)
183 #define NEXT_PGNO(p) (((PAGE *)p)->next_pgno)
184 #define NUM_ENT(p) (((PAGE *)p)->entries)
185 #define HOFFSET(p) (((PAGE *)p)->hf_offset)
186 #define LEVEL(p) (((PAGE *)p)->level)
187 #define TYPE(p) (((PAGE *)p)->type)
191 * The next_pgno and prev_pgno fields are not maintained for btree and recno
192 * internal pages. It's a minor performance improvement, and more, it's
193 * hard to do when deleting internal pages, and it decreases the chance of
194 * deadlock during deletes and splits.
197 * The btree/recno access method needs db_recno_t bytes of space on the root
198 * page to specify how many records are stored in the tree. (The alternative
199 * is to store the number of records in the meta-data page, which will create
200 * a second hot spot in trees being actively modified, or recalculate it from
201 * the BINTERNAL fields on each access.) Overload the prev_pgno field.
204 (TYPE(p) == P_LBTREE ? NUM_ENT(p) / 2 : \
205 TYPE(p) == P_LRECNO ? NUM_ENT(p) : PREV_PGNO(p))
206 #define RE_NREC_ADJ(p, adj) \
208 #define RE_NREC_SET(p, num) \
215 * Don't modify the page's LSN, code depends on it being unchanged after a
218 #define P_INIT(pg, pg_size, n, pg_prev, pg_next, btl, pg_type) do { \
220 PREV_PGNO(pg) = pg_prev; \
221 NEXT_PGNO(pg) = pg_next; \
223 HOFFSET(pg) = pg_size; \
225 TYPE(pg) = pg_type; \
228 /* Page header length (offset to first index). */
229 #define P_OVERHEAD (SSZA(PAGE, inp))
231 /* First free byte. */
232 #define LOFFSET(pg) (P_OVERHEAD + NUM_ENT(pg) * sizeof(db_indx_t))
234 /* Free space on the page. */
235 #define P_FREESPACE(pg) (HOFFSET(pg) - LOFFSET(pg))
237 /* Get a pointer to the bytes at a specific index. */
238 #define P_ENTRY(pg, indx) ((u_int8_t *)pg + ((PAGE *)pg)->inp[indx])
240 /************************************************************************
242 ************************************************************************/
245 * Overflow items are referenced by HOFFPAGE and BOVERFLOW structures, which
246 * store a page number (the first page of the overflow item) and a length
247 * (the total length of the overflow item). The overflow item consists of
248 * some number of overflow pages, linked by the next_pgno field of the page.
249 * A next_pgno field of PGNO_INVALID flags the end of the overflow item.
251 * Overflow page overloads:
252 * The amount of overflow data stored on each page is stored in the
255 * The implementation reference counts overflow items as it's possible
256 * for them to be promoted onto btree internal pages. The reference
257 * count is stored in the entries field.
259 #define OV_LEN(p) (((PAGE *)p)->hf_offset)
260 #define OV_REF(p) (((PAGE *)p)->entries)
262 /* Maximum number of bytes that you can put on an overflow page. */
263 #define P_MAXSPACE(psize) ((psize) - P_OVERHEAD)
265 /************************************************************************
267 ************************************************************************/
269 /* Each index references a group of bytes on the page. */
270 #define H_KEYDATA 1 /* Key/data item. */
271 #define H_DUPLICATE 2 /* Duplicate key/data item. */
272 #define H_OFFPAGE 3 /* Overflow key/data item. */
273 #define H_OFFDUP 4 /* Overflow page of duplicates. */
276 * The first and second types are H_KEYDATA and H_DUPLICATE, represented
277 * by the HKEYDATA structure:
279 * +-----------------------------------+
280 * | type | key/data ... |
281 * +-----------------------------------+
283 * For duplicates, the data field encodes duplicate elements in the data
286 * +---------------------------------------------------------------+
287 * | type | len1 | element1 | len1 | len2 | element2 | len2 |
288 * +---------------------------------------------------------------+
290 * Thus, by keeping track of the offset in the element, we can do both
291 * backward and forward traversal.
293 typedef struct _hkeydata {
294 u_int8_t type; /* 00: Page type. */
295 u_int8_t data[1]; /* Variable length key/data item. */
298 /* Get a HKEYDATA item for a specific index. */
299 #define GET_HKEYDATA(pg, indx) \
300 ((HKEYDATA *)P_ENTRY(pg, indx))
303 * The length of any HKEYDATA item. Note that indx is an element index,
306 #define LEN_HITEM(pg, pgsize, indx) \
307 (((indx) == 0 ? pgsize : pg->inp[indx - 1]) - pg->inp[indx])
309 #define LEN_HKEYDATA(pg, psize, indx) \
310 (((indx) == 0 ? psize : pg->inp[indx - 1]) - \
311 pg->inp[indx] - HKEYDATA_SIZE(0))
314 * Page space required to add a new HKEYDATA item to the page, with and
315 * without the index value.
317 #define HKEYDATA_SIZE(len) \
318 ((len) + SSZA(HKEYDATA, data))
319 #define HKEYDATA_PSIZE(len) \
320 (HKEYDATA_SIZE(len) + sizeof(db_indx_t))
322 /* Put a HKEYDATA item at the location referenced by a page entry. */
323 #define PUT_HKEYDATA(pe, kd, len, type) { \
324 ((HKEYDATA *)pe)->type = type; \
325 memcpy((u_int8_t *)pe + sizeof(u_int8_t), kd, len); \
329 * Macros the describe the page layout in terms of key-data pairs.
330 * The use of "pindex" indicates that the argument is the index
331 * expressed in pairs instead of individual elements.
333 #define H_NUMPAIRS(pg) (NUM_ENT(pg) / 2)
334 #define H_KEYINDEX(pindx) (2 * (pindx))
335 #define H_DATAINDEX(pindx) ((2 * (pindx)) + 1)
336 #define H_PAIRKEY(pg, pindx) GET_HKEYDATA(pg, H_KEYINDEX(pindx))
337 #define H_PAIRDATA(pg, pindx) GET_HKEYDATA(pg, H_DATAINDEX(pindx))
338 #define H_PAIRSIZE(pg, psize, pindx) \
339 (LEN_HITEM(pg, psize, H_KEYINDEX(pindx)) + \
340 LEN_HITEM(pg, psize, H_DATAINDEX(pindx)))
341 #define LEN_HDATA(p, psize, pindx) LEN_HKEYDATA(p, psize, H_DATAINDEX(pindx))
342 #define LEN_HKEY(p, psize, pindx) LEN_HKEYDATA(p, psize, H_KEYINDEX(pindx))
345 * The third type is the H_OFFPAGE, represented by the HOFFPAGE structure:
347 * +-----------------------------------+
348 * | type | pgno_t | total len |
349 * +-----------------------------------+
351 typedef struct _hoffpage {
352 u_int8_t type; /* 00: Page type and delete flag. */
353 u_int8_t unused[3]; /* 01-03: Padding, unused. */
354 db_pgno_t pgno; /* 04-07: Offpage page number. */
355 u_int32_t tlen; /* 08-11: Total length of item. */
358 /* Get a HOFFPAGE item for a specific index. */
359 #define GET_HOFFPAGE(pg, indx) \
360 ((HOFFPAGE *)P_ENTRY(pg, indx))
363 * Page space required to add a new HOFFPAGE item to the page, with and
364 * without the index value.
366 #define HOFFPAGE_SIZE (sizeof(HOFFPAGE))
367 #define HOFFPAGE_PSIZE (HOFFPAGE_SIZE + sizeof(db_indx_t))
370 * The fourth type is H_OFFDUP represented by the HOFFDUP structure:
372 * +-----------------------+
374 * +-----------------------+
376 typedef struct _hoffdup {
377 u_int8_t type; /* 00: Page type and delete flag. */
378 u_int8_t unused[3]; /* 01-03: Padding, unused. */
379 db_pgno_t pgno; /* 04-07: Offpage page number. */
382 /* Get a HOFFDUP item for a specific index. */
383 #define GET_HOFFDUP(pg, indx) \
384 ((HOFFDUP *)P_ENTRY(pg, indx))
387 * Page space required to add a new HOFFDUP item to the page, with and
388 * without the index value.
390 #define HOFFDUP_SIZE (sizeof(HOFFDUP))
391 #define HOFFDUP_PSIZE (HOFFDUP_SIZE + sizeof(db_indx_t))
393 /************************************************************************
395 ************************************************************************/
397 /* Each index references a group of bytes on the page. */
398 #define B_KEYDATA 1 /* Key/data item. */
399 #define B_DUPLICATE 2 /* Duplicate key/data item. */
400 #define B_OVERFLOW 3 /* Overflow key/data item. */
403 * We have to store a deleted entry flag in the page. The reason is complex,
404 * but the simple version is that we can't delete on-page items referenced by
405 * a cursor -- the return order of subsequent insertions might be wrong. The
406 * delete flag is an overload of the top bit of the type byte.
408 #define B_DELETE (0x80)
409 #define B_DCLR(t) (t) &= ~B_DELETE
410 #define B_DSET(t) (t) |= B_DELETE
411 #define B_DISSET(t) ((t) & B_DELETE)
413 #define B_TYPE(t) ((t) & ~B_DELETE)
414 #define B_TSET(t, type, deleted) { \
421 * The first type is B_KEYDATA, represented by the BKEYDATA structure:
423 * +-----------------------------------+
424 * | length | type | key/data |
425 * +-----------------------------------+
427 typedef struct _bkeydata {
428 db_indx_t len; /* 00-01: Key/data item length. */
429 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
430 u_int8_t data[1]; /* Variable length key/data item. */
433 /* Get a BKEYDATA item for a specific index. */
434 #define GET_BKEYDATA(pg, indx) \
435 ((BKEYDATA *)P_ENTRY(pg, indx))
438 * Page space required to add a new BKEYDATA item to the page, with and
439 * without the index value.
441 #define BKEYDATA_SIZE(len) \
442 ALIGN((len) + SSZA(BKEYDATA, data), 4)
443 #define BKEYDATA_PSIZE(len) \
444 (BKEYDATA_SIZE(len) + sizeof(db_indx_t))
447 * The second and third types are B_DUPLICATE and B_OVERFLOW, represented
448 * by the BOVERFLOW structure:
450 * +-----------------------------------+
451 * | total len | type | unused |
452 * +-----------------------------------+
453 * | nxt: page | nxt: off | nxt: len |
454 * +-----------------------------------+
456 typedef struct _boverflow {
457 db_indx_t unused1; /* 00-01: Padding, unused. */
458 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
459 u_int8_t unused2; /* 03: Padding, unused. */
460 db_pgno_t pgno; /* 04-07: Next page number. */
461 u_int32_t tlen; /* 08-11: Total length of item. */
464 /* Get a BOVERFLOW item for a specific index. */
465 #define GET_BOVERFLOW(pg, indx) \
466 ((BOVERFLOW *)P_ENTRY(pg, indx))
469 * Page space required to add a new BOVERFLOW item to the page, with and
470 * without the index value.
472 #define BOVERFLOW_SIZE \
473 ALIGN(sizeof(BOVERFLOW), 4)
474 #define BOVERFLOW_PSIZE \
475 (BOVERFLOW_SIZE + sizeof(db_indx_t))
478 * Btree leaf and hash page layouts group indices in sets of two, one
479 * for the key and one for the data. Everything else does it in sets
480 * of one to save space. I use the following macros so that it's real
481 * obvious what's going on...
486 /************************************************************************
487 BTREE INTERNAL PAGE LAYOUT
488 ************************************************************************/
491 * Btree internal entry.
493 * +-----------------------------------+
494 * | leaf pgno | type | data ... |
495 * +-----------------------------------+
497 typedef struct _binternal {
498 db_indx_t len; /* 00-01: Key/data item length. */
499 u_int8_t type; /* 02: Page type AND DELETE FLAG. */
500 u_int8_t unused; /* 03: Padding, unused. */
501 db_pgno_t pgno; /* 04-07: Page number of referenced page. */
502 db_recno_t nrecs; /* 08-11: Subtree record count. */
503 u_int8_t data[1]; /* Variable length key item. */
506 /* Get a BINTERNAL item for a specific index. */
507 #define GET_BINTERNAL(pg, indx) \
508 ((BINTERNAL *)P_ENTRY(pg, indx))
511 * Page space required to add a new BINTERNAL item to the page, with and
512 * without the index value.
514 #define BINTERNAL_SIZE(len) \
515 ALIGN((len) + SSZA(BINTERNAL, data), 4)
516 #define BINTERNAL_PSIZE(len) \
517 (BINTERNAL_SIZE(len) + sizeof(db_indx_t))
519 /************************************************************************
520 RECNO INTERNAL PAGE LAYOUT
521 ************************************************************************/
524 * The recno internal entry.
526 * +-----------------------+
527 * | leaf pgno | # of recs |
528 * +-----------------------+
531 * Why not fold this into the db_indx_t structure, it's fixed length.
533 typedef struct _rinternal {
534 db_pgno_t pgno; /* 00-03: Page number of referenced page. */
535 db_recno_t nrecs; /* 04-07: Subtree record count. */
538 /* Get a RINTERNAL item for a specific index. */
539 #define GET_RINTERNAL(pg, indx) \
540 ((RINTERNAL *)P_ENTRY(pg, indx))
543 * Page space required to add a new RINTERNAL item to the page, with and
544 * without the index value.
546 #define RINTERNAL_SIZE \
547 ALIGN(sizeof(RINTERNAL), 4)
548 #define RINTERNAL_PSIZE \
549 (RINTERNAL_SIZE + sizeof(db_indx_t))
550 #endif /* _DB_PAGE_H_ */