2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
/*
 * SCCS version string, embedded in the object file so what(1) can
 * identify the module and revision.  Do not edit the literal.
 */
10 static const char sccsid[] = "@(#)mp_fget.c 10.30 (Sleepycat) 10/25/97";
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
26 #include "common_ext.h"
/*
 * Debug-only knob: when nonzero, memp_fget() yields the processor on every
 * page get to shake out thread-scheduling bugs (see the __db_yield call
 * below).  NOTE(review): the double-underscore name is in the
 * implementation-reserved identifier space, but it matches this library's
 * internal naming convention and is referenced from other translation
 * units, so it must not be renamed here.
 */
28 int __sleep_on_every_page_get; /* XXX: thread debugging option. */
/*
 * NOTE(review): this span is a fragmentary extraction of memp_fget() --
 * the number opening each line is the original file's line number, and
 * many intervening lines (local declarations, braces, goto targets, the
 * closing brace and final return) are not visible here.  The code below
 * is left byte-for-byte untouched; comments annotate only what the
 * visible lines themselves establish, with unverifiable points hedged.
 */
32 * Get a page from the file.
/*
 * memp_fget --
 *	Return, through addrp, the address of a buffer containing page
 *	*pgnoaddr of the memory-pool file handle dbmfp.  flags is some
 *	combination of DB_MPOOL_CREATE, DB_MPOOL_LAST and DB_MPOOL_NEW
 *	(see OKFLAGS below); DB_MPOOL_LAST and DB_MPOOL_NEW store the
 *	selected page number back through pgnoaddr.
 *	Returns 0 on success, nonzero error value on failure (the return
 *	statement itself is outside this visible fragment).
 */
35 memp_fget(dbmfp, pgnoaddr, flags, addrp)
/* Visible locals; further declarations (bhp, tbhp, addr, mp, mfp, cnt,
 * size, lastpgno, ...) sit on lines missing from this fragment. */
46 size_t bucket, mf_offset;
49 int b_incr, b_inserted, readonly_alloc, ret;
58 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
59 * files here, and create non-existent pages in readonly files if the
60 * flags are set, later. The reason is that the hash access method
61 * wants to get empty pages that don't really exist in readonly files.
62 * The only alternative is for hash to write the last "bucket" all the
63 * time, which we don't want to do because one of our big goals in life
64 * is to keep database files small. It's sleazy as hell, but we catch
65 * any attempt to actually write the file in memp_fput().
/* Validate flags: only these three are legal, and (per the __db_ferr
 * call below) apparently at most one of them at a time. */
67 #define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
70 __db_fchk(dbmp->dbenv, "memp_fget", flags, OKFLAGS)) != 0)
80 return (__db_ferr(dbmp->dbenv, "memp_fget", 1));
87 * We want to switch threads as often as possible. Sleep every time
88 * we get a new page to make it more likely.
/* Debug hook: if the global knob is set, yield (or sleep when no yield
 * function is available -- the sleep call is on a missing line). */
90 if (__sleep_on_every_page_get &&
91 (__db_yield == NULL || __db_yield() != 0))
/* Key for this file in the shared region: its byte offset. */
97 mf_offset = R_OFFSET(dbmp, mfp);
100 b_incr = b_inserted = readonly_alloc = ret = 0;
105 * If mmap'ing the file, just return a pointer. However, if another
106 * process has opened the file for writing since we mmap'd it, start
107 * playing the game by their rules, i.e. everything goes through the
108 * cache. All pages previously returned should be safe, as long as
109 * a locking protocol was observed.
112 * We don't discard the map because we don't know when all of the
113 * pages will have been discarded from the process' address space.
114 * It would be possible to do so by reference counting the open
115 * pages from the mmap, but it's unclear to me that it's worth it.
/* Fast path: file is memory-mapped and still safely mappable. */
117 if (dbmfp->addr != NULL && dbmfp->mfp->can_mmap) {
/* Last page in the mapped region; an empty file maps to page 0. */
118 lastpgno = dbmfp->len == 0 ?
119 0 : (dbmfp->len - 1) / mfp->stat.st_pagesize;
120 if (LF_ISSET(DB_MPOOL_LAST))
121 *pgnoaddr = lastpgno;
125 * Allocate a page that can never really exist. See
126 * the comment above about non-existent pages and the
127 * hash access method.
/* CREATE/NEW on a mapped (hence readonly) file: fall through to the
 * cache allocation path -- presumably readonly_alloc is set on the
 * missing line 130; TODO confirm against the full source. */
129 if (LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))
131 else if (*pgnoaddr > lastpgno) {
132 __db_err(dbmp->dbenv,
133 "%s: page %lu doesn't exist",
134 dbmfp->path, (u_long)*pgnoaddr);
/* Page exists in the map: compute its address and return it directly
 * (the jump to the handle-locking exit is on a missing line). */
139 if (!readonly_alloc) {
140 addr = R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
150 * If requesting the last page or a new page, find the last page. The
151 * tricky thing is that the user may have created a page already that's
152 * after any page that exists in the file.
154 if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
156 * Temporary files may not yet have been created.
158 * Don't lock -- there are no atomicity issues for stat(2).
/* Ask the OS for the file size; on failure report errno text. */
163 __db_ioinfo(dbmfp->path, dbmfp->fd, &size, NULL)) != 0) {
164 __db_err(dbmp->dbenv,
165 "%s: %s", dbmfp->path, strerror(ret));
/* Last page that exists on disk. */
169 *pgnoaddr = size == 0 ? 0 : (size - 1) / mfp->stat.st_pagesize;
172 * Walk the list of BH's, looking for later pages. Save the
173 * pointer if a later page is found so that we don't have to
174 * search the list twice.
176 * If requesting a new page, return the page one after the last
177 * page -- which we'll have to create.
/* Scan the global buffer-header LRU queue for any cached page of this
 * file at or beyond the on-disk last page. */
179 for (tbhp = SH_TAILQ_FIRST(&mp->bhq, __bh);
180 tbhp != NULL; tbhp = SH_TAILQ_NEXT(tbhp, q, __bh))
181 if (tbhp->pgno >= *pgnoaddr &&
182 tbhp->mf_offset == mf_offset) {
/* NOTE(review): bhp is presumably assigned from tbhp on the missing
 * line 183 ("bhp = tbhp;"), otherwise this read would be stale --
 * verify against the full source. */
184 *pgnoaddr = bhp->pgno;
/* DB_MPOOL_NEW wants the page AFTER the last; the increment and the
 * bhp reset are on missing lines. */
186 if (LF_ISSET(DB_MPOOL_NEW))
190 /* If we already found the right buffer, return it. */
191 if (LF_ISSET(DB_MPOOL_LAST) && bhp != NULL) {
196 /* If we haven't checked the BH hash bucket queue, do the search. */
197 if (!LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW)) {
198 bucket = BUCKET(mp, mf_offset, *pgnoaddr);
/* Hash-chain walk; loop header's init line and the cnt bookkeeping
 * lines are partially missing from this fragment. */
200 bhp = SH_TAILQ_FIRST(&dbmp->htab[bucket], __bh);
201 bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
203 if (bhp->pgno == *pgnoaddr &&
204 bhp->mf_offset == mf_offset) {
/* Hit: record search statistics, then (on a missing line) jump to
 * the "found" label below. */
206 ++mp->stat.st_hash_searches;
207 if (cnt > mp->stat.st_hash_longest)
208 mp->stat.st_hash_longest = cnt;
209 mp->stat.st_hash_examined += cnt;
/* Miss: same statistics bookkeeping after the full chain was walked
 * (duplicated here because the hit path leaves the loop early). */
214 ++mp->stat.st_hash_searches;
215 if (cnt > mp->stat.st_hash_longest)
216 mp->stat.st_hash_longest = cnt;
217 mp->stat.st_hash_examined += cnt;
222 * Allocate a new buffer header and data space, and mark the contents
/* BH ends in a one-byte data array, so the allocation is
 * sizeof(BH) - 1 data byte + one full page. */
225 if ((ret = __memp_ralloc(dbmp, sizeof(BH) -
226 sizeof(u_int8_t) + mfp->stat.st_pagesize, NULL, &bhp)) != 0)
/* Sanity check: page data must be size_t-aligned for callers that
 * overlay structs on it. */
230 if ((ALIGNTYPE)addr & (sizeof(size_t) - 1)) {
231 __db_err(dbmp->dbenv,
232 "Internal error: BH data NOT size_t aligned.");
236 memset(bhp, 0, sizeof(BH));
237 LOCKINIT(dbmp, &bhp->mutex);
240 * Prepend the bucket header to the head of the appropriate MPOOL
241 * bucket hash list. Append the bucket header to the tail of the
244 * We have to do this before we read in the page so we can discard
245 * our region lock without screwing up the world.
247 bucket = BUCKET(mp, mf_offset, *pgnoaddr);
248 SH_TAILQ_INSERT_HEAD(&dbmp->htab[bucket], bhp, hq, __bh);
249 SH_TAILQ_INSERT_TAIL(&mp->bhq, bhp, q);
/* New buffers start clean. */
250 ++mp->stat.st_page_clean;
253 /* Set the page number, and associated MPOOLFILE. */
254 bhp->mf_offset = mf_offset;
255 bhp->pgno = *pgnoaddr;
258 * If we know we created the page, zero it out and continue.
261 * Note: DB_MPOOL_NEW deliberately doesn't call the pgin function.
262 * If DB_MPOOL_CREATE is used, then the application's pgin function
263 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
264 * it can detect all of its page creates, and not bother.
266 * Otherwise, read the page into memory, optionally creating it if
267 * DB_MPOOL_CREATE is set.
269 * Increment the reference count for created buffers, but importantly,
270 * increment the reference count for buffers we're about to read so
271 * that the buffer can't move.
/* Created page: just zero the data area, no disk I/O. */
276 if (LF_ISSET(DB_MPOOL_NEW))
277 memset(addr, 0, mfp->stat.st_pagesize);
280 * It's possible for the read function to fail, which means
281 * that we fail as well.
/* reread: target for the BH_TRASH case below -- re-fetch the page
 * contents from disk. */
283 reread: if ((ret = __memp_pgread(dbmfp,
284 bhp, LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW))) != 0)
289 * The __memp_pgread call discarded and reacquired the region
290 * lock. Because the buffer reference count was incremented
291 * before the region lock was discarded the buffer can't move
292 * and its contents can't change.
/* Pool-wide and per-file miss statistics. */
294 ++mp->stat.st_cache_miss;
295 ++mfp->stat.st_cache_miss;
299 found: /* Increment the reference count. */
/* Refuse to pin a buffer whose 16-bit reference count would wrap. */
300 if (bhp->ref == UINT16_T_MAX) {
301 __db_err(dbmp->dbenv,
302 "%s: too many references to page %lu",
303 dbmfp->path, bhp->pgno);
/* NOTE(review): bhp->pgno is passed for %lu without a (u_long) cast,
 * unlike the "%s: page %lu doesn't exist" message above -- undefined
 * behavior if db_pgno_t is not unsigned long; confirm and add the
 * cast in the full source. */
311 * Any found buffer might be trouble.
314 * I/O in progress, wait for it to finish. Because the buffer
315 * reference count was incremented before the region lock was
316 * discarded we know the buffer can't move and its contents
/* Lock/unlock the buffer mutex purely to block until the I/O holder
 * releases it (region lock drop/reacquire is on missing lines). */
319 if (F_ISSET(bhp, BH_LOCKED)) {
321 LOCKBUFFER(dbmp, bhp);
322 /* Waiting for I/O to finish... */
323 UNLOCKBUFFER(dbmp, bhp);
329 * The buffer is garbage.
/* Trashed contents: jump back and read the page again -- the goto
 * reread is presumably on the missing line 332. */
331 if (F_ISSET(bhp, BH_TRASH))
336 * The buffer was written, and the contents need to be
/* Run the application's pgin conversion before handing the page out,
 * then clear the flag so it runs only once. */
339 if (F_ISSET(bhp, BH_CALLPGIN)) {
340 if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
342 F_CLR(bhp, BH_CALLPGIN);
/* Cache-hit statistics (found path only). */
345 ++mp->stat.st_cache_hit;
346 ++mfp->stat.st_cache_hit;
/* mapret: common exit -- bump the per-handle pin count under the
 * handle mutex. */
349 mapret: LOCKHANDLE(dbmp, dbmfp->mutexp);
351 UNLOCKHANDLE(dbmp, dbmfp->mutexp);
355 * If no other process is already waiting on a created buffer,
356 * go ahead and discard it, it's not useful.
/* Error path: free a buffer we inserted if nobody else grabbed it. */
360 if (b_inserted && bhp->ref == 0)
361 __memp_bhfree(dbmp, mfp, bhp, 1);
/* Success: hand the page address back to the caller. */
366 *(void **)addrp = addr;