2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
10 static const char sccsid[] = "@(#)mp_bh.c 10.15 (Sleepycat) 8/29/97";
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
25 #include "common_ext.h"
27 static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
31 * Write the page associated with a given bucket header.
33 * PUBLIC: int __memp_bhwrite
34 * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
/*
 * __memp_bhwrite: find a DB_MPOOLFILE handle in this process that refers
 * to the MPOOLFILE mfp and hand the buffer header bhp to __memp_pgwrite.
 *
 * Visible steps:
 *   1. With dbmp->mutex held, walk dbmp->dbmfq for a handle whose mfp
 *      field equals the mfp argument.
 *   2. When mfp->ftype is nonzero, walk dbmp->dbregq (again under
 *      dbmp->mutex) for an entry with the same ftype.
 *   3. Fill in dbt from the pgcookie length and offset kept in mfp and
 *      call __memp_fopen with the path, ftype, page size, lsn offset and
 *      file id taken from mfp.
 *   4. At the found label, return the result of __memp_pgwrite.
 *
 * restartp and wrotep are only forwarded to __memp_pgwrite by the lines
 * shown here.
 *
 * NOTE(review): this listing omits lines of the function (declarations,
 * braces and the statements guarded by several of the conditions below),
 * so what happens when a search or the open fails cannot be confirmed
 * from here.
 */
37 __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
41 int *restartp, *wrotep;
53 * Walk the process' DB_MPOOLFILE list and find a file descriptor for
54 * the file. We also check that the descriptor is open for writing.
55 * If we find a descriptor on the file that's not open for writing, we
56 * try and upgrade it to make it writeable.
/* dbmp->mutex is held from here until the matching UNLOCKHANDLE below. */
58 LOCKHANDLE(dbmp, &dbmp->mutex);
59 for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
60 dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
/* A handle matches when it points at the same MPOOLFILE. */
61 if (dbmfp->mfp == mfp) {
/* __memp_upgrade is reached only for handles flagged MP_READONLY. */
62 if (F_ISSET(dbmfp, MP_READONLY) &&
63 __memp_upgrade(dbmp, dbmfp, mfp))
67 UNLOCKHANDLE(dbmp, &dbmp->mutex);
72 * It's not a page from a file we've opened. If the file requires
73 * input/output processing, see if this process has ever registered
74 * information as to how to write this type of file. If not, there's
77 if (mfp->ftype != 0) {
78 LOCKHANDLE(dbmp, &dbmp->mutex);
79 for (mpreg = LIST_FIRST(&dbmp->dbregq);
80 mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
81 if (mpreg->ftype == mfp->ftype)
83 UNLOCKHANDLE(dbmp, &dbmp->mutex);
89 * Try and open the file; ignore any error, assume it's a permissions
93 * There's no negative cache here, so we may repeatedly try and open
94 * files that we have previously tried (and failed) to open.
96 dbt.size = mfp->pgcookie_len;
97 dbt.data = ADDR(dbmp, mfp->pgcookie_off);
98 if (__memp_fopen(dbmp, ADDR(dbmp, mfp->path_off),
99 mfp->ftype, 0, 0, mfp->stat.st_pagesize,
100 mfp->lsn_off, &dbt, ADDR(dbmp, mfp->fileid_off), 0, &dbmfp) != 0)
103 found: return (__memp_pgwrite(dbmfp, bhp, restartp, wrotep));
108 * Read a page from a file.
110 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
/*
 * __memp_pgread: fill bhp->buf with one page of the file behind dbmfp.
 * The visible lines either read the page with __db_read or zero the
 * buffer, then run the pgin step and update the statistics.
 *
 * dbmfp       handle supplying the descriptor (dbmfp->fd), path and mutex
 * bhp         buffer header; bhp->pgno selects the page and bhp->buf
 *             receives pagesize bytes
 * can_create  not tested in any visible line; the error text below
 *             suggests it is the create flag that permits a missing
 *             page -- TODO confirm against the full source
 *
 * Flag handling visible here: BH_LOCKED and BH_TRASH are set on entry,
 * BH_TRASH is cleared after the pgin step (the existing comment says
 * only when it succeeded), and BH_LOCKED is cleared just before the
 * buffer lock is dropped.
 *
 * NOTE(review): lines are missing from this listing (declarations,
 * braces, several branch bodies and the return statements), so return
 * values and the exact branch structure are not documented here.
 */
113 __memp_pgread(dbmfp, bhp, can_create)
126 pagesize = mfp->stat.st_pagesize;
/* The buffer is flagged and locked before any file access. */
128 F_SET(bhp, BH_LOCKED | BH_TRASH);
129 LOCKBUFFER(dbmp, bhp);
133 * Temporary files may not yet have been created.
135 * Seek to the page location.
/* dbmfp->mutex is held across the seek and, on the normal path, the read. */
138 LOCKHANDLE(dbmp, &dbmfp->mutex);
/* __db_lseek is skipped when the handle has no descriptor (fd == -1). */
139 if (dbmfp->fd == -1 || (ret =
140 __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0) {
144 UNLOCKHANDLE(dbmp, &dbmfp->mutex);
145 __db_err(dbmp->dbenv,
146 "%s: page %lu doesn't exist, create flag not set",
147 dbmfp->path, (u_long)bhp->pgno);
150 UNLOCKHANDLE(dbmp, &dbmfp->mutex);
152 /* Clear any uninitialized data. */
153 memset(bhp->buf, 0, pagesize);
158 * Read the page; short reads are treated like creates, although
159 * any valid data is preserved.
161 ret = __db_read(dbmfp->fd, bhp->buf, pagesize, &nr);
162 UNLOCKHANDLE(dbmp, &dbmfp->mutex);
/* A full page was read when nr equals pagesize. */
166 if (nr == (ssize_t)pagesize)
174 /* Clear any uninitialized data. */
/* Short read: keep the nr bytes received and zero the remainder. */
175 memset(bhp->buf + nr, 0, pagesize - nr);
178 /* Call any pgin function. */
/* Files with ftype 0 skip __memp_pg; the 1 is passed as is_pgin. */
179 pgin: ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
181 /* Reacquire the region lock. */
184 /* If the pgin function succeeded, the data is now valid. */
186 F_CLR(bhp, BH_TRASH);
188 /* Update the statistics. */
/* Counters are kept both pool-wide (dbmp->mp->stat) and per file (mfp->stat). */
190 ++dbmp->mp->stat.st_page_create;
191 ++mfp->stat.st_page_create;
193 ++dbmp->mp->stat.st_page_in;
194 ++mfp->stat.st_page_in;
/* The err label takes the region lock ahead of the buffer release. */
198 err: LOCKREGION(dbmp);
201 /* Release the buffer. */
202 F_CLR(bhp, BH_LOCKED);
203 UNLOCKBUFFER(dbmp, bhp);
210 * Write a page to a file.
212 * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *));
/*
 * __memp_pgwrite: write the page held in bhp->buf to the file behind
 * dbmfp at page number bhp->pgno.
 *
 * dbmfp     handle supplying the descriptor, path and mutex
 * bhp       buffer header for the page being written
 * restartp  optional (compared against NULL before use); what is stored
 *           through it is not visible in this listing
 * wrotep    what is stored through it is not visible in this listing
 *
 * Visible order of work: lock the buffer, copy the LSN off the page,
 * flush the log up to that LSN when dbenv->lg_info is set, run the pgout
 * step through __memp_pg, create the temporary backing file if there is
 * no descriptor, seek and write, then update flags, checkpoint
 * bookkeeping and statistics.
 *
 * Failure labels: syserr reports the path, the failing call name (fail)
 * and the page number through __db_err; err releases the buffer lock and
 * clears BH_LOCKED.
 *
 * BH_CALLPGIN is set on both the success path and the err path in the
 * visible lines; any condition guarding those statements is not shown.
 *
 * NOTE(review): lines are missing from this listing (declarations,
 * braces, branch bodies and the return statements), so return values and
 * the restart handling are not documented here.
 */
215 __memp_pgwrite(dbmfp, bhp, restartp, wrotep)
218 int *restartp, *wrotep;
235 if (restartp != NULL)
240 pagesize = mfp->stat.st_pagesize;
/* BH_LOCKED is set and the buffer lock taken before any I/O is attempted. */
242 F_SET(bhp, BH_LOCKED);
243 LOCKBUFFER(dbmp, bhp);
246 if (restartp != NULL)
249 /* Copy the LSN off the page if we're going to need it. */
250 lg_info = dbenv->lg_info;
/* lsn is read from the page at byte offset mfp->lsn_off. */
251 if (lg_info != NULL || F_ISSET(bhp, BH_WRITE))
252 memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
254 /* Ensure the appropriate log records are on disk. */
/* log_flush is called only when dbenv->lg_info is non-NULL. */
255 if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0)
259 * Call any pgout function. We set the callpgin flag so that on
260 * error we flag that the contents of the buffer may be trash.
/* The 0 is passed as the is_pgin argument of __memp_pg. */
266 if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
270 /* Temporary files may not yet have been created. */
271 LOCKHANDLE(dbmp, &dbmfp->mutex);
/* With no descriptor yet, __db_appname (DB_APP_TMP) is given &dbmfp->fd to fill in. */
272 if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, DB_APP_TMP,
273 NULL, NULL, &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) {
274 UNLOCKHANDLE(dbmp, &dbmfp->mutex);
275 __db_err(dbenv, "unable to create temporary backing file");
279 /* Write the page out. */
281 __db_lseek(dbmfp->fd, pagesize, bhp->pgno, 0, SEEK_SET)) != 0)
283 else if ((ret = __db_write(dbmfp->fd, bhp->buf, pagesize, &nw)) != 0)
285 UNLOCKHANDLE(dbmp, &dbmfp->mutex);
289 * Shut the compiler up; it doesn't understand the correlation
290 * between the failing clauses to __db_lseek and __db_write and
297 if (nw != (ssize_t)pagesize) {
306 /* Reacquire the region lock. */
309 /* Clean up the flags based on a successful write. */
310 F_SET(bhp, BH_CALLPGIN);
311 F_CLR(bhp, BH_DIRTY | BH_LOCKED);
312 UNLOCKBUFFER(dbmp, bhp);
315 * If we wrote a buffer which a checkpoint is waiting for, update
316 * the count of pending buffers (both in the mpool as a whole and
317 * for this file). If the count for this file goes to zero, flush
321 * We ignore errors from the sync -- it makes no sense to return an
322 * error to the calling process, so set a flag causing the sync to
325 * If the buffer we wrote has a LSN larger than the current largest
326 * we've written for this checkpoint, update the saved value.
/* lsn is the value copied from the page earlier; mp->lsn is what it is compared with. */
329 if (F_ISSET(bhp, BH_WRITE)) {
330 if (log_compare(&lsn, &mp->lsn) > 0)
332 F_CLR(bhp, BH_WRITE);
/* The fsync below belongs to the case where this decrement reaches zero. */
335 if (--mfp->lsn_cnt == 0) {
337 * Don't lock -- there are no atomicity issues for
340 if (__db_fsync(dbmfp->fd) != 0)
341 F_SET(mp, MP_LSN_RETRY);
345 /* Update I/O statistics. */
346 ++mp->stat.st_page_out;
347 ++mfp->stat.st_page_out;
351 syserr: __db_err(dbenv,
352 "%s: %s failed for page %lu", dbmfp->path, fail, (u_long)bhp->pgno);
354 err: UNLOCKBUFFER(dbmp, bhp);
357 F_SET(bhp, BH_CALLPGIN);
358 F_CLR(bhp, BH_LOCKED);
364 * Call the pgin/pgout routine.
366 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
/*
 * __memp_pg: run the registered page conversion callback (pgin or pgout)
 * for the file type of the page in bhp.
 *
 * dbmfp    handle; its path is used in the error message
 * bhp      buffer header; bhp->pgno and bhp->buf are passed to the
 *          callback
 * is_pgin  selects the wording of the error message (pgin when nonzero,
 *          pgout otherwise); presumably it also selects which callback
 *          runs, but that test is not in the visible lines
 *
 * The callback receives dbtp as its third argument.  dbt is filled from
 * the pgcookie length and offset in mfp; how dbtp itself is assigned is
 * not visible here.
 *
 * NOTE(review): dbmp->mutex is released right after the cookie setup,
 * before either callback is invoked, and the err label releases it
 * again.  If the callback failures jump to err, that is a second unlock
 * of a mutex that is no longer held -- confirm against the full source.
 *
 * NOTE(review): lines are missing from this listing (declarations,
 * braces, the loop exit and the return statements), so return values are
 * not documented here.
 */
369 __memp_pg(dbmfp, bhp, is_pgin)
/* The registration list dbmp->dbregq is walked under dbmp->mutex. */
383 LOCKHANDLE(dbmp, &dbmp->mutex);
386 for (mpreg = LIST_FIRST(&dbmp->dbregq);
387 mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) {
/* Entries are matched on file type. */
388 if (ftype != mpreg->ftype)
/* A pgcookie_len of 0 is tested before dbt is filled in below. */
390 if (mfp->pgcookie_len == 0)
393 dbt.size = mfp->pgcookie_len;
394 dbt.data = ADDR(dbmp, mfp->pgcookie_off);
/* The mutex is dropped before calling out to the callbacks. */
397 UNLOCKHANDLE(dbmp, &dbmp->mutex);
/* A NULL pgin or pgout pointer is skipped rather than called. */
400 if (mpreg->pgin != NULL && (ret =
401 mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0)
404 if (mpreg->pgout != NULL && (ret =
405 mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0)
411 UNLOCKHANDLE(dbmp, &dbmp->mutex);
/* The err label unlocks dbmp->mutex and then reports the failing step. */
415 err: UNLOCKHANDLE(dbmp, &dbmp->mutex);
416 __db_err(dbmp->dbenv, "%s: %s failed for page %lu",
417 dbmfp->path, is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
423 * Free a bucket header and its referenced data.
425 * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int));
/*
 * __memp_bhfree: unlink the buffer header bhp from the pool hash list
 * and LRU chain, and release its memory.
 *
 * dbmp      pool handle; supplies htab, mp and the addr value passed to
 *           __db_shalloc_free
 * mfp       file the buffer belongs to; its offset feeds the hash
 * bhp       buffer header to remove
 * free_mem  not referenced in the visible lines; presumably controls
 *           whether __db_shalloc_free is called -- TODO confirm against
 *           the full source
 *
 * NOTE(review): lines are missing from this listing (declarations,
 * braces and the test around the free), so only the visible statements
 * are described.
 */
428 __memp_bhfree(dbmp, mfp, bhp, free_mem)
436 /* Delete the buffer header from the MPOOL hash list. */
/* off selects the hash bucket from the MPOOLFILE offset and the page number. */
437 off = BUCKET(dbmp->mp, OFFSET(dbmp, mfp), bhp->pgno);
/* mq is the linkage field for the hash chain; q is the one for the LRU chain. */
438 SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, mq, __bh);
440 /* Delete the buffer header from the LRU chain. */
441 SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh);
444 * If we're not reusing it immediately, free the buffer header
448 __db_shalloc_free(dbmp->addr, bhp);
453 * Upgrade a file descriptor from readonly to readwrite.
/*
 * __memp_upgrade: open the file path of mfp again, close the descriptor
 * currently held in dbmfp, and record the outcome in the handle flags.
 *
 * dbmp   pool handle, used to resolve mfp->path_off to an address
 * dbmfp  handle whose fd is closed and whose flags are updated
 * mfp    supplies path_off
 *
 * Flags: MP_UPGRADE is set after the descriptor swap and MP_UPGRADE_FAIL
 * is set when __db_fdopen fails; both are tested before the open is
 * attempted.
 *
 * NOTE(review): the return statements and the assignment that installs
 * fd into dbmfp->fd are not in the visible lines, and the function
 * continues past the end of this listing.  The visible caller uses the
 * result as a truth value in a condition.
 */
456 __memp_upgrade(dbmp, dbmfp, mfp)
465 * We expect the handle to already be locked.
468 /* Check to see if we've already upgraded. */
469 if (F_ISSET(dbmfp, MP_UPGRADE))
472 /* Check to see if we've already failed. */
473 if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
/* fd is passed by address to __db_fdopen; the meaning of the three 0 arguments is not visible here. */
477 if (__db_fdopen(ADDR(dbmp, mfp->path_off), 0, 0, 0, &fd) != 0) {
/* Remember the failure so later calls stop at the MP_UPGRADE_FAIL check above. */
478 F_SET(dbmfp, MP_UPGRADE_FAIL);
482 /* Swap the descriptors and set the upgrade flag. */
/*
 * The result of close is discarded.
 * NOTE(review): other I/O in this file goes through __db_ wrappers;
 * confirm that a direct close is intended here.
 */
483 (void)close(dbmfp->fd);
485 F_SET(dbmfp, MP_UPGRADE);