2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
8 * Copyright (c) 1995, 1996
9 * The President and Fellows of Harvard University. All rights reserved.
11 * This code is derived from software contributed to Berkeley by
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the University of
25 * California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 static const char sccsid[] = "@(#)txn.c 10.24 (Sleepycat) 9/3/97";
51 * This file contains the top level routines of the transaction library.
52 * It assumes that a lock manager and log manager that conform to the db_log(3)
53 * and db_lock(3) interfaces exist.
56 #ifndef NO_SYSTEM_INCLUDES
57 #include <sys/types.h>
76 #include "db_dispatch.h"
80 #include "common_ext.h"
/* Forward declarations of the file-local (static) helper routines. */
82 static int __txn_check_running __P((const DB_TXN *));
83 static int __txn_create __P((DB_ENV *, const char *, u_int));
84 static int __txn_end __P((DB_TXN *, int));
85 static int __txn_grow_region __P((DB_TXNMGR *));
86 static int __txn_undo __P((DB_TXN *));
87 static int __txn_validate_region __P((DB_TXNMGR *));
90 * Create and initialize a transaction region in shared memory.
92 * +1 means that the db_create failed, so we did not create the region.
93 * -1 means that we got some sort of system error.
/*
 * __txn_create --
 *	Create the transaction region via __db_rcreate, initialize its header
 *	and slot table, then unlock and close it again.
 */
96 __txn_create(dbenv, path, mode)
101 DB_TXNREGION *txn_region;
104 int fd, i, maxtxns, ret;
/* Size the table from the environment's tx_max, defaulting to 1000 slots. */
106 maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 1000;
109 ret = __db_rcreate(dbenv, DB_APP_NONE, path,
110 DEFAULT_TXN_FILE, mode, TXN_REGION_SIZE(maxtxns), &fd, &txn_region);
112 /* Region may have existed. If it didn't, the open will fail. */
/* Stamp the region header and reset the bookkeeping fields. */
116 txn_region->magic = DB_TXNMAGIC;
117 txn_region->version = DB_TXNVERSION;
118 txn_region->maxtxns = maxtxns;
119 txn_region->last_txnid = TXN_MINIMUM;
120 /* XXX If we ever do more types of locking and logging, this changes. */
121 txn_region->logtype = 0;
122 txn_region->locktype = 0;
/* The free list starts at slot 0. */
123 txn_region->free_txn = 0;
124 txn_region->time_ckp = now;
125 ZERO_LSN(txn_region->last_ckp);
126 ZERO_LSN(txn_region->pending_ckp);
/* Mark every table slot unallocated with a zeroed begin LSN. */
128 for (txnp = &txn_region->table[0], i = 0; i < maxtxns; i++, txnp++) {
129 ZERO_LSN(txnp->begin_lsn);
130 txnp->status = TXN_UNALLOC;
/*
 * The last slot's txnid is TXN_INVALID; txn_begin treats a free_txn of
 * TXN_INVALID as an exhausted free list and grows the region.
 */
133 txn_region->table[maxtxns - 1].txnid = TXN_INVALID;
135 /* Unlock the region. */
136 (void)__db_mutex_unlock(&txn_region->hdr.lock, fd);
138 /* Now unmap and close the region. */
/* If closing the region fails, force-unlink it. */
139 if ((ret = __db_rclose(dbenv, fd, txn_region)) != 0) {
140 (void)txn_unlink(path, 1 /* force */, dbenv);
/*
 * txn_open --
 *	Open the transaction region, creating it first when DB_CREATE is set,
 *	and build the per-process DB_TXNMGR that refers to it.
 *
 * flags is validated against OKFLAGS (DB_THREAD is only accepted when
 * HAVE_SPINLOCKS is defined).  On the error path (label "out") the region
 * is closed if it was opened and, when DB_CREATE is still set in flags,
 * unlinked.  The new manager is presumably handed back through mgrpp; that
 * assignment is on a line not shown here.
 */
148 txn_open(path, flags, mode, dbenv, mgrpp)
155 DB_TXNREGION *txn_regionp;
156 int fd, ret, retry_cnt;
162 /* Validate arguments. */
165 #ifdef HAVE_SPINLOCKS
166 #define OKFLAGS (DB_CREATE | DB_THREAD | DB_TXN_NOSYNC)
168 #define OKFLAGS (DB_CREATE | DB_TXN_NOSYNC)
170 if ((ret = __db_fchk(dbenv, "txn_open", flags, OKFLAGS)) != 0)
/*
 * Retry region creation on EAGAIN, sleeping one second between attempts.
 * The bound used to be "< 0", which can never hold for a counter that only
 * counts upward (assuming retry_cnt starts at 0; its initialisation is not
 * shown), so the sleep-and-retry arm was unreachable.  Use the same limit
 * of 3 as the __db_ropen retry below.
 */
174 retry: if (LF_ISSET(DB_CREATE) && (ret = __txn_create(dbenv, path, mode)) != 0)
175 if (ret == EAGAIN && ++retry_cnt < 3) {
176 (void)__db_sleep(1, 0);
178 } else /* We did not really create the region */
/* Open the region; the txn-specific flag bits are masked off first. */
182 retry1: if ((ret = __db_ropen(dbenv, DB_APP_NONE, path, DEFAULT_TXN_FILE,
183 flags & ~(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC),
184 &fd, &txn_regionp)) != 0) {
185 if (ret == EAGAIN && ++retry_cnt < 3) {
186 (void)__db_sleep(1, 0);
193 /* Check if valid region. */
194 if (txn_regionp->magic != DB_TXNMAGIC) {
195 __db_err(dbenv, "txn_open: Bad magic number");
200 /* Now, create the transaction manager structure and set its fields. */
201 if ((tmgrp = (DB_TXNMGR *)malloc(sizeof(DB_TXNMGR))) == NULL) {
202 __db_err(dbenv, "txn_open: %s", strerror(errno));
207 tmgrp->dbenv = dbenv;
/* Fall back to __db_dispatch when the environment supplies no tx_recover. */
209 dbenv->tx_recover == NULL ? __db_dispatch : dbenv->tx_recover;
210 tmgrp->region = txn_regionp;
/* Remember the mapped size so __txn_validate_region can detect growth. */
211 tmgrp->reg_size = txn_regionp->hdr.size;
213 tmgrp->flags = LF_ISSET(DB_TXN_NOSYNC | DB_THREAD);
214 TAILQ_INIT(&tmgrp->txn_chain);
/* The per-manager mutex is only initialized for threaded use. */
215 if (LF_ISSET(DB_THREAD))
216 __db_mutex_init(&tmgrp->mutex, -1);
/* Error exit: close the region if it was opened, unlink it if we created it. */
220 out: if (txn_regionp != NULL)
221 (void)__db_rclose(dbenv, fd, txn_regionp);
222 if (flags & DB_CREATE)
223 (void)txn_unlink(path, 1, dbenv);
230 * Internally, we use TXN_DETAIL structures, but we allocate and return
231 * DB_TXN structures that provide access to the transaction ID and the
232 * offset in the transaction region of the TXN_DETAIL structure.
/*
 * txn_begin --
 *	Start a new transaction: take a TXN_DETAIL slot off the region's free
 *	list, assign the next transaction ID, and build a heap-allocated
 *	DB_TXN handle that records the slot's byte offset within the region.
 *
 * The handle is appended to the manager's txn_chain on success and is
 * presumably returned through txnpp (that assignment is not shown here).
 * If writing the TXN_BEGIN log record fails, the slot is returned to the
 * free list in the TXN_UNALLOC state.
 */
235 txn_begin(tmgrp, parent, txnpp)
244 LOCK_TXNREGION(tmgrp);
/* Remap the region first if another process has grown it. */
246 if ((ret = __txn_validate_region(tmgrp)) != 0) {
247 UNLOCK_TXNREGION(tmgrp);
251 /* Remove element from free list. */
/* An empty free list (TXN_INVALID) means the region must be grown first. */
252 if (tmgrp->region->free_txn == TXN_INVALID &&
253 (ret = __txn_grow_region(tmgrp)) != 0) {
254 UNLOCK_TXNREGION(tmgrp);
/* In a free slot, txnid holds the index of the next free slot. */
258 index = tmgrp->region->free_txn;
259 txnp = &tmgrp->region->table[index];
260 tmgrp->region->free_txn = txnp->txnid;
/* A slot on the free list must be unallocated. */
262 if (txnp->status != TXN_UNALLOC) {
263 UNLOCK_TXNREGION(tmgrp);
267 /* Make sure that last_txnid is not going to wrap around. */
268 if (tmgrp->region->last_txnid == TXN_INVALID)
271 if ((retp = (DB_TXN *)malloc(sizeof(DB_TXN))) == NULL) {
272 __db_err(tmgrp->dbenv, "txn_begin : %s", strerror(ENOMEM));
273 UNLOCK_TXNREGION(tmgrp);
277 id = ++tmgrp->region->last_txnid;
278 tmgrp->region->nbegins++;
282 txnp->status = TXN_RUNNING;
283 ZERO_LSN(txnp->last_lsn);
284 ZERO_LSN(txnp->begin_lsn);
286 UNLOCK_TXNREGION(tmgrp);
288 ZERO_LSN(retp->last_lsn);
290 retp->parent = parent;
/* Store the slot as a byte offset from the region base, not a pointer. */
291 retp->off = (u_int8_t *)txnp - (u_int8_t *)tmgrp->region;
/* When a log is configured, record TXN_BEGIN; its LSN goes in begin_lsn. */
294 if (tmgrp->dbenv->lg_info != NULL &&
295 (ret = __txn_regop_log(tmgrp->dbenv->lg_info,
296 retp, &txnp->begin_lsn, 0, TXN_BEGIN)) != 0) {
298 /* Deallocate transaction. */
299 LOCK_TXNREGION(tmgrp);
/*
 * Reset the status before relinking the slot.  It was set to TXN_RUNNING
 * above; without this the next txn_begin would pop the slot, fail its
 * TXN_UNALLOC check and lose the slot from the free list.
 */
txnp->status = TXN_UNALLOC;
300 txnp->txnid = tmgrp->region->free_txn;
301 tmgrp->region->free_txn = txnp - &tmgrp->region->table[0];
302 UNLOCK_TXNREGION(tmgrp);
/* Link the new handle onto this manager's chain of transactions. */
307 LOCK_TXNTHREAD(tmgrp);
308 TAILQ_INSERT_TAIL(&tmgrp->txn_chain, retp, links);
309 UNLOCK_TXNTHREAD(tmgrp);
315 /* The db_txn(3) man page describes txn_commit. */
/* Reject the call unless the transaction passes __txn_check_running. */
323 if ((ret = __txn_check_running(txnp)) != 0)
/*
 * When a log is configured, write a TXN_COMMIT record; DB_FLUSH is passed
 * unless the manager has DB_TXN_NOSYNC set.
 */
327 if ((logp = txnp->mgrp->dbenv->lg_info) != NULL &&
328 (ret = __txn_regop_log(logp,
329 txnp, &txnp->last_lsn,
330 F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_COMMIT))
/* Finish the transaction, telling __txn_end that this is a commit. */
334 return (__txn_end(txnp, 1));
337 /* The db_txn(3) man page describes txn_abort. */
/* Reject the call unless the transaction passes __txn_check_running. */
344 if ((ret = __txn_check_running(txnp)) != 0)
/* Roll back the transaction's logged operations; report a failure. */
347 if ((ret = __txn_undo(txnp)) != 0) {
348 __db_err(txnp->mgrp->dbenv,
349 "txn_abort: Log undo failed %s", strerror(ret));
/* Finish the transaction, telling __txn_end that this is an abort. */
352 return (__txn_end(txnp, 0));
356 * Flush the log so a future commit is guaranteed to succeed.
/* Reject the call unless the transaction passes __txn_check_running. */
366 if ((ret = __txn_check_running(txnp)) != 0)
/* If a log is configured, flush it up to this transaction's last LSN. */
369 if (txnp->mgrp->dbenv->lg_info) {
370 ret = log_flush(txnp->mgrp->dbenv->lg_info, &txnp->last_lsn);
372 __db_err(txnp->mgrp->dbenv,
373 "txn_prepare: log_flush failed %s\n",
/*
 * Locate this transaction's TXN_DETAIL (region base + stored offset) and
 * mark it prepared.
 * NOTE(review): the status write here is bracketed by LOCK_TXNTHREAD,
 * whereas __txn_end changes status before an UNLOCK_TXNREGION -- confirm
 * which lock is meant to protect the shared slot.
 */
378 LOCK_TXNTHREAD(txnp->mgrp);
379 tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
380 tp->status = TXN_PREPARED;
381 UNLOCK_TXNTHREAD(txnp->mgrp);
386 * Return the transaction ID associated with a particular transaction
/* The ID is read directly from the DB_TXN handle; no locking is done. */
392 return (txnp->txnid);
396 * The db_txn(3) man page describes txn_close. Currently the caller should
397 * arrange a checkpoint before calling txn_close.
407 * This function had better only be called once per process
408 * (i.e., not per thread), so there should be no synchronization
/*
 * Abort every transaction still on the manager's chain.  The loop re-reads
 * the head each time, so it relies on txn_abort unlinking the transaction
 * (__txn_end does a TAILQ_REMOVE).  The "ret == 0" test suggests only the
 * first failure is kept in ret; the assignment is on a line not shown.
 */
411 for (ret = 0, txnp = TAILQ_FIRST(&tmgrp->txn_chain);
412 txnp != TAILQ_END(&tmgrp->txn_chain);
413 txnp = TAILQ_FIRST(&tmgrp->txn_chain)) {
414 if ((t_ret = txn_abort(txnp)) != 0 && ret == 0)
/* Flush the whole log (NULL LSN) when a log is configured. */
418 if (tmgrp->dbenv->lg_info && (t_ret =
419 log_flush(tmgrp->dbenv->lg_info, NULL)) != 0 &&
/* Close this process's mapping of the transaction region. */
423 if ((t_ret = __db_rclose(tmgrp->dbenv, tmgrp->fd, tmgrp->region)) != 0
433 * The db_txn(3) man page describes txn_unlink. Right now it is up to
434 * txn_close to write the final checkpoint record.
/*
 * txn_unlink --
 *	Thin wrapper: forwards path, force and dbenv to __db_runlink for
 *	DEFAULT_TXN_FILE and returns its result.
 */
437 txn_unlink(path, force, dbenv)
442 return (__db_runlink(dbenv,
443 DB_APP_NONE, path, DEFAULT_TXN_FILE, force));
446 /* Internal routines. */
449 * Returns 0 if txnp is reasonable; otherwise returns EINVAL.
/*
 * __txn_check_running --
 *	Validate a transaction handle before operating on it.
 *
 * Returns EINVAL when tp is NULL at the end, otherwise 0.  tp is looked up
 * only when txnp, its manager and the manager's region are all non-NULL.
 * The statement guarded by the status test is on a line not shown here;
 * presumably it clears tp so that EINVAL is returned.
 *
 * TXN_PREPARED is accepted as well as TXN_RUNNING: txn_prepare leaves the
 * transaction in the TXN_PREPARED state, and txn_commit/txn_abort both call
 * this routine first, so rejecting that state made a prepared transaction
 * impossible to commit or abort.
 */
452 __txn_check_running(txnp)
458 if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) {
/* The handle stores its slot as a byte offset from the region base. */
459 tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
460 if (tp->status != TXN_RUNNING && tp->status != TXN_PREPARED)
464 return (tp == NULL ? EINVAL : 0);
/*
 * __txn_end --
 *	Common tail of commit and abort: unlink the handle from the manager,
 *	release the transaction's locks, return its slot to the region's free
 *	list, update the statistics and free the handle.  is_commit selects
 *	the name used in the error message.
 */
468 __txn_end(txnp, is_commit)
/* Take the handle off the manager's chain of transactions. */
481 TAILQ_REMOVE(&mgr->txn_chain, txnp, links);
482 UNLOCK_TXNTHREAD(mgr);
484 /* Release the locks. */
/* The transaction ID doubles as the locker ID for lock_vec. */
485 locker = txnp->txnid;
486 request.op = DB_LOCK_PUT_ALL;
488 if (mgr->dbenv->lk_info) {
489 ret = lock_vec(mgr->dbenv->lk_info, locker, 0,
/* DB_LOCK_DEADLOCK is tolerated only on the abort path. */
491 if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) {
492 __db_err(mgr->dbenv, "%s: release locks failed %s",
493 is_commit ? "txn_commit" : "txn_abort",
499 /* End the transaction. */
/* Mark the slot unallocated and push it on the head of the free list. */
501 tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off);
502 tp->status = TXN_UNALLOC;
503 tp->txnid = mgr->region->free_txn;
504 mgr->region->free_txn = tp - &mgr->region->table[0];
/* One of the two counters is bumped; the selecting test is not shown. */
506 mgr->region->ncommits++;
508 mgr->region->naborts++;
509 UNLOCK_TXNREGION(mgr);
/* The DB_TXN handle is released here; callers must not reuse it. */
511 FREE(txnp, sizeof(*txnp));
518 * Undo the transaction with id txnid. Returns 0 on success and sets
519 * errno and returns -1 on failure.
532 logp = mgr->dbenv->lg_info;
537 * This is the simplest way to code this, but if the mallocs during
538 * recovery turn out to be a performance issue, we can do the
539 * allocation here and use DB_DBT_USERMEM.
/* Threaded log handles get DB_DBT_MALLOC set on the record DBT. */
541 memset(&rdbt, 0, sizeof(rdbt));
542 if (F_ISSET(logp, DB_AM_THREAD))
543 F_SET(&rdbt, DB_DBT_MALLOC);
/*
 * Walk the transaction's log records backwards, starting at its last LSN,
 * until a zero LSN is reached or a step fails.
 */
545 key_lsn = txnp->last_lsn; /* structure assignment */
546 for (ret = 0; ret == 0 && !IS_ZERO_LSN(key_lsn);) {
548 * The dispatch routine returns the lsn of the record
549 * before the current one in the key_lsn argument.
551 if ((ret = log_get(logp, &key_lsn, &rdbt, DB_SET)) == 0) {
/* Hand the record to the manager's recover callback in TXN_UNDO mode. */
553 mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL);
/* Presumably frees the malloc'ed record buffer; the body is not shown. */
554 if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) {
567 * Transaction checkpoint.
568 * If either kbytes or minutes is non-zero, then we only take the checkpoint if
569 * more than "minutes" minutes have passed since the last checkpoint or if
570 * more than "kbytes" of log data have been written since the last checkpoint.
571 * When taking a checkpoint, find the oldest active transaction and figure out
572 * its first LSN. This is the lowest LSN we can checkpoint, since any record
573 * written after that point may be involved in a transaction and may
574 * therefore need to be undone in the case of an abort.
/*
 * txn_checkpoint --
 *	Decide whether a checkpoint is due, pick the checkpoint LSN, sync the
 *	memory pool to it and write a checkpoint log record.
 */
577 txn_checkpoint(mgr, kbytes, minutes)
578 const DB_TXNMGR *mgr;
579 long kbytes, minutes;
582 DB_LSN ckp_lsn, last_ckp;
584 u_int32_t bytes_written, i;
585 time_t last_ckp_time, now;
/* Negative thresholds are rejected. */
589 if (kbytes < 0 || minutes < 0)
593 * Check if we need to run recovery.
/* Time test: compare the age of the last checkpoint against "minutes". */
600 last_ckp_time = mgr->region->time_ckp;
601 UNLOCK_TXNREGION(mgr);
603 if (now - last_ckp_time >= (time_t)(minutes * 60))
/*
 * Size test: read the log's "written" count and current LSN under the log
 * region lock and compare the count against "kbytes".
 */
608 dblp = mgr->dbenv->lg_info;
609 LOCK_LOGREGION(dblp);
610 bytes_written = dblp->lp->written;
611 ckp_lsn = dblp->lp->lsn;
612 UNLOCK_LOGREGION(dblp);
613 if (bytes_written >= (u_int32_t)(kbytes * 1024))
618 * If we checked time and data and didn't go to checkpoint,
621 if (minutes != 0 || kbytes != 0)
/* If no LSN was captured above, take the log's current LSN now. */
625 if (IS_ZERO_LSN(ckp_lsn)) {
626 dblp = mgr->dbenv->lg_info;
627 LOCK_LOGREGION(dblp);
628 ckp_lsn = dblp->lp->lsn;
629 UNLOCK_LOGREGION(dblp);
633 * We have to find an LSN such that all transactions begun
634 * before that LSN are complete.
/* A pending checkpoint LSN from an earlier attempt takes precedence. */
638 if (!IS_ZERO_LSN(mgr->region->pending_ckp))
639 ckp_lsn = mgr->region->pending_ckp;
641 for (txnp = &mgr->region->table[0], i = 0;
642 i < mgr->region->maxtxns; i++, txnp++) {
645 * Look through the transaction table for the LSN of
646 * the transaction that is in-use (e.g., not
647 * TXN_UNALLOC) and whose begin lsn is the lowest.
649 if (txnp->status != TXN_UNALLOC &&
650 !IS_ZERO_LSN(txnp->begin_lsn) &&
651 log_compare(&txnp->begin_lsn, &ckp_lsn) < 0)
652 ckp_lsn = txnp->begin_lsn;
/* Record the chosen LSN as pending before syncing the memory pool. */
655 mgr->region->pending_ckp = ckp_lsn;
656 UNLOCK_TXNREGION(mgr);
658 ret = memp_sync(mgr->dbenv->mp_info, &ckp_lsn);
661 "txn_checkpoint: system failure in memp_sync %s\n",
/* Sync succeeded and a log exists: clear pending and log the checkpoint. */
663 } else if (ret == 0 && mgr->dbenv->lg_info != NULL) {
665 last_ckp = mgr->region->last_ckp;
666 ZERO_LSN(mgr->region->pending_ckp);
667 UNLOCK_TXNREGION(mgr);
669 if ((ret = __txn_ckp_log(mgr->dbenv->lg_info,
670 NULL, &ckp_lsn, DB_CHECKPOINT, &ckp_lsn, &last_ckp)) != 0) {
672 "txn_checkpoint: log failed at LSN [%ld %ld] %s\n",
673 (long)ckp_lsn.file, (long)ckp_lsn.offset,
/* Publish the new checkpoint LSN and timestamp in the region. */
679 mgr->region->last_ckp = ckp_lsn;
680 (void)time(&mgr->region->time_ckp);
681 UNLOCK_TXNREGION(mgr);
684 * ret < 0 means that there are still buffers to flush; the
685 * checkpoint is not complete. Back off and try again.
691 * This is called at every interface to verify if the region
692 * has changed size, and if so, to remap the region in and
693 * reset the process pointers.
/*
 * __txn_validate_region --
 *	Compare the size this process has mapped (tp->reg_size) with the size
 *	recorded in the region header; if they differ, remap the region and
 *	adopt the new size.
 */
696 __txn_validate_region(tp)
/* Sizes agree: the existing mapping is still current. */
701 if (tp->reg_size == tp->region->hdr.size)
704 /* Grow the region. */
/* __db_rremap updates tp->region through its final argument. */
705 if ((ret = __db_rremap(tp->dbenv, tp->region,
706 tp->reg_size, tp->region->hdr.size, tp->fd, &tp->region)) != 0)
709 tp->reg_size = tp->region->hdr.size;
/*
 * __txn_grow_region --
 *	Double the number of transaction slots in the region: grow the
 *	backing file, remap it, initialize the new slots as a free list and
 *	make that list the region's free list.
 */
715 __txn_grow_region(tp)
723 oldmax = tp->region->maxtxns;
/*
 * Size the growth in TXN_DETAIL units.  The table slots are TXN_DETAIL
 * entries (other routines in this file address them as TXN_DETAIL *), not
 * DB_TXN handles, so sizeof(DB_TXN) was the wrong element size here.
 */
724 incr = oldmax * sizeof(TXN_DETAIL);
726 if ((ret = __db_rgrow(tp->dbenv, tp->fd, incr)) != 0)
/* Remap at the larger size; tp->region is updated through the last argument. */
729 if ((ret = __db_rremap(tp->dbenv, tp->region,
730 tp->reg_size, tp->reg_size + incr, tp->fd, &tp->region)) != 0)
732 tp->reg_size += incr;
735 * Initialize all the new transactions and up the transaction count.
/* Each new slot's txnid links to the following slot's index. */
737 for (i = 0, tx = &tp->region->table[oldmax]; i < oldmax; i++, tx++) {
738 ZERO_LSN(tx->begin_lsn);
739 tx->status = TXN_UNALLOC;
740 tx->txnid = oldmax + i + 1;
/* The free list now starts at the first new slot and ends at the last. */
742 tp->region->free_txn = oldmax;
743 tp->region->maxtxns = 2 * oldmax;
744 tp->region->table[tp->region->maxtxns - 1].txnid = TXN_INVALID;
/*
 * txn_stat --
 *	Build a DB_TXN_STAT snapshot of the region, followed in the same
 *	allocation by an array describing the in-use transaction slots.
 *	Memory comes from db_malloc when the caller supplies one, otherwise
 *	from malloc.
 */
750 txn_stat(mgr, statp, db_malloc)
753 void *(*db_malloc) __P((size_t));
757 u_int32_t i, nactive, ndx;
/* Estimate the number of active transactions from the region counters. */
760 nactive = mgr->region->nbegins -
761 mgr->region->naborts - mgr->region->ncommits;
762 UNLOCK_TXNREGION(mgr);
765 * Allocate a bunch of extra active structures to handle any
766 * that have been created since we unlocked the region.
768 nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200);
769 if (db_malloc == NULL)
770 stats = (DB_TXN_STAT *)malloc(nbytes);
772 stats = (DB_TXN_STAT *)db_malloc(nbytes);
/* Copy the region's counters and checkpoint state into the snapshot. */
778 stats->st_last_txnid = mgr->region->last_txnid;
779 stats->st_last_ckp = mgr->region->last_ckp;
780 stats->st_maxtxns = mgr->region->maxtxns;
781 stats->st_naborts = mgr->region->naborts;
782 stats->st_nbegins = mgr->region->nbegins;
783 stats->st_ncommits = mgr->region->ncommits;
784 stats->st_pending_ckp = mgr->region->pending_ckp;
785 stats->st_time_ckp = mgr->region->time_ckp;
786 stats->st_nactive = stats->st_nbegins -
787 stats->st_naborts - stats->st_ncommits;
/* Never report more active entries than the array was allocated for. */
788 if (stats->st_nactive > nactive + 200)
789 stats->st_nactive = nactive + 200;
/* The active array lives immediately after the DB_TXN_STAT header. */
790 stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1];
/*
 * Record the ID and begin LSN of every slot that is not TXN_UNALLOC.  The
 * ndx increment and the action taken once ndx reaches st_nactive are on
 * lines not shown here.
 */
792 for (ndx = 0, i = 0; i < mgr->region->maxtxns; i++)
793 if (mgr->region->table[i].status != TXN_UNALLOC) {
794 stats->st_txnarray[ndx].txnid =
795 mgr->region->table[i].txnid;
796 stats->st_txnarray[ndx].lsn =
797 mgr->region->table[i].begin_lsn;
800 if (ndx >= stats->st_nactive)
804 UNLOCK_TXNREGION(mgr);