2 * See the file LICENSE for redistribution information.
4 * Copyright (c) 1996, 1997
5 * Sleepycat Software. All rights reserved.
8 * Copyright (c) 1995, 1996
9 * The President and Fellows of Harvard University. All rights reserved.
11 * This code is derived from software contributed to Berkeley by
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the University of
25 * California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 static const char sccsid[] = "@(#)txn.c 10.30 (Sleepycat) 9/23/97";
50 #ifndef NO_SYSTEM_INCLUDES
51 #include <sys/types.h>
70 #include "db_dispatch.h"
74 #include "common_ext.h"
/*
 * Forward declarations of the file-local (static) helpers used by the
 * public txn_* entry points; each parameter list is wrapped in __P.
 */
76 static int __txn_check_running __P((const DB_TXN *));
77 static int __txn_create __P((DB_ENV *, const char *, u_int));
78 static int __txn_end __P((DB_TXN *, int));
79 static int __txn_grow_region __P((DB_TXNMGR *));
80 static int __txn_undo __P((DB_TXN *));
81 static int __txn_validate_region __P((DB_TXNMGR *));
84 * This file contains the top level routines of the transaction library.
85 * It assumes that a lock manager and log manager that conform to the db_log(3)
86 * and db_lock(3) interfaces exist.
88 * Create and initialize a transaction region in shared memory.
89 * Return 0 on success, errno on failure.
/*
 * __txn_create: create the transaction region via __db_rcreate, fill in its
 * header, initialise the shared allocator over the space that follows the
 * header, then unlock and close the region again.
 *
 * NOTE(review): this listing omits some source lines (parameter
 * declarations, returns, closing braces); the comments below describe only
 * the statements that are visible.
 */
92 __txn_create(dbenv, path, mode)
97 DB_TXNREGION *txn_region;
/* Region capacity: the environment's tx_max, or 1000 when that is 0. */
101 maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 1000;
104 ret = __db_rcreate(dbenv, DB_APP_NONE, path,
105 DEFAULT_TXN_FILE, mode, TXN_REGION_SIZE(maxtxns), &fd, &txn_region);
107 /* Region may have existed. If it didn't, the open will fail. */
/* Header fields for a fresh region: identity, capacity and first ID. */
111 txn_region->magic = DB_TXNMAGIC;
112 txn_region->version = DB_TXNVERSION;
113 txn_region->maxtxns = maxtxns;
114 txn_region->last_txnid = TXN_MINIMUM;
115 /* XXX If we ever do more types of locking and logging, this changes. */
116 txn_region->logtype = 0;
117 txn_region->locktype = 0;
/* Checkpoint state: time stamp from "now", both checkpoint LSNs zeroed. */
118 txn_region->time_ckp = now;
119 ZERO_LSN(txn_region->last_ckp);
120 ZERO_LSN(txn_region->pending_ckp);
121 SH_TAILQ_INIT(&txn_region->active_txn);
/* Everything after the DB_TXNREGION header is handed to the allocator. */
122 __db_shalloc_init((void *)&txn_region[1],
123 TXN_REGION_SIZE(maxtxns) - sizeof(DB_TXNREGION));
125 /* Unlock the region. */
126 (void)__db_mutex_unlock(&txn_region->hdr.lock, fd);
128 /* Now unmap and close the region. */
129 if ((ret = __db_rclose(dbenv, fd, txn_region)) != 0) {
/* A failed close removes the region that was just set up. */
130 (void)txn_unlink(path, 1 /* force */, dbenv);
/*
 * txn_open: open the transaction region named by path (creating it first
 * when DB_CREATE is set) and build the malloc'ed DB_TXNMGR that describes
 * it for this caller.
 *
 * flags is validated against OKFLAGS; DB_THREAD is only part of OKFLAGS
 * when HAVE_SPINLOCKS is defined.  With DB_THREAD a mutex is allocated from
 * the region and initialised.
 *
 * NOTE(review): this listing omits a number of source lines (parameter
 * declarations, gotos, returns, closing braces); the comments describe only
 * the statements that are visible.
 */
137 txn_open(path, flags, mode, dbenv, mgrpp)
144 DB_TXNREGION *txn_regionp;
145 int fd, ret, retry_cnt;
151 /* Validate arguments. */
154 #ifdef HAVE_SPINLOCKS
155 #define OKFLAGS (DB_CREATE | DB_THREAD | DB_TXN_NOSYNC)
157 #define OKFLAGS (DB_CREATE | DB_TXN_NOSYNC)
159 if ((ret = __db_fchk(dbenv, "txn_open", flags, OKFLAGS)) != 0)
/*
 * A creator that gets EAGAIN sleeps one second and tries again, bounded by
 * the same limit of 3 that the open retry below uses (both share
 * retry_cnt).  The bound previously read "< 0", which a counter that only
 * counts up can never satisfy, so the create was never retried.
 */
163 retry: if (LF_ISSET(DB_CREATE) && (ret = __txn_create(dbenv, path, mode)) != 0)
164 if (ret == EAGAIN && ++retry_cnt < 3) {
165 (void)__db_sleep(1, 0);
167 } else /* We did not really create the region */
/* The region open never sees the txn-level flags; they are masked off. */
171 retry1: if ((ret = __db_ropen(dbenv, DB_APP_NONE, path, DEFAULT_TXN_FILE,
172 flags & ~(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC),
173 &fd, &txn_regionp)) != 0) {
174 if (ret == EAGAIN && ++retry_cnt < 3) {
175 (void)__db_sleep(1, 0);
182 /* Check if valid region. */
183 if (txn_regionp->magic != DB_TXNMAGIC) {
184 __db_err(dbenv, "txn_open: Bad magic number");
189 /* Now, create the transaction manager structure and set its fields. */
190 if ((tmgrp = (DB_TXNMGR *)malloc(sizeof(DB_TXNMGR))) == NULL) {
191 __db_err(dbenv, "txn_open: %s", strerror(ENOMEM));
196 tmgrp->dbenv = dbenv;
/* Recovery dispatch: the environment's tx_recover, else __db_dispatch. */
198 dbenv->tx_recover == NULL ? __db_dispatch : dbenv->tx_recover;
199 tmgrp->region = txn_regionp;
/* Cache the mapped size; __txn_validate_region compares against it. */
200 tmgrp->reg_size = txn_regionp->hdr.size;
202 tmgrp->flags = LF_ISSET(DB_TXN_NOSYNC | DB_THREAD);
/* The allocator arena starts immediately after the region header. */
203 tmgrp->mem = &txn_regionp[1];
204 tmgrp->mutexp = NULL;
205 TAILQ_INIT(&tmgrp->txn_chain);
/* Threaded callers get a mutex carved out of the region itself. */
206 if (LF_ISSET(DB_THREAD)) {
207 LOCK_TXNREGION(tmgrp);
208 if ((ret = __db_shalloc(tmgrp->mem, sizeof(db_mutex_t),
209 MUTEX_ALIGNMENT, &tmgrp->mutexp)) == 0)
210 __db_mutex_init(tmgrp->mutexp, -1);
211 UNLOCK_TXNREGION(tmgrp);
/*
 * Error path: close the region, unlink it if this call asked to create it,
 * and give back the mutex.
 * NOTE(review): the region is closed first, yet the mutex is then freed
 * into tmgrp->mem, which points inside that region (set above); confirm
 * this ordering is safe.
 */
218 out: if (txn_regionp != NULL)
219 (void)__db_rclose(dbenv, fd, txn_regionp);
220 if (flags & DB_CREATE)
221 (void)txn_unlink(path, 1, dbenv);
223 if (tmgrp->mutexp != NULL) {
224 LOCK_TXNREGION(tmgrp);
225 __db_shalloc_free(tmgrp->mem, tmgrp->mutexp);
226 UNLOCK_TXNREGION(tmgrp);
234 * Internally, we use TXN_DETAIL structures, but we allocate and return
235 * DB_TXN structures that provide access to the transaction ID and the
236 * offset in the transaction region of the TXN_DETAIL structure.
/*
 * txn_begin: allocate a TXN_DETAIL in the shared region and a malloc'ed
 * DB_TXN handle for it, assign the next transaction ID, put the detail on
 * the region's active list and the handle on the manager's txn_chain.
 *
 * NOTE(review): this listing omits some source lines (declarations, gotos,
 * returns, labels); the comments describe only the visible statements.
 */
239 txn_begin(tmgrp, parent, txnpp)
248 LOCK_TXNREGION(tmgrp);
/* Pick up any change in region size before touching the allocator. */
250 if ((ret = __txn_validate_region(tmgrp)) != 0)
253 /* Allocate a new transaction detail structure. */
/* On ENOMEM the region is grown once and the allocation is repeated. */
254 if ((ret = __db_shalloc(tmgrp->mem, sizeof(TXN_DETAIL), 0, &txnp)) != 0
255 && ret == ENOMEM && (ret = __txn_grow_region(tmgrp)) == 0)
256 ret = __db_shalloc(tmgrp->mem, sizeof(TXN_DETAIL), 0, &txnp);
261 /* Make sure that last_txnid is not going to wrap around. */
262 if (tmgrp->region->last_txnid == TXN_INVALID)
265 if ((retp = (DB_TXN *)malloc(sizeof(DB_TXN))) == NULL) {
266 __db_err(tmgrp->dbenv, "txn_begin : %s", strerror(ENOMEM));
/* Take the next ID and count the begin, still under the region lock. */
271 id = ++tmgrp->region->last_txnid;
272 tmgrp->region->nbegins++;
275 ZERO_LSN(txnp->last_lsn);
276 ZERO_LSN(txnp->begin_lsn);
278 txnp->status = TXN_RUNNING;
279 SH_TAILQ_INSERT_HEAD(&tmgrp->region->active_txn,
280 txnp, links, __txn_detail);
282 UNLOCK_TXNREGION(tmgrp);
284 ZERO_LSN(retp->last_lsn);
286 retp->parent = parent;
/* The handle stores the detail's byte offset from the region base. */
287 retp->off = (u_int8_t *)txnp - (u_int8_t *)tmgrp->region;
/*
 * With a log manager configured, write a TXN_BEGIN record; the record's
 * LSN lands in the detail's begin_lsn.
 */
290 if (tmgrp->dbenv->lg_info != NULL &&
291 (ret = __txn_regop_log(tmgrp->dbenv->lg_info,
292 retp, &txnp->begin_lsn, 0, TXN_BEGIN)) != 0) {
294 /* Deallocate transaction. */
295 LOCK_TXNREGION(tmgrp);
296 SH_TAILQ_REMOVE(&tmgrp->region->active_txn,
297 txnp, links, __txn_detail);
298 __db_shalloc_free(tmgrp->mem, txnp);
299 UNLOCK_TXNREGION(tmgrp);
/* The handle list has its own lock (LOCK_TXNTHREAD), not the region lock. */
304 LOCK_TXNTHREAD(tmgrp);
305 TAILQ_INSERT_TAIL(&tmgrp->txn_chain, retp, links);
306 UNLOCK_TXNTHREAD(tmgrp);
/*
 * Presumably the error exits (their labels are not shown in this listing):
 * free the detail structure, then drop the region lock.
 */
312 __db_shalloc_free(tmgrp->mem, txnp);
314 UNLOCK_TXNREGION(tmgrp);
318 /* The db_txn(3) man page describes txn_commit. */
/* Only a transaction whose status is TXN_RUNNING passes this check. */
326 if ((ret = __txn_check_running(txnp)) != 0)
/*
 * With a log manager configured, write a TXN_COMMIT record.  DB_FLUSH is
 * requested unless the manager carries DB_TXN_NOSYNC.
 */
330 if ((logp = txnp->mgrp->dbenv->lg_info) != NULL &&
331 (ret = __txn_regop_log(logp,
332 txnp, &txnp->last_lsn,
333 F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_COMMIT))
/* Finish through __txn_end with is_commit set. */
337 return (__txn_end(txnp, 1));
340 /* The db_txn(3) man page describes txn_abort. */
/* Only a transaction whose status is TXN_RUNNING passes this check. */
347 if ((ret = __txn_check_running(txnp)) != 0)
/* Roll the transaction's log records back before ending it. */
350 if ((ret = __txn_undo(txnp)) != 0) {
351 __db_err(txnp->mgrp->dbenv,
352 "txn_abort: Log undo failed %s", strerror(ret));
/* Finish through __txn_end with is_commit clear. */
355 return (__txn_end(txnp, 0));
359 * Flush the log so a future commit is guaranteed to succeed.
/*
 * Visible body of txn_prepare (name taken from the error string below):
 * flush the log up to the transaction's last LSN, then mark the detail
 * structure TXN_PREPARED.
 * NOTE(review): lines are missing from this listing, so the exact control
 * flow between the two steps cannot be seen here.
 */
368 if ((ret = __txn_check_running(txnp)) != 0)
371 if (txnp->mgrp->dbenv->lg_info != NULL) {
372 if ((ret = log_flush(txnp->mgrp->dbenv->lg_info,
373 &txnp->last_lsn)) != 0)
374 __db_err(txnp->mgrp->dbenv,
375 "txn_prepare: log_flush failed %s\n",
/*
 * The detail structure is located from the region base plus the offset
 * stored in the handle.
 * NOTE(review): the status lives in the region but is changed under
 * LOCK_TXNTHREAD, whereas other region updates in this file take
 * LOCK_TXNREGION; confirm that is intended.
 */
380 LOCK_TXNTHREAD(txnp->mgrp);
381 tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
382 tp->status = TXN_PREPARED;
383 UNLOCK_TXNTHREAD(txnp->mgrp);
388 * Return the transaction ID associated with a particular transaction
/* Hand back the transaction ID stored in the handle. */
394 return (txnp->txnid);
398 * The db_txn(3) man page describes txn_close. Currently the caller should
399 * arrange a checkpoint before calling txn_close.
409 * This function had better only be called once per process
410 * (i.e., not per thread), so there should be no synchronization
/*
 * Abort every transaction still on the manager's txn_chain.  The loop
 * re-reads the head of the list on each pass instead of following a next
 * pointer, so it depends on txn_abort unlinking the handle (__txn_end does
 * this with TAILQ_REMOVE).
 * NOTE(review): when txn_abort fails before it reaches __txn_end the head
 * does not change; check the loop body lines omitted from this listing for
 * how that case terminates.
 */
413 for (ret = 0, txnp = TAILQ_FIRST(&tmgrp->txn_chain);
414 txnp != TAILQ_END(&tmgrp->txn_chain);
415 txnp = TAILQ_FIRST(&tmgrp->txn_chain)) {
416 if ((t_ret = txn_abort(txnp)) != 0 && ret == 0)
/* With a log manager configured, flush the log (NULL LSN argument). */
420 if (tmgrp->dbenv->lg_info && (t_ret =
421 log_flush(tmgrp->dbenv->lg_info, NULL)) != 0 &&
/* Return the mutex to the region allocator while the region is mapped. */
425 if (tmgrp->mutexp != NULL) {
426 LOCK_TXNREGION(tmgrp);
427 __db_shalloc_free(tmgrp->mem, tmgrp->mutexp);
428 UNLOCK_TXNREGION(tmgrp);
/* Finally close the region itself. */
431 if ((t_ret = __db_rclose(tmgrp->dbenv, tmgrp->fd, tmgrp->region)) != 0
442 * The db_txn(3) man page describes txn_unlink. Right now it is up to
443 * txn_close to write the final checkpoint record.
/*
 * txn_unlink: remove the transaction region file; a thin wrapper that
 * passes path and force straight through to __db_runlink.
 */
446 txn_unlink(path, force, dbenv)
451 return (__db_runlink(dbenv,
452 DB_APP_NONE, path, DEFAULT_TXN_FILE, force));
455 /* Internal routines. */
458 * Return 0 if the txnp is reasonable, otherwise returns EINVAL.
/*
 * __txn_check_running: validate a transaction handle before it is
 * operated on.  Returns EINVAL when tp is NULL at the end, 0 otherwise.
 * NOTE(review): the lines that set tp to NULL are not part of this listing.
 */
461 __txn_check_running(txnp)
/* The handle, its manager and the mapped region must all be present. */
467 if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) {
/* Detail structure = region base + offset stored in the handle. */
468 tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
469 if (tp->status != TXN_RUNNING)
473 return (tp == NULL ? EINVAL : 0);
/*
 * __txn_end: common tail of commit and abort.  Unlinks the handle from the
 * manager's txn_chain, releases the transaction's locks, removes and frees
 * its detail structure in the region, bumps the commit or abort counter
 * and frees the handle.
 *
 * NOTE(review): this listing omits some source lines (declarations, part of
 * the lock_vec call, returns); only the visible statements are described.
 */
477 __txn_end(txnp, is_commit)
490 TAILQ_REMOVE(&mgr->txn_chain, txnp, links);
491 UNLOCK_TXNTHREAD(mgr);
493 /* Release the locks. */
/* The transaction ID is used as the locker ID for a DB_LOCK_PUT_ALL. */
494 locker = txnp->txnid;
495 request.op = DB_LOCK_PUT_ALL;
497 if (mgr->dbenv->lk_info) {
498 ret = lock_vec(mgr->dbenv->lk_info, locker, 0,
/* DB_LOCK_DEADLOCK is tolerated on abort; every other failure is reported. */
500 if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) {
501 __db_err(mgr->dbenv, "%s: release locks failed %s",
502 is_commit ? "txn_commit" : "txn_abort",
508 /* End the transaction. */
510 tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off);
511 SH_TAILQ_REMOVE(&mgr->region->active_txn, tp, links, __txn_detail);
512 __db_shalloc_free(mgr->mem, tp);
/* Exactly one of the two counters is meant to move (selector not shown). */
514 mgr->region->ncommits++;
516 mgr->region->naborts++;
517 UNLOCK_TXNREGION(mgr);
/* The handle was malloc'ed by txn_begin and is released here. */
519 FREE(txnp, sizeof(*txnp));
527 * Undo the transaction with id txnid. Returns 0 on success and
/*
 * Visible body of the undo routine: starting from the transaction's last
 * LSN, fetch each log record and hand it to the manager's recover function
 * with TXN_UNDO until the LSN becomes zero or an error occurs.
 * NOTE(review): the function header and several lines are missing from
 * this listing.
 */
541 logp = mgr->dbenv->lg_info;
546 * This is the simplest way to code this, but if the mallocs during
547 * recovery turn out to be a performance issue, we can do the
548 * allocation here and use DB_DBT_USERMEM.
550 memset(&rdbt, 0, sizeof(rdbt));
551 if (F_ISSET(logp, DB_AM_THREAD))
552 F_SET(&rdbt, DB_DBT_MALLOC);
554 key_lsn = txnp->last_lsn; /* structure assignment */
555 for (ret = 0; ret == 0 && !IS_ZERO_LSN(key_lsn);) {
557 * The dispatch routine returns the lsn of the record
558 * before the current one in the key_lsn argument.
560 if ((ret = log_get(logp, &key_lsn, &rdbt, DB_SET)) == 0) {
562 mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL);
563 if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) {
576 * Transaction checkpoint.
577 * If either kbytes or minutes is non-zero, then we only take the checkpoint
578 * if at least "minutes" minutes have passed since the last checkpoint or if
579 * at least "kbytes" kilobytes of log data have been written since then.
580 * When taking a checkpoint, find the oldest active transaction and figure out
581 * its first LSN. This is the lowest LSN we can checkpoint, since any record
582 * written since that point may be involved in a transaction and may
583 * therefore need to be undone in the case of an abort.
/*
 * txn_checkpoint: decide whether a checkpoint is due (elapsed time and/or
 * log bytes written), choose the checkpoint LSN, flush the buffer pool up
 * to it with memp_sync and, on success, write a checkpoint log record and
 * update the region's last_ckp and time_ckp.
 *
 * NOTE(review): this listing omits many source lines (declarations, gotos,
 * returns, comment delimiters); only the visible statements are described.
 */
586 txn_checkpoint(mgr, kbytes, minutes)
587 const DB_TXNMGR *mgr;
591 DB_LSN ckp_lsn, last_ckp;
593 u_int32_t bytes_written;
594 time_t last_ckp_time, now;
598 if (kbytes < 0 || minutes < 0)
602 * Check if we need to run recovery.
609 last_ckp_time = mgr->region->time_ckp;
610 UNLOCK_TXNREGION(mgr);
612 if (now - last_ckp_time >= (time_t)(minutes * 60))
/* NOTE(review): lg_info is used unchecked here but tested against NULL later; confirm. */
617 dblp = mgr->dbenv->lg_info;
618 LOCK_LOGREGION(dblp);
619 bytes_written = dblp->lp->written;
620 ckp_lsn = dblp->lp->lsn;
621 UNLOCK_LOGREGION(dblp);
622 if (bytes_written >= (u_int32_t)(kbytes * 1024))
627 * If we checked time and data and didn't go to checkpoint,
630 if (minutes != 0 || kbytes != 0)
634 if (IS_ZERO_LSN(ckp_lsn)) {
635 dblp = mgr->dbenv->lg_info;
636 LOCK_LOGREGION(dblp);
637 ckp_lsn = dblp->lp->lsn;
638 UNLOCK_LOGREGION(dblp);
642 * We have to find an LSN such that all transactions begun
643 * before that LSN are complete.
647 if (!IS_ZERO_LSN(mgr->region->pending_ckp))
648 ckp_lsn = mgr->region->pending_ckp;
651 SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail);
653 txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) {
656 * Look through the active transactions for the
659 if (!IS_ZERO_LSN(txnp->begin_lsn) &&
660 log_compare(&txnp->begin_lsn, &ckp_lsn) < 0)
661 ckp_lsn = txnp->begin_lsn;
664 mgr->region->pending_ckp = ckp_lsn;
665 UNLOCK_TXNREGION(mgr);
667 ret = memp_sync(mgr->dbenv->mp_info, &ckp_lsn);
670 "txn_checkpoint: system failure in memp_sync %s\n",
672 } else if (ret == 0 && mgr->dbenv->lg_info != NULL) {
674 last_ckp = mgr->region->last_ckp;
675 ZERO_LSN(mgr->region->pending_ckp);
676 UNLOCK_TXNREGION(mgr);
678 if ((ret = __txn_ckp_log(mgr->dbenv->lg_info,
679 NULL, &ckp_lsn, DB_CHECKPOINT, &ckp_lsn, &last_ckp)) != 0) {
681 "txn_checkpoint: log failed at LSN [%ld %ld] %s\n",
682 (long)ckp_lsn.file, (long)ckp_lsn.offset,
688 mgr->region->last_ckp = ckp_lsn;
689 (void)time(&mgr->region->time_ckp);
690 UNLOCK_TXNREGION(mgr);
693 * ret < 0 means that there are still buffers to flush; the
694 * checkpoint is not complete. Back off and try again.
700 * This is called at every interface to verify if the region
701 * has changed size, and if so, to remap the region in and
702 * reset the process pointers.
/*
 * __txn_validate_region: compare the size this manager has cached with the
 * size recorded in the region header; when they differ, remap the region
 * and refresh the cached size and allocator base.
 * NOTE(review): the return statements are not part of this listing.
 */
705 __txn_validate_region(tp)
/* Sizes agree: nothing to remap. */
710 if (tp->reg_size == tp->region->hdr.size)
713 /* Grow the region. */
714 if ((ret = __db_rremap(tp->dbenv, tp->region,
715 tp->reg_size, tp->region->hdr.size, tp->fd, &tp->region)) != 0)
/* tp->region may have changed, so everything derived from it is reset. */
718 tp->reg_size = tp->region->hdr.size;
719 tp->mem = &tp->region[1];
/*
 * __txn_grow_region: enlarge the region file, remap it, give the added
 * space to the shared allocator as a single free chunk and double the
 * recorded maxtxns.
 * NOTE(review): this listing omits some source lines (declarations,
 * returns); only the visible statements are described.
 */
725 __txn_grow_region(tp)
733 oldmax = tp->region->maxtxns;
/* NOTE(review): sized with sizeof(DB_TXN), but the region holds TXN_DETAIL; confirm. */
734 incr = oldmax * sizeof(DB_TXN);
736 if ((ret = __db_rgrow(tp->dbenv, tp->fd, incr)) != 0)
739 if ((ret = __db_rremap(tp->dbenv, tp->region,
740 tp->reg_size, tp->reg_size + incr, tp->fd, &tp->region)) != 0)
743 /* Throw the new space on the free list. */
/* curaddr is the first new byte: new region base plus the old size. */
744 curaddr = (u_int8_t *)tp->region + tp->reg_size;
745 tp->mem = &tp->region[1];
746 tp->reg_size += incr;
/* A size_t length word is written first; the address just past it is freed. */
748 *((size_t *)curaddr) = incr - sizeof(size_t);
749 curaddr += sizeof(size_t);
750 __db_shalloc_free(tp->mem, curaddr);
752 tp->region->maxtxns = 2 * oldmax;
/*
 * txn_stat: build a DB_TXN_STAT snapshot of the region counters, followed
 * in the same allocation by an array of DB_TXN_ACTIVE entries (ID and
 * begin LSN) for the active transactions.  Memory comes from db_malloc
 * when one is supplied, otherwise from malloc.
 *
 * NOTE(review): this listing omits some source lines (declarations, locking
 * calls, returns); only the visible statements are described.
 */
758 txn_stat(mgr, statp, db_malloc)
761 void *(*db_malloc) __P((size_t));
766 u_int32_t nactive, ndx;
/* First estimate of the active count: begins minus aborts minus commits. */
769 nactive = mgr->region->nbegins -
770 mgr->region->naborts - mgr->region->ncommits;
771 UNLOCK_TXNREGION(mgr);
774 * Allocate a bunch of extra active structures to handle any
775 * that have been created since we unlocked the region.
777 nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200);
778 if (db_malloc == NULL)
779 stats = (DB_TXN_STAT *)malloc(nbytes);
781 stats = (DB_TXN_STAT *)db_malloc(nbytes);
787 stats->st_last_txnid = mgr->region->last_txnid;
788 stats->st_last_ckp = mgr->region->last_ckp;
789 stats->st_maxtxns = mgr->region->maxtxns;
790 stats->st_naborts = mgr->region->naborts;
791 stats->st_nbegins = mgr->region->nbegins;
792 stats->st_ncommits = mgr->region->ncommits;
793 stats->st_pending_ckp = mgr->region->pending_ckp;
794 stats->st_time_ckp = mgr->region->time_ckp;
795 stats->st_nactive = stats->st_nbegins -
796 stats->st_naborts - stats->st_ncommits;
/* Clamp to the number of array slots that were actually allocated. */
797 if (stats->st_nactive > nactive + 200)
798 stats->st_nactive = nactive + 200;
/* The active array occupies the space directly after the stat structure. */
799 stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1];
802 for (txnp = SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail);
804 txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) {
805 stats->st_txnarray[ndx].txnid = txnp->txnid;
806 stats->st_txnarray[ndx].lsn = txnp->begin_lsn;
809 if (ndx >= stats->st_nactive)
813 UNLOCK_TXNREGION(mgr);