db_dispatch.c
上传用户:romrleung
上传日期:2022-05-23
资源大小:18897k
文件大小:37k
- /*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996-2002
- * Sleepycat Software. All rights reserved.
- */
- /*
- * Copyright (c) 1995, 1996
- * The President and Fellows of Harvard University. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Margo Seltzer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
- #include "db_config.h"
- #ifndef lint
- static const char revid[] = "$Id: db_dispatch.c,v 11.121 2002/09/07 17:36:31 ubell Exp $";
- #endif /* not lint */
- #ifndef NO_SYSTEM_INCLUDES
- #include <sys/types.h>
- #include <stddef.h>
- #include <stdlib.h>
- #include <string.h>
- #endif
- #include "db_int.h"
- #include "dbinc/db_page.h"
- #include "dbinc/hash.h"
- #include "dbinc/log.h"
- #include "dbinc/fop.h"
- #include "dbinc/rep.h"
- #include "dbinc/txn.h"
- static int __db_limbo_fix __P((DB *,
- DB_TXN *, DB_TXNLIST *, db_pgno_t *, DBMETA *));
- static int __db_limbo_bucket __P((DB_ENV *, DB_TXN *, DB_TXNLIST *));
- static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *));
- static int __db_lock_move __P((DB_ENV *,
- u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *));
- static int __db_default_getpgnos __P((DB_ENV *, DB_LSN *lsnp, void *));
- static int __db_txnlist_find_internal __P((DB_ENV *, void *, db_txnlist_type,
- u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int));
- static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
- int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t));
- /*
- * __db_dispatch --
- *
- * This is the transaction dispatch function used by the db access methods.
- * It is designed to handle the record format used by all the access
- * methods (the one automatically generated by the db_{h,log,read}.sh
- * scripts in the tools directory). An application using a different
- * recovery paradigm will supply a different dispatch function to txn_open.
- *
- * PUBLIC: int __db_dispatch __P((DB_ENV *,
- * PUBLIC: int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)),
- * PUBLIC: size_t, DBT *, DB_LSN *, db_recops, void *));
- */
- int
- __db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info)
- DB_ENV *dbenv; /* The environment. */
- int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- size_t dtabsize; /* Size of the dtab. */
- DBT *db; /* The log record upon which to dispatch. */
- DB_LSN *lsnp; /* The lsn of the record being dispatched. */
- db_recops redo; /* Redo this op (or undo it). */
- void *info;
- {
- DB_LSN prev_lsn;
- u_int32_t rectype, txnid;
- int make_call, ret;
- memcpy(&rectype, db->data, sizeof(rectype));
- memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid));
- make_call = ret = 0;
- /* If we don't have a dispatch table, it's hard to dispatch. */
- DB_ASSERT(dtab != NULL);
- /*
- * If we find a record that is in the user's number space and they
- * have specified a recovery routine, let them handle it. If they
- * didn't specify a recovery routine, then we expect that they've
- * followed all our rules and registered new recovery functions.
- */
- switch (redo) {
- case DB_TXN_ABORT:
- case DB_TXN_APPLY:
- case DB_TXN_PRINT:
- make_call = 1;
- break;
- case DB_TXN_OPENFILES:
- /*
- * We collect all the transactions that have
- * "begin" records, those with no previous LSN,
- * so that we do not abort partial transactions.
- * These are known to be undone, otherwise the
- * log would not have been freeable.
- */
- memcpy(&prev_lsn, (u_int8_t *)db->data +
- sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn));
- if (txnid != 0 && prev_lsn.file == 0 && (ret =
- __db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0)
- return (ret);
- /* FALLTHROUGH */
- case DB_TXN_POPENFILES:
- if (rectype == DB___dbreg_register ||
- rectype == DB___txn_ckp || rectype == DB___txn_recycle)
- return (dtab[rectype](dbenv, db, lsnp, redo, info));
- break;
- case DB_TXN_BACKWARD_ROLL:
- /*
- * Running full recovery in the backward pass. If we've
- * seen this txnid before and added to it our commit list,
- * then we do nothing during this pass, unless this is a child
- * commit record, in which case we need to process it. If
- * we've never seen it, then we call the appropriate recovery
- * routine.
- *
- * We need to always undo DB___db_noop records, so that we
- * properly handle any aborts before the file was closed.
- */
- switch(rectype) {
- case DB___txn_regop:
- case DB___txn_recycle:
- case DB___txn_ckp:
- case DB___db_noop:
- case DB___fop_file_remove:
- case DB___txn_child:
- make_call = 1;
- break;
- case DB___dbreg_register:
- if (txnid == 0) {
- make_call = 1;
- break;
- }
- /* FALLTHROUGH */
- default:
- if (txnid != 0 && (ret =
- __db_txnlist_find(dbenv,
- info, txnid)) != TXN_COMMIT && ret != TXN_IGNORE) {
- /*
- * If not found then, this is an incomplete
- * abort.
- */
- if (ret == TXN_NOTFOUND)
- return (__db_txnlist_add(dbenv,
- info, txnid, TXN_IGNORE, lsnp));
- make_call = 1;
- if (ret == TXN_OK &&
- (ret = __db_txnlist_update(dbenv,
- info, txnid,
- rectype == DB___txn_xa_regop ?
- TXN_PREPARE : TXN_ABORT, NULL)) != 0)
- return (ret);
- }
- }
- break;
- case DB_TXN_FORWARD_ROLL:
- /*
- * In the forward pass, if we haven't seen the transaction,
- * do nothing, else recover it.
- *
- * We need to always redo DB___db_noop records, so that we
- * properly handle any commits after the file was closed.
- */
- switch(rectype) {
- case DB___txn_recycle:
- case DB___txn_ckp:
- case DB___db_noop:
- make_call = 1;
- break;
- default:
- if (txnid != 0 && (ret = __db_txnlist_find(dbenv,
- info, txnid)) == TXN_COMMIT)
- make_call = 1;
- else if (ret != TXN_IGNORE &&
- (rectype == DB___ham_metagroup ||
- rectype == DB___ham_groupalloc ||
- rectype == DB___db_pg_alloc)) {
- /*
- * Because we cannot undo file extensions
- * all allocation records must be reprocessed
- * during rollforward in case the file was
- * just created. It may not have been
- * present during the backward pass.
- */
- make_call = 1;
- redo = DB_TXN_BACKWARD_ALLOC;
- } else if (rectype == DB___dbreg_register) {
- /*
- * This may be a transaction dbreg_register.
- * If it is, we only make the call on a COMMIT,
- * which we checked above. If it's not, then we
- * should always make the call, because we need
- * the file open information.
- */
- if (txnid == 0)
- make_call = 1;
- }
- }
- break;
- case DB_TXN_GETPGNOS:
- /*
- * If this is one of DB's own log records, we simply
- * dispatch.
- */
- if (rectype < DB_user_BEGIN) {
- make_call = 1;
- break;
- }
- /*
- * If we're still here, this is a custom record in an
- * application that's doing app-specific logging. Such a
- * record doesn't have a getpgno function for the user
- * dispatch function to call--the getpgnos functions return
- * which pages replication needs to lock using the TXN_RECS
- * structure, which is private and not something we want to
- * document.
- *
- * Thus, we leave any necessary locking for the app's
- * recovery function to do during the upcoming
- * DB_TXN_APPLY. Fill in default getpgnos info (we need
- * a stub entry for every log record that will get
- * DB_TXN_APPLY'd) and return success.
- */
- return (__db_default_getpgnos(dbenv, lsnp, info));
- default:
- return (__db_unknown_flag(dbenv, "__db_dispatch", redo));
- }
- /*
- * The switch statement uses ret to receive the return value of
- * __db_txnlist_find, which returns a large number of different
- * statuses, none of which we will be returning. For safety,
- * let's reset this here in case we ever do a "return(ret)"
- * below in the future.
- */
- ret = 0;
- if (make_call) {
- if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL)
- return (dbenv->app_dispatch(dbenv, db, lsnp, redo));
- else {
- /*
- * The size of the dtab table argument is the same as
- * the standard table, use the standard table's size
- * as our sanity check.
- */
- if (rectype > dtabsize || dtab[rectype] == NULL) {
- __db_err(dbenv,
- "Illegal record type %lu in log",
- (u_long)rectype);
- return (EINVAL);
- }
- return (dtab[rectype](dbenv, db, lsnp, redo, info));
- }
- }
- return (0);
- }
- /*
- * __db_add_recovery --
- *
- * PUBLIC: int __db_add_recovery __P((DB_ENV *,
- * PUBLIC: int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *,
- * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
- */
- int
- __db_add_recovery(dbenv, dtab, dtabsize, func, ndx)
- DB_ENV *dbenv;
- int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- size_t *dtabsize;
- int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- u_int32_t ndx;
- {
- size_t i, nsize;
- int ret;
- /* Check if we have to grow the table. */
- if (ndx >= *dtabsize) {
- nsize = ndx + 40;
- if ((ret =
- __os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0)
- return (ret);
- for (i = *dtabsize; i < nsize; ++i)
- (*dtab)[i] = NULL;
- *dtabsize = nsize;
- }
- (*dtab)[ndx] = func;
- return (0);
- }
- /*
- * __db_txnlist_init --
- * Initialize transaction linked list.
- *
- * PUBLIC: int __db_txnlist_init __P((DB_ENV *,
- * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, void *));
- */
- int
- __db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp)
- DB_ENV *dbenv;
- u_int32_t low_txn, hi_txn;
- DB_LSN *trunc_lsn;
- void *retp;
- {
- DB_TXNHEAD *headp;
- u_int32_t tmp;
- int ret, size;
- /*
- * Size a hash table.
- * If low is zero then we are being called during rollback
- * and we need only one slot.
- * Hi maybe lower than low if we have recycled txnid's.
- * The numbers here are guesses about txn density, we can afford
- * to look at a few entries in each slot.
- */
- if (low_txn == 0)
- size = 1;
- else {
- if (hi_txn < low_txn) {
- tmp = hi_txn;
- hi_txn = low_txn;
- low_txn = tmp;
- }
- tmp = hi_txn - low_txn;
- /* See if we wrapped around. */
- if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
- tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
- size = tmp / 5;
- if (size < 100)
- size = 100;
- }
- if ((ret = __os_malloc(dbenv,
- sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
- return (ret);
- memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
- headp->maxid = hi_txn;
- headp->generation = 0;
- headp->nslots = size;
- headp->gen_alloc = 8;
- if ((ret = __os_malloc(dbenv, headp->gen_alloc *
- sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
- __os_free(dbenv, headp);
- return (ret);
- }
- headp->gen_array[0].generation = 0;
- headp->gen_array[0].txn_min = TXN_MINIMUM;
- headp->gen_array[0].txn_max = TXN_MAXIMUM;
- if (trunc_lsn != NULL)
- headp->trunc_lsn = *trunc_lsn;
- else
- ZERO_LSN(headp->trunc_lsn);
- ZERO_LSN(headp->maxlsn);
- ZERO_LSN(headp->ckplsn);
- *(void **)retp = headp;
- return (0);
- }
- /*
- * __db_txnlist_add --
- * Add an element to our transaction linked list.
- *
- * PUBLIC: int __db_txnlist_add __P((DB_ENV *,
- * PUBLIC: void *, u_int32_t, int32_t, DB_LSN *));
- */
- int
- __db_txnlist_add(dbenv, listp, txnid, status, lsn)
- DB_ENV *dbenv;
- void *listp;
- u_int32_t txnid;
- int32_t status;
- DB_LSN *lsn;
- {
- DB_TXNHEAD *hp;
- DB_TXNLIST *elp;
- int ret;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
- return (ret);
- hp = (DB_TXNHEAD *)listp;
- LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
- elp->type = TXNLIST_TXNID;
- elp->u.t.txnid = txnid;
- elp->u.t.status = status;
- elp->u.t.generation = hp->generation;
- if (txnid > hp->maxid)
- hp->maxid = txnid;
- if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
- hp->maxlsn = *lsn;
- DB_ASSERT(lsn == NULL ||
- status != TXN_COMMIT || log_compare(&hp->maxlsn, lsn) >= 0);
- return (0);
- }
- /*
- * __db_txnlist_remove --
- * Remove an element from our transaction linked list.
- *
- * PUBLIC: int __db_txnlist_remove __P((DB_ENV *, void *, u_int32_t));
- */
- int
- __db_txnlist_remove(dbenv, listp, txnid)
- DB_ENV *dbenv;
- void *listp;
- u_int32_t txnid;
- {
- DB_TXNLIST *entry;
- return (__db_txnlist_find_internal(dbenv,
- listp, TXNLIST_TXNID, txnid,
- NULL, &entry, 1) == TXN_NOTFOUND ? TXN_NOTFOUND : TXN_OK);
- }
- /*
- * __db_txnlist_ckp --
- * Used to record the maximum checkpoint that will be retained
- * after recovery. Typically this is simply the max checkpoint, but
- * if we are doing client replication recovery or timestamp-based
- * recovery, we are going to virtually truncate the log and we need
- * to retain the last checkpoint before the truncation point.
- *
- * PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, void *, DB_LSN *));
- */
- void
- __db_txnlist_ckp(dbenv, listp, ckp_lsn)
- DB_ENV *dbenv;
- void *listp;
- DB_LSN *ckp_lsn;
- {
- DB_TXNHEAD *hp;
- COMPQUIET(dbenv, NULL);
- hp = (DB_TXNHEAD *)listp;
- if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
- log_compare(&hp->maxlsn, ckp_lsn) >= 0)
- hp->ckplsn = *ckp_lsn;
- }
- /*
- * __db_txnlist_end --
- * Discard transaction linked list. Print out any error messages
- * for deleted files.
- *
- * PUBLIC: void __db_txnlist_end __P((DB_ENV *, void *));
- */
- void
- __db_txnlist_end(dbenv, listp)
- DB_ENV *dbenv;
- void *listp;
- {
- DB_TXNHEAD *hp;
- DB_TXNLIST *p;
- int i;
- if ((hp = (DB_TXNHEAD *)listp) == NULL)
- return;
- for (i = 0; i < hp->nslots; i++)
- while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
- LIST_REMOVE(p, links);
- switch (p->type) {
- case TXNLIST_LSN:
- __os_free(dbenv, p->u.l.lsn_array);
- break;
- default:
- /*
- * Possibly an incomplete DB_TXNLIST; just
- * free it.
- */
- break;
- }
- __os_free(dbenv, p);
- }
- if (hp->gen_array != NULL)
- __os_free(dbenv, hp->gen_array);
- __os_free(dbenv, listp);
- }
- /*
- * __db_txnlist_find --
- * Checks to see if a txnid with the current generation is in the
- * txnid list. This returns TXN_NOTFOUND if the item isn't in the
- * list otherwise it returns (like __db_txnlist_find_internal)
- * the status of the transaction. A txnid of 0 means the record
- * was generated while not in a transaction.
- *
- * PUBLIC: int __db_txnlist_find __P((DB_ENV *, void *, u_int32_t));
- */
- int
- __db_txnlist_find(dbenv, listp, txnid)
- DB_ENV *dbenv;
- void *listp;
- u_int32_t txnid;
- {
- DB_TXNLIST *entry;
- if (txnid == 0)
- return (TXN_NOTFOUND);
- return (__db_txnlist_find_internal(dbenv, listp,
- TXNLIST_TXNID, txnid, NULL, &entry, 0));
- }
- /*
- * __db_txnlist_update --
- * Change the status of an existing transaction entry.
- * Returns TXN_NOTFOUND if no such entry exists.
- *
- * PUBLIC: int __db_txnlist_update __P((DB_ENV *,
- * PUBLIC: void *, u_int32_t, u_int32_t, DB_LSN *));
- */
- int
- __db_txnlist_update(dbenv, listp, txnid, status, lsn)
- DB_ENV *dbenv;
- void *listp;
- u_int32_t txnid;
- u_int32_t status;
- DB_LSN *lsn;
- {
- DB_TXNHEAD *hp;
- DB_TXNLIST *elp;
- int ret;
- if (txnid == 0)
- return (TXN_NOTFOUND);
- hp = (DB_TXNHEAD *)listp;
- ret = __db_txnlist_find_internal(dbenv,
- listp, TXNLIST_TXNID, txnid, NULL, &elp, 0);
- if (ret == TXN_NOTFOUND)
- return (ret);
- elp->u.t.status = status;
- if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
- hp->maxlsn = *lsn;
- return (ret);
- }
- /*
- * __db_txnlist_find_internal --
- * Find an entry on the transaction list. If the entry is not there or
- * the list pointer is not initialized we return TXN_NOTFOUND. If the
- * item is found, we return the status. Currently we always call this
- * with an initialized list pointer but checking for NULL keeps it general.
- */
- static int
- __db_txnlist_find_internal(dbenv, listp, type, txnid, uid, txnlistp, delete)
- DB_ENV *dbenv;
- void *listp;
- db_txnlist_type type;
- u_int32_t txnid;
- u_int8_t uid[DB_FILE_ID_LEN];
- DB_TXNLIST **txnlistp;
- int delete;
- {
- DB_TXNHEAD *hp;
- DB_TXNLIST *p;
- int32_t generation;
- u_int32_t hash;
- struct __db_headlink *head;
- int i, ret;
- if ((hp = (DB_TXNHEAD *)listp) == NULL)
- return (TXN_NOTFOUND);
- switch (type) {
- case TXNLIST_TXNID:
- hash = txnid;
- /* Find the most recent generation containing this ID */
- for (i = 0; i <= hp->generation; i++)
- /* The range may wrap around the end. */
- if (hp->gen_array[i].txn_min <
- hp->gen_array[i].txn_max ?
- (txnid >= hp->gen_array[i].txn_min &&
- txnid <= hp->gen_array[i].txn_max) :
- (txnid >= hp->gen_array[i].txn_min ||
- txnid <= hp->gen_array[i].txn_max))
- break;
- DB_ASSERT(i <= hp->generation);
- generation = hp->gen_array[i].generation;
- break;
- case TXNLIST_PGNO:
- memcpy(&hash, uid, sizeof(hash));
- generation = 0;
- break;
- default:
- DB_ASSERT(0);
- return (EINVAL);
- }
- head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
- for (p = LIST_FIRST(head); p != NULL; p = LIST_NEXT(p, links)) {
- if (p->type != type)
- continue;
- switch (type) {
- case TXNLIST_TXNID:
- if (p->u.t.txnid != txnid ||
- generation != p->u.t.generation)
- continue;
- ret = p->u.t.status;
- break;
- case TXNLIST_PGNO:
- if (memcmp(uid, p->u.p.uid, DB_FILE_ID_LEN) != 0)
- continue;
- ret = 0;
- break;
- default:
- DB_ASSERT(0);
- ret = EINVAL;
- }
- if (delete == 1) {
- LIST_REMOVE(p, links);
- __os_free(dbenv, p);
- } else if (p != LIST_FIRST(head)) {
- /* Move it to head of list. */
- LIST_REMOVE(p, links);
- LIST_INSERT_HEAD(head, p, links);
- }
- *txnlistp = p;
- return (ret);
- }
- return (TXN_NOTFOUND);
- }
- /*
- * __db_txnlist_gen --
- * Change the current generation number.
- *
- * PUBLIC: int __db_txnlist_gen __P((DB_ENV *,
- * PUBLIC: void *, int, u_int32_t, u_int32_t));
- */
- int
- __db_txnlist_gen(dbenv, listp, incr, min, max)
- DB_ENV *dbenv;
- void *listp;
- int incr;
- u_int32_t min, max;
- {
- DB_TXNHEAD *hp;
- int ret;
- /*
- * During recovery generation numbers keep track of "restart"
- * checkpoints and recycle records. Restart checkpoints occur
- * whenever we take a checkpoint and there are no outstanding
- * transactions. When that happens, we can reset transaction IDs
- * back to TXNID_MINIMUM. Currently we only do the reset
- * at then end of recovery. Recycle records occrur when txnids
- * are exhausted during runtime. A free range of ids is identified
- * and logged. This code maintains a stack of ranges. A txnid
- * is given the generation number of the first range it falls into
- * in the stack.
- */
- hp = (DB_TXNHEAD *)listp;
- hp->generation += incr;
- if (incr < 0)
- memmove(hp->gen_array, &hp->gen_array[1],
- (hp->generation + 1) * sizeof(hp->gen_array[0]));
- else {
- if (hp->generation >= hp->gen_alloc) {
- hp->gen_alloc *= 2;
- if ((ret = __os_realloc(dbenv, hp->gen_alloc *
- sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
- return (ret);
- }
- memmove(&hp->gen_array[1], &hp->gen_array[0],
- hp->generation * sizeof(hp->gen_array[0]));
- hp->gen_array[0].generation = hp->generation;
- hp->gen_array[0].txn_min = min;
- hp->gen_array[0].txn_max = max;
- }
- return (0);
- }
/*
 * TXN_BUBBLE --
 *	One bubble pass over the first MAX entries of the DB_LSN array AP:
 *	each adjacent pair is swapped when the earlier entry compares less
 *	than the later one, so larger LSNs move toward index 0.
 *
 *	Every line of the body needs its continuation backslash, and the
 *	arguments are parenthesized so that expressions may be passed.
 */
#define	TXN_BUBBLE(AP, MAX) {						\
	int __j;							\
	DB_LSN __tmp;							\
									\
	for (__j = 0; __j < (MAX) - 1; __j++)				\
		if (log_compare(&(AP)[__j], &(AP)[__j + 1]) < 0) {	\
			__tmp = (AP)[__j];				\
			(AP)[__j] = (AP)[__j + 1];			\
			(AP)[__j + 1] = __tmp;				\
		}							\
}
- /*
- * __db_txnlist_lsnadd --
- * Add to or re-sort the transaction list lsn entry. Note that since this
- * is used during an abort, the __txn_undo code calls into the "recovery"
- * subsystem explicitly, and there is only a single TXNLIST_LSN entry on
- * the list.
- *
- * PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, void *, DB_LSN *, u_int32_t));
- */
- int
- __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
- DB_ENV *dbenv;
- void *listp;
- DB_LSN *lsnp;
- u_int32_t flags;
- {
- DB_TXNHEAD *hp;
- DB_TXNLIST *elp;
- int i, ret;
- hp = (DB_TXNHEAD *)listp;
- for (elp = LIST_FIRST(&hp->head[0]);
- elp != NULL; elp = LIST_NEXT(elp, links))
- if (elp->type == TXNLIST_LSN)
- break;
- if (elp == NULL)
- return (DB_SURPRISE_KID);
- if (LF_ISSET(TXNLIST_NEW)) {
- if (elp->u.l.ntxns >= elp->u.l.maxn) {
- if ((ret = __os_realloc(dbenv,
- 2 * elp->u.l.maxn * sizeof(DB_LSN),
- &elp->u.l.lsn_array)) != 0)
- return (ret);
- elp->u.l.maxn *= 2;
- }
- elp->u.l.lsn_array[elp->u.l.ntxns++] = *lsnp;
- } else
- /* Simply replace the 0th element. */
- elp->u.l.lsn_array[0] = *lsnp;
- /*
- * If we just added a new entry and there may be NULL entries, so we
- * have to do a complete bubble sort, not just trickle a changed entry
- * around.
- */
- for (i = 0; i < (!LF_ISSET(TXNLIST_NEW) ? 1 : elp->u.l.ntxns); i++)
- TXN_BUBBLE(elp->u.l.lsn_array, elp->u.l.ntxns);
- *lsnp = elp->u.l.lsn_array[0];
- return (0);
- }
- /*
- * __db_txnlist_lsninit --
- * Initialize a transaction list with an lsn array entry.
- *
- * PUBLIC: int __db_txnlist_lsninit __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
- */
- int
- __db_txnlist_lsninit(dbenv, hp, lsnp)
- DB_ENV *dbenv;
- DB_TXNHEAD *hp;
- DB_LSN *lsnp;
- {
- DB_TXNLIST *elp;
- int ret;
- elp = NULL;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
- goto err;
- LIST_INSERT_HEAD(&hp->head[0], elp, links);
- if ((ret = __os_malloc(dbenv,
- 12 * sizeof(DB_LSN), &elp->u.l.lsn_array)) != 0)
- goto err;
- elp->type = TXNLIST_LSN;
- elp->u.l.maxn = 12;
- elp->u.l.ntxns = 1;
- elp->u.l.lsn_array[0] = *lsnp;
- return (0);
- err: __db_txnlist_end(dbenv, hp);
- return (ret);
- }
- /*
- * __db_add_limbo -- add pages to the limbo list.
- * Get the file information and call pgnoadd for each page.
- *
- * PUBLIC: int __db_add_limbo __P((DB_ENV *,
- * PUBLIC: void *, int32_t, db_pgno_t, int32_t));
- */
- int
- __db_add_limbo(dbenv, info, fileid, pgno, count)
- DB_ENV *dbenv;
- void *info;
- int32_t fileid;
- db_pgno_t pgno;
- int32_t count;
- {
- DB_LOG *dblp;
- FNAME *fnp;
- int ret;
- dblp = dbenv->lg_handle;
- if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0)
- return (ret);
- do {
- if ((ret =
- __db_txnlist_pgnoadd(dbenv, info, fileid, fnp->ufid,
- R_ADDR(&dblp->reginfo, fnp->name_off), pgno)) != 0)
- return (ret);
- pgno++;
- } while (--count != 0);
- return (0);
- }
- /*
- * __db_do_the_limbo -- move pages from limbo to free.
- *
- * Limbo processing is what ensures that we correctly handle and
- * recover from page allocations. During recovery, for each database,
- * we process each in-question allocation, link them into the free list
- * and then write out the new meta-data page that contains the pointer
- * to the new beginning of the free list. On an abort, we use our
- * standard __db_free mechanism in a compensating transaction which logs
- * the specific modifications to the free list.
- *
- * If we run out of log space during an abort, then we can't write the
- * compensating transaction, so we abandon the idea of a compenating
- * transaction, and go back to processing how we do during recovery.
- * The reason that this is not the norm is that it's expensive: it requires
- * that we flush any database with an in-question allocation. Thus if
- * a compensating transaction fails, we never try to restart it.
- *
- * Since files may be open and closed within transactions (in particular,
- * the master database for subdatabases), we must be prepared to open
- * files during this process. If there is a compensating transaction, we
- * can open the files in that transaction. If this was an abort and there
- * is no compensating transaction, then we've got to perform these opens
- * in the context of the aborting transaction so that we do not deadlock.
- * During recovery, there's no locking, so this isn't an issue.
- *
- * What you want to keep in mind when reading this is that there are two
- * algorithms going on here: ctxn == NULL, then we're either in recovery
- * or our compensating transaction has failed and we're doing the
- * "create list and write meta-data page" algorithm. Otherwise, we're in
- * an abort and doing the "use compensating transaction" algorithm.
- *
- * PUBLIC: int __db_do_the_limbo __P((DB_ENV *,
- * PUBLIC: DB_TXN *, DB_TXN *, DB_TXNHEAD *));
- */
- int
- __db_do_the_limbo(dbenv, ptxn, txn, hp)
- DB_ENV *dbenv;
- DB_TXN *ptxn, *txn;
- DB_TXNHEAD *hp;
- {
- DB_TXNLIST *elp;
- int h, ret;
- ret = 0;
- /*
- * The slots correspond to hash buckets. We've hashed the
- * fileids into hash buckets and need to pick up all affected
- * files. (There will only be a single slot for an abort.)
- */
- for (h = 0; h < hp->nslots; h++) {
- if ((elp = LIST_FIRST(&hp->head[h])) == NULL)
- continue;
- if (ptxn != NULL) {
- if ((ret =
- __db_limbo_move(dbenv, ptxn, txn, elp)) != 0)
- goto err;
- } else if ((ret = __db_limbo_bucket(dbenv, txn, elp)) != 0)
- goto err;
- }
- err: if (ret != 0) {
- __db_err(dbenv, "Fatal error in abort of an allocation");
- ret = __db_panic(dbenv, ret);
- }
- return (ret);
- }
- /* Limbo support routines. */
- /*
- * __db_lock_move --
- * Move a lock from child to parent.
- */
- static int
- __db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn)
- DB_ENV *dbenv;
- u_int8_t *fileid;
- db_pgno_t pgno;
- db_lockmode_t mode;
- DB_TXN *ptxn, *txn;
- {
- DBT lock_dbt;
- DB_LOCK lock;
- DB_LOCK_ILOCK lock_obj;
- DB_LOCKREQ req;
- int ret;
- lock_obj.pgno = pgno;
- memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN);
- lock_obj.type = DB_PAGE_LOCK;
- memset(&lock_dbt, 0, sizeof(lock_dbt));
- lock_dbt.data = &lock_obj;
- lock_dbt.size = sizeof(lock_obj);
- if ((ret = dbenv->lock_get(dbenv,
- txn->txnid, 0, &lock_dbt, mode, &lock)) == 0) {
- memset(&req, 0, sizeof(req));
- req.lock = lock;
- req.op = DB_LOCK_TRADE;
- ret = dbenv->lock_vec(dbenv, ptxn->txnid, 0, &req, 1, NULL);
- }
- return (ret);
- }
- /*
- * __db_limbo_move
- * Move just the metapage lock to the parent.
- */
- static int
- __db_limbo_move(dbenv, ptxn, txn, elp)
- DB_ENV *dbenv;
- DB_TXN *ptxn, *txn;
- DB_TXNLIST *elp;
- {
- int ret;
- for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
- if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1)
- continue;
- if ((ret = __db_lock_move(dbenv, elp->u.p.uid,
- PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0)
- return (ret);
- elp->u.p.locked = 1;
- }
- return (0);
- }
- /*
- * __db_limbo_bucket
- * Perform limbo processing for a single hash bucket in the txnlist.
- * txn is the transaction aborting in the case of an abort and ctxn is the
- * compensating transaction.
- */
- #define T_RESTORED(txn) ((txn) != NULL && F_ISSET(txn, TXN_RESTORED))
- static int
- __db_limbo_bucket(dbenv, txn, elp)
- DB_ENV *dbenv;
- DB_TXN *txn;
- DB_TXNLIST *elp;
- {
- DB *dbp;
- DB_MPOOLFILE *mpf;
- DBMETA *meta;
- DB_TXN *ctxn, *t;
- db_pgno_t last_pgno, pgno;
- int dbp_created, in_retry, ret, t_ret;
- ctxn = NULL;
- in_retry = 0;
- meta = NULL;
- mpf = NULL;
- ret = 0;
- for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
- if (elp->type != TXNLIST_PGNO)
- continue;
- retry: dbp_created = 0;
- /*
- * Pick the transaction in which to potentially
- * log compensations.
- */
- if (!in_retry && !IS_RECOVERING(dbenv) && !T_RESTORED(txn)
- && (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0)
- return (ret);
- /*
- * Either use the compensating transaction or
- * the one passed in, which will be null if recovering.
- */
- t = ctxn == NULL ? txn : ctxn;
- /* First try to get a dbp by fileid. */
- ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0);
- /*
- * File is being destroyed. No need to worry about
- * dealing with recovery of allocations.
- */
- if (ret == DB_DELETED ||
- (ret == 0 && F_ISSET(dbp, DB_AM_DISCARD)))
- goto next;
- if (ret != 0) {
- if ((ret = db_create(&dbp, dbenv, 0)) != 0)
- goto err;
- /*
- * This tells the system not to lock, which is always
- * OK, whether this is an abort or recovery.
- */
- F_SET(dbp, DB_AM_COMPENSATE);
- dbp_created = 1;
- /* It is ok if the file is nolonger there. */
- dbp->type = DB_UNKNOWN;
- ret = __db_dbopen(dbp, t, elp->u.p.fname, NULL,
- DB_ODDFILESIZE, __db_omode("rw----"), PGNO_BASE_MD);
- if (ret == ENOENT)
- goto next;
- }
- /*
- * Verify that we are opening the same file that we were
- * referring to when we wrote this log record.
- */
- if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
- goto next;
- mpf = dbp->mpf;
- last_pgno = PGNO_INVALID;
- if (ctxn == NULL) {
- pgno = PGNO_BASE_MD;
- if ((ret =
- mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
- goto err;
- last_pgno = meta->free;
- }
- ret = __db_limbo_fix(dbp, ctxn, elp, &last_pgno, meta);
- /*
- * If we were doing compensating transactions, then we are
- * going to hope this error was due to running out of space.
- * We'll change modes (into the sync the file mode) and keep
- * trying. If we weren't doing compensating transactions,
- * then this is a real error and we're sunk.
- */
- if (ret != 0) {
- if (ret == DB_RUNRECOVERY || ctxn == NULL)
- goto err;
- in_retry = 1;
- goto retry;
- }
- if (ctxn != NULL) {
- ret = ctxn->commit(ctxn, DB_TXN_NOSYNC);
- ctxn = NULL;
- if (ret != 0)
- goto retry;
- goto next;
- }
- /*
- * This is where we handle the case where we're explicitly
- * putting together a free list. We need to decide whether
- * we have to write the meta-data page, and if we do, then
- * we need to sync it as well.
- */
- if (last_pgno == meta->free) {
- /* No change to page; just put the page back. */
- if ((ret = mpf->put(mpf, meta, 0)) != 0)
- goto err;
- meta = NULL;
- } else {
- /*
- * These changes are unlogged so we cannot have the
- * metapage pointing at pages that are not on disk.
- * Therefore, we flush the new free list, then update
- * the metapage. We have to put the meta-data page
- * first so that it isn't pinned when we try to sync.
- */
- if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn))
- __db_err(dbenv, "Flushing free list to disk");
- if ((ret = mpf->put(mpf, meta, 0)) != 0)
- goto err;
- meta = NULL;
- dbp->sync(dbp, 0);
- pgno = PGNO_BASE_MD;
- if ((ret =
- mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
- goto err;
- meta->free = last_pgno;
- if ((ret = mpf->put(mpf, meta, DB_MPOOL_DIRTY)) != 0)
- goto err;
- meta = NULL;
- }
- next:
- /*
- * If we get here, either we have processed the list
- * or the db file has been deleted or could no be opened.
- */
- if (ctxn != NULL &&
- (t_ret = ctxn->abort(ctxn)) != 0 && ret == 0)
- ret = t_ret;
- if (dbp_created &&
- (t_ret = __db_close_i(dbp, txn, 0)) != 0 && ret == 0)
- ret = t_ret;
- dbp = NULL;
- __os_free(dbenv, elp->u.p.fname);
- __os_free(dbenv, elp->u.p.pgno_array);
- if (ret == ENOENT)
- ret = 0;
- else if (ret != 0)
- goto err;
- }
- err: if (meta != NULL)
- (void)mpf->put(mpf, meta, 0);
- return (ret);
- }
- /*
- * __db_limbo_fix --
- * Process a single limbo entry which describes all the page allocations
- * for a single file.
- */
- static int
- __db_limbo_fix(dbp, ctxn, elp, lastp, meta)
- DB *dbp;
- DB_TXN *ctxn;
- DB_TXNLIST *elp;
- db_pgno_t *lastp;
- DBMETA *meta;
- {
- DBC *dbc;
- DB_MPOOLFILE *mpf;
- PAGE *freep, *pagep;
- db_pgno_t next, pgno;
- int i, put_page, ret, t_ret;
- /*
- * Loop through the entries for this txnlist element and
- * either link them into the free list or write a compensating
- * record for each.
- */
- put_page = 0;
- ret = 0;
- mpf = dbp->mpf;
- dbc = NULL;
- for (i = 0; i < elp->u.p.nentries; i++) {
- pgno = elp->u.p.pgno_array[i];
- if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
- goto err;
- put_page = 1;
- if (IS_ZERO_LSN(LSN(pagep))) {
- if (ctxn == NULL) {
- /*
- * If this is a fatal recovery which
- * spans a previous crash this page may
- * be on the free list already.
- */
- for (next = *lastp; next != 0; ) {
- if (next == pgno)
- break;
- if ((ret = mpf->get(mpf,
- &next, 0, &freep)) != 0)
- goto err;
- next = NEXT_PGNO(freep);
- if ((ret =
- mpf->put(mpf, freep, 0)) != 0)
- goto err;
- }
- if (next != pgno) {
- P_INIT(pagep, dbp->pgsize, pgno,
- PGNO_INVALID, *lastp, 0, P_INVALID);
- LSN(pagep) = LSN(meta);
- *lastp = pgno;
- }
- } else {
- P_INIT(pagep, dbp->pgsize, pgno,
- PGNO_INVALID, *lastp, 0, P_INVALID);
- if (dbc == NULL && (ret =
- dbp->cursor(dbp, ctxn, &dbc, 0)) != 0)
- goto err;
- /*
- * If the dbp is compensating (because we
- * opened it), the dbc will automatically be
- * marked compensating, but in case we didn't
- * do the open, we have to mark it explicitly.
- */
- F_SET(dbc, DBC_COMPENSATE);
- ret = __db_free(dbc, pagep);
- put_page = 0;
- /*
- * On any error, we hope that the error was
- * caused due to running out of space, and we
- * switch modes, doing the processing where we
- * sync out files instead of doing compensating
- * transactions. If this was a real error and
- * not out of space, we assume that some other
- * call will fail real soon.
- */
- if (ret != 0) {
- /* Assume that this is out of space. */
- (void)dbc->c_close(dbc);
- dbc = NULL;
- goto err;
- }
- }
- }
- if (put_page == 1) {
- ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY);
- put_page = 0;
- }
- if (ret != 0)
- goto err;
- }
- err: if (put_page &&
- (t_ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0)
- ret = t_ret;
- if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
- ret = t_ret;
- return (ret);
- }
- #define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */
- /*
- * __db_txnlist_pgnoadd --
- * Find the txnlist entry for a file and add this pgno, or add the list
- * entry for the file and then add the pgno.
- */
- static int
- __db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
- DB_ENV *dbenv;
- DB_TXNHEAD *hp;
- int32_t fileid;
- u_int8_t uid[DB_FILE_ID_LEN];
- char *fname;
- db_pgno_t pgno;
- {
- DB_TXNLIST *elp;
- u_int32_t hash;
- int len, ret;
- elp = NULL;
- if (__db_txnlist_find_internal(dbenv, hp,
- TXNLIST_PGNO, 0, uid, &elp, 0) != 0) {
- if ((ret =
- __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
- goto err;
- memcpy(&hash, uid, sizeof(hash));
- LIST_INSERT_HEAD(
- &hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links);
- elp->u.p.fileid = fileid;
- memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN);
- len = (int)strlen(fname) + 1;
- if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0)
- goto err;
- memcpy(elp->u.p.fname, fname, len);
- elp->u.p.maxentry = 0;
- elp->u.p.locked = 0;
- elp->type = TXNLIST_PGNO;
- if ((ret = __os_malloc(dbenv,
- 8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
- goto err;
- elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO;
- elp->u.p.nentries = 0;
- } else if (elp->u.p.nentries == elp->u.p.maxentry) {
- elp->u.p.maxentry <<= 1;
- if ((ret = __os_realloc(dbenv, elp->u.p.maxentry *
- sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
- goto err;
- }
- elp->u.p.pgno_array[elp->u.p.nentries++] = pgno;
- return (0);
- err: __db_txnlist_end(dbenv, hp);
- return (ret);
- }
- /*
- * __db_default_getpgnos --
- * Fill in default getpgnos information for an application-specific
- * log record.
- */
- static int
- __db_default_getpgnos(dbenv, lsnp, summary)
- DB_ENV *dbenv;
- DB_LSN *lsnp;
- void *summary;
- {
- TXN_RECS *t;
- int ret;
- t = (TXN_RECS *)summary;
- if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0)
- return (ret);
- t->array[t->npages].flags = LSN_PAGE_NOLOCK;
- t->array[t->npages].lsn = *lsnp;
- t->array[t->npages].fid = DB_LOGFILEID_INVALID;
- memset(&t->array[t->npages].pgdesc, 0,
- sizeof(t->array[t->npages].pgdesc));
- t->npages++;
- return (0);
- }
#ifdef DEBUG
/*
 * __db_txnlist_print --
 *	Print out the transaction list.
 *
 * listp is treated as a DB_TXNHEAD.  A header line with the maximum id and
 * generation is written to stdout, followed by one line per element found
 * in the hash buckets.  Only TXNLIST_TXNID elements are decoded; any other
 * element type is reported as unrecognized.
 *
 * PUBLIC: void __db_txnlist_print __P((void *));
 */
void
__db_txnlist_print(listp)
	void *listp;
{
	/* Printable names, indexed by the status stored in u.t.status. */
	static const char * const stats[] = {
		"ok", "commit", "prepare", "abort", "notfound",
		"ignore", "expected", "unexpected"
	};
	DB_TXNHEAD *hp;
	DB_TXNLIST *p;
	const char *status;
	int i;

	hp = (DB_TXNHEAD *)listp;

	printf("Maxid: %lu Generation: %lu\n",
	    (u_long)hp->maxid, (u_long)hp->generation);
	for (i = 0; i < hp->nslots; i++)
		for (p = LIST_FIRST(&hp->head[i]);
		    p != NULL; p = LIST_NEXT(p, links)) {
			switch (p->type) {
			case TXNLIST_TXNID:
				/*
				 * Never index the name table with a status
				 * outside its bounds; a negative value
				 * becomes a huge size_t and is rejected too.
				 */
				if ((size_t)p->u.t.status <
				    sizeof(stats) / sizeof(stats[0]))
					status = stats[p->u.t.status];
				else
					status = "unknown";
				printf("TXNID: %lx(%lu): %s\n",
				    (u_long)p->u.t.txnid,
				    (u_long)p->u.t.generation, status);
				break;
			default:
				printf("Unrecognized type: %d\n",
				    (int)p->type);
				break;
			}
		}
}
#endif