/* Radix cross Linux */

/*
 *   Copyright (c) International Business Machines Corp., 2000-2002
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <config.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>

#include "jfs_types.h"
#include "jfs_endian.h"
#include "jfs_filsys.h"
#include "jfs_superblock.h"
#include "jfs_dinode.h"
#include "jfs_dtree.h"
#include "jfs_xtree.h"
#include "jfs_logmgr.h"
#include "jfs_dmap.h"
#include "jfs_imap.h"
#include "logredo.h"
#include "logform.h"
#include "devices.h"
#include "debug.h"
#include "utilsubs.h"
#include "fsck_message.h"		/* for chkdsk message logging facility */

/* + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 *
 *   L O C A L   M A C R O    D E F I N I T I O N S
 *
 */
/* MAKEDEV(hi, lo): build a dev_t from (hi << 16) OR-ed with lo. */
#define MAKEDEV(hi, lo)		((dev_t)(((hi) << 16) | (lo)))

/* LOGPNTOB(pn): shift pn left by L2LOGPSIZE. */
#define LOGPNTOB(pn)	((pn) << L2LOGPSIZE)

/*
 * LOG2NUM(NUM, L2NUM): store floor(log2(NUM)) in L2NUM, or -1 when
 * NUM <= 0.
 *
 * CAUTION: NUM must be a modifiable lvalue; for NUM > 1 it is shifted
 * right in place and is left holding 1 afterwards.  Pass a scratch
 * copy when the original value is still needed.  The expansion is a
 * bare brace block, not an expression.
 */
#define LOG2NUM(NUM, L2NUM)\
{\
	if ((NUM) <= 0)\
		L2NUM = -1;\
	else\
		for (L2NUM = 0; (NUM) > 1; (NUM) >>= 1)\
			L2NUM++;\
}

/* + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 *
 *    R E M E M B E R    M E M O R Y    A L L O C    F A I L U R E
 *
 */
/*
 * Insuff_memory_for_maps: when non-zero and replay otherwise finished
 * with rc == 0, jfs_logredo() returns ENOMEM25 instead of 0.
 * The *_stg_addr / *_stg_bytes pairs are not referenced in the part of
 * the file reviewed here; presumably they describe storage areas used
 * by the allocation helpers -- TODO confirm against alloc_storage().
 */
int32_t Insuff_memory_for_maps = 0;
char *available_stg_addr = NULL;
int32_t available_stg_bytes = 0;
char *bmap_stg_addr = NULL;
int32_t bmap_stg_bytes = 0;

/* + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 *
 *    S T U F F    F O R    T H E    L O G
 *
 */
/* logsup is read, validated and rewritten by jfs_logredo() */
struct logsuper logsup;		/* log super block */
/* numdoblk and numnodofile are reported at the end of jfs_logredo() */
int32_t numdoblk;		/* number of do blocks used     */
int32_t numnodofile;		/* number of nodo file blocks used  */
/* a non-zero numExtDtPg makes jfs_logredo() call doExtDtPg() after replay */
int32_t numExtDtPg = 0;		/* number of extended dtpage blocks used  */

/*
 *      open file system aggregate/lv array
 *
 * logredo() processes a single log.
 *
 * In the first release, logredo will process a single log which relates
 * to the single fileset in a single aggregate.  In some future release,
 * a single log may be used for multiple filesets which may or may not all
 * reside in the same aggregate.
 *
 */
/* one entry per aggregate/lv; indexed by the volume number used in this file */
struct vopen vopen[MAX_ACTIVE];
/* description of the log being replayed (location, fp, xaddr, size, serial, devnum) */
struct log_info Log;
/*
 * the volume that is already open; openVol() reuses primary_vol.fp when
 * a vopen[] entry carries the same uuid (outline log case)
 */
struct {
	uuid_t uuid;
	FILE *fp;
} primary_vol;
extern int LogOpenMode;		/* logdump sets this to O_RDONLY */

/*
 * if this flag is set then the primary superblock is
 * corrupt.  The secondary superblock is good, but chkdsk
 * wasn't able to fix the primary version.  logredo can
 * run, but must use the secondary version of the
 * aggregate superblock
 *
 * Set from the use_2nd_aggSuper argument of jfs_logredo() and
 * consulted by rdwrSuper() to choose SUPER2_OFF over SUPER1_OFF.
 */
int32_t use_2ndary_agg_superblock;
/*
 *      file system page buffer cache
 *
 * for k > 0, bufhdr[k] describes contents of buffer[k-1].
 * bufhdr[0] is reserved as anchor for free/lru list:
 * bufhdr[0].next points to the MRU buffer (head),
 * bufhdr[0].prev points to the LRU buffer (tail);
 */

/*
 * buffer header table
 *
 * Entry k (k >= 1) is passed to bflush() together with buffer[k - 1];
 * entry 0 has no data buffer of its own.
 */
struct bufhdr {
	int16_t next;		/* 2: next on free/lru list */
	int16_t prev;		/* 2: previous on free/lru list */
	int16_t hnext;		/* 2: next on hash chain */
	int16_t hprev;		/* 2: previous on hash chain */
	char modify;		/* 1: buffer was modified */
	char inuse;		/* 1: buffer on hash chain */
	int16_t reserve;	/* 2 */
	int32_t vol;		/* 4: minor of aggregate/lv number */
	pxd_t pxd;		/* 8: on-disk page pxd */
} bufhdr[NBUFPOOL];		/* (24) */

/* buffer table: NBUFPOOL - 1 data buffers of PSIZE bytes each */
struct bufpool {
	char bytes[PSIZE];
} buffer[NBUFPOOL - 1];

/*
 *      log page buffer cache
 *
 * log has its own 4 page buffer pool.
 */
/* handed to logRead() by jfs_logredo() as the destination for record data */
uint8_t afterdata[LOGPSIZE * 2];	/* buffer to read in redopage data */

/*
 * Miscellaneous
 */
extern caddr_t prog;			/* Program name */
extern int32_t mntcnt;
extern int32_t bufsize;
extern char *mntinfo;
extern int32_t retcode;		/* return code from logredo    */
/*
 * jfs_logredo() flushes the page buffers and resets this flag to 0
 * whenever it finds it non-zero after processing a log record; it is
 * set outside the code reviewed here.
 */
int end_of_transaction = 0;

/*
 * external references
 *
 * NOTE(review): jfs_logredo() also calls findLog(), which has no
 * prototype in this list -- confirm one of the included headers
 * declares it.
 */
extern char *optarg;
extern int optind;
extern int initMaps(int32_t);
extern int updateMaps(int);
extern int findEndOfLog(void);
extern int logRead(int32_t, struct lrd *, char *);
extern int logredoInit(void);
extern int doCommit(struct lrd *);
extern int doExtDtPg(void);
extern int doNoRedoFile(struct lrd *, uint32_t);
extern int doNoRedoPage(struct lrd *);
extern int doNoRedoInoExt(struct lrd *);
extern int doAfter(struct lrd *, int32_t);
extern int doUpdateMap(struct lrd *);
extern int alloc_wrksp(uint32_t, int, int, void **);

extern FILE * open_by_label(uuid_t, int, int, char *, int *);
extern char log_device[];
/*
 * forward references
 *
 * Prototypes for functions defined later in this file.
 */
int doMount(struct lrd *);
int openVol(int32_t);
int updateSuper(int vol);
int rdwrSuper(FILE *, struct superblock *, int32_t);
int bflush(int32_t, struct bufpool *);
int logOpen(void);
int fsError(int, int, int64_t);
int logError(int, int);
static int recoverExtendFS(FILE *);
int alloc_storage(int32_t, void **, int32_t *);
int alloc_dmap_bitrec(struct dmap_bitmaps **);

/*
 * debug control
 */
/*
 * dflag is 1 in _JFS_DEBUG builds and 0 otherwise.  The debug-only
 * helpers declared below exist only when _JFS_DEBUG is defined.
 */
#ifdef _JFS_DEBUG
int32_t dflag = 1;
time_t *Tp;
uint32_t tp_start, tp_end;
int xdump(char *, int);
int x_scmp(char *, char *);
void x_scpy(char *, char *);
int prtdesc(struct lrd *);
#else
int32_t dflag = 0;
#endif

/*
 * NAME:        jfs_logredo()
 *
 * FUNCTION:	Replay all transactions committed since the most
 *		recent synch point.
 *
 * NOTES:
 *	>>>>>> 	The log replay is accomplished in one pass over the
 *		log, reading backwards from logend to the first synch
 *		point record encountered.  This means that the log
 *		entries are read and processed in LIFO (Last-In-First-Out)
 *		order.  In other words, the records logged latest in
 *		time are the first records processed during log replay.
 *
 *	>>>>>> 	Inodes, index trees, and directory trees
 *
 *		Inodes, index tree structures, and directory tree
 *		structures are handled by processing committed redopage
 *		records which have not been superseded by noredo records.
 *		This processing copies data from the log record into the
 *		appropriate disk extent page(s).
 *
 *		To ensure that only the last (in time) updates to any
 *		given disk page are applied during log replay, logredo
 *		maintains a record (union structure summary1/summary2),
 *		for each disk page which it has processed, of which
 *		portions have been updated by log records encountered.
 *
 *	>>>>>> 	Inode Allocation Map processing
 *
 *		The xtree for the Inode Allocation Map is journaled, and
 *		a careful write is used to update it during commit
 *		processing.
 * The imap index tree is also duplicated at the known location. (TBD)
 * So at logredo time, the xtree for imap is always readable and correct.
 * This is the basic requirement from logredo.
 *
 * the inode map control page (struct dinomap) is only flushed to disk at
 * the umount time. For iag, pmap will go to disk at commit time.
 * iagnum will not change in run-time.
 * The agstart field is stable as long as the extendfs utility is not used.
 * How to handle agstart when the extendfs utility is available is TBD.
 * Other fields ( wmap. inosmap, extsmap ino free list pointers,
 * ino ext free list pointers ) are at working status ( i.e they are
 * updated in run-time. So the following
 * meta-data of the imap need to be reconstructed at the logredo time:
 *  1) IAGs, the pmap of imap and inoext array are contained in IAGs.
 *  2) AG Free inode list
 *  3) AG Free Inode Extent list
 *  4) IAG Free list
 *
 * There are two imaps need to take care of :
 *   1) aggregate imap
 *   2) fileset imap
 * For the first release, the aggregate imap is stable and we only
 * need to deal with the fileset imap.
 *
 * Block Allocation Map (bmap file) is for an aggregate/lv. There are
 * three fields related to the size of bmap file.
 *  1) superblock.s_size: This field indicates aggregate size. It
 *                        tells number of sector-size blocks for this
 *                        aggregate. The size of aggregate determines
 *                        the size of its bmap file.
 *                        Since the aggregate's superblock is updated
 *                        using sync-write, superblock.s_size is trustable
 *                        at logredo time.
 *               note1:   mkfs reserves the fsck space. So s_size really
 *                        indicates (size_of_aggregate - fsck_reserve_space)
 *               note2:   At the mkfs time, "-s" parameter could be used
 *                        to indicate how large the aggregate/filesystem is.
 *                        One lv contains at most one aggregate/filesystem.
 *                        If "-s" gives the value is smaller than the size
 *                        of lv, it is ok. The space is just wasted.
 *
 *                        Without the "-s" parameter, mkfs will use the whole
 *                        size of lv to make an aggregate/filesystem.
 *                        That is usually the case. So we can also say
 *                        an aggregate/lv. "-s" is often used for test.
 *
 *  2) dbmap.dn_mapsize: This field also indicates aggregate/lv size.
 *                        It tells number of aggre. blocks in the
 *                        aggregate/lv. Without extendfs, this field should
 *                        be equivalent to superblock.s_size.
 *                        With extendfs, this field may not be updated
 *                        before a system crash happens. So logredo
 *                        need to update it.
 *  3) dinode.di_size:  For an inode of bmap file, this field indicates
 *                        the logical size of the file. I.e. it contains
 *                        the offset value of the last byte written
 *                        in the file plus one.
 *                        So di_size will include the bmap control page,
 *                        the dmap control pages and dmap pages.
 *                        In the JFS, if a file is a sparse file, the logical
 *                        size is different from its physical size.
 *                        The bmap file is a sparse file if the total of
 *                        dmap pages is  ( < 1024) or ( < 1024 * 1024).
 *                        In that case, physically L1.0, and/or L2 does
 *                        not exist, but di_size will include their page
 *                        size.
 *
 *              Note:     The di_size does NOT contain the logical
 *                        structure of the file, i.e. the space allocated
 *                        for the xtree stuff is not indicated in di_size.
 *                        It is indicated in di_nblocks.
 *
 *                        In addition, the mkfs always put one more dmap
 *                        page into the bmap file for preparing extendfs.
 *                        This hidden dmap page cannot be figured out from
 *                        superblock.s_size, but di_size includes it. Any
 *                        dmapctl pages caused by this hidden dmap page
 *                        are also included in di_size.
 *
 * The bmap control page, dmap control pages and dmap pages are all
 * needed to rebuild at logredo time.
 *
 * Overall, the following actions are taken at logredo time:
 *   1) apply log rec data to the specified page.
 *   2) initialize freelist for dtree page or root.
 *   3) rebuild imap
 *   4) rebuild bmap
 *   in addition, in order to ensure the log record only applying to a
 *   certain portion of page one time, logredo will start NoRedoFile,
 *   NoRedoExtent/NoRedoPage filter in the process for accuracy and
 *   efficiency.
 *
 *  The four log rec types: REDOPAGE, NOREDOPAGE, NOREDOINOEXT, and
 *  UPDATEMAP, are the main force to initiate these actions.  See
 *  comments on doAfter(), updatePage(), doNoRedoPage(), doNoRedoInoExt,
 *  and doUpdateMap() for detailed information.
 *
 * If the aggregate/lv has state of FM_DIRTY, then fsck will run
 * after the logredo process since logredo could not get 100%
 * recovery. Currently bmap rebuild is slow ( 1 min per 32 GB),
 * so logredo will NOT rebuild imap and bmap if fsck will do it
 * anyway. But logredo still read maps in and mark them for starting
 * NoRedoExtent/NoRedoPage filter.
 *
 * The maps are rebuilt in the following way:
 * at the init phase, storage is allocated for the whole map file for
 * both imap and bmap. Reading in the map files from the disk.
 * The wmap is inited to zero. At the logredo time, the wmap is used
 * to track the bits in pmap. In the beginning of the logredo process
 * the allocation status of every block is in doubt. As log records
 * are processed, the allocation state is determined and the bit of pmap
 * is updated. This fact is recorded in the corresponding bits in wmap.
 * So a pmap bit is only updated once at logredo time and only updated
 * by the latest in time log record.
 * At the end of logredo, the control information, the freelist, etc.
 * are built from the value of pmap; then pmap is copied to wmap and
 * the whole map is written back to disk.
 *
 * the status field s_state in the superblock of each file-system is
 * set to FM_CLEAN provided the initial status was either FM_CLEAN
 * or FM_MOUNT and logredo processing was successful. If an error
 * is detected in logredo the status is set to FM_LOGREDO. the status
 * is not changed if its initial value was FM_MDIRTY. fsck should be
 * run to clean-up the probable damage if the status after logredo
 * is either FM_LOGREDO or FM_MDIRTY.
 *
 *  The log record has the format:
 *   <LogRecordData><LogRecLRD>
 *  At logredo time, the log is read backward. So for every log rec,
 *  we read LogRecLRD, which tells how long the LogRecordData is.
 *  see comments on updatePage() for detailed info of log record format.
 *
 *.....................................................................
 * The logredo handles the log-within-file-system (aka inline log) issue:
 *.....................................................................
 * For AIX, we always deal with the outline log, i.e. the log resides
 * in a separate logical volume. A log is associated with one volume
 * group and can be shared by many file systems with this volume group.
 * In AIX, the logredo received a device name. It then determines if
 * this device is a log name or a filesystem name. If it is a filesystem
 * name, get the log minor number for this filesystem. If it is a log name,
 * get its minor number.
 *
 * XJFS decided to put log inside the file system
 *
 * For supporting the inline log, the above AIX logic should be changed.
 *
 * Here is the outline:
 *
 * When the logredo received a device name, it first read the SIZE_OF_SUPER
 * bytes from SUPER1_OFF  offset to see if it is a file system superblock.
 * If yes, check the s_flag to see if it has a inline log or outline log.
 * for an inline log the s_logdev should match the input device name's
 * major and minor number. If not, an error is returned and logredo exit.
 * If no error, the logredo read the log superblock according the log info
 * in the fs superblock.
 * If the device name does not represent a filesystem device, then logredo
 * read the LOGPSIZE bytes from the log page 1 location. If it indicates
 * a log device, then open the filesystems according to the log superblock's
 * active list. For each filesystem in the active list, read its superblock
 * if one of the superblock indicates that it uses an inline log, return
 * an error. It is a system code bug if some filesystems use inline log
 * and some use outline log.
 * If the superblock indicates it used an outline log, check the superblock's
 * s_logdev to match the input device name's major and minor numbers.
 * If one of them does not match, return error. -- It is a system code bug
 * if some match and some do not match; -- either all of them should match
 * or none of them. The AIX logredo never checks s_logdev against the input log device.
 * We should check here.
 *
 * for outline log, logredo will be called once to cover all the file
 * systems in the log superblock's active list.
 * For inline log, logredo will be called many times. Each time is for
 * one file system. The log superblock's active list has nothing. The
 * logmajor and logminor contains file system's major and minor number.
 *
 *.....................................................................
 * logredo handles support EA:
 *.....................................................................
 * There is 16-byte EA descriptor which is located in the section I of
 * dinode.
 * The EA can be inline or outline. If it is inlineEA then the data will
 * occupy the section IV of the dinode. The dxd_t.flag will indicate so.
 * If it is outlineEA, dxd_t.flag will indicate so and the single extent
 * is described by EA descriptor.
 *
 * The section IV of dinode has 128 byte. It is shared by the xtroot and
 * inlineEA. The sharing is in FCFS style. If xtree gets the section IV,
 * xtree will never give it away even if xtree is shrink or split.
 * If inlineEA gets it, there is a chance that later inlineEA is freed and
 * so xtree still can get it.
 *
 * for outlineEA, the XJFS will synchronously write the data portion out so there
 * is no log rec for the data, but there is still an INODE log rec for EA
 * descriptor changes and there is a UPDATEMAP log rec for the allocated
 * pxd. If an outlineEA is freed, there are also two log records for it:
 * one is INODE with EA descriptor zeroed out, another is the UPDATEMAP
 * log rec for the freed pxd.
 * For inlineEA, it has to be recorded in the log rec. It is not in a
 * separate log rec. Just one additional segment is added into the
 * INODE log rec. So an INODE log rec can have at most three segments:
 * when the parent and child inodes are in the same page, then there are
 * one segment for parent base inode; one segment for child base inode;
 * and maybe the third one for the child inlineEA data.
 *....................................................................
 * 32-bit vs 64-bit
 * At the first release. assume that a file system will not be larger
 * than 32-bit.
 *....................................................................
 * TBD:
 * the method for handling crashes in the middle of extending a file
 * system is as follows. the size of a filesystem is established from
 * the superblock.s_size field (i.e the sizes in the diskmap
 * and inodemaps are ignored). in extendfs (jfs_cntl.c) the superblock
 * is not updated before the maps have been extended and the new inodes
 * formatted to zeros. no allocations in the new part of the filesystem
 * occur prior to the change in map sizes. if a crash occurs just
 * before updating the superblock, the map sizes will be their old
 * values. in this case the maps as files may be bigger than necessary.
 * if the crash occurs just after writing the super block, the map sizes
 * are fixed up here.
 */
/*
 * jfs_logredo() - replay the log; see the design notes in the comment
 * block that precedes this one.
 *
 * PARAMETERS:
 *	pathname		- not referenced in this function
 *	fp			- open stream handed to findLog() and, when an
 *				  interrupted extendfs is detected, to
 *				  recoverExtendFS()
 *	use_2nd_aggSuper	- stored in use_2ndary_agg_superblock, which
 *				  selects the aggregate superblock copy used
 *				  by rdwrSuper()
 *
 * RETURNS:
 *	0 on success.  On the paths that leave through error_out a
 *	positive error code is negated before it is returned, and
 *	ENOMEM25 is returned when replay succeeded but
 *	Insuff_memory_for_maps is set.  The early "return" statements
 *	near the top hand back the callee's code unmodified.
 */
int jfs_logredo(caddr_t pathname, FILE *fp, int32_t use_2nd_aggSuper)
{
	int rc;
	int k, logaddr, nextaddr, lastaddr, nlogrecords;
	int syncrecord = 0;
	struct lrd ld;
	int lowest_lr_byte = 2 * LOGPSIZE + LOGPHDRSIZE;
	int highest_lr_byte = 0;
	int log_has_wrapped = 0;
	int logend;
	int in_use;

	/*
	 * store away the indicator of which aggregate superblock
	 * to use
	 */
	use_2ndary_agg_superblock = use_2nd_aggSuper;

	/*
	 * point mntinfo at bufsize and seed bufsize with sizeof(int).
	 * NOTE(review): looks like a leftover from an older "grow the
	 * buffer until the vmount struct fits" loop -- confirm it is
	 * still needed.
	 */
	mntinfo = (char *) &bufsize;
	bufsize = sizeof (int);

	/*
	 * locate the log and learn whether it is currently in use
	 */
	rc = findLog(fp, &in_use);
	if (rc < 0) {
		fsck_send_msg(lrdo_DEVOPNREADERROR);
		return (rc);
	}

	/* recover from extendfs() ? */
	if (Log.location & INLINELOG && (vopen[0].status & FM_EXTENDFS)) {
		fsck_send_msg(lrdo_REXTNDBEGIN);
		rc = recoverExtendFS(fp);
		fsck_send_msg(lrdo_REXTNDDONE);
		return rc;
	}

	/*
	 * validate log superblock
	 *
	 * aggregate block size is for log file as well.
	 */
	rc = ujfs_rw_diskblocks(Log.fp,
				(uint64_t) (Log.xaddr +
					    LOGPNTOB(LOGSUPER_B)),
				(unsigned) sizeof (struct logsuper), (char *) &logsup, GET);
	if (rc != 0) {
		fsck_send_msg(lrdo_CANTREADLOGSUP);
		rc = LOGSUPER_READ_ERROR;
		goto error_out;
	}
	ujfs_swap_logsuper(&logsup);

	if (logsup.magic != LOGMAGIC) {
		fsck_send_msg(lrdo_LOGSUPBADMGC);
		rc = NOT_LOG_FILE_ERROR;
		goto error_out;
	}

	if (logsup.version > LOGVERSION) {
		fsck_send_msg(lrdo_LOGSUPBADVER);
		rc = JFS_VERSION_ERROR;
		goto error_out;
	}

	if (Log.location & OUTLINELOG) {
		struct stat st;

		if ((rc = fstat(fileno(Log.fp), &st)))
			goto error_out;

		Log.devnum = st.st_rdev;

		if (in_use) {
			fsck_send_msg(lrdo_LOGINUSE);
			return LOG_IN_USE;
		}
	}

	/* nothing to replay; for an inline log still refresh the fs superblock */
	if (logsup.state == LOGREDONE) {
		fsck_send_msg(lrdo_ALREADYREDONE);
		if (Log.location & INLINELOG)
			if ((rc = updateSuper(0)) != 0) {
				fsck_send_msg(lrdo_CANTUPDLOGSUP);
				return (rc);
			}
		return (0);
	}

	Log.size = logsup.size;
	Log.serial = logsup.serial;

	/*
	 * find the end of log
	 */
	logend = findEndOfLog();

	if (logend < 0) {
		fsck_send_msg(lrdo_LOGEND, logend);

		fsck_send_msg(lrdo_LOGENDBAD1);
		logError(LOGEND, 0);
		ujfs_swap_logsuper(&logsup);
		/*
		 * Write the log superblock back.  The result is not
		 * checked: the failure reported to the caller is logend.
		 *
		 * NOTE(review): LOGPSIZE bytes are written starting at
		 * &logsup while the read above transfers
		 * sizeof(struct logsuper) -- confirm the struct is at
		 * least LOGPSIZE bytes long.
		 */
		(void) ujfs_rw_diskblocks(Log.fp,
					  (Log.xaddr + LOGPNTOB(LOGSUPER_B)),
					  (unsigned long) LOGPSIZE, (char *) &logsup, PUT);
		rc = logend;
		goto error_out;
	}

	/*
	 * allocate/initialize logredo runtime data structures and
	 * initialize each file system associated with the log based on
	 * the contents of its superblock
	 */
	if ((rc = logredoInit()) != 0) {
		fsck_send_msg(lrdo_INITFAILED, rc, errno);
		goto error_out;
	}

	highest_lr_byte = logsup.size * LOGPSIZE - LOGRDSIZE;

	if ((logend < lowest_lr_byte) || (logend > highest_lr_byte)) {
		fsck_send_msg(lrdo_LOGEND, logend);

		fsck_send_msg(lrdo_LOGENDBAD2);
		rc = INVALID_LOGEND;
		goto error_out;
	}

	/*
	 *      replay log
	 *
	 * read log backwards and process records as we go.
	 * reading stops at place specified by first SYNCPT we
	 * encounter.
	 */
	nlogrecords = lastaddr = 0;
	nextaddr = logend;

	do {
		logaddr = nextaddr;
		nextaddr = logRead(logaddr, &ld, afterdata);
		DBG_TRACE(("Logaddr=%x\nNextaddr=%x\n", logaddr, nextaddr))
		    nlogrecords += 1;
		/*
		 *
		 * Validate the nextaddr as much as possible
		 *
		 */
		if (nextaddr < 0) {
			fsck_send_msg(lrdo_NEXTADDRINVALID);
			rc = nextaddr;
			goto error_out;
		}

		if ((nextaddr < lowest_lr_byte)
		    || (nextaddr > highest_lr_byte)) {
			fsck_send_msg(lrdo_NEXTADDROUTRANGE, nextaddr);
			rc = INVALID_NEXTADDR;
			goto error_out;
		}

		if (nextaddr == logaddr) {
			fsck_send_msg(lrdo_NEXTADDRSAME, nextaddr);
			rc = NEXTADDR_SAME;
			goto error_out;
		}

		/*
		 * reading backwards, a larger next address means the
		 * read wrapped around the log; allow that only once
		 */
		if (nextaddr > logaddr) {
			if (log_has_wrapped) {
				fsck_send_msg(lrdo_LOGWRAPPED);
				rc = LOG_WRAPPED_TWICE;
				goto error_out;
			} else {
				log_has_wrapped = -1;
			}
		}
		/*
		 *
		 * The addresses seem ok.  Process the current record.
		 *
		 */
		switch (ld.type) {
		case LOG_COMMIT:
			rc = doCommit(&ld);
			if (rc) {
				fsck_send_msg(lrdo_BADCOMMIT, logaddr);

				goto error_out;
			}
			break;
		case LOG_MOUNT:
			fsck_send_msg(lrdo_MOUNTRECORD, logaddr);

			rc = doMount(&ld);
			if (rc) {
				fsck_send_msg(lrdo_BADMOUNT, logaddr);

				goto error_out;
			}
			break;

		case LOG_SYNCPT:
			fsck_send_msg(lrdo_SYNCRECORD, logaddr);

			rc = 0;
			/* only the first sync point seen fixes where replay stops */
			if (lastaddr == 0) {
				syncrecord = logaddr;
				lastaddr = (ld.log.syncpt.sync == 0)
				    ? logaddr : ld.log.syncpt.sync;
			}
			break;

		case LOG_REDOPAGE:
			DBG_TRACE(("jfs_logredo:Case Log_redoPage"))
			    rc = doAfter(&ld, logaddr);
			if (rc) {
				fsck_send_msg(lrdo_BADREDOPAGE, logaddr);
				goto error_out;
			}
			break;

		case LOG_NOREDOPAGE:
			DBG_TRACE(("jfs_logredo:Case Log_noredopage"))
			    rc = doNoRedoPage(&ld);
			if (rc) {
				fsck_send_msg(lrdo_BADNOREDOPAGE, logaddr);
				goto error_out;
			}
			break;

		case LOG_NOREDOINOEXT:
			DBG_TRACE(("jfs_logredo:Case Log_noredoinoext"))
			    rc = doNoRedoInoExt(&ld);
			if (rc) {
				fsck_send_msg(lrdo_BADNOREDOINOEXT, logaddr);
				goto error_out;
			}
			break;

		case LOG_UPDATEMAP:
			rc = doUpdateMap(&ld);
			if (rc) {
				fsck_send_msg(lrdo_BADUPDATEMAP, logaddr);
				goto error_out;
			}
			break;

		default:
			fsck_send_msg(lrdo_UNKNOWNTYPE, logaddr);
			rc = UNRECOG_LOGRECTYP;
			goto error_out;
		}

		/*
		 * Every case above already leaves the loop on a non-zero
		 * rc, so the two checks below are only a backstop.
		 */
		if (rc < 0) {
			fsck_send_msg(lrdo_ERRORNEEDREFORMAT);
			goto error_out;
		}

		if (rc != 0) {
			fsck_send_msg(lrdo_ERRORCANTCONTIN);
			goto error_out;
		}

		/*
		 * If the transaction just completed was the last
		 * for the current transaction, then flush the
		 * buffers.
		 */
		if (end_of_transaction != 0) {
			for (k = 1; k < NBUFPOOL; k++) {
				if ((rc = bflush(k, &buffer[k - 1])) != 0)
					goto error_out;
			}
			end_of_transaction = 0;
		}

	} while (logaddr != lastaddr);
	/*
	 * If any 'dtpage extend' records were processed, then we need
	 * to go back and rebuild their freelists.  This cannot be done
	 * when the 'dtpage extend' record is processed, since there may
	 * be records processed later which affect the previous (shorter)
	 * version of the dtpage.  Only after all these records are processed
	 * can we safely and accurately rebuild the freelist.
	 */
	if (numExtDtPg != 0) {
		/*
		 * Stop on failure: rc would otherwise be overwritten by
		 * the flush loop below and the error lost.
		 */
		if ((rc = doExtDtPg()) != 0)
			goto error_out;
	}

	/*
	 * flush data page buffer cache
	 *
	 * A failed flush must not be followed by marking the file
	 * systems clean, so bail out the same way the in-loop flush
	 * above does.
	 */
	for (k = 1; k < NBUFPOOL; k++) {
		if ((rc = bflush(k, &buffer[k - 1])) != 0)
			goto error_out;
	}

	/*
	 *      finalize file systems
	 *
	 * update allocation map and superblock of file systems
	 * of volumes which are open if they were modified here.
	 * i.e. if they were not previously unmounted cleanly.
	 */
	for (k = 0; k < MAX_ACTIVE; k++) {
		if (vopen[k].state != VOPEN_OPEN)
			continue;

		if ((rc = updateMaps(k)) != 0) {
			fsck_send_msg(lrdo_ERRORCANTUPDMAPS);
			goto error_out;
		}

		/* Make sure all changes are committed to disk before we
		 * mark the superblock clean
		 */
		ujfs_flush_dev(vopen[k].fp);

		if ((rc = updateSuper(k)) != 0) {
			fsck_send_msg(lrdo_ERRORCANTUPDFSSUPER);
			goto error_out;
		}

		/* sync superblock before journal is finalized */
		ujfs_flush_dev(vopen[k].fp);
	}

	/*
	 *      finalize log.
	 *
	 * clear active list.
	 * If this is a fully replayed log then it can be moved to earlier
	 * versions of the operating system.  Therefore switch the magic
	 * number to the earliest level.
	 */
	if (logsup.state != LOGREADERR) {
		for (k = 0; k < MAX_ACTIVE; k++)
			uuid_clear(logsup.active[k]);

		logsup.end = logend;
		logsup.state = LOGREDONE;
		logsup.magic = LOGMAGIC;
	}
	ujfs_swap_logsuper(&logsup);
	/* the result of this write becomes the function's return code */
	rc = ujfs_rw_diskblocks(Log.fp, (Log.xaddr + LOGPNTOB(LOGSUPER_B)),
				LOGPSIZE, (char *) &logsup, PUT);

	/*
	 * now log some info for the curious
	 */
	fsck_send_msg(lrdo_LOGEND, logend);

	fsck_send_msg(lrdo_RPTSYNCNUM, syncrecord);

	fsck_send_msg(lrdo_RPTSYNCADDR, lastaddr);

	fsck_send_msg(lrdo_RPTNUMLOGREC, nlogrecords);

	fsck_send_msg(lrdo_RPTNUMDOBLK, numdoblk);

	fsck_send_msg(lrdo_RPTNUMNODOBLK, numnodofile);

      error_out:

	/* report failures as negative values */
	if (rc > 0) {
		rc = rc * (-1);
	}

	/*
	 * If everything went ok except that we didn't have
	 * enough memory to deal with the block map, tell chkdsk
	 * to be sure to do a full check and repair, but that a log
	 * format is not necessary
	 */
	if ((rc == 0) && Insuff_memory_for_maps) {
		rc = ENOMEM25;
	}

	return (rc);
}

/*
 * NAME:        doMount(ld)
 *
 * FUNCTION:    a log mount record is the first-in-time record which is
 *              put in the log so it is the last we want to process in
 *              logredo. so we mark volume as cleanly unmounted in vopen
 *              array. the mount record is imperative when the volume
 *              is a newly made filesystem.
 */
int doMount(struct lrd *ld)
{				/* pointer to record descriptor */
	int vol, status;

	vol = ld->aggregate;

	status = vopen[vol].status;
	DBG_TRACE(("Logredo:domount: status=%d\n", status))

	    if (!(status & (FM_LOGREDO | FM_DIRTY)))
		vopen[vol].status = FM_CLEAN;

	return (0);
}

/*
 * NAME:        openVol(vol)
 *
 * FUNCTION:    open the aggregate/volume specified.
 *              check if it was cleanly unmounted. also check log
 *              serial number. initialize disk and inode mpas.
 */
int openVol(int vol)
{				/* device minor number of aggregate/lv */
	int rc, l2agsize, agsize;
	int64_t fssize;		/* number of aggr blks in the aggregate/lv */
	struct superblock sb;
	int aggsb_numpages;

	if (Log.location & OUTLINELOG) {
		/* First check if this is the already opened volume */
		if (!uuid_compare(vopen[vol].uuid, primary_vol.uuid))
			vopen[vol].fp = primary_vol.fp;
		else {
			vopen[vol].fp = open_by_label(vopen[vol].uuid, 0, 0,
						      NULL, NULL);
			if (vopen[vol].fp == NULL)
				return ENOENT;
		}
	}

	/* read superblock of the aggregate/volume */
	if ((rc = rdwrSuper(vopen[vol].fp, &sb, PB_READ)) != 0) {
		fsck_send_msg(lrdo_CANTREADFSSUPER);

		fsError(READERR, vol, SUPER1_B);
		vopen[vol].state = VOPEN_CLOSED;
		return (FSSUPER_READERROR1);
	}

	/* check magic number and initialize version specific
	 * values in the vopen struct for this vol.
	 */
	if (strncmp(sb.s_magic, JFS_MAGIC, (unsigned) strlen(JFS_MAGIC))) {
		fsck_send_msg(lrdo_FSSUPERBADMAGIC);
		vopen[vol].state = VOPEN_CLOSED;
		return (LOGSUPER_BADMAGIC);
	}
	if (sb.s_version > JFS_VERSION) {
		fsck_send_msg(lrdo_FSSUPERBADMAGIC);
		vopen[vol].state = VOPEN_CLOSED;
		return (LOGSUPER_BADVERSION);
	}

	if (Log.location & OUTLINELOG && (sb.s_flag & (JFS_INLINELOG == JFS_INLINELOG))) {
		fsck_send_msg(lrdo_FSSUPERBADLOGLOC);
		vopen[vol].state = VOPEN_CLOSED;
		return (LOGSUPER_BADLOGLOC);
	}
	vopen[vol].lblksize = sb.s_bsize;
	vopen[vol].l2bsize = sb.s_l2bsize;
	vopen[vol].l2bfactor = sb.s_l2bfactor;
	fssize = sb.s_size >> sb.s_l2bfactor;
	vopen[vol].fssize = fssize;
	vopen[vol].agsize = sb.s_agsize;
	/* LOG2NUM will alter agsize, so use local var (Then why don't we
	   fix LOG2NUM?) */
	agsize = vopen[vol].agsize;
	LOG2NUM(agsize, l2agsize);
	vopen[vol].numag = fssize >> l2agsize;
	if (fssize & (vopen[vol].agsize - 1))
		vopen[vol].numag += 1;
	vopen[vol].l2agsize = l2agsize;

	if (Log.location & INLINELOG) {
		/*
		 * Now that the aggregate superblock has been read, do some
		 * more validation of the log superblock
		 */
		if (logsup.bsize != vopen[vol].lblksize) {
			fsck_send_msg(lrdo_LOGSUPBADBLKSZ);
			return JFS_BLKSIZE_ERROR;
		}

		if (logsup.l2bsize != vopen[vol].l2bsize) {
			fsck_send_msg(lrdo_LOGSUPBADL2BLKSZ);
			return JFS_L2BLKSIZE_ERROR;
		}

		aggsb_numpages = lengthPXD(&sb.s_logpxd) * logsup.bsize / LOGPSIZE;
		if (logsup.size != aggsb_numpages) {
			fsck_send_msg(lrdo_LOGSUPBADLOGSZ);
			return JFS_LOGSIZE_ERROR;
		}
	}
	/*
	 *set lbperpage in vopen.
	 */
	vopen[vol].lbperpage = PSIZE >> vopen[vol].l2bsize;

	/*
	 * was it cleanly umounted ?
	 */
	if (sb.s_state == FM_CLEAN) {
		vopen[vol].status = FM_CLEAN;
		vopen[vol].state = VOPEN_CLOSED;
		return (0);
	}

	/*
	 * get status of volume
	 */
	vopen[vol].status = sb.s_state;
	vopen[vol].is_fsdirty = (sb.s_state & FM_DIRTY);

	/*
	 *check log serial number
	 */
	if (sb.s_logserial != Log.serial) {
		fsck_send_msg(lrdo_FSSUPERBADLOGSER);
		vopen[vol].state = VOPEN_CLOSED;
		fsError(SERIALNO, vol, SUPER1_B);
		return (LOGSUPER_BADSERIAL);
	}

	/* initialize the disk and inode maps
	 */
	if ((rc = initMaps(vol)) != 0) {
		fsck_send_msg(lrdo_INITMAPSFAIL);
		fsError(MAPERR, vol, 0);
		return (rc);
	}
	vopen[vol].state = VOPEN_OPEN;
	return 0;
}

/*
 * NAME:         updateSuper(vol)
 *
 * FUNCTION:     re-read the superblock of aggregate/lv 'vol', set its
 *               s_state from vopen[vol].status and write it back.
 *
 * RETURNS:      0 on success, FSSUPER_READERROR2 when the superblock
 *               cannot be read, otherwise the code returned by the
 *               failed write.
 */
int updateSuper(int vol)
{				/* device minor number of aggregate/lv */
	struct superblock sb;
	int vol_status;
	int rc;

	/* fetch the current superblock of the volume */
	rc = rdwrSuper(vopen[vol].fp, &sb, PB_READ);
	if (rc != 0) {
		fsck_send_msg(lrdo_READFSSUPERFAIL);
		return (FSSUPER_READERROR2);
	}

	/*
	 * Neither FM_DIRTY nor FM_LOGREDO set: the volume is clean.
	 * Otherwise keep the recorded status, minus the FM_EXTENDFS bit.
	 */
	vol_status = vopen[vol].status;
	if ((vol_status & (FM_DIRTY | FM_LOGREDO)) == 0) {
		sb.s_state = FM_CLEAN;
	} else {
		sb.s_state = vol_status & ~FM_EXTENDFS;
	}

	/* write the updated superblock out */
	rc = rdwrSuper(vopen[vol].fp, &sb, PB_UPDATE);
	if (rc != 0)
		fsck_send_msg(lrdo_WRITEFSSUPERFAIL);

	return (rc);
}

/*
 * NAME:        rdwrSuper(fp, sb, rwflag)
 *
 * FUNCTION:    Read or write the aggregate superblock of the file system
 *              open on fp.  The global use_2ndary_agg_superblock selects
 *              which copy is accessed: the one at SUPER2_OFF when it is
 *              set, the one at SUPER1_OFF otherwise.  This routine does
 *              not fall back from one copy to the other on failure.
 *
 * PARAMETERS:  fp     - open aggregate/lv
 *              sb     - PB_READ: receives the superblock after
 *                       ujfs_swap_superblock() has been applied to it;
 *                       otherwise: the superblock to write (not modified)
 *              rwflag - PB_READ to read; any other value (PB_UPDATE)
 *                       writes
 *
 * RETURNS:     0 on success
 *              CANTREAD_PRIMFSSUPER / CANTREAD_2NDFSSUPER on read failure
 *              CANTWRITE_PRIMFSSUPER / CANTWRITE_2NDFSSUPER on write failure
 */
int rdwrSuper(FILE *fp, struct superblock * sb, int32_t rwflag)
{
	int rc;
	uint64_t super_offset;
	union {
		struct superblock super;
		char block[PSIZE];
	} super;

	/*
	 * The aggregate/lv logical block size is not known at this point,
	 * so the superblock is addressed by its fixed byte offset.
	 */
	if (use_2ndary_agg_superblock) {
		super_offset = SUPER2_OFF;
	} else {
		super_offset = SUPER1_OFF;
	}

	/*
	 * read superblock
	 */
	if (rwflag == PB_READ) {
		rc = ujfs_rw_diskblocks(fp, super_offset,
					(unsigned) SIZE_OF_SUPER, super.block, GET);
		if (rc != 0) {
			if (!use_2ndary_agg_superblock) {
				fsck_send_msg(lrdo_READFSPRIMSBFAIL);
				return (CANTREAD_PRIMFSSUPER);
			} else {
				fsck_send_msg(lrdo_READFS2NDSBFAIL);
				return (CANTREAD_2NDFSSUPER);
			}
		}

		*sb = super.super;

		ujfs_swap_superblock(sb);

		return (0);
	}

	/*
	 * write superblock (PB_UPDATE)
	 *
	 * Clear the whole I/O buffer first: only the struct superblock part
	 * is filled in below, and without this the remaining bytes of the
	 * buffer would go to disk as uninitialized stack contents.
	 */
	memset(super.block, 0, sizeof (super.block));
	super.super = *sb;

	ujfs_swap_superblock(&super.super);

	/*
	 * write whichever superblock we're working with.
	 * chkdsk will take care of replicating it.
	 */
	rc = ujfs_rw_diskblocks(fp, super_offset,
				(unsigned) SIZE_OF_SUPER, super.block, PUT);
	if (rc != 0) {
		if (!use_2ndary_agg_superblock) {
			fsck_send_msg(lrdo_WRITEFSPRIMSBFAIL);
			return (CANTWRITE_PRIMFSSUPER);
		} else {
			fsck_send_msg(lrdo_WRITEFS2NDSBFAIL);
			return (CANTWRITE_2NDFSSUPER);
		}
	}

	return (0);
}

/*
 * NAME:        bflush()
 *
 * FUNCTION:    Write a buffer pool page back to its volume if it has been
 *              modified.  Only the extent described by the pxd in the
 *              buffer header is written, so a page shorter than 4k (e.g.
 *              a dtree page) is written with its own length.
 *
 * PARAMETERS:  k   - index in bufhdr[] of the header describing buf
 *              buf - the buffer pool page holding the data
 *
 * RETURNS:     0 if the page was clean or was written successfully
 *              BFLUSH_WRITEERROR if the write failed
 */
int bflush(int32_t k, struct bufpool *buf)
{
	int32_t vol;
	int32_t byte_count;
	int64_t start_blk;
	FILE *dev;
	int rc;

	/* a clean page needs no I/O */
	if (bufhdr[k].modify == 0)
		return (0);

	vol = bufhdr[k].vol;
	dev = vopen[vol].fp;

	/* the pxd gives start and length in aggregate blocks */
	start_blk = addressPXD(&bufhdr[k].pxd);
	byte_count = lengthPXD(&bufhdr[k].pxd) << vopen[vol].l2bsize;

	rc = ujfs_rw_diskblocks(dev,
				(uint64_t) (start_blk << vopen[vol].l2bsize),
				(unsigned) byte_count, (char *) buf, PUT);
	if (rc != 0) {
		fsck_send_msg(lrdo_BUFFLUSHFAIL);
		return (BFLUSH_WRITEERROR);
	}

	/* page is in sync with disk again */
	bufhdr[k].modify = 0;

	return (0);
}

/*
 * NAME:        findLog()
 *
 * FUNCTION:    open the device to see if it's a valid filesystem
 * 		or journal.  If it is a filesystem, determine whether
 * 		the log is inline or external.  If external, find
 * 		the log device: first the one named in log_device (if
 * 		any), then by the log uuid recorded in the superblock.
 *
 * PARAMETERS:  fp     - open device to examine
 *              in_use - cleared on entry; set to 1 when the exclusive
 *                       open of log_device fails, cleared again when that
 *                       device is rejected, and handed to open_by_label()
 *                       for the lookup by uuid
 *
 * RETURNS:     0 when a log was located (Log.fp / Log.location are set)
 *              NOT_FSDEV_ERROR when fp is neither a file system nor a log
 *              NOT_INLINELOG_ERROR when the external log cannot be found
 */
int findLog(FILE *fp, int *in_use)
{
	struct logsuper logsup;
	struct superblock sb;
	int log_ok;

	*in_use = 0;
	/*
	 * try the LV as file system with in-line log
	 */
	if (rdwrSuper(fp, &sb, PB_READ)) {
		fsck_send_msg(lrdo_NOTAFSDEV);
		return NOT_FSDEV_ERROR;
	}

	/*
	 * is the LV a file system ?
	 */
	if (memcmp(sb.s_magic, JFS_MAGIC, sizeof (sb.s_magic)) == 0) {
		/*
		 * does file system contains its in-line log ?
		 */
		if ((sb.s_flag & JFS_INLINELOG) == JFS_INLINELOG) {
			Log.location = INLINELOG;
			Log.fp = fp;
			Log.l2bsize = sb.s_l2bsize;
			Log.xaddr = addressPXD(&sb.s_logpxd) << sb.s_l2bsize;

			/* vopen[0] represents fs if inline log */
			vopen[0].status = sb.s_state;
			vopen[0].fp = fp;

			return 0;
		}
		/* Save fp and uuid */
		primary_vol.fp = fp;
		uuid_copy(primary_vol.uuid, sb.s_uuid);

		/*
		 * External log
		 *
		 * First check device specified on
		 * command line
		 */
		Log.xaddr = 0;
		if (log_device[0]) {
			Log.fp = NULL;
			if (LogOpenMode != O_RDONLY) {
				Log.fp = fopen_excl(log_device, "r+");
				if (Log.fp == NULL)
					*in_use = 1;
			}
			if (Log.fp == NULL) {
				/* fall back to a plain read-only open */
				Log.fp = fopen(log_device, "r");
				if (Log.fp == NULL) {
					printf("Invalid journal specified (%s)\n",
					       log_device);
					goto by_uuid;
				}
			}

			/*
			 * The device is only accepted if its log superblock
			 * can be read, carries the log magic and matches the
			 * log uuid recorded in the file system superblock.
			 * A failed read must not be followed by an inspection
			 * of the (then uninitialized) logsup.
			 */
			log_ok = (ujfs_rw_diskblocks(Log.fp, LOGPNTOB(LOGSUPER_B),
						     sizeof (struct logsuper),
						     &logsup, GET) == 0);
			if (log_ok) {
				ujfs_swap_logsuper(&logsup);
				log_ok = (logsup.magic == LOGMAGIC) &&
				    (uuid_compare(logsup.uuid, sb.s_loguuid) == 0);
			}
			if (!log_ok) {
				fclose(Log.fp);
				*in_use = 0;
				goto by_uuid;
			}
			Log.location = OUTLINELOG;
			return 0;
		}
	      by_uuid:
		/* Log.fp is (re)assigned here, replacing any closed stream */
		Log.fp = open_by_label(sb.s_loguuid, 0, 1, NULL, in_use);

		if (Log.fp != NULL) {
			Log.location |= OUTLINELOG;
			return 0;
		}

		return NOT_INLINELOG_ERROR;
	}
	/*
	 * is this an external log?
	 *
	 * An unreadable log superblock is treated the same as one without
	 * the log magic.
	 */
	if (ujfs_rw_diskblocks(fp, LOGPNTOB(LOGSUPER_B), sizeof (struct logsuper),
			       &logsup, GET)) {
		fsck_send_msg(lrdo_NOTAFSDEV);
		return NOT_FSDEV_ERROR;
	}
	ujfs_swap_logsuper(&logsup);
	if (logsup.magic != LOGMAGIC) {
		fsck_send_msg(lrdo_NOTAFSDEV);
		return NOT_FSDEV_ERROR;
	}
	Log.fp = fp;
	Log.location = OUTLINELOG;

	return 0;
}

/*
 * NOTE(review): <stdlib.h> is included at the top of this file and already
 * declares exit(); this extra declaration appears to be redundant.
 */
extern void exit(int);

/*
 * NAME:        fsError(type,vol,bn)
 *
 * FUNCTION:    Report an error against one aggregate/lv (filesystem):
 *              log the volume, set the global retcode to -1, mark the
 *              volume's status FM_LOGREDO and send the message that
 *              corresponds to the error type.
 *
 * PARAMETERS:  type - error type (OPENERR, MAPERR, DBTYPE, ...); an
 *                     unrecognized type sends no type-specific message
 *              vol  - index of the aggregate/lv in vopen[]
 *              bn   - block/inode number included in the messages that
 *                     take one
 *
 * RETURNS:     always 0
 */
int fsError(int type, int vol, int64_t bn)
{
	long long msg_num = (long long) bn;	/* form the messages expect */

	fsck_send_msg(lrdo_ERRORONVOL, vol);

	retcode = -1;
	vopen[vol].status = FM_LOGREDO;

	switch (type) {
	/* messages without an argument */
	case OPENERR:
		fsck_send_msg(lrdo_OPENFAILED);
		break;
	case MAPERR:
		fsck_send_msg(lrdo_CANTINITMAPS);
		break;
	case SERIALNO:
		fsck_send_msg(lrdo_BADLOGSER);
		break;

	/* messages that carry the block/inode number */
	case DBTYPE:
		fsck_send_msg(lrdo_BADDISKBLKNUM, msg_num);
		break;
	case INOTYPE:
		fsck_send_msg(lrdo_BADINODENUM, msg_num);
		break;
	case READERR:
		fsck_send_msg(lrdo_CANTREADBLK, msg_num);
		break;
	case IOERROR:
		fsck_send_msg(lrdo_IOERRREADINGBLK, msg_num);
		break;
	case LOGRCERR:
		fsck_send_msg(lrdo_BADUPDMAPREC, msg_num);
		break;

	default:
		break;
	}

	return (0);
}

/*
 *      logError(type, logaddr)
 *
 * Error handling for log read errors: sets the global retcode to -1,
 * marks the log superblock state LOGREADERR, sends the message for the
 * given error type (logaddr is included where the message takes it) and
 * flags every open, non-clean volume as FM_LOGREDO.
 *
 * Always returns 0.
 */
int logError(int type, int logaddr)
{
	int idx;

	retcode = -1;
	logsup.state = LOGREADERR;

	switch (type) {
	case LOGWRAP:
		fsck_send_msg(lrdo_LOGWRAP);
		break;
	case LOGEND:
		fsck_send_msg(lrdo_FINDLOGENDFAIL);
		break;
	case IOERROR:
		fsck_send_msg(lrdo_IOERRONLOG, logaddr);
		break;
	case UNKNOWNR:
		fsck_send_msg(lrdo_UNRECOGTYPE, logaddr);
		break;
	case READERR:
		fsck_send_msg(lrdo_LOGREADFAIL, logaddr);
		break;
	default:
		break;
	}

	/* mark all open volumes that are not clean as in error */
	for (idx = 0; idx < MAX_ACTIVE; idx++) {
		if (vopen[idx].state != VOPEN_OPEN)
			continue;
		if (vopen[idx].status == FM_CLEAN)
			continue;
		vopen[idx].status = FM_LOGREDO;
	}

	return (0);
}

/*
 *	recoverExtendFS()
 *
 * function: recover crash while in extendfs() for inline log;
 *
 * The routine compares the map size recorded in the bmap global control
 * page with the size recorded in the superblock.  If the bmap is larger,
 * the file system is rolled forward to the post-extendfs state (label
 * postx); otherwise it is rolled back to the pre-extendfs state.
 *
 * note: fs superblock fields remains pre-extendfs state,
 * while that bmap file, fsck and inline log area may be in
 * unknown state;
 *
 * at entry, only log type/lv has been validated;
 * for inline log: vopen[0], fs fp = log fp;
 *
 * parameter: fp - open aggregate/lv (also the log device for inline log)
 *
 * returns: 0 on success; EXTENDFS_FAILRECOV after sending
 *          lrdo_REXTNDFAIL (with errno) on any handled failure.
 *
 * Working storage: buffer[0] holds the superblock, buffer[1] the page
 * with the primary block allocation map inode, buffer[2] is a utility
 * page that is reused for every other read.
 */
static int recoverExtendFS(FILE *fp)
{
	struct superblock *sbp;
	struct dinode *dip1, *dip2;
	struct dbmap *bgcp;
	xtpage_t *p;
	int64_t lmchild = 0, xaddr, xoff, barrier, t64, agsize;
	uint8_t lmxflag;
	int32_t i;
	char *dip, *bp;
	pxd_t temp_pxd;

	/*
	 * read bmap global control page
	 */
	/* read superblock yet again */
	sbp = (struct superblock *) &buffer[0];
	if (rdwrSuper(fp, sbp, PB_READ))
		goto errout;

	/* read primary block allocation map inode */
	dip = (char *) &buffer[1];
	if (ujfs_rw_diskblocks(fp, AITBL_OFF, PSIZE, dip, GET)) {
		fsck_send_msg(lrdo_EXTFSREADFSSUPERFAIL);
		goto errout;
	}

	/* locate the inode in the buffer page */
	dip1 = (struct dinode *) dip;
	dip1 += BMAP_I;

	bp = (char *) &buffer[2];	/* utility buffer */

	/* start from root in dinode */
	p = (xtpage_t *) & dip1->di_btroot;
	/* is this page leaf ? */
	if (p->header.flag & BT_LEAF)
		goto rdbgcp;

	/* traverse down leftmost child node to leftmost leaf of xtree */
	do {
		/* read in the leftmost child page */
		t64 = addressXAD(&p->xad[XTENTRYSTART]) << sbp->s_l2bsize;
		if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, GET)) {
			fsck_send_msg(lrdo_EXTFSREADBLKMAPINOFAIL);
			goto errout;
		}

		p = (xtpage_t *) bp;
		/* is this page leaf ? */
		if (p->header.flag & BT_LEAF)
			break;
	} while (1);

      rdbgcp:
	/*
	 * the first extent of the leftmost leaf is read as the bmap
	 * global control page (into the utility buffer)
	 */
	t64 = addressXAD(&p->xad[XTENTRYSTART]) << sbp->s_l2bsize;
	if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, GET)) {
		fsck_send_msg(lrdo_EXTFSREADBLKFAIL1, (long long) t64);
		goto errout;
	}
	bgcp = (struct dbmap *) bp;

	/*
	 * recover to pre- or post-extendfs state ?:
	 *
	 * a bmap that already describes more blocks than the superblock
	 * size means roll forward; agsize is only set on that path and is
	 * only used after label postx.
	 */
	if (__le64_to_cpu(bgcp->dn_mapsize) > (sbp->s_size >> sbp->s_l2bfactor)) {
		agsize = __le64_to_cpu(bgcp->dn_agsize);
		goto postx;
	}

	/*
	 *    recover pre-extendfs state
	 */
	/*
	 * reset block allocation map inode (xtree root)
	 */
	/* read 2ndary block allocation map inode */
	t64 = addressPXD(&sbp->s_ait2) << sbp->s_l2bsize;
	if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, GET)) {
		fsck_send_msg(lrdo_EXTFSREADBLKFAIL2, (long long) t64);
		goto errout;
	}
	dip2 = (struct dinode *) bp;
	dip2 += BMAP_I;

	/*
	 * Reset primary bam inode with 2ndary bam inode
	 *
	 * Not forgetting to reset di_ixpxd since they are in different
	 * inode extents.
	 */
	memcpy((void *) &temp_pxd, (void *) &(dip1->di_ixpxd), sizeof (pxd_t));
	memcpy(dip1, dip2, DISIZE);
	memcpy((void *) &(dip1->di_ixpxd), (void *) &temp_pxd, sizeof (pxd_t));

	/* write the page holding the restored primary inode back */
	if (ujfs_rw_diskblocks(fp, AITBL_OFF, PSIZE, dip, PUT)) {
		/*
		 * NOTE(review): the other messages cast their block
		 * argument to (long long); AITBL_OFF is passed as-is
		 * here - confirm its type matches the message format.
		 */
		fsck_send_msg(lrdo_EXTFSWRITEBLKFAIL1, AITBL_OFF);
		goto errout;
	}

	/*
	 * backout bmap file to fs size:
	 *
	 * trim xtree to range specified by i_size:
	 * xtree has been grown in append mode and
	 * written from right to left, bottom-up;
	 */
	/* barrier: first block offset beyond the (restored) inode size */
	barrier = __le64_to_cpu(dip1->di_size) >> sbp->s_l2bsize;

	/* start with root */
	/* xaddr == 0 marks "current page is the in-inode root" */
	xaddr = 0;
	p = (xtpage_t *) & dip1->di_btroot;
	lmxflag = p->header.flag;
	p->header.next = 0;
	if (lmxflag & BT_INTERNAL) {
		/* save leftmost child xtpage xaddr */
		lmchild = addressXAD(&p->xad[XTENTRYSTART]);
	}

	/*
	 * scan each level of xtree via leftmost descend
	 */
	while (1) {
		/*
		 * scan each xtpage of current level of xtree
		 */
		while (1) {
			/*
			 * scan each xad in current xtpage
			 */
			for (i = XTENTRYSTART; i < p->header.nextindex; i++) {
				/* test if extent is of interest */
				xoff = offsetXAD(&p->xad[i]);
				if (xoff < barrier)
					continue;

				/*
				 * barrier met in current page
				 */
				assert(i > XTENTRYSTART);
				/* update current page */
				/* truncate the page: entries from i on are dropped */
				p->header.nextindex = i;
				if (xaddr) {
					/* discard further right sibling
					 * pages
					 */
					/*
					 * only non-root pages are written
					 * here; t64 still holds the byte
					 * offset this page was read from
					 */
					p->header.next = 0;
					if (ujfs_rw_diskblocks(fp, t64, PSIZE, p, PUT)) {
						fsck_send_msg(lrdo_EXTFSWRITEBLKFAIL2, (long long) t64);
						goto errout;
					}
				}

				goto nextLevel;
			}	/* end for current xtpage scan */

			/* barrier was not met in current page */

			/* read in next/right sibling xtpage */
			xaddr = p->header.next;
			if (xaddr) {
				if (xaddr >= barrier) {
					p->header.next = 0;
					if (ujfs_rw_diskblocks(fp, t64, PSIZE, p, PUT)) {
						fsck_send_msg(lrdo_EXTFSWRITEBLKFAIL3, (long long) t64);
						/*
						 * NOTE(review): every other I/O
						 * failure in this function does
						 * "goto errout"; this break only
						 * leaves the current level scan
						 * and recovery continues - confirm
						 * that is intended.
						 */
						break;
					}
				}

				t64 = xaddr << sbp->s_l2bsize;
				if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, GET)) {
					fsck_send_msg(lrdo_EXTFSREADBLKFAIL3, (long long) t64);
					goto errout;
				}

				p = (xtpage_t *) bp;
			} else
				break;
		}		/* end while current level scan */

		/*
		 * descend: read leftmost xtpage of next lower level of xtree
		 */
	      nextLevel:
		if (lmxflag & BT_INTERNAL) {
			/* get the leftmost child page  */
			xaddr = lmchild;
			t64 = xaddr << sbp->s_l2bsize;
			if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, GET)) {
				fsck_send_msg(lrdo_EXTFSREADBLKFAIL4, (long long) t64);
				goto errout;
			}

			p = (xtpage_t *) bp;

			lmxflag = p->header.flag;
			if (lmxflag & BT_INTERNAL) {
				/* save leftmost child xtpage xaddr */
				lmchild = addressXAD(&p->xad[XTENTRYSTART]);
			}
		} else
			break;	/* the level just scanned was the leaf level */
	}			/* end while level scan */

	/*
	 * reconstruct map;
	 *
	 * readBmap() init blocks beyond fs size in the last
	 * partial dmap page as allocated which might have been
	 * marked as free by extendfs();
	 */
	/* fake log opened/validated */
	Log.serial = sbp->s_logserial;

	/*
	 *  reconstruct maps
	 */
	/* open LV and initialize maps  */
	if (logredoInit()) {
		fsck_send_msg(lrdo_EXTFSINITLOGREDOFAIL);
		goto errout;
	}

	/* bypass log replay */

	/* update/write maps */
	/* NOTE(review): the result of updateMaps() is not checked here */
	updateMaps(0);

	/*
	 * reformat log
	 *
	 * request reformat original log  (which might have been
	 * overwritten by extendfs() and set superblock clean
	 */
	/* NOTE(review): the result of jfs_logform() is not checked here */
	jfs_logform(fp, sbp->s_bsize, sbp->s_l2bsize, sbp->s_flag,
		    addressPXD(&sbp->s_logpxd), lengthPXD(&sbp->s_logpxd), NULL, NULL);

	/* update superblock */
	/* NOTE(review): the result of updateSuper() is not checked here */
	updateSuper(0);

	fsck_send_msg(lrdo_REXTNDTOPRE);

	return 0;

	/*
	 *    recover post-extendfs state
	 */
      postx:
	/*
	 * update 2ndary bam inode
	 */
	/* read 2ndary block allocation map inode */
	/*
	 * this read reuses the utility buffer, so bgcp must not be
	 * dereferenced past this point (agsize was saved before the jump)
	 */
	t64 = addressPXD(&sbp->s_ait2) << sbp->s_l2bsize;
	if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, GET)) {
		fsck_send_msg(lrdo_EXTFSREADBLKFAIL5, (long long) t64);
		goto errout;
	}
	dip2 = (struct dinode *) bp;
	dip2 += BMAP_I;

	/*
	 * Reset 2ndary bam inode with primary bam inode
	 * Not forgetting to reset di_ixpxd since they are in different
	 * inode extents.
	 */
	memcpy((void *) &temp_pxd, (void *) &(dip2->di_ixpxd), sizeof (pxd_t));
	memcpy(dip2, dip1, DISIZE);
	memcpy((void *) &(dip2->di_ixpxd), (void *) &temp_pxd, sizeof (pxd_t));

	if (ujfs_rw_diskblocks(fp, t64, PSIZE, bp, PUT)) {
		fsck_send_msg(lrdo_EXTFSWRITEBLKFAIL4, (long long) t64);
		goto errout;
	}

	/*
	 * update superblock
	 *
	 * switch the superblock over to the extended values (s_xsize,
	 * s_xfsckpxd, s_xlogpxd) and the agsize taken from the bmap
	 */
	if (!(sbp->s_state & (FM_DIRTY | FM_LOGREDO)))
		sbp->s_state = FM_CLEAN;
	else
		sbp->s_state &= ~FM_EXTENDFS;
	sbp->s_size = sbp->s_xsize;
	sbp->s_agsize = agsize;
	sbp->s_fsckpxd = sbp->s_xfsckpxd;
	sbp->s_fscklog = 0;
	sbp->s_logpxd = sbp->s_xlogpxd;
	sbp->s_logserial = 1;

	if (rdwrSuper(fp, sbp, PB_UPDATE)) {
		fsck_send_msg(lrdo_EXTFSWRITEFSSUPERFAIL);
		goto errout;
	}

	/*
	 * finalize log
	 *
	 * note: new log is valid;
	 */
	/* read log superblock */
	/* it is read from one log page past the start of the (new) log extent */
	t64 = (addressPXD(&sbp->s_logpxd) << sbp->s_l2bsize) + LOGPSIZE;
	if (ujfs_rw_diskblocks(fp, t64, LOGPSIZE, &logsup, GET)) {
		fsck_send_msg(lrdo_EXTFSREADLOGSUPFAIL);
		goto errout;
	}

	logsup.end = findEndOfLog();
	logsup.state = LOGREDONE;

	if (ujfs_rw_diskblocks(fp, t64, LOGPSIZE, &logsup, PUT)) {
		fsck_send_msg(lrdo_EXTFSWRITELOGSUPFAIL);
		goto errout;
	}

	fsck_send_msg(lrdo_REXTNDTOPOST);

	return 0;

      errout:
	/* common failure exit: report errno and the fixed recovery error */
	fsck_send_msg(lrdo_REXTNDFAIL, errno);
	return (EXTENDFS_FAILRECOV);
}

/*
 *
 * NAME:        alloc_dmap_bitrec
 *
 * FUNCTION:    This routine allocates one struct dmap_bitmaps by calling
 *		the chkdsk alloc_wrksp() routine (because that will
 *		allocate high memory during autocheck).  If that fails then
 *		logredo cannot continue bmap processing, so it will set
 *		Insuff_memory_for_maps and make the storage already
 *		allocated to the bmap available for other uses.
 *
 * PARAMETERS:  dmap_bitrec - receives the new record, or NULL on failure
 *
 * RETURNS:     always 0; callers detect failure through *dmap_bitrec
 *		being NULL and/or Insuff_memory_for_maps being set.
 */
int alloc_dmap_bitrec(struct dmap_bitmaps ** dmap_bitrec)
{
	int intermed_rc;

	*dmap_bitrec = NULL;

	intermed_rc = alloc_wrksp((uint32_t) (sizeof (struct dmap_bitmaps)),
				  0,	/* not meaningful from logredo */
				  -1,	/* I am logredo */
				  (void **) dmap_bitrec);

	if ((intermed_rc != 0) || ((*dmap_bitrec) == NULL)) {
		/*
		 * Only hand the bmap storage over once.  If the flag is
		 * already set, alloc_storage() may have started giving out
		 * pieces of that storage; resetting and zeroing the pool
		 * again would clobber memory that is in use.
		 */
		if (!Insuff_memory_for_maps) {
			Insuff_memory_for_maps = -1;
			available_stg_addr = bmap_stg_addr;
			available_stg_bytes = bmap_stg_bytes;
			/*
			 * initialize the storage for its new use
			 * (there is nothing to clear if the bmap storage
			 * was never allocated)
			 */
			if (available_stg_addr != NULL)
				memset((void *) available_stg_addr, 0,
				       available_stg_bytes);
		}
	}

	return (0);
}				/* end alloc_dmap_bitrec() */

/*
 *
 * NAME:        alloc_storage
 *
 * FUNCTION:    This routine allocates memory by calling the chkdsk
 *		alloc_wrksp() routine (because that will allocate high
 *		memory during autocheck).  If that fails and the bmap
 *		was successfully allocated and there's enough of it left,
 *		this routine will return a piece of it.
 *
 * PARAMETERS:  size_in_bytes     - number of bytes wanted
 *		addr_stg_ptr      - receives the address of the storage
 *		bmap_stg_returned - set to 0 when the storage came from
 *				    alloc_wrksp(), -1 when it was carved
 *				    out of the bmap storage
 *
 * RETURNS:     0 on success
 *		ENOMEM0 if the scavenged bmap storage is too small
 *		ENOMEM1 if there is no bmap storage to scavenge
 */
int alloc_storage(int32_t size_in_bytes, void **addr_stg_ptr, int32_t * bmap_stg_returned)
{
	int intermed_rc;

	*bmap_stg_returned = 0;	/* assume we'll get it the usual way */
	*addr_stg_ptr = NULL;

	intermed_rc = alloc_wrksp((uint32_t) size_in_bytes, 0, -1, addr_stg_ptr);

	/* the usual way worked */
	if ((intermed_rc == 0) && ((*addr_stg_ptr) != NULL))
		return (0);

	if ((!Insuff_memory_for_maps) && (bmap_stg_addr != NULL)) {
		/*
		 * we did allocate storage for the bmap
		 * and haven't started cannibalizing it yet
		 */
		Insuff_memory_for_maps = -1;
		available_stg_addr = bmap_stg_addr;
		available_stg_bytes = bmap_stg_bytes;
		/*
		 * initialize the storage for its new use
		 */
		memset((void *) available_stg_addr, 0, available_stg_bytes);
	}

	/*
	 * Logical AND: the flag is a truth value, not a bit mask.  (The
	 * bitwise form only worked because the flag is set to -1.)
	 */
	if (!(Insuff_memory_for_maps && (available_stg_bytes != 0)))
		return (ENOMEM1);

	/*
	 * we may be able to go on anyway
	 */
	if (available_stg_bytes < size_in_bytes) {
		/*
		 * not enough here
		 */
		return (ENOMEM0);
	}

	/* we can scavenge the memory we need from the front of the pool */
	*addr_stg_ptr = available_stg_addr;
	available_stg_bytes -= size_in_bytes;
	available_stg_addr = (char *) (available_stg_addr + size_in_bytes);
	*bmap_stg_returned = -1;

	return (0);
}

#ifdef  _JFS_WIP
/*
 *      nfsisloaded()
 *
 * check whether nfs is loaded, by attempting load("/usr/sbin/probe").
 *
 * Returns 1 if load() yields a non-null entry point, 0 otherwise.  When
 * load() fails with anything other than ENOEXEC the failure is also
 * reported through perror().
 */
static int nfsisloaded()
{
	int (*probe_entry) ();
	int load_errno;

	probe_entry = load("/usr/sbin/probe", 0, 0);
	if (probe_entry)
		return (1);

	/* keep the errno of load() safe from the trace output below */
	load_errno = errno;

	if (load_errno == ENOEXEC) {
		DBG_TRACE(("%s: nfs is not loaded\n", prog))
		    return (0);
	}

	DBG_TRACE(("%s: ", prog))
	    errno = load_errno;
	perror("load");
	return (0);
}
#endif				/* _JFS_WIP */

#ifdef _JFS_DEBUG
/*
 *      xdump()
 *
 * hex dump
 *
 * Traces the memory at saddr as lines of up to 16 bytes: four groups of
 * eight hex digits followed by the printable-ASCII rendering ('.' for
 * anything outside 0x20..0x7e).  A run of identical lines is collapsed
 * into a single "*" line.
 *
 * Note: saddr is first rounded down to a multiple of 4 and bytes are
 * always consumed four at a time, so the dump may start before saddr and
 * read up to three bytes past the requested count.
 *
 * Always returns 0.
 */
int xdump(char *saddr, int count)
{
#define LINESZ     60
#define ASCIISTRT  40
#define HEXEND     36
	int i, j, k, hexdigit;
	int c;
	const char *hexchar = "0123456789ABCDEF";
	char linebuf[LINESZ + 1];
	char prevbuf[LINESZ + 1];
	char *linestart;
	int asciistart;
	char asterisk = ' ';
	int x_scmp(char *s1, char *s2);
	void x_scpy(char *s1, char *s2);

	prevbuf[0] = '\0';

	/*
	 * Round the start address down to a 4-byte boundary.  The remainder
	 * is taken on an unsigned value: a pointer squeezed into a signed
	 * int can be negative, which would make the remainder negative and
	 * move the pointer forward instead of back.
	 */
	saddr -= (unsigned long) saddr % 4;

	for (i = 0; i < count;) {
		memset(linebuf, ' ', LINESZ);
		linestart = saddr;
		asciistart = ASCIISTRT;
		for (j = 0; j < HEXEND;) {
			/* one group: 4 bytes -> 8 hex digits + 4 ascii chars */
			for (k = 0; k < 4; k++) {
				c = *(saddr++) & 0xFF;
				if ((c >= 0x20) && (c <= 0x7e))
					linebuf[asciistart++] = (char) c;
				else
					linebuf[asciistart++] = '.';
				hexdigit = c >> 4;
				linebuf[j++] = hexchar[hexdigit];
				hexdigit = c & 0x0f;
				linebuf[j++] = hexchar[hexdigit];
				i++;
			}
			if (i >= count)
				break;
			linebuf[j++] = ' ';
		}
		linebuf[LINESZ] = '\0';
		if ((x_scmp(linebuf, prevbuf) == 0) && (i < count)) {
			/* same as the previous line: print one '*' per run */
			if (asterisk == ' ') {
				asterisk = '*';
				DBG_TRACE(("    *\n"))
			}
		} else {
			/* %p with a void pointer is the matching conversion */
			DBG_TRACE(("    %p  %s\n", (void *) linestart, linebuf))
			    asterisk = ' ';
			x_scpy(prevbuf, linebuf);
		}
	}
	return (0);
}

/*
 * x_scmp(): string equality test used by xdump().
 *
 * Returns 0 when s1 and s2 hold the same characters up to and including
 * the terminating NUL, -1 otherwise (no ordering is implied).
 */
int x_scmp(char *s1, char *s2)
{
	char *lhs = s1;
	char *rhs = s2;

	/* walk both strings while they agree and s1 has not ended */
	for (;;) {
		if (*lhs == '\0' || *lhs != *rhs)
			break;
		lhs++;
		rhs++;
	}

	/* equal only if both stopped on their terminator */
	return ((*lhs == '\0' && *rhs == '\0') ? 0 : -1);
}

/*
 * x_scpy(): copy the NUL-terminated string s2 into s1, terminator
 * included.  s1 must be large enough; no bounds are checked.
 */
void x_scpy(char *s1, char *s2)
{
	char *dst = s1;
	char *src = s2;
	char ch;

	do {
		ch = *src++;
		*dst++ = ch;
	} while (ch != '\0');
}

/*
 * prtdesc(): trace a one-line description of the REDOPAGE type found in
 * ld->log.redopage.type.  Type combinations not listed below produce no
 * output.
 *
 * Always returns 0.  (The return type is spelled out because implicit
 * int is not valid C since C99.)
 */
int prtdesc(struct lrd *ld)
{
	switch (ld->log.redopage.type) {
	case LOG_XTREE:
		DBG_TRACE((" REDOPAGE:XTREE\n  "))
		    break;
	case (LOG_XTREE | LOG_NEW):
		DBG_TRACE((" REDOPAGE:XTREE_NEW\n  "))
		    break;
	case (LOG_BTROOT | LOG_XTREE):
		DBG_TRACE((" REDOPAGE:BTROOT_XTREE\n  "))
		    break;
	case LOG_DTREE:
		DBG_TRACE((" REDOPAGE:DTREE\n  "))
		    break;
	case (LOG_DTREE | LOG_NEW):
		DBG_TRACE((" REDOPAGE:DTREE_NEW \n "))
		    break;
	case (LOG_DTREE | LOG_EXTEND):
		DBG_TRACE((" REDOPAGE:DTREE_EXTEND\n  "))
		    break;
	case (LOG_BTROOT | LOG_DTREE):
		DBG_TRACE((" REDOPAGE:BTROOT_DTREE\n  "))
		    break;
	case (LOG_BTROOT | LOG_DTREE | LOG_NEW):
		DBG_TRACE((" REDOPAGE:BTROOT_DTREE.NEW\n  "))
		    break;
	case LOG_INODE:
		/*
		 * logredo() updates imap for alloc of inode.
		 */
		DBG_TRACE((" REDOPAGE:INODE\n  "))
		    break;
	case LOG_EA:
		DBG_TRACE((" REDOPAGE:EA\n  "))
		    break;
	case LOG_DATA:
		DBG_TRACE((" REDOPAGE:DATA\n  "))
		    break;
	default:
		/* unknown type: nothing to trace */
		break;
	}
	return (0);
}
#endif				/* _JFS_DEBUG */