//------------------------------------------------------------------------- // Desc: Recover database after a failure. // Tabs: 3 // // Copyright (c) 1991-2006 Novell, Inc. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of version 2 of the GNU General Public // License as published by the Free Software Foundation. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, contact Novell, Inc. // // To contact Novell about this file by physical or electronic mail, // you may find current contact information at www.novell.com // // $Id: recover.cpp 12315 2006-01-19 15:16:37 -0700 (Thu, 19 Jan 2006) dsanders $ //------------------------------------------------------------------------- #include "flaimsys.h" FSTATIC RCODE flmReadLog( FDB * pDb, FLMUINT uiLogEOF, FLMUINT * puiCurrAddrRV, FLMBYTE * pBuf, FLMBOOL * pbIsBeforeImageBlkRV); FSTATIC RCODE flmProcessBeforeImage( FDB * pDb, FLMUINT uiLogEOF, FLMUINT * puiCurrAddrRV, FLMBYTE * pBuf, FLMBOOL bDoingRecovery, FLMUINT uiMaxTransID); /**************************************************************************** Desc: This routine reads the next before-image block from the file. Ret: FERR_OK Indicates that the desired data was successfully read. FERR_INCOMPLETE_LOG Indicates that we would have read beyond the log end-of-file. other other FLAIM error codes ****************************************************************************/ FSTATIC RCODE flmReadLog( FDB * pDb, FLMUINT uiLogEOF, // Address of end of rollback log FLMUINT * puiCurrAddrRV, // This is the current address we are // reading in the log file. It // will be updated after reading the // data FLMBYTE * pBlk, // This is the buffer that is to hold // the data that is read from the // log file FLMBOOL * pbIsBeforeImageBlkRV // Is block a before-image block? ) { RCODE rc = FERR_OK; FFILE * pFile = pDb->pFile; FLMUINT uiFilePos; FLMUINT uiBlkSize = pFile->FileHdr.uiBlockSize; FLMUINT uiBytesRead; F_TMSTAMP StartTime; DB_STATS * pDbStats = pDb->pDbStats; uiFilePos = *puiCurrAddrRV; // Verify that we are not going to read beyond the log EOF if (!FSAddrIsAtOrBelow( uiFilePos + uiBlkSize, uiLogEOF)) { rc = RC_SET( FERR_INCOMPLETE_LOG); goto Exit; } // Position to the appropriate place and read the data if (pDbStats) { pDbStats->bHaveStats = TRUE; pDbStats->LogBlockReads.ui64Count++; pDbStats->LogBlockReads.ui64TotalBytes += uiBlkSize; f_timeGetTimeStamp( &StartTime); } if( RC_BAD( rc = pDb->pSFileHdl->ReadBlock( uiFilePos, uiBlkSize, pBlk, &uiBytesRead))) { if (rc == FERR_IO_END_OF_FILE) { rc = RC_SET( FERR_INCOMPLETE_LOG); } if (pDbStats) { pDbStats->uiReadErrors++; } goto Exit; } if (pDbStats) { flmAddElapTime( &StartTime, &pDbStats->LogBlockReads.ui64ElapMilli); } if (uiBytesRead != uiBlkSize) { if (pDbStats) { pDbStats->uiLogBlockChkErrs++; } rc = RC_SET( FERR_DATA_ERROR); goto Exit; } // Verify the checksum on the block if( RC_BAD( rc = BlkCheckSum( pBlk, CHECKSUM_CHECK, BT_END, uiBlkSize))) { if (pDbStats) { pDbStats->uiLogBlockChkErrs++; } goto Exit; } *pbIsBeforeImageBlkRV = (FLMBOOL)((BH_IS_BI( pBlk)) ? (FLMBOOL)TRUE : (FLMBOOL)FALSE); BH_UNSET_BI( pBlk); // Adjust the current address for the next read uiFilePos += uiBlkSize; if (FSGetFileOffset( uiFilePos) >= pFile->uiMaxFileSize) { FLMUINT uiFileNumber = FSGetFileNumber( uiFilePos); if (!uiFileNumber) { uiFileNumber = FIRST_LOG_BLOCK_FILE_NUMBER( pFile->FileHdr.uiVersionNum); } else { uiFileNumber++; } if (uiFileNumber > MAX_LOG_BLOCK_FILE_NUMBER( pFile->FileHdr.uiVersionNum)) { rc = RC_SET( FERR_DB_FULL); goto Exit; } uiFilePos = FSBlkAddress( uiFileNumber, 0 ); } *puiCurrAddrRV = uiFilePos; Exit: return( rc); } /**************************************************************************** Desc: This routine reads and processes a before-image block record in the log file. The reapply flag indicates whether the block should be written back to the database file. Ret: FERR_OK Indicates that the before-image record was successfully read and processed. FERR_INVALID_BLOCK_LENGTH This error is returned if the block length read from the log file is invalid. other other FLAIM error codes ****************************************************************************/ FSTATIC RCODE flmProcessBeforeImage( FDB * pDb, FLMUINT uiLogEOF, // Address of the end of the rollback // log FLMUINT * puiCurrAddrRV, // This is the current offset we are // reading in the log file. // It will be updated after reading the // data FLMBYTE * pBlk, // This is a pointer to a buffer that // will be used to hold the block that // is read FLMBOOL bDoingRecovery, // Are we doing a recovery as opposed to // rolling back a transaction? FLMUINT uiMaxTransID) // Maximum transaction ID to recover to when // bDoingRecovery is TRUE. This parameter // is ignored when bDoingRecover is FALSE. { RCODE rc = FERR_OK; FFILE * pFile = pDb->pFile; FLMUINT uiBlkAddress; FLMUINT uiBlkLength; FLMUINT uiBytesWritten; FLMBOOL bIsBeforeImageBlk = FALSE; F_TMSTAMP StartTime; DB_STATS * pDbStats = pDb->pDbStats; // Read the block from the log if (RC_BAD( rc = flmReadLog( pDb, uiLogEOF, puiCurrAddrRV, pBlk, &bIsBeforeImageBlk))) goto Exit; // Determine if we want to restore the block. // If we are doing a recovery, restore the block only if // its checkpoint is <= uiMaxTransID. If we are // rolling back a transaction, restore the block only if // it is marked as a before-image block. // For the recovery process, multiple versions // of the same block may be restored if there are // multiple versions in the log. However, because // the versions will be in ascending order in the // file, ultimately, the one with the highest // checkpoint that does not exceed uiMaxTransID // will be restored - which is precisely the one // we want to be restored for a recovery. // For a transaction rollback, it is impossible for us // to see more than one version of a block that is // marked as the before-image version, because we // started from a point in the log where the last // update transaction logged its first block. All // blocks after that point that have the BI bits // set should be restored. Any that do not have // the BI bit set should NOT be restored. if (bDoingRecovery) { if ((FLMUINT)FB2UD( &pBlk [BH_TRANS_ID]) > uiMaxTransID) { goto Exit; } } else if (!bIsBeforeImageBlk) { goto Exit; } // Determine the block address before setting the checksum uiBlkAddress = (FLMUINT)GET_BH_ADDR( pBlk); uiBlkLength = getEncryptSize( pBlk); // Set the block checksum AFTER encrypting BlkCheckSum( pBlk, CHECKSUM_SET, uiBlkAddress, pFile->FileHdr.uiBlockSize); if (pDbStats) { pDbStats->bHaveStats = TRUE; pDbStats->LogBlockRestores.ui64Count++; pDbStats->LogBlockRestores.ui64TotalBytes += uiBlkLength; f_timeGetTimeStamp( &StartTime); } pDb->pSFileHdl->setMaxAutoExtendSize( pFile->uiMaxFileSize); pDb->pSFileHdl->setExtendSize( pFile->uiFileExtendSize); rc = pDb->pSFileHdl->WriteBlock( uiBlkAddress, uiBlkLength, pBlk, pFile->FileHdr.uiBlockSize, NULL, &uiBytesWritten); #ifdef FLM_DBG_LOG flmDbgLogWrite( pFile->uiFFileId, uiBlkAddress, 0, FB2UD( &pBlk [BH_TRANS_ID]), "ROLLBACK"); #endif if (pDbStats) { flmAddElapTime( &StartTime, &pDbStats->LogBlockRestores.ui64ElapMilli); if (RC_BAD( rc)) { pDbStats->uiWriteErrors++; } } Exit: return( rc); } /*************************************************************************** Desc: Writes the log header to disk. The checksum is calculated before writing the log header to disk. *****************************************************************************/ RCODE flmWriteLogHdr( DB_STATS * pDbStats, F_SuperFileHdl * pSFileHdl, FFILE * pFile, FLMBYTE * pucLogHdr, // Log header buffer. FLMBYTE * pucCPLogHdr, // Log header as it was at the time // of the checkpoint. FLMBOOL bIsCheckpoint) // Are we writing a checkpoint? If we // we are, we may write the log header // as is. Otherwise, we need to make // sure we don't write out certain // parts of the log header - they must // not be updated on disk until a // checkpoint actually occurs. { RCODE rc = FERR_OK; FLMUINT uiBytesWritten; FLMUINT uiNewCheckSum; F_FileHdlImp * pCFileHdl = NULL; FLMBYTE * pucTmpLogHdr; F_TMSTAMP StartTime; // Force any recent writes to disk before modifying the log file // header. This routine is generally called after having added // things to the log file. It is critical that any previous writes // be flushed before the header is updated because the header will // generally have been modified to point to the new things that were // added. if (RC_BAD( rc = pSFileHdl->Flush())) { goto Exit; } pucTmpLogHdr = &pFile->pucLogHdrWriteBuf[ 16]; uiBytesWritten = LOG_HEADER_SIZE + 16; // Very Important Note: FlmDbConfig relies on the fact that we will // write out the prefix area of the database header. Do not remove // this code. flmSetFilePrefix( pFile->pucLogHdrWriteBuf, pFile->FileHdr.uiAppMajorVer, pFile->FileHdr.uiAppMinorVer); // Only copy the part of the header that is relevant for this // database version. if( pFile->FileHdr.uiVersionNum < FLM_FILE_FORMAT_VER_4_3) { f_memcpy( pucTmpLogHdr, pucLogHdr, LOG_HEADER_SIZE_VER40); } else { f_memcpy( pucTmpLogHdr, pucLogHdr, LOG_HEADER_SIZE); } // If we are not doing a checkpoint, we don't really want // to write out certain items, so we restore them from // the save info. buffer, which is the buffer that contains // the log header data as it was at the time of the // checkpoint. if (!bIsCheckpoint && pucCPLogHdr) { f_memcpy( &pucTmpLogHdr [LOG_RFL_LAST_CP_FILE_NUM], &pucCPLogHdr [LOG_RFL_LAST_CP_FILE_NUM], 4); f_memcpy( &pucTmpLogHdr [LOG_RFL_LAST_CP_OFFSET], &pucCPLogHdr [LOG_RFL_LAST_CP_OFFSET], 4); f_memcpy( &pucTmpLogHdr [LOG_CURR_TRANS_ID], &pucCPLogHdr [LOG_CURR_TRANS_ID], 4); f_memcpy( &pucTmpLogHdr [LOG_COMMIT_COUNT], &pucCPLogHdr [LOG_COMMIT_COUNT], 4); f_memcpy( &pucTmpLogHdr [LOG_PF_FIRST_BACKCHAIN], &pucCPLogHdr [LOG_PF_FIRST_BACKCHAIN], 4); f_memcpy( &pucTmpLogHdr [LOG_PF_AVAIL_BLKS], &pucCPLogHdr [LOG_PF_AVAIL_BLKS], 4); f_memcpy( &pucTmpLogHdr [LOG_LOGICAL_EOF], &pucCPLogHdr [LOG_LOGICAL_EOF], 4); pucTmpLogHdr [LOG_PF_FIRST_BC_CNT] = pucCPLogHdr [LOG_PF_FIRST_BC_CNT]; f_memcpy( &pucTmpLogHdr [LOG_PF_NUM_AVAIL_BLKS], &pucCPLogHdr [LOG_PF_NUM_AVAIL_BLKS], 4); if( pFile->FileHdr.uiVersionNum >= FLM_FILE_FORMAT_VER_4_3) { f_memcpy( &pucTmpLogHdr [LOG_BLK_CHG_SINCE_BACKUP], &pucCPLogHdr [LOG_BLK_CHG_SINCE_BACKUP], 4); } if( pFile->FileHdr.uiVersionNum >= FLM_FILE_FORMAT_VER_4_31) { f_memcpy( &pucTmpLogHdr [LOG_LAST_RFL_COMMIT_ID], &pucCPLogHdr [LOG_LAST_RFL_COMMIT_ID], 4); } } // If this is not a 4.3 database, make sure the old values // in the log header slots are preserved. if( pFile->FileHdr.uiVersionNum < FLM_FILE_FORMAT_VER_4_3) { // Compatibility for parts that were unused. UD2FBA( 0, &pucTmpLogHdr [20]); UD2FBA( 0, &pucTmpLogHdr [48]); UD2FBA( 0, &pucTmpLogHdr [52]); UD2FBA( 0, &pucTmpLogHdr [84]); // Compatibility for trans active and maint in progress. pucTmpLogHdr [76] = 0; pucTmpLogHdr [79] = 0; } uiNewCheckSum = lgHdrCheckSum( pucTmpLogHdr, FALSE); UW2FBA( (FLMUINT16)uiNewCheckSum, &pucTmpLogHdr [LOG_HDR_CHECKSUM]); // Now update the log header record on disk if (pDbStats) { pDbStats->bHaveStats = TRUE; pDbStats->LogHdrWrites.ui64Count++; pDbStats->LogHdrWrites.ui64TotalBytes += uiBytesWritten; f_timeGetTimeStamp( &StartTime); } if( RC_BAD( rc = pSFileHdl->GetFileHdl( 0, TRUE, &pCFileHdl))) { goto Exit; } #ifdef FLM_WIN if (((F_FileHdlImp *)pCFileHdl)->GetSectorSize() > 512) { // We don't want to use the SectorWrite call when sector // size is > 512 because it will overwrite the file // header, which has not been set up in this buffer. rc = pCFileHdl->Write( 0, uiBytesWritten, pFile->pucLogHdrWriteBuf, &uiBytesWritten); } else { rc = pCFileHdl->SectorWrite( 0, uiBytesWritten, pFile->pucLogHdrWriteBuf, ((F_FileHdlImp *)pCFileHdl)->GetSectorSize(), NULL, &uiBytesWritten, FALSE); } #else rc = pCFileHdl->SectorWrite( 0, uiBytesWritten, pFile->pucLogHdrWriteBuf, 512, NULL, &uiBytesWritten, FALSE); #endif if (RC_BAD( rc)) { if (pDbStats) { pDbStats->uiWriteErrors++; } goto Exit; } if (pDbStats) { flmAddElapTime( &StartTime, &pDbStats->LogHdrWrites.ui64ElapMilli); } if (RC_BAD( rc = pCFileHdl->Flush())) { goto Exit; } Exit: return( rc); } /**************************************************************************** Desc: This routine recovers the database to a physically consistent state. Ret: FERR_OK - Indicates the database has been recovered. other - other FLAIM error codes ****************************************************************************/ RCODE flmPhysRollback( FDB * pDb, FLMUINT uiLogEOF, FLMUINT uiFirstLogBlkAddr, // Address of first log block FLMBOOL bDoingRecovery, // Doing recovery? If so, we will // ignore blocks whose transaction // ID is higher than uiMaxTransID. // Also, we will not check the BI // bits in the logged blocks, because // we are not rolling back a // transaction. FLMUINT uiMaxTransID // Ignored when bDoingRecovery is // FALSE ) { RCODE rc = FERR_OK; FFILE * pFile = pDb->pFile; FLMUINT uiCurrAddr; FLMBYTE * pucBlk = NULL; // If the log is empty, no need to do anything. // A uiFirstLogBlkAddr of zero indicates that there // is nothing in the log to rollback. This will be true // if we are rolling back a transaction, and the transaction // has not logged anything or if we are doing a recovery and // nothing was logged since the last checkpoint. if (uiLogEOF == pFile->FileHdr.uiBlockSize || !uiFirstLogBlkAddr) { goto Exit; // Will return FERR_OK } // Allocate a buffer to be used for reading. #ifdef FLM_WIN if ((pucBlk = (FLMBYTE *)VirtualAlloc( NULL, (DWORD)pFile->FileHdr.uiBlockSize, MEM_COMMIT, PAGE_READWRITE)) == NULL) { rc = MapWinErrorToFlaim( GetLastError(), FERR_MEM); goto Exit; } #elif defined( FLM_LINUX) || defined( FLM_SOLARIS) if( (pucBlk = (FLMBYTE *)memalign( sysconf(_SC_PAGESIZE), pFile->FileHdr.uiBlockSize)) == NULL) { rc = MapErrnoToFlaimErr(errno, FERR_MEM); goto Exit; } #else if (RC_BAD( rc = f_alloc( pFile->FileHdr.uiBlockSize, &pucBlk))) { goto Exit; } #endif // Start from beginning of log and read to EOF restoring before-image // blocks along the way. uiCurrAddr = uiFirstLogBlkAddr; pDb->pSFileHdl->enableFlushMinimize(); while (FSAddrIsBelow( uiCurrAddr, uiLogEOF)) { if (RC_BAD( rc = flmProcessBeforeImage( pDb, uiLogEOF, &uiCurrAddr, pucBlk, bDoingRecovery, uiMaxTransID))) { goto Exit; } } // Force the writes to the file. if (RC_BAD( rc = pDb->pSFileHdl->Flush())) { goto Exit; } Exit: pDb->pSFileHdl->disableFlushMinimize(); // Free the memory handle, if one was allocated. if (pucBlk) { #ifdef FLM_WIN (void)VirtualFree( pucBlk, 0, MEM_RELEASE); #elif defined( FLM_LINUX) || defined( FLM_SOLARIS) free( pucBlk); #else f_free( &pucBlk); #endif } return( rc); }