diff --git a/include/nwnss/support/lnxmbINC/nwreg.h b/include/nwnss/support/lnxmbINC/nwreg.h new file mode 100644 index 0000000..8fbead9 --- /dev/null +++ b/include/nwnss/support/lnxmbINC/nwreg.h @@ -0,0 +1,377 @@ +/**************************************************************************** + | + | (C) Copyright 2004 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. + | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. + | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |***************************************************************************/ +/* ---------------------------------------------------------------------------- + NetWare Registry Information + +This file documents the NetWare registry extentions that allow NLMs +to access to the NetWare registry database. All Registry API's have +been defined and documented below. + +IMPORTANT NOTE: It is possible to make your system unbootable by +corrupting the registry database! + +----------------------------------------------------------------------------- */ + +// +// Well Known +// +#define HKEY LONG +#define REGSAM LONG +//#ifndef BOOL +//#define BOOL unsigned int +//#endif +#define REGEVT struct LoadDefinitionStructure * +#define SECURITY_ATTRIBUTES LONG /* Connection Number */ +// +// Reserved Key Handles. 
+// +#define HKEY_ROOT ((HKEY)0x80000000) +#define HKEY_LOCAL_MACHINE ((HKEY)0x80000002) +#define HKEY_PERFORMANCE_DATA ((HKEY)0x80000004) +#define HKEY_DYN_DATA ((HKEY)0x80000006) + +#define HKLM HKEY_LOCAL_MACHINE +#define HKPD HKEY_PERFORMANCE_DATA +#define HKDD HKEY_DYN_DATA + +// +// Data Types +// +#define REG_NONE 0 // No value type +#define REG_SZ 1 // ASCIIZ null terminated string +#define REG_EXPAND_SZ 2 // ASCIIZ null terminated string (with environment variable references) +#define REG_BINARY 3 // Free form binary +#define REG_DWORD 4 // 32-bit number +#define REG_DWORD_BIG_ENDIAN 5 // 32-bit number +#define REG_LINK 6 // Symbolic Link +#define REG_MULTI_SZ 7 // Multiple ASCIIZ null terminated strings +#define REG_RESOURCE_LIST 8 // Resource list in the resource map +#define REG_FULL_RESOURCE_DESCRIPTOR 9 // Resource list in the hardware description +#define REG_RESOURCE_REQUIREMENTS_LIST 10 +#define REG_QWORD 11 // 64-bit number + + +// +// Open/Create Options +// + +#define REG_OPTION_RESERVED 0x00000000 // Parameter is reserved +#define REG_OPTION_NON_VOLATILE 0x00000000 // Key is preserved when system is rebooted +#define REG_OPTION_VOLATILE 0x00000001 // Key is not preserved when system is rebooted +#define REG_OPTION_CREATE_LINK 0x00000002 // Created key is a symbolic link +#define REG_OPTION_BACKUP_RESTORE 0x00000004 // open for backup or restore special access rules privilege required +#define REG_OPTION_OPEN_LINK 0x00000008 // Open symbolic link + +// +// Key creation/open disposition +// +#define REG_CREATED_NEW_KEY 0x00000001 // New Registry Key created +#define REG_OPENED_EXISTING_KEY 0x00000002 // Existing Key opened + +// +// Security Access Mask values +// +#define KEY_QUERY_VALUE 0x00000001 +#define KEY_SET_VALUE 0x00000002 +#define KEY_CREATE_SUB_KEY 0x00000004 +#define KEY_ENUMERATE_SUB_KEYS 0x00000008 +#define KEY_NOTIFY 0x00000010 +#define KEY_CREATE_LINK 0x00000020 + +#define KEY_READ (KEY_QUERY_VALUE | KEY_ENUMERATE_SUB_KEYS | 
KEY_NOTIFY) +#define KEY_WRITE (KEY_SET_VALUE | KEY_CREATE_SUB_KEY) +#define KEY_EXECUTE (KEY_READ) +#define KEY_ALL_ACCESS (KEY_QUERY_VALUE | KEY_SET_VALUE | KEY_CREATE_SUB_KEY | KEY_ENUMERATE_SUB_KEYS | KEY_NOTIFY | KEY_CREATE_LINK) + +// +// Notify filter values +// +#define REG_NOTIFY_CHANGE_NAME 0x00000001 // Create or delete (child) +#define REG_NOTIFY_CHANGE_ATTRIBUTES 0x00000002 +#define REG_NOTIFY_CHANGE_LAST_SET 0x00000004 // time stamp +#define REG_NOTIFY_CHANGE_SECURITY 0x00000008 + +#define REG_LEGAL_CHANGE_FILTER \ + (REG_NOTIFY_CHANGE_NAME |\ + REG_NOTIFY_CHANGE_ATTRIBUTES |\ + REG_NOTIFY_CHANGE_LAST_SET |\ + REG_NOTIFY_CHANGE_SECURITY) + + +// +// InValid Character Set for Names (Case Preserved, Case Insensitive) +// +#define NULL_CHAR 0x00 +#define SLASH_CHAR 0x5c + + +// +// VALENT Definition +// +typedef struct val_ent +{ + BYTE *vValueName; /* SET BY CALLER -- contains name of value to get */ + LONG vValueLen; /* SET BY OS -- contains size of value */ + void *vValueData; /* SET BY OS -- pointer to where data place in specified buffer */ + LONG vDataType; /* SET BY OS -- contains the data type */ +} VALENT; + +/* ---------------------------------------------------------------------------- + + NetWare Registry APIs + --------------------- + RegCloseKey + RegCreateKeyEx + RegDeleteKey + RegDeleteValue + RegEnumKeyEx + RegEnumValue + RegNotifyChangeKeyValue + RegOpenKeyEx + RegQueryInfoKey + RegQueryValueEx + RegSetValueEx + +---------------------------------------------------------------------------- */ + +extern LONG RegCloseKey(HKEY hKey); + +extern LONG RegCreateKeyEx( + HKEY hKey, + BYTE *SubKey, + LONG Reserved, /* reserved for future use */ + BYTE *reserved0, /* reserved for future use */ + LONG Options, /* volatile or not */ + REGSAM samDesired, /* security access mask */ + LONG Reserved1, /* reserved for future use */ + HKEY *newHKey, /* new handle */ + LONG *rtnDisposition); /* disposition taken */ + +extern LONG RegDeleteKey( + HKEY hKey, 
+ BYTE *ptrSubKey); /* string identifying the subkey to delete */ + +extern LONG RegDeleteValue( + HKEY hKey, + BYTE *ptrValueName); /* name of value to delete */ + +extern LONG RegEnumKeyEx( + HKEY hKey, + LONG dwIndex, /* numeric index of key to enumerate */ + BYTE *Name, /* returned key name */ + LONG *lenName, /* sizeof key name*/ + LONG *Reserved, /* reserved for future use */ + BYTE *Reserved0, /* reserved for future use */ + LONG *Reserved1, /* reserved for future use */ + LONG *ptrLastWriteTime); /* DOS time of last modification of $subkey */ + +extern LONG RegEnumValue( + HKEY hKey, + LONG dwIndex, /* numeric index of key to enumerate */ + BYTE *Name, /* returned value name */ + LONG *lenName, /* sizeof value name */ + LONG *Reserved, /* must be zero */ + LONG *lpType, /* returned type of $valname */ + void *Data, /* data associated with $valname (of data type $type) */ + LONG *sizeData); /* sizeof data returned */ + +extern LONG RegFlushKey( + HKEY hKey); + +extern LONG RegLoadKey( + HKEY hKey, + BYTE *keyName, /* string identifying key to created under the specified HKEY */ + BYTE *pathAndFileName); /* path and file name point the hive created by a RegSaveKey */ + + +extern LONG RegNotifyChangeKeyValue( + HKEY hKey, + BOOL bWatchSubtree, + LONG dwNotifyFilter, + REGEVT hEvent, + BOOL fAsynchronus); + +extern LONG RegOpenKeyEx( + HKEY hKey, + BYTE *SubKey, /* string identifying subkey of $key */ + LONG ulOptions, + REGSAM samDesired, /* requested security access mask */ + HKEY *newHKey); /* returned key handle */ + +extern LONG RegQueryInfoKey( + HKEY hKey, + BYTE *reserved2, + LONG *reserved3, + LONG *Reserved, /* reserved for future use */ + LONG *rtnSubKeys, /* number of subkeys for this key */ + LONG *rtnMaxSubKeyLen, /* size of largest subkey */ + LONG *Reserved0, /* reserved for future use */ + LONG *rtnValues, /* number of values associated with this key */ + LONG *MaxValueNameLen, /* size of largest value name */ + LONG *MaxValueLen, /* size of 
largest value data */
+    LONG Reserved1, /* reserved for future use */
+    LONG *ptrLastWriteTime); /* time of last write (DOS Time Format )*/
+
+extern LONG RegQueryMultipleValues(
+    HKEY hKey,
+    VALENT *pValent,
+    LONG numOfValents,
+    void *data,
+    LONG *totalSize);
+
+
+/*
+ * NOTE(review): the lpData/DataLen comments on this prototype originally read
+ * "returned type of $data" and "returned data associated with $valname", which
+ * looks like the RegQueryValueEx comments shifted up by one parameter. The
+ * wording below follows RegQueryValueEx by analogy -- confirm against the
+ * implementation.
+ */
+extern LONG RegQueryValue(
+    HKEY hKey,
+    BYTE *SubKey, /* string identifying subkey of $key */
+    void *lpData, /* presumably: returned data associated with $subkey -- confirm */
+    LONG *DataLen); /* presumably: sizeof returned data -- confirm */
+
+
+extern LONG RegQueryValueEx(
+    HKEY hKey,
+    BYTE *ptrValueName, /* string identifying value to retrieve */
+    LONG *Reserved, /* reserved for future use */
+    LONG *lpType, /* returned type of data */
+    void *lpData, /* returned data associated with $valname */
+    LONG *DataLen); /* sizeof returned data */
+
+extern LONG RegSaveKey(
+    HKEY hkey,
+    BYTE *pathAndFileName, /* path and file name */
+    SECURITY_ATTRIBUTES SecurityAttrs); /* security attributes */
+
+extern LONG RegSetValue(
+    HKEY hKey,
+    BYTE *SubKey,
+    LONG dataType, /* data type of $data */
+    void *Data, /* data to associate with valname */
+    LONG sizeOfData); /* sizeof $data */
+
+
+extern LONG RegSetValueEx(
+    HKEY hKey,
+    BYTE *ValueName, /* name of value to set */
+    LONG Reserved, /* reserved for future use */
+    LONG dataType, /* data type of $data */
+    BYTE *Data, /* data to associate with valname */
+    LONG sizeOfData); /* sizeof $data */
+
+/*------------------------------------------------------------------------------------------------------------------**
+** extern LONG CloseConfigFile(); * THIS API IS ONLY TO BE USED BY ORION NLMs * **
+** extern void InitializeClusterCfgFileJumpTable(void *, ...); * THIS API IS ONLY TO BE USED BY ORION NLMs * **
+** extern void InitializeLocalCfgFileJumpTable(void); * THIS API IS ONLY TO BE USED BY ORION NLMs * **
+**------------------------------------------------------------------------------------------------------------------*/
+
+/* 
---------------------------------------------------------------------------- + + NetWare Remote Registry Access NCPs + ---------------------------------- + ncp xx 00 f_RegCloseKey + ncp xx 01 f_RegCreateKeyEx + ncp xx 02 f_RegDeleteKey + ncp xx 03 f_RegDeleteValue + ncp xx 04 f_RegEnumKeyEx + ncp xx 05 f_RegEnumValue + ncp xx 06 f_RegNotifyChangeKeyValue + ncp xx 07 f_RegOpenKeyEx + ncp xx 08 f_RegQueryInfoKey + ncp xx 09 f_RegQueryValueEx + ncp xx 10 f_RegSetValueEx + +---------------------------------------------------------------------------- */ + + +/* + Error Codes +*/ +#define REG_ERR_INVALID_HANDLE 0x40000001 +#define REG_ERR_INPUT_PARAMETER_NO_DATA 0x40000002 +#define REG_ERR_VALUE_NOT_SET 0x40000003 +#define REG_ERR_ENUM_INDEX_OUT_OF_RANGE 0x40000004 +#define REG_ERR_NAME_BUFFER_TOO_SMALL 0x40000005 +#define REG_ERR_VALUE_BUFFER_TOO_SMALL 0x40000006 +#define REG_ERR_SAM_CHECK_FAILED 0x40000007 +#define REG_ERR_DATABASE_IS_CORRUPT 0x40000008 +#define REG_ERR_INCOMPLETE_SUBKEY_PATH 0x40000009 +#define REG_ERR_VALUE_SEARCH_FAILED 0x4000000a +#define REG_ERR_INTERNAL_RESOURCE_FAILURE 0x4000000b +#define REG_ERR_DB_WRITE_ERROR 0x4000000c +#define REG_ERR_RESERVED_KEYS_NO_DELETE 0x4000000d +#define REG_ERR_SUBKEY_SEARCH_FAILED 0x4000000e +#define REG_ERR_NOT_ACCESSIBLE 0x4000000f +#define REG_ERR_REG_SZ_ONLY_ALLOWED 0x40000010 +#define REG_ERR_NO_FREE_HANDLE_SLOTS 0x40000011 +#define REG_ERR_DATABASE_NOT_ACCESSIBLE 0x40000012 +#define REG_ERR_INVALID_CHARACTER 0x40000013 +#define REG_ERR_FILE_CREATION_FOR_SAVEKEY 0x40000014 +#define REG_ERR_SERVICE_NOT_AVAILABLE 0x40000015 +#define REG_ERR_VALUE_SIZE_TOO_LARGE 0x40000016 + +/* ---------------------------------------------------------------------------- + + Reserved NetWare PATHS & KEYS under HKEY_LOCAL_MACHINE + +---------------------------------------------------------------------------- */ + +#define REGSTR_SET_PARAMETERS "Setable Parameters" +#define REGSTR_SET_PARAMETER_CURRENT_VALUE "Current Value" +#define 
REGSTR_SET_PARAMETER_PENDING_VALUE "Pending Value" +#define REGSTR_SET_PARAMETER_CONTROL_INFO "Control Info" +#define REGSTR_SET_PARAMETER_DEFAULT_VALUE "Default Value" + +#define REGSTR_SERVER_NAME "Server Name" +#define REGSTR_SERVER_OPERATION_MODE "Operational Mode" + +#define REGSTR_NO_SPLASH "No Splash" +#define REGSTR_PROTOCOL_IPX_INTERNAL_NET_NUMBER "Protocols\\IPX\\Internal Net Number" + + +#define REGSTR_LOAD_ORDER "Load Control" +#define REGSTR_LOAD_ORDER_CREATOR "Load Control\\Creator Info" +#define REGSTR_LOAD_ORDER_STAGE0 "Load Control\\Stage0" +#define REGSTR_LOAD_ORDER_STAGE1 "Load Control\\Stage1" +#define REGSTR_LOAD_ORDER_STAGE2 "Load Control\\Stage2" +#define REGSTR_LOAD_ORDER_STAGE3 "Load Control\\Stage3" +#define REGSTR_LOAD_ORDER_STAGE4 "Load Control\\Stage4" +#define REGSTR_LOAD_ORDER_STAGE5 "Load Control\\Stage5" +#define REGSTR_CREATOR_INFO "Creator Info" +#define REGSTR_LOAD_ORDER_BOOT_INFO "Load Control\\Boot Info" +#define REGSTR_BOOT_NCF_DATE_TIME_INFO "Startup & AutoExec Info" + +#define REGSTR_NCP_CONTROL_INFO "NCP Control" + +#define REGSTR_SERVER_GUID_INFO "Server GUID" +#define REGSTR_LAST_ISSUED_GUID "Last Issued GUID" + +#define REGSTR_ALERT_CONTROL "Alert Control" + +/* ---------------------------------------------------------------------------- + + Reserved NetWare PATHS & KEYS under HKEY_PERFORMANCE_DATA + +---------------------------------------------------------------------------- */ +#define REGSTR_SERVER_WORKING_SET "Server Working Set" +#define REGSTR_LEGACY_FILE_SYSTEM "Legacy File System" +#define REGSTR_LEGACY_CACHE "Legacy Cache" +#define REGSTR_LEGACY_PROCESSES "Legacy Processes" +#define REGSTR_LEGACY_DIRECTORY_CACHE "Legacy Directory Cache" +#define REGSTR_LEGACY_VOLUMES "Legacy Volumes" diff --git a/src/nwnss/CMakeLists.txt b/src/nwnss/CMakeLists.txt index 2f7b1e8..e8b78a5 100644 --- a/src/nwnss/CMakeLists.txt +++ b/src/nwnss/CMakeLists.txt @@ -315,6 +315,14 @@ add_library(nwnss SHARED nssUserspaceProvider.c 
lsa/lsaXattrUserspace.c lsa/lsaXattr.c + zlss/xaction.c + zlss/zlog.c + zlss/zio.c + zlss/zfsVol.c + zlss/zfsPool.c + zlss/zfsFileMap.c + zlss/eflTree.c + zlss/myBTree.c zlss/purgeLog.c zlss/salvageLog.c zlss/purgeTree.c diff --git a/src/nwnss/zlss/eflTree.c b/src/nwnss/zlss/eflTree.c new file mode 100644 index 0000000..1e88eb5 --- /dev/null +++ b/src/nwnss/zlss/eflTree.c @@ -0,0 +1,5441 @@ +/**************************************************************************** + | + | (C) Copyright 2002, 2003 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. + | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. + | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |*************************************************************************** + | + | Novell Storage Services (NSS) + | + |--------------------------------------------------------------------------- + | + | $Author: blarsen $ + | $Date: 2006-01-21 04:09:53 +0530 (Sat, 21 Jan 2006) $ + | + | $RCSfile$ + | $Revision: 1315 $ + | + |--------------------------------------------------------------------------- + | This module is used to: + | Define the btree operations for the Event File List(EFL). 
+ +-------------------------------------------------------------------------*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zParams.h" +#include "comnPublics.h" +#include "comnBeasts.h" +#include "fsm.h" +#include "pssDebug.h" +#include "comnIO.h" +#include "msgIO.h" +#include "eflTree.h" +#include "parse.h" +#include "pssDebug.h" +#include "zlssStartup.h" +#include "zfsAsyncio.h" +#include "zlog.h" +#include "zlssLogicalVolume.h" +#include "comnBeastClass.h" +#include "beastTree.h" +#include "msgName.h" +#include "zfsXTree.h" +#include "zlssUpgrade.h" + +#define EFL_defaultEpochInterval 14 * 24 * 3600 /* 14 days */ + +BOOL EFLDisplay = FALSE; + +#if NSS_DEBUG IS_ENABLED +#define EFL_Printf DBG_DebugPrintf +#else +#define EFL_Printf aprintf +#endif + +NINT EFL_TotalBranches; +NINT EFL_TotalLeafs; +NINT EFL_TotalEntries; + +STATUS EFL_writeTheBeast( + GeneralMsg_s *genMsg, + Volume_s *volume, + ZfsEFLTreeBeast_s *eTree); + +/**************************************************************************** + * + ****************************************************************************/ +void EFL_DisplayNode ( + ZfsEFLTreeBeast_s *treeBeast, + char *location, + char *nameOfBuffer, + EFLTreeNode_s *node, + Blknum_t block) + +{ + NINT i; + ELogLink_s nextLink; + ELogEntry_s *entry; + IoMsg_s iomsg; + Buffer_s *buffer; + GeneralMsg_s genMsg; + ELogNode_s *logNode; + + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + EFL_Printf(LBLUE, MSGNot("%s=%s<%lx(%ld.)>"), location, nameOfBuffer, + (unsigned long)block, (unsigned long)block ); + + if (EFL_IS_ROOT(node)) + { + EFL_Printf(LBLUE, MSGNot("ROOT:")); + } + if (EFL_IS_LEAF(node)) + { + EFL_TotalLeafs++; + EFL_Printf(LBLUE, MSGNot("LEAF")); + if (node->magic != EFL_LEAF_MAGIC) + { + EFL_Printf(LRED, MSGNot(":ERROR(Node does not have correct MAGIC (value is %lx))\n"),node->magic); + zASSERT(node->magic == EFL_LEAF_MAGIC); + return; + } + EFL_Printf(LRED, 
MSGNot("(numRecs=%d)\n"), node->numRecs); + for (i = 0; i < node->numRecs; ++i) + { + EFL_TotalEntries++; + EFL_Printf(YELLOW, MSGNot("<%8Lx, %8x, %8x, %8x, %8x, %d/%d, %d/%d, %d>\n"), + node->ELEAF(i).zid, + node->ELEAF(i).createEpochs, + node->ELEAF(i).modifyEpochs, + node->ELEAF(i).metaDataEpochs, + node->ELEAF(i).deleteEpochs, + node->ELEAF(i).firstLogEntry.blockNum, + node->ELEAF(i).firstLogEntry.offset, + node->ELEAF(i).lastLogEntry.blockNum, + node->ELEAF(i).lastLogEntry.offset, + node->ELEAF(i).numGoodLogEntries); + /* Display event log entries for this ZID */ + nextLink = node->ELEAF(i).firstLogEntry; + for(;;) + { + if (nextLink.blockNum == INVALID_BLK_ZERO) + { + break; + } + READBLK_IO_MSG(iomsg, treeBeast, nextLink.blockNum, CACHE_READ); + buffer = ELOG_ReadPoolBlk(&genMsg, &iomsg); + if (buffer == NULL) + { + zASSERT("Error following log entries for a ZID" == NULL); + break; + } + logNode = (ELogNode_s *)buffer->pBuf.data; + entry = (ELogEntry_s *)&logNode->u.log.data[nextLink.offset]; + zASSERT(node->ELEAF(i).zid == entry->zid); + zASSERT(entry->magic == ELOG_ENTRY_MAGIC); + EFL_DisplayLogEntry(entry, nextLink.offset); + nextLink = entry->nextLogEntry; + CACHE_RELEASE(buffer); + } + } + } + else + { + EFL_TotalBranches++; + EFL_Printf(LGREEN, MSGNot("BRANCH")); + if (node->magic != EFL_BRANCH_MAGIC) + { + EFL_Printf(LRED, ":ERROR(Node does not have correct MAGIC (value is %lx))\n",node->magic); + zASSERT(node->magic == EFL_BRANCH_MAGIC); + return; + } + EFL_Printf(LBLUE, MSGNot("(numRecs=%d)\n"), node->numRecs); + for (i = 0; i < node->numRecs; ++i) + { + EFL_Printf(LBLUE, MSGNot("<%8Lx, %-10d>"), + node->EBRANCH(i).zid, (LONG)node->EBRANCH(i).child); + if ((i % 3) == 2) + { + EFL_Printf(LBLUE, MSGNot("\n")); + } + } + if ((i % 3) != 0) + { + EFL_Printf(LBLUE, MSGNot("\n")); + } + } +} + +/**************************************************************************** + * Display a single node + 
****************************************************************************/
+void EFL_DisplayANode(
+    ZfsEFLTreeBeast_s *treeBeast,
+    char *location,
+    Buffer_s *buffer)
+{
+    EFLTreeNode_s *node = (EFLTreeNode_s *)buffer->pBuf.data;
+    char *type;
+
+    type = EFL_IS_LEAF(node) ? "LEAF" : "BRANCH";
+    EFL_DisplayNode(treeBeast, location, type, node, buffer->volBlk);
+}
+
+/****************************************************************************
+ * Display the node held in buffer and, for a branch with a valid magic,
+ * every subtree below it (depth first, in entry order).
+ *
+ * treeBeast - tree beast used to build the I/O messages for child blocks
+ * buffer    - cache buffer holding the node to start from; the caller keeps
+ *             ownership and releases it
+ *
+ * A child block that cannot be read is reported and skipped so the rest of
+ * the tree is still displayed.
+ ****************************************************************************/
+void EFL_DisplaySubtree(
+    ZfsEFLTreeBeast_s *treeBeast,
+    Buffer_s *buffer)
+{
+    GeneralMsg_s genMsg;
+    EFLTreeNode_s *node = (EFLTreeNode_s *)buffer->pBuf.data;
+    Buffer_s *newBuf;
+    NINT i;
+    IoMsg_s iomsg;
+
+    ASSERT_MPKNSS_LOCK();
+    COMN_SETUP_GENERAL_MSG_NOSA(&genMsg);
+    if (EFL_IS_LEAF(node))
+    {
+        EFL_DisplayNode(treeBeast, MSGNot("Full Tree"), MSGNot("LEAF"), node,
+            buffer->volBlk);
+    }
+    else
+    {
+        EFL_DisplayNode(treeBeast, MSGNot("Full Tree"), MSGNot("BRANCH"), node,
+            buffer->volBlk);
+        /* Only descend when the branch passed its magic check. */
+        if ( node->magic == EFL_BRANCH_MAGIC )
+        {
+            for (i = 0; i < node->numRecs; i++)
+            {
+                READBLK_IO_MSG(iomsg, treeBeast,
+                    node->EBRANCH(i).child, CACHE_READ);
+                SET_DEBUG_ID(iomsg, 0);
+                newBuf = ZFS_ReadPoolBlk( &genMsg, &iomsg);
+                if (newBuf == NULL)
+                {
+                    /* The read can fail; recursing would dereference NULL. */
+                    EFL_Printf(LRED, MSGNot("ERROR(Unable to read child block %d)\n"),
+                        (LONG)node->EBRANCH(i).child);
+                    continue;
+                }
+                EFL_DisplaySubtree(treeBeast, newBuf);
+                CACHE_RELEASE(newBuf);
+            }
+        }
+    }
+}
+
+/****************************************************************************
+ * Given a volume, display the tree
+ ****************************************************************************/
+void EFL_DoDisplayEFLTree(
+    ZfsEFLTreeBeast_s *eTree)
+{
+    GeneralMsg_s genMsg;
+    Buffer_s *buffer;
+    IoMsg_s iomsg;
+
+    ASSERT_MPKNSS_LOCK();
+    COMN_SETUP_GENERAL_MSG_NOSA(&genMsg);
+
+    EFL_Printf( LBLUE, MSGNot("***** EFL B-TREE *****\n"));
+    if (eTree == NULL)
+    {
+        EFL_Printf(LRED, MSGNot("No EFL tree beast found\n"));
+        return;
+    }
+
+    S_LATCH(&eTree->ZFSEFLTREEbeastLatch); /* Latch the tree */
+//    wPause(stdout, -1);
+
+// 
RootDisplayMyCache(ztree); + EFL_TotalEntries = 0; + EFL_TotalBranches = 0; + EFL_TotalLeafs = 0; + EFL_Printf( LBLUE, MSGNot("eflRoot=%d\n"), + eTree->ZFSEFLTREEzbtree.p.btRoot); + if (eTree->zfsBtree.p.btRoot != INVALID_BLK_ZERO) + { + READBLK_IO_MSG(iomsg, eTree, eTree->zfsBtree.p.btRoot, CACHE_READ); + SET_DEBUG_ID(iomsg, 0); + buffer = ZFS_ReadPoolBlk( &genMsg, &iomsg); + zASSERT(buffer != NULL); + EFL_DisplaySubtree(eTree, buffer); + EFL_Printf(LMAGENTA, MSGNot("\nTotal branches = %d\n"), + EFL_TotalBranches); + EFL_Printf(LMAGENTA, MSGNot("Total leafs = %d\n"), + EFL_TotalLeafs); + EFL_Printf(LMAGENTA, MSGNot("Total entries = %d\n"), + EFL_TotalEntries); + CACHE_RELEASE(buffer); + } + else + { + EFL_Printf( LBLUE, MSGNot("Empty B-Tree\n")); + } +// wPause(stdout, 0); + UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch); +} + +/**************************************************************************** + * Called from command line parser to display the given tree + ****************************************************************************/ +void EFL_DisplayEFLTree( + struct PCLSwitchDef_s *switchDef, + NINT index, + void *userParm) +{ + GeneralMsg_s genMsg; + ZfsVolume_s *volume; + + MPKNSS_LOCK(); + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + volume = (ZfsVolume_s *)COMN_VolumeNameLookup(&genMsg, + (unicode_t *)switchDef->ret_value,FALSE,NULL); + + if (volume != NULL) + { + EFL_DoDisplayEFLTree(volume->eflTree); + COMN_Release(&volume); + } + else + { + printf(MSG("Volume not found.\n", 580)); + } + MPKNSS_UNLOCK(); +} + +/**************************************************************************** + * * + ****************************************************************************/ +BOOL EFL_ValidateNode (EFLTreeNode_s *node) +{ + NINT count; + + if (EFL_IS_LEAF(node)) + { + EFLLeafEntry_s *curr; + EFLLeafEntry_s *prev; + + zASSERT(node->magic == EFL_LEAF_MAGIC || node->magic == + (EFL_LEAF_MAGIC | 0x20)); + + if ((node->magic != EFL_LEAF_MAGIC) && + (node->magic != 
(EFL_LEAF_MAGIC))) + { + return FALSE; + } + + prev = &node->ELEAF(0); + + zASSERT(EFL_MAX_LEAF_ENTRIES >= node->numRecs); + if(node->numRecs >= EFL_MAX_LEAF_ENTRIES) + { + return FALSE; + } + + for(count = 1; count < node->numRecs; count++) + { + curr = &node->ELEAF(count); + +// zASSERT(curr->usedAmount <= curr->restrictionAmount); + + /* check if ascending */ + zASSERT(prev->zid < curr->zid); + if(prev->zid >= curr->zid) + { + return FALSE; + } + + /* set previous record and try again */ + prev = curr; + } + } + else + { + EFLBranchEntry_s *curr; + EFLBranchEntry_s *prev; + + zASSERT(node->magic == EFL_BRANCH_MAGIC); + if (node->magic != EFL_BRANCH_MAGIC) + { + return FALSE; + } + + prev = &node->EBRANCH(0); + + zASSERT(EFL_MAX_BRANCH_ENTRIES >= node->numRecs); + if(node->numRecs >= EFL_MAX_BRANCH_ENTRIES) + { + return FALSE; + } + + for(count = 1; count < node->numRecs; count++) + { + curr = &node->EBRANCH(count); + + /* check if ascending */ + zASSERT(prev->zid < curr->zid); + if(prev->zid >= curr->zid) + { + return FALSE; + } + + /* set previous record and try again */ + prev = curr; + } + } + return TRUE; +} + +#if NSS_DEBUG IS_ENABLED +/**************************************************************************** + * + ****************************************************************************/ +void EFL_compare ( + EFLTreeNode_s *node1, + EFLTreeNode_s *node2, + NINT len) +{ + EFL_VALIDATE_NODE(node1); + EFL_VALIDATE_NODE(node2); + zASSERT(memcmp(node1, node2, len) == 0); + return; +} + +/**************************************************************************** + * + ****************************************************************************/ +void EFL_noCompare ( + EFLTreeNode_s *node1, + EFLTreeNode_s *node2, + NINT len) +{ +} + +#endif + +/**************************************************************************** + * EFL beast constructor + *****************************************************************************/ +STATUS EFL_construct( + 
GeneralMsg_s *genMsg,
+    void *beast_LX)
+{
+    ZfsEFLTreeBeast_s *beast = (ZfsEFLTreeBeast_s *)beast_LX;
+
+    ASSERT_MPKNSS_LOCK();
+    /* A freshly constructed beast has neither a tree root nor a log header. */
+    beast->zfsBtree.p.btRoot = INVALID_BLK_ZERO;
+    beast->p.logHeaderBlock = INVALID_BLK_ZERO;
+    return zOK;
+}
+
+/****************************************************************************
+ * Allocate a new epoch on the volume's EFL tree.
+ *
+ * genMsg - receives zERR_EFL_NO_EPOCH_AVAILABLE when every slot is in use
+ * volume - volume whose EFL tree beast is updated
+ * epoch  - out: the newly generated epoch GUID
+ *
+ * Takes the first slot whose bit is clear in usedEpochs, generates a GUID
+ * into *epoch, marks the slot used and active, stamps lastAlive with the
+ * current UTC time and writes the beast. Returns the status of that write,
+ * or zFAILURE when no slot is free.
+ *
+ * The scan stops at MAX_ZFS_EFL_EPOCHS (the bound findEpochIndex uses for
+ * the same array) so that it can never index past epochs[], even when NINT
+ * has more bits than there are slots.
+ ****************************************************************************/
+STATUS createEFLEpoch(
+    GeneralMsg_s *genMsg,
+    Volume_s *volume,
+    EFLEpoch_t *epoch)
+{
+    ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+    NINT bit, index;
+    STATUS status;
+
+    for (bit = 1, index = 0;
+         (bit != 0) && (index < MAX_ZFS_EFL_EPOCHS);
+         bit <<= 1, index++)
+    {
+        if (!(bit & eTree->p.usedEpochs))
+        {
+            LB_GUIDGenerate((GUID_t *)epoch);
+
+            eTree->p.usedEpochs |= bit;
+            eTree->p.activeEpochs |= bit;
+            eTree->p.epochs[index].epoch = *epoch;
+            eTree->p.epochs[index].lastAlive = GetUTCTime();
+
+            status = EFL_writeTheBeast(genMsg, volume, eTree);
+            zASSERT(status == zOK);
+
+            return status;
+        }
+    }
+
+    SetErrno(genMsg, zERR_EFL_NO_EPOCH_AVAILABLE);
+    return zFAILURE;
+
+}
+
+/****************************************************************************
+ * Look up the slot holding the given epoch GUID.
+ *
+ * Returns FALSE when LB_GUIDValidate rejects the GUID or no slot matches.
+ * On a match (LB_GUIDCompare returns 0) stores the slot number in *index
+ * and returns TRUE. *index is left untouched on failure.
+ ****************************************************************************/
+BOOL findEpochIndex(
+    ZfsEFLTreeBeast_s *eTree,
+    GUID_t *epoch,
+    NINT *index)
+{
+    NINT i;
+
+    if (!LB_GUIDValidate(epoch))
+    {
+        return FALSE;
+    }
+
+    for (i = 0; i < MAX_ZFS_EFL_EPOCHS; i++)
+    {
+        if (!LB_GUIDCompare(epoch, &eTree->p.epochs[i].epoch))
+        {
+            /* find match */
+            *index = i;
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+
+/****************************************************************************
+ * Return the single-bit mask for the slot holding the given epoch, or 0
+ * when the epoch is not found.
+ *
+ * The shift is done in EFLEpochMask_t rather than int, so the highest slot
+ * does not shift into the sign bit of a plain int.
+ ****************************************************************************/
+EFLEpochMask_t getEFLEpochMask(
+    ZfsEFLTreeBeast_s *eTree,
+    GUID_t *epoch)
+{
+    NINT index;
+
+    if (findEpochIndex(eTree, epoch, &index))
+    {
+        return (EFLEpochMask_t)1 << index;
+    }
+    return 0;
+}
+
+/****************************************************************************
+ * Refresh the lastAlive timestamp of an existing epoch.
+ *
+ * Returns the status of writing the beast when the epoch is found;
+ * otherwise sets zERR_EFL_EPOCH_NOT_FOUND on genMsg and returns zFAILURE.
+ ****************************************************************************/
+STATUS pingEpoch(
+    GeneralMsg_s *genMsg,
+    Volume_s *volume,
+    EFLEpoch_t *epoch)
+{
+    ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+    NINT index;
+
+    if (findEpochIndex(eTree, epoch, &index))
+    {
+        eTree->p.epochs[index].lastAlive = GetUTCTime();
+        return EFL_writeTheBeast(genMsg, volume, eTree);
+    }
+    else
+    {
+        SetErrno(genMsg, zERR_EFL_EPOCH_NOT_FOUND);
+        return zFAILURE;
+    }
+}
+
+/****************************************************************************
+ *
+ * Create the EFL beast if it is not there
+ *
+ * The caller must hold the volume's beast latch exclusively (asserted
+ * below) and the volume must not have an EFL attached yet. Returns the
+ * beast now attached to the volume, or NULL when the beast cannot be
+ * created or the pool upgrade fails.
+ *
+ ****************************************************************************/
+ZfsEFLTreeBeast_s *EFL_createEFL(
+    GeneralMsg_s *genMsg,
+    ZfsVolume_s *zfsVol)
+{
+    ZfsXaction_s *xaction;
+    ZfsEFLTreeBeast_s *eTree;
+
+    zASSERT(zfsVol->ZLSSVOLEFL == NULL);
+
+    ASSERT_XLATCH(&zfsVol->ZFSVOLbeastLatch);
+
+    /* Make sure the EFL isn't already there */
+    eTree = ZFSVOL_VOL_GetBeastFromVolume(
+        genMsg, ZFSVOL_EFL_ZID, &zfsVol->ZFSVOLvol);
+
+    if (!eTree)
+    {
+        eTree = BST_new(genMsg, zFTYPE_ZLSS_EFL, zfsVol);
+        if (eTree == NULL)
+        {
+            errPrintf(WHERE, Module, 1460,
+                MSG("Error creating EFL (ZID %d), status=%d", 581),
+                ZFSVOL_EFL_ZID, GetErrno(genMsg));
+            return NULL;
+        }
+        X_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+
+        eTree->ZFSEFLTREEzid = ZFSVOL_EFL_ZID;
+        eTree->ZFSEFLTREEfirstParentZID = zINVALID_ZID;
+
+        eTree->ZFSEFLTREEbstState |= BST_STATE_NEW;
+
+        eTree->p.nameSpace = EFL_DEFAULT_NAMESPACE;
+        /* set default epoch inactive interval to 2 weeks, in terms of seconds */
+        eTree->p.inactiveInterval = EFL_defaultEpochInterval;
+
+
+        /*
+         * Write the new beast inside its own local transaction.
+         * NOTE(review): the results of BeginXLocal and COMN_ForceBeastWrite
+         * are not checked here -- confirm they cannot fail in this context.
+         */
+        xaction = BeginXLocal(&zfsVol->ZFSVOLvol, BXL_DEFAULT);
+        COMN_MARK_BEAST_XLOCAL(&eTree->ZFSEFLTREEroot, &xaction->xaction);
+        COMN_ForceBeastWrite(genMsg, eTree, &xaction->xaction);
+        EndXlocal(xaction);
+        BEASTHASH_Insert(&eTree->ZFSEFLTREEroot);
+    }
+    else
+    {
+        /* The beast already exists on the volume; latch it like the new one. */
+        X_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+        zASSERT(eTree->ZFSEFLTREEroot.useCount != 0);
+    }
+    /* Both paths arrive here holding the beast latch exclusively. */
+    DQ_RMV(&eTree->ZFSEFLTREEroot, volLink);
+    zfsVol->ZLSSVOLEFL = eTree;
+    UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+
+//#ifndef __linux__ // LINUX_Upgrade
+    /* upgrade the pool */
+    if (ZFSPOOL_AIPU4085_4086_4087_4088To4300(genMsg, ZLSS_VOLUME_TO_ZLSS_POOL(zfsVol)) != zOK)
+    {
+        /*
+         * Detach the beast again and mark it purging before releasing it.
+         * NOTE(review): this also runs when the beast already existed on the
+         * volume (the else branch above) -- confirm that purging a
+         * pre-existing EFL on upgrade failure is intended.
+         */
+        zfsVol->ZLSSVOLEFL = NULL;
+
+        eTree->ZFSEFLTREEbstState |= BST_STATE_PURGING;
+        BST_releaseAndFree(eTree);
+
+        return NULL;
+    }
+//#endif
+
+    return eTree;
+}
+
+
+/****************************************************************************
+ *
+ * Get the EFL beast
+ *
+ * Returns the volume's EFL tree beast, creating it through EFL_createEFL
+ * when the volume has none. The volume's beast latch is held exclusively
+ * for the duration of the lookup/creation. May return NULL when creation
+ * fails.
+ *
+ ****************************************************************************/
+ZfsEFLTreeBeast_s *EFL_getEFL(
+    GeneralMsg_s *genMsg,
+    ZfsVolume_s *zfsVol)
+{
+    ZfsEFLTreeBeast_s *eTree;
+
+    X_LATCH(&zfsVol->ZFSVOLbeastLatch);
+
+    eTree = zfsVol->eflTree;
+    if (!eTree)
+    {
+        eTree = EFL_createEFL(genMsg, zfsVol);
+    }
+
+    UNX_LATCH(&zfsVol->ZFSVOLbeastLatch);
+
+    return eTree;
+}
+
+
+/****************************************************************************
+ *
+ * This routine front ends the pool read routines and checks to make sure
+ * the block that is read in has a valid magic number.
+ * + ****************************************************************************/ +Buffer_s *EFL_ReadPoolBlk ( + GeneralMsg_s *genMsg, + IoMsg_s *iomsg) +{ + Buffer_s *buffer; + NINT magic; + + buffer = ZFS_ReadPoolBlk(genMsg, iomsg); + if ((buffer != NULL) && (iomsg->mode != CACHE_WRITE)) + { + magic = ((EFLTreeNode_s *)buffer->pBuf.data)->magic; + if ((magic != EFL_LEAF_MAGIC) && (magic != EFL_BRANCH_MAGIC)) + { + zASSERT("EFLTree -- Bad magic during read" == NULL); + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, iomsg); + CACHE_RELEASE(buffer); + return NULL; + } + } + zASSERT(buffer != NULL); + return buffer; +} + +/**************************************************************************** + * EFL_findChildBlock + * In: buffer - the buffer to be searched + * zid - ZID to find + * Out: index into the ptr array (0 to numrecs) + * Return: Blknum of the child block holding the zid + ****************************************************************************/ +Blknum_t EFL_findChildBlock( + EFLTreeNode_s *node, + Zid_t zid, + NINT *index) +{ + SNINT mid = 0; + SNINT low = 0; + SNINT high = node->numRecs-1; + Zid_t zidFromMid = 0; + + while (low <= high) + { + mid = (low + high) / 2; + zidFromMid = node->EBRANCH(mid).zid; + if (zid == zidFromMid) + { + break; + } + else if (zid > zidFromMid) + { + low = mid + 1; + } + else + { + high = mid - 1; + } + } + + if (zid < zidFromMid) + { + --mid; + } + *index = mid; + zASSERT(mid >= 0); + return(node->EBRANCH(mid).child); +} + +/**************************************************************************** + * This routine finds an entry in a leaf. If the entry is not found then + * the location where the insertion should occur is returned. + * I: buffer - the buffer to be searched + * I: zid - zid to find + * O: index into the ptr array (0 to numrecs) + * index of the item before which the given key can be inserted. 
+ * R: True if the zid is found + ****************************************************************************/ +BOOL EFL_findLeafEntry( + EFLTreeNode_s *node, + Zid_t zid, + NINT *index) +{ + SNINT mid = 0; + SNINT low = 0; + SNINT high = node->numRecs-1; + Zid_t zidFromMid = (Zid_t)-1; + BOOL foundFlag = FALSE; + + while (low <= high) + { + mid = (low + high) / 2; + zidFromMid = node->ELEAF(mid).zid; + if (zid == zidFromMid) + { + foundFlag = TRUE; + break; + } + else if (zid > zidFromMid) + { + low = mid + 1; + } + else + { + high = mid - 1; + } + } + + if (zid > zidFromMid) + { + ++mid; + } + + *index = mid; + return foundFlag; +} + +/**************************************************************************** + * + ****************************************************************************/ +void EFL_initNode( + Buffer_s *buffer, + NINT state, + GUID_t *internalID) +{ + EFLTreeNode_s *node = (EFLTreeNode_s *)buffer->pBuf.data; + + ASSERT_MPKNSS_LOCK(); + + zASSERT( LB_GUIDValidate( internalID ) ); + if (state & EFL_LEAF) + { + node->magic = EFL_LEAF_MAGIC; + node->n.leaf.nextLeaf = INVALID_BLK_ZERO; /* init the leaf link field */ + node->n.leaf.resBlk = INVALID_BLK_ZERO; /* init the leaf link field */ + } + else + { + node->magic = EFL_BRANCH_MAGIC; + } + node->state = state; + node->efl_internalID = *internalID; + node->numRecs = 0; + node->lsn = 0; + + buffer->state |= CACHE_DIRTY; + + EFL_VALIDATE_NODE(node); + return; +} + +/**************************************************************************** + * + ****************************************************************************/ +void EFL_insertZeroNode( + EFLTreeNode_s *node) +{ + ASSERT_MPKNSS_LOCK(); + + EFL_VALIDATE_NODE(node); + /* + * We always want a zero in the first location to establish a lower bound + * so we do not need a lot of special case code. 
+ */ + node->numRecs = 1; + memset(&node->ELEAF(0), 0, sizeof(EFLLeafEntry_s)); + node->ELEAF(0).firstLogEntry.blockNum = INVALID_BLK_ZERO; + node->ELEAF(0).firstLogEntry.resBlk = INVALID_BLK_ZERO; + node->ELEAF(0).firstLogEntry.offset = 0; + node->ELEAF(0).lastLogEntry.blockNum = INVALID_BLK_ZERO; + node->ELEAF(0).lastLogEntry.resBlk = INVALID_BLK_ZERO; + node->ELEAF(0).lastLogEntry.offset = 0; + EFL_VALIDATE_NODE(node); + return; +} + +/**************************************************************************** + * Free the node pointed to by sibling + * + * Input: The sibling node points to the node that will be delete + * + * Output: Sibling is NULL + ****************************************************************************/ +STATUS EFL_freeNode( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + Buffer_s *sibling = parms->sibling; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->sibling->pBuf.data); + + CACHE_RELEASE(sibling); + sibling = NULL; + + return(zOK); +} + +/**************************************************************************** + * Join the child and sibling and release the sibling + * + * Input: Parent, child and sibling latched. + * + * Output: Parent and child latched. Sibling released. 
+ ****************************************************************************/ +STATUS EFL_join( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + EFLTreeNode_s *parent = (EFLTreeNode_s *)parms->parent->pBuf.data; + EFLTreeNode_s *child = (EFLTreeNode_s *)parms->child->pBuf.data; + EFLTreeNode_s *sibling = (EFLTreeNode_s *)parms->sibling->pBuf.data; + BYTE *startLocation; + NINT lenMoved; + NINT numMoved; + NINT parentIndex; + Zid_t entryInParent; + ZfsXaction_s *localXaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->sibling->pBuf.data); + +// EFL_DisplayANode("Start Join (parent)", parms->parent); +// EFL_DisplayANode("Start Join (child)", parms->child); +// EFL_DisplayANode("Start Join (sibling)", parms->sibling); + if (EFL_IS_LEAF(child)) + { /* leaf */ + /* setup logging information */ + startLocation = (BYTE *)&child->ELEAF(child->numRecs); + lenMoved = sibling->numRecs * sizeof(EFLLeafEntry_s); + numMoved = sibling->numRecs; + + memcpy(startLocation, /* Destination */ + &sibling->ELEAF(0), /* Source */ + lenMoved); + child->numRecs += sibling->numRecs; + + /* update the leaf link */ + child->n.leaf.nextLeaf = sibling->n.leaf.nextLeaf; + /* Change magic in Deleted Leaf Node to indicate it is deleted. This + * is being done for scan mode of rebuild. 
+ */ + sibling->magic |= 0x20; /* Lower case low byte letter */ + } + else + { /* branch */ + /* setup logging information */ + startLocation = (BYTE *)&child->EBRANCH(child->numRecs); + lenMoved = sibling->numRecs * sizeof(EFLBranchEntry_s); + numMoved = sibling->numRecs; + + memcpy(startLocation, /* Destination */ + &sibling->EBRANCH(0), /* Source */ + lenMoved); + child->numRecs += sibling->numRecs; + } + + --(parent->numRecs); + parentIndex = parms->index + 1; + entryInParent = parent->EBRANCH(parentIndex).zid; + memmove( &parent->EBRANCH(parentIndex), /* Destination */ + &parent->EBRANCH(parentIndex + 1), /* Source */ + (parent->numRecs - parentIndex) * sizeof(EFLBranchEntry_s)); + + localXaction = BeginXLocal(parms->volume, BXL_DEFAULT); + + zASSERT((ZLOG_BLOCK_INFO_SIZE(3) + sizeof(EFLSplit_s) - 1 + lenMoved) <= + ZLOG_MAXIMUM_RECORD_SIZE); + + ZLOG_ObtainRecord(localXaction, ZLOG_BLOCK_INFO_SIZE(3) + sizeof(EFLSplit_s) + - 1 + lenMoved); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_JOIN, localXaction, logBuffer, 3, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk, + parent->lsn, parms->parent, localXaction, 0, EFL_compare); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk, + child->lsn, parms->child, localXaction, 1, EFL_compare); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->sibling->volBlk, + sibling->lsn, parms->sibling, localXaction, 2, EFL_noCompare); + ZLOG_DELETE_BLOCK(localXaction, poolBlks[2]); + + logRecord->u.split.zidForParent = entryInParent; + logRecord->u.split.leafLink = child->n.leaf.nextLeaf; + logRecord->u.split.blockForParent = poolBlks[2].blkNum; + zASSERT( parms->eTree != NULL ); + logRecord->u.split.internalID = + parms->eTree->ZFSEFLTREEroot.ROOTinternalID; + logRecord->u.split.indexForParent = parentIndex; + logRecord->u.split.totalLength = lenMoved; + logRecord->u.split.numToMove = numMoved; + logRecord->u.split.nodeType = sibling->state & EFL_LEAF; + memcpy(&logRecord->u.split.data[0], 
startLocation, lenMoved); + + child->lsn = logBuffer->ZXR_Lsn; + parent->lsn = logBuffer->ZXR_Lsn; +#if LOG_TEST IS_ENABLED + if (LogTest) + sibling->lsn = logBuffer->ZXR_Lsn; +#endif + + ZLOG_BIND(localXaction, parms->child); + ZLOG_BIND(localXaction, parms->parent); + if (EFL_IS_LEAF(child) ) + { /* This is the deleted LEAF block we must write so that rebuild + * can scan whole disk and not find OLD beast tree leafs. + */ + parms->sibling->state |= CACHE_DIRTY; + ZLOG_BIND(localXaction, parms->sibling); +#if 0 + /* This code allows for testing of the recovery code of setting + * the magic word on deletes. + */ +#if NSS_ASSERT IS_ENABLED + { + extern LONG gZlogAssertBlock; + ZlogBeast_s *zlogBeast; + + zlogBeast = localXaction->ZX_zlogBeast; + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + gZlogAssertBlock = (LONG)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + zASSERT(" Close to time to re-boot machine"==NULL); + } +#endif +#endif + } + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + parms->parent->state |= CACHE_DIRTY; + parms->child->state |= CACHE_DIRTY; + + if (EFL_freeNode(genMsg, parms) != zOK) + { + return zFAILURE; + } + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); +// EFL_DisplayANode("End Join (parent)", parms->parent); +// EFL_DisplayANode("End Join (child)", parms->child); + return zOK; +} + +/**************************************************************************** + * + ****************************************************************************/ +STATUS EFL_balance ( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + EFLTreeNode_s *parent = (EFLTreeNode_s *)parms->parent->pBuf.data; + EFLTreeNode_s *sibling = (EFLTreeNode_s *)parms->sibling->pBuf.data; + EFLTreeNode_s *child = (EFLTreeNode_s *)parms->child->pBuf.data; + NINT avg; + NINT numToMove; + BYTE *startLocation; + NINT lenMoved; + NINT direction; + Zid_t 
zidForParent; + Zid_t previousZid; + ZfsXaction_s *localXaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + + avg = (child->numRecs + sibling->numRecs) / 2; + + /* Don't do balancing if no records can be moved */ + if (child->numRecs == avg || (sibling->numRecs == avg)) + { + /* + * The entry we were looking for when we started the balance my be + * in either the child or the sibling entry. Make sure that it + * ends up in the child. + */ + if (parms->zid >= parent->EBRANCH(parms->index + 1).zid) + { + CACHE_RELEASE(parms->child); + parms->child = parms->sibling; + } + else + { + CACHE_RELEASE(parms->sibling); + } + return zOK; + } + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->sibling->pBuf.data); + +// EFL_DisplayANode("Start Balance (child)", parms->child); +// EFL_DisplayANode("Start Balance (sibling)", parms->sibling); + + if (EFL_IS_LEAF(child)) + { /* leaf */ + if (child->numRecs < avg) + { /* if moving from the sibling to the child */ + direction = EFL_RIGHT_TO_LEFT; + numToMove = avg - child->numRecs; + lenMoved = numToMove * sizeof(EFLLeafEntry_s); + startLocation = (BYTE *)&child->ELEAF(child->numRecs); + + /* move the entries from sibling to child */ + memcpy(startLocation, /* Destination */ + &sibling->ELEAF(0), /* Source */ + lenMoved); + + /* move sibling entries back to the front of the record */ + memmove(&sibling->ELEAF(0), /* Destination */ + &sibling->ELEAF(numToMove), /* Source */ + (sibling->numRecs - numToMove) * sizeof(EFLLeafEntry_s)); + + child->numRecs += numToMove; + sibling->numRecs -= numToMove; + } + else + { + zASSERT(sibling->numRecs < avg); + + direction = EFL_LEFT_TO_RIGHT; + numToMove = avg - sibling->numRecs; + lenMoved = numToMove * sizeof(EFLLeafEntry_s); + startLocation = (BYTE *)&sibling->ELEAF(0); + + /* make room for the entries from the child node */ + 
memmove(&sibling->ELEAF(numToMove), /* Destination */ + &sibling->ELEAF(0), /* Source */ + sibling->numRecs * sizeof(EFLLeafEntry_s)); + + /* move the entries from the child to the sibling */ + memcpy(startLocation, + &child->ELEAF(child->numRecs - numToMove), + lenMoved); + + child->numRecs -= numToMove; + sibling->numRecs += numToMove; + } + } + else + { /* branch */ + if (child->numRecs < avg) + { /* if moving from the sibling to the child */ + direction = EFL_RIGHT_TO_LEFT; + numToMove = avg - child->numRecs; + lenMoved = numToMove * sizeof(EFLBranchEntry_s); + startLocation = (BYTE *)&child->EBRANCH(child->numRecs); + + /* move the entries from sibling to child */ + memcpy(startLocation, /* Destination */ + &sibling->EBRANCH(0), /* Source */ + lenMoved); + + /* move sibling entries back to the front of the record */ + memmove(&sibling->EBRANCH(0), /* Destination */ + &sibling->EBRANCH(numToMove), /* Source */ + (sibling->numRecs - numToMove) * sizeof(EFLBranchEntry_s)); + + child->numRecs += numToMove; + sibling->numRecs -= numToMove; + } + else + { + zASSERT(sibling->numRecs < avg); + + direction = EFL_LEFT_TO_RIGHT; + numToMove = avg - sibling->numRecs; + lenMoved = numToMove * sizeof(EFLBranchEntry_s); + startLocation = (BYTE *)&sibling->EBRANCH(0); + + /* make room for the entries from the child node */ + memmove(&sibling->EBRANCH(numToMove), /* Destination */ + &sibling->EBRANCH(0), /* Source */ + sibling->numRecs * sizeof(EFLBranchEntry_s)); + + /* move the entries from the child to the sibling */ + memcpy(startLocation, + &child->EBRANCH(child->numRecs - numToMove), + lenMoved); + + child->numRecs -= numToMove; + sibling->numRecs += numToMove; + } + } + + /* fix up the parent record */ + previousZid = parent->EBRANCH(parms->index + 1).zid; + if (EFL_IS_LEAF(sibling)) + { + parent->EBRANCH(parms->index + 1).zid = + sibling->ELEAF(0).zid; + } + else + { + parent->EBRANCH(parms->index + 1).zid = + sibling->EBRANCH(0).zid; + } + zidForParent = 
parent->EBRANCH(parms->index + 1).zid; + + localXaction = BeginXLocal(parms->volume,BXL_DEFAULT); + + ZLOG_ObtainRecord( localXaction, ZLOG_BLOCK_INFO_SIZE(3) + sizeof(EFLBalance_s) + - 1 + lenMoved); + + /* create the log record */ + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_BALANCE, localXaction, logBuffer, 3, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk, + parent->lsn, parms->parent, localXaction, 0, EFL_compare); + + /* put the source in position 1 and the destination in position 2 */ + if (direction == EFL_LEFT_TO_RIGHT) + { + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk, + child->lsn, parms->child, localXaction, 1, EFL_compare); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->sibling->volBlk, + sibling->lsn, parms->sibling, localXaction, 2, EFL_compare); + } + else + { + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->child->volBlk, + child->lsn, parms->child, localXaction, 2, EFL_compare); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->sibling->volBlk, + sibling->lsn, parms->sibling, localXaction, 1, EFL_compare); + } + + logRecord->u.balance.zidForParent = zidForParent; + logRecord->u.balance.oldZid = previousZid; + logRecord->u.balance.indexForParent = parms->index + 1; + logRecord->u.balance.totalLength = lenMoved; + logRecord->u.balance.numToMove = numToMove; + + memcpy(&logRecord->u.balance.data[0], startLocation, lenMoved); + + child->lsn = logBuffer->ZXR_Lsn; + sibling->lsn = logBuffer->ZXR_Lsn; + parent->lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(localXaction, parms->parent); + ZLOG_BIND(localXaction, parms->child); + ZLOG_BIND(localXaction, parms->sibling); + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + +// EFL_DisplayANode("End Balance (child)", parms->child); +// EFL_DisplayANode("End Balance (sibling)", parms->sibling); + + if (parms->zid >= parent->EBRANCH(parms->index + 1).zid) + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s 
*)parms->sibling->pBuf.data); + CACHE_DIRTY_RELEASE(parms->child); + parms->child = parms->sibling; + } + else + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->sibling->pBuf.data); + CACHE_DIRTY_RELEASE(parms->sibling); + } + + parms->parent->state |= CACHE_DIRTY; + parms->child->state |= CACHE_DIRTY; + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return zOK; +} + +/**************************************************************************** + * Joins or balances nodes because a node is partially empty + * + * Input: Parent and child are locked. + * + * Ouput: Both still locked + ****************************************************************************/ +STATUS EFL_underflow ( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + ZfsEFLTreeBeast_s *eflTreeBeast = parms->eTree; + EFLTreeNode_s *parent = (EFLTreeNode_s *)parms->parent->pBuf.data; + IoMsg_s iomsg; + STATUS status; + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + /* Get sibling and child in right order, child lower than sibling */ + if (parms->index < parent->numRecs - 1) + { + parms->readBlkNum = parent->EBRANCH(parms->index + 1).child; + READBLK_IO_MSG(iomsg, eflTreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 1); + parms->sibling = EFL_ReadPoolBlk(genMsg, &iomsg); + if (parms->sibling == NULL) + { + return zFAILURE; + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->sibling->pBuf.data); + } + else + { + parms->sibling = parms->child; + parms->readBlkNum = parent->EBRANCH(--parms->index).child; + READBLK_IO_MSG(iomsg, eflTreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 2); + /* + * Release the latch so we do not latch out of order -- This is + * OK because the parent latch is held. 
+ */ + CACHE_PIN(parms->sibling); + CACHE_UNXLATCH(parms->sibling); + parms->child = EFL_ReadPoolBlk(genMsg, &iomsg); + CACHE_XLATCH(parms->sibling); + CACHE_UNPIN(parms->sibling); + if (parms->child == NULL) + { + CACHE_RELEASE(parms->sibling); + return zFAILURE; + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + } + + if (EFL_IS_LESSTHAN_MAX((EFLTreeNode_s *)parms->child->pBuf.data, + (EFLTreeNode_s *)parms->sibling->pBuf.data)) + { + status = EFL_join(genMsg, parms); + } + else + { + status = EFL_balance(genMsg, parms); + } + + return status; +} + +/**************************************************************************** + * Shrink the tree by removing the root node + * + * Input: Child is latched. It is the root. The tree beast is latched. + * + * Output: New child (latched). Dir tree beast is latched. + ****************************************************************************/ +STATUS EFL_shrink ( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + Lsn_t rootLsn; + Blknum_t rootBlkNum; + ZfsEFLTreeBeast_s *eflTreeBeast = parms->eTree; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + ZfsXaction_s *localXaction; + + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + parms->sibling = parms->child; + rootBlkNum = parms->child->volBlk; + rootLsn = ((EFLTreeNode_s *)parms->child->pBuf.data)->lsn; + eflTreeBeast->zfsBtree.p.btRoot = + ((EFLTreeNode_s *)parms->child->pBuf.data)->EBRANCH(0).child; + + + /* force the beast out */ + localXaction = BeginXLocal(parms->volume,BXL_DEFAULT); + COMN_MARK_BEAST_XLOCAL(&eflTreeBeast->ZFSEFLTREEroot, + &localXaction->xaction); + if (COMN_ForceBeastWrite(genMsg, &eflTreeBeast->ZFSEFLTREEroot, + &localXaction->xaction)) + { + goto errorEndXaction; + } + + parms->readBlkNum = eflTreeBeast->zfsBtree.p.btRoot; + + READBLK_IO_MSG(iomsg, eflTreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 4); + parms->child = 
EFL_ReadPoolBlk(genMsg, &iomsg); + if (parms->child == NULL) + { + //zASSERT(0); + goto errorEndXaction; + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + ((EFLTreeNode_s *)parms->child->pBuf.data)->state |= EFL_ROOT; + + /* log the shrink action */ + + ZLOG_ObtainRecord(localXaction, ZLOG_BLOCK_INFO_SIZE(2) + sizeof(EFLGrow_s)); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_SHRINK, localXaction, logBuffer, 2, + poolBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], rootBlkNum, + ((EFLTreeNode_s *)parms->sibling)->lsn, parms->sibling, localXaction, 0, + EFL_compare); + ZLOG_DELETE_BLOCK(localXaction, poolBlks[0]); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk, + ((EFLTreeNode_s *)parms->child->pBuf.data)->lsn, parms->child, + localXaction, 1, EFL_compare); + + ((EFLTreeNode_s *)parms->child->pBuf.data)->lsn = logBuffer->ZXR_Lsn; + +#if LOG_TEST IS_ENABLED + if (LogTest) + ((EFLTreeNode_s *)parms->sibling->pBuf.data)->lsn = logBuffer->ZXR_Lsn; +#endif + logRecord->u.grow.blockForParent = parms->child->volBlk; + logRecord->u.grow.internalID = eflTreeBeast->ZFSEFLTREEroot.ROOTinternalID; + + ZLOG_BIND(localXaction, parms->child); + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + if (EFL_freeNode(genMsg, parms) != zOK) + { + goto errorExit; + } + + parms->child->state |= CACHE_DIRTY; + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return zOK; + +errorEndXaction: + EndXlocal(localXaction); +errorExit: +#if NSS_DEBUG IS_ENABLED + if (parms->child != NULL) + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + } +#endif + return zFAILURE; +} + +/**************************************************************************** + * Check sibling nodes to see if they should be joined + * + * Input: Child has the node to be checked. Child is latched. The tree + * beast is latched if the child is the root. + * + * Output: Child and parent are latched. Beast b-tree is unlatched. 
+ * If an error occurs then child and parent are unlatched. + ****************************************************************************/ +STATUS EFL_checkForJoin( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + ZfsEFLTreeBeast_s *eTree = parms->eTree; + EFLTreeNode_s *node = (EFLTreeNode_s *)parms->child->pBuf.data; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + if (EFL_IS_ROOT(node)) + { + if ((node->numRecs == 1) && !EFL_IS_LEAF(node)) + { + if (EFL_shrink(genMsg, parms) != zOK) + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + return(zFAILURE); + } + else + { /* if it got turned into a leaf during shrink, don't unlatch it */ + if (!EFL_IS_LEAF((EFLTreeNode_s *)parms->child->pBuf.data)) + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + } + } + } + else + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + } + } + else + { + if (EFL_IS_MIN_ENTRIES(node)) + { + if (EFL_underflow(genMsg, parms) != zOK) + { + goto errorRelease; + } + } + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return(zOK); + +errorRelease: + if (parms->parent != NULL) + { + CACHE_RELEASE(parms->parent); + } + if (parms->child != NULL) + { + CACHE_RELEASE(parms->child); + } + return(zFAILURE); +} + +/**************************************************************************** + * Split a node. + * + * Input: Child has the node that will be split. Parent and child are + * latched. + * + * Output: Child holds the node that can be inserted into. The parent + * and child are latched. 
+ ****************************************************************************/ +STATUS EFL_split ( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + ZfsEFLTreeBeast_s *eflTreeBeast = parms->eTree; + EFLTreeNode_s *parent = (EFLTreeNode_s *)parms->parent->pBuf.data; + EFLTreeNode_s *sibling; + EFLTreeNode_s *child; + NINT i; + IoMsg_s iomsg; + BYTE *startLocation; + NINT lenMoved; + NINT numMoved; + NINT holdState; + Zid_t zidForParent; + ZfsXaction_s *localXaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + parms->sibling = parms->child; + sibling = (EFLTreeNode_s *)parms->sibling->pBuf.data; + + localXaction = BeginXLocal(parms->volume, BXL_DEFAULT); + + /* get the new node */ + XALLOCBLK_IO_MSG(iomsg, eflTreeBeast, localXaction, CACHE_UPDATE); + parms->child = ZFS_AllocPoolBlk(genMsg, &iomsg); + if (parms->child == NULL) + { + parms->child = parms->sibling; /* restore the child for error cleanup */ + goto errorEndXaction; + } + parms->readBlkNum = parms->child->volBlk; + + /* + * Sibling is the node to be split. It is the left node in the split. + * Child is the new node. 
+ */ + + child = (EFLTreeNode_s *)parms->child->pBuf.data; + holdState = sibling->state & EFL_LEAF; + EFL_initNode(parms->child, holdState, + &eflTreeBeast->ZFSEFLTREEroot.ROOTinternalID); + + if (EFL_IS_LEAF(sibling)) + { + child->numRecs = sibling->numRecs - EFL_MIN_LEAF_ENTRIES; + + /* setup logging information */ + startLocation = (BYTE *)&child->ELEAF(0); + lenMoved = child->numRecs * sizeof(EFLLeafEntry_s); + numMoved = child->numRecs; + + memcpy(startLocation, /* Destination */ + &sibling->ELEAF(EFL_MIN_LEAF_ENTRIES), /* Source */ + lenMoved); /* length */ + sibling->numRecs = EFL_MIN_LEAF_ENTRIES; + + /* update leaf links */ + child->n.leaf.nextLeaf = sibling->n.leaf.nextLeaf; + sibling->n.leaf.nextLeaf = parms->readBlkNum; + } + else + { + child->numRecs = sibling->numRecs - EFL_MIN_BRANCH_ENTRIES; + + /* setup logging information */ + startLocation = (BYTE *)&child->EBRANCH(0); + lenMoved = child->numRecs * sizeof(EFLBranchEntry_s); + numMoved = child->numRecs; + + memcpy(startLocation, /* Destination */ + &sibling->EBRANCH(EFL_MIN_BRANCH_ENTRIES), /* Source */ + lenMoved); /* length */ + sibling->numRecs = EFL_MIN_BRANCH_ENTRIES; + } + + /* + * Make room in the parent to place the node + */ + i = parms->index + 1; + memmove(&parent->EBRANCH(i + 1), /* Destination */ + &parent->EBRANCH(i), /* Source */ + (parent->numRecs - i) * sizeof(EFLBranchEntry_s)); + + if (EFL_IS_LEAF(sibling)) + { + zidForParent = parent->EBRANCH(i).zid = child->ELEAF(0).zid; + } + else + { + zidForParent = parent->EBRANCH(i).zid = child->EBRANCH(0).zid; + } + parent->EBRANCH(i).child = parms->readBlkNum; + ++(parent->numRecs); + + ZLOG_ObtainRecord( localXaction, ZLOG_BLOCK_INFO_SIZE(3) + sizeof(EFLSplit_s) + - 1 + lenMoved ); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_SPLIT, localXaction, logBuffer, 3, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk, parent->lsn, + parms->parent, localXaction, 0, EFL_compare); + 
ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->sibling->volBlk, sibling->lsn, + parms->sibling, localXaction, 1, EFL_compare); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->child->volBlk, child->lsn, + parms->child, localXaction, 2, EFL_compare); + ZLOG_ALLOC_BLOCK(poolBlks[2]); + + logRecord->u.split.zidForParent = zidForParent; + logRecord->u.split.leafLink = child->n.leaf.nextLeaf; + logRecord->u.split.blockForParent = poolBlks[2].blkNum; + logRecord->u.split.internalID = eflTreeBeast->ZFSEFLTREEroot.ROOTinternalID; + logRecord->u.split.indexForParent = i; + logRecord->u.split.totalLength = lenMoved; + logRecord->u.split.numToMove = numMoved; + logRecord->u.split.nodeType = holdState; + memcpy(&logRecord->u.split.data[0], startLocation, lenMoved); + + child->lsn = logBuffer->ZXR_Lsn; + sibling->lsn = logBuffer->ZXR_Lsn; + parent->lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(localXaction, parms->child); + ZLOG_BIND(localXaction, parms->sibling); + ZLOG_BIND(localXaction, parms->parent); + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + if (parms->zid >= parent->EBRANCH(i).zid) + { /* if inserting in the child */ + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->sibling->pBuf.data); + CACHE_DIRTY_RELEASE(parms->sibling); + } + else + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + CACHE_DIRTY_RELEASE(parms->child); + parms->child = parms->sibling; + child = sibling; + } + parms->parent->state |= CACHE_DIRTY; + parms->child->state |= CACHE_DIRTY; + ++Inst.ztree.split; + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return zOK; + +errorEndXaction: + EndXlocal(localXaction); + return zFAILURE; +} + +/**************************************************************************** + * Grow the tree by adding a new root node and splitting the former root. + * + * Input: Child is latched. It is the root. The tree beast is latched. 
+ * + * Output: Returns with the parent and child latched and the beast tree + * unlatched. + ****************************************************************************/ +STATUS EFL_grow ( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + ZfsEFLTreeBeast_s *eflTreeBeast = parms->eTree; + EFLTreeNode_s *newRoot; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + ZfsXaction_s *localXaction; + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + localXaction = BeginXLocal(parms->volume,BXL_DEFAULT); + + /* get the node for the new root */ + XALLOCBLK_IO_MSG(iomsg, eflTreeBeast, localXaction, CACHE_UPDATE); + parms->parent = ZFS_AllocPoolBlk(genMsg, &iomsg); + if (parms->parent == NULL) + { + //zASSERT(0); + goto errorUnlatch; + } + parms->readBlkNum = parms->parent->volBlk; + + newRoot = (EFLTreeNode_s *)parms->parent->pBuf.data; + EFL_initNode(parms->parent, EFL_ROOT, + &eflTreeBeast->ZFSEFLTREEroot.ROOTinternalID); + newRoot->numRecs = 1; + newRoot->EBRANCH(0).zid = 0; + newRoot->EBRANCH(0).child = eflTreeBeast->zfsBtree.p.btRoot; + eflTreeBeast->zfsBtree.p.btRoot = parms->readBlkNum; + + /* force the beast out */ + COMN_MARK_BEAST_XLOCAL(&eflTreeBeast->ZFSEFLTREEroot, &localXaction->xaction); + if (COMN_ForceBeastWrite(genMsg, &eflTreeBeast->ZFSEFLTREEroot, + &localXaction->xaction)) + { + goto errorUnlatch; + } + + UNX_LATCH(&eflTreeBeast->ZFSEFLTREEbeastLatch); + + parms->sibling = parms->child; + ((EFLTreeNode_s *)parms->sibling->pBuf.data)->state &= ~(EFL_ROOT); /* reset the root state */ + parms->index = 0; + + /* log the grow action */ + + ZLOG_ObtainRecord(localXaction, ZLOG_BLOCK_INFO_SIZE(2) + sizeof(EFLGrow_s)); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_GROW, localXaction, logBuffer, 2, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk, 0, parms->parent, + localXaction, 0, EFL_compare); + ZLOG_ALLOC_BLOCK(poolBlks[0]); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], 
parms->child->volBlk, + ((EFLTreeNode_s *)parms->child->pBuf.data)->lsn, parms->child, + localXaction, 1, EFL_compare); + newRoot->lsn = logBuffer->ZXR_Lsn; + ((EFLTreeNode_s *)parms->child->pBuf.data)->lsn = logBuffer->ZXR_Lsn; + logRecord->u.grow.blockForParent = parms->child->volBlk; + logRecord->u.grow.internalID = eflTreeBeast->ZFSEFLTREEroot.ROOTinternalID; + + ZLOG_BIND(localXaction, parms->child); + ZLOG_BIND(localXaction, parms->parent); + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + ++Inst.ztree.grow; + + if (EFL_split(genMsg, parms) != zOK) + { + return zFAILURE; + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return zOK; + +errorUnlatch: + UNX_LATCH(&eflTreeBeast->ZFSEFLTREEbeastLatch); + EndXlocal(localXaction); + return zFAILURE; +} + +/**************************************************************************** + * Check a node to see if it should be split + * + * Input: Child has the node to be checked. Child is latched. The tree + * beast is latched if the child is the root. + * + * Output: Child and parent are latched. Beast b-tree is unlatched. + * If an error occurs then child and parent are unlatched. 
+ ****************************************************************************/ +STATUS EFL_checkForSplit( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms) +{ + EFLTreeNode_s *node = (EFLTreeNode_s *)parms->child->pBuf.data; + ASSERT_MPKNSS_LOCK(); + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + if (EFL_IS_MAX_ENTRIES(node, parms)) + { + if (EFL_IS_ROOT(node)) + { /* if it is the root then grow the tree another level */ + if (EFL_grow(genMsg, parms) != zOK) + { + goto errorRelease; + } + } + else + { + if (EFL_split(genMsg, parms) != zOK) + { + goto errorRelease; + } + } + } + else + { + if (EFL_IS_ROOT(node)) + { + UNX_LATCH(&parms->eTree->ZFSEFLTREEbeastLatch); + } + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return(zOK); + +errorRelease: + if (parms->parent != NULL) + { + CACHE_RELEASE(parms->parent); + } + if (parms->child != NULL) + { + CACHE_RELEASE(parms->child); + } + return(zFAILURE); +} + +/**************************************************************************** + * Finds the leaf node. Splits or joins nodes as it goes down the tree if + * requested with the options + * + * Input: Will also start at the root of the b-tree. Gets root from the + * b-tree beast. Splits of joins based on the options. + * Tree beast is latched. + * + * Output: Leaves with parent and child latched, unless there is an error, + * then they are unlatched. 
+ ****************************************************************************/ +STATUS EFL_findLeaf( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms, + NINT options) +{ + ZfsEFLTreeBeast_s *eTree = parms->eTree; + EFLTreeNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + + parms->readBlkNum = eTree->zfsBtree.p.btRoot; + READBLK_IO_MSG(iomsg, eTree, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 5); + parms->child = EFL_ReadPoolBlk(genMsg, &iomsg); + if (parms->child == NULL) + { + //zASSERT(0); + EFL_UNX_LATCH_TREE(parms); + return(zFAILURE); + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + node = (EFLTreeNode_s *)parms->child->pBuf.data; + + zASSERT(EFL_IS_ROOT(node)); + while (!EFL_IS_LEAF(node)) + { + if (options & EFL_CHECK_FOR_SPLIT) + { + if (EFL_checkForSplit(genMsg, parms) != zOK) + { + return(zFAILURE); + } + /* the child can change to a different node during a split */ + node = (EFLTreeNode_s *)parms->child->pBuf.data; + } + else if (options & EFL_CHECK_FOR_JOIN) + { + if (EFL_checkForJoin(genMsg, parms) != zOK) + { + return(zFAILURE); + } + /* the child can change to a different node during a join */ + node = (EFLTreeNode_s *)parms->child->pBuf.data; + /* if the tree shrinks we may be at the leaf node */ + if (EFL_IS_LEAF(node)) + { + break; + } + } + else + { + if (EFL_IS_ROOT(node)) + { + EFL_UNX_LATCH_TREE(parms); + } + } + if (!EFL_IS_ROOT(node)) + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + } + /* go down another level */ + parms->parent = parms->child; + parms->readBlkNum = + EFL_findChildBlock((EFLTreeNode_s *)parms->parent->pBuf.data, + parms->zid, &parms->index); + READBLK_IO_MSG(iomsg, eTree, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 6); + parms->child = EFL_ReadPoolBlk(genMsg, &iomsg); + if (parms->child == NULL) + { + //zASSERT(0); + CACHE_RELEASE(parms->parent); + return(zFAILURE); + } + node = (EFLTreeNode_s 
*)parms->child->pBuf.data; + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return(zOK); +} + +/**************************************************************************** + * + * Update a leaf entry + * + ****************************************************************************/ +/**************************************************************************** + * Put a modify leaf entry log record into the log. This is the contents + * before the entry is deleted. + * + * Input: Child record is Latched + * + * Output: Child record is Latched + ****************************************************************************/ +void EFL_logModifyRecord( + EFLTreeParms_s *parms, + EFLLeafEntry_s *leafEntry, + EFLLeafEntry_s *oldEntry) +{ + EFLTreeNode_s *node = (EFLTreeNode_s *)parms->child->pBuf.data; + ZfsXasRecovery_s *logBuffer; + EFLLog_s *logRecord; + BlockInfo_s *poolBlks; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + ZLOG_ObtainRecord(parms->xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(EFLModify_s)); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_L_MODIFY, parms->xaction, logBuffer, 1, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], parms->child->volBlk, node->lsn, + parms->child, parms->xaction, 0); + + logRecord->u.modify.internalID = parms->eTree->ZFSEFLTREEroot.ROOTinternalID; + logRecord->u.modify.newEntry = *leafEntry; + logRecord->u.modify.oldEntry = *oldEntry; + + node->lsn = logBuffer->ZXR_Lsn; + ZLOG_BIND(parms->xaction, parms->child); + + ZLOG_ReleaseRecord(parms->xaction); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return; +} + + +/**************************************************************************** + * The leaf has already been modified outside of this function. This + * function really just takes care of getting things latched correctly and + * logging the modify. 
+ * + * Input: Child and parent are latched. Child is buffer to modified. + * + * Output: Parent and child are unlatched. + ****************************************************************************/ +STATUS EFL_doModifyEntry( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms, + NINT index, + EFLLeafEntry_s *oldEntry) +{ + EFLTreeNode_s *child = (EFLTreeNode_s *)parms->child->pBuf.data; + EFLLeafEntry_s *leafEntry = &child->ELEAF(index); + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + /* + * Change the leaf entry contents in the node. + * + * NOTE: the parent is still latched when we come into this routine. + */ + if (EFL_IS_ROOT(child)) + { + EFL_UNX_LATCH_TREE(parms); + } + else + { + EFL_VALIDATE_NODE(child); + CACHE_RELEASE(parms->parent); + } + + if (memcmp(leafEntry, oldEntry, sizeof(EFLLeafEntry_s)) != 0) + { + EFL_logModifyRecord(parms, leafEntry, oldEntry); + CACHE_DIRTY_RELEASE(parms->child); + } + else + { + CACHE_RELEASE(parms->child); + } + + return(zOK); +} + + +/************************************************************************** + * + * Insert an eNode + * + **************************************************************************/ + +/**************************************************************************** + * Make room in the current buffer for a new beast eNode + * + * Input: Child has the node to be inserted into. The child is latched. + * + ****************************************************************************/ +STATUS EFL_insertEntry( + GeneralMsg_s *genMsg, + EFLTreeNode_s *child, + EFLLeafEntry_s *newEntry) +{ + NINT i; + EFLLeafEntry_s *entry; + + ASSERT_MPKNSS_LOCK(); + /* + * This routine assumes a check has ready been done to make sure there + * there is enough free space in the buffer to hold the new eNode. 
+ */ + + /* Add the eNode to the leaf node */ + if (EFL_findLeafEntry(child, newEntry->zid, &i)) + { + zASSERT("Same ZID added twice to EFL b-tree" == NULL); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + return zFAILURE; + } + + entry = &child->ELEAF(i); + + /* make room for the entry */ + memmove(&child->ELEAF(i + 1), /* Destination */ + entry, /* Source */ + (child->numRecs - i) * sizeof(EFLLeafEntry_s)); + ++(child->numRecs); + + *entry = *newEntry; + return zOK; +} + +/**************************************************************************** + * Put an insert leaf entry log record into the log + * + * Input: Child record is Latched + * + * Output: Child record is Latched + ****************************************************************************/ +void EFL_logInsertRecord( + EFLTreeParms_s *parms, + EFLLeafEntry_s *newEntry) +{ + EFLTreeNode_s *child = (EFLTreeNode_s *)parms->child->pBuf.data; + ZfsXasRecovery_s *logBuffer; + EFLLog_s *logRecord; + BlockInfo_s *poolBlks; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + ZLOG_ObtainRecord(parms->xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(EFLInsert_s)); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_L_INSERT, parms->xaction, logBuffer, 1, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->child->volBlk, child->lsn, + parms->child, parms->xaction, 0, EFL_compare); + + logRecord->u.insert.internalID = parms->eTree->ZFSEFLTREEroot.ROOTinternalID; + logRecord->u.insert.newEntry = *newEntry; + + child->lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(parms->xaction, parms->child); + + ZLOG_ReleaseRecord(parms->xaction); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return; +} + +/**************************************************************************** + * Insert the beast into the b-tree node + * + * Input: Child and parent are latched. Child is buffer to inserted into. + * + * Output: Parent and child are unlatched. 
+ ****************************************************************************/ +STATUS EFL_doInsertEntry( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms, + EFLLeafEntry_s *newEntry) +{ + EFLTreeNode_s *child; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + /* + * Insert the new entry into the tree + */ + if (EFL_checkForSplit(genMsg, parms) != zOK) + { + return(zFAILURE); + } + + child = (EFLTreeNode_s *)parms->child->pBuf.data; + if (!EFL_IS_ROOT(child)) + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + } + + if (EFL_insertEntry(genMsg, child, newEntry) != zOK) + { + CACHE_RELEASE(parms->child); + return(zFAILURE); + } + + EFL_logInsertRecord(parms, newEntry); + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + CACHE_DIRTY_RELEASE(parms->child); + return zOK; +} + +//rks/**************************************************************************** +//rks * +//rks * Check to see if we need to add a log record for this action. If we are +//rks * modifying data that has already been modified or created in this epoch +//rks * we do not need to send the log record. 
+//rks * +//rks ****************************************************************************/ +//rksSTATUS EFL_ProcessLogEntry( +//rks GeneralMsg_s *genMsg, +//rks NINT logOp, +//rks EFLTreeParms_s *parms, +//rks EFLLeafEntry_s *leafEntry, +//rks unicode_t *name) +//rks{ +//rks Volume_s *volume = parms->volume; +//rks Zid_t zid = parms->zid; +//rks STATUS status = zOK; +//rks EFLEpochMask_t activeEpochs = parms->eTree->p.activeEpochs; +//rks EFLEpochMask_t notFoundEpochs; +//rks +//rks switch (logOp) +//rks { +//rks case ELOG_OP_CLEAR: +//rks { +//rks status = ELOG_ClearLogEntries(genMsg, volume, leafEntry, +//rks activeEpochs, parms->xaction, FALSE); +//rks break; +//rks } +//rks case ELOG_OP_CREATE: +//rks { +//rks status = ELOG_AddLogEntry(genMsg, parms->xaction, leafEntry, +//rks ELOG_CREATE, activeEpochs, volume, zid, name); +//rks break; +//rks } +//rks case ELOG_OP_DELETE: +//rks { +//rks status = ELOG_AddLogEntry(genMsg, parms->xaction, leafEntry, +//rks ELOG_DELETE, parms->epochMask, volume, zid, name); +//rks break; +//rks } +//rks case ELOG_OP_MODIFY: +//rks { +//rks if ((status = ELOG_FindModifyForEpochs(genMsg, volume, leafEntry, +//rks leafEntry->modifyEpochs, ¬FoundEpochs, FALSE)) != zOK) +//rks { +//rks goto exit; +//rks } +//rks +//rks if (notFoundEpochs) +//rks { +//rks status = ELOG_AddLogEntry(genMsg, parms->xaction, leafEntry, +//rks ELOG_MODIFY, notFoundEpochs, volume, zid, name); +//rks } +//rks break; +//rks } +//rks case ELOG_OP_METADATA: +//rks if ((status = ELOG_FindModifyForEpochs(genMsg, volume, leafEntry, +//rks leafEntry->modifyEpochs, ¬FoundEpochs, TRUE)) != zOK) +//rks { +//rks goto exit; +//rks } +//rks +//rks if (notFoundEpochs) +//rks { +//rks status = ELOG_AddLogEntry(genMsg, parms->xaction, leafEntry, +//rks ELOG_MODIFY_META_DATA, notFoundEpochs, volume, zid, name); +//rks } +//rks break; +//rks +//rks case ELOG_OP_RENAME: +//rks { +//rks status = ELOG_AddLogEntry(genMsg, parms->xaction, leafEntry, +//rks ELOG_RENAME, 
activeEpochs, volume, zid, name); +//rks } +//rks case ELOG_OP_UNDELETE: +//rks case ELOG_OP_CLEAR_MODIFIES: +//rks { +//rks status = ELOG_ClearLogEntries(genMsg, volume, leafEntry, +//rks activeEpochs, parms->xaction, TRUE); +//rks break; +//rks } +//rks default: +//rks { +//rks zASSERT("Unknown EFL log operation" == NULL); +//rks } +//rks } +//rks +//rksexit: +//rks return status; +//rks} + +/**************************************************************************** + * This function is called during error recovery to get things release and + * unlatched as is needed. + * + * Input: Child and parent are latched. Tree may be latched + * + * Output: Parent, child, and tree are all unlatched. + ****************************************************************************/ +void EFL_releaseStuff( + EFLTreeParms_s *parms) +{ + EFLTreeNode_s *child = (EFLTreeNode_s *)parms->child->pBuf.data; + + /* + * NOTE: the parent is still latched when we come into this routine. + */ + if (EFL_IS_ROOT(child)) + { + EFL_UNX_LATCH_TREE(parms); + } + else + { + EFL_VALIDATE_NODE(child); + CACHE_RELEASE(parms->parent); + } + + CACHE_RELEASE(parms->child); + + return; +} + +/**************************************************************************** + * + ****************************************************************************/ +/**************************************************************************** + ** + ** Insert / modify an EFL entry. + ** + ** Note: if name is not NULL, it's null terminated. 
+ ****************************************************************************/ +STATUS ZFSVOL_VOL_insertEFLEntry( + GeneralMsg_s *genMsg, + File_s *beast, + NINT action, + unicode_t *name, + Xaction_s *xaction) +{ + Volume_s *volume = beast->FILEvolume; + Zid_t zid = beast->FILEzid; + ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + ZfsXaction_s *localXaction; + EFLTreeNode_s *node; + ELogNode_s *headerNode; + NINT index; + EFLTreeNode_s *child; + EFLLeafEntry_s *currentEntry; + Buffer_s *logHeaderBuffer; + EFLEpochMask_t activeEpochs; + BOOL directory = FALSE; + BOOL dataStream = FALSE; + BOOL extendedAttribute = FALSE; + typedef struct Stack_s { + EFLLeafEntry_s leafEntry; + EFLTreeParms_s parms; + IoMsg_s iomsg; + } Stack_s; + STACK_ALLOC(); + + ASSERT_MPKNSS_LOCK(); + zASSERT((action != EFL_FILE_STATE_RENAME) || (name && (name[0] != '\0'))); + zASSERT((action == EFL_FILE_STATE_RENAME) || !name); + + if (!eTree || !(eTree->p.activeEpochs)) + { + STACK_FREE(); + return zOK; + } + + memset(&aStack->leafEntry, 0, sizeof(EFLLeafEntry_s)); + + activeEpochs = eTree->p.activeEpochs; + aStack->parms.xaction = (ZfsXaction_s *)xaction; + aStack->parms.zid = zid; + aStack->parms.action = action; + aStack->parms.volume = volume; + aStack->parms.eTree = eTree; + + /* Determine type of object */ + if (beast->FILEattributes & zFA_SUBDIRECTORY) + { + directory = TRUE; + } + else + { + if (COMN_IsDerivedFrom(beast, zFTYPE_NAMED_DATA_STREAM)) + { + if (((NamedBeast_s *)beast)->NAMEDfirstParentNameType == zNTYPE_DATA_STREAM) + { + dataStream = TRUE; + } + else if (((NamedBeast_s *)beast)->NAMEDfirstParentNameType == zNTYPE_EXTENDED_ATTRIBUTE) + { + extendedAttribute = TRUE; + } + } + } + + if (EFLDisplay) + { + aprintf(YELLOW, "(EFL) Insert ZID:0x%Lx action:%d activeEpochs:0x%x\n", + zid, action, activeEpochs); + } + + /* + * Get an exclusive latch on eTree + */ + 
EFL_X_LATCH_TREE(&aStack->parms); + + if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO) + { /* no root to the b-tree exists */ + + localXaction = BeginXLocal(volume, BXL_DEFAULT); + + XALLOCBLK_IO_MSG(aStack->iomsg, eTree, localXaction, CACHE_UPDATE); + aStack->parms.child = ZFS_AllocPoolBlk(genMsg, &aStack->iomsg); + if (aStack->parms.child == NULL) + { + //zASSERT(0); + goto errorEndXaction; + } + node = (EFLTreeNode_s *)aStack->parms.child->pBuf.data; + aStack->parms.readBlkNum = aStack->parms.child->volBlk; + EFL_initNode(aStack->parms.child, EFL_ROOT|EFL_LEAF, + &eTree->ZFSEFLTREEroot.ROOTinternalID); + EFL_insertZeroNode((EFLTreeNode_s *)aStack->parms.child->pBuf.data); + logHeaderBuffer = ELOG_AllocateHeaderNode(genMsg, volume, + localXaction); + if (logHeaderBuffer == NULL) + { + // FixFixFix should release parms.child to avoid losing a block + goto errorEndXaction; + } + + headerNode = (ELogNode_s *)logHeaderBuffer->pBuf.data; + eTree->p.logHeaderBlock = logHeaderBuffer->volBlk; + eTree->zfsBtree.p.btRoot = aStack->parms.child->volBlk; + + COMN_MARK_BEAST_XLOCAL(&eTree->ZFSEFLTREEroot, + &localXaction->xaction); + + if (COMN_ForceBeastWrite(genMsg, &eTree->ZFSEFLTREEroot, + &localXaction->xaction) != zOK) + { + CACHE_DIRTY_RELEASE(logHeaderBuffer); + goto errorEndXaction; + } + + /* log the init record */ + ZLOG_ObtainRecord(localXaction, + ZLOG_BLOCK_INFO_SIZE(2) + sizeof(EFLTreeInit_s) ); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_INIT, localXaction, logBuffer, 2, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], aStack->parms.child->volBlk, + node->lsn, aStack->parms.child, localXaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[1], logHeaderBuffer->volBlk, + headerNode->lsn, logHeaderBuffer, localXaction, 1); + ZLOG_ALLOC_BLOCK(poolBlks[0]); + ZLOG_ALLOC_BLOCK(poolBlks[1]); + node->lsn = logBuffer->ZXR_Lsn; + headerNode->lsn = logBuffer->ZXR_Lsn; + logRecord->u.init.internalID = eTree->ZFSEFLTREEroot.ROOTinternalID; + + ZLOG_BIND(localXaction, 
aStack->parms.child); + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + CACHE_DIRTY_RELEASE(logHeaderBuffer); + } + else + { + if (EFL_findLeaf(genMsg, &aStack->parms, EFL_CHECK_FOR_SPLIT) != zOK) + { + STACK_FREE(); + return(zFAILURE); + } + } + + child = (EFLTreeNode_s *)aStack->parms.child->pBuf.data; + if (EFL_findLeafEntry(child, aStack->parms.zid, &index)) + { + STATUS status = zOK; + + /* Entry found -- modify it */ + currentEntry = &child->ELEAF(index); + aStack->leafEntry = *currentEntry; + + switch (action) + { + case EFL_FILE_STATE_CREATE_FILE: + { + zASSERT("How did we create a ZID a second time?" == NULL); + SetErrno(genMsg, zERR_EFL_ILLEGAL_STATE); + break; + } + case EFL_FILE_STATE_DELETE_FILE: + { + /* + * If there are epochs where the delete and the create are + * both in the same epoch then remove the create. Always + * remove all of the modify records for the active epochs. + */ + EFLLeafEntry_s localEntry = *currentEntry; + EFLEpochMask_t createActive; + EFLEpochMask_t deleteActive; + EFLEpochMask_t deleteEpochs; + EFLEpochMask_t deleteCreateEpochs; + NINT typesToClear; + + createActive = localEntry.createEpochs & activeEpochs; + deleteActive = localEntry.deleteEpochs & activeEpochs; + + /* Clear bits on active delete epochs so we can set them correctly */ + localEntry.deleteEpochs &= ~activeEpochs; + + /* Fix up the bits for active epochs */ + localEntry.deleteEpochs |= ~createActive & activeEpochs; + + /* Figure out which epochs need delete log records created */ + deleteEpochs = deleteActive ^ + (localEntry.deleteEpochs & activeEpochs); + + /* clear all other masks */ + localEntry.createEpochs &= ~activeEpochs; + localEntry.modifyEpochs &= ~activeEpochs; + localEntry.metaDataEpochs &= ~activeEpochs; + + /* figure out which epochs had the create removed */ + deleteCreateEpochs = ~deleteEpochs & activeEpochs; + + typesToClear = ELOG_CLEAR_MODIFY | ELOG_CLEAR_CREATE; + + /* Add in log records for deletes that need to be 
done */ + if (deleteEpochs) + { + if (ELOG_AddLogEntry(genMsg, aStack->parms.xaction, &localEntry, + ELOG_DELETE, deleteEpochs, volume, zid, name) != zOK) + { + status = zFAILURE; + break; + } + } + *currentEntry = localEntry; + + /* + * Remove log entries that are no longer needed. If the + * deleted and creation are in the same epoch then we should + * get rid of renames. + */ + if (deleteCreateEpochs == activeEpochs) + { + typesToClear |= ELOG_CLEAR_RENAME; + } + else + { + if (deleteCreateEpochs) + { + ELOG_ClearLogEntries(genMsg, volume, currentEntry, + deleteCreateEpochs, aStack->parms.xaction, ELOG_CLEAR_RENAME); + } + } + + ELOG_ClearLogEntries(genMsg, volume, currentEntry, + activeEpochs, aStack->parms.xaction, typesToClear); + + break; + } + case EFL_FILE_STATE_MODIFY_DATA: + { + EFLLeafEntry_s localEntry; + EFLEpochMask_t modifyActive; + EFLEpochMask_t notFoundEpochs; +#if NSS_DEBUG IS_ENABLED + EFLEpochMask_t notFoundEpochsComputed; +#endif + + modifyActive = ((currentEntry->modifyEpochs | + currentEntry->createEpochs)) & activeEpochs; + notFoundEpochs = modifyActive ^ activeEpochs; +#if NSS_DEBUG IS_ENABLED + ELOG_FindModifyForEpochs(genMsg, volume, currentEntry, + activeEpochs, ¬FoundEpochsComputed, FALSE); + zASSERT(notFoundEpochsComputed == notFoundEpochs); +#endif + if (notFoundEpochs) + { + localEntry = *currentEntry; + localEntry.modifyEpochs |= notFoundEpochs; + if (ELOG_AddLogEntry(genMsg, aStack->parms.xaction, &localEntry, + ELOG_MODIFY, notFoundEpochs, volume, zid, name) != zOK) + { + status = zFAILURE; + break; + } + *currentEntry = localEntry; + } + break; + } + case EFL_FILE_STATE_MODIFY_METADATA: + { + EFLLeafEntry_s localEntry; + EFLEpochMask_t metadataActive; + EFLEpochMask_t notFoundEpochs; +#if NSS_DEBUG IS_ENABLED + EFLEpochMask_t notFoundEpochsComputed; +#endif + metadataActive = ((currentEntry->modifyEpochs | + currentEntry->metaDataEpochs | currentEntry->createEpochs)) & + activeEpochs; + notFoundEpochs = metadataActive ^ 
activeEpochs; +#if NSS_DEBUG IS_ENABLED + ELOG_FindModifyForEpochs(genMsg, volume, currentEntry, + activeEpochs, ¬FoundEpochsComputed, TRUE); + zASSERT(notFoundEpochsComputed == notFoundEpochs); +#endif + if (notFoundEpochs) + { + localEntry = *currentEntry; + localEntry.metaDataEpochs |= notFoundEpochs; + if (ELOG_AddLogEntry(genMsg, aStack->parms.xaction, &localEntry, + ELOG_MODIFY_META_DATA, notFoundEpochs, volume, zid, + name) != zOK) + { + status = zFAILURE; + break; + } + *currentEntry = localEntry; + } + break; + } + case EFL_FILE_STATE_RENAME: + { + EFLEpochMask_t createActive; + EFLEpochMask_t changedEpochs; + EFLEpochMask_t renameEpochs = activeEpochs; + + createActive = currentEntry->createEpochs & activeEpochs; + if (createActive && !directory) + { + if (ELOG_ClearSameNameCreate(genMsg, volume, currentEntry, + createActive, aStack->parms.xaction, name, &changedEpochs) == zOK) + { + if (changedEpochs) + { + ELOG_AddLogEntry(genMsg, aStack->parms.xaction, currentEntry, + ELOG_CREATE, changedEpochs, volume, zid, NULL); + renameEpochs = activeEpochs ^ changedEpochs; + } + } + } + if (renameEpochs) + { + ELOG_AddLogEntry(genMsg, aStack->parms.xaction, currentEntry, + ELOG_RENAME, renameEpochs, volume, zid, name); + } + break; + } + case EFL_FILE_STATE_UNDELETE: + { + EFLLeafEntry_s localEntry = *currentEntry; + EFLEpochMask_t changedEpochs; + + zASSERT(!(localEntry.createEpochs & activeEpochs)); + changedEpochs = (localEntry.deleteEpochs ^ activeEpochs) & + activeEpochs; + localEntry.createEpochs |= changedEpochs; + localEntry.deleteEpochs &= ~changedEpochs; + if (ELOG_AddLogEntry(genMsg, aStack->parms.xaction, &localEntry, + ELOG_CREATE, changedEpochs, volume, zid, name) != zOK) + { + status = zFAILURE; + break; + } + *currentEntry = localEntry; + ELOG_ClearLogEntries(genMsg, volume, currentEntry, + changedEpochs, aStack->parms.xaction, ELOG_CLEAR_DELETE); + break; + } + } + + if (status == zOK) + { + if (EFL_doModifyEntry(genMsg, &aStack->parms, index, 
&aStack->leafEntry) != zOK) + { + goto error; + } + } + else + { + if (EFLDisplay) + { + aprintf(LRED, "(EFL) Insert. Error %d during leaf modify\n", + GetErrno(genMsg)); + } + + EFL_releaseStuff(&aStack->parms); + goto error; + } + } + else + { + NINT logOp = EFL_FILE_STATE_NO_ACTION; + + if (EFLDisplay) + { + aprintf(YELLOW, "(EFL) Insert. Create new leaf entry\n"); + } + + aStack->leafEntry.zid = zid; + + if (directory) + { + aStack->leafEntry.objectType = ELF_TYPE_DIRECTORY; + } + else if (dataStream) + { + aStack->leafEntry.objectType = ELF_TYPE_DATASTREAM; + } + else if (extendedAttribute) + { + aStack->leafEntry.objectType = ELF_TYPE_EXTENDED_ATTRIBUTE; + } + else + { + aStack->leafEntry.objectType = ELF_TYPE_FILE; + } + + switch (aStack->parms.action) + { + case EFL_FILE_STATE_CREATE_FILE: + { + aStack->leafEntry.createEpochs |= activeEpochs; + logOp = ELOG_CREATE; + break; + } + case EFL_FILE_STATE_DELETE_FILE: + { + aStack->leafEntry.deleteEpochs |= activeEpochs; + logOp = ELOG_DELETE; + break; + } + case EFL_FILE_STATE_MODIFY_DATA: + { + aStack->leafEntry.modifyEpochs |= activeEpochs; + logOp = ELOG_MODIFY; + break; + } + case EFL_FILE_STATE_MODIFY_METADATA: + { + aStack->leafEntry.metaDataEpochs |= activeEpochs; + logOp = ELOG_MODIFY_META_DATA; + break; + } + case EFL_FILE_STATE_RENAME: + { + aStack->leafEntry.metaDataEpochs |= activeEpochs; + logOp = ELOG_RENAME; + break; + } + case EFL_FILE_STATE_UNDELETE: + { + aStack->leafEntry.createEpochs |= activeEpochs; + logOp = ELOG_CREATE; + break; + } + default: + { + zASSERT("Should always have one of the actions above"==NULL); + break; + } + } + + aStack->leafEntry.firstLogEntry.blockNum = INVALID_BLK_ZERO; + aStack->leafEntry.firstLogEntry.resBlk = INVALID_BLK_ZERO; + aStack->leafEntry.firstLogEntry.offset = 0; + aStack->leafEntry.lastLogEntry.blockNum = INVALID_BLK_ZERO; + aStack->leafEntry.lastLogEntry.resBlk = INVALID_BLK_ZERO; + aStack->leafEntry.lastLogEntry.offset = 0; + + if 
(ELOG_AddLogEntry(genMsg, aStack->parms.xaction, &aStack->leafEntry, logOp, + activeEpochs, volume, zid, name) != zOK) + { + if (EFLDisplay) + { + aprintf(LRED, "(EFL) Insert. Error %d adding log entry during leaf create\n", + GetErrno(genMsg)); + } + + EFL_releaseStuff(&aStack->parms); + goto error; + } + else + { + if (EFL_doInsertEntry(genMsg, &aStack->parms, &aStack->leafEntry) != zOK) + { + goto error; + } + } + } + + STACK_FREE(); + return(zOK); + +errorEndXaction: + EndXlocal(localXaction); + EFL_UNX_LATCH_TREE(&aStack->parms); + STACK_FREE(); + return(zFAILURE); + +error: + STACK_FREE(); + return(zFAILURE); +} + +/*************************************************************************** + * + * Delete an event entry + * + ***************************************************************************/ +/**************************************************************************** + * Remove specified entry from the child record. + ****************************************************************************/ +void EFL_deleteEntry( + EFLTreeNode_s *child, + NINT index) +{ + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE(child); + + child->numRecs--; + if (index < child->numRecs) + { + memmove(&child->ELEAF(index), &child->ELEAF(index + 1), + (child->numRecs - index) * sizeof(EFLLeafEntry_s)); + } + EFL_VALIDATE_NODE(child); + return; +} + +/**************************************************************************** + * Put an delete eNode log record into the log. This is the contents before + * the eNode is deleted. 
+ * + * Input: Child record is Latched + * + * Output: Child record is Latched + ****************************************************************************/ +void EFL_logDeleteRecord( + EFLTreeParms_s *parms, + EFLLeafEntry_s *entry) +{ + EFLTreeNode_s *node = (EFLTreeNode_s *)parms->child->pBuf.data; + ZfsXasRecovery_s *logBuffer; + EFLLog_s *logRecord; + BlockInfo_s *poolBlks; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + + ZLOG_ObtainRecord(parms->xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(EFLDelete_s)); + + ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_L_DELETE, parms->xaction, logBuffer, 1, + poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->child->volBlk, node->lsn, + parms->child, parms->xaction, 0, EFL_compare); + + logRecord->u.delete.internalID = parms->eTree->ZFSEFLTREEroot.ROOTinternalID; + logRecord->u.delete.entry = *entry; + node->lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(parms->xaction, parms->child); + + ZLOG_ReleaseRecord(parms->xaction); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + return; +} + +/**************************************************************************** + * Delete the eNode and log the delete + * + * Before doing this, caller has to remove all the names. + * + * Input: The parent and child are latched. + * + * Output: The parent and child are unlatched. 
+ ****************************************************************************/ +STATUS EFL_doDeleteEntry( + GeneralMsg_s *genMsg, + EFLTreeParms_s *parms, + EFLLeafEntry_s *entry) +{ + NINT index; + EFLTreeNode_s *child; + + ASSERT_MPKNSS_LOCK(); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + +zASSERT(parms->index != -1); + if (EFL_checkForJoin(genMsg, parms) != zOK) + { + //zASSERT(0); + return(zFAILURE); + } + + child = (EFLTreeNode_s *)parms->child->pBuf.data; + if (!EFL_IS_ROOT(child)) + { + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + } + + if(!(EFL_findLeafEntry(child, parms->zid, &index))) + { /* not found */ + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + goto errorExit; + } + + EFL_logDeleteRecord(parms, entry); + EFL_deleteEntry(child, index); + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + CACHE_DIRTY_RELEASE(parms->child); + + return(zOK); + +errorExit: + //zASSERT(0); + CACHE_DIRTY_RELEASE(parms->child); + return(zFAILURE); +} + +/**************************************************************************** + * Delete the given epoch entry + ****************************************************************************/ +STATUS ZFSVOL_VOL_deleteEFLEntry( + GeneralMsg_s *genMsg, + Volume_s *volume, + Zid_t zid, + QUAD id1, + QUAD id2, + EFLEpoch_t *epoch, + Xaction_s *xaction) +{ + ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree; + EFLTreeParms_s parms; + EFLTreeNode_s *child; + EFLLeafEntry_s oldEntry; + EFLLeafEntry_s *currentEntry; + NINT index; + EFLEpochMask_t epochMask; + Blknum_t block = id1; + NINT offset = id2; + + ASSERT_MPKNSS_LOCK(); + + if (zid < zROOTDIR_ZID) + { + return zOK; + } + + if (!eTree) + { + return zOK; + } + + /* + * Return with an error if we have a bad ZID + */ + if (zid == 0) + { + zASSERT("Attempting to remove the zero ZID from the eflTree" == NULL); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + return(zFAILURE); + } + + if (EFLDisplay) + 
{ + aprintf(YELLOW, "(EFL) Delete ZID:0x%Lx id1:%d id2:%d\n", + zid, id1, id2); + } + + /* + * Get an exclusive latch on eTree + */ + + parms.xaction = (ZfsXaction_s *)xaction; + parms.volume = volume; + parms.zid = zid; + parms.eTree = eTree; + + X_LATCH(&eTree->ZFSEFLTREEbeastLatch); + + if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO) + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + return zFAILURE; + } + + if (EFL_findLeaf(genMsg, &parms, EFL_CHECK_FOR_JOIN) != zOK) + { + return zFAILURE; + } + + child = (EFLTreeNode_s *)parms.child->pBuf.data; + if (EFL_findLeafEntry(child, parms.zid, &index)) + { + NINT operation; + + currentEntry = &child->ELEAF(index); + oldEntry = *currentEntry; + + epochMask = EFL_GET_EPOCH_MASK(eTree, epoch); + + /* Get the log entry */ + + if (block == INVALID_BLK_ZERO) + { + if (ELOG_ClearLogEntries(genMsg, volume, currentEntry, + epochMask, parms.xaction, ELOG_CLEAR_ALL) != zOK) + { + EFL_releaseStuff(&parms); + return zFAILURE; + } + + currentEntry->createEpochs &= ~epochMask; + currentEntry->deleteEpochs &= ~epochMask; + currentEntry->modifyEpochs &= ~epochMask; + currentEntry->metaDataEpochs &= ~epochMask; + + if (currentEntry->numGoodLogEntries > 0) + { + if (EFL_doModifyEntry(genMsg, &parms, index, &oldEntry) != zOK) + { + return zFAILURE; + } + } + else + { + if (EFL_doDeleteEntry(genMsg, &parms, &oldEntry) != zOK) + { + return zFAILURE; + } + } + } + else + { + if (ELOG_ClearALogEntry(genMsg, volume, currentEntry, block, offset, + epochMask, parms.xaction, &operation) != zOK) + { + EFL_releaseStuff(&parms); + return zFAILURE; + } + + switch (operation) + { + case ELOG_CREATE: + { + currentEntry->createEpochs &= ~epochMask; + break; + } + case ELOG_DELETE: + { + currentEntry->deleteEpochs &= ~epochMask; + break; + } + case ELOG_MODIFY: + { + currentEntry->modifyEpochs &= ~epochMask; + break; + } + case ELOG_MODIFY_META_DATA: + { + currentEntry->metaDataEpochs &= ~epochMask; + break; + } + 
} + if (EFL_doModifyEntry(genMsg, &parms, index, &oldEntry) != zOK) + { + return zFAILURE; + } + } + } + else + { + EFL_releaseStuff(&parms); + return zFAILURE; + } + + return zOK; +} + +/**************************************************************************** + * + * Find an Entry + * + ****************************************************************************/ + + +/**************************************************************************** + * Decend the tree until at a leaf + ****************************************************************************/ +STATUS EFL_getLeaf( + GeneralMsg_s *genMsg, + Zid_t zid, + ZfsVolume_s *volume, + EFLTreeParms_s *parms) +{ + NINT dummy; + IoMsg_s iomsg; + Blknum_t poolBlk; + + ASSERT_MPKNSS_LOCK(); + + while (!EFL_IS_LEAF((EFLTreeNode_s *)parms->child->pBuf.data)) + { + parms->parent = parms->child; + poolBlk = EFL_findChildBlock((EFLTreeNode_s *)parms->parent->pBuf.data, zid, &dummy); + + READBLK_IO_MSG(iomsg, volume->eflTree, poolBlk, CACHE_READ); + SET_DEBUG_ID(iomsg, 7); + parms->child = EFL_ReadPoolBlk(genMsg, &iomsg); + + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + if (parms->child == NULL) + { + //zASSERT(0); + return(zFAILURE); + } + EFL_VALIDATE_NODE((EFLTreeNode_s *)parms->child->pBuf.data); + } + return(zOK); +} + +/**************************************************************************** + * Find an EFL entry + * + * return: Returns the info for the entry + * + ****************************************************************************/ +STATUS ZFSVOL_VOL_lookupEFLEntry( + GeneralMsg_s *genMsg, + Volume_s *volume, + Zid_t zid, + EFLEpoch_t *epoch, + EFLEntry_s *entry) +{ + /* Currently not needed for anything */ + + return zFAILURE; +} + +/**************************************************************************** + * This function fills an array of EFLEntry_s entries from the EFL log. + * It starts at the "index" found in the nextEntry structure. 
+ *
+ * Only log entries whose epoch mask intersects the mask of targetEpoch are
+ * copied out.  The tree latch is held (shared) just long enough to find the
+ * first entry; the copy loop itself runs unlatched.
+ *
+ * genMsg             - message context for block reads / name lookups
+ * volume             - volume whose EFL log is enumerated
+ * targetEpoch        - epoch the caller is interested in
+ * maxEntries         - capacity of retEntries
+ * nextEntry          - in: firstTime, or the (block, offset) to resume at
+ *                      out: done flag and the (block, offset) to resume at
+ * retEntries         - receives up to maxEntries entries
+ * numEntriesReturned - receives the number of entries stored
+ *
+ * return: zOK (also when there is no EFL tree or no log), zFAILURE when the
+ *         log header block cannot be read.
+ ****************************************************************************/
+STATUS ZFSVOL_VOL_enumerateEFL(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume,
+	EFLEpoch_t *targetEpoch,
+	NINT maxEntries,
+	EFLEntryIndex_s *nextEntry,			/* inout */
+	EFLEntry_s *retEntries,				/* out */
+	NINT *numEntriesReturned)			/* out */
+{
+	ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+	EFLEpochMask_t wantedMask;
+	Buffer_s *logBuf;
+	Buffer_s *hdrBuf;
+	ELogNode_s *logNode;
+	ELogEntry_s *logEntry;
+	EFLEntry_s *out;
+	IoMsg_s iomsg;
+	Blknum_t block;
+	NINT offset;
+	NINT nameObjType;
+
+	*numEntriesReturned = 0;
+
+	/* no EFL tree on this volume: nothing to report */
+	if (!eTree)
+	{
+		nextEntry->done = TRUE;
+		return zOK;
+	}
+
+	S_LATCH(&eTree->ZFSEFLTREEbeastLatch);		/* Latch the tree */
+
+	/* a tree without a log header has no log entries either */
+	if (eTree->p.logHeaderBlock == INVALID_BLK_ZERO)
+	{
+		UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+		nextEntry->done = TRUE;
+		return zOK;
+	}
+
+	wantedMask = EFL_GET_EPOCH_MASK(eTree, targetEpoch);
+	if (!nextEntry->firstTime)
+	{
+		/* resume where the previous call stopped */
+		logBuf = ELOG_GetFirstGoodEntry(genMsg, eTree,
+				nextEntry->u.zlss.block, nextEntry->u.zlss.offset,
+				&block, &offset);
+	}
+	else
+	{
+		nextEntry->firstTime = FALSE;
+
+		/* first call: start with the block that follows the log header */
+		READBLK_IO_MSG(iomsg, eTree, eTree->p.logHeaderBlock, CACHE_READ);
+		hdrBuf = ELOG_ReadPoolBlk(genMsg, &iomsg);
+		if (hdrBuf == NULL)
+		{
+			zASSERT("Error getting header block in EFL enumerate" == NULL);
+			UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+			return zFAILURE;
+		}
+		logNode = (ELogNode_s *)hdrBuf->pBuf.data;
+		logBuf = ELOG_GetFirstGoodEntry(genMsg, eTree, logNode->nextBlock, 0,
+				&block, &offset);
+		CACHE_RELEASE(hdrBuf);
+	}
+	UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+
+	for (; (logBuf != NULL) && (*numEntriesReturned < maxEntries);
+		logBuf = ELOG_GetNextGoodEntry(genMsg, eTree, logBuf, offset,
+				&block, &offset))
+	{
+		logNode = (ELogNode_s *)logBuf->pBuf.data;
+		logEntry = (ELogEntry_s *)&logNode->u.log.data[offset];
+		if (!(wantedMask & logEntry->epochs))
+		{
+			continue;		/* not recorded for the requested epoch */
+		}
+
+		/* fill the next free output slot */
+		out = &retEntries[*numEntriesReturned];
+		out->zid = logEntry->zid;
+		out->id1 = block;
+		out->id2 = offset;
+
+		if (logEntry->state & ELOG_ENTRY_STATE_DIRECTORY)
+		{
+			out->objectType = EFL_DIRECTORY;
+		}
+		else if (logEntry->state & ELOG_ENTRY_STATE_DATASTREAM)
+		{
+			out->objectType = EFL_DATASTREAM;
+		}
+		else if (logEntry->state & ELOG_ENTRY_STATE_EXTENDED_ATTRIBUTE)
+		{
+			out->objectType = EFL_EXTENDED_ATTRIBUTE;
+		}
+		else
+		{
+			out->objectType = EFL_FILE;
+		}
+		out->action = logEntry->action;
+
+		/*
+		 * Names recorded at operation time: a rename carries the old name
+		 * at oldNameIndex and the new name at the front; every other
+		 * action carries a single name and no new name.
+		 */
+		if (logEntry->action == ELOG_RENAME)
+		{
+			zASSERT(logEntry->oldNameIndex != (WORD)(-1) );
+			unicpy(out->name, &logEntry->names[logEntry->oldNameIndex]);
+			unicpy(out->newName, logEntry->names);
+		}
+		else
+		{
+			zASSERT(logEntry->oldNameIndex == (WORD)(-1));
+			unicpy(out->name, logEntry->names);
+			out->newName[0] = L'\0';
+		}
+
+		/*
+		 * The current name tells the requester where the content lives
+		 * now.  A deleted object is not looked up; a failed lookup leaves
+		 * the current name empty as well.
+		 */
+		if ((logEntry->action == ELOG_DELETE) ||
+			(EFL_GetNameFromZid(genMsg, volume, logEntry->zid,
+				EFL_GET_NAME_SPACE_ID(volume), out->realName,
+				&nameObjType) != zOK))
+		{
+			out->realName[0] = L'\0';
+		}
+		(*numEntriesReturned)++;
+	}
+
+	if (logBuf == NULL)
+	{
+		/* ran off the end of the log */
+		nextEntry->done = TRUE;
+	}
+	else
+	{
+		/* output is full: remember the entry we stopped in front of */
+		CACHE_RELEASE(logBuf);
+		nextEntry->done = FALSE;
+		nextEntry->u.zlss.block = block;
+		nextEntry->u.zlss.offset = offset;
+	}
+
+	return zOK;
+}
+
+/****************************************************************************
+ * Find an EFL entry
+ *
+ * Reads the root of the EFL tree under a shared latch, descends to the
+ * leaf that covers "zid" and copies the matching leaf entry to the caller.
+ *
+ * genMsg   - message context; receives zERR_ZID_NOT_FOUND when the volume
+ *            has no EFL tree, the tree has no root, or the leaf holds no
+ *            entry for "zid"
+ * volume   - volume to search (treated as a ZfsVolume_s)
+ * zid      - ZID to look up
+ * retEntry - receives a copy of the leaf entry on success
+ *
+ * return: zOK with *retEntry filled in (the exact entry), else zFAILURE
+ *
+ ****************************************************************************/
+STATUS EFL_GetEFLEntry(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume,
+	Zid_t zid,
+	EFLLeafEntry_s *retEntry)
+{
+	ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+	EFLTreeParms_s parms;
+	IoMsg_s iomsg;
+	EFLTreeNode_s *child;
+	NINT i;
+
+	ASSERT_MPKNSS_LOCK();
+
+	/*
+	 * A volume may have no EFL tree at all (the enumerate routines in
+	 * this file test for that too); latching through a NULL tree would
+	 * fault, so report it the same way as an empty tree.
+	 */
+	if (eTree == NULL)
+	{
+		SetErrno(genMsg, zERR_ZID_NOT_FOUND);
+		return zFAILURE;
+	}
+
+	S_LATCH(&eTree->ZFSEFLTREEbeastLatch);		/* Latch the tree */
+	if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO)
+	{
+		UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+		SetErrno(genMsg, zERR_ZID_NOT_FOUND);
+		return zFAILURE;
+	}
+
+	/* read the root node */
+	READBLK_IO_MSG(iomsg, eTree, eTree->zfsBtree.p.btRoot, CACHE_READ);
+	SET_DEBUG_ID(iomsg, 8);
+	parms.child = EFL_ReadPoolBlk(genMsg, &iomsg);
+
+	UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+	if (parms.child == NULL)
+	{
+		//zASSERT(0);
+		return zFAILURE;
+	}
+	EFL_VALIDATE_NODE((EFLTreeNode_s *)parms.child->pBuf.data);
+
+	/* descend the tree to the leaf (on failure no buffer is left held) */
+	if (EFL_getLeaf(genMsg, zid, (ZfsVolume_s *)volume, &parms) != zOK)
+	{
+		return zFAILURE;
+	}
+	zASSERT(EFL_IS_LEAF((EFLTreeNode_s *)parms.child->pBuf.data));
+
+	child = (EFLTreeNode_s *)parms.child->pBuf.data;
+	if (!EFL_findLeafEntry(child, zid, &i))
+	{
+		SetErrno(genMsg, zERR_ZID_NOT_FOUND);
+		CACHE_RELEASE(parms.child);
+		return zFAILURE;
+	}
+
+	/* hand back a copy so the leaf buffer can be released right away */
+	memcpy(retEntry, &child->ELEAF(i), sizeof(EFLLeafEntry_s));
+
+	CACHE_RELEASE(parms.child);
+	return zOK;
+}
+
+/****************************************************************************
+ * Return a buffer of Zids from the EFL tree.  LastZidReturned is the
+ * ZID of the last entry returned (it is the input to the next call).
+ * Initially it is set to zero.  The returned entries either match the
+ * requested epoch or have an epoch mask of zero.
+ *
+ * genMsg              - message context for block reads
+ * volume              - volume whose EFL tree is scanned
+ * targetEpoch         - epoch whose entries are wanted
+ * numEntriesRequested - capacity of zids
+ * lastZidReturned     - in: scan resumes after this ZID
+ *                       out: last ZID stored in zids (unchanged if none)
+ * zids                - receives the ZIDs
+ * numEntriesReturned  - receives the number of ZIDs stored
+ *
+ * return: zOK (also when there is no tree or no root), zFAILURE when a
+ *         block cannot be read; entries gathered before the failure are
+ *         still reported through zids / numEntriesReturned.
+ ****************************************************************************/
+STATUS EFL_enumerateTree(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume,
+	EFLEpoch_t *targetEpoch,
+	NINT numEntriesRequested,
+	Zid_t *lastZidReturned,			/* inout */
+	Zid_t *zids,					/* out */
+	NINT *numEntriesReturned)		/* out */
+{
+	STATUS status;
+	ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+	EFLTreeParms_s parms;
+	EFLTreeNode_s *node;
+	EFLLeafEntry_s *currentEntry;
+	IoMsg_s iomsg;
+	NINT nextIndex;
+	NINT entryIndex;
+	EFLEpochMask_t epochMask;
+	EFLEpochMask_t targetEpochMask;
+
+	/* no EFL tree on this volume: nothing to enumerate */
+	if (!eTree)
+	{
+		*numEntriesReturned = 0;
+		return zOK;
+	}
+
+	entryIndex = 0;
+	status = zOK;
+
+	parms.xaction = 0;
+	parms.eTree = eTree;
+
+	S_LATCH(&eTree->ZFSEFLTREEbeastLatch);		/* Latch the tree */
+	if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO)
+	{
+		UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+		goto done;
+	}
+
+	/*
+	 * Find the next beast after *lastZidReturned to return.
+	 * For this, we start at the root.
+	 */
+	/* read the root node */
+	READBLK_IO_MSG(iomsg, eTree, eTree->zfsBtree.p.btRoot, CACHE_READ);
+	parms.child = EFL_ReadPoolBlk(genMsg, &iomsg);
+
+	UNS_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+	if (parms.child == NULL)
+	{
+		status = zFAILURE;
+		goto done;
+	}
+	node = (EFLTreeNode_s *)parms.child->pBuf.data;
+	EFL_VALIDATE_NODE(node);
+
+	/* descend the tree to the leaf (on failure no buffer is left held) */
+	if ((status = EFL_getLeaf(genMsg, *lastZidReturned, (ZfsVolume_s *)volume,
+			&parms)) != zOK)
+	{
+		goto done;
+	}
+
+	node = (EFLTreeNode_s *)parms.child->pBuf.data;
+	zASSERT(EFL_IS_LEAF(node));
+	if (!EFL_findLeafEntry(node, *lastZidReturned, &nextIndex))
+	{
+		/* no exact match: cancel the skip below */
+		nextIndex--;
+	}
+	nextIndex++;	/* Skip the last matched entry */
+
+	/*
+	 * Return the ZIDs of next numEntriesRequested after (*lastZidReturned)
+	 * by sequentially scanning the EFL tree's leaf nodes
+	 *
+	 * Sequentially scan the linked list of leaf nodes for entries to return
+	 * (starting at the entry at (parms.child, nextIndex)).
+	 */
+	targetEpochMask = EFL_GET_EPOCH_MASK(eTree, targetEpoch);
+	for (; entryIndex < numEntriesRequested; )
+	{
+		/* Skip to a leaf node with entries to process */
+		if (nextIndex >= node->numRecs)
+		{
+			/* Jump to the next leaf node */
+			Buffer_s *nextbuf;
+			if (node->n.leaf.nextLeaf == INVALID_BLK_ZERO ||
+				node->n.leaf.nextLeaf == 0)
+			{
+				break;
+			}
+
+			READBLK_IO_MSG(iomsg, eTree, node->n.leaf.nextLeaf, CACHE_READ);
+			nextbuf = EFL_ReadPoolBlk(genMsg, &iomsg);
+			if (nextbuf == NULL)
+			{
+				status = zFAILURE;
+				break;
+			}
+			CACHE_RELEASE(parms.child);
+			parms.child = nextbuf;
+			node = (EFLTreeNode_s *)parms.child->pBuf.data;
+			nextIndex = 0;
+
+			/*
+			 * Re-run the bounds check: if the leaf just loaded holds no
+			 * records, ELEAF(0) below would not be a valid entry.
+			 */
+			continue;
+		}
+
+		currentEntry = &node->ELEAF(nextIndex);
+		/* Add the beast's ZID to the beastZids list */
+		zids[entryIndex] = currentEntry->zid;
+
+		/* only return if the epoch matches */
+		epochMask = currentEntry->createEpochs | currentEntry->modifyEpochs |
+					currentEntry->metaDataEpochs | currentEntry->deleteEpochs;
+
+		/*
+		 * Keep the slot when the entry belongs to the target epoch, or
+		 * when none of its epochs is in use any more; otherwise the slot
+		 * is overwritten by the next candidate.
+		 */
+		if ((epochMask & eTree->p.usedEpochs) == 0 ||
+			epochMask & targetEpochMask)
+		{
+			++entryIndex;
+		}
+
+		++nextIndex;
+	}
+	CACHE_RELEASE(parms.child);
+
+done:
+	if (entryIndex > 0)
+	{
+		*lastZidReturned = zids[entryIndex - 1];
+	}
+	*numEntriesReturned = entryIndex;
+	return status;
+}
+
+/************************************************************************
+ *
+ * Force write the beast
+ *
+ * Marks the EFL tree beast in a local transaction and forces it to be
+ * written, all under the beast's exclusive latch.
+ *
+ * return: the status of COMN_ForceBeastWrite()
+ *
+ * NOTE: This function will xlatch the EFL tree beast
+ *
+ ************************************************************************/
+STATUS EFL_writeTheBeast(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume,
+	ZfsEFLTreeBeast_s *eTree)
+{
+	ZfsXaction_s *localXaction;
+	STATUS status;
+
+	/* force the beast out */
+	X_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+	localXaction = BeginXLocal(volume, BXL_DEFAULT);
+	COMN_MARK_BEAST_XLOCAL(&eTree->ZFSEFLTREEroot,
+			&localXaction->xaction);
+	status = COMN_ForceBeastWrite(genMsg,
+			&eTree->ZFSEFLTREEroot, &localXaction->xaction);
+	EndXlocal(localXaction);
+	UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+	return status;
+}
+
+/****************************************************************************
+ * Remove all entries for an epoch
+ *
+ * Clears the epoch's "active" bit, records a purge log entry, deletes the
+ * epoch's tree entries in batches of MAX_EFL_ENTRIES_PER_SCOOP, then clears
+ * the "used" bit, cleans up the log blocks and removes the purge log entry.
+ * The volume is held (use count and active lock) for the duration.
+ *
+ * return: zOK or zFAILURE; zERR_EFL_EPOCH_NOT_FOUND / zERR_NO_MEMORY are
+ *         set in genMsg for the corresponding failures.
+ ****************************************************************************/
+STATUS eflRemoveEpoch(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume,
+	EFLEpoch_t *epoch)
+{
+#define MAX_EFL_ENTRIES_PER_SCOOP 50
+	ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+	EFLEpochMask_t epochMask;
+	NINT numEntriesObtained;
+	Zid_t lastZidReturned;
+	Zid_t *zids = NULL;
+	ZfsXaction_s *xaction;
+	NINT i;
+	Volume_s *tmpVolume;
+	STATUS status = zFAILURE;
+
+	struct purgeLogInfo_s
+	{
+		PurgeLogMsg_s purgeLogMsg;
+		LONG purgeLogLoc[MAX_PLOG_LOCATION_SIZE];
+	};
+	struct purgeLogInfo_s *purgeLogInfo = NULL;
+
+
+	COMN_USE_BEAST(&volume->VOLroot);
+	if (COMN_LockVolumeActive(genMsg, volume, FALSE) != zOK)
+	{
+		goto exitRelease;
+	}
+
+	if (!findEpochIndex(eTree, epoch, &i))
+	{
+		errPrintf(WHERE, Module, 0,
+				MSG("Epoch to be removed was not found", 
608));
+		SetErrno(genMsg, zERR_EFL_EPOCH_NOT_FOUND);
+		goto exitUnlock;
+	}
+	/* stop recording for this epoch before its entries are removed */
+	epochMask = 1 << i;
+	eTree->p.activeEpochs &= ~epochMask;
+	(void)EFL_writeTheBeast(genMsg, volume, eTree);
+
+	/* allocate memory for return data */
+	zids = malloc(MAX_EFL_ENTRIES_PER_SCOOP * sizeof(Zid_t));
+	if (!zids)
+	{
+		errPrintf(WHERE, Module, 0,
+				MSG("Error allocating memory for remove epoch operation\n", 609));
+		SetErrno(genMsg, zERR_NO_MEMORY);
+		goto exitUnlock;
+	}
+
+	/* set up purge log */
+	purgeLogInfo = zalloc(sizeof(struct purgeLogInfo_s));
+	if (purgeLogInfo == NULL)
+	{
+		errPrintf(WHERE, Module, 0,
+				MSG("Error allocating memory for remove epoch operation\n", 610));
+		SetErrno(genMsg, zERR_NO_MEMORY);
+		goto exitFree;
+	}
+
+	/* record the removal in the purge log before touching the tree */
+	SETUP_REMOVE_EFL_EPOCH_LOG(&purgeLogInfo->purgeLogMsg,
+			(void *)&purgeLogInfo->purgeLogLoc, volume, epoch);
+	xaction = BeginXLocal(volume, BXL_DEFAULT);
+	status = volume->VOLcomnVolOps.VOL_addPurgeLogEntry(
+			genMsg, volume, PLOG_EFL_REMOVE_EPOCH,
+			&purgeLogInfo->purgeLogMsg, (Xaction_s *)xaction);
+	EndXlocal(xaction);
+	if (status != zOK)
+	{
+		goto exitFree;
+	}
+
+	/* delete the epoch's entries one batch at a time */
+	lastZidReturned = zINVALID_ZID;
+	for(;;)
+	{
+		/*
+		 * NOTE(review): status is still zOK (from the purge log add) on
+		 * this path, so an interrupted removal is reported as success --
+		 * confirm that is intended.
+		 */
+		if (volume->v_statusFlag & VOL_SF_LEAVING_ACTIVE_STATE_CLEANUP)
+		{
+			goto exitFree;		/* do not remove the purge log entry */
+		}
+		/*
+		 * NOTE(review): an enumeration failure leaves the loop exactly
+		 * like "no more entries": the used bit is cleared, the purge log
+		 * entry is removed and zOK is returned.  Verify this best-effort
+		 * behaviour is intended.
+		 */
+		if (EFL_enumerateTree(genMsg, volume,
+				epoch, MAX_EFL_ENTRIES_PER_SCOOP, &lastZidReturned,
+				zids, &numEntriesObtained) != zOK)
+		{
+			break;
+		}
+
+		if (numEntriesObtained == 0)
+		{
+			break;
+		}
+
+		/* one local transaction per deleted entry */
+		for (i = 0; i < numEntriesObtained; ++i)
+		{
+			xaction = BeginXLocal(volume, BXL_DEFAULT | X_CF_OK_TO_THROTTLE);
+			status = ZFSVOL_VOL_deleteEFLEntry(genMsg, volume, zids[i],
+					INVALID_BLK_ZERO, 0, epoch, &xaction->xaction);
+
+			EndXlocal(xaction);
+			if (status != zOK)
+			{
+				errPrintf(WHERE, Module, 0,
+					MSG("Error removing epoch, status = %d\n", 611), GetErrno(genMsg));
+
+				goto exitRemovePurgeLog;
+			}
+			lastZidReturned = zids[i];
+		}
+		LB_delay(1);	/* delay so we don't fill the journal */
+	}
+
+	/* the epoch slot is free again; results of these writes are ignored */
+	eTree->p.usedEpochs &= ~epochMask;
+	(void)EFL_writeTheBeast(genMsg, volume, eTree);
+	(void)ELOG_CleanupBlocks(genMsg, volume, TRUE);
+
+	status = zOK;
+
+	/* cleanup ladder: each label undoes one more acquisition */
+exitRemovePurgeLog:
+	xaction = BeginXLocal(volume, BXL_DEFAULT);
+	volume->VOLcomnVolOps.VOL_removePurgeLogEntry(
+			genMsg, volume, PLOG_EFL_REMOVE_EPOCH,
+			&purgeLogInfo->purgeLogMsg, (Xaction_s *)xaction);
+	EndXlocal(xaction);
+exitFree:
+	if (zids)
+	{
+		free(zids);
+	}
+	if (purgeLogInfo)
+	{
+		free(purgeLogInfo);
+	}
+exitUnlock:
+	COMN_UnlockVolumeActive(volume, FALSE);
+exitRelease:
+	/* COMN_Release() takes the address of a pointer, hence the copy */
+	tmpVolume = volume;
+	COMN_Release(&tmpVolume);
+
+	return status;
+}
+
+
+/****************************************************************************
+ * Thread to get rid of all entries for an epoch.
+ *
+ * Work routine scheduled by ZFSVOL_VOL_administerEFL().  Runs
+ * eflRemoveEpoch() for the volume/epoch stored in "fsm" with a private
+ * GeneralMsg_s, ignores its status, and frees "fsm" (so ownership of the
+ * request passes to this routine).
+ *
+ * fsm    - heap-allocated request: volume and epoch to remove
+ * unused - not referenced
+ ****************************************************************************/
+void EFL_RemoveEpochThread(
+	EFLRemoveEpochData_s *fsm,
+	void *unused)
+{
+	GeneralMsg_s genMsg;
+
+	COMN_SETUP_GENERAL_MSG_NOSA(&genMsg);
+
+	eflRemoveEpoch(&genMsg, fsm->volume, &fsm->epoch);
+	free(fsm);
+	return;
+}
+
+/****************************************************************************
+ ****************************************************************************
+ *
+ * Administer the EFL
+ *
+ * Dispatches on "opcode" (VOL_EFL_ADMIN_* / VOL_EFL_GET_STATUS).
+ *
+ * genMsg  - message context, receives the error code on failure
+ * volume  - volume being administered
+ * opcode  - operation to perform
+ * data    - operation input; may be NULL for VOL_EFL_ADMIN_CHANGE_EPOCH
+ *           (then no epoch is stopped)
+ * retData - operation output; may be NULL for VOL_EFL_ADMIN_CHANGE_EPOCH
+ *           (then no epoch is started)
+ *
+ * return: zOK or zFAILURE
+ *
+ ****************************************************************************
+ ****************************************************************************/
+STATUS ZFSVOL_VOL_administerEFL(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume,
+	NINT opcode,
+	EFLAdminIn_s *data,
+	EFLAdminOut_s *retData)
+{
+	ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+	LONG bit;
+	STATUS status = zOK;
+	NINT numEntries;
+	EFLEpoch_t *epochList;
+	EFLEpochMask_t epochMask;
+#if NSS_DEBUG IS_ENABLED
+	STATIC NINT fsmInstance = 0;
+#endif
+
+	/* the epoch lists below are built by walking the bits of a LONG */
+	zASSERT(sizeof(LONG) == sizeof(EFLEpochMask_t));
+
+	if (!eTree)
+	{
+		/* somebody wants to start a EFL epoch, or set EFL default
+		 * name space, it's a good time to create EFL tree if it doesn't exist
+		 */
+		if (((opcode == VOL_EFL_ADMIN_CHANGE_EPOCH) && (retData != NULL)) ||
+			(opcode == VOL_EFL_ADMIN_SET_NAME_SPACE_ID) ||
+			(opcode == VOL_EFL_ADMIN_SET_INACTIVE_EPOCH_INTERVAL))
+		{
+			eTree = EFL_getEFL(genMsg, (ZfsVolume_s *)volume);
+			if (!eTree)
+			{
+				return zFAILURE;
+			}
+		}
+		else
+		{
+			/* no tree and no reason to create one: answer with defaults */
+			if (opcode == VOL_EFL_GET_STATUS)
+			{
+				retData->u.active = FALSE;
+				return zOK;
+			}
+			else if (opcode == VOL_EFL_ADMIN_LIST_EPOCHS)
+			{
+				retData->u.listEpochs.numActiveEpochs = 0;
+				retData->u.listEpochs.activeEpochs = NULL;
+				retData->u.listEpochs.numUsedEpochs = 0;
+				retData->u.listEpochs.usedEpochs = NULL;
+
+				return zOK;
+			}
+			else if (opcode == VOL_EFL_ADMIN_GET_NAME_SPACE_ID)
+			{
+				retData->u.nameSpaceID = EFL_DEFAULT_NAMESPACE;
+
+				return zOK;
+			}
+			else if (((opcode == VOL_EFL_ADMIN_CHANGE_EPOCH) && (data != NULL)) ||
+				(opcode == VOL_EFL_ADMIN_REMOVE_EPOCH) ||
+				(opcode == VOL_EFL_ADMIN_RESET_EFL) ||
+				(opcode == VOL_EFL_ADMIN_GET_INACTIVE_EPOCH_INTERVAL) ||
+				(opcode == VOL_EFL_ADMIN_PING_EPOCH) ||
+				(opcode == VOL_EFL_ADMIN_CHECK_EPOCH))
+			{
+				/* nothing to do without a tree */
+				return zOK;
+			}
+			else
+			{
+				return zFAILURE;
+			}
+		}
+	}
+
+	switch(opcode)
+	{
+		case VOL_EFL_ADMIN_CHANGE_EPOCH:
+		{
+			/* stop the current active epoch and start a epoch */
+			if (data != NULL)
+			{	/* stop the specified epoch */
+				epochMask = EFL_GET_EPOCH_MASK(eTree, &data->u.changeEpoch.epoch);
+				if (!epochMask)
+				{
+					SetErrno(genMsg, zERR_EFL_EPOCH_NOT_FOUND);
+					return zFAILURE;
+				}
+				eTree->p.activeEpochs &= ~epochMask;
+				status = EFL_writeTheBeast(genMsg, volume, eTree);
+			}
+			if (retData != NULL)
+			{
+				/* start a new epoch */
+				if (status == zOK)
+				{
+					status = createEFLEpoch(genMsg, volume,
+							&retData->u.changeEpoch.epoch);
+				}
+			}
+			/* every active epoch must also be marked used */
+			zASSERT((eTree->p.usedEpochs | eTree->p.activeEpochs) == eTree->p.usedEpochs);
+			return status;
+		}
+		case VOL_EFL_ADMIN_REMOVE_EPOCH:
+		{
+			EFLRemoveEpochData_s *fsm;
+
+			/* the request is freed by EFL_RemoveEpochThread() */
+			fsm = malloc(sizeof(EFLRemoveEpochData_s));
+			if (fsm == NULL)
+			{
+				SetErrno(genMsg, zERR_NO_MEMORY);
+				return zFAILURE;
+			}
+
+			fsm->volume = volume;
+			fsm->epoch = data->u.removeEpoch.epoch;
+
+			FSMLITE_INIT(&fsm->fsm, "FSM for removing EFL epochs",
+					++fsmInstance);
+			WORK_Schedule((FsmLite_s *)fsm, EFL_RemoveEpochThread, NULL);
+			zASSERT((eTree->p.usedEpochs | eTree->p.activeEpochs) == eTree->p.usedEpochs);
+			return zOK;
+		}
+		case VOL_EFL_ADMIN_RESET_EFL:
+		{
+			/* This is done on COMMON layer instead */
+			zASSERT("We shouldn't be here" == NULL);
+			break;
+		}
+		case VOL_EFL_ADMIN_LIST_EPOCHS:
+		{
+			NINT index;
+
+			/*
+			 * Both lists are heap allocated and handed to the caller
+			 * through retData; nothing in this routine frees them on
+			 * success.
+			 */
+
+			/* Do the active epochs */
+			epochList = malloc(sizeof(EFLEpoch_t) * 32);
+			if (!epochList)
+			{
+				SetErrno(genMsg, zERR_NO_MEMORY);
+				return zFAILURE;
+			}
+			numEntries = 0;
+			for (bit = 1, index = 0; bit != 0; bit <<= 1, index++)
+			{
+				if (bit & eTree->p.activeEpochs)
+				{
+					epochList[numEntries++] = eTree->p.epochs[index].epoch;
+				}
+			}
+			retData->u.listEpochs.activeEpochs = epochList;
+			retData->u.listEpochs.numActiveEpochs = numEntries;
+
+			/* Do the used epochs */
+			epochList = malloc(sizeof(EFLEpoch_t) * 32);
+			if (!epochList)
+			{
+				/*
+				 * Do not leak the active list, and do not leave a
+				 * half-filled result behind on failure.
+				 */
+				free(retData->u.listEpochs.activeEpochs);
+				retData->u.listEpochs.activeEpochs = NULL;
+				retData->u.listEpochs.numActiveEpochs = 0;
+				SetErrno(genMsg, zERR_NO_MEMORY);
+				return zFAILURE;
+			}
+			numEntries = 0;
+			for (bit = 1, index = 0; bit != 0; bit <<= 1, index++)
+			{
+				if (bit & eTree->p.usedEpochs)
+				{
+					epochList[numEntries++] = eTree->p.epochs[index].epoch;
+				}
+			}
+			retData->u.listEpochs.usedEpochs = epochList;
+			retData->u.listEpochs.numUsedEpochs = numEntries;
+			return zOK;
+		}
+		case VOL_EFL_ADMIN_GET_NAME_SPACE_ID:
+		{
+			retData->u.nameSpaceID = EFL_GET_NAME_SPACE_ID(volume);
+			return zOK;
+		}
+		case VOL_EFL_ADMIN_SET_NAME_SPACE_ID:
+		{
+			eTree->p.nameSpace = data->u.nameSpaceID;
+			return EFL_writeTheBeast(genMsg, volume, eTree);
+		}
+		case VOL_EFL_ADMIN_GET_INACTIVE_EPOCH_INTERVAL:
+		{
+			retData->u.inactiveEpochInterval =
+					EFL_GET_INACTIVE_EPOCH_INTERVAL(volume);
+			return zOK;
+		}
+		case VOL_EFL_ADMIN_SET_INACTIVE_EPOCH_INTERVAL:
+		{
+			eTree->p.inactiveInterval = data->u.inactiveEpochInterval;
+			return EFL_writeTheBeast(genMsg, volume, eTree);
+		}
+		case VOL_EFL_ADMIN_PING_EPOCH:
+		{
+			return pingEpoch(genMsg, volume, &data->u.changeEpoch.epoch);
+		}
+		case VOL_EFL_ADMIN_CHECK_EPOCH:
+		{
+			EFLRemoveEpochData_s *fsm;
+			NINT i;
+
+			/* schedule removal of every used epoch that went silent */
+			for (i = 0; i < MAX_ZFS_EFL_EPOCHS; i++)
+			{
+				if (((GetUTCTime() - eTree->p.epochs[i].lastAlive) > eTree->p.inactiveInterval)
+					&& ((1 << i) & eTree->p.usedEpochs))
+				{
+					/* this epoch hasn't been pinged for a long time, delete it */
+					fsm = malloc(sizeof(EFLRemoveEpochData_s));
+					if (fsm == NULL)
+					{
+						errPrintf(WHERE, Module, 0,
+							MSG("Error allocating memory for remove epoch operation\n", 612));
+						SetErrno(genMsg, zERR_NO_MEMORY);
+						return zFAILURE;
+					}
+
+					fsm->volume = volume;
+					fsm->epoch = eTree->p.epochs[i].epoch;
+
+					FSMLITE_INIT(&fsm->fsm, "FSM for removing EFL epochs",
+							++fsmInstance);
+					WORK_Schedule((FsmLite_s *)fsm, EFL_RemoveEpochThread, NULL);
+				}
+			}
+			return zOK;
+		}
+		case VOL_EFL_GET_STATUS:
+		{
+			retData->u.active = eTree->p.activeEpochs ? TRUE : FALSE;
+			return zOK;
+		}
+	}
+	/* unknown opcode, or VOL_EFL_ADMIN_RESET_EFL which is not handled here */
+	return zFAILURE;
+}
+
+/****************************************************************************
+ *
+ * This function will remove all blocks from the EFL tree
+ *
+ * The log blocks and the log header block are removed first, then the
+ * b-tree is walked from the root with an explicit stack of at most
+ * MAX_EFL_TREE_DEPTH parent buffers, and finally the root block is deleted
+ * and the beast's persistent area is reset to its defaults.
+ *
+ * return: zOK (also when there is no tree or no root), else zFAILURE
+ *
+ ****************************************************************************/
+STATUS ZFSVOL_VOL_resetEFL(
+	GeneralMsg_s *genMsg,
+	Volume_s *volume)
+{
+#define MAX_EFL_TREE_DEPTH 20
+
+	typedef struct ParentStack_s
+	{
+		Buffer_s *buffer;
+		NINT entry;
+	} ParentStack_s;
+	EFLTreeNode_s *node;
+	ZfsEFLTreeBeast_s *eTree = ((ZfsVolume_s *)volume)->eflTree;
+	Buffer_s *buffer;
+	STATUS status;
+	ZfsXaction_s *xaction;
+	ZfsXasRecovery_s *logBuffer;
+	BlockInfo_s *poolBlks;
+	EFLLog_s *logRecord;
+	SNINT stkPtr;
+	BOOL readALeaf = FALSE;
+	Blknum_t nodeBlockNum;
+	ELogNode_s *elogNode;
+
+	typedef struct Stack_s {
+		IoMsg_s iomsg;
+		ParentStack_s parentStack[MAX_EFL_TREE_DEPTH];
+	} Stack_s;
+	STACK_ALLOC();
+
+	/*
+	 * If there is no tree then don't bother.
+	 */
+	if (eTree == NULL)
+	{
+		STACK_FREE();
+		return zOK;
+	}
+
+	/*
+	 * Get an exclusive latch on the EFL Tree Beast
+	 */
+	X_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+
+	if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO)
+	{	/* no root to the b-tree exists */
+		status = zOK;
+		goto exit;
+	}
+
+	/* Clean up the EFL log */
+	if (ELOG_CleanupBlocks(genMsg, volume, FALSE) != zOK)
+	{
+		status = zFAILURE;
+		goto exit;
+	}
+
+	READBLK_IO_MSG(aStack->iomsg, eTree, eTree->p.logHeaderBlock, CACHE_UPDATE);
+	buffer = ELOG_ReadPoolBlk(genMsg, &aStack->iomsg);
+	if (buffer == NULL)
+	{
+		zASSERT("No EFL log header block" == NULL);
+		SetErrno(genMsg, zERR_EFL_NO_LOG_HEADER);
+		/* leave through "exit" so the exclusive latch is dropped */
+		status = zFAILURE;
+		goto exit;
+	}
+	elogNode = (ELogNode_s *)buffer->pBuf.data;
+
+	/* log and perform the deletion of the log header block */
+	xaction = BeginXLocal(&((ZfsVolume_s *)volume)->vol, BXL_DEFAULT);
+
+	ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + sizeof(EFLRemoveLog_s));
+
+	ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_REMOVE_LOG, xaction,
+			logBuffer, 1, poolBlks, logRecord);
+	ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], buffer->volBlk,
+			elogNode->lsn, buffer, xaction, 0);
+
+	logRecord->u.removeLog.internalID = eTree->ZFSEFLTREEroot.ROOTinternalID;
+
+	elogNode->lsn = logBuffer->ZXR_Lsn;
+	ZLOG_DELETE_BLOCK(xaction, poolBlks[0]);
+
+	ZLOG_BIND(xaction, buffer);
+
+	ZLOG_ReleaseRecord(xaction);
+
+	eTree->p.logHeaderBlock = INVALID_BLK_ZERO;
+	COMN_MARK_BEAST_XLOCAL(&eTree->ZFSEFLTREEroot, &xaction->xaction);
+	/* pass the beast root, as the other COMN_ForceBeastWrite() calls do */
+	COMN_ForceBeastWrite(genMsg, &eTree->ZFSEFLTREEroot, &xaction->xaction);
+	EndXlocal(xaction);
+	CACHE_RELEASE(buffer);
+
+	/*
+	 * Clean up the EFL tree
+	 *
+	 * Invariant at the top of every iteration: parentStack[0..stkPtr]
+	 * hold branch buffers that are still held and not yet modified.
+	 * Every error exit inside the loop leaves through exitReleaseStack
+	 * so those buffers are given back.
+	 */
+	stkPtr = -1;
+	nodeBlockNum = eTree->zfsBtree.p.btRoot;
+	for (;;)
+	{
+		/* give up if the volume is leaving the active state */
+		if (((volume->VOLstate != zVOLSTATE_ACTIVE) ||
+			 (volume->v_statusFlag & VOL_SF_LEAVING_ACTIVE_STATE_CLEANUP)) &&
+			!(genMsg->flags & ALLOW_INACTIVE_VOLUME))
+		{
+			SetErrno(genMsg, zERR_VOLUME_STATE_CHANGE_REQUESTED);
+			status = zFAILURE;
+			goto exitReleaseStack;
+		}
+
+		if (nodeBlockNum != INVALID_BLK_ZERO)
+		{
+			READBLK_IO_MSG(aStack->iomsg, eTree, nodeBlockNum, CACHE_UPDATE);
+			buffer = EFL_ReadPoolBlk(genMsg, &aStack->iomsg);
+			if (buffer == NULL)
+			{
+				status = zFAILURE;
+				goto exitReleaseStack;
+			}
+
+			node = (EFLTreeNode_s *)buffer->pBuf.data;
+			if (EFL_IS_LEAF(node))
+			{	/*
+				 * If it is a leaf then set up so on the next loop it will be
+				 * deleted.
+				 */
+				zASSERT(node->magic == EFL_LEAF_MAGIC);
+				cacheReleaseToss(buffer);
+				readALeaf = TRUE;
+				/* if the root block is a leaf then kick out */
+				if (stkPtr >= 0)
+				{
+					nodeBlockNum = INVALID_BLK_ZERO;
+				}
+				else
+				{
+					break;
+				}
+			}
+			else
+			{	/*
+				 * it's a branch -- push on the stack and move on to its first
+				 * child
+				 */
+				readALeaf = FALSE;
+				zASSERT(node->magic == EFL_BRANCH_MAGIC);
+
+				stkPtr++;
+				zASSERT(stkPtr < MAX_EFL_TREE_DEPTH);
+				if (stkPtr >= MAX_EFL_TREE_DEPTH)
+				{
+					errPrintf(WHERE, Module, -1,
+						MSG("Directory tree too deep. Unable to remove it.",575));
+					cacheReleaseToss(buffer);
+					/* this node was never pushed; point back at the last held one */
+					stkPtr--;
+					status = zFAILURE;
+					goto exitReleaseStack;
+				}
+				aStack->parentStack[stkPtr].buffer = buffer;
+				aStack->parentStack[stkPtr].entry = 0;
+				if (node->numRecs > 0)
+				{
+					nodeBlockNum = node->EBRANCH(0).child;
+				}
+				else
+				{
+					nodeBlockNum = INVALID_BLK_ZERO;
+				}
+			}
+		}
+		else
+		{	/* working from the stack */
+			buffer = aStack->parentStack[stkPtr].buffer;
+			node = (EFLTreeNode_s *)buffer->pBuf.data;
+
+			/*
+			 * If we have read all of the child records we can now delete
+			 * them.
+			 *
+			 * If the child just read was a leaf, all children of this
+			 * branch are deleted right away without being read.
+			 */
+			aStack->parentStack[stkPtr].entry++;
+			if ((aStack->parentStack[stkPtr].entry >= node->numRecs) || readALeaf)
+			{
+				xaction = BeginXLocal(&((ZfsVolume_s *)volume)->vol, BXL_DEFAULT);
+
+				/* Delete the children */
+				while (node->numRecs > 0)
+				{
+					ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(2));
+
+					ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_REMOVE, xaction,
+							logBuffer, 2, poolBlks, logRecord);
+					ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], buffer->volBlk,
+							node->lsn, buffer, xaction, 0);
+					node->lsn = logBuffer->ZXR_Lsn;
+					node->numRecs--;
+
+					/* the last remaining child is the one deleted */
+					ZLOG_ASSIGN_BLOCK_INFO(poolBlks[1],
+							node->EBRANCH(node->numRecs).child, 0, NULL,
+							xaction, 1);
+					ZLOG_DELETE_BLOCK(xaction, poolBlks[1]);
+
+					ZLOG_BIND(xaction, buffer);
+
+					ZLOG_ReleaseRecord(xaction);
+				}
+				EndXlocal(xaction);
+				readALeaf = FALSE;
+				CACHE_DIRTY_RELEASE(buffer);
+
+				/* pop the stack */
+				if (stkPtr > 0)
+				{
+					stkPtr--;
+				}
+				else
+				{
+					break;
+				}
+			}
+			else
+			{
+				nodeBlockNum = node->EBRANCH(aStack->parentStack[stkPtr].entry).child;
+			}
+		}
+	}
+
+	/* Remove the root node */
+	xaction = BeginXLocal(volume, BXL_DEFAULT);
+
+	ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(2));
+
+	ZLOG_INIT_LOG_RECORD(XFUNC_EFLTREE_REMOVE, xaction, logBuffer, 2,
+			poolBlks, logRecord);
+
+	ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], 0, 0, NULL, xaction, 0);
+
+	ZLOG_ASSIGN_BLOCK_INFO(poolBlks[1], eTree->zfsBtree.p.btRoot, 0,
+			NULL, xaction, 0);
+
+	ZLOG_DELETE_BLOCK(xaction, poolBlks[1]);
+
+	ZLOG_ReleaseRecord(xaction);
+
+	/*
+	 * Fix up the efl tree beast to show no tree
+	 */
+	eTree->zfsBtree.p.btRoot = INVALID_BLK_ZERO;
+
+	/* resets efl tree beast's persistent areas */
+	bzero(&eTree->p, sizeof(PersistentZfsEFLTree_s));
+	eTree->p.logHeaderBlock = INVALID_BLK_ZERO;
+	eTree->p.nameSpace = EFL_DEFAULT_NAMESPACE;
+	eTree->p.inactiveInterval = EFL_defaultEpochInterval;
+
+	COMN_MARK_BEAST_XLOCAL(&eTree->ZFSEFLTREEroot, &xaction->xaction);
+	COMN_ForceBeastWrite(genMsg, &eTree->ZFSEFLTREEroot,
+			&xaction->xaction);
+	EndXlocal(xaction);
+
+	status = zOK;
+	goto exit;
+
+exitReleaseStack:
+	/*
+	 * Error inside the tree walk: the parents still on the stack were read
+	 * with CACHE_UPDATE but not changed, so a plain release is enough.
+	 */
+	while (stkPtr >= 0)
+	{
+		CACHE_RELEASE(aStack->parentStack[stkPtr].buffer);
+		stkPtr--;
+	}
+
+exit:
+	UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch);
+	STACK_FREE();
+	return status;
+}
+
+
+/*---------------------------------------------------------------------------*/
+/****************************************************************************
+ *
+ * Recovery routines
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Recovery routine for initing a tree
+ *
+ * Replays an XFUNC_EFLTREE_INIT record: pool block 0 (when valid) is
+ * re-initialised as an empty root leaf holding the zero node, pool block 1
+ * (when valid) as the EFL log header.
+ *
+ * return: zOK, or zFAILURE when a block cannot be read
+ *
+ ****************************************************************************/
+STATUS EFL_recoveryInit(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool,
+	ZfsXasRecovery_s *logBuffer,
+	NINT action)
+{
+	Buffer_s *buffer;
+	EFLTreeNode_s *node;
+	IoMsg_s iomsg;
+	EFLLog_s *logRecord;
+	BlockInfo_s *poolBlks;
+
+	ASSERT_MPKNSS_LOCK();
+	zASSERT( (logBuffer->ZXR_FunctionIndex == XFUNC_EFLTREE_INIT) );
+
+	poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer);
+	logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer);
+
+	if (ZLOG_VALID_BLOCK(poolBlks[0]))
+	{
+		READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL)
+		{
+			//zASSERT(0);
+			return zFAILURE;
+		}
+
+		/* init the node */
+		node = (EFLTreeNode_s *)buffer->pBuf.data;
+		EFL_initNode(buffer, EFL_ROOT|EFL_LEAF, &logRecord->u.init.internalID);
+		EFL_insertZeroNode(node);
+		ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action);
+		EFL_VALIDATE_NODE(node);
+		CACHE_DIRTY_RELEASE(buffer);
+	}
+
+	/* EFL log header record */
+	if (ZLOG_VALID_BLOCK(poolBlks[1]))
+	{
+		READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_WRITE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL)
+		{
+			//zASSERT(0);
+			return zFAILURE;
+		}
+
+		/* init the log header */
+		node = (EFLTreeNode_s *)buffer->pBuf.data;
+		ELOG_InitNode(buffer, &logRecord->u.init.internalID,
+				ELOG_HEADER_RECORD);
+		ZLOG_SET_LSN(logBuffer, 
node->lsn, poolBlks[1], action);
+		CACHE_DIRTY_RELEASE(buffer);
+	}
+	return zOK;
+}
+
+/****************************************************************************
+ * Recovery routine for uniniting a tree (undo for init)
+ *
+ * For each valid pool block of an XFUNC_EFLTREE_INIT record (0 and 1) the
+ * block is read and its cache buffer tossed.  A block that cannot be read
+ * is skipped silently.
+ *
+ * return: always zOK
+ *
+ ****************************************************************************/
+STATUS EFL_recoveryUninit(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool,
+	ZfsXasRecovery_s *logBuffer,
+	NINT action)
+{
+	Buffer_s *buffer;
+	IoMsg_s iomsg;
+	BlockInfo_s *poolBlks;
+
+	ASSERT_MPKNSS_LOCK();
+	zASSERT( (logBuffer->ZXR_FunctionIndex == XFUNC_EFLTREE_INIT) );
+
+	poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer);
+
+	if (ZLOG_VALID_BLOCK(poolBlks[0]))
+	{
+		/* make sure the cache block for the deleted node is gone */
+		READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) != NULL)
+		{
+			cacheReleaseToss(buffer);
+		}
+	}
+
+	if (ZLOG_VALID_BLOCK(poolBlks[1]))
+	{
+		/* make sure the cache block for the deleted node is gone */
+		READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_WRITE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) != NULL)
+		{
+			cacheReleaseToss(buffer);
+		}
+	}
+
+	return zOK;
+}
+
+/****************************************************************************
+ * Recovery routine for shrinking a tree
+ *
+ * Pool block 0 is the deleted old root: its cache buffer is tossed.
+ * Pool block 1 is the new root: unless the record was already applied to
+ * it (ZLOG_ALREADY_DONE), its EFL_ROOT state bit is set and its LSN updated.
+ *
+ * return: zOK, or zFAILURE when the new root cannot be read
+ ****************************************************************************/
+STATUS EFL_recoveryShrink(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool,
+	ZfsXasRecovery_s *logBuffer,
+	NINT action)
+{
+	EFLLog_s *logRecord;
+	Buffer_s *buffer;
+	EFLTreeNode_s *node;
+	IoMsg_s iomsg;
+	BlockInfo_s *poolBlks;
+
+	ASSERT_MPKNSS_LOCK();
+	poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer);
+	/* NOTE(review): logRecord is assigned but not read in this routine */
+	logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer);
+
+	if (ZLOG_VALID_BLOCK(poolBlks[0]))
+	{
+		/* make sure the cache block for the deleted node is gone */
+		READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) != NULL)
+		{
+			cacheReleaseToss(buffer);
+		}
+	}
+
+	/* update the new root */
+	if (ZLOG_VALID_BLOCK(poolBlks[1]))
+	{
+		READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL)
+		{
+			//zASSERT(0);
+			return zFAILURE;
+		}
+
+		node = (EFLTreeNode_s *)buffer->pBuf.data;
+		EFL_VALIDATE_NODE(node);
+
+		if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action))
+		{
+			/* nothing changed: plain release */
+			CACHE_RELEASE(buffer);
+		}
+		else
+		{	/* fix up the root */
+			node->state |= EFL_ROOT;		/* set the root state */
+			ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action);
+			EFL_VALIDATE_NODE(node);
+			CACHE_DIRTY_RELEASE(buffer);
+		}
+	}
+	return zOK;
+}
+
+/****************************************************************************
+ * Recovery routine for growing a tree
+ *
+ * Pool block 0 is the new root: it is re-initialised as a root branch with
+ * a single record (zid 0) pointing at the block recorded in the log.
+ * Pool block 1 is the former root, now a child of the new root.
+ *
+ * return: zOK, or zFAILURE when a block cannot be read
+ ****************************************************************************/
+STATUS EFL_recoveryGrow(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool,
+	ZfsXasRecovery_s *logBuffer,
+	NINT action)
+{
+	EFLLog_s *logRecord;
+	Buffer_s *buffer;
+	EFLTreeNode_s *node;
+	IoMsg_s iomsg;
+	BlockInfo_s *poolBlks;
+
+	ASSERT_MPKNSS_LOCK();
+	poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer);
+	logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer);
+
+	/* update the new root */
+	if (ZLOG_VALID_BLOCK(poolBlks[0]))
+	{
+		READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE)
+		SET_DEBUG_ID(iomsg, 0);
+		if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL)
+		{
+			//zASSERT(0);
+			return zFAILURE;
+		}
+
+		node = (EFLTreeNode_s *)buffer->pBuf.data;
+
+		/* fix up the root */
+		EFL_initNode(buffer, EFL_ROOT, &logRecord->u.grow.internalID);
+		node->numRecs = 1;
+		node->EBRANCH(0).zid = 0;
+		node->EBRANCH(0).child = logRecord->u.grow.blockForParent;
+		ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action);
+		EFL_VALIDATE_NODE(node);
+		CACHE_DIRTY_RELEASE(buffer);
+	}
+
+	/* update the child */
+	if 
(ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the child */ + node->state &= ~(EFL_ROOT); /* reset the root state */ + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for balancing leafs + ****************************************************************************/ +STATUS EFL_recoveryBalance( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + NINT lenMoved; + NINT numToMove; + NINT block; + + ASSERT_MPKNSS_LOCK(); + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the parent */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_BRANCH_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the parent */ + if (action == X_REDO) + { + node->EBRANCH(logRecord->u.balance.indexForParent).zid = + logRecord->u.balance.zidForParent; + } + else + { + node->EBRANCH(logRecord->u.balance.indexForParent).zid = + logRecord->u.balance.oldZid; + } + 
ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* update the source */ + block = (action == X_REDO) ? 1 : 2; + if (ZLOG_VALID_BLOCK(poolBlks[block])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[block].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the source node */ + if (EFL_IS_LEAF(node)) + { /* leaf */ + numToMove = logRecord->u.balance.numToMove; + if ((numToMove > 0) && + (((EFLLeafEntry_s *)&logRecord->u.balance.data[0])->zid + == node->ELEAF(0).zid)) + { /* if we are removing from the start of the leaf record ... */ + memmove(&node->ELEAF(0), /* Destination */ + &node->ELEAF(numToMove), + (node->numRecs - numToMove) * sizeof(EFLLeafEntry_s)); + } + } + else + { /* branch */ + numToMove = logRecord->u.balance.numToMove; + if ((numToMove > 0) && + (((EFLBranchEntry_s *)&logRecord->u.balance.data[0])->zid + == node->EBRANCH(0).zid)) + { /* if we are removing from the start of the branch record ... */ + memmove(&node->EBRANCH(0), /* Destination */ + &node->EBRANCH(numToMove), + (node->numRecs - numToMove) * sizeof(EFLBranchEntry_s)); + } + } + node->numRecs -= numToMove; + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[block], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* update the destination */ + block = (action == X_REDO) ? 
2 : 1; + if (ZLOG_VALID_BLOCK(poolBlks[block])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[block].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the destination node */ + if (EFL_IS_LEAF(node)) + { /* leaf */ + numToMove = logRecord->u.balance.numToMove; + lenMoved = logRecord->u.balance.totalLength; + if (numToMove > 0) + { + if (((EFLLeafEntry_s *)&logRecord->u.balance.data[0])->zid + < node->ELEAF(0).zid) + { /* if we are inserting at the start of the branch record ... */ + + /* make room for the insert */ + memmove(&node->ELEAF(numToMove), /* Destination */ + &node->ELEAF(0), /* Source */ + node->numRecs * sizeof(EFLLeafEntry_s) ); + + /* insert the info from the log record */ + memcpy(&node->ELEAF(0), + &logRecord->u.balance.data[0], + lenMoved); + } + else + { + /* insert the info from the log record */ + memcpy(&node->ELEAF(node->numRecs), + &logRecord->u.balance.data[0], + lenMoved); + } + } + } + else + { /* branch */ + numToMove = logRecord->u.balance.numToMove; + lenMoved = logRecord->u.balance.totalLength; + if (numToMove > 0) + { + if (((EFLBranchEntry_s *)&logRecord->u.balance.data[0])->zid + < node->EBRANCH(0).zid) + { /* if we are inserting at the start of the branch record ... 
*/ + + /* make room for the insert */ + memmove(&node->EBRANCH(numToMove), /* Destination */ + &node->EBRANCH(0), /* Source */ + node->numRecs * sizeof(EFLBranchEntry_s) ); + + /* insert the info from the log record */ + memcpy(&node->EBRANCH(0), + &logRecord->u.balance.data[0], + lenMoved); + } + else + { + /* insert the info from the log record */ + memcpy(&node->EBRANCH(node->numRecs), + &logRecord->u.balance.data[0], + lenMoved); + } + } + } + node->numRecs += numToMove; + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[block], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for joining leafs + ****************************************************************************/ +STATUS EFL_recoveryJoin( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + NINT i; + + ASSERT_MPKNSS_LOCK(); + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the parent */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_BRANCH_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the parent */ + --(node->numRecs); + i = logRecord->u.split.indexForParent; + memmove( &node->EBRANCH(i), /* Destination */ + &node->EBRANCH(i + 1), /* Source */ + (node->numRecs - i) * sizeof(EFLBranchEntry_s)); + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + 
CACHE_DIRTY_RELEASE(buffer); + } + } + + /* update the child */ + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the child */ + if (EFL_IS_LEAF(node)) + { /* leaf */ + memcpy(&node->ELEAF(node->numRecs), + &logRecord->u.split.data[0], + logRecord->u.split.totalLength); + node->n.leaf.nextLeaf = logRecord->u.split.leafLink; + } + else + { /* branch */ + memcpy(&node->EBRANCH(node->numRecs), + &logRecord->u.split.data[0], + logRecord->u.split.totalLength); + } + node->numRecs += logRecord->u.split.numToMove; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + if (ZLOG_VALID_BLOCK(poolBlks[2])) + { + /* make sure the cache block for the deleted node is gone */ + READBLK_IO_MSG(iomsg, pool, poolBlks[2].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + node = (EFLTreeNode_s *)buffer->pBuf.data; + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { /* Because we have deleted this block just toss it */ + cacheReleaseToss(buffer); + } + else + { /* fix up the sibling */ + if (EFL_IS_LEAF(node)) + { /* leaf */ + node->magic |= 0x20; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[2], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + else + { + cacheReleaseToss(buffer); + } + } + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for splitting leafs + ****************************************************************************/ +STATUS EFL_recoverySplit( + 
GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + NINT i; + + ASSERT_MPKNSS_LOCK(); + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the parent */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_BRANCH_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the parent */ + i = logRecord->u.split.indexForParent; + memmove(&node->EBRANCH(i + 1), /* Destination */ + &node->EBRANCH(i), /* Source */ + (node->numRecs - i) * sizeof(EFLBranchEntry_s)); + ++(node->numRecs); + node->EBRANCH(i).zid = logRecord->u.split.zidForParent; + node->EBRANCH(i).child = logRecord->u.split.blockForParent; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* remove from the old node (on the left)*/ + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { + if (EFL_IS_LEAF(node)) + { + node->n.leaf.nextLeaf = logRecord->u.split.blockForParent; + } + node->numRecs -= logRecord->u.split.numToMove; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + EFL_VALIDATE_NODE(node); + 
CACHE_DIRTY_RELEASE(buffer); + } + } + + /* add to the new node */ + if (ZLOG_VALID_BLOCK(poolBlks[2])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[2].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + + /* fix up the child */ + EFL_initNode(buffer, logRecord->u.split.nodeType, + &logRecord->u.split.internalID); + if (EFL_IS_LEAF(node)) + { /* leaf */ + memcpy(&node->ELEAF(0), + &logRecord->u.split.data[0], + logRecord->u.split.totalLength); + node->n.leaf.nextLeaf = logRecord->u.split.leafLink; + } + else + { /* branch */ + memcpy(&node->EBRANCH(0), + &logRecord->u.split.data[0], + logRecord->u.split.totalLength); + } + node->numRecs = logRecord->u.split.numToMove; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[2], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for inserting a eNode into a beast + ****************************************************************************/ +STATUS EFL_recoveryInsertEntry( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + ASSERT_MPKNSS_LOCK(); + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (!ZLOG_VALID_BLOCK(poolBlks[0])) + { + return zOK; + } + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_LEAF_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + return zOK; + } + + logRecord = 
ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (EFL_insertEntry(genMsg, node, &logRecord->u.insert.newEntry) != zOK) + { + CACHE_RELEASE(buffer); + return zFAILURE; + } + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + return zOK; +} + + +/**************************************************************************** + * Recovery routine for inserting an eNode (logical undo of insert) + ****************************************************************************/ +STATUS EFL_recoveryDeleteEntryLogical( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + EFLTreeParms_s parms; + EFLTreeNode_s *node; + ZfsVolume_s *zfsVol; + ZfsXaction_s *xaction; + GUID_t internalID; + ZfsEFLTreeBeast_s *eTree; + NINT index; + STATUS status = zFAILURE; + + ASSERT_MPKNSS_LOCK(); + + if (action == X_UNDO) + { + return zX_LOGICAL; + } + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + internalID = logRecord->u.insert.internalID; + + zfsVol = ZLSS_VolumeIDLookupRecovery(genMsg, &internalID, pool); + if (zfsVol == NULL) + { + return zFAILURE; + } + + xaction = BeginXLocal(&zfsVol->vol, BXL_LOGICAL_UNDO); + + eTree = zfsVol->eflTree; + + parms.xaction = xaction; + parms.volume = (Volume_s *)zfsVol; + parms.zid = logRecord->u.insert.newEntry.zid; + parms.eTree = eTree; + + X_LATCH(&eTree->ZFSEFLTREEbeastLatch); + + if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO) + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + goto exit; + } + + if (EFL_findLeaf(genMsg, &parms, EFL_CHECK_FOR_JOIN) != zOK) + { + goto exit; + } + + node = (EFLTreeNode_s *)parms.child->pBuf.data; + if (EFL_findLeafEntry(node, parms.zid, &index)) + { + status = EFL_doDeleteEntry(genMsg, &parms, + &logRecord->u.insert.newEntry); + } + +exit: + SET_LUNDO_LSN(xaction, logBuffer, 
status); + EndXlocal(xaction); + COMN_Release(&zfsVol); + + return status; +} + +/**************************************************************************** + * Recovery routine for deleting an entry (redo) + ****************************************************************************/ +STATUS EFL_recoveryDeleteEntry( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + NINT i; + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + ASSERT_MPKNSS_LOCK(); + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (!ZLOG_VALID_BLOCK(poolBlks[0])) + { + return zOK; + } + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_LEAF_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + return zOK; + } + + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (EFL_findLeafEntry(node, logRecord->u.delete.entry.zid, &i)) + { + EFL_deleteEntry(node, i); + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + return zOK; + } + else + { /* zid not found in the buffer */ + CACHE_RELEASE(buffer); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + return zFAILURE; + } +} + + +/**************************************************************************** + * Recovery routine for deleting an eNode (logical undo for delete) + ****************************************************************************/ +STATUS EFL_recoveryInsertEntryLogical( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + STATUS status = zFAILURE; + EFLLog_s *logRecord; + ZfsVolume_s *zfsVol; + EFLTreeParms_s parms; + ZfsEFLTreeBeast_s 
*eTree; + ZfsXaction_s *xaction; + GUID_t internalID; + + ASSERT_MPKNSS_LOCK(); + + if (action == X_UNDO) + { + return zX_LOGICAL; + } + + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + internalID = logRecord->u.delete.internalID; + + zfsVol = ZLSS_VolumeIDLookupRecovery(genMsg, &internalID, pool); + if (zfsVol == NULL) + { + return zFAILURE; + } + + eTree = zfsVol->eflTree; + + X_LATCH(&eTree->ZFSEFLTREEbeastLatch); + + if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO) + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + goto exitRelease; + } + + xaction = BeginXLocal(&zfsVol->vol, BXL_LOGICAL_UNDO); + + parms.xaction = (ZfsXaction_s *)xaction; + parms.zid = logRecord->u.delete.entry.zid; + parms.volume = (Volume_s *)zfsVol; + parms.eTree = eTree; + + if (EFL_findLeaf(genMsg, &parms, EFL_CHECK_FOR_SPLIT) != zOK) + { + goto exitXaction; + } + +#if NSS_DEBUG IS_ENABLED + { + EFLTreeNode_s *node; + NINT index; + + node = (EFLTreeNode_s *)parms.child->pBuf.data; + if (EFL_findLeafEntry(node, parms.zid, &index)) + { + zASSERT("Node is not deleted" == NULL); + goto exitXaction; + } + } +#endif + + status = EFL_doInsertEntry(genMsg, &parms, &logRecord->u.delete.entry); + + SET_LUNDO_LSN(xaction, logBuffer, status); +exitXaction: + EndXlocal(xaction); +exitRelease: + COMN_Release(&zfsVol); + + return status; +} + + +/**************************************************************************** + * Recovery routine for modifying a leaf entry + ****************************************************************************/ +STATUS EFL_recoveryModifyEntry( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + NINT i; + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + EFLLeafEntry_s *oldEntry; + EFLLeafEntry_s *newEntry; + + ASSERT_MPKNSS_LOCK(); + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = 
ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (!ZLOG_VALID_BLOCK(poolBlks[0])) + { + return zOK; + } + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + newEntry = &logRecord->u.modify.newEntry; + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_LEAF_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + return zOK; + } + + if (!EFL_findLeafEntry(node, newEntry->zid, &i)) + { /* zid not found in the buffer */ + zASSERT("Leaf entry is not found" == NULL); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + CACHE_RELEASE(buffer); + return zFAILURE; + } + + oldEntry = &node->ELEAF(i); + *oldEntry = *newEntry; + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + + return zOK; +} + +/**************************************************************************** + * Recovery routine for updating an eNode (undo) + ****************************************************************************/ +STATUS EFL_recoveryModifyEntryLogical( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + EFLLog_s *logRecord; + NINT i; + EFLTreeNode_s *node; + EFLTreeParms_s parms; + ZfsEFLTreeBeast_s *eTree; + EFLLeafEntry_s *oldEntry; + EFLLeafEntry_s *leafEntry; + EFLLeafEntry_s prevEntry; + ZfsXaction_s *xaction; + GUID_t internalID; + ZfsVolume_s *zfsVol; + STATUS status = zFAILURE; + ASSERT_MPKNSS_LOCK(); + + if (action == X_UNDO) + { + return zX_LOGICAL; + } + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + internalID = logRecord->u.modify.internalID; + oldEntry = &logRecord->u.modify.oldEntry; + + zfsVol = ZLSS_VolumeIDLookupRecovery(genMsg, &internalID, pool); + if (zfsVol == NULL) + { + return zFAILURE; + } + + xaction = BeginXLocal(&zfsVol->vol, 
BXL_LOGICAL_UNDO); + + eTree = zfsVol->eflTree; + X_LATCH(&eTree->ZFSEFLTREEbeastLatch); + + if (eTree->zfsBtree.p.btRoot == INVALID_BLK_ZERO) + { + UNX_LATCH(&eTree->ZFSEFLTREEbeastLatch); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + goto exit; + } + + parms.xaction = (ZfsXaction_s *)xaction; + parms.volume = (Volume_s *)zfsVol; + parms.eTree = eTree; + parms.zid = oldEntry->zid; + + if (EFL_findLeaf(genMsg, &parms, EFL_CHECK_FOR_JOIN) != zOK) + { + if (EFL_TREE_X_LATCHED(&parms)) + { + EFL_UNX_LATCH_TREE(&parms); + } + goto exit; + } + + node = (EFLTreeNode_s *)parms.child->pBuf.data; + if (!EFL_findLeafEntry(node, parms.zid, &i)) + { /* zid not found in the buffer */ + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + goto exit; + } + + + leafEntry = &node->ELEAF(i); + prevEntry = *leafEntry; + *leafEntry = *oldEntry; + + status = EFL_doModifyEntry(genMsg, &parms, i, &prevEntry); + + zASSERT(!EFL_TREE_X_LATCHED(&parms)); + +exit: + SET_LUNDO_LSN(xaction, logBuffer, status); + EndXlocal(xaction); + COMN_Release(&zfsVol); + return status; +} + + +/**************************************************************************** + * Recovery routine for removing nodes during reset of the tree + ****************************************************************************/ +STATUS EFL_recoveryRemove( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + Buffer_s *buffer; + EFLTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (EFLTreeNode_s *)buffer->pBuf.data; + EFL_VALIDATE_NODE(node); + zASSERT(node->magic == EFL_BRANCH_MAGIC); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { + if (action == X_REDO) + { + 
node->numRecs--; + } + else + { + node->numRecs++; + } + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + EFL_VALIDATE_NODE(node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + /* make sure the cache block for the deleted node is gone */ + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_WRITE) + if ((buffer = EFL_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + cacheReleaseToss(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for removing the link to the EFL log + ****************************************************************************/ +STATUS EFL_recoveryRemoveLog( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + Buffer_s *buffer; + ELogNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + EFLLog_s *logRecord; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + if ((buffer = ELOG_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (ELogNode_s *)buffer->pBuf.data; + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { + if (action == X_REDO) + { + cacheReleaseToss(buffer); + } + else + { + ELOG_InitNode(buffer, &logRecord->u.removeLog.internalID, + ELOG_HEADER_RECORD); + } + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + + + +/*=========================================================================== + *=========================================================================== + *=========================================================================== + * + * ZFSEFLTREE BEAST + * + 
/*---------------------------------------------------------------------------
 * Common beast operations table for the ZFSEFLTREE beast.
 *
 * The initializer is positional; the comment on each line names the slot
 * it fills.  Only the construct slot is populated (EFL_construct); every
 * other slot, including destruct, is NULL.
 *---------------------------------------------------------------------------*/
CommonBeastOps_s ZFSEFLTREE_ComnBeastOps =
{
	EFL_construct,		/* construct */
	NULL,				/* destruct */

	/* The BST_getNameUniquifier line below is disabled in the original;
	 * it is kept as-is because the table is positional. */
//	cnt	NULL,			/* BST_getNameUniquifier */
	NULL,				/* BST_setupNameTypeSpecificInfo */
	NULL,				/* BST_lookupByNameInDirectory*/
	NULL,				/* BST_isDirectoryEmpty*/
	NULL,				/* BST_addNameToDirectory*/
	NULL,				/* BST_removeNameFromDirectory*/
	NULL,				/* BST_modifyNameSpaceMaskInDirectory*/
	NULL,				/* BST_setMatchAttributesInDirectory*/
	NULL,				/* BST_wildcardLookup*/

	NULL,				/* BST_truncateFile*/
	NULL,				/* BST_getStorageInfo*/
	NULL,				/* BST_getExtentList*/
	NULL,				/* BST_getPhysicalExtent*/
	NULL,				/* BST_isBlockInBeast*/

	NULL,				/* BST_asyncReadFileBlk*/
	NULL,				/* BST_getFileBlk*/
	NULL,				/* BST_dfsReadUnits*/
	NULL,				/* BST_dfsWriteUnits*/

	NULL,				/* BST_getZID*/
	NULL,				/* BST_beastNotify*/
	NULL,				/* BST_getInfo*/
	NULL,				/* BST_modifyInfo*/
	NULL,				/* BST_getInfoXML*/
	NULL				/* BST_modifyInfoXML*/
};


/****************************************************************************
 * Storage pack routine: size of the packed form of a ZFSEFLTREE beast.
 *
 * beast_LX is not examined; the result is always
 * sizeof(PersistentZfsEFLTree_s).
 *****************************************************************************/
STATIC NINT ZFSEFLTREE_PackedSize(
	void *beast_LX)
{
	ASSERT_MPKNSS_LOCK();
	return sizeof(PersistentZfsEFLTree_s);
}
+/**************************************************************************** + * EXTENT BASED storage pack routine + *****************************************************************************/ +STATIC BYTE *ZFSEFLTREE_Pack( + void *beast_LX, + BYTE *storeBuffer) +{ + ZfsEFLTreeBeast_s *beast = (ZfsEFLTreeBeast_s *)beast_LX; + + ASSERT_MPKNSS_LOCK(); + memcpy(storeBuffer, &beast->p, sizeof(PersistentZfsEFLTree_s)); + return (storeBuffer + sizeof(PersistentZfsEFLTree_s)); +} + +/**************************************************************************** + * EXTENT BASED storage pack routines + *****************************************************************************/ +STATIC BYTE *ZFSEFLTREE_Unpack( + GeneralMsg_s *genMsg, + void *beast_LX, + BYTE *storeBuffer) +{ + ZfsEFLTreeBeast_s *beast = (ZfsEFLTreeBeast_s *)beast_LX; + + ASSERT_MPKNSS_LOCK(); + memcpy(&beast->p, storeBuffer, sizeof(PersistentZfsEFLTree_s)); + return (storeBuffer + sizeof(PersistentZfsEFLTree_s)); +} + +/*--------------------------------------------------------------------------- + * file beast STORAGE ops definition + *---------------------------------------------------------------------------*/ +LSSSpecificPackUnpackOps_s ZFSEFLTREE_lssOps[] = +{ + {zLSS_ID_ZLSS,ZFSEFLTREE_PackedSize,ZFSEFLTREE_Pack,NULL,ZFSEFLTREE_Unpack}, + {zLSS_ID_INVALID} +}; diff --git a/src/nwnss/zlss/myBTree.c b/src/nwnss/zlss/myBTree.c new file mode 100644 index 0000000..8e57617 --- /dev/null +++ b/src/nwnss/zlss/myBTree.c @@ -0,0 +1,3416 @@ +/**************************************************************************** + | + | (C) Copyright 2000 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. 
+ | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. + | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |*************************************************************************** + | + | NetWare Advance File Services (NSS) module + | + |--------------------------------------------------------------------------- + | + | $Author: gpachner $ + | $Date: 2007-07-28 03:10:14 +0530 (Sat, 28 Jul 2007) $ + | + | $RCSfile$ + | $Revision: 2107 $ + | + |--------------------------------------------------------------------------- + | This module is used to: + | Generic fixed-key, fixed-value B-tree for use in various ZLSS + | data structures. + +-------------------------------------------------------------------------*/ +#include +#include +#include +#include +#include +#include +#include + +#include "myBTree.h" +#include "pssDebug.h" +#include "msgGen.h" +#include "zlog.h" +#include "zlssStartup.h" +#include "msgIO.h" +#include "zError.h" +#include "comnPublics.h" +#include "parse.h" +#include "zfsXTree.h" +#include "comnParams.h" + +STATIC void MYBT_logDeleteRecord ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms, + NINT index); + +STATIC void MYBT_deleteLeafEntry( + MYBTreeInstanceInfo_s *treeInfo, + MYBTreeNode_s *node, + NINT index); + +/* + * The information is stored in a B+ tree with all data stored in the + * leaf nodes. Splits and joins are done during descent if there is a chance + * a split/join will be needed. This may cause the occasional operation that + * is not needed. But it is worth it to be able to lock smaller portions of + * the tree. 
+ */ + +#define MYBT_keyCopy(_treeInfo, _k1, _k2) \ + memmove(_k1, _k2, MYBT_KEY_SIZE(_treeInfo)) + +#define MYBT_valueCopy(_treeInfo, _v1, _v2) \ + memmove(_v1, _v2, MYBT_VALUE_SIZE(_treeInfo)) + +Buffer_s *MYBT_ReadPoolBlk( + MYBTreeInstanceInfo_s *treeInfo, + GeneralMsg_s *genMsg, + IoMsg_s *iomsg) +{ + Buffer_s *buffer; + MYBTreeNode_s *node; + BOOL modified; + + buffer = ZFS_ReadPoolBlk(genMsg, iomsg); + if ((buffer != NULL) && (iomsg->mode != CACHE_WRITE)) + { + if (treeInfo->upgradeNode) + { + if (treeInfo->upgradeNode(genMsg, buffer, iomsg->mode, &modified) + != zOK) + { + CACHE_RELEASE(buffer); + return NULL; + } + if (modified && (iomsg->mode != CACHE_READ)) + { + buffer->state |= CACHE_DIRTY; + } + } + node = (MYBTreeNode_s *)buffer->pBuf.data; + if ((MYBT_IS_LEAF(node) && + (node->magic != MYBT_LEAF_MAGIC(treeInfo))) || + (node->magic != MYBT_BRANCH_MAGIC(treeInfo))) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, iomsg); + CACHE_RELEASE(buffer); + return NULL; + } + } + return buffer; +} + +#define VERIFY_CONDITION(_cond, _dontAssert) \ +while (! _cond) \ +{ \ + if (! 
(_dontAssert)) \ + { \ + zASSERT(_cond); \ + } \ + goto invalidNode; \ +} + +/**************************************************************************** + * + ****************************************************************************/ +BOOL MYBT_validateNode( + MYBTreeInstanceInfo_s *treeInfo, + MYBTreeNode_s *node, + BOOL dontAssert) +{ + NINT count; + + if (MYBT_IS_LEAF(node)) + { + BYTE *curr, *prev; + + VERIFY_CONDITION( + ((node->magic == MYBT_LEAF_MAGIC(treeInfo)) || + (node->magic == (MYBT_LEAF_MAGIC(treeInfo) | 0x20))), + dontAssert); + + prev = MYBT_LEAFENTRY(treeInfo, node, 0); + + VERIFY_CONDITION((MYBT_MAX_LEAF_ENTRIES(treeInfo) >= node->numRecs), dontAssert); + for(count = 1; count < node->numRecs; count++) + { + curr = MYBT_LEAFENTRY(treeInfo, node, count); + + /* check if ascending */ + VERIFY_CONDITION((treeInfo->keyComp(MYBT_LEAFENTRY_KEY(treeInfo, prev), + MYBT_LEAFENTRY_KEY(treeInfo, curr)) < 0), dontAssert); + + /* set previous record and try again */ + prev = curr; + } + } + else + { + MYBTBranchEntry_s *curr; + MYBTBranchEntry_s *prev; + + VERIFY_CONDITION((node->magic == MYBT_BRANCH_MAGIC(treeInfo)), dontAssert); + + prev = MYBT_BRANCHENTRY(treeInfo, node, 0); + + VERIFY_CONDITION((MYBT_MAX_BRANCH_ENTRIES(treeInfo) >= node->numRecs), + dontAssert); + for(count = 1; count < node->numRecs; count++) + { + curr = MYBT_BRANCHENTRY(treeInfo, node, count); + + /* check if ascending */ + VERIFY_CONDITION((treeInfo->keyComp(&prev->key, &curr->key) < 0), + dontAssert); + + /* set previous record and try again */ + prev = curr; + } + } + return TRUE; +invalidNode: ; + return FALSE; +} + +#if NSS_DEBUG IS_ENABLED + +/**************************************************************************** + * + ****************************************************************************/ +void MYBT_displayNode( + MYBTreeInstanceInfo_s *treeInfo, + char *location, + char *nameOfBuffer, + Buffer_s *buffer) +{ + NINT i; + MYBTreeNode_s *node = (MYBTreeNode_s 
*)buffer->pBuf.data; + Blknum_t block = buffer->volBlk; + + DBG_DebugPrintf(LBLUE, MSGNot("%s=%s<%lx(%ld.)>"), location, nameOfBuffer, + (unsigned long)block, (unsigned long)block ); + + if (MYBT_IS_ROOT(node)) + { + DBG_DebugPrintf(LBLUE, MSGNot("ROOT:")); + } + if (MYBT_IS_LEAF(node)) + { + DBG_DebugPrintf(LBLUE, MSGNot("LEAF")); + if (node->magic != MYBT_LEAF_MAGIC(treeInfo)) + { + DBG_DebugPrintf(LRED, MSGNot(":ERROR(Node does not have correct MAGIC (value is %lx))\n"),node->magic); + zASSERT(node->magic == MYBT_LEAF_MAGIC(treeInfo)); + return; + } + DBG_DebugPrintf(LRED, MSGNot("(numRecs=%d)\n"), node->numRecs); + for (i = 0; i < node->numRecs; ++i) + { + DBG_DebugPrintf(YELLOW, MSGNot("<")); + treeInfo->displayKey(YELLOW, MYBT_LEAF_IND_KEY(treeInfo, node, i)); + DBG_DebugPrintf(YELLOW, MSGNot(", ")); + treeInfo->displayValue(YELLOW, + MYBT_LEAF_IND_VALUE(treeInfo, node, i)); + DBG_DebugPrintf(YELLOW, MSGNot(">\n")); + } + } + else + { + DBG_DebugPrintf(LGREEN, MSGNot("BRANCH")); + if (node->magic != MYBT_BRANCH_MAGIC(treeInfo)) + { + DBG_DebugPrintf(LRED, ":ERROR(Node does not have correct MAGIC (value is %lx))\n",node->magic); + zASSERT(node->magic == MYBT_BRANCH_MAGIC(treeInfo)); + return; + } + DBG_DebugPrintf(LBLUE, MSGNot("(numRecs=%d)\n"), node->numRecs); + for (i = 0; i < node->numRecs; ++i) + { + DBG_DebugPrintf(LBLUE, MSGNot("<")); + treeInfo->displayKey(LBLUE, MYBT_BRANCHENTRY(treeInfo, node, i)->key); + DBG_DebugPrintf(LBLUE, MSGNot(", %-10d>"), + (LONG)MYBT_BRANCHENTRY(treeInfo, node, i)->child); + if ((i % 3) == 2) + { + DBG_DebugPrintf(LBLUE, MSGNot("\n")); + } + } + if ((i % 3) != 0) + { + DBG_DebugPrintf(LBLUE, MSGNot("\n")); + } + } +} + +/**************************************************************************** + * + ****************************************************************************/ +STATIC void MYBT_displaySubtree( + MYBTreeInstanceInfo_s *treeInfo, + ZfsMYBTreeBeast_s *treeBeast, + Buffer_s *buffer) +{ + GeneralMsg_s genMsg; 
+ MYBTreeNode_s *node = (MYBTreeNode_s *)buffer->pBuf.data; + Buffer_s *newBuf; + NINT i; + IoMsg_s iomsg; + + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + if (MYBT_IS_LEAF(node)) + { + MYBT_displayNode(treeInfo, MSGNot("Full Tree"), MSGNot("LEAF"), buffer); + } + else + { + MYBT_displayNode(treeInfo, MSGNot("Full Tree"), MSGNot("BRANCH"), buffer); + if ( node->magic == MYBT_BRANCH_MAGIC(treeInfo) ) + { + for (i = 0; i < node->numRecs; i++) + { + READBLK_IO_MSG(iomsg, treeBeast, + MYBT_BRANCHENTRY(treeInfo, node, i)->child, CACHE_READ); + SET_DEBUG_ID(iomsg, 0); + newBuf = MYBT_ReadPoolBlk(treeInfo, &genMsg, &iomsg); + zASSERT(newBuf != NULL); + MYBT_displaySubtree(treeInfo, treeBeast, newBuf); + CACHE_RELEASE(newBuf); + } + } + } +} + +/**************************************************************************** + * Given a volume, display the tree + ****************************************************************************/ +void MYBT_displayTree( + MYBTreeInstanceInfo_s *treeInfo, + ZfsMYBTreeBeast_s *ztree) +{ + GeneralMsg_s genMsg; + Buffer_s *buffer; + IoMsg_s iomsg; + + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + + S_LATCH(&ztree->ZFSMYBTREEbeastLatch); /* Latch the tree */ +// wPause(stdout, -1); + +// RootDisplayMyCache(ztree); + + DBG_DebugPrintf( LBLUE, MSGNot("***** MY B-TREE *****\n")); + DBG_DebugPrintf( LBLUE, MSGNot("mybtRoot=%d\n"), + ztree->ZFSMYBTREEzbtree.p.btRoot); + if (ztree->zfsBtree.p.btRoot != INVALID_BLK) + { + READBLK_IO_MSG(iomsg, ztree, ztree->zfsBtree.p.btRoot, CACHE_READ); + SET_DEBUG_ID(iomsg, 0); + buffer = MYBT_ReadPoolBlk(treeInfo, &genMsg, &iomsg); + zASSERT(buffer != NULL); + MYBT_displaySubtree(treeInfo, ztree, buffer); + CACHE_RELEASE(buffer); + } + else + { + DBG_DebugPrintf( LBLUE, MSGNot("Empty B-Tree\n")); + } +// wPause(stdout, 0); + UNS_LATCH(&ztree->ZFSMYBTREEbeastLatch); +} + +/**************************************************************************** + * + 
****************************************************************************/ +void MYBT_compare ( + MYBTreeNode_s *node1, + MYBTreeNode_s *node2, + NINT len) +{ +#if 0 + MYBT_VALIDATE_NODE(node1); + MYBT_VALIDATE_NODE(node2); +#endif + zASSERT(memcmp(node1, node2, len) == 0); + return; +} + +/**************************************************************************** + * + ****************************************************************************/ +void MYBT_noCompare ( + MYBTreeNode_s *node1, + MYBTreeNode_s *node2, + NINT len) +{ +} + +#endif //nss_debug + +/**************************************************************************** + * This routine returns the block number of the child that should have an + * entry for the key. + * + * In: buffer - the buffer to be searched + * key - key to find. + * Out: index into the ptr array (0 to numrecs) + * Return: Blknum of the child block holding the key + ****************************************************************************/ +STATIC Blknum_t MYBT_findChildBlock( + MYBTreeInstanceInfo_s *treeInfo, + MYBTreeNode_s *node, + MYBTKey_t key, + NINT *index) +{ + SNINT mid = 0; + SNINT low = 0; + SNINT high = node->numRecs-1; + SNINT cmpResult = 0; + + ENTER(TZTREE, MYBT_findChildBlock); + + while (low <= high) + { + mid = (low + high) / 2; + cmpResult = treeInfo->keyComp(key, MYBT_BRANCHENTRY(treeInfo, node, mid)->key); + if (cmpResult == 0) /* key == mid entry's key */ + { + break; + } + else if (cmpResult > 0) /* key > mid entry's key */ + { + low = mid + 1; + } + else + { + high = mid - 1; + } + } + + if (cmpResult < 0) /* key < mid entry's key */ + { + --mid; + } + *index = mid; + zASSERT(mid >= 0); + RTN_BLOCK(MYBT_BRANCHENTRY(treeInfo, node, mid)->child); +} + +/**************************************************************************** + * This routine find an entry in a leaf. If the entry is not found then + * the location where the insertion should occur is returned. 
+ * I: buffer - the buffer to be searched + * I: key - key to find + * O: index into the ptr array (0 to numrecs) + * index of the item before which the given key can be inserted. + * R: True if the key is found + ****************************************************************************/ +STATIC BOOL MYBT_findLeafEntry( + MYBTreeInstanceInfo_s *treeInfo, + MYBTreeNode_s *node, + const MYBTKey_t key, + NINT *index) +{ + SNINT mid = 0; + SNINT low = 0; + SNINT high = node->numRecs-1; + BOOL foundFlag = FALSE; + SNINT cmpResult = 0; + + ENTER(TZTREE, MYBT_findLeafEntry); + MYBT_VALIDATE_NODE(treeInfo, node); + + while (low <= high) + { + mid = (low + high) / 2; + cmpResult = treeInfo->keyComp(key, MYBT_LEAF_IND_KEY(treeInfo, node, mid)); + if (cmpResult == 0) /* key == mid entry's key */ + { + foundFlag = TRUE; + break; + } + else if (cmpResult > 0) /* key > mid entry's key */ + { + low = mid + 1; + } + else + { + high = mid - 1; + } + } + + if (cmpResult > 0) /* key > mid entry's key */ + { + ++mid; + } + + *index = mid; + RTN_BOOL(foundFlag); +} + +/**************************************************************************** + * + ****************************************************************************/ +STATIC void MYBT_initNode( + MYBTreeInstanceInfo_s *treeInfo, + Buffer_s *buffer, + NINT state, + GUID_t *internalID) +{ + MYBTreeNode_s *node = (MYBTreeNode_s *)buffer->pBuf.data; + + ENTER(TZTREE, MYBT_initNode); + + zASSERT(LB_GUIDValidate(internalID)); + if (state & MYBT_LEAF) + { + node->magic = MYBT_LEAF_MAGIC(treeInfo); + node->n.leaf.nextLeaf = INVALID_BLK; + } + else + { + node->magic = MYBT_BRANCH_MAGIC(treeInfo); + } + node->state = state; + node->utn_internalID = *internalID; + node->numRecs = 0; + node->lsn = 0; + + buffer->state |= CACHE_DIRTY; + + MYBT_VALIDATE_NODE(treeInfo, node); + RTN_VOID(); +} + +/**************************************************************************** + * + 
****************************************************************************/
+/*
+ * MYBT_insertZeroNode
+ *
+ * Makes 'node' hold exactly one record: leaf entry 0 with every byte of
+ * the entry (MYBT_LEAFENTRY_SIZE bytes) cleared.
+ */
+STATIC void MYBT_insertZeroNode (
+	MYBTreeInstanceInfo_s *treeInfo,
+	MYBTreeNode_s *node)
+{
+	ENTER(TZTREE, MYBT_insertZeroNode);
+
+	MYBT_VALIDATE_NODE(treeInfo, node);
+	/*
+	 * We always want a zero in the first location to establish a lower bound
+	 * so we do not need a lot of special case code.
+	 */
+	node->numRecs = 1;
+	bzero(MYBT_LEAFENTRY(treeInfo, node, 0), MYBT_LEAFENTRY_SIZE(treeInfo));
+	MYBT_VALIDATE_NODE(treeInfo, node);
+	RTN_VOID();
+}
+
+/****************************************************************************
+ * Free the node pointed to by sibling
+ *
+ * Input: parms->sibling is the cache buffer of the node that will be
+ *        deleted.  genMsg is not used by this routine.
+ *
+ * Output: The buffer has been handed back with CACHE_RELEASE and
+ *         parms->sibling is NULL, so later cleanup code cannot release or
+ *         dereference the stale buffer pointer.
+ *
+ * Return: always zOK.
+ *
+ * Previously only a local copy of the pointer was cleared, which left
+ * parms->sibling dangling in spite of the documented "Sibling is NULL".
+ ****************************************************************************/
+STATIC STATUS MYBT_freeNode (
+	GeneralMsg_s *genMsg,
+	MYBTreeParms_s *parms)
+{
+	Buffer_s *sibling = parms->sibling;
+
+	ENTER(TZTREE, MYBT_freeNode);
+	MYBT_VALIDATE_NODE(parms->treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data);
+
+	CACHE_RELEASE(sibling);
+	parms->sibling = NULL;	/* honour the documented output contract */
+
+	RTN_STATUS(zOK);
+}
+
+/****************************************************************************
+ * Join the child and sibling and release the sibling
+ *
+ * Input: Parent, child and sibling latched.
+ *
+ * Output: Parent and child latched. Sibling released.
+ ****************************************************************************/
+STATIC STATUS MYBT_join (
+	GeneralMsg_s *genMsg,	/* only passed through to MYBT_freeNode() */
+	MYBTreeParms_s *parms)	/* parent/child/sibling buffers, index, treeInfo, volume, btreeBeast */
+{	/*
+	 * Appends every entry of the sibling node to the end of the child
+	 * node, removes the parent's entry at parms->index + 1, writes a
+	 * MYBT_RECOVERY_OP_JOIN log record inside a local transaction and
+	 * finally releases the sibling buffer through MYBT_freeNode().
+	 *
+	 * Returns zOK, or zFAILURE when MYBT_freeNode() does not return zOK.
+	 */
+	MYBTreeNode_s *parent = (MYBTreeNode_s *)parms->parent->pBuf.data;
+	MYBTreeNode_s *child = (MYBTreeNode_s *)parms->child->pBuf.data;
+	MYBTreeNode_s *sibling = (MYBTreeNode_s *)parms->sibling->pBuf.data;
+	BYTE *startLocation;	/* first byte in child that receives sibling data */
+	NINT lenMoved;		/* bytes copied from sibling to child */
+	NINT numMoved;		/* entries copied from sibling to child */
+	NINT parentIndex;	/* parent entry that referred to the sibling */
+	ZfsXaction_s *localXaction;
+	ZfsXasRecovery_s *logBuffer;
+	BlockInfo_s *poolBlks;
+	MybtInternalLog_s *logRecord;
+	BYTE *keyForParent;	/* slot in the log record, after the moved data, for the removed parent key */
+	MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo;
+
+	ENTER(TZTREE, MYBT_join);
+
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data);
+
+	if (MYBT_IS_LEAF(child))
+	{ /* leaf */
+		/* setup logging information */
+		startLocation = MYBT_LEAFENTRY(treeInfo, child, child->numRecs);
+		lenMoved = sibling->numRecs * MYBT_LEAFENTRY_SIZE(treeInfo);
+		numMoved = sibling->numRecs;
+
+		memcpy(startLocation, /* Destination */
+			MYBT_LEAFENTRY(treeInfo, sibling, 0), /* Source */
+			lenMoved);
+		child->numRecs += sibling->numRecs;
+
+		/* update the leaf link */
+		child->n.leaf.nextLeaf = sibling->n.leaf.nextLeaf;
+		/* Change magic in Deleted Leaf Node to indicate it is deleted. This
+		 * is being done for scan mode of rebuild.
+		 */
+		sibling->magic |= 0x20; /* Lower case low byte letter */
+	}
+	else
+	{ /* branch */
+		/* setup logging information */
+		startLocation = (BYTE *)MYBT_BRANCHENTRY(treeInfo, child, child->numRecs);
+		lenMoved = sibling->numRecs * MYBT_BRANCHENTRY_SIZE(treeInfo);
+		numMoved = sibling->numRecs;
+
+		memcpy(startLocation, /* Destination */
+			MYBT_BRANCHENTRY(treeInfo, sibling, 0), /* Source */
+			lenMoved);
+		child->numRecs += sibling->numRecs;
+	}
+
+	localXaction = BeginXLocal(parms->volume, BXL_DEFAULT);
+
+	/* the record carries the moved entries plus one key; it must fit in one log record */
+	zASSERT((ZLOG_BLOCK_INFO_SIZE(3) + sizeof(MYBTSplit_s) - 1 + lenMoved + MYBT_KEY_SIZE(treeInfo)) <=
+		ZLOG_MAXIMUM_RECORD_SIZE);
+
+	ZLOG_ObtainRecord(localXaction, ZLOG_BLOCK_INFO_SIZE(3) + sizeof(MYBTSplit_s)
+		- 1 + lenMoved + MYBT_KEY_SIZE(treeInfo));
+
+	ZLOG_INIT_LOG_RECORD(
+		(treeInfo->internalRecoveryStartOpCode + MYBT_RECOVERY_OP_JOIN),
+		localXaction, logBuffer, 3, poolBlks, logRecord);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk,
+		parent->lsn, parms->parent, localXaction, 0, MYBT_compare);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk,
+		child->lsn, parms->child, localXaction, 1, MYBT_compare);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->sibling->volBlk,
+		sibling->lsn, parms->sibling, localXaction, 2, MYBT_noCompare);	/* block being deleted: registered with the no-op compare routine */
+	ZLOG_DELETE_BLOCK(localXaction, poolBlks[2]);
+
+	--(parent->numRecs);
+	parentIndex = parms->index + 1;
+	keyForParent = &logRecord->u.split.data[lenMoved];
+	MYBT_keyCopy(treeInfo, keyForParent, MYBT_BRANCHENTRY(treeInfo, parent, parentIndex)->key);	/* save the key before its entry is overwritten */
+	memmove( MYBT_BRANCHENTRY(treeInfo, parent, parentIndex), /* Destination */
+		MYBT_BRANCHENTRY(treeInfo, parent, parentIndex + 1), /* Source */
+		(parent->numRecs - parentIndex) * MYBT_BRANCHENTRY_SIZE(treeInfo));	/* numRecs was already decremented above */
+	logRecord->u.split.leafLink = child->n.leaf.nextLeaf;
+	logRecord->u.split.blockForParent = poolBlks[2].blkNum;
+	zASSERT( parms->btreeBeast != NULL );
+	logRecord->u.split.internalID =
+		parms->btreeBeast->ZFSMYBTREEroot.ROOTinternalID;
+	logRecord->u.split.indexForParent = parentIndex;
+	logRecord->u.split.dataLength = lenMoved;
+	logRecord->u.split.numToMove = numMoved;
+	logRecord->u.split.nodeType = sibling->state & MYBT_LEAF;
+	memcpy(&logRecord->u.split.data[0], startLocation, lenMoved);
+
+	child->lsn = logBuffer->ZXR_Lsn;
+	parent->lsn = logBuffer->ZXR_Lsn;
+#if LOG_TEST IS_ENABLED
+	if (LogTest)
+		sibling->lsn = logBuffer->ZXR_Lsn;
+#endif
+
+	ZLOG_BIND(localXaction, parms->child);
+	ZLOG_BIND(localXaction, parms->parent);
+	if (MYBT_IS_LEAF(child) )
+	{ /* This is the deleted LEAF block we must write so that rebuild
+	   * can scan whole disk and not find OLD beast tree leafs.
+	   */
+		parms->sibling->state |= CACHE_DIRTY;
+		ZLOG_BIND(localXaction, parms->sibling);
+#if 0
+		/* This code allows for testing of the recovery code of setting
+		 * the magic word on deletes.
+		 */
+#if NSS_ASSERT IS_ENABLED
+		{
+			extern LONG gZlogAssertBlock;
+			ZlogBeast_s *zlogBeast;
+
+			zlogBeast = localXaction->ZX_zlogBeast;
+			zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+			gZlogAssertBlock = (LONG)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber;
+			zASSERT(" Close to time to re-boot machine"==NULL);
+		}
+#endif
+#endif
+	}
+
+	ZLOG_ReleaseRecordAndLogEnd(localXaction);
+	EndXlocal(localXaction);
+
+	parms->parent->state |= CACHE_DIRTY;
+	parms->child->state |= CACHE_DIRTY;
+
+	if (MYBT_freeNode(genMsg, parms) != zOK)
+	{
+		RTN_STATUS(zFAILURE);
+	}
+
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	RTN_STATUS(zOK);
+}
+
+/****************************************************************************
+ * Even out the record counts of the child (left) and sibling (right) node.
+ *
+ * avg is half of the combined record count (integer division).  Entries
+ * are moved from the fuller node to the other one until the receiving
+ * node holds avg records, the parent key at parms->index + 1 is replaced
+ * by the sibling's new first key, and the change is logged as a
+ * MYBT_RECOVERY_OP_BALANCE record.  If either node already holds avg
+ * records nothing is moved or logged.
+ *
+ * On return parms->child is whichever of the two nodes covers parms->key
+ * (decided by comparing parms->key with the parent key at
+ * parms->index + 1); the other buffer has been released.
+ *
+ * In: parent, child, sibling latched
+ * Out: sibling unlatched
+ * Return: always zOK
+ ****************************************************************************/
+STATIC STATUS MYBT_balance (
+	GeneralMsg_s *genMsg,	/* not referenced in this routine */
+	MYBTreeParms_s *parms)
+{
+	MYBTreeNode_s *parent = (MYBTreeNode_s *)parms->parent->pBuf.data;
+	MYBTreeNode_s *sibling = (MYBTreeNode_s *)parms->sibling->pBuf.data;
+	MYBTreeNode_s *child = (MYBTreeNode_s *)parms->child->pBuf.data;
+	NINT avg;		/* target record count for the receiving node */
+	NINT numToMove;		/* entries transferred between the nodes */
+	BYTE *startLocation;	/* where the transferred entries land in the receiving node */
+	NINT lenMoved;		/* bytes transferred */
+	NINT direction;		/* MYBT_RIGHT_TO_LEFT or MYBT_LEFT_TO_RIGHT */
+	ZfsXaction_s *localXaction;
+	ZfsXasRecovery_s *logBuffer;
+	BlockInfo_s *poolBlks;
+	MybtInternalLog_s *logRecord;
+	BYTE *keyForParent, *oldKey;	/* log slots for the new and the previous parent key */
+	MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo;
+
+	ENTER(TZTREE, balance);	/* NOTE(review): tag is "balance" while the other routines use their full MYBT_ name - confirm this is intended */
+
+	avg = (child->numRecs + sibling->numRecs) / 2;
+
+	/* Dont do balancing if no records can be moved */
+	if ((child->numRecs == avg) || (sibling->numRecs == avg))
+	{
+		/*
+		 * The entry we were looking for when we started the balance may be
+		 * in either the child or the sibling entry. Make sure that it
+		 * ends up in the child.
+		 */
+		if (treeInfo->keyComp(parms->key,
+			MYBT_BRANCHENTRY(treeInfo, parent, parms->index + 1)->key) >= 0)
+		{
+			CACHE_RELEASE(parms->child);
+			parms->child = parms->sibling;
+		}
+		else
+		{
+			CACHE_RELEASE(parms->sibling);
+		}
+		RTN_STATUS(zOK);
+	}
+
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data);
+
+	if (MYBT_IS_LEAF(child))
+	{ /* leaf */
+		if (child->numRecs < avg)
+		{ /* if moving from the sibling to the child */
+			direction = MYBT_RIGHT_TO_LEFT;
+			numToMove = avg - child->numRecs;
+			lenMoved = numToMove * MYBT_LEAFENTRY_SIZE(treeInfo);
+			startLocation = (BYTE *)MYBT_LEAFENTRY(treeInfo, child, child->numRecs);
+
+			/* move the entries from sibling to child */
+			memcpy(startLocation, /* Destination */
+				MYBT_LEAFENTRY(treeInfo, sibling, 0), /* Source */
+				lenMoved);
+
+			/* move sibling entries back to the front of the record */
+			memmove(MYBT_LEAFENTRY(treeInfo, sibling, 0), /* Destination */
+				MYBT_LEAFENTRY(treeInfo, sibling, numToMove), /* Source */
+				(sibling->numRecs - numToMove) * MYBT_LEAFENTRY_SIZE(treeInfo));
+
+			child->numRecs += numToMove;
+			sibling->numRecs -= numToMove;
+		}
+		else
+		{
+			zASSERT(sibling->numRecs < avg);
+
+			direction = MYBT_LEFT_TO_RIGHT;
+			numToMove = avg - sibling->numRecs;
+			lenMoved = numToMove * MYBT_LEAFENTRY_SIZE(treeInfo);
+			startLocation = (BYTE *)MYBT_LEAFENTRY(treeInfo, sibling, 0);
+
+			/* make room for the entries from the child node */
+			memmove(MYBT_LEAFENTRY(treeInfo, sibling, numToMove), /* Destination */
+				MYBT_LEAFENTRY(treeInfo, sibling, 0), /* Source */
+				sibling->numRecs * MYBT_LEAFENTRY_SIZE(treeInfo));
+
+			/* move the entries from the child to the sibling */
+			memcpy(startLocation,
+				MYBT_LEAFENTRY(treeInfo, child, child->numRecs - numToMove),
+				lenMoved);
+
+			child->numRecs -= numToMove;
+			sibling->numRecs += numToMove;
+		}
+	}
+	else
+	{ /* branch */
+		if (child->numRecs < avg)
+		{ /* if moving from the sibling to the child */
+			direction = MYBT_RIGHT_TO_LEFT;
+			numToMove = avg - child->numRecs;
+			lenMoved = numToMove * MYBT_BRANCHENTRY_SIZE(treeInfo);
+			startLocation = (BYTE *)MYBT_BRANCHENTRY(treeInfo, child, child->numRecs);
+
+			/* move the entries from sibling to child */
+			memcpy(startLocation, /* Destination */
+				MYBT_BRANCHENTRY(treeInfo, sibling, 0), /* Source */
+				lenMoved);
+
+			/* move sibling entries back to the front of the record */
+			memmove(MYBT_BRANCHENTRY(treeInfo, sibling, 0), /* Destination */
+				MYBT_BRANCHENTRY(treeInfo, sibling, numToMove), /* Source */
+				(sibling->numRecs - numToMove) * MYBT_BRANCHENTRY_SIZE(treeInfo));
+
+			child->numRecs += numToMove;
+			sibling->numRecs -= numToMove;
+		}
+		else
+		{
+			zASSERT(sibling->numRecs < avg);
+
+			direction = MYBT_LEFT_TO_RIGHT;
+			numToMove = avg - sibling->numRecs;
+			lenMoved = numToMove * MYBT_BRANCHENTRY_SIZE(treeInfo);
+			startLocation = (BYTE *)MYBT_BRANCHENTRY(treeInfo, sibling, 0);
+
+			/* make room for the entries from the child node */
+			memmove(MYBT_BRANCHENTRY(treeInfo, sibling, numToMove), /* Destination */
+				MYBT_BRANCHENTRY(treeInfo, sibling, 0), /* Source */
+				sibling->numRecs * MYBT_BRANCHENTRY_SIZE(treeInfo));
+
+			/* move the entries from the child to the sibling */
+			memcpy(startLocation,
+				MYBT_BRANCHENTRY(treeInfo, child, child->numRecs - numToMove),
+				lenMoved);
+
+			child->numRecs -= numToMove;
+			sibling->numRecs += numToMove;
+		}
+	}
+
+	localXaction = BeginXLocal(parms->volume,BXL_DEFAULT);
+
+	/* room for the moved entries plus two keys (previous and new parent key) */
+	ZLOG_ObtainRecord( localXaction, ZLOG_BLOCK_INFO_SIZE(3) + sizeof(MYBTBalance_s)
+		- 1 + lenMoved + 2 * MYBT_KEY_SIZE(treeInfo));
+
+	/* create the log record */
+	ZLOG_INIT_LOG_RECORD(
+		(treeInfo->internalRecoveryStartOpCode + MYBT_RECOVERY_OP_BALANCE),
+		localXaction, logBuffer, 3, poolBlks, logRecord);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk,
+		parent->lsn, parms->parent, localXaction, 0, MYBT_compare);
+
+	/* put the source in position 1 and the destination in position 2 */
+	if (direction == MYBT_LEFT_TO_RIGHT)
+	{
+		ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk,
+			child->lsn, parms->child, localXaction, 1, MYBT_compare);
+		ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->sibling->volBlk,
+			sibling->lsn, parms->sibling, localXaction, 2, MYBT_compare);
+	}
+	else
+	{
+		ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->child->volBlk,
+			child->lsn, parms->child, localXaction, 2, MYBT_compare);
+		ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->sibling->volBlk,
+			sibling->lsn, parms->sibling, localXaction, 1, MYBT_compare);
+	}
+
+	/* fix up the parent record */
+	oldKey = &logRecord->u.balance.data[lenMoved];
+	MYBT_keyCopy(treeInfo, oldKey, MYBT_BRANCHENTRY(treeInfo, parent, parms->index + 1)->key);	/* previous separator key goes into the log first */
+	if (MYBT_IS_LEAF(sibling))
+	{
+		MYBT_keyCopy(treeInfo, MYBT_BRANCH_IND_KEY(treeInfo, parent, parms->index + 1),
+			MYBT_LEAF_IND_KEY(treeInfo, sibling, 0));
+	}
+	else
+	{
+		MYBT_keyCopy(treeInfo, MYBT_BRANCHENTRY(treeInfo, parent, parms->index + 1)->key,
+			MYBT_BRANCHENTRY(treeInfo, sibling, 0)->key);
+	}
+
+	keyForParent = oldKey + MYBT_KEY_SIZE(treeInfo);
+	MYBT_keyCopy(treeInfo, keyForParent, MYBT_BRANCHENTRY(treeInfo, parent, parms->index + 1)->key);	/* new separator key, logged right after the old one */
+	logRecord->u.balance.indexForParent = parms->index + 1;
+	logRecord->u.balance.dataLength = lenMoved;
+	logRecord->u.balance.numToMove = numToMove;
+
+	memcpy(&logRecord->u.balance.data[0], startLocation, lenMoved);
+
+	child->lsn = logBuffer->ZXR_Lsn;
+	sibling->lsn = logBuffer->ZXR_Lsn;
+	parent->lsn = logBuffer->ZXR_Lsn;
+
+	ZLOG_BIND(localXaction, parms->parent);
+	ZLOG_BIND(localXaction, parms->child);
+	ZLOG_BIND(localXaction, parms->sibling);
+
+	ZLOG_ReleaseRecordAndLogEnd(localXaction);
+	EndXlocal(localXaction);
+
+	/* keep whichever node now covers parms->key as the child; release the other as dirty */
+	if (treeInfo->keyComp(parms->key,
+		MYBT_BRANCHENTRY(treeInfo, parent, parms->index + 1)->key) >= 0)
+	{
+		MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+		MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data);
+		CACHE_DIRTY_RELEASE(parms->child);
+		parms->child = parms->sibling;
+	}
+	else
+	{
+		MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data);
+		CACHE_DIRTY_RELEASE(parms->sibling);
+	}
+
+	parms->parent->state |= CACHE_DIRTY;
+	parms->child->state |= CACHE_DIRTY;
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	RTN_STATUS(zOK);
+}
+
+/****************************************************************************
+ * Joins or balances nodes because a node is partially empty
+ *
+ * Input: Parent and child are locked.
+ * + * Ouput: Both still locked + ****************************************************************************/ +STATIC STATUS MYBT_underflow ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + MYBTreeNode_s *parent = (MYBTreeNode_s *)parms->parent->pBuf.data; + IoMsg_s iomsg; + STATUS status; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + ENTER(TZTREE, MYBT_underflow); + + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data); + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + /* Get sibling and child in right order, child lower than sibling */ + if (parms->index < parent->numRecs - 1) + { + parms->readBlkNum = MYBT_BRANCHENTRY(treeInfo, parent, parms->index + 1)->child; + READBLK_IO_MSG(iomsg, btreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 1); + parms->sibling = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + if (parms->sibling == NULL) + { + RTN_STATUS(zFAILURE); + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data); + } + else + { + parms->sibling = parms->child; + -- parms->index; + parms->readBlkNum = MYBT_BRANCHENTRY(treeInfo, parent, parms->index)->child; + READBLK_IO_MSG(iomsg, btreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 2); + /* + * Release the latch so we do not latch out of order -- This is + * OK because the parent latch is held. 
+ */ + CACHE_PIN(parms->sibling); + CACHE_UNXLATCH(parms->sibling); + parms->child = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + CACHE_XLATCH(parms->sibling); + CACHE_UNPIN(parms->sibling); + if (parms->child == NULL) + { + CACHE_RELEASE(parms->sibling); + RTN_STATUS(zFAILURE); + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + } + + if (MYBT_IS_LESSTHAN_MAX(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data, + (MYBTreeNode_s *)parms->sibling->pBuf.data)) + { + status = MYBT_join(genMsg, parms); + } + else + { + status = MYBT_balance(genMsg, parms); + } + + RTN_STATUS(status); +} + +/**************************************************************************** + * Shrink the tree by removing the root node + * + * Input: Child is latched. It is the root. The tree beast is latched. + * + * Output: New child (latched). btree beast is latched. + ****************************************************************************/ +STATIC STATUS MYBT_shrink ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + Lsn_t rootLsn; + Blknum_t rootBlkNum; + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + MybtInternalLog_s *logRecord; + ZfsXaction_s *localXaction; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + + ENTER(TZTREE, MYBT_shrink); + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + parms->sibling = parms->child; + rootBlkNum = parms->child->volBlk; + rootLsn = ((MYBTreeNode_s *)parms->child->pBuf.data)->lsn; + btreeBeast->zfsBtree.p.btRoot = + MYBT_BRANCHENTRY(treeInfo, ((MYBTreeNode_s *)parms->child->pBuf.data), 0)->child; + + + /* force the beast out */ + localXaction = BeginXLocal(parms->volume,BXL_DEFAULT); + COMN_MARK_BEAST_XLOCAL(&btreeBeast->ZFSMYBTREEroot, + &localXaction->xaction); + if (COMN_ForceBeastWrite(genMsg, &btreeBeast->ZFSMYBTREEroot, + &localXaction->xaction)) + { + goto errorEndXaction; + } + + parms->readBlkNum = 
btreeBeast->zfsBtree.p.btRoot; + + READBLK_IO_MSG(iomsg, btreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 4); + parms->child = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + if (parms->child == NULL) + { + //zASSERT(0); + goto errorEndXaction; + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + ((MYBTreeNode_s *)parms->child->pBuf.data)->state |= MYBT_ROOT; + + /* log the shrink action */ + + ZLOG_ObtainRecord(localXaction, ZLOG_BLOCK_INFO_SIZE(2) + sizeof(MYBTGrow_s)); + + ZLOG_INIT_LOG_RECORD( + (treeInfo->internalRecoveryStartOpCode + MYBT_RECOVERY_OP_SHRINK), + localXaction, logBuffer, 2, poolBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], rootBlkNum, + ((MYBTreeNode_s *)parms->sibling)->lsn, parms->sibling, localXaction, 0, + MYBT_compare); + ZLOG_DELETE_BLOCK(localXaction, poolBlks[0]); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk, + ((MYBTreeNode_s *)parms->child->pBuf.data)->lsn, parms->child, + localXaction, 1, MYBT_compare); + + ((MYBTreeNode_s *)parms->child->pBuf.data)->lsn = logBuffer->ZXR_Lsn; + +#if LOG_TEST IS_ENABLED + if (LogTest) + ((MYBTreeNode_s *)parms->sibling->pBuf.data)->lsn = logBuffer->ZXR_Lsn; +#endif + logRecord->u.grow.blockForParent = parms->child->volBlk; + logRecord->u.grow.internalID = btreeBeast->ZFSMYBTREEroot.ROOTinternalID; + + ZLOG_BIND(localXaction, parms->child); + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + if (MYBT_freeNode(genMsg, parms) != zOK) + { + goto errorExit; + } + + parms->child->state |= CACHE_DIRTY; + + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + RTN_STATUS(zOK); + +errorEndXaction: + EndXlocal(localXaction); +errorExit: +#if NSS_DEBUG IS_ENABLED + if (parms->child != NULL) + { + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + } +#endif + RTN_STATUS(zFAILURE); +} + +/**************************************************************************** + * Check 
sibling nodes to see if they should be joined
+ *
+ * Input: Child has the node to be checked. Child is latched. The tree
+ * beast is latched if the child is the root.
+ *
+ * Output: Child and parent are latched. b-tree is unlatched.
+ * If an error occurs then child and parent are unlatched.
+ ****************************************************************************/
+/*
+ * Dispatch summary:
+ *   - non-root node at its minimum entry count -> MYBT_underflow()
+ *   - root that is a branch with a single record -> MYBT_shrink()
+ *   - any other root -> only the beast latch is dropped
+ * After a successful shrink the beast latch is dropped unless the new
+ * child is a leaf.
+ */
+STATIC STATUS MYBT_checkForJoin (
+	GeneralMsg_s *genMsg,
+	MYBTreeParms_s *parms)
+{
+	MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo;
+	ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast;
+	MYBTreeNode_s *node = (MYBTreeNode_s *)parms->child->pBuf.data;
+
+	ENTER(TZTREE, MYBT_checkForJoin);
+	MYBT_VALIDATE_NODE(treeInfo, node);
+
+	if (!(MYBT_IS_ROOT(node)))
+	{
+		/* ordinary node: repair it only when it is at the minimum */
+		if ((MYBT_IS_MIN_ENTRIES(treeInfo, node)) &&
+			(MYBT_underflow(genMsg, parms) != zOK))
+		{
+			goto errorRelease;
+		}
+	}
+	else if ((node->numRecs != 1) || MYBT_IS_LEAF(node))
+	{
+		/* root that cannot be collapsed: just give up the beast latch */
+		UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch);
+	}
+	else if (MYBT_shrink(genMsg, parms) != zOK)
+	{
+		UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch);
+		RTN_STATUS(zFAILURE);
+	}
+	else if (!MYBT_IS_LEAF((MYBTreeNode_s *)parms->child->pBuf.data))
+	{ /* if it got turned into a leaf during shrink, don't unlatch it */
+		UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch);
+	}
+
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	RTN_STATUS(zOK);
+
+errorRelease:
+	/* underflow failed: drop whatever buffers are still referenced */
+	if (parms->parent != NULL)
+	{
+		CACHE_RELEASE(parms->parent);
+	}
+	if (parms->child != NULL)
+	{
+		CACHE_RELEASE(parms->child);
+	}
+	RTN_STATUS(zFAILURE);
+}
+
+/****************************************************************************
+ * Split a node.
+ *
+ * Input: Child has the node that will be split. Parent and child are
+ * latched.
+ *
+ * Output: Child holds the node that can be inserted into. The parent
+ * and child are latched.
+ ****************************************************************************/
+/*
+ * MYBT_split details:
+ *   - the node being split stays in place as the left node and is carried
+ *     in parms->sibling; a newly allocated block becomes the right node
+ *   - the left node keeps MYBT_MIN_LEAF_ENTRIES / MYBT_MIN_BRANCH_ENTRIES
+ *     records, the remainder is copied to the new node
+ *   - an entry for the new node is inserted in the parent at
+ *     parms->index + 1, keyed by the new node's first key
+ *   - the change is logged as a MYBT_RECOVERY_OP_SPLIT record
+ *   - on return parms->child is whichever half covers parms->key; the
+ *     other half has been released as dirty
+ *
+ * Return: zOK, or zFAILURE when no block can be allocated (parms->child is
+ * then restored to the original node).
+ */
+STATIC STATUS MYBT_split (
+	GeneralMsg_s *genMsg,
+	MYBTreeParms_s *parms)
+{
+	ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast;
+	MYBTreeNode_s *parent = (MYBTreeNode_s *)parms->parent->pBuf.data;
+	MYBTreeNode_s *sibling;		/* node being split (left half) */
+	MYBTreeNode_s *child;		/* newly allocated node (right half) */
+	NINT i;				/* parent index of the new entry */
+	IoMsg_s iomsg;
+	BYTE *startLocation;		/* first entry of the new node */
+	NINT lenMoved;			/* bytes copied into the new node */
+	NINT numMoved;			/* entries copied into the new node */
+	NINT holdState;			/* MYBT_LEAF bit of the node being split */
+	ZfsXaction_s *localXaction;
+	ZfsXasRecovery_s *logBuffer;
+	BlockInfo_s *poolBlks;
+	MybtInternalLog_s *logRecord;
+	BYTE *keyForParent;
+	MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo;
+
+	ENTER(TZTREE, MYBT_split);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+
+	parms->sibling = parms->child;
+	sibling = (MYBTreeNode_s *)parms->sibling->pBuf.data;
+
+	localXaction = BeginXLocal(parms->volume,BXL_DEFAULT);
+
+	/* get the new node */
+	XALLOC_SEED_IO_MSG( iomsg, btreeBeast, localXaction, parms->sibling->volBlk, CACHE_WRITE );
+	parms->child = ZFS_AllocPoolBlkSpecialWithFlags(genMsg, &iomsg, NULL, XTREE_AF_NEAR_TREE );
+	if (parms->child == NULL)
+	{
+		parms->child = parms->sibling; /* restore the child for error cleanup */
+		goto errorEndXaction;
+	}
+	parms->readBlkNum = parms->child->volBlk;
+
+	/*
+	 * Sibling is the node to be split. It is the left node in the split.
+	 * Child is the new node.
+	 */
+
+	child = (MYBTreeNode_s *)parms->child->pBuf.data;
+	holdState = sibling->state & MYBT_LEAF;
+	MYBT_initNode(treeInfo, parms->child, holdState,
+		&btreeBeast->ZFSMYBTREEroot.ROOTinternalID);
+
+	if (MYBT_IS_LEAF(sibling))
+	{
+		child->numRecs = sibling->numRecs - MYBT_MIN_LEAF_ENTRIES(treeInfo);
+
+		/* setup logging information */
+		startLocation = (BYTE *)MYBT_LEAFENTRY(treeInfo, child, 0);
+		lenMoved = child->numRecs * MYBT_LEAFENTRY_SIZE(treeInfo);
+		numMoved = child->numRecs;
+
+		memcpy(startLocation, /* Destination */
+			MYBT_LEAFENTRY(treeInfo, sibling, MYBT_MIN_LEAF_ENTRIES(treeInfo)), /* Source */
+			lenMoved); /* length */
+		sibling->numRecs = MYBT_MIN_LEAF_ENTRIES(treeInfo);
+
+		/* update leaf links */
+		child->n.leaf.nextLeaf = sibling->n.leaf.nextLeaf;
+		sibling->n.leaf.nextLeaf = parms->readBlkNum;
+	}
+	else
+	{
+		child->numRecs = sibling->numRecs - MYBT_MIN_BRANCH_ENTRIES(treeInfo);
+
+		/* setup logging information */
+		startLocation = (BYTE *)MYBT_BRANCHENTRY(treeInfo, child, 0);
+		lenMoved = child->numRecs * MYBT_BRANCHENTRY_SIZE(treeInfo);
+		numMoved = child->numRecs;
+
+		memcpy(startLocation, /* Destination */
+			MYBT_BRANCHENTRY(treeInfo, sibling, MYBT_MIN_BRANCH_ENTRIES(treeInfo)), /* Source */
+			lenMoved); /* length */
+		sibling->numRecs = MYBT_MIN_BRANCH_ENTRIES(treeInfo);
+	}
+
+	/*
+	 * Make room in the Parent to place the node
+	 */
+	i = parms->index + 1;
+	memmove(MYBT_BRANCHENTRY(treeInfo, parent, i + 1), /* Destination */
+		MYBT_BRANCHENTRY(treeInfo, parent, i), /* Source */
+		(parent->numRecs - i) * MYBT_BRANCHENTRY_SIZE(treeInfo));
+
+	/* the new parent entry is keyed by the first key of the new node */
+	if (MYBT_IS_LEAF(sibling))
+	{
+		MYBT_keyCopy(treeInfo, MYBT_BRANCHENTRY(treeInfo, parent, i)->key, MYBT_LEAF_IND_KEY(treeInfo, child, 0));
+	}
+	else
+	{
+		MYBT_keyCopy(treeInfo, MYBT_BRANCHENTRY(treeInfo, parent, i)->key, MYBT_BRANCHENTRY(treeInfo, child, 0)->key);
+	}
+	MYBT_BRANCHENTRY(treeInfo, parent, i)->child = parms->readBlkNum;
+	MYBT_BRANCHENTRY(treeInfo, parent, i)->reserved = 0;
+	++(parent->numRecs);
+
+	ZLOG_ObtainRecord( localXaction, ZLOG_BLOCK_INFO_SIZE(3) + sizeof(MYBTSplit_s)
+		- 1 + lenMoved + MYBT_KEY_SIZE(treeInfo));
+
+	ZLOG_INIT_LOG_RECORD(
+		(treeInfo->internalRecoveryStartOpCode + MYBT_RECOVERY_OP_SPLIT),
+		localXaction, logBuffer, 3, poolBlks, logRecord);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk, parent->lsn,
+		parms->parent, localXaction, 0, MYBT_compare);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->sibling->volBlk, sibling->lsn,
+		parms->sibling, localXaction, 1, MYBT_compare);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[2], parms->child->volBlk, child->lsn,
+		parms->child, localXaction, 2, MYBT_compare);
+	ZLOG_ALLOC_BLOCK(poolBlks[2]);
+
+	keyForParent = &logRecord->u.split.data[lenMoved];
+	MYBT_keyCopy(treeInfo, keyForParent, MYBT_BRANCHENTRY(treeInfo, parent, i)->key);
+	logRecord->u.split.leafLink = child->n.leaf.nextLeaf;
+	logRecord->u.split.blockForParent = poolBlks[2].blkNum;
+	logRecord->u.split.internalID = btreeBeast->ZFSMYBTREEroot.ROOTinternalID;
+	logRecord->u.split.indexForParent = i;
+	logRecord->u.split.dataLength = lenMoved;
+	logRecord->u.split.numToMove = numMoved;
+	logRecord->u.split.nodeType = holdState;
+	memcpy(&logRecord->u.split.data[0], startLocation, lenMoved);
+
+	child->lsn = logBuffer->ZXR_Lsn;
+	sibling->lsn = logBuffer->ZXR_Lsn;
+	parent->lsn = logBuffer->ZXR_Lsn;
+
+	ZLOG_BIND(localXaction, parms->child);
+	ZLOG_BIND(localXaction, parms->sibling);
+	ZLOG_BIND(localXaction, parms->parent);
+
+	ZLOG_ReleaseRecordAndLogEnd(localXaction);
+	EndXlocal(localXaction);
+
+	if (treeInfo->keyComp(parms->key, MYBT_BRANCHENTRY(treeInfo, parent, i)->key) >= 0)
+	{ /* if inserting in the child */
+		MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->sibling->pBuf.data);
+		CACHE_DIRTY_RELEASE(parms->sibling);
+	}
+	else
+	{
+		MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+		CACHE_DIRTY_RELEASE(parms->child);
+		parms->child = parms->sibling;
+		child = sibling;
+	}
+	parms->parent->state |= CACHE_DIRTY;
+	parms->child->state |= CACHE_DIRTY;
+	++Inst.ztree.split;
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	RTN_STATUS(zOK);
+
+errorEndXaction:
+	EndXlocal(localXaction);
+	RTN_STATUS(zFAILURE);
+}
+
+/****************************************************************************
+ * Grow the tree by adding a new root node and splitting the former root.
+ *
+ * A new block is allocated and initialised as a root branch with a single
+ * entry (zero key, child = old root block).  The beast's btRoot is pointed
+ * at it and forced out, the old root loses its MYBT_ROOT flag, the change
+ * is logged as a MYBT_RECOVERY_OP_GROW record and the former root is then
+ * split with MYBT_split() (parms->index is set to 0 for that call).
+ *
+ * Input: Child is latched. It is the root. The tree beast is latched.
+ *
+ * Output: Returns with the parent and child latched and the beast tree
+ * unlatched.
+ *
+ * Return: zOK, or zFAILURE when the allocation, the beast write or the
+ * split fails.  The beast latch is dropped on every path.
+ *
+ * NOTE(review): when COMN_ForceBeastWrite() fails btRoot already refers
+ * to the new block and parms->parent still holds it; neither is undone
+ * here - confirm the caller's cleanup covers this.
+ ****************************************************************************/
+STATIC STATUS MYBT_grow (
+	GeneralMsg_s *genMsg,
+	MYBTreeParms_s *parms)
+{
+	ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast;
+	MYBTreeNode_s *newRoot;
+	IoMsg_s iomsg;
+	ZfsXasRecovery_s *logBuffer;
+	BlockInfo_s *poolBlks;
+	MybtInternalLog_s *logRecord;
+	ZfsXaction_s *localXaction;
+	MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo;
+
+	ENTER(TZTREE, MYBT_grow);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+
+	localXaction = BeginXLocal(parms->volume,BXL_DEFAULT);
+
+	/* get the node for the new root */
+	XALLOCBLK_IO_MSG(iomsg, btreeBeast, localXaction, CACHE_WRITE);
+	parms->parent = ZFS_AllocPoolBlkSpecialWithFlags(genMsg, &iomsg, NULL, XTREE_AF_NEW_AREA );
+	if (parms->parent == NULL)
+	{
+		//zASSERT(0);
+		goto errorUnlatch;
+	}
+	parms->readBlkNum = parms->parent->volBlk;
+
+	newRoot = (MYBTreeNode_s *)parms->parent->pBuf.data;
+	MYBT_initNode(treeInfo, parms->parent, MYBT_ROOT,
+		&btreeBeast->ZFSMYBTREEroot.ROOTinternalID);
+	newRoot->numRecs = 1;
+	treeInfo->setZeroKey(MYBT_BRANCHENTRY(treeInfo, newRoot, 0)->key);
+	MYBT_BRANCHENTRY(treeInfo, newRoot, 0)->child = btreeBeast->zfsBtree.p.btRoot;
+	MYBT_BRANCHENTRY(treeInfo, newRoot, 0)->reserved = 0;
+	btreeBeast->zfsBtree.p.btRoot = parms->readBlkNum;
+
+	/* force the beast out */
+	COMN_MARK_BEAST_XLOCAL(&btreeBeast->ZFSMYBTREEroot, &localXaction->xaction);
+	if (COMN_ForceBeastWrite(genMsg, &btreeBeast->ZFSMYBTREEroot,
+		&localXaction->xaction))
+	{
+		goto errorUnlatch;
+	}
+
+	parms->sibling = parms->child;
+	((MYBTreeNode_s *)parms->sibling->pBuf.data)->state &= ~(MYBT_ROOT); /* reset the root state */
+	parms->index = 0;
+
+	/* log the grow action */
+
+	ZLOG_ObtainRecord(localXaction, ZLOG_BLOCK_INFO_SIZE(2) + sizeof(MYBTGrow_s));
+
+	ZLOG_INIT_LOG_RECORD(
+		(treeInfo->internalRecoveryStartOpCode + MYBT_RECOVERY_OP_GROW),
+		localXaction, logBuffer, 2, poolBlks, logRecord);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->parent->volBlk, 0, parms->parent,
+		localXaction, 0, MYBT_compare);
+	ZLOG_ALLOC_BLOCK(poolBlks[0]);
+	ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[1], parms->child->volBlk,
+		((MYBTreeNode_s *)parms->child->pBuf.data)->lsn, parms->child,
+		localXaction, 1, MYBT_compare);
+	newRoot->lsn = logBuffer->ZXR_Lsn;
+	((MYBTreeNode_s *)parms->child->pBuf.data)->lsn = logBuffer->ZXR_Lsn;
+	logRecord->u.grow.blockForParent = parms->child->volBlk;
+	logRecord->u.grow.internalID = btreeBeast->ZFSMYBTREEroot.ROOTinternalID;
+
+	ZLOG_BIND(localXaction, parms->child);
+	ZLOG_BIND(localXaction, parms->parent);
+
+	ZLOG_ReleaseRecordAndLogEnd(localXaction);
+	EndXlocal(localXaction);
+
+	/* When doing forceBeastWrite, unlatch beast only after ending xaction */
+	UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch);
+
+	++Inst.ztree.grow;
+
+	if (MYBT_split(genMsg, parms) != zOK)
+	{
+		RTN_STATUS(zFAILURE);
+	}
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data);
+	MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data);
+	RTN_STATUS(zOK);
+
+errorUnlatch:
+	/*
+	 * Same ordering rule as the success path: the beast may have been
+	 * marked for this transaction, so end the transaction first and drop
+	 * the beast latch afterwards (previously the latch was dropped first).
+	 */
+	EndXlocal(localXaction);
+	UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch);
+	RTN_STATUS(zFAILURE);
+}
+
+/****************************************************************************
+ * Check a node to see if it should be
split + * + * Input: Child has the node to be checked. Child is latched. The tree + * beast is latched if the child is the root. + * + * Output: Child and parent are latched. b-tree beast is unlatched. + * If an error occurs then child and parent are unlatched. + ****************************************************************************/ +STATIC STATUS MYBT_checkForSplit ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + MYBTreeNode_s *node = (MYBTreeNode_s *)parms->child->pBuf.data; + + ENTER(TZTREE, MYBT_checkForSplit); + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + if (MYBT_IS_MAX_ENTRIES(treeInfo, node)) + { + if (MYBT_IS_ROOT(node)) + { /* if it is the root then grow the tree another level */ + if (MYBT_grow(genMsg, parms) != zOK) + { + goto errorRelease; + } + } + else + { + if (MYBT_split(genMsg, parms) != zOK) + { + goto errorRelease; + } + } + } + else + { + if (MYBT_IS_ROOT(node)) + { + UNX_LATCH(&parms->btreeBeast->ZFSMYBTREEbeastLatch); + } + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + RTN_STATUS(zOK); + +errorRelease: + if (parms->parent != NULL) + { + CACHE_RELEASE(parms->parent); + } + if (parms->child != NULL) + { + CACHE_RELEASE(parms->child); + } + RTN_STATUS(zFAILURE); +} + +/**************************************************************************** + * Put an insert leaf entry log record into the log + * + * Input: Child record is Latched + * + * Output: Child record is Latched + ****************************************************************************/ +STATIC void MYBT_logInsertRecord ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms, + MYBTValue_t value) +{ + MYBTreeNode_s *child = (MYBTreeNode_s *)parms->child->pBuf.data; + ZfsXasRecovery_s *logBuffer; + MybtLogicalLog_s *logRecord; + BlockInfo_s *poolBlks; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + ENTER(TZTREE, MYBT_logInsertRecord); + 
MYBT_VALIDATE_NODE(treeInfo, child); + + ZLOG_ObtainRecord(parms->xaction, + ZLOG_BLOCK_INFO_SIZE(1) + sizeof(MybtLogicalLog_s) - 1 + + MYBT_ENTRY_SIZE(treeInfo)); + + ZLOG_INIT_LOG_RECORD( + (treeInfo->logicalRecoveryStartOpCode + + MYBT_RECOVERY_L_OP_INSERT_ENTRY), + parms->xaction, logBuffer, 1, poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->child->volBlk, child->lsn, + parms->child, parms->xaction, 0, MYBT_compare); + + logRecord->internalVolumeID = parms->btreeBeast->ZFSMYBTREEroot.ROOTinternalID; + logRecord->btreeBeastZid = parms->btreeBeast->ZFSMYBTREEzid; + MYBT_keyCopy(treeInfo, MYBT_LEAFENTRY_KEY(treeInfo, logRecord->u.entry), parms->key); + MYBT_valueCopy(treeInfo, MYBT_LEAFENTRY_VALUE(treeInfo, logRecord->u.entry), value); + child->lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(parms->xaction, parms->child); + + ZLOG_ReleaseRecord(parms->xaction); + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + RTN_VOID(); +} + +/**************************************************************************** + * Make room in the current buffer for a new entry and return the entry. + * + * Input: Child has the node to be inserted into. The child is latched. + * + ****************************************************************************/ +STATIC STATUS MYBT_insertEntryIntoNode ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms, + MYBTValue_t value) +{ + MYBTreeNode_s *child; + NINT i; + BYTE *entry; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + /* + * This routine assumes a check has ready been done to make sure there + * there is enough free space in the buffer to hold the new znode. 
+ */ + + ENTER(TZTREE, MYBT_insertEntryIntoNode); + + child = (MYBTreeNode_s *)parms->child->pBuf.data; + + /* + * Add the entry to the leaf node + */ + if (MYBT_findLeafEntry(treeInfo, child, parms->key, &i)) + { + if (parms->updateCallback == NULL) + { + /* Requested insert and not insert/update */ + SetErrno(genMsg, zERR_FILE_ALREADY_EXISTS); + RTN_STATUS(zFAILURE); + } + /* We found an entry in the tree. updateCallBack is non-NULL, + * so call the callback routine. The callback routine will + * return the modified value to be inserted, in the newValue + * field, which is value, which is a pointer to the parms->value. + * Next we do a delete of the entry and the insert code will + * insert it, resulting in an update. + */ + if (parms->updateCallback( genMsg, + MYBT_LEAF_IND_VALUE(treeInfo, child, i), + value) != zOK) + { + RTN_STATUS(zFAILURE); + } + MYBT_logDeleteRecord(genMsg, parms, i); + MYBT_deleteLeafEntry(treeInfo, child, i); + } + + entry = MYBT_LEAFENTRY(treeInfo, child, i); + + /* make room for the entry */ + memmove(MYBT_LEAFENTRY(treeInfo, child, i + 1), /* Destination */ + entry, /* Source */ + (child->numRecs - i) * MYBT_LEAFENTRY_SIZE(treeInfo)); + + MYBT_keyCopy(treeInfo, MYBT_LEAFENTRY_KEY(treeInfo, entry), parms->key); + MYBT_valueCopy(treeInfo, MYBT_LEAFENTRY_VALUE(treeInfo, entry), value); + + ++(child->numRecs); + + RTN_STATUS(zOK); +} + +/**************************************************************************** + * Insert the entry into the b-tree node + * + * Input: Child and parent are latched. Child is buffer to inserted into. + * + * Output: Parent and child are unlatched. 
+ ****************************************************************************/ +STATIC STATUS MYBT_doInsertEntry ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + MYBTreeNode_s *child; + + ENTER(TZTREE, MYBT_doInsertEntry); + MYBT_VALIDATE_NODE(parms->treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + if (MYBT_checkForSplit(genMsg, parms) != zOK) + { + RTN_STATUS(zFAILURE); + } + + child = (MYBTreeNode_s *)parms->child->pBuf.data; + if (!MYBT_IS_ROOT(child)) + { + MYBT_VALIDATE_NODE(parms->treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + } + + if (MYBT_insertEntryIntoNode(genMsg, parms, parms->value)) + { + CACHE_RELEASE(parms->child); + RTN_STATUS(zFAILURE); + } + MYBT_logInsertRecord(genMsg, parms, parms->value); + + MYBT_VALIDATE_NODE(parms->treeInfo, child); + CACHE_DIRTY_RELEASE(parms->child); + RTN_STATUS(zOK); +} + +/**************************************************************************** + * Finds the leaf node. Splits or joins nodes as it goes down the tree if + * requested with the options + * + * Input: Will also start at the root of the b-tree. Gets root from the + * b-tree beast. Splits or joins based on the options. + * b-tree beast is latched. + * + * Output: Leaves with parent and child latched, unless there is an error, + * then they are unlatched. 
+ ****************************************************************************/ +STATIC STATUS MYBT_findLeaf ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms, + NINT options) +{ + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + MYBTreeNode_s *node; + IoMsg_s iomsg; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + ENTER(TZTREE, MYBT_findLeaf); + + parms->readBlkNum = btreeBeast->zfsBtree.p.btRoot; + READBLK_IO_MSG(iomsg, btreeBeast, parms->readBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 5); + parms->child = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + if (parms->child == NULL) + { + //zASSERT(0); + UNX_LATCH(&parms->btreeBeast->ZFSMYBTREEbeastLatch); + RTN_STATUS(zFAILURE); + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + node = (MYBTreeNode_s *)parms->child->pBuf.data; + + zASSERT(MYBT_IS_ROOT(node)); + while (!MYBT_IS_LEAF(node)) + { + if (options & MYBT_CHECK_FOR_SPLIT) + { + if (MYBT_checkForSplit(genMsg, parms) != zOK) + { + RTN_STATUS(zFAILURE); + } + /* the child can change to a different node during a split */ + node = (MYBTreeNode_s *)parms->child->pBuf.data; + } + else if (options & MYBT_CHECK_FOR_JOIN) + { + if (MYBT_checkForJoin(genMsg, parms) != zOK) + { + RTN_STATUS(zFAILURE); + } + /* the child can change to a different node during a join */ + node = (MYBTreeNode_s *)parms->child->pBuf.data; + /* if the tree shrinks we may be at the leaf node */ + if (MYBT_IS_LEAF(node)) + { + break; + } + } + else + { + if (MYBT_IS_ROOT(node)) + { + UNX_LATCH(&parms->btreeBeast->ZFSMYBTREEbeastLatch); + } + } + if (!MYBT_IS_ROOT(node)) + { + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + } + /* go down another level */ + parms->parent = parms->child; + parms->readBlkNum = + MYBT_findChildBlock(treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data, + parms->key, &parms->index); + READBLK_IO_MSG(iomsg, btreeBeast, parms->readBlkNum, CACHE_UPDATE); + 
SET_DEBUG_ID(iomsg, 6); + parms->child = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + if (parms->child == NULL) + { + //zASSERT(0); + CACHE_RELEASE(parms->parent); + RTN_STATUS(zFAILURE); + } + node = (MYBTreeNode_s *)parms->child->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + RTN_STATUS(zOK); +} + +/**************************************************************************** + * + ****************************************************************************/ +STATUS MYBT_insertEntry ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + MybtInternalLog_s *logRecord; + ZfsXaction_s *localXaction; + MYBTreeNode_s *node; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + ENTER(TZTREE, MYBT_insertEntry); + + /* + * Get an exclusive latch on the Tree Beast + */ + X_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + + if (btreeBeast->zfsBtree.p.btRoot == INVALID_BLK) + { /* no root to the b-tree exists */ + + localXaction = BeginXLocal(parms->volume, BXL_DEFAULT); + + XALLOCBLK_IO_MSG(iomsg, btreeBeast, localXaction, CACHE_WRITE); + parms->child = ZFS_AllocPoolBlkSpecialWithFlags(genMsg, &iomsg, NULL, XTREE_AF_NEW_AREA ); + if (parms->child == NULL) + { + goto errorEndXaction; + } + node = (MYBTreeNode_s *)parms->child->pBuf.data; + parms->readBlkNum = parms->child->volBlk; + MYBT_initNode(treeInfo, parms->child, MYBT_ROOT|MYBT_LEAF, + &btreeBeast->ZFSMYBTREEroot.ROOTinternalID); + MYBT_insertZeroNode(treeInfo, node); + btreeBeast->zfsBtree.p.btRoot = parms->child->volBlk; + btreeBeast->zfsBtree.p.btLeftMostLeaf = parms->child->volBlk; + +// COMN_MARK_BEAST_DIRTY( &btreeBeast->ZFSMYBTREEroot); + COMN_MARK_BEAST_XLOCAL(&btreeBeast->ZFSMYBTREEroot, &localXaction->xaction); + + if (COMN_ForceBeastWrite(genMsg, 
&btreeBeast->ZFSMYBTREEroot, + &localXaction->xaction) != zOK) + { + //COMN_AbortXaction() + CACHE_RELEASE(parms->child); + goto errorEndXaction; + } + + /* log the init record */ + ZLOG_ObtainRecord(localXaction, + ZLOG_BLOCK_INFO_SIZE(1) + sizeof(MYBTTreeInit_s) ); + + ZLOG_INIT_LOG_RECORD( + (treeInfo->internalRecoveryStartOpCode + MYBT_RECOVERY_OP_INIT), + localXaction, logBuffer, 1, poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->child->volBlk, + node->lsn, parms->child, localXaction, 0, MYBT_compare); + ZLOG_ALLOC_BLOCK(poolBlks[0]); + node->lsn = logBuffer->ZXR_Lsn; + logRecord->u.initTree.internalID = + btreeBeast->ZFSMYBTREEroot.ROOTinternalID; + + ZLOG_BIND(localXaction, parms->child); + + ZLOG_ReleaseRecordAndLogEnd(localXaction); + EndXlocal(localXaction); + + if (MYBT_doInsertEntry(genMsg, parms) != zOK) + { + RTN_STATUS(zFAILURE); + } + } + else + { + if (MYBT_findLeaf(genMsg, parms, MYBT_CHECK_FOR_SPLIT) != zOK) + { + RTN_STATUS(zFAILURE); + } + if (MYBT_doInsertEntry(genMsg, parms) != zOK) + { + RTN_STATUS(zFAILURE); + } + } + +#if 0 && (NSS_DEBUG IS_ENABLED) + MYBT_displayTree(treeInfo, btreeBeast); +#endif + /* the insert of the new beast was successful, update the volume count */ + RTN_STATUS(zOK); + +errorEndXaction: + EndXlocal(localXaction); + UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + RTN_STATUS(zFAILURE); +} + +/**************************************************************************** + * Decend the tree until you are at a leaf + ****************************************************************************/ +STATIC STATUS MYBT_getLeaf( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + NINT dummy; + IoMsg_s iomsg; + Blknum_t poolBlk; + MYBTreeNode_s *node; + + ENTER(TZTREE, MYBT_getLeaf); + + node = (MYBTreeNode_s *)parms->child->pBuf.data; + while (!MYBT_IS_LEAF(node)) + { + parms->parent = parms->child; + poolBlk = MYBT_findChildBlock(parms->treeInfo, node, parms->key, &dummy); + + READBLK_IO_MSG(iomsg, 
parms->btreeBeast, poolBlk, CACHE_READ); + SET_DEBUG_ID(iomsg, 7); + parms->child = MYBT_ReadPoolBlk(parms->treeInfo, genMsg, &iomsg); + CACHE_RELEASE(parms->parent); + if (parms->child == NULL) + { + //zASSERT(0); + RTN_STATUS(zFAILURE); + } + node = (MYBTreeNode_s *)parms->child->pBuf.data; + MYBT_VALIDATE_NODE(parms->treeInfo, node); + } + RTN_STATUS(zOK); +} + +STATUS MYBT_lookupEntry( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + MYBTreeNode_s *node; + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + IoMsg_s iomsg; + NINT index; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + /* + * Get an exclusive latch on the B-Tree Beast + */ + S_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + if (btreeBeast->zfsBtree.p.btRoot == INVALID_BLK) + { + UNS_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + RTN_STATUS(zFAILURE); + } + + /* read the root node */ + READBLK_IO_MSG(iomsg, btreeBeast, btreeBeast->zfsBtree.p.btRoot, + CACHE_READ); + SET_DEBUG_ID(iomsg, 8); + parms->child = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + + UNS_LATCH(&btreeBeast->ZFSBEASTTREEbeastLatch); + if (parms->child == NULL) + { + //zASSERT(0); + RTN_STATUS(zFAILURE); + } + MYBT_VALIDATE_NODE(treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + /* decend the tree to the leaf */ + if (MYBT_getLeaf(genMsg, parms) != zOK) + { + RTN_STATUS(zFAILURE); + } + + node = (MYBTreeNode_s *)parms->child->pBuf.data; + + zASSERT(MYBT_IS_LEAF((MYBTreeNode_s *)parms->child->pBuf.data)); + + if (!MYBT_findLeafEntry(treeInfo, node, parms->key, &index)) + { + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + CACHE_RELEASE(parms->child); + RTN_STATUS(zFAILURE); + } + else + { + MYBT_valueCopy(treeInfo, parms->value, MYBT_LEAF_IND_VALUE(treeInfo, node, index)); + } + + CACHE_RELEASE(parms->child); + RTN_STATUS(zOK); +} + +/**************************************************************************** + * Put an delete leaf entry log record into the log + * + * Input: Child 
record is Latched + * + * Output: Child record is Latched + ****************************************************************************/ +STATIC void MYBT_logDeleteRecord ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms, + NINT index) +{ + MYBTreeNode_s *child = (MYBTreeNode_s *)parms->child->pBuf.data; + ZfsXasRecovery_s *logBuffer; + MybtLogicalLog_s *logRecord; + BlockInfo_s *poolBlks; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; +#if LOG_TEST IS_ENABLED + Lsn_t holdLsn; +#endif + + ENTER(TZTREE, MYBT_logDeleteRecord); + MYBT_VALIDATE_NODE(treeInfo, child); + + ZLOG_ObtainRecord(parms->xaction, + ZLOG_BLOCK_INFO_SIZE(1) + sizeof(MybtLogicalLog_s) - 1 + + MYBT_ENTRY_SIZE(treeInfo)); + + ZLOG_INIT_LOG_RECORD( + (treeInfo->logicalRecoveryStartOpCode + + MYBT_RECOVERY_L_OP_DELETE_ENTRY), + parms->xaction, logBuffer, 1, poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO2(poolBlks[0], parms->child->volBlk, child->lsn, + parms->child, parms->xaction, 0, MYBT_compare); + + logRecord->internalVolumeID = parms->btreeBeast->ZFSMYBTREEroot.ROOTinternalID; + logRecord->btreeBeastZid = parms->btreeBeast->ZFSMYBTREEzid; + MYBT_keyCopy(treeInfo, MYBT_LEAFENTRY_KEY(treeInfo, logRecord->u.entry), + MYBT_LEAF_IND_KEY(treeInfo, child, index)); + MYBT_valueCopy(treeInfo, MYBT_LEAFENTRY_VALUE(treeInfo, logRecord->u.entry), + MYBT_LEAF_IND_VALUE(treeInfo, child, index)); +#if LOG_TEST IS_ENABLED + holdLsn = logBuffer->ZXR_Lsn; /* this is done so the lsn will be ok for testing */ +#else + child->lsn = logBuffer->ZXR_Lsn; +#endif + + ZLOG_BIND(parms->xaction, parms->child); + + ZLOG_TEST_REDO(parms->xaction); + ZLOG_ReleaseRecord(parms->xaction); +#if LOG_TEST IS_ENABLED + child->lsn = holdLsn; +#endif + MYBT_VALIDATE_NODE(treeInfo, child); + RTN_VOID(); +} + +/**************************************************************************** + * Delete an entry in a leaf node + * + ****************************************************************************/ +STATIC void 
MYBT_deleteLeafEntry( + MYBTreeInstanceInfo_s *treeInfo, + MYBTreeNode_s *node, + NINT index) +{ + MYBT_VALIDATE_NODE(treeInfo, node); + if (index < (node->numRecs - 1)) + { + memmove(MYBT_LEAFENTRY(treeInfo, node, index), /* destination */ + MYBT_LEAFENTRY(treeInfo, node, index+1), /* source */ + (node->numRecs - (index + 1)) * MYBT_LEAFENTRY_SIZE(treeInfo)); + } + node->numRecs--; + MYBT_VALIDATE_NODE(treeInfo, node); + return; +} + +/**************************************************************************** + * Delete the entry and log the delete + * + * Input: The parent and child are latched. + * + * Output: The parent and child are unlatched. + ****************************************************************************/ +STATUS MYBT_deleteEntry ( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms) +{ + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + NINT i; + MYBTreeNode_s *child; + + ENTER(TZTREE, MYBT_deleteEntry); + + X_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + + if (btreeBeast->zfsBtree.p.btRoot == INVALID_BLK) + { /* no root to the b-tree exists */ + UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + RTN_STATUS(zFAILURE); + } + + if (MYBT_findLeaf(genMsg, parms, MYBT_CHECK_FOR_JOIN) != zOK) + { + RTN_STATUS(zFAILURE); + } + + MYBT_VALIDATE_NODE(parms->treeInfo, (MYBTreeNode_s *)parms->child->pBuf.data); + + if (MYBT_checkForJoin(genMsg, parms) != zOK) + { + //zASSERT(0); + RTN_STATUS(zFAILURE); + } + + child = (MYBTreeNode_s *)parms->child->pBuf.data; + if (!MYBT_IS_ROOT(child)) + { + MYBT_VALIDATE_NODE(parms->treeInfo, (MYBTreeNode_s *)parms->parent->pBuf.data); + CACHE_RELEASE(parms->parent); + } + + if(!(MYBT_findLeafEntry(parms->treeInfo, child, parms->key, &i))) + { /* not found */ + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + goto errorExit; + } + + MYBT_logDeleteRecord(genMsg, parms, i); + MYBT_deleteLeafEntry(parms->treeInfo, child, i); + MYBT_VALIDATE_NODE(parms->treeInfo, (MYBTreeNode_s 
*)parms->child->pBuf.data); + CACHE_DIRTY_RELEASE(parms->child); + RTN_STATUS(zOK); + +errorExit: + //zASSERT(0); + CACHE_DIRTY_RELEASE(parms->child); + RTN_STATUS(zFAILURE); + +} + +STATUS MYBT_browseEntries( + GeneralMsg_s *genMsg, + MYBTreeParms_s *parms, + NINT numEntriesRequested, + BYTE *entries, /* out */ + NINT *numEntriesReturned) /* out */ +{ + STATUS status; + ZfsMYBTreeBeast_s *btreeBeast = parms->btreeBeast; + MYBTreeNode_s *node; + IoMsg_s iomsg; + NINT entryIndex; + NINT index; + Buffer_s *buffer; + MYBTKey_t lastKey = NULL; + MYBTreeInstanceInfo_s *treeInfo = parms->treeInfo; + + ENTER(TZTREE, MYBT_browseEntries); + + status = zOK; + entryIndex = 0; + + if (btreeBeast == NULL) + { + goto done; + } + + S_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + if (btreeBeast->zfsBtree.p.btRoot == INVALID_BLK) + { + UNS_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + goto done; + } + + /* + * Find the next key after *parms->key. + */ + + /* read the root node */ + READBLK_IO_MSG(iomsg, btreeBeast, btreeBeast->zfsBtree.p.btRoot, + CACHE_READ); + parms->child = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + + UNS_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + if (parms->child == NULL) + { + status = zFAILURE; + goto done; + } + node = (MYBTreeNode_s *)parms->child->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + /* descend the tree to the leaf */ + if ((status = MYBT_getLeaf(genMsg, parms)) != zOK) + { + goto done; + } + + node = (MYBTreeNode_s *)parms->child->pBuf.data; + zASSERT(MYBT_IS_LEAF(node)); + if (! MYBT_findLeafEntry(treeInfo, node, parms->key, &index)) + { + index--; + } + ++ index; /* Skip the last matched entry */ + + /* + * Return the next numEntriesReturned entries + * by sequentially scanning the tree's leaf nodes + * + * Sequentially scan the linked list of leaf nodes for beasts to return + * (starting at the beast at (parms->child, index)). 
+ */ + for (; entryIndex < numEntriesRequested; entryIndex++, index++) + { + /* Skip to a leaf node with entries to process */ + if (index >= node->numRecs) + { + /* Jump to the next leaf node */ + if (node->n.leaf.nextLeaf == INVALID_BLK) + { + break; + } + + READBLK_IO_MSG(iomsg, btreeBeast, node->n.leaf.nextLeaf, + CACHE_READ); + buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg); + if (buffer == NULL) + { + status = zFAILURE; + break; + } + CACHE_RELEASE(parms->child); + parms->child = buffer; + node = (MYBTreeNode_s *)parms->child->pBuf.data; + index = 0; + } + + /* Add the entry to the list of entries returned */ + memmove(entries, MYBT_LEAFENTRY(treeInfo, node, index), + MYBT_LEAFENTRY_SIZE(treeInfo)); + lastKey = MYBT_LEAFENTRY_KEY(treeInfo, entries); + entries += MYBT_LEAFENTRY_SIZE(treeInfo); + } + CACHE_RELEASE(parms->child); + +done: + if (entryIndex > 0) + { + MYBT_keyCopy(treeInfo, parms->key, lastKey); + } + *numEntriesReturned = entryIndex; + RTN_STATUS(status); +} + +/**************************************************************************** + * + * This function will remove all entries from the B-tree + * + ****************************************************************************/ +STATUS MYBT_resetTree( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsMYBTreeBeast_s *btreeBeast, + Volume_s *volume) +{ + #define MAX_MYBTREE_DEPTH 20 + + typedef struct ParentStack_s + { + Buffer_s *buffer; + NINT entry; + } ParentStack_s; + MYBTreeNode_s *node; + Buffer_s *buffer; + STATUS status; + ZfsXaction_s *xaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + MybtInternalLog_s *logRecord; + SNINT stkPtr; + BOOL readALeaf = FALSE; + Blknum_t nodeBlockNum; + typedef struct Stack_s { + ParentStack_s parentStack[MAX_MYBTREE_DEPTH]; + IoMsg_s iomsg; + } Stack_s; + STACK_ALLOC(); + + zASSERT(!(volume->VOLenabledAttributes & zATTR_READONLY)); + + /* + * Get an exclusive latch on the B-Tree Beast + */ + 
X_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); + + if (btreeBeast->zfsBtree.p.btRoot == INVALID_BLK) + { /* no root to the b-tree exists */ + status = zOK; + goto exit; + } + + stkPtr = -1; + nodeBlockNum = btreeBeast->zfsBtree.p.btRoot; + for (;;) + { + if ((volume->VOLstate != zVOLSTATE_ACTIVE) || + (volume->v_statusFlag & VOL_SF_LEAVING_ACTIVE_STATE_CLEANUP)) + { + SetErrno(genMsg, zERR_VOLUME_STATE_CHANGE_REQUESTED); + status = zFAILURE; + goto exit; + } + + if (nodeBlockNum != INVALID_BLK) + { + READBLK_IO_MSG(aStack->iomsg, btreeBeast, nodeBlockNum, CACHE_UPDATE); + buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &aStack->iomsg); + if (buffer == NULL) + { + status = zFAILURE; + goto exit; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + if (MYBT_IS_LEAF(node)) + { /* + * If it is a leaf then set up so on the next loop it will be + * deleted. + */ + zASSERT(node->magic == MYBT_LEAF_MAGIC(treeInfo)); + cacheReleaseToss(buffer); + readALeaf = TRUE; + /* if the root block is a leaf then kick out */ + if (stkPtr >= 0) + { + nodeBlockNum = INVALID_BLK; + } + else + { + break; + } + } + else + { /* + * it's a branch -- push on the stack and move on to its first + * child + */ + readALeaf = FALSE; + zASSERT(node->magic == MYBT_BRANCH_MAGIC(treeInfo)); + + stkPtr++; + zASSERT(stkPtr < MAX_MYBTREE_DEPTH); + if (stkPtr >= MAX_MYBTREE_DEPTH) + { + errPrintf(WHERE, Module, -1, + MSG("A MYBTree instance (zid=0x%Lx) too deep. 
Unable to remove it.",545), btreeBeast->ZFSMYBTREEzid); + cacheReleaseToss(buffer); + status = zFAILURE; + goto exit; + } + aStack->parentStack[stkPtr].buffer = buffer; + aStack->parentStack[stkPtr].entry = 0; + if (node->numRecs > 0) + { + nodeBlockNum = MYBT_BRANCHENTRY(treeInfo, node, 0)->child; + } + else + { + nodeBlockNum = INVALID_BLK; + } + } + } + else + { /* working from the stack */ + buffer = aStack->parentStack[stkPtr].buffer; + node = (MYBTreeNode_s *)buffer->pBuf.data; + + /* + * If we have read all of the child records we can now delete + * them. + * + * If one of the children was a leaf they all will be so there + * is no reason to read them all. Just delete them. + */ + aStack->parentStack[stkPtr].entry++; + if ((aStack->parentStack[stkPtr].entry >= node->numRecs) || readALeaf) + { + xaction = BeginXLocal(volume, BXL_DEFAULT); + /* Delete the children */ + + while (node->numRecs > 0) + { + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(2)); + + ZLOG_INIT_LOG_RECORD( + (treeInfo->internalRecoveryStartOpCode + + MYBT_RECOVERY_OP_REMOVE_NODE), + xaction, logBuffer, 2, poolBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], buffer->volBlk, + node->lsn, buffer, xaction, 0); + node->lsn = logBuffer->ZXR_Lsn; + node->numRecs--; + + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[1], + MYBT_BRANCHENTRY(treeInfo, node, node->numRecs)->child, + 0, NULL, xaction, 1); + ZLOG_DELETE_BLOCK(xaction, poolBlks[1]); + + ZLOG_BIND(xaction, buffer); + + ZLOG_ReleaseRecord(xaction); + } + EndXlocal(xaction); + readALeaf = FALSE; + CACHE_DIRTY_RELEASE(buffer); + + /* pop the stack */ + if (stkPtr > 0) + { + stkPtr--; + } + else + { + break; + } + } + else + { + nodeBlockNum = MYBT_BRANCHENTRY(treeInfo, node, aStack->parentStack[stkPtr].entry)->child; + } + } + } + + /* Remove the root node */ + xaction = BeginXLocal(volume, BXL_DEFAULT); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(2)); + + ZLOG_INIT_LOG_RECORD((treeInfo->internalRecoveryStartOpCode + + 
MYBT_RECOVERY_OP_REMOVE_NODE), + xaction, logBuffer, 2, poolBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[0], 0, 0, NULL, xaction, 0); + + ZLOG_ASSIGN_BLOCK_INFO(poolBlks[1], btreeBeast->zfsBtree.p.btRoot, 0, + NULL, xaction, 0); + ZLOG_DELETE_BLOCK(xaction, poolBlks[1]); + + ZLOG_ReleaseRecord(xaction); + + /* + * Fix up the btree beast to show no tree + */ + btreeBeast->zfsBtree.p.btRoot = INVALID_BLK; + + COMN_MARK_BEAST_XLOCAL(&btreeBeast->ZFSMYBTREEroot, &xaction->xaction); + COMN_ForceBeastWrite(genMsg, &btreeBeast->ZFSMYBTREEroot, + &xaction->xaction); + EndXlocal(xaction); + + status = zOK; + +exit: + UNX_LATCH(&btreeBeast->ZFSMYBTREEbeastLatch); +#if NSS_DEBUG IS_ENABLED + MYBT_displayTree(treeInfo, btreeBeast); +#endif + STACK_FREE(); + return status; +} + +/*---------------------------------------------------------------------------*/ +/**************************************************************************** + * + * Recovery routines + * + ****************************************************************************/ + +/**************************************************************************** + * Recovery routine for initing a tree + ****************************************************************************/ +STATIC STATUS MYBT_recoveryInit( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + MybtInternalLog_s *logRecord; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the new root */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + /* init the node */ + node = (MYBTreeNode_s *)buffer->pBuf.data; + 
MYBT_initNode(treeInfo, buffer, MYBT_ROOT|MYBT_LEAF, + &logRecord->u.initTree.internalID); + MYBT_insertZeroNode(treeInfo, node); + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for uniniting a tree (undo for init) + ****************************************************************************/ +STATIC STATUS MYBT_recoveryUninit( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + Buffer_s *buffer; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + /* make sure the cache block for the deleted node is gone */ + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) != NULL) + { +#if LOG_TEST IS_ENABLED + if (LogTest) + ZLOG_SET_LSN(logBuffer, ((MYBTreeNode_s *)buffer->pBuf.data)->lsn, + poolBlks[0], action); +#endif + cacheReleaseToss(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for shrinking a tree + ****************************************************************************/ +STATIC STATUS MYBT_recoveryShrink( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtInternalLog_s *logRecord; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + /* make sure the cache block for the deleted node is gone */ + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = 
MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) != NULL) + { +#if LOG_TEST IS_ENABLED + if (LogTest) + ZLOG_SET_LSN(logBuffer, ((MYBTreeNode_s *)buffer->pBuf.data)->lsn, + poolBlks[0], action); +#endif +// VALIDATE_NODE((BeastTreeNode_s *)buffer->pBuf.data); + cacheReleaseToss(buffer); + } + } + + /* update the new root */ + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the root */ + node->state |= MYBT_ROOT; /* set the root state */ + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for growing a tree + ****************************************************************************/ +STATIC STATUS MYBT_recoveryGrow( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtInternalLog_s *logRecord; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the new root */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + + /* fix up the root */ + MYBT_initNode(treeInfo, buffer, MYBT_ROOT, &logRecord->u.grow.internalID); + node->numRecs = 1; + 
treeInfo->setZeroKey(MYBT_BRANCH_IND_KEY(treeInfo, node, 0)); + MYBT_BRANCHENTRY(treeInfo, node, 0)->child = logRecord->u.grow.blockForParent; + MYBT_BRANCHENTRY(treeInfo, node, 0)->reserved = 0; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + CACHE_DIRTY_RELEASE(buffer); + } + + /* update the child */ + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the child */ + node->state &= ~(MYBT_ROOT); /* reset the root state */ + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for balancing leafs + ****************************************************************************/ +STATIC STATUS MYBT_recoveryBalance( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtInternalLog_s *logRecord; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + NINT lenMoved; + NINT numToMove; + NINT block; + BYTE *keyForParent, *oldKey; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the parent */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, 
node); + zASSERT(node->magic == MYBT_BRANCH_MAGIC(treeInfo)); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the parent */ + oldKey = &logRecord->u.balance.data[logRecord->u.balance.dataLength]; + keyForParent = oldKey + MYBT_KEY_SIZE(treeInfo); + if (action == X_REDO) + { + MYBT_keyCopy(treeInfo, MYBT_BRANCHENTRY(treeInfo, node, logRecord->u.balance.indexForParent)->key, + keyForParent); + } + else + { + MYBT_keyCopy(treeInfo, MYBT_BRANCHENTRY(treeInfo, node, logRecord->u.balance.indexForParent)->key, + oldKey); + } + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* update the source */ + block = (action == X_REDO) ? 1 : 2; + if (ZLOG_VALID_BLOCK(poolBlks[block])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[block].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the source node */ + if (MYBT_IS_LEAF(node)) + { /* leaf */ + numToMove = logRecord->u.balance.numToMove; + if ((numToMove > 0) && + treeInfo->keyComp( + MYBT_LEAFENTRY_KEY(treeInfo, &logRecord->u.balance.data[0]), + MYBT_LEAF_IND_KEY(treeInfo, node, 0)) == 0) + { /* if we are removing from the start of the leaf record ... 
*/ + memmove(MYBT_LEAFENTRY(treeInfo, node, 0), /* Destination */ + MYBT_LEAFENTRY(treeInfo, node, numToMove), + (node->numRecs - numToMove) * MYBT_LEAFENTRY_SIZE(treeInfo)); + } + } + else + { /* branch */ + numToMove = logRecord->u.balance.numToMove; + if ((numToMove > 0) && + treeInfo->keyComp( + &((MYBTBranchEntry_s *)&logRecord->u.balance.data[0])->key, + MYBT_BRANCHENTRY(treeInfo, node, 0)->key) == 0) + { /* if we are removing from the start of the branch record ... */ + memmove(MYBT_BRANCHENTRY(treeInfo, node, 0), /* Destination */ + MYBT_BRANCHENTRY(treeInfo, node, numToMove), + (node->numRecs - numToMove) * MYBT_BRANCHENTRY_SIZE(treeInfo)); + } + } + node->numRecs -= numToMove; + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[block], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* update the destination */ + block = (action == X_REDO) ? 2 : 1; + if (ZLOG_VALID_BLOCK(poolBlks[block])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[block].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the destination node */ + if (MYBT_IS_LEAF(node)) + { /* leaf */ + numToMove = logRecord->u.balance.numToMove; + lenMoved = logRecord->u.balance.dataLength; + if (numToMove > 0) + { + if (treeInfo->keyComp( + MYBT_LEAFENTRY_KEY(treeInfo, &logRecord->u.balance.data[0]), + MYBT_LEAF_IND_KEY(treeInfo, node, 0)) < 0) + { /* if we are inserting at the start of the leaf record ... 
*/ + + /* make room for the insert */ + memmove(MYBT_LEAFENTRY(treeInfo, node, numToMove), /* Destination */ + MYBT_LEAFENTRY(treeInfo, node, 0), /* Source */ + node->numRecs * MYBT_LEAFENTRY_SIZE(treeInfo) ); + + /* insert the info from the log record */ + memcpy(MYBT_LEAFENTRY(treeInfo, node, 0), + &logRecord->u.balance.data[0], + lenMoved); + } + else + { + /* insert the info from the log record */ + memcpy(MYBT_LEAFENTRY(treeInfo, node, node->numRecs), + &logRecord->u.balance.data[0], + lenMoved); + } + } + } + else + { /* branch */ + numToMove = logRecord->u.balance.numToMove; + lenMoved = logRecord->u.balance.dataLength; + if (numToMove > 0) + { + if (treeInfo->keyComp( + &((MYBTBranchEntry_s *)&logRecord->u.balance.data[0])->key, + MYBT_BRANCHENTRY(treeInfo, node, 0)->key) < 0) + { /* if we are inserting at the start of the branch record ... */ + + /* make room for the insert */ + memmove(MYBT_BRANCHENTRY(treeInfo, node, numToMove), /* Destination */ + MYBT_BRANCHENTRY(treeInfo, node, 0), /* Source */ + node->numRecs * MYBT_BRANCHENTRY_SIZE(treeInfo) ); + + /* insert the info from the log record */ + memcpy(MYBT_BRANCHENTRY(treeInfo, node, 0), + &logRecord->u.balance.data[0], + lenMoved); + } + else + { + /* insert the info from the log record */ + memcpy(MYBT_BRANCHENTRY(treeInfo, node, node->numRecs), + &logRecord->u.balance.data[0], + lenMoved); + } + } + } + node->numRecs += numToMove; + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[block], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for joining leafs + ****************************************************************************/ +STATIC STATUS MYBT_recoveryJoin( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtInternalLog_s *logRecord; + Buffer_s *buffer; + 
MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + NINT i; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the parent */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + zASSERT(node->magic == MYBT_BRANCH_MAGIC(treeInfo)); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the parent */ + --(node->numRecs); + i = logRecord->u.split.indexForParent; + memmove( MYBT_BRANCHENTRY(treeInfo, node, i), /* Destination */ + MYBT_BRANCHENTRY(treeInfo, node, i + 1), /* Source */ + (node->numRecs - i) * MYBT_BRANCHENTRY_SIZE(treeInfo)); + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* update the child */ + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the child */ + if (MYBT_IS_LEAF(node)) + { /* leaf */ + memcpy(MYBT_LEAFENTRY(treeInfo, node, node->numRecs), + &logRecord->u.split.data[0], + logRecord->u.split.dataLength); + node->n.leaf.nextLeaf = logRecord->u.split.leafLink; + } + else + { /* branch */ + memcpy(MYBT_BRANCHENTRY(treeInfo, node, node->numRecs), + &logRecord->u.split.data[0], + logRecord->u.split.dataLength); + } + node->numRecs += 
logRecord->u.split.numToMove; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + if (ZLOG_VALID_BLOCK(poolBlks[2])) + { + /* make sure the cache block for the deleted node is gone */ + READBLK_IO_MSG(iomsg, pool, poolBlks[2].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) != NULL) + { + node = (MYBTreeNode_s *)buffer->pBuf.data; + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { /* Because we have deleted this block just toss it */ + cacheReleaseToss(buffer); + } + else + { /* fix up the sibling */ + if (MYBT_IS_LEAF(node)) + { /* leaf */ + node->magic |= 0x20; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[2], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + else + { +#if LOG_TEST IS_ENABLED + if (LogTest) + { + ZLOG_SET_LSN(logBuffer, ((MYBTreeNode_s *)buffer->pBuf.data)->lsn, + poolBlks[2], action); + } +#endif + cacheReleaseToss(buffer); + } + } + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for splitting leafs + ****************************************************************************/ +STATIC STATUS MYBT_recoverySplit( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtInternalLog_s *logRecord; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + NINT i; + BYTE *keyForParent; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* update the parent */ + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s 
*)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + zASSERT(node->magic == MYBT_BRANCH_MAGIC(treeInfo)); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { /* fix up the parent */ + i = logRecord->u.split.indexForParent; + memmove(MYBT_BRANCHENTRY(treeInfo, node, i + 1), /* Destination */ + MYBT_BRANCHENTRY(treeInfo, node, i), /* Source */ + (node->numRecs - i) * MYBT_BRANCHENTRY_SIZE(treeInfo)); + ++(node->numRecs); + keyForParent = + &logRecord->u.split.data[logRecord->u.split.dataLength]; + MYBT_keyCopy(treeInfo, MYBT_BRANCHENTRY(treeInfo, node, i)->key, keyForParent); + MYBT_BRANCHENTRY(treeInfo, node, i)->child = logRecord->u.split.blockForParent; + MYBT_BRANCHENTRY(treeInfo, node, i)->reserved = 0; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* remove from the old node (on the left)*/ + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + } + else + { + if (MYBT_IS_LEAF(node)) + { + node->n.leaf.nextLeaf = logRecord->u.split.blockForParent; + } + node->numRecs -= logRecord->u.split.numToMove; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[1], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /* add to the new node */ + if (ZLOG_VALID_BLOCK(poolBlks[2])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[2].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + 
+ /* fix up the child */ + MYBT_initNode(treeInfo, buffer, logRecord->u.split.nodeType, + &logRecord->u.split.internalID); + if (MYBT_IS_LEAF(node)) + { /* leaf */ + memcpy(MYBT_LEAFENTRY(treeInfo, node, 0), + &logRecord->u.split.data[0], + logRecord->u.split.dataLength); + node->n.leaf.nextLeaf = logRecord->u.split.leafLink; + } + else + { /* branch */ + memcpy(MYBT_BRANCHENTRY(treeInfo, node, 0), + &logRecord->u.split.data[0], + logRecord->u.split.dataLength); + } + node->numRecs = logRecord->u.split.numToMove; + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[2], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for removing a B-tree node + ****************************************************************************/ +STATUS MYBT_recoveryRemoveNode( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtInternalLog_s *logRecord; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (ZLOG_VALID_BLOCK(poolBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + zASSERT(node->magic == MYBT_BRANCH_MAGIC(treeInfo)); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + return zOK; + } + + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (action == X_REDO) + { + node->numRecs--; + } + else + { + node->numRecs++; + } + + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + } + + if (ZLOG_VALID_BLOCK(poolBlks[1])) + { + /* 
make sure the cache block for the deleted node is gone */ + READBLK_IO_MSG(iomsg, pool, poolBlks[1].blkNum, CACHE_WRITE); + if ((buffer = cacheLookup(&iomsg.beast->ROOTmycache, iomsg.volBlk, + CACHE_WRITE)) != NULL) + { + cacheReleaseToss(buffer); + } + } + return zOK; +} + +/**************************************************************************** + * Recovery routine for inserting a key into the tree + ****************************************************************************/ +STATIC STATUS MYBT_recoveryInsertEntry( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsMYBTreeBeast_s *btreeBeastignore, /* A NULL pointer! DO NOT USE! */ + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MYBTreeParms_s parms; + MybtLogicalLog_s *logRecord; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (!ZLOG_VALID_BLOCK(poolBlks[0])) + { + return zOK; + } + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + zASSERT(node->magic == MYBT_LEAF_MAGIC(treeInfo)); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + return zOK; + } + + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + parms.child = buffer; + parms.treeInfo = treeInfo; + parms.key = MYBT_LEAFENTRY_KEY(&parms, &logRecord->u.entry[0]); + parms.updateCallback = NULL; + + if (MYBT_insertEntryIntoNode(genMsg, &parms, + MYBT_LEAFENTRY_VALUE(treeInfo, logRecord->u.entry)) != zOK) + { + CACHE_RELEASE(buffer); + return zFAILURE; + } + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); \ + return zOK; +} + 
+/**************************************************************************** + * Recovery routine for deleting an entry from the tree + ****************************************************************************/ +STATIC STATUS MYBT_recoveryDeleteEntry( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsMYBTreeBeast_s *btreeBeastignore, /* A NULL pointer! DO NOT USE! */ + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + MybtLogicalLog_s *logRecord; + NINT i; + Buffer_s *buffer; + MYBTreeNode_s *node; + IoMsg_s iomsg; + BlockInfo_s *poolBlks; + + poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (!ZLOG_VALID_BLOCK(poolBlks[0])) + { + return zOK; + } + READBLK_IO_MSG(iomsg, pool, poolBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = MYBT_ReadPoolBlk(treeInfo, genMsg, &iomsg)) == NULL) + { + //zASSERT(0); + return zFAILURE; + } + + node = (MYBTreeNode_s *)buffer->pBuf.data; + MYBT_VALIDATE_NODE(treeInfo, node); + zASSERT(node->magic == MYBT_LEAF_MAGIC(treeInfo)); + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->lsn, action)) + { + CACHE_RELEASE(buffer); + return zOK; + } + + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (MYBT_findLeafEntry(treeInfo, node, MYBT_LEAFENTRY_KEY(treeInfo, logRecord->u.entry), &i)) + { + MYBT_deleteLeafEntry(treeInfo, node, i); + ZLOG_SET_LSN(logBuffer, node->lsn, poolBlks[0], action); + MYBT_VALIDATE_NODE(treeInfo, node); + CACHE_DIRTY_RELEASE(buffer); + return zOK; + } + else + { /* Entry not found in the leaf */ + CACHE_RELEASE(buffer); + SetErrno(genMsg, zERR_ZID_NOT_FOUND); + return zFAILURE; + } +} + +/**************************************************************************** + * Recovery routine for inserting a pinfo into a beast + ****************************************************************************/ +STATUS MYBT_recoveryInsertEntryLogical( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsMYBTreeBeast_s *btreeBeast, + 
ZfsXasRecovery_s *logBuffer, + NINT action) +{ + STATUS status; + MybtLogicalLog_s *logRecord; + MYBTreeParms_s parms; + + ASSERT_MPKNSS_LOCK(); + + if (action == X_UNDO) + { + return zX_LOGICAL; + } + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + zASSERT(btreeBeast != NULL); + parms.treeInfo = treeInfo; + parms.volume = btreeBeast->ZFSMYBTREEvolume; + parms.btreeBeast = btreeBeast; + parms.key = MYBT_LEAFENTRY_KEY(treeInfo, logRecord->u.entry); + parms.value = MYBT_LEAFENTRY_VALUE(treeInfo, logRecord->u.entry); + parms.updateCallback = NULL; + + parms.xaction = BeginXLocal(btreeBeast->ZFSMYBTREEvolume, BXL_LOGICAL_UNDO); + status = MYBT_insertEntry(genMsg, &parms); + + SET_LUNDO_LSN(parms.xaction, logBuffer, status); + EndXlocal(parms.xaction); + return status; +} + +/**************************************************************************** + * Recovery routine for deleting a pinfo from a beast + ****************************************************************************/ +STATUS MYBT_recoveryDeleteEntryLogical( + GeneralMsg_s *genMsg, + MYBTreeInstanceInfo_s *treeInfo, + ZfsPool_s *pool, + ZfsMYBTreeBeast_s *btreeBeast, + ZfsXasRecovery_s *logBuffer, + NINT action) +{ + STATUS status; + MybtLogicalLog_s *logRecord; + MYBTreeParms_s parms; + + ASSERT_MPKNSS_LOCK(); + + if (action == X_UNDO) + { + return zX_LOGICAL; + } + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + zASSERT(btreeBeast != NULL); + parms.treeInfo = treeInfo; + parms.volume = btreeBeast->ZFSMYBTREEvolume; + parms.btreeBeast = btreeBeast; + parms.key = MYBT_LEAFENTRY_KEY(treeInfo, logRecord->u.entry); + parms.value = MYBT_LEAFENTRY_VALUE(treeInfo, logRecord->u.entry); + parms.updateCallback = NULL; + + parms.xaction = BeginXLocal(btreeBeast->ZFSMYBTREEvolume, BXL_LOGICAL_UNDO); + status = MYBT_deleteEntry(genMsg, &parms); + + SET_LUNDO_LSN(parms.xaction, logBuffer, status); + EndXlocal(parms.xaction); + return status; +} + +MYBTInternalRecovery_f MYBTreeInternalRecoveryFuncTable[2 * 
MYBT_RECOVERY_MAX_INTERNAL_OPS] = +{ + MYBT_recoverySplit, MYBT_recoveryJoin, /* 0 */ + MYBT_recoveryJoin, MYBT_recoverySplit, /* 1 */ + MYBT_recoveryBalance, MYBT_recoveryBalance, /* 2 */ + MYBT_recoveryGrow, MYBT_recoveryShrink, /* 3 */ + MYBT_recoveryShrink, MYBT_recoveryGrow, /* 4 */ + MYBT_recoveryInit, MYBT_recoveryUninit, /* 5 */ + MYBT_recoveryRemoveNode, MYBT_recoveryRemoveNode, /* 6 */ +}; + +MYBTLogicalRecovery_f MYBTreeLogicalRecoveryFuncTable[2 * MYBT_RECOVERY_MAX_LOGICAL_OPS] = +{ + MYBT_recoveryInsertEntry, MYBT_recoveryDeleteEntryLogical, /* 0 */ + MYBT_recoveryDeleteEntry, MYBT_recoveryInsertEntryLogical, /* 1 */ + 0, 0, /* 2 */ +}; + +/**************************************************************************** + * EXTENT BASED storage pack routine + *****************************************************************************/ +STATIC NINT MYBT_PackedSize( + void *beast_LX) +{ + return 0; +} + +/**************************************************************************** + * EXTENT BASED storage pack routine + *****************************************************************************/ +STATIC BYTE *MYBT_Pack( + void *beast_LX, + BYTE *storeBuffer) +{ + return (storeBuffer); +} + +/**************************************************************************** + * EXTENT BASED storage unpack routine + *****************************************************************************/ +STATIC BYTE *MYBT_Unpack( + GeneralMsg_s *genMsg, + void *beast_LX, + BYTE *storeBuffer) +{ + return (storeBuffer); +} + +/*--------------------------------------------------------------------------- + * file beast STORAGE ops definition + *---------------------------------------------------------------------------*/ +LSSSpecificPackUnpackOps_s ZFSMYBTREE_lssOps[] = +{ + {zLSS_ID_ZLSS,MYBT_PackedSize,MYBT_Pack,NULL,MYBT_Unpack}, + {zLSS_ID_INVALID} +}; diff --git a/src/nwnss/zlss/xaction.c b/src/nwnss/zlss/xaction.c new file mode 100644 index 0000000..69fa788 --- /dev/null
+++ b/src/nwnss/zlss/xaction.c @@ -0,0 +1,3293 @@ +/**************************************************************************** + | + | (C) Copyright 1995-1998 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. + | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. + | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |*************************************************************************** + | + | NetWare Advance File Services (NSS) Initialization module + | + |--------------------------------------------------------------------------- + | + | $Author: vandana $ + | $Date: 2007-04-18 23:26:20 +0530 (Wed, 18 Apr 2007) $ + | + | $RCSfile$ + | $Revision: 1954 $ + | + |--------------------------------------------------------------------------- + | This module is used to: + | Routines for allocating and managing structures for transaction + | requests. + +-------------------------------------------------------------------------*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "comnPublics.h" +#include "zParams.h" +#include "control.h" +#include "xaction.h" +#include "zlssStartup.h" +#include "zfs.h" +#include "zlog.h" +#include "zfsXTree.h" +#include "purgeLog.h" +#include "comnPublics.h" +#include "zlssLogicalVolume.h" + + /* + * Initialization for the Transaction Recovery Table. 
Each developer + * needs to place their functions they need for recovery in this table. + */ +STATUS XRedoDummy ( + struct GeneralMsg_s *genMsg, + struct ZfsPool_s *pool, + struct ZfsXasRecovery_s *logRecord, + NINT pass) +{ + zASSERT("Missing Redo Handler" == NULL); + return zOK; +} + +STATUS XUndoDummy ( + struct GeneralMsg_s *genMsg, + struct ZfsPool_s *pool, + struct ZfsXasRecovery_s *logRecord, + NINT pass) +{ + zASSERT("Missing Undo Handler" == NULL); + return zOK; +} + +STATUS XRedoIgnore ( + struct GeneralMsg_s *genMsg, + struct ZfsPool_s *pool, + struct ZfsXasRecovery_s *logRecord, + NINT pass) +{ + return zOK; +} + +STATUS XUndoIgnore ( + struct GeneralMsg_s *genMsg, + struct ZfsPool_s *pool, + struct ZfsXasRecovery_s *logRecord, + NINT pass) +{ + return zOK; +} + +STATUS XRedoCompensation ( + struct GeneralMsg_s *genMsg, + struct ZfsPool_s *pool, + struct ZfsXasRecovery_s *logRecord, + NINT pass) +{ + Lsn_t *lUndoLSN; + + lUndoLSN = ZLOG_START_OF_LOG_RECORD(logRecord); + + zASSERT(pool->zfsLogBeast->ZLB_RecoveryCompensationPointerLsn < *lUndoLSN); + if (logRecord->ZXR_TransactionState & XAS_XR_TS_END) + { + pool->zfsLogBeast->ZLB_RecoveryCompensationPointerLsn = *lUndoLSN; + } + + return zOK; +} + +RedoUndo_s XRecoveryTable[XFUNC_MAX] = +{ + { XRedoDummy, XUndoDummy }, + /* + * If we just want to ignore it. (End Transaction). In debug + * mode (unss.nlm) the checkpoint code also uses it to log + * checkpoints into the ZLOG file. 
+ */ + { XRedoIgnore, XUndoIgnore }, /* 1 */ + /* + * Beast B-tree recovery functions + */ + { recoveryInsertZnode, recoveryDeleteZnode }, /* 2 */ + { recoveryDeleteZnode, recoveryInsertZnode }, /* 3 */ + { recoverySplit, recoveryJoin }, /* 4 */ + { recoveryJoin, recoverySplit }, /* 5 */ + { recoveryBalance, recoveryBalance }, /* 6 */ + { recoveryInsertOverflow, recoveryRemoveOverflow}, /* 7 */ + { recoveryGrow, recoveryShrink }, /* 8 */ + { recoveryShrink, recoveryGrow }, /* 9 */ + { recoveryInit, recoveryUninit }, /* 10 */ + /* + * Purge Log + */ + { XRedoDummy, XUndoDummy }, /* Not being used */ /* 11 */ + { recoveryPurgeLogGrow, recoveryPurgeLogShrink }, /* 12 */ + { recoveryPurgeLogInsert, recoveryPurgeLogRemove }, /* 13 */ + { recoveryPurgeLogRemove, recoveryPurgeLogInsert }, /* 14 */ + /* + * File Map recovery functions + */ + { redoInitRoot, undoInitRoot }, /* 15 */ + { redoFmapInsert, undoFmapInsert }, /* 16 */ + { redoFmapInsertSparse, undoFmapInsertSparse }, /* 17 */ + { redoFmapGrow, undoFmapGrow }, /* 18 */ + { redoFmapSplit, undoFmapSplit }, /* 19 */ + { redoFmapRemove, undoFmapRemove }, /* 20 */ + { redoFmapJoin, undoFmapJoin }, /* 21 */ + { redoFmapToss, undoFmapToss }, /* 22 */ + /* + * Name log + */ +#ifndef NSSOLD_NAMING + { XRedoDummy, XUndoDummy }, + { XRedoDummy, XUndoDummy }, + { XRedoDummy, XUndoDummy }, + { XRedoDummy, XUndoDummy }, +#else + { redoNameInit, undoNameInit }, /* 23 */ + { addNameToDir, removeNameFromDir }, /* 24 */ + { removeNameFromDir, addNameToDir }, /* 25 */ + { redoModifyNSMask, undoModifyNSMask }, /* 26 */ +#endif + /* + * Free tree + */ + { redoFXshrink, undoFXshrink }, /* 27 */ + { redoFXdelete, undoFXdelete }, /* 28 */ + { redoFXbalance, undoFXbalance }, /* 29 */ + { redoFXjoin, undoFXjoin }, /* 30 */ + { redoFXgrow, undoFXgrow }, /* 31 */ + { redoFXsplit, undoFXsplit }, /* 32 */ + { redoFXinsert, undoFXinsert }, /* 33 */ + /* + * Volume info + */ + { recoveryVolumeInfo, recoveryVolumeInfo }, /* 34 */ +#ifndef 
NSSOLD_NAMING + { dlog_recovery_common, dlog_recovery_common }, /* 35 */ +#else + { XRedoDummy, XUndoDummy }, +#endif + /* + * Old Purge B-tree recovery functions + */ + { pbt_recoveryInsertEntry, pbt_recoveryDeleteEntry }, /* 36 */ + { pbt_recoveryDeleteEntry, pbt_recoveryInsertEntry }, /* 37 */ + { pbt_recoverySplit, pbt_recoveryJoin }, /* 38 */ + { pbt_recoveryJoin, pbt_recoverySplit }, /* 39 */ + { pbt_recoveryBalance, pbt_recoveryBalance }, /* 40 */ + { pbt_recoveryInsertOverflow, pbt_recoveryRemoveOverflow}, /* 41 */ + { pbt_recoveryGrow, pbt_recoveryShrink }, /* 42 */ + { pbt_recoveryShrink, pbt_recoveryGrow }, /* 43 */ + { pbt_recoveryInit, pbt_recoveryUninit }, /* 44 */ + /* + * user B-tree recovery functions + */ + { UBT_recoveryModifyEntry, UBT_recoveryModifyEntry }, /* 45 */ + { UBT_recoveryInsertEntry, UBT_recoveryDeleteEntry }, /* 46 */ + { UBT_recoveryDeleteEntry, UBT_recoveryInsertEntry }, /* 47 */ + { UBT_recoverySplit, UBT_recoveryJoin }, /* 48 */ + { UBT_recoveryJoin, UBT_recoverySplit }, /* 49 */ + { UBT_recoveryBalance, UBT_recoveryBalance }, /* 50 */ + { UBT_recoveryGrow, UBT_recoveryShrink }, /* 51 */ + { UBT_recoveryShrink, UBT_recoveryGrow }, /* 52 */ + { UBT_recoveryInit, UBT_recoveryUninit }, /* 53 */ + { UBT_recoveryRemove, UBT_recoveryRemove }, /* 54 */ + /* + * LV delete info + */ + { ZLSSVOL_LVD_RecoveryBTShrinkRoot, ZLSSVOL_LVD_RecoveryBTShrinkRoot }, /* 55 */ + { ZLSSVOL_LVD_RecoveryBTShrink, ZLSSVOL_LVD_RecoveryBTShrink }, /* 56 */ + { ZLSSVOL_LVD_RecoveryNTShrink, ZLSSVOL_LVD_RecoveryNTShrink }, /* 57 */ + { ZLSSVOL_LVD_RecoveryNTShrink, ZLSSVOL_LVD_RecoveryNTShrink }, /* 58 */ + { ZLSSVOL_LVD_RecoveryNTShrink, ZLSSVOL_LVD_RecoveryNTShrink }, /* 59 */ + /* + * LV Create info + */ + { recoveryInit, recoveryUninit }, /* 60 */ + { recoveryPurgeLogInit, recoveryPurgeLogUninit }, /* 61 */ + { recoveryVDBInit, recoveryVDBUninit }, /* 62 */ + { recoveryLVDBInit, recoveryLVDBUninit }, /* 63 */ + /* + * Pool info + */ + { 
ZLSSPOOL_RecoveryPoolInfo, ZLSSPOOL_RecoveryPoolInfo }, /* 64 */ + /* + * Logical Undo routines + */ + { recoveryInsertZnode, recoveryDeleteZnodeLogical }, /* 65 */ + { recoveryDeleteZnode, recoveryInsertZnodeLogical }, /* 66 */ + { redoFXinsert, undoFXinsertLogical }, /* 67 */ + { redoFXdelete, undoFXdeleteLogical }, /* 68 */ + { pbt_recoveryInsertEntry, pbt_recoveryDeleteEntryLogical }, /* 69 */ + { pbt_recoveryDeleteEntry, pbt_recoveryInsertEntryLogical }, /* 70 */ + { UBT_recoveryModifyEntry, UBT_recoveryModifyEntryLogical }, /* 71 */ + { UBT_recoveryInsertEntry, UBT_recoveryDeleteEntryLogical }, /* 72 */ + { UBT_recoveryDeleteEntry, UBT_recoveryInsertEntryLogical }, /* 73 */ + /* + * Name tree physical redo/undo routines + */ + { dlog_recovery_common, dlog_recovery_common }, /* 74 */ + /* + * Superblock Header + */ + { ZLSSPOOL_RecoverySuperblockHeader, ZLSSPOOL_RecoverySuperblockHeader }, /* 75 */ + + /* + * + */ + { recoveryPDBInit, recoveryPDBUninit }, /* 76 */ + { recoveryLPDBInit, recoveryLPDBUninit }, /* 77 */ + { recoveryVDBUpdate, recoveryVDBUpdate }, /* 78 */ + /* + * directory quota B-tree recovery functions + */ + { DBT_recoveryModifyEntry, DBT_recoveryModifyEntry }, /* 79 */ + { DBT_recoveryInsertEntry, DBT_recoveryDeleteEntry }, /* 80 */ + { DBT_recoveryDeleteEntry, DBT_recoveryInsertEntry }, /* 81 */ + { DBT_recoverySplit, DBT_recoveryJoin }, /* 82 */ + { DBT_recoveryJoin, DBT_recoverySplit }, /* 83 */ + { DBT_recoveryBalance, DBT_recoveryBalance }, /* 84 */ + { DBT_recoveryGrow, DBT_recoveryShrink }, /* 85 */ + { DBT_recoveryShrink, DBT_recoveryGrow }, /* 86 */ + { DBT_recoveryInit, DBT_recoveryUninit }, /* 87 */ + { DBT_recoveryRemove, DBT_recoveryRemove }, /* 88 */ + { DBT_recoveryModifyEntry, DBT_recoveryModifyEntryLogical }, /* 89 */ + { DBT_recoveryInsertEntry, DBT_recoveryDeleteEntryLogical }, /* 90 */ + { DBT_recoveryDeleteEntry, DBT_recoveryInsertEntryLogical }, /* 91 */ + + /* + * New Purge B-tree recovery functions + * Comment: logical 
here means interface operations, not logical undo. + * the routine itself decides whether to apply logical/physical undo/redo. + */ + { pbt_logicalRecoveryCommon, pbt_logicalRecoveryCommon }, /* 92 */ + { pbt_logicalRecoveryCommon, pbt_logicalRecoveryCommon }, /* 93 */ + { pbt_logicalRecoveryCommon, pbt_logicalRecoveryCommon }, /* 94 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 95 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 96 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 97 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 98 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 99 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 100 */ + { pbt_internalRecoveryCommon, pbt_internalRecoveryCommon }, /* 101 */ + + /* + * MFL recovery functions + * Comment: logical here means interface operations, not logical undo. + * the routine itself decides whether to apply logical/physical undo/redo. 
+ */ + { MFL_logicalRecoveryCommon, MFL_logicalRecoveryCommon }, /* 102 */ + { MFL_logicalRecoveryCommon, MFL_logicalRecoveryCommon }, /* 103 */ + { MFL_logicalRecoveryCommon, MFL_logicalRecoveryCommon }, /* 104 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 105 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 106 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 107 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 108 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 109 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 110 */ + { MFL_internalRecoveryCommon, MFL_internalRecoveryCommon }, /* 111 */ + + /* Name Tree Logical routines */ + { dlog_logical_recovery_common, dlog_logical_recovery_common }, /* 112 */ + + /* End Xlocal logging LSN for compensation */ + { XRedoCompensation, XUndoDummy }, /* 113 */ + + /* User tree object name operations */ + { UBT_recoveryModifyName, UBT_recoveryModifyNameLogical }, /* 114 */ + { UBT_recoveryAddNameRecord, UBT_recoveryDeleteNameRecord }, /* 115 */ + { UBT_recoveryAddName, UBT_recoveryDeleteNameLogical }, /* 116 */ + { UBT_recoveryRemoveNameRecord, UBT_recoveryInsertNameRecord }, /* 117 */ + { UBT_recoveryDeleteNameRecord, UBT_recoveryAddNameRecord }, /* 118 */ + { UBT_recoveryModifyNameEntry, UBT_recoveryModifyNameEntryLogical },/* 119 */ + { UBT_recoveryDeleteAll, UBT_recoveryAddAll }, /* 120 */ + { UBT_recoveryDeleteName, UBT_recoveryAddOnlyName }, /* 121 */ + + /* Event File List operations */ + { EFL_recoverySplit, EFL_recoveryJoin }, /* 122 */ + { EFL_recoveryJoin, EFL_recoverySplit }, /* 123 */ + { EFL_recoveryBalance, EFL_recoveryBalance }, /* 124 */ + { EFL_recoveryGrow, EFL_recoveryShrink }, /* 125 */ + { EFL_recoveryShrink, EFL_recoveryGrow }, /* 126 */ + { EFL_recoveryInit, EFL_recoveryUninit }, /* 127 */ + { EFL_recoveryInsertEntry, EFL_recoveryDeleteEntryLogical }, /* 128 */ + { 
EFL_recoveryDeleteEntry, EFL_recoveryInsertEntryLogical }, /* 129 */ + { EFL_recoveryModifyEntry, EFL_recoveryModifyEntryLogical }, /* 130 */ + { EFL_recoveryRemove, EFL_recoveryRemove }, /* 131 */ + { EFL_recoveryRemoveLog, EFL_recoveryRemoveLog }, /* 132 */ + { ELOG_recoveryAddNode, ELOG_recoveryAddNodeUndo }, /* 133 */ + { ELOG_recoveryAddEntry, ELOG_recoveryRemoveEntryLogical }, /* 134 */ + { ELOG_recoveryAddEntryName, ELOG_recoveryRemoveEntryNameLogical }, /* 135 */ + { ELOG_recoveryRemoveNode, ELOG_recoveryRemoveNodeUndo }, /* 136 */ + { ELOG_recoveryModifyEpoch, ELOG_recoveryModifyEpoch }, /* 137 */ + +}; + +/* + * Set up the system pool of transaction control structures + */ + +ControlStore_s XactionControl; +ControlStore_s FreeUserDataBlksControl; +ObjCache_s PLogFreeEntry; +LONG XidUnique = 0; + +#if NSS_DEBUG IS_ENABLED +DQhead_t Xinuse; +Histogram_s XcommitHistogram = { 0 }; +Histogram_s XendHistogram = { 0 }; +NINT XdeleteCnt = 0; + /* July 30, 1998(Greg) Made 20 verses 10 because we hit + * to often because of slow Name Tree when many duplicate + * deleted file names. 
 */
/* NOTE(review): the value used here is SEC2TICK(100), while the note above
 * still talks about 20 versus 10 -- confirm which one is intended.
 */
NINT MaxXactionTicks = SEC2TICK(100);
#endif

#if FREE_DATA_STATS IS_ENABLED
FreeUserDataBlksStats_s UserFreeStats = {0};
#endif

/* Prototypes for routines that are called further down in this file. */
void XqueueDeferredPoolBlks (
	ZfsXaction_s *xaction);

void XisHomedSignalHandler(Agent_s *agent);

/**************************************************************************
 * XACT_Init is the per-structure initializer handed to CONTROL_Startup
 * by XACT_Startup.  It initializes the agent embedded in the transaction
 * and registers XisHomedSignalHandler with it.
 **************************************************************************/
void XACT_Init (ZfsXaction_s *xaction)
{
	initAgent(&xaction->xaction.agent, XisHomedSignalHandler, MSGNot("Xagent"));
}

/**************************************************************************
 * XACT_Startup brings up the three pools used by the transaction layer:
 *	XactionControl          - ZfsXaction_s structures (count taken from
 *	                          ZstoreConfig.zfs.numXactions)
 *	FreeUserDataBlksControl - FreeUserDataBlks_s structures
 *	PLogFreeEntry           - object cache of FreePurgeLogEntries_s
 * Returns: zOK when all three were started, zFAILURE otherwise.  On
 *	failure everything that had already been started is shut down
 *	again before returning.
 **************************************************************************/
STATUS XACT_Startup (void)
{
	ENTER(TXACTION, XACT_Startup);

	if (CONTROL_Startup(&XactionControl, ZstoreConfig.zfs.numXactions,
			sizeof(ZfsXaction_s), XACT_Init) != zOK)
	{
		RTN_STATUS(zFAILURE);
	}

	/* NOTE(review): 256 is presumably the number of structures -- confirm */
	if (CONTROL_Startup(&FreeUserDataBlksControl, 256,
			sizeof(FreeUserDataBlks_s), NULL) != zOK)
	{
		/* undo the first startup */
		CONTROL_Shutdown( &XactionControl);
		RTN_STATUS(zFAILURE);
	}

	if (objCacheCreate(&PLogFreeEntry, "Plog",
			sizeof(FreePurgeLogEntries_s), NULL) != zOK)
	{
		/* undo both control stores */
		CONTROL_Shutdown( &XactionControl);
		CONTROL_Shutdown( &FreeUserDataBlksControl);
		RTN_STATUS(zFAILURE);
	}


#if NSS_DEBUG IS_ENABLED
	/* debug builds track every outstanding transaction on Xinuse */
	DQ_INIT( &Xinuse);
#endif

	RTN_STATUS(zOK);
}

/**************************************************************************
 * XACT_Shutdown releases the two control stores and the object cache
 * that XACT_Startup created.
 **************************************************************************/
void XACT_Shutdown (void)
{
	ENTER(TXACTION, XACT_Shutdown);

	CONTROL_Shutdown( &XactionControl);
	CONTROL_Shutdown( &FreeUserDataBlksControl);
	objCacheDestroy( &PLogFreeEntry);

	RTN_VOID();
}

/**************************************************************************
 * free xaction structure
 *
 * Hands the transaction back through CONTROL_FREE.  Debug builds first
 * unlink it through its inuse link and record two histogram samples:
 * begin-to-commit (commitStamp - timeStamp) and begin-to-now
 * (Ticks - timeStamp).
 **************************************************************************/
void freeXaction (ZfsXaction_s *xaction)
{
	DEBUG_PRINTF(TXACTION,DBG_BOTH,(LMAGENTA,MSGNot("Free ZfsXaction_s %08x\n"),xaction));

#if NSS_DEBUG IS_ENABLED
	DQ_RMV(xaction, inuse);
	EVENT_HISTOGRAM(XcommitHistogram,
			xaction->commitStamp - xaction->timeStamp);
	EVENT_HISTOGRAM(XendHistogram, Ticks - xaction->timeStamp);
#endif

	CONTROL_FREE(xaction);
}

/**************************************************************************
 * writeCommitRecord writes the log record that ends a transaction.
 *
 * Nothing is written when the transaction state already has
 * XAS_XR_TS_END set.  Otherwise one of two records is produced:
 *	- logical-undo transaction (XFLAG_LOGICAL_UNDO): an
 *	  XFUNC_COMPENSATION record whose payload is the LSN stored in
 *	  ZX_logicalUndoLSN; its state is XAS_XR_TS_END unless
 *	  X_DONT_COMMIT is set, in which case the state is 0.
 *	- any other transaction: an empty XFUNC_IGNORE record with state
 *	  XAS_XR_TS_END.
 * In both cases the transaction agent is bound to the log barrier agent
 * and the record is released.
 ***************************************************************************/
void writeCommitRecord (ZfsXaction_s *xaction)
{
	ZfsXasRecovery_s *log;
	Lsn_t *lsn;

	ENTER(TXACTION, writeCommitRecord);

	if (xaction->xstate & XAS_XR_TS_END)
	{
		/* already ended; a logical-undo xaction must never get here */
		zASSERT(!(xaction->xflags & XFLAG_LOGICAL_UNDO));
		RTN_VOID();
	}

	if (xaction->xflags & XFLAG_LOGICAL_UNDO)
	{
		/* These are the EndXLocal that are called during Pass 7
		 * on transactions that were started by the logical undo
		 * routines.  We want to save the LSN of the logical undo
		 * record that initiated this xaction.
		 */
		ZLOG_ObtainRecord( xaction, sizeof(Lsn_t) );

		log = xaction->ZX_zlogBeast->ZLB_ZfsXasRecovery;
		if (xaction->xstate & X_DONT_COMMIT)
		{
			/* This is the condition that an error occurred while
			 * in Pass 7 of recovery.  We do not want to mark
			 * this xaction with an XAS_XR_TS_END state, as we want
			 * to undo it if the user tries to bring the system back up.
			 */
			log->ZXR_TransactionState = 0;
		}
		else
		{
			log->ZXR_TransactionState = XAS_XR_TS_END;
		}
		log->ZXR_FunctionIndex = XFUNC_COMPENSATION;
		log->ZXR_PoolBlockCount = 0;
		/* the record payload is the LSN of the initiating undo record */
		lsn = ZLOG_START_OF_LOG_RECORD(log);
		zASSERT(xaction->ZX_logicalUndoLSN != 0);
		*lsn = xaction->ZX_logicalUndoLSN;
	}
	else
	{
		/* This is the normal running condition */
		ZLOG_ObtainRecord( xaction, 0 );

		log = xaction->ZX_zlogBeast->ZLB_ZfsXasRecovery;
		log->ZXR_TransactionState = XAS_XR_TS_END;
		log->ZXR_FunctionIndex = XFUNC_IGNORE;
		log->ZXR_PoolBlockCount = 0;

	}
	/* earlier revision bound to the log buffer agent instead of the barrier: */
//	bind( &xaction->xaction.agent, &xaction->ZX_zlogBeast->ZLB_Buffer->agent);
	bind( &xaction->xaction.agent, &xaction->ZX_zlogBeast->ZLB_Barrier->ZB_Agent);
	ZLOG_ReleaseRecord(xaction);

	RTN_VOID();
}

/**************************************************************************
 * BeginXLocal begins a local transaction
 * Given: volume where transaction is to run
 *	flags - specifies if called from common layer.  We currently
 *		use to determine that we can throttle the caller.
 * Returns: A transaction with
 *	Xid
 *	log file set
 *
 * The transaction is taken from XactionControl and starts in state
 * XAS_XR_TS_START.  Its id is the pool restart count paired with the
 * next value of the global XidUnique counter.  BXL_LOGICAL_UNDO and
 * BXL_LOGICAL_UNDO_FREE_TREE_INSERT in flags are translated into the
 * matching XFLAG_ bits; X_CF_OK_TO_THROTTLE lets Zlog_Throttle run
 * before the transaction is returned.
 ***************************************************************************/
ZfsXaction_s *BeginXLocal (Volume_s *volume, NINT flags)
{
	ZfsXaction_s *xaction;
	ZfsVolume_s *zfsvol = STRUCT(volume, ZfsVolume_s, vol);

#if NSS_DEBUG IS_ENABLED
#ifndef __linux__
	/* Words read relative to the address of the first parameter;
	 * presumably the return address of this call and of the caller's
	 * caller -- TODO confirm against the calling convention in use.
	 */
	LONG callingRoutine = *(((LONG *)&volume) - 1);
	LONG callingRoutineCaller = *((*(((LONG **)&volume) - 2)) + 1);
#endif
#endif

	xaction = CONTROL_get( &XactionControl);
	/* NOTE(review): only an assert guards a NULL return; the pointer is
	 * dereferenced unconditionally below.
	 */
	zASSERT(xaction != NULL);

	DQ_INIT(&xaction->freeUserDataBlks);
	DQ_INIT(&xaction->freePurgeLogEntries);
	xaction->xstate = XAS_XR_TS_START;
	/* pool and volume are expected to agree on the restart count */
	zASSERT( zfsvol->pool->ZFSPOOLvol.v_restartCount == zfsvol->ZFSVOLvol.v_restartCount );
	xaction->localXid.restartCount = zfsvol->pool->ZFSPOOLvol.v_restartCount;
	xaction->localXid.unique = ++XidUnique;

	xaction->xflags = 0;
	if ( flags & BXL_LOGICAL_UNDO )
	{
		xaction->xflags |= XFLAG_LOGICAL_UNDO;
	}
	if (flags & BXL_LOGICAL_UNDO_FREE_TREE_INSERT)
	{
		xaction->xflags |= XFLAG_LOGICAL_UNDO_FREE_TREE_INSERT;
	}
	xaction->ZX_zlogBeast = zfsvol->zv_zfsLogBeast;
//	xaction->ZX_zfsPool = zfsvol->pool;
	xaction->ZX_zfsVol = zfsvol;
	/* stays 0 unless a logical-undo caller fills it in; EndXlocal asserts on it */
	xaction->ZX_logicalUndoLSN = 0;
	DQ_INIT( &xaction->ZX_deleteBlkQ );

	if ( flags & X_CF_OK_TO_THROTTLE )
	{
		Zlog_Throttle( volume, xaction );
	}

#if NSS_DEBUG IS_ENABLED
	{
		/* Debug only: look at the transaction at the head of Xinuse and
		 * complain when it has been outstanding for MaxXactionTicks or
		 * longer.
		 */
		Time_t time = Ticks;
		ZfsXaction_s *xact;

		DQ_PEEK( &Xinuse, xact, ZfsXaction_s, inuse);
		if (xact)
		{
			if (time - xact->timeStamp >= MaxXactionTicks)
			{
#ifdef __linux__
				printk("<1>" "Xaction slow status = 0x%08x; id = 0x%08x, currentid=0x%08x\n", xact->xstate, (LONG)xact->localXid.unique, (LONG)XidUnique);
				aprintf(CYAN,"stat = 0x%08x; id = 0x%08x, currentid=0x%08x\n", xact->xstate, (LONG)xact->localXid.unique, (LONG)XidUnique);
#else
				aprintf(CYAN, MSGNot("Caller = 0x%08x; CallerCaller = 0x%08x\n"), xact->caller, xact->callerCaller);
				zASSERT("Transaction too slow" == NULL);
#endif
			}
		}
	}
	xaction->timeStamp = Ticks;
	xaction->commitStamp = 0;
#ifndef __linux__
	xaction->caller = callingRoutine;
	xaction->callerCaller = callingRoutineCaller;
#endif
	xaction->numFreeNonusableBlks = 0;
	DQ_ENQ( &Xinuse, xaction, inuse);
#endif

	DEBUG_PRINTF(TXACTION,DBG_BOTH,(LMAGENTA,MSGNot("Getting ZfsXaction_s %08x\n"),xaction));
	return xaction;
}

//
// Common Op Entry
//
// Thin wrapper around BeginXLocal that returns the transaction as the
// generic Xaction_s type.
//
Xaction_s *ZFSVOL_VOL_beginXLocal(Volume_s *volume, NINT flags)
{
	return( (Xaction_s *)BeginXLocal( volume, flags ) );
}



/**************************************************************************
 * XisHomed is called as a finite state machine
 *
 * This is the handler registered on the transaction agent by XACT_Init.
 * It recovers the owning transaction from the agent pointer and calls
 * ZLOG_TransactionHomed once the transaction carries X_COMMITTED; it
 * does nothing for a transaction that is not yet committed.
 ***************************************************************************/
void XisHomedSignalHandler (Agent_s *agent)
{
	/* the cast relies on the Xaction_s being the leading part of ZfsXaction_s
	 * -- NOTE(review): confirm against the structure definition
	 */
	ZfsXaction_s *xaction = (ZfsXaction_s *)STRUCT(agent, Xaction_s, agent);

	ENTER(TXACTION, XisHomedSignalHandler);

	if (xaction->xstate & X_COMMITTED)
	{	/*
		 * The Xaction is committed and everything is in its
		 * home location so we can finish up.
		 */
		ZLOG_TransactionHomed(xaction);
	}
	RTN_VOID();
}

/**************************************************************************
 *
 * EndXlocal finishes the transaction passed to it.
 *
 * Steps, in order:
 *	1. queue any deferred pool blocks (ZX_deleteBlkQ)
 *	2. write the commit record (writeCommitRecord)
 *	3. release the free user data blocks and return every purge log
 *	   entry held by the transaction to the free list
 *	4. set X_COMMITTED and lazyFlush the transaction agent
 *
 ***************************************************************************/
void EndXlocal (ZfsXaction_s *xaction)
{
	FreePurgeLogEntries_s *pLogEntry;

	ENTER(TXACTION, EndXlocal);

#if NSS_DEBUG IS_ENABLED
	/* debug timing: freeXaction turns this into a histogram sample */
	xaction->commitStamp = Ticks;
#endif

	/* queue any deferred pool blocks onto the pool queue */
	if (DQ_NOT_EMPTY(&xaction->ZX_deleteBlkQ))
	{
		XqueueDeferredPoolBlks(xaction);
	}

#if NSS_ASSERT IS_ENABLED
	/* a logical-undo xaction must carry the LSN of its undo record,
	 * every other xaction must not
	 */
	if (xaction->xflags & XFLAG_LOGICAL_UNDO)
	{
		zASSERT(xaction->ZX_logicalUndoLSN != 0);
	}
	else
	{
		zASSERT(xaction->ZX_logicalUndoLSN == 0);
	}
#endif
	writeCommitRecord(xaction);

	if (DQ_NOT_EMPTY(&xaction->freeUserDataBlks))
	{
		RELEASE_FREE_DATA_BLKS(xaction);
	}
	/* drain the purge log entries collected during the transaction */
	while (DQ_NOT_EMPTY(&xaction->freePurgeLogEntries))
	{
		DQ_DEQ(&xaction->freePurgeLogEntries, pLogEntry,
				FreePurgeLogEntries_s, xactionLink);
		PLOG_PutEntryOnFreeList(pLogEntry);
	}

#if NSS_DEBUG IS_ENABLED
	zASSERT(xaction->numFreeNonusableBlks == 0);
#endif

	/* The X_COMMITTED state on the xaction is what causes the xaction to
	 * eventually be freed from memory and for ZLOG_TransactionHomed
	 * to be called.
	 * We do not want to set this bit until this point, because this is
	 * where we are done using the xaction and one of the 2 paths below
	 * will cause the signal handler for the xaction to be called.
	 */
	xaction->xstate |= X_COMMITTED;

	lazyFlush( &xaction->xaction.agent);

	PERIODIC_YIELD();
	RTN_VOID();
}

//
// Common Op Entry
//
// Thin wrapper around EndXlocal that accepts the generic Xaction_s type.
//
void ZFSVOL_VOL_endXLocal(Xaction_s *xaction)
{
	EndXlocal((ZfsXaction_s *) xaction);
}

/**************************************************************************
 *
 * EndXlocalAsync finishes the transaction passed to it asynchronously.
+ * + ***************************************************************************/ + +void EndXlocalAsync (ZfsXaction_s *xaction) +{ + + ENTER(TXACTION, EndXlocalAsync); + + zASSERT("EndXlocalAsync can no longer be used with logical undo" == NULL); + +//#if NSS_DEBUG IS_ENABLED +// { +// char buffer[40]; +// +// DBG_DebugPrintf(CYAN, MSGNot("ASYNC EndXlocal %s\n"), +// UTCTime2Str(GetUTCTime(),&buffer[0]) ); +// } +//#endif + /* + * This only works because the FSM and the ZfsXaction_s + * are at offset 0. + */ + WORK_Schedule_HIGH( &xaction->xaction.agent.fsm, EndXlocal, 0); + RTN_VOID(); + +} + + +#if 0 +#if NSS_DEBUG IS_ENABLED +#define MAX_PAGES (1<<14) +#define PAGE_REALLOC 0x1000 +#define PAGE_ALLOC 0x2000 + +struct Pages_s +{ + Blknum_t blk; + WORD logState; + WORD cnt; +} Pages[MAX_PAGES]; + +NINT PageIndex = 0; + +#define SET_PAGE(_binfo, _alloc) \ +{ \ + Pages[PageIndex].blk = (_binfo)->blkNum; \ + Pages[PageIndex].logState = ((_binfo)->logState | (_alloc)); \ + Pages[PageIndex].cnt = Rec.pass1; \ + ++PageIndex; \ + if (PageIndex == MAX_PAGES) PageIndex = 0; \ +} +#else +#define SET_PAGE(_binfo, _alloc) ((void)0) +#endif +#endif + + +/************************************************************************* + * + * Init the freed block table + * + *************************************************************************/ +STATUS initXfreeTable ( + XfreeTable_s *freeTable) +{ + STATUS status; + + /* + * The free table keeps track of which block were freed and allocated + * in the log so we can ignore operations on the freed + * blocks. + */ + ENTER(TXACTION, initXfreeTable); + + freeTable->size = INITIAL_XTABLE_SIZE; + freeTable->blks = zalloc(sizeof(XfreedBlk_s) * INITIAL_XTABLE_SIZE); + freeTable->free = 0; + + status = ((freeTable->blks == NULL) ? 
zFAILURE : zOK); + + RTN_STATUS(status); +} + +/************************************************************************* + * + * Free the freed block table + * + *************************************************************************/ +void freeXfreeTable ( + XfreeTable_s *freeTable) +{ + ENTER(TXACTION, freeXfreeTable); + + free(freeTable->blks); + freeTable->size = 0; + + RTN_VOID(); +} + +/************************************************************************* + * + * Find an entry in the undo transaction table + * return - an index into the table (-1 = not found) + * + *************************************************************************/ +NINT findUndoEntry ( + UndoTable_s *undoTable, + LocalXid_t xid) /* input - block number we are looking for */ +{ + NINT i; + + ENTER(TXACTION, findUndoEntry); + + for (i = 0; i < undoTable->numEntries; i++) + { + if ((undoTable->xids[i].unique == xid.unique) && + (undoTable->xids[i].restartCount == xid.restartCount)) + { + RTN_NINT(i); + } + } + RTN_NINT(-1); +} + +/************************************************************************* + * + * Find a block in the freed block table + * + *************************************************************************/ +XfreedBlk_s *findFreedBlk ( + XfreeTable_s *freeTable, + Blknum_t poolBlk) +{ + NINT i; + + ENTER(TXACTION, findFreedBlk); + + for (i = 0; i < freeTable->free; ++i) + { + if (poolBlk == freeTable->blks[i].poolBlk) + { + RTN_PTR( &freeTable->blks[i]); + } + } + RTN_PTR(NULL); +} + +/************************************************************************* + * + * Add an entry to the freed block table + * + *************************************************************************/ +STATUS insertFreedBlk ( + GeneralMsg_s *genMsg, + XfreeTable_s *freeTable, + Blknum_t poolBlk) +{ + XfreedBlk_s *blks; + + ENTER(TXACTION, insertFreedBlk); + + blks = freeTable->blks; + if (freeTable->size == freeTable->free) + { + freeTable->size *= XTABLE_GROWTH_FACTOR; + blks = 
realloc(blks, sizeof(XfreedBlk_s) * freeTable->size); + if (blks == NULL) + { /* Couldn't allocate memory */ + SetErrno(genMsg, zERR_NO_MEMORY); + RTN_STATUS( zFAILURE ); + } + freeTable->blks = blks; + } +#if NSS_DEBUG IS_ENABLED + { + NINT i; + for (i = 0; i < freeTable->free; ++i) + { + if (poolBlk == blks[i].poolBlk) + { + zASSERT("Freed blk should not be in the log twice" == NULL); + RTN_STATUS( zOK ); + } + } + } +#endif + blks[freeTable->free].poolBlk = poolBlk; + ++freeTable->free; + RTN_STATUS( zOK ); +} + +/************************************************************************* + * + * Get the blocks that have been freed during a logged operation + * + *************************************************************************/ +STATUS recoverFreedBlocks ( + GeneralMsg_s *genMsg, + XfreeTable_s *freeTable, + ZfsXasRecovery_s *logRec) +{ + BlockInfo_s *blkInfo; + NINT i; + STATUS status; + + ENTER(TXACTION, recoverFreedBlocks); + + blkInfo = ZLOG_START_OF_POOL_BLOCKS(logRec); + for (i = 0; i < logRec->ZXR_PoolBlockCount; ++i) + { + if (blkInfo[i].blkNum != 0) + { + if (blkInfo[i].logState & LOG_FREE_NODE) + { + zASSERT(!(blkInfo[i].logState & LOG_ALLOC_NODE)); + + status = insertFreedBlk(genMsg, freeTable, blkInfo[i].blkNum); + if ( status != zOK ) + { + RTN_STATUS( status ); + } + } + } + } + RTN_STATUS( zOK ); +} + +/************************************************************************* + * + * Set the replay state based on the entries in the freed block table + * + *************************************************************************/ +void setReplayState ( + XfreeTable_s *freeTable, + ZfsXasRecovery_s *logRec) +{ + BlockInfo_s *blkInfo; + XfreedBlk_s *blk; + NINT i; + + ENTER(TXACTION, markFreedBlocks); + + blkInfo = ZLOG_START_OF_POOL_BLOCKS(logRec); + for (i = 0; i < logRec->ZXR_PoolBlockCount; ++i) + { + if (blkInfo[i].blkNum != 0) + { + blk = findFreedBlk(freeTable, blkInfo[i].blkNum); + if (blk != NULL) + { + blkInfo[i].replayState = 
REPLAY_DONT_PROCESS; + } + else + { + blkInfo[i].replayState = 0; + } + } + } + RTN_VOID(); +} + + + +/************************************************************************* + * + * Init the transaction table + * + *************************************************************************/ +STATUS initXactTable ( + XactTable_s *xactTable) +{ + STATUS status; + + /* + * Xaction table keeps track of which transactions have not been + * been committed. + */ + ENTER(TXACTION, initXactTable); + + xactTable->size = INITIAL_XTABLE_SIZE; + xactTable->xEntry = zalloc(sizeof(XactionTableEntry_s) * INITIAL_XTABLE_SIZE); + xactTable->numEntries = 0; + xactTable->numUncommittedEntries = 0; + + status = ((xactTable->xEntry == NULL) ? zFAILURE : zOK); + + RTN_STATUS(status); +} + +/************************************************************************* + * + * Free the transaction table + * + *************************************************************************/ +void freeXactTable ( + XactTable_s *xactTable) +{ + ENTER(TXACTION, freeXactTable); + + free(xactTable->xEntry); + xactTable->size = 0; + + RTN_VOID(); +} + +/************************************************************************* + * + * Put an entry in the transaction table + * + *************************************************************************/ +STATUS addXactEntry ( + XactTable_s *xactTable, /* input - transaction table */ + NINT state, /* input - committed/completed state */ + LocalXid_t xid, /* input - transaction id */ +#if NSS_DEBUG IS_ENABLED + ZlogRecoveryKey_s *key, /* Input - to get the time */ +#endif + NINT *index) /* output - index where the entry was added */ +{ + XactionTableEntry_s *xEntry; + + ENTER(TXACTION, addXactEntry); + + if (xactTable->size <= xactTable->numEntries) + { /* grow the table */ + xactTable->size *= XTABLE_GROWTH_FACTOR; + xEntry = realloc(xactTable->xEntry, sizeof(XactionTableEntry_s) * xactTable->size); + if (xEntry == NULL) + { + RTN_STATUS(zFAILURE); + } + 
xactTable->xEntry = xEntry; + } + /* + * Pass 1 - Every call does ++ + * Pass 2 - No call does ++ + */ + if (!(state & XTE_COMMITTED)) + { /* uncommitted */ + xactTable->numUncommittedEntries++; + } + xactTable->xEntry[xactTable->numEntries].state = state; + xactTable->xEntry[xactTable->numEntries].xid = xid; +#if NSS_DEBUG IS_ENABLED + xactTable->xactionTime = ZLOG_RecoveryTimeGet(key); +#endif + STK_INIT(xactTable->xEntry[xactTable->numEntries].blockList); + *index = xactTable->numEntries; + xactTable->numEntries++; + + RTN_STATUS(zOK); +} + +/************************************************************************* + * + * Remove an entry from the transaction table + * + *************************************************************************/ +void removeXactEntry ( + XactTable_s *xactionTable, + BlockTable_s *blockTable, + NINT index) /* input - array index into the table */ +{ + BlockNumberEntry_s *entry; + + ENTER(TXACTION, removeXactEntry); + /* + * Pass 1 - Every call does -- + * Pass 2 - May or may not do -- + */ + if (!(xactionTable->xEntry[index].state & XTE_COMMITTED)) + { /* completed and uncommitted */ + xactionTable->numUncommittedEntries--; + } + + /* put the blocks on the block table avail list */ + do + { + STK_POP(xactionTable->xEntry[index].blockList, entry, BlockNumberEntry_s, + link); + if (entry != NULL) + { + STK_PUSH(blockTable->availList, entry, link); + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, (BLUE, + MSGNot("Remove blk from xact. 
xid=%d blk=%d blockAddr=%x\n"), + xactionTable->xEntry[index].xid.unique, entry->block, entry)); + } + } while (entry != NULL); + + xactionTable->xEntry[index] = + xactionTable->xEntry[--xactionTable->numEntries]; + RTN_VOID(); +} + +/************************************************************************* + * + * Find an entry from the transaction table + * return - an index into the transaction table (-1 = not found) + * + *************************************************************************/ +NINT findXactEntry ( + XactTable_s *xactTable, + LocalXid_t xid) /* input - transaction id */ +{ + NINT i; + + ENTER(TXACTION, findXactEntry); + + for (i = 0; i < xactTable->numEntries; ++i) + { + if ((xid.unique == xactTable->xEntry[i].xid.unique) + && (xid.restartCount == xactTable->xEntry[i].xid.restartCount)) + { + RTN_NINT(i); + } + } + RTN_NINT(-1); +} + + +/************************************************************************* + * + * Init the block table + * + *************************************************************************/ +STATUS initBlockTable ( + BlockTable_s *blockTable) +{ + BlockTableMem_s *memBlock; + + ENTER(TXACTION, initBlockTable); + + memBlock = zalloc(sizeof(BlockTableMem_s)); + if (memBlock == NULL) + { /* error */ + RTN_STATUS(zFAILURE); + } + + STK_INIT(blockTable->memList); + STK_PUSH(blockTable->memList, memBlock, nextMem); + /* init the avail list */ + STK_INIT_ELEMENTS(blockTable->availList, memBlock->entry, + BLOCK_TABLE_ALLOC_SIZE, BlockNumberEntry_s, link); + + RTN_STATUS(zOK); +} + +/************************************************************************* + * + * Free the block table + * + *************************************************************************/ +void freeBlockTable ( + BlockTable_s *blockTable) +{ + BlockTableMem_s *memBlock; + + ENTER(TXACTION, freeBlockTable); + + while (STK_NOT_EMPTY(blockTable->memList)) + { + STK_POP(blockTable->memList, memBlock, BlockTableMem_s, nextMem); + free(memBlock); + } + + 
RTN_VOID(); +} + +/************************************************************************* + * + * Put an entry in the block table for a transaction + * + *************************************************************************/ +STATUS updateBlock ( + XactTable_s *xactionTable, + BlockTable_s *blockTable, + NINT xactionLoc, + BlockInfo_s *blkInfo, +// NINT generation, + NINT sequence) +{ + BlockNumberEntry_s *blockPtr; +// BlockNumberEntry_s *blockEntry; + BlockTableMem_s *memBlock; + + + ENTER(TXACTION, updateBlock); + + blockPtr = NULL; + +// STK_FOREACH(xactionTable->xEntry[xactionLoc].blockList, blockEntry, +// BlockNumberEntry_s, link) +// { +// if (blockEntry->block == blkInfo->blkNum) +// { +// blockPtr = blockEntry; +// break; +// } +// } +// +// if (blockPtr == NULL) +// { /* no block entry found that matches -- add a new entry */ + if (STK_EMPTY(blockTable->availList)) + { /* nothing on the avail list -- make more room in the table */ + memBlock = zalloc(sizeof(BlockTableMem_s)); + if (memBlock == NULL) + { + RTN_STATUS(zFAILURE); + } + STK_PUSH(blockTable->memList, memBlock, nextMem); + /* init the avail list */ + STK_INIT_ELEMENTS(blockTable->availList, memBlock->entry, + BLOCK_TABLE_ALLOC_SIZE, BlockNumberEntry_s, link); + } + /* get an entry from the avail list and link it onto the transaction */ + STK_POP(blockTable->availList, blockPtr, BlockNumberEntry_s, link); + STK_PUSH(xactionTable->xEntry[xactionLoc].blockList, blockPtr, link); +DEBUG_PRINTF(TXACTION, DBG_NOINDENT, (LBLUE,MSGNot("Add blk to xact. 
xid=%d blk=%d blockAddr=%x\n"), + xactionTable->xEntry[xactionLoc].xid.unique, blkInfo->blkNum, blockPtr)); +// } + /* fill in the new information */ + blockPtr->block = blkInfo->blkNum; +// blockPtr->generation = generation; + blockPtr->sequence = sequence; +// blockPtr->state = blkInfo->logState; + + RTN_STATUS(zOK); +} + +/************************************************************************* + * + * Init the uncommitted transaction table + * + *************************************************************************/ +STATUS initUncommittedTable ( + UncommittedTable_s *uncommittedTable) +{ + STATUS status; + + ENTER(TXACTION, initUncommittedTable); + + uncommittedTable->size = INITIAL_UTABLE_SIZE; + uncommittedTable->xids = + zalloc(sizeof(LocalXid_t) * INITIAL_UTABLE_SIZE); + uncommittedTable->numEntries = 0; + + status = ((uncommittedTable->xids == NULL) ? zFAILURE : zOK); + + RTN_STATUS(status); +} + +/************************************************************************* + * + * Free the uncommitted transaction table + * + *************************************************************************/ +void freeUncommittedTable ( + UncommittedTable_s *uncommittedTable) +{ + ENTER(TXACTION, freeUncommittedTable); + + free(uncommittedTable->xids); + uncommittedTable->size = 0; + + RTN_VOID(); +} + +/************************************************************************* + * + * Put an entry in the uncommitted transaction table + * + *************************************************************************/ +STATUS pushUncommittedEntry ( + UncommittedTable_s *uncommittedTable, + LocalXid_t xid) /* input - transaction id */ +{ + LocalXid_t *tableEntry; + + ENTER(TXACTION, pushUncommittedEntry); + + if (uncommittedTable->size <= uncommittedTable->numEntries) + { /* grow the table */ + uncommittedTable->size *= XTABLE_GROWTH_FACTOR; + tableEntry = realloc(uncommittedTable->xids, + sizeof(LocalXid_t) * uncommittedTable->size); + if (tableEntry == NULL) + { + 
RTN_STATUS(zFAILURE); + } + uncommittedTable->xids = tableEntry; + } + uncommittedTable->xids[uncommittedTable->numEntries] = xid; + uncommittedTable->numEntries++; + + RTN_STATUS(zOK); +} + +/************************************************************************* + * + * get an entry from the uncommitted transaction table + * + *************************************************************************/ +LocalXid_t popUncommittedEntry ( + UncommittedTable_s *uncommittedTable) +{ + zASSERT(uncommittedTable->numEntries >= 1); + return uncommittedTable->xids[--uncommittedTable->numEntries]; +} + +/************************************************************************* + * + * Init the undo transaction table + * + *************************************************************************/ +STATUS initUndoTable ( + UndoTable_s *undoTable) +{ + STATUS status; + + ENTER(TXACTION, initUndoTable); + + undoTable->size = INITIAL_UTABLE_SIZE; + undoTable->xids = + zalloc(sizeof(LocalXid_t) * INITIAL_UNDO_TABLE_SIZE); + undoTable->numEntries = 0; + + status = ((undoTable->xids == NULL) ? 
zFAILURE : zOK); + + RTN_STATUS(status); +} + +/************************************************************************* + * + * Free the undo transaction table + * + *************************************************************************/ +void freeUndoTable ( + UndoTable_s *undoTable) +{ + ENTER(TXACTION, freeUndoTable); + + free(undoTable->xids); + undoTable->size = 0; + + RTN_VOID(); +} + +/************************************************************************* + * + * Put an entry in the undo transaction table + * + *************************************************************************/ +STATUS addUndoEntry ( + UndoTable_s *undoTable, + LocalXid_t xid) /* input - transaction id */ +{ + LocalXid_t *tableEntry; + + ENTER(TXACTION, addUndoEntry); + + if (undoTable->size <= undoTable->numEntries) + { /* grow the table */ + undoTable->size *= XTABLE_GROWTH_FACTOR; + tableEntry = realloc(undoTable->xids, + sizeof(LocalXid_t) * undoTable->size); + if (tableEntry == NULL) + { + RTN_STATUS(zFAILURE); + } + undoTable->xids = tableEntry; + } + undoTable->xids[undoTable->numEntries] = xid; + undoTable->numEntries++; + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (LBLUE, MSGNot("Added undo. xid=%d\n"), xid)); + + RTN_STATUS(zOK); +} + +/************************************************************************* + * + * Remove an entry from the undo table + * + *************************************************************************/ +void removeUndoEntry ( + UndoTable_s *undoTable, + NINT index) /* input - array index into the table */ +{ + ENTER(TXACTION, removeUndoEntry); + + undoTable->xids[index] = undoTable->xids[--undoTable->numEntries]; + + RTN_VOID(); +} + +/************************************************************************* + * + * PASS 2 of Transaction Recovery. + * + * Do analysis to figure out what transactions need to be undone. 
This + * pass figures out which transactions are dependent on others being done + * and returns a list of those that must be undone. The dependency is + * based on block references. If a committed transaction references + * a block AFTER (higher LSN) an uncommitted transaction then the committed + * transaction will be converted to an uncommitted transaction. + * + * Returns zOK once xactionTable has no uncommitted entries left. + * Returns zFAILURE if a table cannot be set up or grown (errno is set + * to zERR_NO_MEMORY), if ZLOG_RecoveryGet fails, or if an xid popped + * from the uncommitted stack is not found in xactionTable. + * + *************************************************************************/ +STATUS undoAnalysis( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + XactTable_s *xactionTable, /* input - transaction table */ + UndoTable_s *undoTable) /* output - the table is filled in with the + transactions that need to be undone */ +{ + ZlogRecoveryKey_s key; + ZfsXasRecovery_s *logRec; + BlockInfo_s *blkInfo; + BlockNumberEntry_s *committedEntry; + BlockNumberEntry_s *unCommittedEntry; + BlockTable_s blockTable; + UncommittedTable_s uncommittedTable; +// NINT location; + NINT sequence; + NINT xactionLoc; +// NINT generationNum; + NINT i; + + ENTER(TXACTION, undoAnalysis); + +#if NSS_DEBUG IS_ENABLED + if (xactionTable->numEntries > 0) + { + DBG_DebugPrintf( LRED, + MSGNot("** Uncommitted Transactions (Type 3 transactions) **\n")); + for (i = 0; i < xactionTable->numEntries; i++) + { + char buffer[40]; + + aprintf(CYAN, MSGNot(" (DEBUG) Xid %d - Uncommitted %s\n"), + xactionTable->xEntry[i].xid.unique, + UTCTime2Str(xactionTable->xactionTime,&buffer[0]) ); + DBG_DebugPrintf(LGREEN, MSGNot(" (DEBUG) Xid %d - Uncommitted (state - %d) %s\n"), + xactionTable->xEntry[i].xid.unique, + xactionTable->xEntry[i].state, + UTCTime2Str(xactionTable->xactionTime,&buffer[0]) ); + + } + } +#endif + + /* + * Init the various tables we are going to need + */ + if (initBlockTable(&blockTable) != zOK) + { + zASSERT("Unable to init the block table"==0); + SetErrno(genMsg, zERR_NO_MEMORY); + goto error; + } + if (initUncommittedTable(&uncommittedTable) != zOK) + { + zASSERT("Unable to init the uncommitted table"==0); + SetErrno(genMsg, zERR_NO_MEMORY); 
+ goto freeBlock; + } + + sequence = 0; + ZLOG_RecoveryOpen(ZLOG_RECOVERY_END | ZLOG_RECOVERY_EXPANDED, pool, &key); + while (xactionTable->numUncommittedEntries > 0) + { /* there are still entries that might need to be undone */ + if (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_PREVIOUS) != zOK) + { + if (GetErrno(genMsg) == zERR_ZLOG_NO_MORE_RECORDS) + { /* hit the front of the log -- need to run zRepair */ + errPrintf(WHERE, Module, 1410, + MSG("Unable to recover the file system -- " + "run Rebuild before continuing.", 860)); + zASSERT( "Please tell Greg that this ASSERT occurred" == NULL ); + } + goto errorClose; + } + pool->zp_stats.pass2++; + logRec = key.ZRK_ZfsXasRecovery; + zASSERT(logRec->ZXR_FunctionIndex < XFUNC_MAX); + blkInfo = ZLOG_START_OF_POOL_BLOCKS(logRec); + sequence++; + /* see if the entry is already in the xaction table */ + xactionLoc = findXactEntry(xactionTable, logRec->ZXR_LocalXid); + if (xactionLoc == -1) + { /* no entry found -- add it */ + if (addXactEntry(xactionTable, XTE_COMMITTED, logRec->ZXR_LocalXid, +#if NSS_DEBUG IS_ENABLED + &key, +#endif + &xactionLoc) != zOK) + { /* the table is full */ + zASSERT("Too many open transactions"==0); + SetErrno(genMsg, zERR_NO_MEMORY); + goto errorClose; + } + } + /* + * This next code ignores blocks that are part of the Purge + * Log because many transactions use the same blocks. If + * we did not ignore them we would greatly increase the number + * of committed transactions that would have to be uncommitted. + * We can ignore Purge Log Blocks because the Purge Log's undo + * and redo routines operate logically on a block! + * + * The Purge Log can be logical at the block level because + * a block change does NOT affect other blocks - unlike changes + * to B-Trees. + * + * FixFixFix6(Randy) - If the original operation is a DELETE that + * does not commit and the purge log block becomes full 20 + * transactions later before we crash. 
On the UNDO pass would + * we not do an insert and not have enough space??? + * + * Note that we CANNOT ignore some XFUNC_VOLINFO_MODIFY blocks + * because on a crash during the UNDO pass the LSN would be left + * LOW. Then on the next activate we would REDO some + * items that had already been done and thus end up with + * incorrect values. + */ + +// if ((logRec->ZXR_FunctionIndex != XFUNC_PLOG_INSERT) && +// (logRec->ZXR_FunctionIndex != XFUNC_PLOG_DELETE)) + /* FixFixFix6 - This (above) fixes above fix item by not skipping + * purge log entries. Could allow skipping of INSERT??? + * Note this fix causes more uncommits of committed + * transactions. + */ + { /* if not a purgelog function then look at all of the blocks */ + for (i = 0; i < logRec->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + if (updateBlock(xactionTable, &blockTable, xactionLoc, &blkInfo[i], + /*generationNum,*/ sequence) != zOK) + { /* the table is full */ + zASSERT("Too many blocks in a transaction"==0); + goto memErrorClose; + } + } + } + if (logRec->ZXR_TransactionState & XAS_XR_TS_START) + { /* this is the first log record in a transaction */ + xactionTable->xEntry[xactionLoc].state |= XTE_COMPLETED; + if (!(xactionTable->xEntry[xactionLoc].state & XTE_COMMITTED)) + { /* the transaction we are finishing is uncommitted */ + if (pushUncommittedEntry(&uncommittedTable, + xactionTable->xEntry[xactionLoc].xid) + != zOK) + { /* the table is full */ + zASSERT("Too many uncommitted entries"==0); + goto memErrorClose; + } + } + } + + /* + * Figure out what transactions need to be undone + */ + while (uncommittedTable.numEntries > 0) + { /* while there are entries on the uncommitted stack */ + xactionLoc = findXactEntry(xactionTable, + popUncommittedEntry(&uncommittedTable)); + if (xactionLoc == -1) + { + zASSERT("unable to find transaction from uncommitted stack\n"==0); + goto errorClose; + } + for (i = 0; i < xactionTable->numEntries; i++) + { /* check each entry in the 
transaction table */ + if ((i != xactionLoc) && (xactionTable->xEntry[i].state & + XTE_COMMITTED)) + { /* if the entry is in the committed state */ + STK_FOREACH(xactionTable->xEntry[i].blockList, committedEntry, + BlockNumberEntry_s, link) + { /* check each block in the committed table entry */ + STK_FOREACH(xactionTable->xEntry[xactionLoc].blockList, + unCommittedEntry, BlockNumberEntry_s, link) + { /* check each block in the uncommitted entry */ +//printf("committed[%x]=%d uncommitted[%x]=%d\n", &committedEntry, committedEntry->block, +// &unCommittedEntry, unCommittedEntry->block); + if ((unCommittedEntry->block == committedEntry->block) && +// ((unCommittedEntry->generation == committedEntry->generation) || +// ((unCommittedEntry->state & LOG_FREE_NODE) && +// (unCommittedEntry->generation > committedEntry->generation))) && + (unCommittedEntry->sequence > committedEntry->sequence)) + { /* we have common blocks in the required order - + * force the committed transaction into the + * uncommitted state. 
+ */ + xactionTable->xEntry[i].state &= ~XTE_COMMITTED; + xactionTable->numUncommittedEntries++; +#if NSS_DEBUG IS_ENABLED + { + char buffer[40]; + + aprintf(CYAN, MSGNot(" (DEBUG) Xid %d - Converted to uncommitted %s\n"), + xactionTable->xEntry[i].xid.unique, + UTCTime2Str(xactionTable->xactionTime,&buffer[0]) ); + DBG_DebugPrintf(CYAN, + MSGNot(" (DEBUG) Xid %d - Converted to uncommitted %s\n"), + xactionTable->xEntry[i].xid.unique, + UTCTime2Str(xactionTable->xactionTime,&buffer[0]) ); + } + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (LRED, MSGNot(" undo transaction number (CONVERTED) %d "), + xactionTable->xEntry[i].xid.unique) ); + { + char buffer[40]; + + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (CYAN,MSGNot("%s\n"),UTCTime2Str(xactionTable->xactionTime,&buffer[0])) ); + } + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (LBLUE,MSGNot("**** COMMON BLOCK NUMBER FOUND **** xid=%d comparing to xid=%d blk=%d\n"), + xactionTable->xEntry[xactionLoc].xid.unique, + xactionTable->xEntry[i].xid.unique, committedEntry->block) ); +#endif + if (pushUncommittedEntry(&uncommittedTable, + xactionTable->xEntry[i].xid) != zOK) + { /* the table is full */ + zASSERT("Too many uncommitted entries"==0); + goto memErrorClose; + } + goto next; + } + } + } + } +next: + continue; + } + if (xactionTable->xEntry[xactionLoc].state & XTE_COMPLETED) + { /* Add XID to table of XIDs that will be undone in PASS 5 */ + if (addUndoEntry(undoTable, xactionTable->xEntry[xactionLoc].xid) + != zOK) + { /* the table is full */ + zASSERT("Too many undo entries"==0); + goto memErrorClose; + } + removeXactEntry(xactionTable, &blockTable, xactionLoc); + } + } + } + ZLOG_RecoveryClose(&key); +#if NSS_DEBUG IS_ENABLED + aprintf(CYAN,MSGNot(" ** (DEBUG) %u transaction(s) to undo\n"), undoTable->numEntries); + DBG_DebugPrintf(CYAN,MSGNot(" ** %u transaction(s) to undo\n"), undoTable->numEntries); + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (LGREEN, MSGNot("End of PASS 2 - undo analysis pass. 
Num to undo=%d\n"),undoTable->numEntries)); + for (i = 0; i < undoTable->numEntries; i++) + { +// aprintf(CYAN, MSGNot(" (DEBUG) Transaction %d - Uncommitted\n"), undoTable->xids[i].unique); + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (LGREEN, MSGNot(" undo transaction number %d\n"), undoTable->xids[i].unique)); + } + if (xactionTable->numEntries > 0) + { + DEBUG_PRINTF(TXACTION, DBG_NOINDENT, + (LGREEN, MSGNot("** Transactions still in process at end (xaction table) **\n"))); + for (i = 0; i < xactionTable->numEntries; i++) + { + DEBUG_PRINTF( TXACTION, DBG_NOINDENT, + (LGREEN, MSGNot(" Entry [%d] unique - %d state - 0x%x\n"), + i, xactionTable->xEntry[i].xid.unique, + xactionTable->xEntry[i].state) ); +// if (!(xactionTable->xEntry[i].state & XTE_COMPLETED)) +// { +// DEBUG_PRINTF(TXACTION, DBG_NOINDENT, +// (LGREEN, MSGNot(" Entry [%d]=%d\n"), i, xactionTable->xEntry[i].xid.unique)); +// } + } + } +#endif + freeUncommittedTable(&uncommittedTable); + freeBlockTable(&blockTable); + RTN_STATUS(zOK); + +memErrorClose: + SetErrno(genMsg, zERR_NO_MEMORY); +errorClose: + ZLOG_RecoveryClose( &key); + freeUncommittedTable(&uncommittedTable); +freeBlock: + freeBlockTable(&blockTable); +error: + RTN_STATUS(zFAILURE); +} + +/************************************************************************* + ************************************************************************* + * + * Main routine for recovery + * + * Runs recovery PASS 1 through PASS 5 in order. mode is tested for + * VOLMODE_VERBOSE, which enables the summary output printed with + * aprintf. Returns zOK on success and zFAILURE otherwise. 
+ * + ************************************************************************* + *************************************************************************/ +STATUS XactionRecover ( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + NINT mode) +{ + ZlogRecoveryKey_s key; + STATUS status; + XRecovery_f func; + XactTable_s xactTable; /* Table of uncommitted transactions */ + XfreeTable_s freeTable; /* Table of blocks freed in log */ + UndoTable_s undoTable; /* Table of transactions to be undone */ + NINT numUndos = 0; + NINT numRedos = 0; + NINT numRealRedos = 0; + NINT numRealUndos 
= 0; + NINT location; + BlockInfo_s *blkInfo; + NINT i; +#if NSS_DEBUG IS_ENABLED + Time_t firstLRTime=0; /* The time of the first LR in + * the NORMAL area of the ZLOG file */ + Time_t lastLRTime=0; /* The time of the last LR in ZLOG file */ + Time_t currentLRTime; /* The time of the current LR */ + BOOL haveAsserted = FALSE; +#endif + NINT excessUndos = FALSE; /* When TRUE PASS 2 was + * unable to find a point where there are no + * open uncommitted transactions before + * hitting the last checkpoint. When this + * happens we will do PASS 3 and PASS 4, but + * not PASS 5. We then return an error + * so ZREPAIR will be run. + */ + + ENTER(TXACTION, XactionRecover); + + /* Normally, it is EVIL to clear errNo. We do here to save + * stack space for a genMSg and because no one should be calling + * this as cleanup code. + */ + zASSERT( GetErrno( genMsg ) == zOK ); + ClearErrno( genMsg ); + if (initXactTable(&xactTable) != zOK) + { + SetErrno(genMsg, zERR_NO_MEMORY); + goto errorReturn; + } + if (initXfreeTable(&freeTable) != zOK) + { + SetErrno(genMsg, zERR_NO_MEMORY); + goto freeXact; + } + if (initUndoTable(&undoTable) != zOK) + { + SetErrno(genMsg, zERR_NO_MEMORY); + goto freeXfree; + } + + /* + * PASS 1 of Recovery: ANALYSIS: Figure out the transactions + * that didn't commit. + * + * There are three types of transactions in the ZLOG file. + * + * Type 1 have only a Start record. + * Type 2 have only an End record. + * Type 3 have a Start record and an End record. + * + * Type 1 - Uncommitted before crash + * Type 2 - Committed across a checkpoint + * Type 3 - Committed + * + * PASS 1 finds all the transactions of type 1. These are + * the ONLY transactions left in the xactTable[] when the pass + * is done. Note that Type 2 transactions never get placed into + * the xactTable[] in PASS 1. 
+ * + */ +#if NSS_DEBUG IS_ENABLED +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 1\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 1***\n")); +#endif + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_NEXT ) == zOK) + { + pool->zp_stats.pass1++; +#if NSS_DEBUG IS_ENABLED + lastLRTime = ZLOG_RecoveryTimeGet( &key ); + if (firstLRTime == 0) + { + firstLRTime = lastLRTime; + } +#endif + if (key.ZRK_ZfsXasRecovery->ZXR_TransactionState & XAS_XR_TS_START) + { /* found a starting log record - insert in the transaction table */ + if ( addXactEntry(&xactTable, 0, + key.ZRK_ZfsXasRecovery->ZXR_LocalXid, +#if NSS_DEBUG IS_ENABLED + &key, +#endif + &location) != zOK ) + { /* error */ + SetErrno(genMsg, zERR_NO_MEMORY); + goto errClose; + } + } + if (key.ZRK_ZfsXasRecovery->ZXR_TransactionState & XAS_XR_TS_END) + { /* found a end xaction log record - remove from the transaction table */ + location = findXactEntry(&xactTable, + key.ZRK_ZfsXasRecovery->ZXR_LocalXid); + if (location != -1) + { /* found an entry */ + removeXactEntry(&xactTable, NULL, location); + } +#if NSS_DEBUG IS_ENABLED + if (location == -1) + { + char buffer[40]; + + DBG_DebugPrintf(LRED, MSGNot("TYPE 2 Transaction: %lu %s\n"), + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique, + UTCTime2Str( lastLRTime ,&buffer[0]) ); + } +#endif + } + } + if ( mode & VOLMODE_VERBOSE ) + { + aprintf(NSS_POOL_COLOR,MSGNot(" ** %u uncommitted transaction(s)\n"), + xactTable.numEntries ); + } +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN,MSGNot(" ** %u uncommitted transaction(s)\n"), xactTable.numEntries); +#endif + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + +#if NSS_DEBUG IS_ENABLED + { + char buffer[40]; + + aprintf(CYAN, MSGNot(" (DEBUG) First transaction start time is %s\n"), + UTCTime2Str(firstLRTime,&buffer[0]) ); + DBG_DebugPrintf(CYAN, MSGNot("First 
transaction start time is %s\n"), + UTCTime2Str(firstLRTime,&buffer[0]) ); + aprintf(CYAN, MSGNot(" (DEBUG) Last transaction start time is %s\n"), + UTCTime2Str(lastLRTime,&buffer[0]) ); + DBG_DebugPrintf(CYAN, MSGNot("Last transaction start time is %s\n"), + UTCTime2Str(lastLRTime,&buffer[0]) ); + } +#endif + + /* + * PASS 2 of Recovery: ANALYSIS: Figure out which transactions need + * to be undone. + */ +#if NSS_DEBUG IS_ENABLED +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 2\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 2***\n")); +#endif + if (undoAnalysis(genMsg, pool, &xactTable, &undoTable) != zOK) + { /* error */ + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + excessUndos = TRUE; + ClearErrno(genMsg); + } + + /* + * PASS 3 of Recovery: collect the blocks that will be freed + * + * This pass is needed because if we TOSS a buffer because + * it gets deleted then home may be signaled even though + * other transactions had marked the buffer dirty. These + * other transactions may not be happy if we tell them to + * REDO because the LSN may not match because we did not + * write the buffer before moving home. + * + * Currently I do not believe that any early tosses are occurring + * because the TOSS (and delete) are being delayed until the + * 4th checkpoint occurs. See XfreePoolBlks(). 
+ */ +#if NSS_DEBUG IS_ENABLED +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 3\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 3 ***\n")); +#endif + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_NEXT ) == zOK) + { + pool->zp_stats.pass3++; + if (key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount != 0) + { + status = recoverFreedBlocks( genMsg, &freeTable, + key.ZRK_ZfsXasRecovery); + if ( status != zOK ) + { + break; + } + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + + /* + * Pass 4 of Recovery: REDO : Call redo function for each log + * record. Mark the blocks that should not be redone by filling + * them with zero. + */ +#if NSS_DEBUG IS_ENABLED +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 4\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 4 ***\n")); +#endif + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_NEXT) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + + pool->zp_stats.func[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].redo++; + pool->zp_stats.pass4++; + ++numRedos; + setReplayState(&freeTable, key.ZRK_ZfsXasRecovery); /* used to be called markFreedBlocks */ +// markFreedBlocks( &freeTable, key.ZRK_ZfsXasRecovery); +#if NSS_DEBUG IS_ENABLED + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + DBG_DebugPrintf(LGREEN, MSGNot("R: %d xid:%x"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + DBG_DebugPrintf(CYAN, MSGNot(" b%d=%x(%x,%x)"), i+1, blkInfo[i].blkNum, + blkInfo[i].logState, blkInfo[i].replayState); + } +// DBG_DebugPrintf(LRED, MSGNot("\n")); +#endif + pool->zp_stats.redid = 0; + func = 
XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].redo; + status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_REDO); +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("\n")); /* Redo/undo routines print P(rocessed) or S(kip) from _ALREADY done macro */ +#endif + if (status != zOK) + { + goto errClose; + } + if ( pool->zp_stats.redid > 0 ) + { + ++numRealRedos; + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + +#if NSS_DEBUG IS_ENABLED +#if 0 + + // This may need a special PASS 3a that creates a freeTable[] that + // notes that all transactions will be undone. This + // is not needed now because we do not TOSS a deleted buffer at + // commit time. +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 4a\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 4a ***\n")); + + ZLOG_RecoveryOpen(ZLOG_RECOVERY_END | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_PREVIOUS) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + + setReplayState(&freeTable, key.ZRK_ZfsXasRecovery); /* used to be called markFreedBlocks */ +#if NSS_DEBUG IS_ENABLED + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + DBG_DebugPrintf(LRED, MSGNot("PASS 4a: f:%d xid:%x"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + DBG_DebugPrintf(CYAN, MSGNot(" b%d=%x(%x,%x)"), i+1, blkInfo[i].blkNum, + blkInfo[i].logState, blkInfo[i].replayState); + } + DBG_DebugPrintf(LGREEN, MSGNot("\n")); +#endif + func = XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].undo; + status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_UNDO); + if (status != zOK) + { + goto errClose; + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto 
error; + } + ClearErrno(genMsg); +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 4b\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 4b ***\n")); + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_NEXT) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + setReplayState(&freeTable, key.ZRK_ZfsXasRecovery); /* used to be called markFreedBlocks */ +#if NSS_DEBUG IS_ENABLED + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + DBG_DebugPrintf(LGREEN, MSGNot("PASS 4b: f:%d xid:%x"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + DBG_DebugPrintf(CYAN, MSGNot(" b%d=%x(%x,%x)"), i+1, blkInfo[i].blkNum, + blkInfo[i].logState, blkInfo[i].replayState); + } + DBG_DebugPrintf(LGREEN, MSGNot("\n")); +#endif + func = XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].redo; + status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_REDO); + if (status != zOK) + { + goto errClose; + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); +#endif +#endif /* NSS_DEBUG (PASS 4a and 4b) */ + + /* + * Pass 5 of Recovery: UNDO : Call undo functions. + */ +#if NSS_DEBUG IS_ENABLED +// aprintf(LGREEN, MSGNot(" (DEBUG) PASS 5\n") ); + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 5 ***\n")); +#endif + if ( excessUndos ) + { + SetErrno( genMsg, zERR_RECOVERY_TOO_MANY_UNCOMMITS ); + goto error; + } + /* FixFixFix6 - We could track numEntries as we hit the START + * record of the transactions being UNDOne. Then when we hit + * zero within the loop below we can stop Pass 5. 
+ */ + if ( undoTable.numEntries > 0 ) + { + ZLOG_RecoveryOpen(ZLOG_RECOVERY_END | ZLOG_RECOVERY_EXPANDED, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_PREVIOUS) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + + if (findUndoEntry(&undoTable, key.ZRK_ZfsXasRecovery->ZXR_LocalXid) != -1) + { /* if this transaction is being undone ...*/ + pool->zp_stats.func[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].undo++; + pool->zp_stats.pass5++; + ++numUndos; + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + blkInfo[i].replayState = 0; /* Block should be replayed */ + } +#if NSS_DEBUG IS_ENABLED + currentLRTime = ZLOG_RecoveryTimeGet( &key ); + /* This ASSERT just says that we are UNDOing way back in time */ + if ( !haveAsserted ) + { + zASSERT( (currentLRTime+15) >= lastLRTime ); + haveAsserted = TRUE; + } + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + DBG_DebugPrintf(LRED, MSGNot("U: %d xid:%x"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + DBG_DebugPrintf(CYAN, MSGNot(" b%d=%x(%x,%x)"), i+1, blkInfo[i].blkNum, + blkInfo[i].logState, blkInfo[i].replayState); + } +// DBG_DebugPrintf(LRED, MSGNot("\n")); +#endif + pool->zp_stats.redid = 0; + func = XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].undo; + status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_UNDO); +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("\n")); +#endif + if (status != zOK) + { + goto errClose; + } + if ( pool->zp_stats.redid > 0 ) + { + ++numRealUndos; + } + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + } +#if NSS_DEBUG IS_ENABLED + else + { 
+ DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 5 (skipped no UNDO entries) ***\n")); + } +#endif + freeUndoTable(&undoTable); + freeXfreeTable(&freeTable); + freeXactTable(&xactTable); + + X_LATCH( &pool->ZFSPOOLbeastLatch); + cacheFlushMyCacheBufs( &pool->ZFSPOOLmycache); + UNX_LATCH( &pool->ZFSPOOLbeastLatch); + defaultFlushWait( &pool->ZFSPOOLmycache.agent); + cacheTossAll( &pool->ZFSPOOLmycache); + + + +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot(" ** %d(%d) Redo(s), %d(%d) Undo(s)\n"), + numRedos, numRealRedos, numUndos, numRealUndos ); + aprintf(CYAN, MSGNot(" ** (DEBUG) %d Redo(s), %d Undo(s)\n"), + numRedos, numUndos ); +#endif + if ( mode & VOLMODE_VERBOSE ) + { + aprintf(NSS_POOL_COLOR, MSG(" ** %d Redo(s), %d Undo(s)\n", 861), + numRealRedos, numRealUndos ); + } + /* + * This code is here just to let FTEST report 'useful' information + * when at demos. + */ + { + ZlogBeast_s *zlog; + + zASSERT( pool != NULL ); + zlog = pool->zfsLogBeast; + zASSERT( zlog != NULL ); + zlog->ZLB_Ftest.ZRI_RedoMax = numRedos; + zlog->ZLB_Ftest.ZRI_RedoActual = numRealRedos; + zlog->ZLB_Ftest.ZRI_UndoMax = numUndos; + zlog->ZLB_Ftest.ZRI_UndoActual = numRealUndos; + + } + + RTN_STATUS(zOK); + +errClose: + ZLOG_RecoveryClose( &key); +error: + /* Flush what we were able to do. 
*/ + X_LATCH( &pool->ZFSPOOLbeastLatch); + cacheFlushMyCacheBufs( &pool->ZFSPOOLmycache); + UNX_LATCH( &pool->ZFSPOOLbeastLatch); + defaultFlushWait( &pool->ZFSPOOLmycache.agent); + cacheTossAll( &pool->ZFSPOOLmycache); + + freeUndoTable(&undoTable); +freeXfree: + freeXfreeTable(&freeTable); +freeXact: + freeXactTable(&xactTable); +errorReturn: + RTN_STATUS(zFAILURE); +} + +/************************************************************************* + ************************************************************************* + * + * Main routine for recovery (Logical Undo version Does Passes 1 to 6 + * Pass 7 done later in XactionRecoverLogicalUndo) + * + ************************************************************************* + *************************************************************************/ +STATUS XactionRecoverLogicalUndoPass1To6 ( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + NINT mode) +{ + ZlogBeast_s *zlogBeast = pool->zfsLogBeast; + ZlogRecoveryKey_s key; + STATUS status; + XRecovery_f func; + XfreeTable_s freeTable; /* Table of blocks freed in log */ + UndoTable_s undoTable; /* Table of transactions to be undone */ + NINT numUndos = 0; + NINT numRedos = 0; + NINT numLogicalUndos = 0; + NINT numRealRedos = 0; + NINT numRealUndos = 0; + NINT location; + BlockInfo_s *blkInfo; + NINT i; +#if NSS_DEBUG IS_ENABLED + Time_t firstLRTime=0; /* The time of the first LR in + * the NORMAL area of the ZLOG file */ + Time_t lastLRTime=0; /* The time of the last LR in ZLOG file */ + Time_t currentLRTime; /* The time of the current LR */ +#endif + + ENTER(TXACTION, XactionRecover); + + /* Normally, it is EVIL to clear errNo. We do here to save + * stack space for a genMSg and because no one should be calling + * this as cleanup code. 
+ */ + zASSERT( GetErrno( genMsg ) == zOK ); + ClearErrno( genMsg ); + + if (initXfreeTable(&freeTable) != zOK) + { + SetErrno(genMsg, zERR_NO_MEMORY); + } + if (initUndoTable(&undoTable) != zOK) + { + SetErrno(genMsg, zERR_NO_MEMORY); + goto freeXfree; + } + + /* + * PASS 1 of Recovery: ANALYSIS: Figure out the transactions + * that didn't commit. + * + * There are three types of transactions in the ZLOG file. + * + * Type 1 have a Start record. + * Type 2 have a End record. + * Type 3 have a Start record and a End record. + * + * Type 1 - Uncommitted before crash + * Type 2 - Committed across a checkpoint + * Type 3 - Committed + * + * PASS 1 finds all the transactions of type 1. These are + * the ONLY transactions left in the undoTable[] when the pass + * is done. Note that Type 2 transactions never get placed into + * the undoTable[] in PASS 1. + * + * If we are doing Compensation Records, then all Type 1 and + * Type 3 records will be left in the undoTable when the Pass is + * is done. If we are processing normal (non-compensation Records) + * only Type 1 records will be left in the undoTable when done. + * Note - When doing Comp Records we UNDO all of them (even + * committed ones). We do this because we will replay the + * logical UNDOs that created the comps. + * + */ + /* + * PASS 2 of Recovery: ANALYSIS: Figure out which transactions need + * to be undone. With Logical undo's this pass no longer needs to + * be done. All the transactions left in the undoTable by Pass 1 + * will be undone. + */ + /* + * PASS 3 of Recovery: collect the blocks that will be freed + * + * This pass is needed because if we TOSS a buffer because + * it gets deleted then home may be signaled even thought + * other transactions had marked the buffer dirty. These + * other transactions may not be happy if we tell them to + * REDO because the LSN may not match because we did not + * write the buffer before moving home. 
+ * + * Currently I do not believe that any early tosses are occuring + * because the TOSS (and delete) are being delayed until the + * 4th checkpoint occurs. See XfreePoolBlks(). + * + * This pass is now being done with Pass 1. + */ +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 1 2 3***\n")); +#endif + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_NEXT ) == zOK) + { + pool->zp_stats.pass1++; +#if NSS_DEBUG IS_ENABLED + lastLRTime = ZLOG_RecoveryTimeGet( &key ); + if (firstLRTime == 0) + { + firstLRTime = lastLRTime; + } +#endif + if (key.ZRK_ZfsXasRecovery->ZXR_TransactionState & XAS_XR_TS_START) + { /* found a starting log record - insert in the transaction table */ + if ( addUndoEntry(&undoTable, + key.ZRK_ZfsXasRecovery->ZXR_LocalXid) != zOK ) + { /* error */ + SetErrno(genMsg, zERR_NO_MEMORY); + goto errClose; + } + } + if (key.ZRK_ZfsXasRecovery->ZXR_TransactionState & XAS_XR_TS_END) + { + /* found a end xaction log record, remove from the transaction + * table. We NEVER undo committed transactions. 
*/ + + location = findUndoEntry(&undoTable, + key.ZRK_ZfsXasRecovery->ZXR_LocalXid); + if (location != -1) + { /* found an entry */ + removeUndoEntry(&undoTable, location); + } +#if NSS_DEBUG IS_ENABLED + if (location == -1) + { + char buffer[40]; + + DBG_DebugPrintf(LRED, MSGNot("TYPE 2 Transaction: %lu %s\n"), + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique, + UTCTime2Str( lastLRTime ,&buffer[0]) ); + } +#endif + } + pool->zp_stats.pass3++; + if (key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount != 0) + { + status = recoverFreedBlocks( genMsg, &freeTable, + key.ZRK_ZfsXasRecovery); + if ( status != zOK ) + { + break; + } + } + } + if ( mode & VOLMODE_VERBOSE ) + { + aprintf(NSS_POOL_COLOR, + MSGNot(" ** %u uncommitted transaction(s)\n"), + undoTable.numEntries ); + } +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, + MSGNot(" ** %u uncommitted transaction(s)\n"), undoTable.numEntries); +#endif + + + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + +#if NSS_DEBUG IS_ENABLED + { + char buffer[40]; + + aprintf(CYAN, + MSGNot(" (DEBUG) First transaction start time is %s\n"), + UTCTime2Str(firstLRTime,&buffer[0]) ); + DBG_DebugPrintf(CYAN, MSGNot("First transaction start time is %s\n"), + UTCTime2Str(firstLRTime,&buffer[0]) ); + aprintf(CYAN, + MSGNot(" (DEBUG) Last transaction start time is %s\n"), + UTCTime2Str(lastLRTime,&buffer[0]) ); + DBG_DebugPrintf(CYAN, MSGNot("Last transaction start time is %s\n"), + UTCTime2Str(lastLRTime,&buffer[0]) ); + } +#endif + + /* + * Pass 4 of Recovery: REDO : Call redo function for each log + * record. Mark the blocks that should not be redone by filling + * them with zero. 
+ */ +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 4 ***\n")); +#endif + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_NORMAL, pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_NEXT) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + + pool->zp_stats.func[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].redo++; + pool->zp_stats.pass4++; + ++numRedos; + /* used to be called markFreedBlocks */ + setReplayState(&freeTable, key.ZRK_ZfsXasRecovery); +#if NSS_DEBUG IS_ENABLED + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + DBG_DebugPrintf(LGREEN, MSGNot("R: %d xid:%x"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + DBG_DebugPrintf(CYAN, MSGNot(" b%d=%x(%x,%x)"), i+1, blkInfo[i].blkNum, + blkInfo[i].logState, blkInfo[i].replayState); + } +#endif + pool->zp_stats.redid = 0; + func = XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].redo; + status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_REDO); +#if NSS_DEBUG IS_ENABLED + /* Redo/undo routines print P(rocessed) or S(kip) from + * _ALREADY done macro */ + DBG_DebugPrintf(LGREEN, MSGNot("\n")); +#endif + if (status != zOK) + { + goto errClose; + } + if ( pool->zp_stats.redid > 0 ) + { + ++numRealRedos; + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + + /* + * Pass 5 of Recovery: UNDO : Call undo functions. + */ +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 5 ***\n")); +#endif + /* FixFixFix6 - We could track numEntries as we hit the START + * record of the transactions being UNDOne. Then when we hit + * zero within the loop below we can stop Pass 5. 
+ */ + if ( undoTable.numEntries > 0 ) + { + ZLOG_RecoveryOpen(ZLOG_RECOVERY_END | ZLOG_RECOVERY_NORMAL, + pool, &key); + while (ZLOG_RecoveryGet(genMsg, &key, ZLOG_RG_PREVIOUS) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + + if (findUndoEntry(&undoTable, + key.ZRK_ZfsXasRecovery->ZXR_LocalXid) != -1) + { /* if this transaction is being undone ...*/ + pool->zp_stats.func[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].undo++; + pool->zp_stats.pass5++; + ++numUndos; + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + blkInfo[i].replayState = 0; /* Block should be replayed */ + } +#if NSS_DEBUG IS_ENABLED + currentLRTime = ZLOG_RecoveryTimeGet( &key ); + /*This ASSERT just says that we are UNDOing way back in time */ + zASSERT( (currentLRTime+15) >= lastLRTime ); + blkInfo = ZLOG_START_OF_POOL_BLOCKS(key.ZRK_ZfsXasRecovery); + DBG_DebugPrintf(LRED, MSGNot("U: %d xid:%x"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); + for (i = 0; i < key.ZRK_ZfsXasRecovery->ZXR_PoolBlockCount; ++i) + { /* for each block in the log record */ + DBG_DebugPrintf(CYAN, MSGNot(" b%d=%x(%x,%x)"), i+1, + blkInfo[i].blkNum, blkInfo[i].logState, + blkInfo[i].replayState); + } +#endif + pool->zp_stats.redid = 0; + func = XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].undo; + status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_UNDO); +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("\n")); +#endif + if (status == zX_LOGICAL) + { +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LRED, MSGNot( + "UNDO copied: f:%d xid:%x\n"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); +#endif + ZLOG_CopyLogicalUndoRecord(zlogBeast, + key.ZRK_ZfsXasRecovery); + numLogicalUndos++; + status = zOK; + } + if (status != zOK) + { + 
goto errClose; + } + /* If we are doing a UNDO insert of the free tree, then we + * need to put the blocks on the special list, so that we + * don't reallocate them during other logical undo operations + * and the logical undo of this insert is the one that + * deletes them from the free tree. + */ + if (key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex == + XFUNC_FXLOG_L_INSERT) + { + undoFXinsertLogicalHoldBlks(pool, key.ZRK_ZfsXasRecovery); + } + if ( pool->zp_stats.redid > 0 ) + { + ++numRealUndos; + } + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + } +#if NSS_DEBUG IS_ENABLED + else + { + DBG_DebugPrintf(LGREEN, + MSGNot("*** PASS 5 (skipped no UNDO entries) ***\n")); + } +#endif + + /* + * Pass 6 of Recovery: Copy Previous Pass 7 records. + * Greg likes to say Pre-crash Pass 5 records. + */ + if (zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn != 0) + { +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 6 ***\n")); +#endif + zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn != 0); +// zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum != 0); + zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn != 0); +// zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum != 0); + zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn != + zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn); + /* We were in compensation records part of the log when we crashed. + * These records were created while doing logical undo(Pass 7). + * We have just undone in Pass 5 all these comp records. + * Now we need to recopy the logical undo records from the normal + * redo/undo log processing to the recovery active pointer location, so + * the logical undo operation can be redone. + */ + + ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_LOGICAL, + pool, &key); + /* This only gets the logical undo records between + * LUP and COMP pointers. 
(ASSERT that are only tagged logical + * records between the 2 pointers) + */ + while (ZLOG_RecoveryGetLogicalOnly(genMsg, &key, ZLOG_RG_NEXT) == zOK) + { + zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX); + + if (key.ZRK_ZfsXasRecovery->ZXR_Lsn <= zlogBeast->ZLB_RecoveryCompensationPointerLsn) + { + /* This logical undo operation was completed. The transaction + * started by this record was committed. Redo pass above + * ensured that the operation is complete. Since the xaction + * was committed it cannot be undone. So this record will + * not be copied. + */ +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LRED, MSGNot( + "LOGICAL UNDO copy skipped: f:%d xid:%x\n"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); +#endif + continue; + } +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LRED, MSGNot( + "LOGICAL UNDO copied: f:%d xid:%x\n"), + key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex, + (unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique); +#endif + ZLOG_CopyLogicalUndoRecord(zlogBeast, key.ZRK_ZfsXasRecovery); + numLogicalUndos++; + + /* If we are doing a UNDO insert of the free tree, then we + * need to put the blocks on the special list + */ + if (key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex == + XFUNC_FXLOG_L_INSERT) + { + undoFXinsertLogicalHoldBlks(pool, key.ZRK_ZfsXasRecovery); + } + } + ZLOG_RecoveryClose( &key); + if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS) + { + goto error; + } + ClearErrno(genMsg); + } +#if NSS_DEBUG IS_ENABLED + else + { + DBG_DebugPrintf(LGREEN, MSGNot( + "*** PASS 6 (skipped no previous L UNDO entries to copy) ***\n")); + } +#endif + + X_LATCH( &pool->ZFSPOOLbeastLatch); + cacheFlushMyCacheBufs( &pool->ZFSPOOLmycache); + UNX_LATCH( &pool->ZFSPOOLbeastLatch); + defaultFlushWait( &pool->ZFSPOOLmycache.agent); + cacheTossAll( &pool->ZFSPOOLmycache); + + /* + * CHECKPOINT 1 (physical checkpoint is taken in ZLOG) + */ + if 
(zlogBeast->ZLB_RecoveryInitialActivePointerLsn != + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn) + { +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 7 is needed ***\n")); +#endif + zlogBeast->ZLB_State |= ZLOG_ZB_S_NEEDTODO_COMPENSATION; + } +#if NSS_DEBUG IS_ENABLED + else + { + DBG_DebugPrintf(LGREEN, MSGNot( + "*** PASS 7 not needed (skipped no L UNDO entries) ***\n")); + } +#endif + + freeUndoTable(&undoTable); + freeXfreeTable(&freeTable); + +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LGREEN, + MSGNot(" ** %d(%d) Redo(s), %d(%d) Undo(s), %d Logical Undo(s)\n"), + numRedos, numRealRedos, numUndos, numRealUndos, numLogicalUndos ); + aprintf(CYAN, + MSGNot(" ** (DEBUG) %d Redo(s), %d Undo(s), %d Logical Undo(s)\n"), + numRedos, numUndos, numLogicalUndos ); +#endif + if ( mode & VOLMODE_VERBOSE ) + { + aprintf(NSS_POOL_COLOR, + MSG(" ** %d Redo(s), %d Undo(s), %d Logical Undo(s)\n", 431), + numRealRedos, numRealUndos, numLogicalUndos ); + } + /* + * This code is here just to let FTEST report 'useful' information + * when at demos. + */ + { + ZlogBeast_s *zlog; + + zASSERT( pool != NULL ); + zlog = pool->zfsLogBeast; + zASSERT( zlog != NULL ); + zlog->ZLB_Ftest.ZRI_RedoMax = numRedos; + zlog->ZLB_Ftest.ZRI_RedoActual = numRealRedos; + zlog->ZLB_Ftest.ZRI_UndoMax = numUndos; + zlog->ZLB_Ftest.ZRI_UndoActual = numRealUndos; + + } + + RTN_STATUS(zOK); + +errClose: + ZLOG_RecoveryClose( &key); +error: + /* Flush want we were able to do. 
*/
+	X_LATCH( &pool->ZFSPOOLbeastLatch);
+	cacheFlushMyCacheBufs( &pool->ZFSPOOLmycache);
+	UNX_LATCH( &pool->ZFSPOOLbeastLatch);
+	defaultFlushWait( &pool->ZFSPOOLmycache.agent);
+	cacheTossAll( &pool->ZFSPOOLmycache);
+
+	freeUndoTable(&undoTable);
+freeXfree:
+	freeXfreeTable(&freeTable);
+	RTN_STATUS(zFAILURE);
+}
+
+/*************************************************************************
+ *
+ * This routine calls the functions to do the logical undo pass of
+ * recovery (Pass 7).
+ *
+ * genMsg - general message block.  Its errno decides whether the record
+ *          scan ended normally (zERR_ZLOG_NO_MORE_RECORDS); on failure it
+ *          is forced to zERR_GENERIC_NSS_ERROR if it is still zOK.
+ * pool   - pool whose zfsLogBeast is processed.
+ *
+ * Returns zOK when the pass is not needed (ZLOG_ZB_S_NEEDTODO_COMPENSATION
+ * is clear) or when it completes.  Otherwise returns zFAILURE, the status
+ * of the undo routine that failed, or the first non-zOK status returned
+ * by ZFSPOOL_CheckpointTake.
+ *
+ * On any failure the error is recorded in ZLB_RedoUndoStatus /
+ * ZLB_RedoUndoStatusSetter and ZLB_DebugState is set to
+ * ZLOG_ZB_DS_RECOVERY_ERROR.
+ *
+ *************************************************************************/
+STATUS XactionRecoverLogicalUndoPass7(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool)
+{
+	STATUS status, tmpStatus;
+	ZlogBeast_s *zlogBeast = pool->zfsLogBeast;
+	ZlogRecoveryKey_s key;
+	NINT i;
+	XRecovery_f func;
+
+	/*
+	 * Pass 7 of Recovery: Logical Undo.
+	 */
+
+	/* This pass will start reading records from the Home pointer/LUP ptr
+	 * and play the logical undo records, creating new redo/undo records
+	 * in the log starting at the Compensation pointer.
+	 */
+
+	/* At this time the log is available to add redo/undo routines
+	 * to it, even while we are reading parts of it.
+	 */
+
+	/* Before we can proceed the super beasts have to loaded and the volume
+	 * data has to be read in.
+	 */
+
+	/* This pass is only needed if there are logical undo records to
+	 * process
+	 */
+	if (!(zlogBeast->ZLB_State & ZLOG_ZB_S_NEEDTODO_COMPENSATION))
+	{
+#if NSS_DEBUG IS_ENABLED
+		DBG_DebugPrintf(LGREEN, MSGNot(
+			"*** PASS 7 (skipped no L UNDO entries) ***\n"));
+#endif
+		return zOK;
+	}
+
+	/* Activation failure skips the deactivate call at returnStatus and
+	 * goes straight to the common failure reporting.
+	 */
+	if (ZFSPOOL_ActivateAllLVsQuasi(genMsg, pool) != zOK)
+	{
+		status = zFAILURE;
+		goto activateFailed;
+	}
+
+	/* State transition: NEEDTODO -> DOING for the duration of the pass. */
+	zlogBeast->ZLB_State &= ~ZLOG_ZB_S_NEEDTODO_COMPENSATION;
+	zlogBeast->ZLB_State |= ZLOG_ZB_S_DOING_COMPENSATION;
+
+
+	zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn != 0);
+//	zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum != 0);
+//	zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum != 0);
+	zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn != 0);
+	zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn !=
+			zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn);
+
+#if NSS_DEBUG IS_ENABLED
+	DBG_DebugPrintf(LGREEN, MSGNot("*** PASS 7 ***\n"));
+#endif
+	X_LATCH( &zlogBeast->ZFSLOGbeastLatch);
+	ZLOG_RecoveryOpen(ZLOG_RECOVERY_BEGIN | ZLOG_RECOVERY_LOGICAL,
+			pool, &key);
+	/* This gets only the logical undo records between
+	 * LUP and COMP pointers. (ASSERT that are only tagged logical
+	 * records between the 2 pointers)
+	 */
+	while (ZLOG_RecoveryGetLogicalOnly(genMsg, &key, ZLOG_RG_NEXT) == zOK)
+	{
+		zASSERT(key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex < XFUNC_MAX);
+
+#if NSS_DEBUG IS_ENABLED
+		DBG_DebugPrintf(LRED, MSGNot("LOGICAL UNDO: f:%d xid:%x\n"),
+			key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex,
+			(unsigned int)key.ZRK_ZfsXasRecovery->ZXR_LocalXid.unique);
+#endif
+		/* The log latch is dropped around the undo call and re-taken
+		 * before the next record is fetched.
+		 */
+		UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch);
+		func = XRecoveryTable[key.ZRK_ZfsXasRecovery->ZXR_FunctionIndex].undo;
+		status = func(genMsg, pool, key.ZRK_ZfsXasRecovery, X_L_UNDO);
+
+		if (status != zOK)
+		{
+			/* NOTE(review): this close runs with the log latch released,
+			 * while the normal-exit close below runs with it held -
+			 * confirm ZLOG_RecoveryClose does not need the latch.
+			 */
+			ZLOG_RecoveryClose( &key);
+			goto returnStatus;
+		}
+		X_LATCH( &zlogBeast->ZFSLOGbeastLatch);
+	}
+	ZLOG_RecoveryClose( &key);
+	UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch);
+
+	/* Any errno other than "no more records" means the scan itself failed. */
+	if (GetErrno(genMsg) != zERR_ZLOG_NO_MORE_RECORDS)
+	{
+		status = zFAILURE;
+		goto returnStatus;
+	}
+	ClearErrno(genMsg);
+	status = zOK;
+
+	/*
+	 * CHECKPOINT 2
+	 *
+	 * Wait until compensation records are written to
+	 * disk (ZFSPOOL_CheckpointTake does this (I.E. CHECKPOINT_CT_S_CLEAN
+	 * forces a flush (and wait) of the ZLOG beast))
+	 */
+	zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn = 0;
+	zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum = 0;
+	zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn = 0;
+	zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum = 0;
+
+	zlogBeast->ZLB_State &= ~ZLOG_ZB_S_DOING_COMPENSATION;
+
+	/* Every checkpoint is attempted; only the first failure is kept. */
+	for (i = 0; i < CHECKPOINT_NUMBER; i++)
+	{
+		tmpStatus = ZFSPOOL_CheckpointTake(genMsg, pool,
+				CHECKPOINT_CT_S_CLEAN|CHECKPOINT_CT_S_L_UNDO);
+		if ((tmpStatus != zOK) && (status == zOK))
+		{
+			status = tmpStatus;
+		}
+	}
+
+returnStatus:
+	ZFSPOOL_DeactivateAllQuasiActiveLVs(pool);
+activateFailed:
+	if (status != zOK)
+	{
+		zlogBeast->ZLB_State &= ~ZLOG_ZB_S_DOING_COMPENSATION;
+		zASSERT(GetErrno(genMsg) != zOK);
+		/* Make sure a failure never leaves the errno at zOK. */
+		if (GetErrno(genMsg) == zOK)
+		{
+			SetErrno(genMsg, zERR_GENERIC_NSS_ERROR);
+		}
+		zlogBeast->ZLB_RedoUndoStatus = GetErrno(genMsg);
+		zlogBeast->ZLB_RedoUndoStatusSetter = GetErrnoSetter(genMsg);
+		errPrintf(WHERE, Module, 1456,
+			MSG("** System verification failed.\n"
+			"You may have to rebuild your volume. If this happens again,\n"
+			"contact your Novell Technical Support Provider.", 432));
+
+		DEBUG_PRINTF(TZLOG, DBG_NOINDENT, (TZLOG_COLOR, MSGNot("Recovery complete - failure.\n")) );
+		zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_ERROR;
+	}
+	return status;
+}
+
+
+/*
+ * Routines and data structures for deleting individual pool blocks.
+ * Used by the various B-tree algorithms for deleting blocks no longer
+ * needed because of shrinks and joins.
+ *
+ * XdeferPoolBlk delays the true deletion of any pool blocks until
+ * after the transaction it is in can no longer be undone. This
+ * is to avoid the dirty read problem where after deleting an
+ * pool block it gets allocated to some other transaction before
+ * the transaction that deleted the pool block has committed.
+ *
+ * FixFixFix6(Randy's comment) (or at least think about) - could just put the
+ * poolBlks in a small array in the transaction structure. 
+ * Typically, we only delete one pool block.
+ *
+ * Set up the system pool of extent detete control structures
+ */
+/*
+ * One deferred pool-block free.  A record is linked through fpb_linkPool
+ * (first on a transaction's ZX_deleteBlkQ, later on the pool's
+ * ZP_deleteBlkQ) and through fpb_linkVolume (the volume's ZV_deleteBlkQ).
+ */
+typedef struct FreedPoolBlk_s
+{
+	DQlink_t		fpb_linkPool;
+	DQlink_t		fpb_linkVolume;
+	Blknum_t		fpb_poolBlk;	/* Pool block whose free is deferred */
+	ZfsVolume_s		*fpb_zfsVol;	/* ZX_zfsVol of the deferring xaction */
+/*	MyCache_s		fpb_myCache;	 * The MyCache that the block was
+									 * on at XdeferPoolBlk() time.
+									 */
+	Lsn_t			fpb_lsn;		/* The LSN for the log record that
+									 * caused the block to be freed.
+									 */
+	ZfsPurgeLogLoc_s fpb_purgeLogLoc;	/* The location in the purge log */
+} FreedPoolBlk_s;
+
+/* Control store from which every FreedPoolBlk_s is obtained (CONTROL_get)
+ * and to which it is returned (CONTROL_FREE).
+ */
+ControlStore_s DeleteControl;
+
+/*************************************************************************
+ *
+ * Start the delayed block free system
+ *
+ * Initializes DeleteControl for FreedPoolBlk_s-sized entries, passing
+ * ZstoreConfig.zfs.numXdeletes as the count argument.
+ * Returns the status reported by CONTROL_Startup.
+ *
+ *************************************************************************/
+STATUS XDEL_Startup (void)
+{
+	STATUS status;
+
+	ENTER(TXACTION, XDEL_Startup);
+
+	status = CONTROL_Startup( &DeleteControl, ZstoreConfig.zfs.numXdeletes,
+								sizeof(FreedPoolBlk_s), NULL);
+
+	RTN_STATUS(status);
+}
+
+/*************************************************************************
+ *
+ * Shutdown the delayed block free system
+ *
+ * Hands DeleteControl to CONTROL_Shutdown; nothing is returned.
+ *
+ *************************************************************************/
+void XDEL_Shutdown (void)
+{
+	ENTER(TXACTION, XDEL_Shutdown);
+
+	CONTROL_Shutdown( &DeleteControl);
+
+	RTN_VOID();
+}
+
+/*************************************************************************
+ *
+ * Add an entry to the transaction delay queue. It is only here till the
+ * current release of the log record. It will then be moved to the
+ * pool delay queue. 
+ *
+ *************************************************************************/
+/*
+ * xaction - transaction that is deleting the block; must have a volume.
+ * poolBlk - pool block number whose free is being deferred.
+ *
+ * The new record captures poolBlk, the transaction's current LSN and its
+ * volume, and is queued on xaction->ZX_deleteBlkQ.
+ */
+void XdeferPoolBlk (
+	ZfsXaction_s *xaction,
+	Blknum_t poolBlk)
+{
+	FreedPoolBlk_s *xdelete;
+	GeneralMsg_s genMsg;
+
+	ENTER(TXACTION, XdeferPoolBlk);
+	COMN_SETUP_GENERAL_MSG_NOSA( &genMsg);
+
+	/* NOTE(review): a NULL result is only caught by zASSERT. */
+	xdelete = CONTROL_get( &DeleteControl);
+	zASSERT(xdelete != NULL);
+
+	DQ_ENQ(&xaction->ZX_deleteBlkQ, xdelete, fpb_linkPool);
+	xdelete->fpb_poolBlk = poolBlk;
+	xdelete->fpb_lsn = xaction->ZX_lsn;
+	zASSERT( xaction->ZX_zfsVol != NULL );
+	xdelete->fpb_zfsVol = xaction->ZX_zfsVol;
+	RTN_VOID();
+}
+
+/*************************************************************************
+ *
+ * Move the deferred pool blocks from the transaction to pool.
+ *
+ * Each record leaves xaction->ZX_deleteBlkQ, is queued on both the pool's
+ * ZP_deleteBlkQ and the volume's ZV_deleteBlkQ, and gets a
+ * PLOG_BLOCK_FREE purge log entry added under this transaction.
+ *
+ *************************************************************************/
+void XqueueDeferredPoolBlks (
+	ZfsXaction_s *xaction)
+{
+	FreedPoolBlk_s *xdelete;
+	PurgeLogMsg_s purgeLogMsg;
+	GeneralMsg_s genMsg;
+
+	ENTER(TXACTION, XqueueDeferredPoolBlks);
+	COMN_SETUP_GENERAL_MSG_NOSA( &genMsg);
+
+//DBG_DebugPrintf(YELLOW, MSGNot("Start deferred block move to pool.\n"));
+	while (DQ_NOT_EMPTY(&xaction->ZX_deleteBlkQ))
+	{
+		ZfsPool_s *zfsPool;
+
+		ZfsVolume_s *zfsVol;
+		/* zfsVol is assigned here but not read again in this loop. */
+		zfsVol = xaction->ZX_zfsVol;
+
+		zfsPool = ZLSS_VOLUME_TO_ZFS_POOL( xaction->ZX_zfsVol );
+		zASSERT(COMN_IsDerivedFrom(zfsPool, zFTYPE_ZLSS_ZFSPOOL));
+		zASSERT( zfsPool != NULL );
+		zASSERT( !(xaction->xstate & XAS_XR_TS_END) );
+		DQ_DEQ_NO_CHECK(&xaction->ZX_deleteBlkQ, xdelete, FreedPoolBlk_s, fpb_linkPool);
+
+		/* fpb_linkPool now serves the pool queue; fpb_linkVolume the volume's. */
+		DQ_ENQ( &zfsPool->ZP_deleteBlkQ, xdelete, fpb_linkPool );
+		DQ_ENQ( &xaction->ZX_zfsVol->ZV_deleteBlkQ, xdelete, fpb_linkVolume );
+		SETUP_BLOCK_FREE_PURGE_LOG(&purgeLogMsg, &xdelete->fpb_purgeLogLoc,
+				xdelete->fpb_poolBlk);
+//DBG_DebugPrintf(YELLOW, MSGNot("Moving block %d to the pool.\n"), xdelete->poolBlk);
+		ZFSVOL_VOL_AddPurgeLogEntry(&genMsg, &xaction->ZX_zfsVol->ZFSVOLvol,
+				PLOG_BLOCK_FREE, &purgeLogMsg, (Xaction_s *)xaction);
+	}
+//DBG_DebugPrintf(YELLOW, MSGNot("End deferred block move.\n"));
+
+	RTN_VOID();
+}
+
+/*************************************************************************
+ *
+ * Process the entries on the delay queue. Only go while the entries
+ * have an lsn that is smaller than the current one.
+ *
+ * pool   - pool whose ZP_deleteBlkQ is drained from the head.
+ * maxLsn - processing stops at the first entry whose fpb_lsn >= maxLsn.
+ *
+ * Each processed entry is freed in its own local transaction: the block
+ * goes back through zfsFreeExtent, its purge log entry is removed, and
+ * the record is returned with CONTROL_FREE.
+ *
+ *************************************************************************/
+void XfreePoolBlks (
+	ZfsPool_s *pool,
+	Lsn_t maxLsn)
+{
+	ZfsXaction_s *xaction;
+	FreedPoolBlk_s *xdelete;
+	GeneralMsg_s genMsg;
+	Extent_s extent;
+	STATUS status;
+	PurgeLogMsg_s purgeLogMsg;
+//	Buffer_s *buffer;
+
+	ENTER(TXACTION, XfreePoolBlks);
+
+	COMN_SETUP_GENERAL_MSG_NOSA( &genMsg);
+
+//DBG_DebugPrintf(LRED, MSGNot("Start freeing deferred blocks.\n"));
+	for (;;)
+	{
+		if (DQ_EMPTY(&pool->ZP_deleteBlkQ))
+		{
+			break;		/* we are done */
+		}
+		else
+		{
+//DBG_DebugPrintf(LRED, MSGNot("Peeking.\n"));
+			DQ_PEEK(&pool->ZP_deleteBlkQ, xdelete, FreedPoolBlk_s, fpb_linkPool);
+			if (xdelete == NULL || xdelete->fpb_lsn >= maxLsn)
+			{
+				break;		/* we are done */
+			}
+//DBG_DebugPrintf(LRED, MSGNot("Dequeuing.\n"));
+			DQ_DEQ_NO_CHECK(&pool->ZP_deleteBlkQ, xdelete, FreedPoolBlk_s, fpb_linkPool);
+			zASSERT( xdelete->fpb_zfsVol != NULL );
+			zASSERT( QMEMBER(&xdelete->fpb_linkVolume) );
+			DQ_RMV(xdelete,fpb_linkVolume);
+			xaction = BeginXLocal(&pool->ZFSPOOLvol, BXL_DEFAULT);
+			extent.lengthOfExtent = 1;
+			extent.poolBlkNum = xdelete->fpb_poolBlk;
+			/* The result is only checked by zASSERT. */
+			status = zfsFreeExtent(&genMsg, xdelete->fpb_zfsVol, &extent, xaction);
+			zASSERT(status == zOK);
+
+			/* FixFixFix6(Future,Performance) - The TOSS of the buffer
+			 * can occur after COMMIT. This was not done because we
+			 * do not track when a transaction COMMITs. Doing the
+			 * TOSS at commit will save us a write if the buffer had
+			 * been dirtied before it was deleted.
+			 */
+
+			/* We do not have to TOSS here because the buffer has
+			 * already been written. I.E. for home to move far
+			 * enough for the delete to occur the buffer must have
+			 * been written (if dirty).
+			 */
+
+			SETUP_BLOCK_FREE_PURGE_LOG(&purgeLogMsg, &xdelete->fpb_purgeLogLoc, 0);
+//DBG_DebugPrintf(LRED, MSGNot("Removing log entry for block %d.\n"), xdelete->poolBlk);
+			ZFSVOL_VOL_RemovePurgeLogEntry(&genMsg, &xdelete->fpb_zfsVol->ZFSVOLvol,
+					PLOG_BLOCK_FREE, &purgeLogMsg, (Xaction_s *)xaction);
+			CONTROL_FREE(xdelete);
+			EndXlocal(xaction);
+		}
+	}
+//DBG_DebugPrintf(LRED, MSGNot("End freeing deferred blocks.\n"));
+	RTN_VOID();
+}
+
+
+/****************************************************************************
+ *** nodeExtent is an extent in the free tree from which requested blks
+ *** will be allocated.
+ ***
+ *** reqExtent is the extent requested by the user with or without a seed
+ *** value, and user specified length.
+ ***
+ *** This routine is called during Alloc Extent after finding a specific seed
+ *** value or during find length match to ensure that the blocks being
+ *** returned as available are not on the list of blocks that cannot yet
+ *** be reassigned.
+ ***
+ *** This routine basically knows how the the free tree allocates blks, and
+ *** uses that knowledge, so BE CAREFUL when modifying this routine and
+ *** zfsAllocExtent subroutines, so that changes match up.
+ ***
+ *** This is necessary to be able to do logical undos. 
+ ****************************************************************************/ +BOOL XACT_blksNotOnFreeList( + ZfsPool_s *pool, + Extent_s *nodeExtent, + Extent_s *requestedExtent) +{ + Blknum_t reqBlk, nodeBlk; + Blknum_t reqLen, nodeLen; + FreeUserDataBlks_s *freeChunk; + + if (DQ_EMPTY(&pool->freeDataBlksList)) + { +#if FREE_DATA_STATS IS_ENABLED + UserFreeStats.listEmpty++; +#endif + return TRUE; + } + +#if FREE_DATA_STATS IS_ENABLED + UserFreeStats.listNotEmpty++; +#endif + + reqBlk = requestedExtent->poolBlkNum; + reqLen = requestedExtent->lengthOfExtent; + nodeBlk = nodeExtent->poolBlkNum; + nodeLen = nodeExtent->lengthOfExtent; + + if ((reqBlk != 0) && (reqBlk > nodeBlk) && (reqBlk < (nodeBlk + nodeLen))) + { + /** A seed value was specified and the seed value is within the + ** extent that is being processed + **/ + nodeBlk = reqBlk; + nodeLen -= (reqBlk - nodeBlk); + } + if (reqLen < nodeLen) + { + nodeLen = reqLen; + } + + DQ_FOREACH(&pool->freeDataBlksList, freeChunk, + FreeUserDataBlks_s, globalListLink) + { +#if FREE_DATA_STATS IS_ENABLED + UserFreeStats.freeExtentsProcessed++; +#endif + if (((freeChunk->freeExt.poolBlkNum + + freeChunk->freeExt.lengthOfExtent) <= nodeBlk) || + (freeChunk->freeExt.poolBlkNum >= (nodeBlk + nodeLen))) + { + continue; + } + else + { +#if FREE_DATA_STATS IS_ENABLED + UserFreeStats.retBlockUnAvail++; +#endif + return FALSE; + } + } + +#if FREE_DATA_STATS IS_ENABLED + UserFreeStats.retBlockAvail++; +#endif + return TRUE; +} + + +/**************************************************************************** + *** This routine is called when a user data blk is freed, before calling + *** zfsFreeExtent. zfsFreeExtent will put these blocks in the free tree. + *** This routine puts the free user data blks on a global list associated + *** with the pool and on a list associated with the transaction. These + *** blks are not reassigned until the transaction ends. This is necessary + *** to be able to do logical undos. 
+ ****************************************************************************/ +STATUS XACT_holdFreeUserDataBlks( + Extent_s *extent, + ZfsXaction_s *xaction) +{ + FreeUserDataBlks_s *free; + ZfsPool_s *zfsPool; + ZfsVolume_s *zfsVol; + ZlssPool_s *zlssPool; + + zfsVol = xaction->ZX_zfsVol; + zfsPool = ZLSS_VOLUME_TO_ZFS_POOL( xaction->ZX_zfsVol ); + zlssPool = (ZlssPool_s *)zfsPool->ZFSPOOLvol.v_pool; + + free = CONTROL_get(&FreeUserDataBlksControl); + + + free->freeExt = *extent; + + DQ_ENQ(&zfsPool->freeDataBlksList, free, globalListLink); + DQ_ENQ(&xaction->freeUserDataBlks, free, xactionLink); + + zlssPool->ZP_UnusableFreeBlkCnt += free->freeExt.lengthOfExtent; + zfsVol->unusableFreeBlkCnt += free->freeExt.lengthOfExtent; + +#if FREE_DATA_STATS IS_ENABLED + UserFreeStats.currentFreeExtents++; + UserFreeStats.totalFreeExtents++; +#endif + +#if NSS_DEBUG IS_ENABLED + xaction->numFreeNonusableBlks += free->freeExt.lengthOfExtent; +#endif + return zOK; + +} + + +/**************************************************************************** + *** This routine is called when the xaction ends. At this we remove all + *** the user data blks that were freed during this xaction from the global + *** list of blks that were freed, but cannot be used until the xaction that + *** freed them has ended. 
+ ****************************************************************************/
+/*
+ * xaction - transaction that is ending; its freeUserDataBlks list is
+ *           drained completely.
+ *
+ * Each record is also unlinked from the pool-wide list, its length is
+ * subtracted from the volume and pool unusable-block counters, and the
+ * record is returned with CONTROL_FREE.  Always returns zOK.
+ */
+STATUS XACT_releaseFreeUserDataBlks(
+	ZfsXaction_s *xaction)
+{
+	FreeUserDataBlks_s *free;
+	ZfsPool_s *zfsPool;
+	ZfsVolume_s *zfsVol;
+	ZlssPool_s *zlssPool;
+
+	zfsVol = xaction->ZX_zfsVol;
+	zfsPool = ZLSS_VOLUME_TO_ZFS_POOL( xaction->ZX_zfsVol );
+	zlssPool = (ZlssPool_s *)zfsPool->ZFSPOOLvol.v_pool;
+
+	while (DQ_NOT_EMPTY(&xaction->freeUserDataBlks))
+	{
+		DQ_DEQ(&xaction->freeUserDataBlks, free,
+				FreeUserDataBlks_s, xactionLink);
+
+#if NSS_DEBUG IS_ENABLED
+		/* Debug builds verify the held extent was not handed out meanwhile. */
+		zASSERT(FT_ExtentIsStillFree(zfsPool, &free->freeExt) == TRUE);
+#endif
+
+		/* Drop it from the pool-wide list as well. */
+		DQ_RMV(free, globalListLink);
+
+		/* The asserts guard the counters against going below zero. */
+		zASSERT(zfsVol->unusableFreeBlkCnt >= free->freeExt.lengthOfExtent);
+		zfsVol->unusableFreeBlkCnt -= free->freeExt.lengthOfExtent;
+
+		zASSERT(zlssPool->ZP_UnusableFreeBlkCnt >= free->freeExt.lengthOfExtent);
+		zlssPool->ZP_UnusableFreeBlkCnt -= free->freeExt.lengthOfExtent;
+
+#if FREE_DATA_STATS IS_ENABLED
+		UserFreeStats.currentFreeExtents--;
+#endif
+
+#if NSS_DEBUG IS_ENABLED
+		zASSERT(xaction->numFreeNonusableBlks >= free->freeExt.lengthOfExtent);
+		xaction->numFreeNonusableBlks -= free->freeExt.lengthOfExtent;
+#endif
+		CONTROL_FREE(free);
+
+	}
+	return zOK;
+}
+
+#if 0
+/*
+ * Cleanup any left over delete blocks from when a pool deactivates.
+ *
+ * Usually, will only be stuff on list if volume is being DISABLED
+ *
+ * Note -
+ * The item being removed from the pool list will also be
+ * the first item on the volume list. We still do a DQ_RMV (versus a
+ * DQ_DEQ) because it is fastest.
+ */
+
+void XDEL_Deactivate(ZfsPool_s *pool)
+
+{
+
+	FreedPoolBlk_s *xdelete;
+
+	zASSERT( pool != NULL );
+	while (CIR_NOT_EMPTY(pool->deleteBlkQ))
+	{
+#if NSS_DEBUG IS_ENABLED
+		zASSERT("Going to dequeue delete block queue. 
Check out in RDEBUG"==NULL);
+#endif
+		CIR_DEQ_NO_CHECK(pool->deleteBlkQ, xdelete, FreedPoolBlk_s, fpb_link);
+		zASSERT( QMEMBER(&xdelete->fpb_linkVolume) );
+		DQ_RMV(xdelete,fpb_linkVolume);
+		zASSERT( junk == xdelete );
+		CONTROL_FREE(xdelete);
+	}
+
+}
+#endif
+
+
+/*
+ * Cleanup any left over delete blocks from when a volume deactivates.
+ *
+ * Usually, will only be stuff on list if volume is being DISABLED.
+ *
+ * Every record on zfsVol->ZV_deleteBlkQ is also unlinked from the pool
+ * queue it sits on (via fpb_linkPool) and returned with CONTROL_FREE.
+ *
+ * Note -
+ * We do not call ZFSVOL_VOL_RemovePurgeLogEntry because we need
+ * to leave the purge log entry in the log so that the next time the
+ * LV is activated the block will be freed in the 'play purge log' code.
+ * In addition, the item being removed from the volume list does
+ * not have to be the first item on the pool's list.
+ */
+
+void XDEL_DeactivateVolume(ZfsVolume_s *zfsVol)
+
+{
+	FreedPoolBlk_s *xdelete;
+
+	zASSERT( zfsVol != NULL );
+	while (DQ_NOT_EMPTY(&zfsVol->ZV_deleteBlkQ))
+	{
+		DQ_DEQ_NO_CHECK(&zfsVol->ZV_deleteBlkQ, xdelete, FreedPoolBlk_s, fpb_linkVolume);
+		zASSERT( QMEMBER(&xdelete->fpb_linkPool) );
+		DQ_RMV(xdelete,fpb_linkPool);
+#if NSS_DEBUG IS_ENABLED
+#ifdef USER_GPACHNER
+		/* NOTE(review): "%0xlx" prints a hex value followed by the
+		 * literal text "lx"; "%0lx" was presumably intended - confirm
+		 * against the width of Blknum_t before changing it.
+		 */
+		DBG_DebugPrintf( CYAN, "Dequeue FreedPoolBlk_s %p which contains info on pool block %0xlx(%ld)",
+			xdelete,xdelete->fpb_poolBlk,xdelete->fpb_poolBlk);
+		aprintf( CYAN, "Dequeue FreedPoolBlk_s %p which contains info on pool block %0xlx(%ld)",
+			xdelete,xdelete->fpb_poolBlk,xdelete->fpb_poolBlk);
+#endif
+#endif
+		CONTROL_FREE(xdelete);
+	}
+
+}
diff --git a/src/nwnss/zlss/z_aes.h b/src/nwnss/zlss/z_aes.h
index 608e45c..f8e39c7 100644
--- a/src/nwnss/zlss/z_aes.h
+++ b/src/nwnss/zlss/z_aes.h
@@ -64,6 +64,12 @@
 #define AES_MAXNR 14
 #define AES_BLOCK_SIZE 16
 
+#if defined(NSS_USERSPACE) && !defined(NSS_ZLSS_U32_DEFINED)
+#include
+typedef uint32_t u32;
+#define NSS_ZLSS_U32_DEFINED 1
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/nwnss/zlss/zfsFileMap.c b/src/nwnss/zlss/zfsFileMap.c
new file mode 100644
index 0000000..9491124
--- 
/dev/null
+++ b/src/nwnss/zlss/zfsFileMap.c
@@ -0,0 +1,6263 @@
+/****************************************************************************
+ |
+ | (C) Copyright 1995-1997 Novell, Inc.
+ | All Rights Reserved.
+ |
+ | This program is free software; you can redistribute it and/or
+ | modify it under the terms of version 2 of the GNU General Public
+ | License as published by the Free Software Foundation.
+ |
+ | This program is distributed in the hope that it will be useful,
+ | but WITHOUT ANY WARRANTY; without even the implied warranty of
+ | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ | GNU General Public License for more details.
+ |
+ | You should have received a copy of the GNU General Public License
+ | along with this program; if not, contact Novell, Inc.
+ |
+ | To contact Novell about this file by physical or electronic mail,
+ | you may find current contact information at www.novell.com
+ |
+ |***************************************************************************
+ |
+ | NetWare Advance File Services (NSS) module
+ |
+ |---------------------------------------------------------------------------
+ |
+ | $Author: gpachner $
+ | $Date: 2007-05-08 05:01:53 +0530 (Tue, 08 May 2007) $
+ |
+ | $RCSfile$
+ | $Revision: 1979 $
+ |
+ |---------------------------------------------------------------------------
+ | This module is used to:
+ | This defines all of the primitive BEASTS inside of PSS
+ +-------------------------------------------------------------------------*/
+#if defined(NSS_USERSPACE)
+#include
+struct block_device;
+#else
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "msgIO.h"
+
+#include "zfs.h"
+#include "comnBeasts.h"
+#include "comnBeastClass.h"
+#include "comnAuthorize.h"
+#include "zParams.h"
+#include "comnPublics.h"
+#include "pssmpk.h"
+#include "fileHandle.h"
+//#include "adminVolume.h"
+#include "nameSpace.h"
+#include "zfsAsyncio.h"
+#include "zlog.h"
+#include "zlssStartup.h"
+#include "zfsXTree.h"
+#include "zfsFileMap.h"
+#include "userTree.h"
+#include "dirQuotas.h"
+
+#if defined(NSS_USERSPACE)
+#ifndef GFP_USER
+#define GFP_USER 0
+#endif
+#ifndef READ
+#define READ 0
+#endif
+#ifndef WRITE
+#define WRITE 1
+#endif
+#undef ClearStatus
+#define ClearStatus(_msg) \
+	do { (_msg)->sys.status = zOK; (_msg)->sys.where = (QUAD)(uintptr_t)WHERE; } while (0)
+#undef SetStatusFromErrno
+#define SetStatusFromErrno(_msg, _genMsg) \
+	do { (_msg)->sys.status = GetErrno((_genMsg)); \
+		(_msg)->sys.where = (QUAD)(uintptr_t)((_genMsg)->errStatusSetter); } while (0)
+#endif
+
+/**************************************************************************
+ * read a block that belongs to the btree
+ *
+ * ZFS_Return runs the finite state machine embedded in the request
+ * (zio->io.fsm).  The MPK NSS lock is asserted on entry.
+ **************************************************************************/
+void ZFS_Return (ZioFmap_s *zio)
+{
+	ASSERT_MPKNSS_LOCK();
+	FSM_RUN( &zio->io.fsm);
+}
+
+/*
+ * FSM step pushed by asyncReadVolBlk ahead of the caller's action.
+ * Applies DOWN_LATCH to the buffer's latch; if the I/O failed, the status
+ * is copied into the buffer's agent, the buffer is released and
+ * aio->buffer is set to NULL so the next step can see the failure.
+ * Then the FSM is run again.
+ */
+void readVolBlkBuffer(Asyncio_s *aio)
+{
+	ASSERT_MPKNSS_LOCK();
+	DOWN_LATCH( &aio->buffer->agent.latch);
+	if (aio->status != zOK)
+	{
+		aio->buffer->agent.status = aio->status;
+		CACHE_RELEASE(aio->buffer);
+		aio->buffer = NULL;
+	}
+	FSM_RUN(&aio->fsm);
+}
+
+/*
+ * Start an asynchronous read of aio->volBlk unless isCached() reports
+ * that no read is needed.
+ *
+ * aio    - request; volBlk must be > 0 (asserted).
+ * action - FSM step to run once the read has finished.
+ *
+ * The buffer comes from asyncCacheAllocBuffer when fileBlk is negative,
+ * volBlk is 0, or the owning beast's ROOTzid is below
+ * zFIRST_ALLOCATABLE_ZID; otherwise from asyncCacheAllocBufferForUserData.
+ */
+void asyncReadVolBlk(Asyncio_s *aio, voidfunc_t action)
+{
+	RootBeast_s *beast;
+
+	ASSERT_MPKNSS_LOCK();
+	zASSERT(aio->volBlk > 0);
+//	zASSERT(aio->volBlk < DEBUG_MAX_FILE_BLK);
+
+	if (!isCached(aio, action))
+	{
+		/* Steps are pushed in reverse order of use: readVolBlkBuffer is
+		 * pushed last so that it sits on top of action.
+		 */
+		FSM_PUSH(&aio->fsm, action);
+		FSM_PUSH(&aio->fsm, readVolBlkBuffer);
+
+		beast = STRUCT(aio->mycache, RootBeast_s, ROOTmycache);
+		if ((aio->fileBlk < 0) || (aio->volBlk == 0) || (beast->ROOTzid < zFIRST_ALLOCATABLE_ZID))
+		{
+			asyncCacheAllocBuffer(aio, ZFSMAL_asyncReadBlk,
+					ZFS_BlockSignalHandler);
+		}
+		else
+		{
+			asyncCacheAllocBufferForUserData(aio, ZFSMAL_asyncReadBlk,
+					ZFS_BlockSignalHandler);
+		}
+	}
+}
+
+
+/****************************************************************************
+ * Search the Direct File Map
+ 
*****************************************************************************/
+/*
+ * fmap    - direct map; dirExt[k].count is used as the file block at which
+ *           extent k ends (exclusive), so dirExt[k-1].count is where it
+ *           starts.  Entry 0 only supplies the starting count.
+ * fileBlk - file block to look up.
+ * seed    - written only when fileBlk lies beyond every extent: the pool
+ *           block just past the last extent.
+ * length  - written only on a hit: blocks from fileBlk to the extent end.
+ * index   - written only on a hit: index of the matching extent.
+ *
+ * Returns the pool block for fileBlk on a hit, else 0.  With numRecs <= 1
+ * nothing is written and 0 is returned.
+ */
+Blknum_t searchDirectMap(
+	Fmap_s *fmap,
+	Blknum_t fileBlk,
+	Blknum_t *seed,
+	Blknum_t *length,
+	NINT *index)
+{
+	NINT i;
+
+	ASSERT_MPKNSS_LOCK();
+	if (fmap->numRecs <= 1)
+		return 0;
+
+	for (i=1; i < fmap->numRecs; i++)
+	{
+		if (fmap->dirExt[i].count > fileBlk)
+		{
+			*index = i;
+			*length = fmap->dirExt[i].count - fileBlk;
+			return (fmap->dirExt[i].poolBlk +
+					(fileBlk - fmap->dirExt[i-1].count));
+		}
+	}
+	/* Here i == numRecs (>= 2), so i-1 is the last extent and i-2 exists. */
+	*seed = fmap->dirExt[i-1].poolBlk + fmap->dirExt[i-1].count
+			- fmap->dirExt[i-2].count;
+	return 0;
+}
+
+
+/****************************************************************************
+ * searchLeaf and searchBranch:
+ * Search the leaf and branch nodes of a btree
+ *
+ * Both look for the extent i with
+ *     extent[i-1].count <= fileBlk < extent[i].count
+ * by repeatedly halving a window of numRecs entries that starts at offset.
+ *
+ * searchLeaf stores i in *index and the blocks left in that extent in
+ * *length, and returns the pool block for fileBlk, or 0 when the extent's
+ * poolBlk is 0.  If fileBlk is at or beyond the last count it stores
+ * numRecs / 0 and returns 0.
+ *
+ * NOTE(review): a node with head.numRecs == 0 would index extent[-1] in
+ * the guard below - presumably such nodes never reach here; confirm.
+ *****************************************************************************/
+Blknum_t searchLeaf(
+	FmapNode_s *node,
+	Blknum_t fileBlk,
+	Blknum_t *length,
+	NINT *index)
+{
+	Blknum_t numRecs;
+	NINT offset;
+	NINT i;
+#if NSS_DEBUG IS_ENABLED
+	NINT testCounter = 0;
+#endif
+	ASSERT_MPKNSS_LOCK();
+	offset = 0;
+	numRecs = node->head.numRecs;
+
+	/** Added an extra check to make sure we don't get stuck here forever in
+	 ** case there is a corrupted node
+	 **/
+	if (fileBlk >= node->extent[numRecs - 1].count)
+	{
+		*index = numRecs;
+		*length = 0;
+		return 0;
+	}
+
+	while (1)
+	{
+#if NSS_DEBUG IS_ENABLED
+		testCounter++;
+		zASSERT(numRecs);
+#endif
+		numRecs = numRecs >> 1;
+		if (fileBlk >= node->extent[offset + numRecs].count)
+		{
+			/* Target is above the probe: either the very next entry ... */
+			if (fileBlk < node->extent[offset + numRecs + 1].count)
+			{
+				i = offset + numRecs + 1;
+				break;
+			}
+			else
+			{
+				/* ... or somewhere in the upper remainder of the node. */
+				offset += numRecs;
+				numRecs = node->head.numRecs - offset;
+			}
+		}
+		else
+		{
+			/* Target is at or below the probe; a hit if the previous
+			 * entry ends at or before fileBlk, otherwise halve again.
+			 */
+			if (fileBlk >= node->extent[offset + numRecs - 1].count)
+			{
+				i = offset + numRecs;
+				break;
+			}
+		}
+#if NSS_DEBUG IS_ENABLED
+		zASSERT(testCounter < 100);
+#endif
+	}
+	*index = i;
+	*length = node->extent[i].count - fileBlk;
+	if (node->extent[i].poolBlk == 0)
+		return 0;
+	else
+		return (node->extent[i].poolBlk +
+				(fileBlk - node->extent[i-1].count));
+}
+
+
+/*
+ * Branch-node variant of the search above.  Stores the index of the
+ * matching entry in *index and returns that entry's poolBlk unchanged (no
+ * offset is added).  If fileBlk is at or beyond the last count it
+ * asserts, stores numRecs in *index and returns 0.
+ */
+Blknum_t searchBranch(FmapNode_s *node, Blknum_t fileBlk, NINT *index)
+{
+	Blknum_t numRecs;
+	NINT offset;
+#if NSS_DEBUG IS_ENABLED
+	NINT testCounter = 0;
+#endif
+	ASSERT_MPKNSS_LOCK();
+	offset = 0;
+	numRecs = node->head.numRecs;
+
+	/** Added an extra check to make sure we don't get stuck here forever in
+	 ** case there is a corrupted node
+	 **/
+	if (fileBlk >= node->extent[numRecs - 1].count)
+	{
+		zASSERT(0);
+		*index = numRecs;
+		return 0;
+	}
+
+	while (1)
+	{
+#if NSS_DEBUG IS_ENABLED
+		testCounter++;
+		zASSERT(numRecs);
+#endif
+		numRecs = numRecs >> 1;
+		if (fileBlk >= node->extent[offset + numRecs].count)
+		{
+			if (fileBlk < node->extent[offset + numRecs + 1].count)
+			{
+				*index = offset + numRecs + 1;
+				return (node->extent[offset + numRecs + 1].poolBlk);
+			}
+			else
+			{
+				offset += numRecs;
+				numRecs = node->head.numRecs - offset;
+			}
+		}
+		else
+		{
+			if (fileBlk >= node->extent[offset + numRecs - 1].count)
+			{
+				*index = offset + numRecs;
+				return (node->extent[offset + numRecs].poolBlk);
+			}
+		}
+#if NSS_DEBUG IS_ENABLED
+		zASSERT(testCounter < 100);
+#endif
+	}
+}
+
+
+/*************************************************************************
+ * findBlk in Fmap: (use fsms to do this )
+ * Search the filemap recursively (if it is a branch) until we find
+ * the entry we are looking for. 
+ * Find the fileBlk in the Fmap + *************************************************************************/ +void asyncFindBlkInFmap(ZioFmap_s *zio) +{ + ZFSStorageInfo_s *stInfo; + Fmap_s *fmap; + FmapNode_s *node; + Blknum_t fileBlk = zio->fmap.saveFileBlk; + Blknum_t len; + NINT index; + + ASSERT_MPKNSS_LOCK(); + if (zio->io.buffer == NULL) + { + /** There was a READ error **/ + ZFS_Return(zio); + return; + } + stInfo = STRUCT(zio->io.mycache, RootBeast_s, ROOTmycache)->storage.zfsInfo; + fmap = &stInfo->fmap; + node = (FmapNode_s *)(zio->io.buffer->pBuf.data); + + if (node->head.state & BT_LEAF) + { + zASSERT((node->head.state & BT_ROOT) ? + (node->head.magic == FMAP_BT_ROOT) : + (node->head.magic == FMAP_BT_LEAF)); + if ((node->head.state & BT_ROOT) ? + (node->head.magic != FMAP_BT_ROOT) : + (node->head.magic != FMAP_BT_LEAF)) + { + zio->io.status = zERR_MEDIA_CORRUPTED; + CACHE_RELEASE(zio->io.buffer); + zio->io.buffer = NULL; + ZFS_Return(zio); + return; + } + zio->io.volBlk = searchLeaf(node, fileBlk, &len, &index); + + zASSERT(zio->io.volBlk >= 0); +// zASSERT(zio->io.volBlk < DEBUG_MAX_FILE_BLK); + + zio->io.fileBlk = fileBlk; + CACHE_RELEASE(zio->io.buffer); + zio->io.buffer = NULL; + if (zio->io.volBlk == 0) + { + ASSERT_SLATCH( &CACHE_SparseBuffer.agent.latch); + ADD_LATCH( &CACHE_SparseBuffer.agent.latch); + zio->io.buffer = &CACHE_SparseBuffer; + + ZFS_Return(zio); + return; + } + else + { + asyncReadVolBlk(&zio->io, ZFS_Return); + return; + } + } + else + { + zASSERT((node->head.state & BT_ROOT) ? + (node->head.magic == FMAP_BT_ROOT) : + (node->head.magic == FMAP_BT_BRANCH)); + if ((node->head.state & BT_ROOT) ? 
+ (node->head.magic != FMAP_BT_ROOT) : + (node->head.magic != FMAP_BT_BRANCH)) + { + zio->io.status = zERR_MEDIA_CORRUPTED; + CACHE_RELEASE(zio->io.buffer); + zio->io.buffer = NULL; + ZFS_Return(zio); + return; + } + + zio->io.volBlk = searchBranch(node, fileBlk, &index); + + zASSERT(zio->io.volBlk > 0); +// zASSERT(zio->io.volBlk < DEBUG_MAX_FILE_BLK); + + CACHE_RELEASE(zio->io.buffer); + zio->io.buffer = NULL; + zio->io.fileBlk = POOLBLK_TO_INDIRECT(zio->io.volBlk); + asyncReadVolBlk(&zio->io, asyncFindBlkInFmap); + return; + } +} + + +/************************************************************************* + * ZFS_asyncReadFileBlk + *************************************************************************/ +void ZFSVOL_VOL_asyncReadFileBlk(Asyncio_s *asyncio) +{ + ZioFmap_s *zio = STRUCT(asyncio, ZioFmap_s, io); + RootBeast_s *beast; + ZFSStorageInfo_s *stInfo; + Fmap_s *fmap; + Blknum_t seed = 0; + Blknum_t len = 0; + NINT index = 0; + + ASSERT_MPKNSS_LOCK(); + /* Locate the beast for this I/O */ + beast = STRUCT(zio->io.mycache, RootBeast_s, ROOTmycache); + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + + zASSERT(zio->io.mode == CACHE_READ); + ASSERT_LATCH(&beast->ROOTbeastLatch); + + zio->io.buffer = NULL; + + if (zio->io.fileBlk >= stInfo->nextBlk ) + { + /** FileBlk to be read is beyond the last block of the file */ + /** Return a block of zeroes **/ + + ASSERT_SLATCH( &CACHE_SparseBuffer.agent.latch); + ADD_LATCH( &CACHE_SparseBuffer.agent.latch); + zio->io.buffer = &CACHE_SparseBuffer; + + ZFS_Return(zio); + return; + } + + if (zio->io.fileBlk < fmap->dirExt[fmap->numRecs - 1].count) + { + /** The fileBlk is in the direct portion of the filemap **/ + zio->io.volBlk = searchDirectMap(fmap, zio->io.fileBlk, &seed, + &len, &index); + zASSERT(zio->io.volBlk > 0); +// zASSERT(zio->io.volBlk < DEBUG_MAX_FILE_BLK); + + asyncReadVolBlk(&zio->io, ZFS_Return); + return; + } + else + { + /** The fileBlk is in the Btree portion of the filemap **/ + 
zASSERT(fmap->root != INVALID_BLK_ZERO); + zio->fmap.saveFileBlk = zio->io.fileBlk; + + zio->io.volBlk = fmap->root; + zio->io.fileBlk = POOLBLK_TO_INDIRECT(zio->io.volBlk); + asyncReadVolBlk(&zio->io, asyncFindBlkInFmap); + return; + } +} + + +/************************************************************************* + *************************************************************************/ +/************************************************************************* + * GetFileBlk + *************************************************************************/ +STATUS findBlkInFileMap( + GeneralMsg_s *genMsg, + ZFSStorageInfo_s *stInfo, + Blknum_t fileBlk, + Blknum_t *poolBlk) +{ + Fmap_s *fmap = &stInfo->fmap; + Buffer_s *buf = NULL; + FmapNode_s *node; + NINT index; /** unused **/ + Blknum_t len; /** unused **/ + Blknum_t seed; /** unused **/ + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + if (fmap->dirExt[fmap->numRecs - 1].count > fileBlk) + { + *poolBlk = searchDirectMap(fmap, fileBlk, &seed, &len, &index); + zASSERT(*poolBlk > 0); + } + else + { + READBLK_IO_MSG(iomsg, stInfo->comnInfo.beast, fmap->root, CACHE_READ); + SET_DEBUG_ID(iomsg, 15); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + zASSERT(node->head.magic == FMAP_BT_ROOT); + if (node->head.magic != FMAP_BT_ROOT) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + return zFAILURE; + } + while (!(node->head.state & BT_LEAF)) + { + *poolBlk = searchBranch(node, fileBlk, &index); + CACHE_RELEASE(buf); + buf = NULL; + READBLK_IO_MSG(iomsg, stInfo->comnInfo.beast, *poolBlk, CACHE_READ); + SET_DEBUG_ID(iomsg, 16); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_BRANCH)); + if ((node->head.magic != 
FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_BRANCH)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + return zFAILURE; + } + } + *poolBlk = searchLeaf(node, fileBlk, &len, &index); + CACHE_RELEASE(buf); + zASSERT(*poolBlk >= 0); + } +// zASSERT(*poolBlk < DEBUG_MAX_FILE_BLK); + return zOK; +} + +void syncGrowBtree( + ZfsXaction_s *xaction, + Buffer_s *bufChild, + Buffer_s *bufParent, + NINT *indexParent, + RootBeast_s *beast) +{ + FmapNode_s *parent; + FmapNode_s *child; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + + + ASSERT_MPKNSS_LOCK(); + child = (FmapNode_s *)bufChild->pBuf.data; + parent = (FmapNode_s *)bufParent->pBuf.data; + + child->head.state &= ~BT_ROOT; + if (child->head.state & BT_LEAF) + child->head.magic = FMAP_BT_LEAF; + else + child->head.magic = FMAP_BT_BRANCH; + + child->head.fnh_internalID = beast->ROOTinternalID; + parent->head.fnh_internalID = beast->ROOTinternalID; + child->head.fnh_zid = beast->zid; + parent->head.fnh_zid = beast->zid; + + parent->head.magic = FMAP_BT_ROOT; + parent->head.state = BT_ROOT; + parent->head.leafLink = INVALID_BLK_ZERO; + parent->head.lsn = 0; + + parent->extent[0].count = child->extent[0].count; + parent->extent[0].poolBlk = child->extent[0].poolBlk; + parent->head.numRecs = 1; + + zASSERT(bufChild->volBlk != 0); + parent->extent[parent->head.numRecs].count = MAX_FILE_BLK; + parent->extent[parent->head.numRecs].poolBlk = bufChild->volBlk; + parent->head.numRecs++; + + *indexParent = parent->head.numRecs -1; + + if (xaction == NULL) + { + return; + } + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(2) + sizeof(FmapGrow_s) ); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_GROW, xaction, logBuffer, + 2, logBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], bufChild->volBlk, + child->head.lsn, bufChild, xaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[1], bufParent->volBlk, + parent->head.lsn, 
bufParent, xaction, 1); + ZLOG_ALLOC_BLOCK(logBlks[1]); + + memcpy(&logRecord->u.grow.extent[0], &parent->extent[0], + sizeof(logRecord->u.grow.extent)); + // Either Child or Parent is ok for next 2 lines as ID, ZID are the same + logRecord->u.grow.fg_internalID = child->head.fnh_internalID; + logRecord->u.grow.fg_zid = child->head.fnh_zid; + + child->head.lsn = logBuffer->ZXR_Lsn; + parent->head.lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(xaction, bufChild); + ZLOG_BIND(xaction, bufParent); + ZLOG_ReleaseRecord(xaction); + + return; +} + + +void syncSplitBtree( + ZfsXaction_s *xaction, + Buffer_s *bufChild, + Buffer_s *bufSibling, + Buffer_s *bufParent, + Extent_s *extent) +{ + FmapNode_s *parent; + FmapNode_s *child; + FmapNode_s *sibling; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + + ASSERT_MPKNSS_LOCK(); + child = (FmapNode_s *)bufChild->pBuf.data; + parent = (FmapNode_s *)bufParent->pBuf.data; + sibling = (FmapNode_s *)bufSibling->pBuf.data; + + sibling->head.magic = child->head.magic; + sibling->head.fnh_internalID = child->head.fnh_internalID; + sibling->head.fnh_zid = child->head.fnh_zid; + sibling->head.state = child->head.state; + sibling->head.leafLink = INVALID_BLK_ZERO; + sibling->head.lsn = 0; + + zASSERT(bufSibling->volBlk != 0); + if (child->head.state & BT_LEAF) + { + child->head.leafLink = bufSibling->volBlk; + + parent->extent[parent->head.numRecs - 1].count = + child->extent[child->head.numRecs - 1].count; + parent->extent[parent->head.numRecs].count = MAX_FILE_BLK; + parent->extent[parent->head.numRecs].poolBlk = bufSibling->volBlk; + parent->head.numRecs++; + + sibling->extent[0].count = + child->extent[child->head.numRecs - 1].count; + sibling->extent[0].poolBlk = + child->extent[child->head.numRecs - 1].poolBlk; + sibling->head.numRecs = 1; + + sibling->extent[sibling->head.numRecs].count = + sibling->extent[sibling->head.numRecs - 1].count + + extent->lengthOfExtent; + 
sibling->extent[sibling->head.numRecs].poolBlk = extent->poolBlkNum; + sibling->head.numRecs++; + } + else + { + parent->extent[parent->head.numRecs - 1].count = + child->extent[child->head.numRecs - 2].count; + parent->extent[parent->head.numRecs].count = MAX_FILE_BLK; + parent->extent[parent->head.numRecs].poolBlk = bufSibling->volBlk; + parent->head.numRecs++; + + sibling->extent[0].count = + child->extent[child->head.numRecs - 2].count; + sibling->extent[0].poolBlk = + child->extent[child->head.numRecs - 2].poolBlk; + sibling->head.numRecs = 1; + + sibling->extent[sibling->head.numRecs].count = + child->extent[child->head.numRecs - 1].count; + sibling->extent[sibling->head.numRecs].poolBlk = + child->extent[child->head.numRecs - 1].poolBlk; + sibling->head.numRecs++; + + child->extent[child->head.numRecs - 1].count = 0; + child->extent[child->head.numRecs - 1].poolBlk = 0; + child->head.numRecs--; + } + + if (xaction == NULL) + { + return; + } + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(3) + + sizeof(FmapSplit_s) - 1 + + (sibling->head.numRecs * sizeof(FmapExt_s)) ); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_SPLIT, xaction, logBuffer, + 3, logBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], bufChild->volBlk, + child->head.lsn, bufChild, xaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[1], bufParent->volBlk, + parent->head.lsn, bufParent, xaction, 1); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[2], bufSibling->volBlk, + sibling->head.lsn, bufSibling, xaction, 2); + ZLOG_ALLOC_BLOCK(logBlks[2]); + + memcpy(&logRecord->u.split.data[0], &sibling->extent[0], + sibling->head.numRecs * sizeof(FmapExt_s)); + memcpy(&logRecord->u.split.parentExt[0], + &parent->extent[parent->head.numRecs - 2], 2 * sizeof(FmapExt_s)); + + logRecord->u.split.pIndex = parent->head.numRecs - 2; + logRecord->u.split.childLink = bufSibling->volBlk; + logRecord->u.split.sibLink = INVALID_BLK_ZERO; + logRecord->u.split.childMagic = child->head.magic; + logRecord->u.split.childState = 
child->head.state; + logRecord->u.split.numRecs = sibling->head.numRecs; + logRecord->u.split.fs_internalID = sibling->head.fnh_internalID; + logRecord->u.split.fs_zid = sibling->head.fnh_zid; + + child->head.lsn = logBuffer->ZXR_Lsn; + parent->head.lsn = logBuffer->ZXR_Lsn; + sibling->head.lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(xaction, bufChild); + ZLOG_BIND(xaction, bufParent); + ZLOG_BIND(xaction, bufSibling); + + ZLOG_ReleaseRecord(xaction); + + return; +} + + +Buffer_s *splitBtreeBranchSparse( + ZfsXaction_s *xaction, + Buffer_s *bufChild, + Buffer_s *bufSibling, + Buffer_s *bufParent, + NINT index, + NINT *parentIndex) +{ + FmapNode_s *parent; + FmapNode_s *child; + FmapNode_s *sibling; + NINT pIndex = *parentIndex; + NINT tmpIndex; + Buffer_s *retBuf; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + + ASSERT_MPKNSS_LOCK(); + child = (FmapNode_s *)bufChild->pBuf.data; + parent = (FmapNode_s *)bufParent->pBuf.data; + sibling = (FmapNode_s *)bufSibling->pBuf.data; + + sibling->head.magic = child->head.magic; + sibling->head.fnh_internalID = child->head.fnh_internalID; + sibling->head.fnh_zid = child->head.fnh_zid; + sibling->head.state = child->head.state; + sibling->head.leafLink = INVALID_BLK_ZERO; + zASSERT(bufSibling->volBlk != 0); + if (child->extent[index].count == MAX_FILE_BLK) + { + zASSERT(index == child->head.numRecs -1); + zASSERT(parent->extent[pIndex].count == MAX_FILE_BLK); + zASSERT(pIndex == parent->head.numRecs -1); + + parent->extent[pIndex].count = + child->extent[index - 1].count; + parent->extent[pIndex + 1].count = MAX_FILE_BLK; + parent->extent[pIndex + 1].poolBlk = bufSibling->volBlk; + parent->head.numRecs++; + + sibling->extent[0].count = + child->extent[index - 1].count; + sibling->extent[0].poolBlk = + child->extent[index -1].poolBlk; + sibling->head.numRecs = 1; + + sibling->extent[sibling->head.numRecs].count = + child->extent[index].count; + sibling->extent[sibling->head.numRecs].poolBlk = + 
child->extent[index].poolBlk; + sibling->head.numRecs++; + + child->extent[index].count = 0; + child->extent[index].poolBlk = 0; + child->head.numRecs--; + + *parentIndex = sibling->head.numRecs - 1; + retBuf = bufSibling; + } + else + { + tmpIndex = child->head.numRecs >> 1; + /** Copy half the records to the new node **/ + memcpy(&sibling->extent[0], &child->extent[tmpIndex], + (sizeof(FmapExt_s) * (child->head.numRecs - tmpIndex))); + sibling->head.numRecs = child->head.numRecs - tmpIndex; + + child->head.numRecs = tmpIndex + 1; + bzero(&child->extent[child->head.numRecs], + (sizeof(FmapExt_s) * (FMAP_MAX - child->head.numRecs))); + + memmove(&parent->extent[pIndex + 2], &parent->extent[pIndex + 1], + sizeof(FmapExt_s) * (parent->head.numRecs - (pIndex + 1))); + parent->extent[pIndex + 1].count = parent->extent[pIndex].count; + parent->extent[pIndex + 1].poolBlk = bufSibling->volBlk; + parent->extent[pIndex].count = sibling->extent[0].count; + parent->head.numRecs++; + + if (index < child->head.numRecs) + { + *parentIndex = index; + retBuf = bufChild; + } + else + { + *parentIndex = index - (child->head.numRecs - 1); + retBuf = bufSibling; + } + } + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(3) + + sizeof(FmapSplit_s) - 1 + + (sibling->head.numRecs * sizeof(FmapExt_s))); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_SPLIT, xaction, logBuffer, + 3, logBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], bufChild->volBlk, + child->head.lsn, bufChild, xaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[1], bufParent->volBlk, + parent->head.lsn, bufParent, xaction, 1); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[2], bufSibling->volBlk, + sibling->head.lsn, bufSibling, xaction, 2); + ZLOG_ALLOC_BLOCK(logBlks[2]); + + memcpy(&logRecord->u.split.data[0], &sibling->extent[0], + sibling->head.numRecs * sizeof(FmapExt_s)); + memcpy(&logRecord->u.split.parentExt[0], + &parent->extent[pIndex], 2 * sizeof(FmapExt_s)); + + logRecord->u.split.pIndex = pIndex; + logRecord->u.split.childLink = 
INVALID_BLK_ZERO; + logRecord->u.split.sibLink = INVALID_BLK_ZERO; + logRecord->u.split.childMagic = child->head.magic; + logRecord->u.split.childState = child->head.state; + logRecord->u.split.numRecs = sibling->head.numRecs; + logRecord->u.split.fs_internalID = sibling->head.fnh_internalID; + logRecord->u.split.fs_zid = sibling->head.fnh_zid; + + child->head.lsn = logBuffer->ZXR_Lsn; + parent->head.lsn = logBuffer->ZXR_Lsn; + sibling->head.lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(xaction, bufChild); + ZLOG_BIND(xaction, bufParent); + ZLOG_BIND(xaction, bufSibling); + ZLOG_ReleaseRecord(xaction); + + return retBuf; +} + +void updateNodeEntry( + ZfsXaction_s *xaction, + Buffer_s *buf, + NINT index, + Extent_s *extent, + Blknum_t fileBlk) +{ + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + FmapExt_s origExt[3]; + FmapNode_s *node = (FmapNode_s *)buf->pBuf.data; + WORD origNumRecs; + + ASSERT_MPKNSS_LOCK(); + memcpy(&origExt[0], &node->extent[index], 3 * sizeof(FmapExt_s)); + origNumRecs = node->head.numRecs; + + if (fileBlk == node->extent[index -1].count) + { + memmove(&node->extent[index + 1], &node->extent[index], + sizeof(FmapExt_s) * (node->head.numRecs - index)); + node->head.numRecs++; + node->extent[index].count = fileBlk + extent->lengthOfExtent; + node->extent[index].poolBlk = extent->poolBlkNum; + } + else if ((fileBlk + extent->lengthOfExtent) == node->extent[index].count) + { + memmove(&node->extent[index + 2], &node->extent[index + 1], + sizeof(FmapExt_s) * (node->head.numRecs - (index + 1))); + node->head.numRecs++; + node->extent[index + 1].count = node->extent[index].count; + node->extent[index + 1].poolBlk = extent->poolBlkNum; + node->extent[index].count = fileBlk; + } + else + { + memmove(&node->extent[index + 3], &node->extent[index + 1], + sizeof(FmapExt_s) * (node->head.numRecs - (index + 1))); + + node->head.numRecs+=2; + + node->extent[index + 2].count = node->extent[index].count; + node->extent[index + 
2].poolBlk = 0; + + node->extent[index + 1].count = fileBlk + extent->lengthOfExtent; + node->extent[index + 1].poolBlk = extent->poolBlkNum; + + node->extent[index].count = fileBlk; + node->extent[index].poolBlk = 0; + } + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapInsertSparse_s) ); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INSERT_SPARSE, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + memcpy(&logRecord->u.insertSparse.origExt[0], &origExt[0], + 3 * sizeof(FmapExt_s)); + memcpy(&logRecord->u.insertSparse.finalExt[0], &node->extent[index], + 3 * sizeof(FmapExt_s)); + logRecord->u.insertSparse.origNumRecs = origNumRecs; + logRecord->u.insertSparse.finalNumRecs = node->head.numRecs; + logRecord->u.insertSparse.index = index + 1; + + node->head.lsn = logBuffer->ZXR_Lsn; + ZLOG_BIND(xaction, buf); + ZLOG_ReleaseRecord(xaction); +} + +void splitBtreeLeafSparse( + ZfsXaction_s *xaction, + Buffer_s *bufChild, + Buffer_s *bufSibling, + Buffer_s *bufParent, + NINT index, + NINT pIndex, + Extent_s *extent, + NINT allocAhead, + Blknum_t fileBlk) +{ + FmapNode_s *parent; + FmapNode_s *child; + FmapNode_s *sibling; + NINT tmpIndex; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + + ASSERT_MPKNSS_LOCK(); + child = (FmapNode_s *)bufChild->pBuf.data; + parent = (FmapNode_s *)bufParent->pBuf.data; + sibling = (FmapNode_s *)bufSibling->pBuf.data; + + sibling->head.magic = child->head.magic; + sibling->head.fnh_internalID = child->head.fnh_internalID; + sibling->head.fnh_zid = child->head.fnh_zid; + sibling->head.state = child->head.state; + zASSERT(bufSibling->volBlk != 0); + if (allocAhead) + { + child->head.leafLink = bufSibling->volBlk; + sibling->head.leafLink = INVALID_BLK_ZERO; + + pIndex = parent->head.numRecs - 1; + parent->extent[pIndex].count = + child->extent[child->head.numRecs - 1].count; + parent->extent[pIndex + 1].count = 
MAX_FILE_BLK; + parent->extent[pIndex + 1].poolBlk = bufSibling->volBlk; + parent->head.numRecs++; + + sibling->extent[0].count = + child->extent[child->head.numRecs - 1].count; + sibling->extent[0].poolBlk = + child->extent[child->head.numRecs - 1].poolBlk; + sibling->head.numRecs = 1; + + sibling->extent[sibling->head.numRecs].count = + sibling->extent[sibling->head.numRecs - 1].count + allocAhead; + sibling->extent[sibling->head.numRecs].poolBlk = 0; + sibling->head.numRecs++; + + sibling->extent[sibling->head.numRecs].count = + sibling->extent[sibling->head.numRecs - 1].count + + extent->lengthOfExtent; + sibling->extent[sibling->head.numRecs].poolBlk = extent->poolBlkNum; + sibling->head.numRecs++; + } + else + { + sibling->head.leafLink = child->head.leafLink; + child->head.leafLink = bufSibling->volBlk; + + tmpIndex = child->head.numRecs >> 1; + if (index == tmpIndex) + tmpIndex = tmpIndex + 1; + /** Copy half the records to the new node **/ + memcpy(&sibling->extent[0], &child->extent[tmpIndex], + (sizeof(FmapExt_s) * (child->head.numRecs - tmpIndex))); + sibling->head.numRecs = child->head.numRecs - tmpIndex; + + child->head.numRecs = tmpIndex + 1; + bzero(&child->extent[child->head.numRecs], + (sizeof(FmapExt_s) * (FMAP_MAX - child->head.numRecs))); + + memmove(&parent->extent[pIndex + 2], &parent->extent[pIndex + 1], + sizeof(FmapExt_s) * (parent->head.numRecs - (pIndex + 1))); + parent->head.numRecs++; + parent->extent[pIndex + 1].count = parent->extent[pIndex].count; + parent->extent[pIndex + 1].poolBlk = bufSibling->volBlk; + parent->extent[pIndex].count = sibling->extent[0].count; + } + + if (xaction == NULL) + { + zASSERT(allocAhead != 0); + return; + } + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(3) + + sizeof(FmapSplit_s) - 1 + + (sibling->head.numRecs * sizeof(FmapExt_s)) ); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_SPLIT, xaction, logBuffer, + 3, logBlks, logRecord); + + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], bufChild->volBlk, + child->head.lsn, 
bufChild, xaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[1], bufParent->volBlk, + parent->head.lsn, bufParent, xaction, 1); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[2], bufSibling->volBlk, + sibling->head.lsn, bufSibling, xaction, 2); + ZLOG_ALLOC_BLOCK(logBlks[2]); + + memcpy(&logRecord->u.split.data[0], &sibling->extent[0], + sibling->head.numRecs * sizeof(FmapExt_s)); + memcpy(&logRecord->u.split.parentExt[0], + &parent->extent[pIndex], 2 * sizeof(FmapExt_s)); + + logRecord->u.split.pIndex = pIndex; + logRecord->u.split.childLink = child->head.leafLink; + logRecord->u.split.sibLink = sibling->head.leafLink; + logRecord->u.split.childMagic = child->head.magic; + logRecord->u.split.childState = child->head.state; + logRecord->u.split.numRecs = sibling->head.numRecs; + logRecord->u.split.fs_internalID = sibling->head.fnh_internalID; + logRecord->u.split.fs_zid = sibling->head.fnh_zid; + + child->head.lsn = logBuffer->ZXR_Lsn; + parent->head.lsn = logBuffer->ZXR_Lsn; + sibling->head.lsn = logBuffer->ZXR_Lsn; + + ZLOG_BIND(xaction, bufChild); + ZLOG_BIND(xaction, bufParent); + ZLOG_BIND(xaction, bufSibling); + + ZLOG_ReleaseRecord(xaction); + + if (!allocAhead) + { + if (index < child->head.numRecs) + { + updateNodeEntry(xaction, bufChild, index, extent, fileBlk); + } + else + { + index = index - (child->head.numRecs - 1); + updateNodeEntry(xaction, bufSibling, index, extent, fileBlk); + } + } + return; +} + + +STATUS updateSparseLeaf( + GeneralMsg_s *genMsg, + ZfsXaction_s *xaction, + ZFSStorageInfo_s *stInfo, + Buffer_s *buf, + Buffer_s *bufParent, + NINT index, + NINT parentIndex, + Blknum_t fileBlk, + Extent_s extent, + Blknum_t allocAhead) +{ + Fmap_s *fmap = &stInfo->fmap; + RootBeast_s *beast = stInfo->comnInfo.beast; + FmapNode_s *node = (FmapNode_s *)buf->pBuf.data; + NINT pIndex = parentIndex; + Buffer_s *bufSibling = NULL; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + NINT tmpFlag = 0; + FmapExt_s origExt[3]; + 
WORD origNumRecs; +#if FMAP_TEST IS_ENABLED + NINT fmapMax; +#endif + + ASSERT_MPKNSS_LOCK(); + if (allocAhead) + { +#if FMAP_TEST IS_ENABLED + if (FmapTest) + fmapMax = FMAP_MAX_SMALL; + else + fmapMax = FMAP_MAX - 5; + + if (node->head.numRecs < (fmapMax - 1)) +#else + if (node->head.numRecs < (FMAP_MAX - 6)) +#endif + { + node->extent[node->head.numRecs].count = fileBlk; + node->extent[node->head.numRecs].poolBlk = 0; + node->head.numRecs++; + + node->extent[node->head.numRecs].count = + fileBlk + extent.lengthOfExtent; + node->extent[node->head.numRecs].poolBlk = extent.poolBlkNum; + node->head.numRecs++; + + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapInsert_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INSERT, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + node->head.lsn = logBuffer->ZXR_Lsn; + + logRecord->u.insert.numRecs = 2; + + logRecord->u.insert.extent.poolBlk = extent.poolBlkNum; + logRecord->u.insert.extent.count = extent.lengthOfExtent; + logRecord->u.insert.fileBlk = fileBlk; + + ZLOG_BIND(xaction, buf); + ZLOG_ReleaseRecord(xaction); + + CACHE_DIRTY_RELEASE(buf); + } + else + { + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(buf); + return zFAILURE; + } + if (node->head.state & BT_ROOT) + { + zASSERT(bufParent == NULL); + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + Extent_s localExt; + CACHE_RELEASE(buf); + + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, xaction); + + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(xaction, buf, bufParent, &pIndex, beast); + fmap->root = 
bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + splitBtreeLeafSparse(xaction, buf, bufSibling, bufParent, + index, pIndex, &extent, allocAhead, fileBlk); + CACHE_DIRTY_RELEASE(buf); + buf = NULL; + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + CACHE_DIRTY_RELEASE(bufSibling); + bufSibling = NULL; + } + stInfo->nextBlk = fileBlk + extent.lengthOfExtent; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + } + else + { + memcpy(&origExt[0], &node->extent[index -1], 3 * sizeof(FmapExt_s)); + origNumRecs = node->head.numRecs; + + if ((index > 1) && + (fileBlk == node->extent[index - 1].count) && + ((node->extent[index - 1].poolBlk + (node->extent[index -1].count - + node->extent[index - 2].count)) == extent.poolBlkNum)) + { + node->extent[index - 1].count += extent.lengthOfExtent; + + if (node->extent[index - 1].count == node->extent[index].count) + { + if (node->extent[index + 1].poolBlk == + (node->extent[index - 1].poolBlk + + (node->extent[index -1].count - + node->extent[index - 2].count)) ) + { + node->extent[index-1].count = node->extent[index+1].count; + memmove(&node->extent[index], &node->extent[index + 2], + sizeof(FmapExt_s) * (node->head.numRecs - (index + 2))); + node->head.numRecs-=2; + node->extent[node->head.numRecs].count = 0; + node->extent[node->head.numRecs].poolBlk = 0; + node->extent[node->head.numRecs + 1].count = 0; + node->extent[node->head.numRecs + 1].poolBlk = 0; + } + else + { + memmove(&node->extent[index], &node->extent[index + 1], + sizeof(FmapExt_s) * (node->head.numRecs - (index + 1))); + node->head.numRecs--; + node->extent[node->head.numRecs].count = 0; + node->extent[node->head.numRecs].poolBlk = 0; + } + } + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + } + else if (((fileBlk + extent.lengthOfExtent) == + node->extent[index].count) && + ((extent.poolBlkNum + extent.lengthOfExtent) == + node->extent[index + 1].poolBlk)) + { + node->extent[index].count = fileBlk; + node->extent[index 
+ 1].poolBlk = extent.poolBlkNum; + + if (node->extent[index - 1].count == node->extent[index].count) + { + if ((index > 1) && (node->extent[index + 1].poolBlk == + (node->extent[index - 1].poolBlk + + (node->extent[index -1].count - + node->extent[index - 2].count)) )) + { + node->extent[index-1].count = node->extent[index+1].count; + memmove(&node->extent[index], &node->extent[index + 2], + sizeof(FmapExt_s) * (node->head.numRecs - (index + 2))); + node->head.numRecs-=2; + node->extent[node->head.numRecs].count = 0; + node->extent[node->head.numRecs].poolBlk = 0; + node->extent[node->head.numRecs + 1].count = 0; + node->extent[node->head.numRecs + 1].poolBlk = 0; + } + else + { + memmove(&node->extent[index], &node->extent[index + 1], + sizeof(FmapExt_s) * (node->head.numRecs - (index + 1))); + node->head.numRecs--; + node->extent[node->head.numRecs].count = 0; + node->extent[node->head.numRecs].poolBlk = 0; + } + } + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + } + else if ((fileBlk == node->extent[index - 1].count) && + (extent.lengthOfExtent == (node->extent[index].count - + node->extent[index - 1].count)) ) + { + node->extent[index].poolBlk = extent.poolBlkNum; + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + } + else + { + tmpFlag = 1; +#if FMAP_TEST IS_ENABLED + if (FmapTest) + fmapMax = FMAP_MAX_SMALL; + else + fmapMax = FMAP_MAX - 5; + + if (node->head.numRecs < (fmapMax - 1)) +#else + if (node->head.numRecs < (FMAP_MAX - 6)) +#endif + { + updateNodeEntry(xaction, buf, index, &extent, fileBlk); + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + CACHE_DIRTY_RELEASE(buf); + } + else + { + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(buf); + return zFAILURE; + } + if (node->head.state & BT_ROOT) + { + zASSERT(bufParent == NULL); + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == 
NULL) + { + Extent_s localExt; + + CACHE_RELEASE(buf); + + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, xaction); + + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(xaction, buf, bufParent, &pIndex, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + splitBtreeLeafSparse(xaction, buf, bufSibling, bufParent, + index, pIndex, &extent, allocAhead, fileBlk); + CACHE_DIRTY_RELEASE(buf); + buf = NULL; + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + CACHE_DIRTY_RELEASE(bufSibling); + bufSibling = NULL; + } + } + if (tmpFlag == 0) + { + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapInsertSparse_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INSERT_SPARSE, xaction, logBuffer, + 1, logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + memcpy(&logRecord->u.insertSparse.origExt[0], &origExt[0], + 3 * sizeof(FmapExt_s)); + memcpy(&logRecord->u.insertSparse.finalExt[0], + &node->extent[index -1], + 3 * sizeof(FmapExt_s)); + logRecord->u.insertSparse.origNumRecs = origNumRecs; + logRecord->u.insertSparse.finalNumRecs = node->head.numRecs; + logRecord->u.insertSparse.index = index; + + node->head.lsn = logBuffer->ZXR_Lsn; + ZLOG_BIND(xaction, buf); + ZLOG_ReleaseRecord(xaction); + + CACHE_DIRTY_RELEASE(buf); + } + } + return zOK; +} + +STATUS updateSparse( + GeneralMsg_s *genMsg, + ZfsXaction_s *xaction, + ZFSStorageInfo_s *stInfo, + Blknum_t fileBlk, + Blknum_t *poolBlk, + Blknum_t allocAhead, + Blknum_t length, + BYTE flags) +{ + Fmap_s *fmap = &stInfo->fmap; + RootBeast_s *beast = stInfo->comnInfo.beast; + Buffer_s *buf = NULL; + Buffer_s *bufParent = NULL; + Buffer_s *bufSibling = NULL; + Buffer_s *tmpBuf; + FmapNode_s *node; + NINT i, index, indexParent; + Extent_s 
extent; + Blknum_t seed; + Blknum_t len; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + SQUAD numBytes; +#if FMAP_TEST IS_ENABLED + NINT fmapMax; +#endif + + ASSERT_MPKNSS_LOCK(); + numBytes = length << beast->ROOTblkSizeShift; + if (fmap->root == INVALID_BLK_ZERO) + { + /* Making file sparse the first time, filemap is still direct, + * continuing entries will be in the btree */ + zASSERT(allocAhead != 0); + + extent.poolBlkNum = 0; + extent.lengthOfExtent = length; + index = 0; + + /* Check to make sure we have enough user and directory space available */ + if (VOL_CheckUserSpace(genMsg, beast, numBytes) != zOK || + DIRQ_CheckDirQuotas(genMsg, beast, numBytes) != zOK) + { + return zFAILURE; + } + + if (zfsAllocExtent(genMsg, beast->vol.zfsVol, &extent, + (flags & ALLOC_BLOCKS_CONTIGUOUS) ? XTREE_CONTIGUOUS_BLKS : 0, + xaction) != zOK) + { + return zFAILURE; + } + + XALLOCBLK_IO_MSG(iomsg, beast, xaction, CACHE_UPDATE); + if ((buf = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + zfsFreeExtent(genMsg, beast->vol.zfsVol, &extent, xaction); + return zFAILURE; + } + + if (fmap->numRecs == 0) + fmap->numRecs++; + + stInfo->fmapTreeBlks++; + node = (FmapNode_s *)buf->pBuf.data; + fmap->root = buf->volBlk; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = beast->ROOTinternalID; + node->head.fnh_zid = beast->zid; + node->head.state = BT_ROOT | BT_LEAF; + node->head.leafLink = INVALID_BLK_ZERO; + node->head.lsn = 0; + node->extent[0].count = fmap->dirExt[fmap->numRecs -1].count; + node->extent[0].poolBlk = fmap->dirExt[fmap->numRecs -1].poolBlk; + node->head.numRecs = 1; + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapRoot_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INIT_ROOT, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + ZLOG_ALLOC_BLOCK(logBlks[0]); 
+ + node->head.lsn = logBuffer->ZXR_Lsn; + + logRecord->u.root.numRecs = 1; + logRecord->u.root.fr_internalID = node->head.fnh_internalID; + logRecord->u.root.fr_zid = node->head.fnh_zid; + memcpy(&logRecord->u.root.extent[0], &node->extent[0], + 1 * sizeof(FmapExt_s)); + + ZLOG_BIND(xaction, buf); + + ZLOG_ReleaseRecord(xaction); + goto ContinueAfterCreatingANewRoot; /* buf is the freshly built root leaf and bufParent is still NULL, so the descent below is skipped */ + } + else + { /* a B-tree root already exists: read it and start the descent there */ + READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 17); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + zASSERT(node->head.magic == FMAP_BT_ROOT); + if (node->head.magic != FMAP_BT_ROOT) + { /* same condition as the zASSERT above, handled at run time by flagging media corruption */ + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + return zFAILURE; + } + } + while (!(node->head.state & BT_LEAF)) + { /* one iteration per tree level, until buf holds a leaf node */ + if (allocAhead) + { /* allocAhead != 0: always follow the last record of the node */ + *poolBlk = node->extent[node->head.numRecs -1].poolBlk; + index = node->head.numRecs - 1; + } + else + { + *poolBlk = searchBranch(node, fileBlk, &index); + } + +#if FMAP_TEST IS_ENABLED + if (FmapTest) + fmapMax = FMAP_MAX_SMALL; + else + fmapMax = FMAP_MAX - 5; + + if (node->head.numRecs < (fmapMax - 1)) +#else + if (node->head.numRecs < (FMAP_MAX - 6)) +#endif + { /* below the split threshold: this node becomes the parent for the next level */ + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + bufParent = buf; + indexParent = index; + } + else + { /* at the split threshold: split this node before descending further */ + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(buf); + if (bufParent) + { + /* bufParent is still held from the previous level (it is NULL only while buf is the root); release it here as the read-failure path later in this loop does, otherwise this return leaks the buffer */ + CACHE_RELEASE(bufParent); + } + return zFAILURE; + } + if (node->head.state & BT_ROOT) + { /* splitting the root needs a second new block to act as the new root */ + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { /* undo the sibling allocation: free its pool block and toss the cached buffer */ + Extent_s localExt; + + CACHE_RELEASE(buf); + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, xaction); + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + 
syncGrowBtree(xaction, buf, bufParent, &indexParent, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + tmpBuf = splitBtreeBranchSparse(xaction, buf, bufSibling, + bufParent, index, &indexParent); + (tmpBuf == buf) ? CACHE_DIRTY_RELEASE(bufSibling): + CACHE_DIRTY_RELEASE(buf); + CACHE_DIRTY_RELEASE(bufParent); + bufParent = tmpBuf; + bufSibling = NULL; + bufParent->state |= CACHE_DIRTY; + } + buf = NULL; + READBLK_IO_MSG(iomsg, beast, *poolBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 18); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_BRANCH)); + if ((node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_BRANCH)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + return zFAILURE; + } + } + if (allocAhead) + { + if (node->head.numRecs > 1) + seed = node->extent[node->head.numRecs -1].poolBlk + + (fileBlk - node->extent[node->head.numRecs -2].count); + else + seed = 0; + index = node->head.numRecs - 1; + } + else + { + *poolBlk = searchLeaf(node, fileBlk, &len, &index); + zASSERT(*poolBlk == 0); + if (index > 2) + seed = node->extent[index - 1].poolBlk + + (fileBlk - node->extent[index -2].count); + else + seed = node->extent[index + 1].poolBlk - + (node->extent[index].count - fileBlk); + } +#if FMAP_TEST IS_ENABLED + if (FmapTest || OneBlockExtents) + extent.poolBlkNum = 0; + else + extent.poolBlkNum = seed; +#else + extent.poolBlkNum = seed; +#endif + extent.lengthOfExtent = length; + + /* Check to make sure we have enough user/directory space available */ + if (VOL_CheckUserSpace(genMsg, beast, numBytes) != zOK || 
+ DIRQ_CheckDirQuotas(genMsg, beast, numBytes) != zOK) + { + CACHE_DIRTY_RELEASE(buf); + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + } + return zFAILURE; + } + + if (zfsAllocExtent( genMsg, beast->vol.zfsVol, &extent, + (flags & ALLOC_BLOCKS_CONTIGUOUS) ? XTREE_CONTIGUOUS_BLKS : 0, + xaction) != zOK) + { + CACHE_DIRTY_RELEASE(buf); + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + } + return zFAILURE; + } + +ContinueAfterCreatingANewRoot: + *poolBlk = extent.poolBlkNum; + length = extent.lengthOfExtent; + + if (updateSparseLeaf(genMsg, xaction, stInfo, buf, bufParent, index, + indexParent, fileBlk, extent, allocAhead) != zOK) + { + zfsFreeExtent(genMsg, beast->vol.zfsVol, &extent, xaction); + return zFAILURE; + } + + VOL_AdjustUsedUserSpace(&xaction->xaction, beast, + (SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift); + + DIRQ_AdjustUsedDirSpace(&xaction->xaction, beast->ROOTvolume, + beast, (SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift); + + stInfo->fmapDataBlks+= extent.lengthOfExtent; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + if ((flags & ALLOC_NO_ZERO_FILL) == 0) + { + for(i = 1; i < length; i++) + { + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + fileBlk+i, (*poolBlk)+i, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + CACHE_DIRTY_RELEASE(buf); + } + } + return zOK; +} + + +STATUS updateBtreeFileMap( + GeneralMsg_s *genMsg, + ZfsXaction_s *xaction, + ZFSStorageInfo_s *stInfo, + Blknum_t fileBlk, + Extent_s *extent, + BYTE flags) +{ + RootBeast_s *beast; + Fmap_s *fmap; + Blknum_t seed; + Buffer_s *bufChild = NULL; + FmapNode_s *child; + Buffer_s *bufParent = NULL; + Buffer_s *bufSibling = NULL; + Blknum_t poolBlk; + NINT tmp; + 
IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + SQUAD numBytes; +#if FMAP_TEST IS_ENABLED + NINT fmapMax; +#endif + + ASSERT_MPKNSS_LOCK(); + beast = stInfo->comnInfo.beast; + fmap = &stInfo->fmap; + poolBlk = fmap->root; + +ContinueScanningTheBtree: + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 19); + if ((bufChild = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_RELEASE(bufSibling); + } + return zFAILURE; + } + child = (FmapNode_s *)(bufChild->pBuf.data); + zASSERT( (child->head.magic == FMAP_BT_LEAF) || + (child->head.magic == FMAP_BT_ROOT) || + (child->head.magic == FMAP_BT_BRANCH) ); + + if ( (child->head.magic != FMAP_BT_LEAF) && + (child->head.magic != FMAP_BT_ROOT) && + (child->head.magic != FMAP_BT_BRANCH) ) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, bufChild, &iomsg); + CACHE_RELEASE(bufChild); + bufChild = NULL; + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_RELEASE(bufSibling); + } + return zFAILURE; + } + if (child->head.state & BT_LEAF) + { + seed = child->extent[child->head.numRecs-1].poolBlk - + child->extent[child->head.numRecs-2].count + + child->extent[child->head.numRecs-1].count; +#if FMAP_TEST IS_ENABLED + if (FmapTest || OneBlockExtents) + extent->poolBlkNum = 0; + else + extent->poolBlkNum = seed; +#else + extent->poolBlkNum = seed; +#endif + /* Check to make sure we have enough user/directory space available */ + numBytes = extent->lengthOfExtent << beast->ROOTblkSizeShift; + if (VOL_CheckUserSpace(genMsg, beast, numBytes) != zOK || + DIRQ_CheckDirQuotas(genMsg, beast, numBytes) != zOK) + { + if (bufChild) + { + CACHE_DIRTY_RELEASE(bufChild); + } + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_DIRTY_RELEASE(bufSibling); + } + return zFAILURE; + } + + if (zfsAllocExtent( 
genMsg, beast->vol.zfsVol, extent, + (flags & ALLOC_BLOCKS_CONTIGUOUS) ? XTREE_CONTIGUOUS_BLKS : 0, + xaction) != zOK) + { + if (bufChild) + { + CACHE_DIRTY_RELEASE(bufChild); + } + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_DIRTY_RELEASE(bufSibling); + } + return zFAILURE; + } + +#if FMAP_TEST IS_ENABLED + if (FmapTest) + fmapMax = FMAP_MAX_SMALL; + else + fmapMax = FMAP_MAX - 5; +#endif + if ((child->extent[child->head.numRecs -1].poolBlk + + child->extent[child->head.numRecs -1].count - + child->extent[child->head.numRecs -2].count) == extent->poolBlkNum) + { + child->extent[child->head.numRecs -1].count+=extent->lengthOfExtent; + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapInsert_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INSERT, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], bufChild->volBlk, + child->head.lsn, bufChild, xaction, 0); + + child->head.lsn = logBuffer->ZXR_Lsn; + + logRecord->u.insert.numRecs = 0; + logRecord->u.insert.extent.poolBlk = 0; + logRecord->u.insert.extent.count = extent->lengthOfExtent; + + ZLOG_BIND(xaction, bufChild); + ZLOG_ReleaseRecord(xaction); + } +#if FMAP_TEST IS_ENABLED + else if (child->head.numRecs < (fmapMax - 1)) +#else + else if (child->head.numRecs < (FMAP_MAX - 6)) +#endif + { + child->extent[child->head.numRecs].poolBlk = extent->poolBlkNum; + child->extent[child->head.numRecs].count = + child->extent[child->head.numRecs -1].count+extent->lengthOfExtent; + child->head.numRecs++; + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapInsert_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INSERT, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], bufChild->volBlk, + child->head.lsn, bufChild, xaction, 0); + + child->head.lsn = logBuffer->ZXR_Lsn; + + logRecord->u.insert.numRecs = 1; + + logRecord->u.insert.extent.poolBlk = + child->extent[child->head.numRecs -1].poolBlk; + 
logRecord->u.insert.extent.count = + child->extent[child->head.numRecs -1].count; + + ZLOG_BIND(xaction, bufChild); + ZLOG_ReleaseRecord(xaction); + } + else + { /* leaf is at the split threshold: allocate a sibling block and split the leaf */ + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + zfsFreeExtent(genMsg, beast->vol.zfsVol, + extent, xaction); + CACHE_RELEASE(bufChild); + if (bufParent) + { + /* a non-root leaf is reached with its parent still held in bufParent; release it (dirty, like the other allocation-failure paths of this function) so this return does not leak the buffer */ + CACHE_DIRTY_RELEASE(bufParent); + } + return zFAILURE; + } + if (child->head.state & BT_ROOT) + { /* the leaf is also the root: a second new block becomes the new root */ + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { /* give back both the data extent and the sibling block before failing */ + Extent_s localExt; + + zfsFreeExtent(genMsg, beast->vol.zfsVol, + extent, xaction); + + CACHE_RELEASE(bufChild); + + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, xaction); + + cacheReleaseToss(bufSibling); + + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(xaction, bufChild, bufParent, &tmp, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + syncSplitBtree(xaction, bufChild, bufSibling, bufParent, extent); + } + VOL_AdjustUsedUserSpace(&xaction->xaction, beast, + (SQUAD)extent->lengthOfExtent << beast->ROOTblkSizeShift); /* byte count = block count << ROOTblkSizeShift */ + + DIRQ_AdjustUsedDirSpace(&xaction->xaction, beast->ROOTvolume, beast, + (SQUAD)extent->lengthOfExtent << beast->ROOTblkSizeShift); + + stInfo->fmapDataBlks+= extent->lengthOfExtent; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + } + else + { /* branch node: the next block to read is the one named by its last record */ + poolBlk = child->extent[child->head.numRecs-1].poolBlk; +#if FMAP_TEST IS_ENABLED + if (FmapTest) + fmapMax = FMAP_MAX_SMALL; + else + fmapMax = FMAP_MAX - 5; + + if (child->head.numRecs < (fmapMax - 1)) +#else + if (child->head.numRecs < (FMAP_MAX - 6)) +#endif + { /* below the split threshold: this node becomes the parent for the next level */ + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + } + bufParent = bufChild; + bufChild = NULL; + } + else + { /* at the split threshold: split this branch before descending */ + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufSibling = 
ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(bufChild); + if (bufParent) + { + /* bufParent is held from the previous level whenever bufChild is not the root; release it (dirty, like the other allocation-failure paths of this function) so this return does not leak the buffer */ + CACHE_DIRTY_RELEASE(bufParent); + } + return zFAILURE; + } + if (child->head.state & BT_ROOT) + { /* splitting the root: a second new block becomes the new root */ + XALLOC_SEED_IO_MSG(iomsg, beast, xaction, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { /* undo the sibling allocation before failing */ + Extent_s localExt; + CACHE_RELEASE(bufChild); + + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, xaction); + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(xaction, bufChild, bufParent, &tmp, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + + stInfo->fmapTreeBlks++; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + syncSplitBtree(xaction, bufChild, bufSibling, bufParent, extent); + + CACHE_DIRTY_RELEASE(bufChild); + bufChild = NULL; + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + + bufParent = bufSibling; /* the new sibling is kept as the parent for the next level */ + bufSibling = NULL; + } + goto ContinueScanningTheBtree; + } + CACHE_DIRTY_RELEASE(bufChild); + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_DIRTY_RELEASE(bufSibling); + } + return zOK; +} + + +STATUS updateDirectFileMap( + GeneralMsg_s *genMsg, + ZfsXaction_s *xaction, + ZFSStorageInfo_s *stInfo, + Blknum_t fileBlk, + Extent_s *extent) +{ /* Records an already allocated extent in the direct extent array fmap->dirExt[]; the cases below are tried in order. */ + RootBeast_s *beast; + Fmap_s *fmap; + Buffer_s *buffer = NULL; + FmapNode_s *node; + Blknum_t poolBlk = extent->poolBlkNum; + Blknum_t length = extent->lengthOfExtent; + IoMsg_s iomsg; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + + ASSERT_MPKNSS_LOCK(); + beast = stInfo->comnInfo.beast; + fmap = &stInfo->fmap; + + if (fmap->numRecs == 1) + { /* numRecs == 1: the extent is stored in slot 1 and its count is simply length */ + fmap->dirExt[fmap->numRecs].count = length; + fmap->dirExt[fmap->numRecs].poolBlk = poolBlk; + fmap->numRecs++; + } + else if ( ( fmap->dirExt[fmap->numRecs -1].poolBlk - + fmap->dirExt[fmap->numRecs -2].count + + fmap->dirExt[fmap->numRecs -1].count ) == poolBlk) + { + 
fmap->dirExt[fmap->numRecs - 1].count += length; + } + else if (fmap->numRecs < MAX_DIRECT) + { + fmap->dirExt[fmap->numRecs].poolBlk = poolBlk; + fmap->dirExt[fmap->numRecs].count = + fmap->dirExt[fmap->numRecs -1].count + length; + fmap->numRecs++; + } + else + { + XALLOCBLK_IO_MSG(iomsg, beast, xaction, CACHE_UPDATE); + if ((buffer = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + stInfo->fmapTreeBlks++; + fmap->root = buffer->volBlk; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + node = (FmapNode_s *)(buffer->pBuf.data); + + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = beast->ROOTinternalID; + node->head.fnh_zid = beast->zid; + node->head.state = BT_ROOT | BT_LEAF; + node->head.numRecs = 0; + node->head.leafLink = 0; + node->head.lsn = 0; + + node->extent[node->head.numRecs].count = + fmap->dirExt[fmap->numRecs - 1].count; + node->extent[node->head.numRecs].poolBlk = + fmap->dirExt[fmap->numRecs -1].poolBlk; + node->head.numRecs++; + + node->extent[node->head.numRecs].count = + fmap->dirExt[fmap->numRecs - 1].count + length; + node->extent[node->head.numRecs].poolBlk = poolBlk; + node->head.numRecs++; + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapRoot_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_INIT_ROOT, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buffer->volBlk, + node->head.lsn, buffer, xaction, 0); + ZLOG_ALLOC_BLOCK(logBlks[0]); + + node->head.lsn = logBuffer->ZXR_Lsn; + + logRecord->u.root.numRecs = 2; + logRecord->u.root.fr_internalID = node->head.fnh_internalID; + logRecord->u.root.fr_zid = node->head.fnh_zid; + memcpy(&logRecord->u.root.extent[0], &node->extent[0], + 2 * sizeof(FmapExt_s)); + + ZLOG_BIND(xaction, buffer); + + ZLOG_ReleaseRecord(xaction); + + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + + +/* + ZLSS_PARENT_BIAS - + If we use the PS_ routines then they need to be volume specific + and optimized... 
This logic will get better performance if we + have an I/O scheduler that has an AREA cache. If we don't have + an AREA cache I/O scheduler then we would have to rely on the + devices track cache (which are limited in size). The track cache + may help desktops or lightly used servers. See Physical I/O + specification for more details. +*/ + +#define ZLSS_PARENT_BIAS /* compiles in the parent-seed code below; ZLSS_ParentBias still gates its use at run time */ + + +#ifdef ZLSS_PARENT_BIAS +typedef struct ParentSeed_s { /* one entry of the VolumeSeeds table */ + Zid_t PS_Zid; /* zid that the PS_ helpers compare against parentZid */ + Blknum_t PS_Seed; /* seed block that PS_FindSeed returns for that zid */ +} ParentSeed_s; +#endif + +#ifdef ZLSS_PARENT_BIAS +BOOL ZLSS_ParentBias = FALSE; /* Allocate 1st block of small file near + other small files from same directory. This should + help read performance, but not writes. Writes should + be helped more by using global seed because it causes + ALL new writes to be near each other. ZLSS_ParentBias + only makes writes from same directory next to each + other. */ +Blknum_t ZLSS_ParentBiasSmall = 2; /* Defines how many blocks on the 1st + allocation that make a file 'small'. Note that if + a large file is created by doing small writes then + this code assumes that it is a small file:-( */ + // TODO(Perf): Is the 1st allocLength always large when allocahead is being used? 
+ +ParentSeed_s VolumeSeeds[128]; /* table searched and updated by the PS_ helpers below; PS_InsertItem puts new entries in slot 0 */ +#endif + + +#ifdef ZLSS_PARENT_BIAS +static Blknum_t PS_FindSeed( Zid_t parentZid ) +{ /* Linear search of VolumeSeeds: returns the seed stored for parentZid, or 0 when no entry matches. */ + int i; + + for ( i=0; i < NELEMS(VolumeSeeds); ++i ) { + if ( parentZid == VolumeSeeds[i].PS_Zid ) { + return VolumeSeeds[i].PS_Seed; + } + } + return 0; +} +#endif + + +#ifdef ZLSS_PARENT_BIAS +static void PS_InsertItem( Zid_t parentZid, Blknum_t seed ) +{ /* Moves every entry one slot towards the end (the last entry is overwritten) and stores the new pair in slot 0. */ + memmove( &VolumeSeeds[1], &VolumeSeeds[0], (NELEMS(VolumeSeeds)-1)*sizeof(VolumeSeeds[0]) ); /* source and destination overlap, hence memmove */ + VolumeSeeds[0].PS_Zid = parentZid; + VolumeSeeds[0].PS_Seed = seed; + return; +} +#endif + + +#ifdef ZLSS_PARENT_BIAS +static void PS_UpdateItem( Zid_t parentZid, Blknum_t seed ) +{ /* Replaces the seed of the first entry whose zid equals parentZid; when there is none, adds a new entry through PS_InsertItem. */ + int i; + + for ( i=0; i < NELEMS(VolumeSeeds); ++i ) { + if ( parentZid == VolumeSeeds[i].PS_Zid ) { + VolumeSeeds[i].PS_Seed = seed; + return; + } + } + PS_InsertItem(parentZid,seed); + return; +} +#endif + + +STATUS extendFileMap( + GeneralMsg_s *genMsg, + ZfsXaction_s *xaction, + ZFSStorageInfo_s *stInfo, + Blknum_t *poolBlk, + NINT allocLength, + BYTE flags) +{ + Fmap_s *fmap = &stInfo->fmap; + RootBeast_s *beast = stInfo->comnInfo.beast; + Blknum_t seed; + Extent_s extent; + Blknum_t fileBlk = stInfo->nextBlk; /* the new extent starts at the current nextBlk of the file */ + Blknum_t allocatedLength; + Buffer_s *buf; + NINT i; + SQUAD numBytes; + Zid_t parentZid = 0; /* stays 0 unless the parent-bias code below picks a parent zid */ + NINT lFlags = 0; /* extra allocation flags, set only by the parent-bias code below */ + + + ASSERT_MPKNSS_LOCK(); + extent.lengthOfExtent = allocLength; + if ((fileBlk == 0) && (fmap->numRecs == 0)) + { + fmap->numRecs = 1; + } + + if (fmap->root == INVALID_BLK_ZERO) + { /* no B-tree root: the extent goes into the direct map (updateDirectFileMap) */ + if (fmap->numRecs == 1) + { + seed = 0; +#ifdef ZLSS_PARENT_BIAS + if ( ZLSS_ParentBias && (allocLength <= ZLSS_ParentBiasSmall) && + COMN_IsDerivedFrom( beast, zFTYPE_NAMED_DATA_STREAM ) ) + { + NamedBeast_s *nbeast = (NamedBeast_s *)beast; + + /*if ( nbeast->firstParent ) */{ /* Need to look at parent count > 0? 
*/ + parentZid = nbeast->firstParent.p.zid; + if ( parentZid ) { + seed = PS_FindSeed( parentZid ); + /* ZLSS_PARENT_BIAS - If we keep this then need new + flags so that we do not mix metadata area with user + blocks. Some of the user files may be more than + one block so they may want to smash into our + metadata blocks. + ZLSS_PARENT_BIAS - May want to use allocLength + to decide where to allocate block(s). */ + if ( seed ) + { + lFlags = XTREE_AF_NEAR_TREE; + } + else + { + lFlags = XTREE_AF_NEW_AREA; + } + } + } + } +#endif + } + else + { + seed = fmap->dirExt[fmap->numRecs - 1].poolBlk + + (fmap->dirExt[fmap->numRecs - 1].count - + fmap->dirExt[fmap->numRecs - 2].count); + } + +#if FMAP_TEST IS_ENABLED + if (FmapTest || OneBlockExtents) + extent.poolBlkNum = 0; + else + extent.poolBlkNum = seed; +#else + extent.poolBlkNum = seed; +#endif + /* Check to make sure we have enough user/directory space available */ + numBytes = allocLength << beast->ROOTblkSizeShift; + if (VOL_CheckUserSpace(genMsg, beast, numBytes) != zOK || + DIRQ_CheckDirQuotas(genMsg, beast, numBytes) != zOK) + { + return zFAILURE; + } + + if (zfsAllocExtent( genMsg, beast->vol.zfsVol, &extent, + (flags & ALLOC_BLOCKS_CONTIGUOUS) ? 
XTREE_CONTIGUOUS_BLKS : lFlags, + xaction) != zOK) + { + return zFAILURE; + } + if (updateDirectFileMap(genMsg, xaction, + stInfo, fileBlk, &extent) != zOK) + { + zfsFreeExtent(genMsg, beast->vol.zfsVol, &extent, xaction); + return zFAILURE; + } +#ifdef ZLSS_PARENT_BIAS + if ( parentZid ) { + PS_UpdateItem( parentZid, extent.poolBlkNum ); + } +#endif + VOL_AdjustUsedUserSpace(&xaction->xaction, beast, + (SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift); + DIRQ_AdjustUsedDirSpace(&xaction->xaction, beast->ROOTvolume, beast, + (SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift); + stInfo->fmapDataBlks+= extent.lengthOfExtent; + } + else + { + if (updateBtreeFileMap(genMsg, xaction, + stInfo, fileBlk, &extent, flags) != zOK) + { + return zFAILURE; + } + } + allocatedLength = extent.lengthOfExtent; + *poolBlk = extent.poolBlkNum; + stInfo->nextBlk = fileBlk + extent.lengthOfExtent; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + if ((flags & ALLOC_NO_ZERO_FILL) == 0) + { + for(i = 1; i < allocatedLength; i++) + { + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + fileBlk+i, (*poolBlk)+i, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + CACHE_DIRTY_RELEASE(buf); + } + } + return zOK; +} + + +Buffer_s *ZFSVOL_VOL_getFileBlk( + GeneralMsg_s *genMsg, + IoMsg_s *io) +{ + RootBeast_s *beast = io->beast; + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + Blknum_t poolBlk; + Blknum_t tmpBlk; + Blknum_t fileBlk = io->fileBlk; + Blknum_t allocAhead; + Blknum_t allocRemain; + Buffer_s *buf; + ZfsXaction_s *xaction; + + ASSERT_MPKNSS_LOCK(); + if (io->mode == CACHE_READ) + { + /** Does not change the file map **/ + + ASSERT_LATCH(&beast->ROOTbeastLatch); + + if (fileBlk >= 
stInfo->nextBlk) + { + ASSERT_SLATCH( &CACHE_SparseBuffer.agent.latch); + ADD_LATCH( &CACHE_SparseBuffer.agent.latch); + return &CACHE_SparseBuffer; + } + if (findBlkInFileMap(genMsg, stInfo, fileBlk, &poolBlk) != zOK) + { + return NULL; + } + if (poolBlk == 0) + { + ASSERT_SLATCH( &CACHE_SparseBuffer.agent.latch); + ADD_LATCH( &CACHE_SparseBuffer.agent.latch); + return &CACHE_SparseBuffer; + } + io->volBlk = poolBlk; + SET_DEBUG_ID(*io, 20); + return ZFS_ReadPoolBlk(genMsg, io); + } + else if (fileBlk >= stInfo->nextBlk) + { + ASSERT_XLATCH(&beast->ROOTbeastLatch); + zASSERT(io->allocNumBlks >= 1); + zASSERT(!(beast->ROOTvolume->VOLenabledAttributes & zATTR_READONLY)); + + if ((io->allocNumBlks > 1) && + (io->flags & ALLOC_NUM_BLOCKS_IS_OPTIONAL)) + { + beast->bstState |= BST_STATE_TRUNCATE_CLOSE; + } + xaction = BeginXLocal(beast->vol.volume, + BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + allocAhead = fileBlk - stInfo->nextBlk; + if (allocAhead > ALLOC_AHEAD_SPARSE) + { + allocRemain = io->allocNumBlks; + tmpBlk = fileBlk; + if (updateSparse(genMsg, xaction, stInfo, fileBlk, &poolBlk, + allocAhead, allocRemain, io->flags) != zOK) + { + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + return NULL; + } + zASSERT(stInfo->nextBlk > tmpBlk); + io->volBlk = poolBlk; + allocRemain -= (stInfo->nextBlk - tmpBlk); + while (allocRemain) + { + if ((stInfo->nextBlk > fileBlk) && + (io->flags & (ALLOC_NUM_BLOCKS_IS_OPTIONAL | + ALLOC_BLOCKS_CONTIGUOUS))) + { + break; + } + tmpBlk = stInfo->nextBlk; + if (extendFileMap(genMsg, xaction, stInfo, &poolBlk, + allocRemain, io->flags) != zOK) + { + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + return NULL; + } + zASSERT(stInfo->nextBlk > tmpBlk); + allocRemain -= (stInfo->nextBlk - tmpBlk); + if ((io->flags & ALLOC_NO_ZERO_FILL) == 0) + { + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + tmpBlk, poolBlk, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if 
(buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + CACHE_DIRTY_RELEASE(buf); + } + } + } + else + { + while (allocAhead) + { + tmpBlk = stInfo->nextBlk; + if (extendFileMap(genMsg, xaction, stInfo, &poolBlk, + allocAhead, io->flags) != zOK) + { + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + return NULL; + } + zASSERT(stInfo->nextBlk > tmpBlk); + allocAhead -= (stInfo->nextBlk - tmpBlk); + if ((io->flags & ALLOC_NO_ZERO_FILL) == 0) + { + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + tmpBlk, poolBlk, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + CACHE_DIRTY_RELEASE(buf); + } + } + /** Write the requested fileblk **/ + zASSERT(fileBlk == stInfo->nextBlk); + tmpBlk = stInfo->nextBlk; + allocRemain = io->allocNumBlks; + if (extendFileMap(genMsg, xaction, stInfo, &poolBlk, + allocRemain, io->flags) != zOK) + { + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + return NULL; + } + zASSERT(stInfo->nextBlk > tmpBlk); + io->volBlk = poolBlk; + allocRemain -= (stInfo->nextBlk - tmpBlk); + while (allocRemain) + { + if ((stInfo->nextBlk > fileBlk) && + (io->flags & (ALLOC_NUM_BLOCKS_IS_OPTIONAL | + ALLOC_BLOCKS_CONTIGUOUS))) + { + break; + } + tmpBlk = stInfo->nextBlk; + if (extendFileMap(genMsg, xaction, stInfo, &poolBlk, + allocRemain, io->flags) != zOK) + { + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + return NULL; + } + zASSERT(stInfo->nextBlk > tmpBlk); + allocRemain 
-= (stInfo->nextBlk - tmpBlk); + if ((io->flags & ALLOC_NO_ZERO_FILL) == 0) + { + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + tmpBlk, poolBlk, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + CACHE_DIRTY_RELEASE(buf); + } + } + } + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + EndXlocal(xaction); + return NULL; + } + if (io->flags & ALLOC_BLOCKS_CONTIGUOUS) + { + if ((io->fileBlk + io->allocNumBlks) != stInfo->nextBlk) + { + /* This is the case where we have only partially allocated + * the requested contiguous blocks + */ + if ((io->flags & ALLOC_NO_ZERO_FILL) == 0) + { + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + io->fileBlk, io->volBlk, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + CACHE_DIRTY_RELEASE(buf); + } + SetErrno(genMsg, zERR_CONTIGUOUS_SPACE); + EndXlocal(xaction); + return NULL; + } + } + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + io->fileBlk, io->volBlk, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if ((io->mode != CACHE_WRITE) && + ((io->flags & ALLOC_NO_ZERO_FILL) == 0)) + { + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + } + EndXlocal(xaction); + return buf; + } + else + { + 
ASSERT_XLATCH(&beast->ROOTbeastLatch); + zASSERT(!(beast->ROOTvolume->VOLenabledAttributes & zATTR_READONLY)); + if (findBlkInFileMap(genMsg, stInfo, io->fileBlk, &poolBlk) != zOK) + { + return NULL; + } + if (poolBlk != 0) + { + io->volBlk = poolBlk; + SET_DEBUG_ID(*io, 21); + return ZFS_ReadPoolBlk(genMsg, io); + } + else + { + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + if (updateSparse(genMsg, xaction, stInfo, fileBlk, &poolBlk, + 0, 1, io->flags) != zOK) + { + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + return NULL; + } + io->volBlk = poolBlk; + buf = cacheAllocBufferForUserData(&beast->ROOTmycache, + io->fileBlk, io->volBlk, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if ((io->mode != CACHE_WRITE) && + ((io->flags & ALLOC_NO_ZERO_FILL) == 0)) + { + if (buf->pBuf.data == NULL) + { + buf->pBuf.data = kmap_atomic(buf->b_page, KM_USER0); + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + kunmap_atomic(buf->pBuf.data, KM_USER0); + buf->pBuf.data = NULL; + } + else + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + } + + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + EndXlocal(xaction); + CACHE_RELEASE(buf); + return NULL; + } + EndXlocal(xaction); + return buf; + } + } +} + + +/************************************************************************* + * Recovery routines + *************************************************************************/ + +STATUS redoInitRoot( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = 
ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + bzero(buffer->pBuf.data, (1 << buffer->bufSizeShift)); + node = (FmapNode_s *)buffer->pBuf.data; + + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = logRecord->u.root.fr_internalID; + node->head.fnh_zid = logRecord->u.root.fr_zid; + node->head.state = BT_ROOT | BT_LEAF; + node->head.numRecs = logRecord->u.root.numRecs; + node->head.leafLink = 0; + + memcpy(&node->extent[0], &logRecord->u.root.extent[0], + node->head.numRecs * sizeof(FmapExt_s)); + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + +STATUS undoInitRoot( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + BlockInfo_s *logBlks; + Buffer_s *buffer; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + cacheReleaseToss(buffer); + } + } + return zOK; +} + +STATUS redoFmapInsert( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + WORD index; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + 
(node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + index = node->head.numRecs - 1; + if (logRecord->u.insert.numRecs == 0) + { + node->extent[index].count += logRecord->u.insert.extent.count; + } + else if (logRecord->u.insert.numRecs == 1) + { + node->head.numRecs++; + node->extent[index + 1].poolBlk = + logRecord->u.insert.extent.poolBlk; + node->extent[index + 1].count = + logRecord->u.insert.extent.count; + } + else if (logRecord->u.insert.numRecs == 2) + { + node->head.numRecs+=2; + node->extent[index + 1].poolBlk = 0; + node->extent[index + 1].count = logRecord->u.insert.fileBlk; + node->extent[index + 2].poolBlk = + logRecord->u.insert.extent.poolBlk; + node->extent[index + 2].count = logRecord->u.insert.fileBlk + + logRecord->u.insert.extent.count; + + } + else + { + zASSERT(0); + } + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +STATUS undoFmapInsert( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + WORD index; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != 
FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + index = node->head.numRecs - 1; + if (logRecord->u.insert.numRecs == 0) + { + node->extent[index].count -= logRecord->u.insert.extent.count; + } + else if (logRecord->u.insert.numRecs == 1) + { + node->head.numRecs--; + node->extent[index].poolBlk = 0; + node->extent[index].count = 0; + } + else if (logRecord->u.insert.numRecs == 2) + { + node->head.numRecs-=2; + node->extent[index].poolBlk = 0; + node->extent[index].count = 0; + node->extent[index - 1].poolBlk = 0; + node->extent[index - 1].count = 0; + } + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +STATUS redoFmapInsertSparse( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + WORD origNumRecs; + WORD finalNumRecs; + WORD diff; + WORD index; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + 
CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + origNumRecs = logRecord->u.insertSparse.origNumRecs; + finalNumRecs = logRecord->u.insertSparse.finalNumRecs; + index = logRecord->u.insertSparse.index - 1; + + if (origNumRecs > finalNumRecs) + { + diff = origNumRecs - finalNumRecs; + memmove(&node->extent[index], &node->extent[index + diff], + sizeof(FmapExt_s) * (node->head.numRecs - (index + diff))); + while(diff) + { + node->extent[node->head.numRecs - diff].count = 0; + node->extent[node->head.numRecs - diff].poolBlk = 0; + diff--; + } + } + else if (finalNumRecs > origNumRecs) + { + diff = finalNumRecs - origNumRecs; + memmove(&node->extent[index + diff], &node->extent[index], + sizeof(FmapExt_s) * (node->head.numRecs - index)); + } + memcpy(&node->extent[index], &logRecord->u.insertSparse.finalExt[0], + 3 * sizeof(FmapExt_s)); + node->head.numRecs = finalNumRecs; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + + +STATUS undoFmapInsertSparse( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + WORD index; + WORD origNumRecs; + WORD finalNumRecs; + WORD diff; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic 
!= FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + origNumRecs = logRecord->u.insertSparse.origNumRecs; + finalNumRecs = logRecord->u.insertSparse.finalNumRecs; + index = logRecord->u.insertSparse.index - 1; + + if (origNumRecs > finalNumRecs) + { + diff = origNumRecs - finalNumRecs; + memmove(&node->extent[index + diff], &node->extent[index], + sizeof(FmapExt_s) * (node->head.numRecs - index)); + } + else if (finalNumRecs > origNumRecs) + { + diff = finalNumRecs - origNumRecs; + memmove(&node->extent[index], &node->extent[index + diff], + sizeof(FmapExt_s) * (node->head.numRecs - (index + diff))); + while(diff) + { + node->extent[node->head.numRecs - diff].count = 0; + node->extent[node->head.numRecs - diff].poolBlk = 0; + diff--; + } + } + memcpy(&node->extent[index], &logRecord->u.insertSparse.origExt[0], + 3 * sizeof(FmapExt_s)); + node->head.numRecs = origNumRecs; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + + +STATUS redoFmapGrow( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /** update child to no longer be root **/ + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + 
(node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->head.state &= ~BT_ROOT; + if (node->head.state & BT_LEAF) + node->head.magic = FMAP_BT_LEAF; + else + node->head.magic = FMAP_BT_BRANCH; + node->head.fnh_internalID = logRecord->u.grow.fg_internalID; + node->head.fnh_zid = logRecord->u.grow.fg_zid; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + + CACHE_DIRTY_RELEASE(buffer); + } + } + /** update parent as new root **/ + if (ZLOG_VALID_BLOCK(logBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + bzero(buffer->pBuf.data, (1 << buffer->bufSizeShift)); + node = (FmapNode_s *)buffer->pBuf.data; + + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = logRecord->u.grow.fg_internalID; + node->head.fnh_zid = logRecord->u.grow.fg_zid; + node->head.state = BT_ROOT; + node->head.numRecs = 2; + node->head.leafLink = INVALID_BLK_ZERO; + + memcpy(&node->extent[0], &logRecord->u.grow.extent[0], + sizeof(FmapGrow_s)); + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[1], pass); + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + +STATUS undoFmapGrow( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if 
(ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->head.state |= BT_ROOT; + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = logRecord->u.grow.fg_internalID; + node->head.fnh_zid = logRecord->u.grow.fg_zid; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + if (ZLOG_VALID_BLOCK(logBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[1].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + cacheReleaseToss(buffer); + } + } + return zOK; +} + +STATUS redoFmapSplit( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /** update child **/ + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == 
FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + if (node->head.state & BT_LEAF) + { + node->head.leafLink = logRecord->u.split.childLink; + if (logRecord->u.split.numRecs > 3) + { + /* This is the case where we are splitting a leaf that + * has a sparse block and it is in the middle as opposed + * to being on the extreme right. + * When we split a leaf on the extreme right the numRecs + * in child does not change, only 2 or 3 records are added + * to the sibling. This is because we are doing an insert + * with the split and that is why we have to do this slimy + * check of numRecs > 3. But, if the leaf is in the + * middle, then + * when we split, we copy half the data from child to + * sibling, and in that case split.numRecs is always > 3 + * and we need to fix up the child's numRecs as well as + * zero out the entries that were copied over + */ + node->head.numRecs = node->head.numRecs - + logRecord->u.split.numRecs + 1; + bzero(&node->extent[node->head.numRecs], + (sizeof(FmapExt_s) * (FMAP_MAX - node->head.numRecs))); + } + } + else + { + node->head.numRecs = node->head.numRecs - + logRecord->u.split.numRecs + 1; + bzero(&node->extent[node->head.numRecs], + (sizeof(FmapExt_s) * (FMAP_MAX - node->head.numRecs))); + } + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + /** update parent **/ + if (ZLOG_VALID_BLOCK(logBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = 
(FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + memmove(&node->extent[logRecord->u.split.pIndex + 2], + &node->extent[logRecord->u.split.pIndex + 1], + sizeof(FmapExt_s) * (node->head.numRecs - + (logRecord->u.split.pIndex + 1))); + memcpy(&node->extent[logRecord->u.split.pIndex], + &logRecord->u.split.parentExt[0], 2 * sizeof(FmapExt_s)); + node->head.numRecs++; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[1], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + /** update sibling **/ + if (ZLOG_VALID_BLOCK(logBlks[2])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[2].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + bzero(buffer->pBuf.data, (1 << buffer->bufSizeShift)); + node = (FmapNode_s *)buffer->pBuf.data; + + node->head.magic = logRecord->u.split.childMagic; + node->head.fnh_internalID = logRecord->u.split.fs_internalID; + node->head.fnh_zid = logRecord->u.split.fs_zid; + node->head.state = logRecord->u.split.childState; + node->head.leafLink = logRecord->u.split.sibLink; + + memcpy(&node->extent[0], &logRecord->u.split.data[0], + logRecord->u.split.numRecs * sizeof(FmapExt_s)); + + node->head.numRecs = logRecord->u.split.numRecs; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[2], pass); + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + +STATUS undoFmapSplit( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; 
+ BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /** undo child **/ + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + if (node->head.state & BT_LEAF) + { + node->head.leafLink = logRecord->u.split.sibLink; + if (logRecord->u.split.numRecs > 3) + { + /* This is the case where we are undoing a split of a leaf + * that has a sparse block and it is in the middle as + * opposed to being on the extreme right. + * When we split a leaf on the extreme right the numRecs + * in child does not change, only 2 or 3 records are added + * to the sibling. 
But, if the leaf is in the middle, then + * when we split, we copy half the data from child to + * sibling, and in that case split.numRecs is always > 3 + */ + + memcpy(&node->extent[node->head.numRecs - 1], + &logRecord->u.split.data[0], + sizeof(FmapExt_s) * logRecord->u.split.numRecs); + node->head.numRecs = node->head.numRecs + + logRecord->u.split.numRecs - 1; + } + } + else + + { + memcpy(&node->extent[node->head.numRecs - 1], + &logRecord->u.split.data[0], + sizeof(FmapExt_s) * logRecord->u.split.numRecs); + node->head.numRecs = node->head.numRecs + + logRecord->u.split.numRecs - 1; + } + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + + /** undo parent **/ + if (ZLOG_VALID_BLOCK(logBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->extent[logRecord->u.split.pIndex].count = + node->extent[logRecord->u.split.pIndex + 1].count; + memmove(&node->extent[logRecord->u.split.pIndex + 1], + &node->extent[logRecord->u.split.pIndex + 2], + sizeof(FmapExt_s) * (node->head.numRecs - + (logRecord->u.split.pIndex + 2))); + node->head.numRecs--; + node->extent[node->head.numRecs].count = 0; + node->extent[node->head.numRecs].poolBlk = 0; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[1], pass); + 
CACHE_DIRTY_RELEASE(buffer); + } + } + /** undo sibling **/ + if (ZLOG_VALID_BLOCK(logBlks[2])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[2].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + cacheReleaseToss(buffer); + } + } + return zOK; +} + +STATUS redoFmapRemove( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->extent[node->head.numRecs - 1].poolBlk = + logRecord->u.remove.finalExt.poolBlk; + node->extent[node->head.numRecs - 1].count = + logRecord->u.remove.finalExt.count; + node->head.numRecs -= logRecord->u.remove.numRecs; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +STATUS undoFmapRemove( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; 
+ + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->head.numRecs += logRecord->u.remove.numRecs; + node->extent[node->head.numRecs - 1].poolBlk = + logRecord->u.remove.origExt.poolBlk; + node->extent[node->head.numRecs - 1].count = + logRecord->u.remove.origExt.count; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + + +STATUS redoFmapJoin( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /** child **/ + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + cacheReleaseToss(buffer); + } + } + /** parent **/ + if (ZLOG_VALID_BLOCK(logBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if 
((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->extent[node->head.numRecs - 1].poolBlk = 0; + node->extent[node->head.numRecs - 1].count = 0; + node->head.numRecs--; + node->extent[node->head.numRecs - 1].count = MAX_FILE_BLK; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[1], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +STATUS undoFmapJoin( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + /* child */ + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + bzero(buffer->pBuf.data, (1 << buffer->bufSizeShift)); + node = (FmapNode_s *)buffer->pBuf.data; + node->extent[0].poolBlk = logRecord->u.join.childExt.poolBlk; + node->extent[0].count = logRecord->u.join.childExt.count; + node->head.numRecs = logRecord->u.join.childNumRecs; + node->head.magic = logRecord->u.join.childMagic; + node->head.fnh_internalID = logRecord->u.join.fj_internalID; + node->head.fnh_zid = logRecord->u.join.fj_zid; + node->head.state = 
logRecord->u.join.childState; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + + /* parent */ + if (ZLOG_VALID_BLOCK(logBlks[1])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[1].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + memmove(&node->extent[node->head.numRecs - 1], + &logRecord->u.join.parentExt[0], + 2 * sizeof(FmapExt_s)); + node->head.numRecs++; + + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[1], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + +STATUS redoFmapToss( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + if (logRecord->u.toss.numRecs == 0) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + if ((node->head.magic != FMAP_BT_BRANCH) && + 
(node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buffer, &iomsg); + CACHE_RELEASE(buffer); + buffer = NULL; + return zFAILURE; + } + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->head.state |= BT_ROOT; + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = logRecord->u.toss.ft_internalID; + node->head.fnh_zid = logRecord->u.toss.ft_zid; + node->head.leafLink = INVALID_BLK_ZERO; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + else + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_WRITE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) != NULL) + { + cacheReleaseToss(buffer); + } + } + } + return zOK; +} + +STATUS undoFmapToss( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + ZfsXasRecovery_s *logBuffer, + NINT pass) +{ + FmapLog_s *logRecord; + BlockInfo_s *logBlks; + Buffer_s *buffer; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + logBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer); + logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer); + + if (ZLOG_VALID_BLOCK(logBlks[0])) + { + READBLK_IO_MSG(iomsg, pool, logBlks[0].blkNum, CACHE_UPDATE) + SET_DEBUG_ID(iomsg, 0); + if ((buffer = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buffer->pBuf.data; + if (logRecord->u.toss.numRecs == 0) + { /* Not an block delete */ + if (ZLOG_ALREADY_DONE(pool, logBuffer, node->head.lsn, pass)) + { + CACHE_RELEASE(buffer); + } + else + { + node->head.magic = logRecord->u.toss.magic; + node->head.fnh_internalID = logRecord->u.toss.ft_internalID; + node->head.fnh_zid = logRecord->u.toss.ft_zid; + node->head.state = logRecord->u.toss.state; + node->head.leafLink = INVALID_BLK_ZERO; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + 
CACHE_DIRTY_RELEASE(buffer); + } + } + else + { /* numrec of non-zero indicates this originally was + * a block delete. */ + bzero(buffer->pBuf.data, (1 << buffer->bufSizeShift)); + + node->head.magic = logRecord->u.toss.magic; + node->head.fnh_internalID = logRecord->u.toss.ft_internalID; + node->head.fnh_zid = logRecord->u.toss.ft_zid; + node->head.state = logRecord->u.toss.state; + node->head.leafLink = INVALID_BLK_ZERO; + + memcpy(&node->extent[0], &logRecord->u.toss.extent[0], + logRecord->u.toss.numRecs * sizeof(FmapExt_s)); + node->head.numRecs = logRecord->u.toss.numRecs; + ZLOG_SET_LSN(logBuffer, node->head.lsn, logBlks[0], pass); + CACHE_DIRTY_RELEASE(buffer); + } + } + return zOK; +} + + +/************************************************************************* + * Shred Data + *************************************************************************/ + +BYTE DataShredPatterns[MAX_DATA_SHRED_PATTERNS] = +{ + 0x35, /* 00110101 */ + 0xCA, /* 11001010 */ + 0x97, /* 10010111 */ + 0x68, /* 01101000 */ + 0x55, /* 01010101 */ + 0xAA, /* 10101010 */ + 0xFF /* 11111111 */ +}; + +struct page *DataShredPatPage[MAX_DATA_SHRED_PATTERNS]; + +NINT DataShredChunk = DATA_SHREDDING_CHUNK; + +STATUS DataShredInit( + LONG patCount) +{ + NINT i; + BYTE *data; + + if ( patCount > MAX_DATA_SHRED_PATTERNS) + { + return zFAILURE; + } + for (i = 0; i < patCount; i++) + { + if (DataShredPatPage[i] == NULL) + { + MPKNSS_UNLOCK(); + DataShredPatPage[i] = alloc_page(GFP_USER); + MPKNSS_LOCK(); + if (DataShredPatPage[i] == NULL) + { + return zFAILURE; + } + data = kmap_atomic(DataShredPatPage[i], KM_USER0); + memset( data, DataShredPatterns[i], 1 << PAGE_SHIFT); + kunmap_atomic(DataShredPatPage[i], KM_USER0); + } + } + return zOK; +} + +void DataShredCleanup() +{ + NINT i; + for (i = 0; i < MAX_DATA_SHRED_PATTERNS; i++) + { + if (DataShredPatPage[i] != NULL) + { + MPKNSS_UNLOCK(); + __free_page(DataShredPatPage[i]); + MPKNSS_LOCK(); + } + } +} + + +void DataShred( + ZfsVolume_s 
*zfsVol, + Extent_s *extent) +{ + zConPool_s *phypool = zfsVol->pool->storagepool->phypool; + Blknum_t poolBlk; + Blknum_t totalLength; + Blknum_t length; + NINT i; + + zASSERT(phypool != NULL); + zASSERT(zfsVol->ZFSVOLshredCount > 0); + zASSERT(zfsVol->ZFSVOLshredCount <= MAX_DATA_SHRED_PATTERNS); + + if (phypool == NULL) + { + return ; + } + + for (i = 0; i < zfsVol->ZFSVOLshredCount; i++) + { + poolBlk = extent->poolBlkNum; + totalLength = extent->lengthOfExtent; + length = DataShredChunk; + + while (totalLength > 0) + { + if (length > totalLength) + { + length = totalLength; + } + + zlssBioIOPages(WRITE, phypool->ZCP_dev, DataShredPatPage[i], + poolBlk, length, phypool->pol.poolblocksize); + + totalLength -= length; + poolBlk += length; + PERIODIC_YIELD(); + } + /* Can block and release NSS spinlock */ + ZLSSDoBarrierWriteIfRequired( phypool ); + } + + return; +} + +/************************************************************************* + * Truncate File + *************************************************************************/ +NINT truncBranchNodeChild( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Buffer_s *buf, + Buffer_s *parentBuf, + Blknum_t fileBlk); + +STATUS btreeFileMapTrunc( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Buffer_s *buf, + Blknum_t fileBlk); + +NINT truncLeafNode( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Buffer_s *buf, + Buffer_s *parentBuf, + Blknum_t fileBlk) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + Fmap_s *fmap = &stInfo->fmap; + FmapNode_s *node; + FmapNode_s *parent; + Blknum_t poolBlk; + Extent_s extent; + IoMsg_s iomsg; + ZfsXaction_s *xaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + Lsn_t lsn; + SNINT periodicReleaseCount = PERIODIC_RELEASE_COUNT; + + ASSERT_MPKNSS_LOCK(); + node = (FmapNode_s *)(buf->pBuf.data); + zASSERT((node->head.magic == FMAP_BT_BRANCH) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_ROOT)); + 
+ContinueProcessingLeaf: + if (node->head.numRecs <= 1) + { + node->head.leafLink = INVALID_BLK_ZERO; + if (node->head.state & BT_ROOT) + { + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapToss_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_TOSS, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + lsn = logBuffer->ZXR_Lsn; + + logRecord->u.toss.extent[0].poolBlk = node->extent[0].poolBlk; + logRecord->u.toss.extent[0].count = node->extent[0].count; + logRecord->u.toss.numRecs = 1; + logRecord->u.toss.magic = node->head.magic; + logRecord->u.toss.state = node->head.state; + zASSERT( node->head.fnh_zid != 0 ); + logRecord->u.toss.ft_internalID = node->head.fnh_internalID; + logRecord->u.toss.ft_zid = node->head.fnh_zid; + + ZLOG_DELETE_BLOCK(xaction, logBlks[0]); + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + + node->head.lsn = lsn; + CACHE_RELEASE(buf); + + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + + fmap->root = INVALID_BLK_ZERO; + stInfo->nextBlk = fmap->dirExt[fmap->numRecs -1].count; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + EndXlocal(xaction); + return zFAILURE; + } + EndXlocal(xaction); + } + else + { + parent = (FmapNode_s *)(parentBuf->pBuf.data); + + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(2) + + sizeof(FmapJoin_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_JOIN, xaction, logBuffer, 2, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[1], parentBuf->volBlk, + parent->head.lsn, parentBuf, xaction, 1); + + lsn = logBuffer->ZXR_Lsn; + + memcpy(&logRecord->u.join.parentExt[0], + 
&parent->extent[parent->head.numRecs -2], + 2 * sizeof(FmapExt_s)); + + logRecord->u.join.childExt.poolBlk = node->extent[0].poolBlk; + logRecord->u.join.childExt.count = node->extent[0].count; + logRecord->u.join.childState = node->head.state; + logRecord->u.join.childMagic = node->head.magic; + logRecord->u.join.childNumRecs = node->head.numRecs; + zASSERT( node->head.fnh_zid != 0 ); + logRecord->u.join.fj_internalID = node->head.fnh_internalID; + logRecord->u.join.fj_zid = node->head.fnh_zid; + + ZLOG_DELETE_BLOCK(xaction, logBlks[0]); + ZLOG_BIND(xaction, parentBuf); + + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + + node->head.lsn = lsn; + parent->head.lsn = lsn; + + parent->extent[parent->head.numRecs -1].count = 0; + parent->extent[parent->head.numRecs -1].poolBlk = 0; + parent->head.numRecs--; + + if (parent->head.numRecs > 1) + { + poolBlk = parent->extent[parent->head.numRecs -1].poolBlk; + parent->extent[parent->head.numRecs -1].count = MAX_FILE_BLK; + parentBuf->state |= CACHE_DIRTY; + + CACHE_RELEASE(buf); + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + buf = NULL; + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 22); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(parentBuf); + EndXlocal(xaction); + return TRUNC_ERROR; + } + node = (FmapNode_s *)(buf->pBuf.data); + zASSERT((node->head.magic == FMAP_BT_ROOT) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_BRANCH)); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + CACHE_RELEASE(parentBuf); + EndXlocal(xaction); + return TRUNC_ERROR; + } + + EndXlocal(xaction); + goto ContinueProcessingLeaf; + } + else + { + CACHE_DIRTY_RELEASE(parentBuf); 
+ parentBuf = NULL; + + CACHE_RELEASE(buf); + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + EndXlocal(xaction); + return TRUNC_CONTINUE; + } + } + } + else if (node->extent[node->head.numRecs -1].count <= fileBlk) + { + if (node->extent[node->head.numRecs -1].poolBlk == 0) + { + /** The last entry is a sparse entry -- remove it and + ** further truncate the file **/ + fileBlk = node->extent[node->head.numRecs -2].count; + node->extent[node->head.numRecs -1].count = 0; + node->head.numRecs--; + goto ContinueProcessingLeaf; + } + if (parentBuf) + { + CACHE_RELEASE(parentBuf); + } + node->head.leafLink = INVALID_BLK_ZERO; + + CACHE_DIRTY_RELEASE(buf); + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + stInfo->nextBlk = fileBlk; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + EndXlocal(xaction); + return zFAILURE; + } + EndXlocal(xaction); + } + else if (node->extent[node->head.numRecs -2].count >= fileBlk) + { + poolBlk = node->extent[node->head.numRecs -1].poolBlk; + + if (DATA_SHREDDING_ENABLED(beast->vol.zfsVol) && (poolBlk != 0)) + { + extent.poolBlkNum = poolBlk; + extent.lengthOfExtent = node->extent[node->head.numRecs -1].count - + node->extent[node->head.numRecs -2].count; + + DataShred(beast->vol.zfsVol, &extent); + } + + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapRemove_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_REMOVE, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + lsn = logBuffer->ZXR_Lsn; + + logRecord->u.remove.origExt.poolBlk = + node->extent[node->head.numRecs -1].poolBlk; + logRecord->u.remove.origExt.count = + node->extent[node->head.numRecs -1].count; + logRecord->u.remove.finalExt.poolBlk = 0; 
+ logRecord->u.remove.finalExt.count = 0; + logRecord->u.remove.numRecs = 1; + + ZLOG_BIND(xaction, buf); + + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + node->head.lsn = lsn; + + if (poolBlk == 0) + { + /* Hole in file.. nothing to free */ + node->extent[node->head.numRecs -1].count = 0; + node->head.numRecs--; + } + else + { + extent.poolBlkNum = poolBlk; + extent.lengthOfExtent = node->extent[node->head.numRecs -1].count - + node->extent[node->head.numRecs -2].count; + + node->extent[node->head.numRecs -1].count = 0; + node->extent[node->head.numRecs -1].poolBlk = 0; + node->head.numRecs--; + + zASSERT(stInfo->fmapDataBlks >= extent.lengthOfExtent); + VOL_AdjustUsedUserSpace(&xaction->xaction, beast, + -((SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift)); + DIRQ_AdjustUsedDirSpace(&xaction->xaction, + beast->ROOTvolume, beast, + -((SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift)); + stInfo->fmapDataBlks-=extent.lengthOfExtent; + stInfo->nextBlk = node->extent[node->head.numRecs -1].count; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + /** FixFixFix6 Free should be done before logging or changing + ** metadata + **/ + if (zfsFreeExtent(genMsg, beast->vol.zfsVol, + &extent, xaction) != zOK) + { + if (parentBuf) CACHE_RELEASE(parentBuf); + node->head.leafLink = INVALID_BLK_ZERO; + CACHE_DIRTY_RELEASE(buf); + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + aprintf(LRED, "Got an error from free tree. 
Potentially lost %d blocks\n", extent.lengthOfExtent); + return zFAILURE; + } + } + buf->state |= CACHE_DIRTY; + EndXlocal(xaction); + if (--periodicReleaseCount <= 0) + { + Blknum_t bufBlk = buf->volBlk; + periodicReleaseCount = PERIODIC_RELEASE_COUNT; + + CACHE_RELEASE(buf); + buf = NULL; + if (parentBuf) + { + Blknum_t pBufBlk = parentBuf->volBlk; + CACHE_RELEASE(parentBuf); + READBLK_IO_MSG(iomsg, beast, pBufBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 24); + parentBuf = NULL; + if ((parentBuf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { +// CACHE_RELEASE(buf); + return zFAILURE; + } + /* I am just releasing and re reading it back it, so not + * checking the magic numbers + */ + } + Yield(); + READBLK_IO_MSG(iomsg, beast, bufBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 23); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + if (parentBuf) + { + CACHE_RELEASE(parentBuf); + } + return zFAILURE; + } + node = (FmapNode_s *)(buf->pBuf.data); + } + goto ContinueProcessingLeaf; + } + else + { + if (node->extent[node->head.numRecs -1].poolBlk == 0) + { + poolBlk = 0; + fileBlk = node->extent[node->head.numRecs -2].count; + } + else + { + poolBlk = node->extent[node->head.numRecs -1].poolBlk + + (fileBlk - node->extent[node->head.numRecs -2].count); + } + + if (DATA_SHREDDING_ENABLED(beast->vol.zfsVol) && (poolBlk != 0)) + { + extent.poolBlkNum = poolBlk; + extent.lengthOfExtent = + node->extent[node->head.numRecs -1].count - fileBlk; + + DataShred(beast->vol.zfsVol, &extent); + } + + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapRemove_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_REMOVE, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + lsn = logBuffer->ZXR_Lsn; + + logRecord->u.remove.origExt.poolBlk = + node->extent[node->head.numRecs -1].poolBlk; + logRecord->u.remove.origExt.count = 
+ node->extent[node->head.numRecs -1].count; + if (poolBlk == 0) + { + logRecord->u.remove.finalExt.poolBlk = 0; + logRecord->u.remove.finalExt.count = 0; + logRecord->u.remove.numRecs = 1; + } + else + { + logRecord->u.remove.finalExt.poolBlk = + node->extent[node->head.numRecs -1].poolBlk; + logRecord->u.remove.finalExt.count = fileBlk; + logRecord->u.remove.numRecs = 0; + } + + ZLOG_BIND(xaction, buf); + + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + + node->head.lsn = lsn; + if (poolBlk == 0) + { + /* Hole in file.. nothing to free */ + node->extent[node->head.numRecs -1].count = 0; + node->head.numRecs--; + } + else + { + extent.poolBlkNum = poolBlk; + extent.lengthOfExtent = + node->extent[node->head.numRecs -1].count - fileBlk; + node->extent[node->head.numRecs -1].count = fileBlk; + + zASSERT(stInfo->fmapDataBlks >= extent.lengthOfExtent); + VOL_AdjustUsedUserSpace(&xaction->xaction, beast, + -((SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift)); + DIRQ_AdjustUsedDirSpace(&xaction->xaction, + beast->ROOTvolume, beast, + -((SQUAD)extent.lengthOfExtent << beast->ROOTblkSizeShift)); + stInfo->fmapDataBlks-=extent.lengthOfExtent; + stInfo->nextBlk = node->extent[node->head.numRecs -1].count; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + /** FixFixFix6 Free should be done before logging or changing + ** metadata + **/ + if (zfsFreeExtent(genMsg, beast->vol.zfsVol, + &extent, xaction) != zOK) + { + if (parentBuf) CACHE_RELEASE(parentBuf); + node->head.leafLink = INVALID_BLK_ZERO; + CACHE_DIRTY_RELEASE(buf); + COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + EndXlocal(xaction); + aprintf(LRED, "Got an error from free tree. 
Potentially lost %d blocks\n", extent.lengthOfExtent); + return TRUNC_ERROR; + } + } + buf->state |= CACHE_DIRTY; + EndXlocal(xaction); + goto ContinueProcessingLeaf; + } + return TRUNC_DONE; +} + + +NINT truncBranchNodeChild( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Buffer_s *buf, + Buffer_s *parentBuf, + Blknum_t fileBlk) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + FmapNode_s *node; + FmapNode_s *parent; + Blknum_t poolBlk; +// Extent_s extent; + IoMsg_s iomsg; + ZfsXaction_s *xaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + Lsn_t lsn; + + ASSERT_MPKNSS_LOCK(); + +ContinueDownTheTree: + node = (FmapNode_s *)(buf->pBuf.data); + zASSERT((node->head.magic == FMAP_BT_ROOT) || + (node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_BRANCH)); + parent = (FmapNode_s *)(parentBuf->pBuf.data); + zASSERT((parent->head.magic == FMAP_BT_ROOT) || + (parent->head.magic == FMAP_BT_LEAF) || + (parent->head.magic == FMAP_BT_BRANCH)); + + if (node->head.state & BT_LEAF) + { + return truncLeafNode(genMsg, beast, buf, parentBuf, fileBlk); + } + else if (node->head.numRecs > 1) + { + poolBlk = node->extent[node->head.numRecs -1].poolBlk; + CACHE_RELEASE(parentBuf); + parentBuf = buf; + buf = NULL; + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 25); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(parentBuf); + return TRUNC_ERROR; + } + node = (FmapNode_s *)(buf->pBuf.data); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + CACHE_RELEASE(parentBuf); + buf = NULL; + parentBuf = NULL; + return TRUNC_ERROR; + } + goto ContinueDownTheTree; + } + else + { + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, 
ZLOG_BLOCK_INFO_SIZE(2) + + sizeof(FmapJoin_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_JOIN, xaction, logBuffer, 2, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[1], parentBuf->volBlk, + parent->head.lsn, parentBuf, xaction, 1); + + lsn = logBuffer->ZXR_Lsn; + + memcpy(&logRecord->u.join.parentExt[0], + &parent->extent[parent->head.numRecs -2], + 2 * sizeof(FmapExt_s)); + + logRecord->u.join.childExt.poolBlk = node->extent[0].poolBlk; + logRecord->u.join.childExt.count = node->extent[0].count; + logRecord->u.join.childState = node->head.state; + logRecord->u.join.childMagic = node->head.magic; + logRecord->u.join.childNumRecs = node->head.numRecs; + zASSERT( node->head.fnh_zid != 0 ); + logRecord->u.join.fj_internalID = node->head.fnh_internalID; + logRecord->u.join.fj_zid = node->head.fnh_zid; + + ZLOG_DELETE_BLOCK(xaction, logBlks[0]); + ZLOG_BIND(xaction, parentBuf); + + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + + node->head.lsn = lsn; + parent->head.lsn = lsn; + + parent->extent[parent->head.numRecs -1].count = 0; + parent->extent[parent->head.numRecs -1].poolBlk = 0; + parent->head.numRecs--; + + if (parent->head.numRecs > 1) + { + poolBlk = parent->extent[parent->head.numRecs -1].poolBlk; + parent->extent[parent->head.numRecs -1].count = MAX_FILE_BLK; + parentBuf->state |= CACHE_DIRTY; + + CACHE_RELEASE(buf); + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + buf = NULL; + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 26); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(parentBuf); + EndXlocal(xaction); + return TRUNC_ERROR; + } + node = (FmapNode_s *)(buf->pBuf.data); + if ((node->head.magic != FMAP_BT_BRANCH) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_ROOT)) + { + SetErrno(genMsg, 
zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + CACHE_RELEASE(parentBuf); + buf = NULL; + parentBuf = NULL; + EndXlocal(xaction); + return TRUNC_ERROR; + } + EndXlocal(xaction); + goto ContinueDownTheTree; + } + else + { + CACHE_DIRTY_RELEASE(parentBuf); + parentBuf = NULL; + + CACHE_RELEASE(buf); + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + + EndXlocal(xaction); + return TRUNC_CONTINUE; + } + } + return TRUNC_DONE; +} + + +STATUS btreeFileMapTrunc( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Buffer_s *buf, + Blknum_t fileBlk) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + Fmap_s *fmap = &stInfo->fmap; +// Extent_s extent; + Buffer_s *parentBuf = NULL; + FmapNode_s *node = (FmapNode_s *)buf->pBuf.data; + Blknum_t poolBlk; + IoMsg_s iomsg; + NINT ccode; + ZfsXaction_s *xaction; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *logBlks; + FmapLog_s *logRecord; + Lsn_t lsn; + + ASSERT_MPKNSS_LOCK(); + +ContinueTraversingTheTree: + zASSERT(node->head.magic == FMAP_BT_ROOT); + zASSERT(node->head.state & BT_ROOT); + + if (node->head.state & BT_LEAF) + { + ccode = truncLeafNode(genMsg, beast, buf, parentBuf, fileBlk); + if (ccode == TRUNC_CONTINUE) + { + buf = NULL; + READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 27); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + parentBuf = NULL; + node = (FmapNode_s *)buf->pBuf.data; + if (node->head.magic != FMAP_BT_ROOT) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + return zFAILURE; + } + goto ContinueTraversingTheTree; + } + else if (ccode == TRUNC_DONE) + { + return zOK; + } + else + { + return zFAILURE; + } + } + else + { + if (node->head.numRecs == 1) + { + /** this is the end .. 
All my leaves are gone **/ + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapToss_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_TOSS, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + lsn = logBuffer->ZXR_Lsn; + + logRecord->u.toss.extent[0].poolBlk = node->extent[0].poolBlk; + logRecord->u.toss.extent[0].count = node->extent[0].count; + logRecord->u.toss.numRecs = 1; + logRecord->u.toss.magic = node->head.magic; + logRecord->u.toss.state = node->head.state; + zASSERT( node->head.fnh_zid != 0 ); + logRecord->u.toss.ft_internalID = node->head.fnh_internalID; + logRecord->u.toss.ft_zid = node->head.fnh_zid; + + ZLOG_DELETE_BLOCK(xaction, logBlks[0]); + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + + node->head.lsn = lsn; + fmap->root = INVALID_BLK_ZERO; + stInfo->nextBlk = fmap->dirExt[fmap->numRecs -1].count; + + CACHE_RELEASE(buf); + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + if (parentBuf) + { + CACHE_RELEASE(parentBuf); + } + EndXlocal(xaction); + return zFAILURE; + } + EndXlocal(xaction); + } + else if (node->head.numRecs == 2) + { + /** Root Blk has only 1 entry (other one is std first entry), so we + ** can shrink the B-tree **/ + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapToss_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_TOSS, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + lsn = logBuffer->ZXR_Lsn; + + logRecord->u.toss.extent[0].poolBlk = node->extent[0].poolBlk; + logRecord->u.toss.extent[0].count = node->extent[0].count; + 
logRecord->u.toss.extent[1].poolBlk = node->extent[1].poolBlk; + logRecord->u.toss.extent[1].count = node->extent[1].count; + logRecord->u.toss.numRecs = 2; + logRecord->u.toss.magic = node->head.magic; + logRecord->u.toss.state = node->head.state; + zASSERT( node->head.fnh_zid != 0 ); + logRecord->u.toss.ft_internalID = node->head.fnh_internalID; + logRecord->u.toss.ft_zid = node->head.fnh_zid; + + ZLOG_DELETE_BLOCK(xaction, logBlks[0]); + ZLOG_TEST_REDO(xaction); + ZLOG_ReleaseRecord(xaction); + + node->head.lsn = lsn; + fmap->root = node->extent[node->head.numRecs -1].poolBlk; + + CACHE_RELEASE(buf); + zASSERT(stInfo->fmapTreeBlks > 0); + stInfo->fmapTreeBlks--; + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + EndXlocal(xaction); + return zFAILURE; + } + + buf = NULL; + READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 28); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + EndXlocal(xaction); + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + if ((node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_BRANCH)) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + EndXlocal(xaction); + return zFAILURE; + } + + ZLOG_ObtainRecord(xaction, ZLOG_BLOCK_INFO_SIZE(1) + + sizeof(FmapToss_s)); + ZLOG_INIT_LOG_RECORD(XFUNC_FMAP_TOSS, xaction, logBuffer, 1, + logBlks, logRecord); + ZLOG_ASSIGN_BLOCK_INFO(logBlks[0], buf->volBlk, + node->head.lsn, buf, xaction, 0); + + lsn = logBuffer->ZXR_Lsn; + + logRecord->u.toss.numRecs = 0; + logRecord->u.toss.magic = node->head.magic; + logRecord->u.toss.state = node->head.state; + zASSERT( node->head.fnh_zid != 0 ); + logRecord->u.toss.ft_internalID = node->head.fnh_internalID; + logRecord->u.toss.ft_zid = node->head.fnh_zid; + + ZLOG_TEST_REDO(xaction); + ZLOG_BIND(xaction, buf); + + ZLOG_ReleaseRecord(xaction); + + 
node->head.lsn = lsn; + node->head.state |= BT_ROOT; + node->head.magic = FMAP_BT_ROOT; // Waited because log record + // hold the old MAGIC/type of + // this node. + + buf->state |= CACHE_DIRTY; + EndXlocal(xaction); + goto ContinueTraversingTheTree; + } + else + { + poolBlk = node->extent[node->head.numRecs -1].poolBlk; + parentBuf = buf; + buf = NULL; + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 29); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(parentBuf); + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + if ((node->head.magic != FMAP_BT_ROOT) && + (node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_BRANCH)) + { + CACHE_RELEASE(parentBuf); + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + return zFAILURE; + } + ccode = truncBranchNodeChild(genMsg,beast,buf,parentBuf,fileBlk); + + if (ccode == TRUNC_CONTINUE) + { + buf = NULL; + READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 30); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + parentBuf = NULL; + node = (FmapNode_s *)buf->pBuf.data; + if (node->head.magic != FMAP_BT_ROOT) + { + SetErrno(genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + return zFAILURE; + } + goto ContinueTraversingTheTree; + } + else if (ccode == TRUNC_DONE) + { + return zOK; + } + else + { + return zFAILURE; + } + } + } + return zOK; +} + + +STATUS directFileMapTrunc( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Blknum_t fileBlk) +{ + STATUS status = zOK; + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + Fmap_s *fmap = &stInfo->fmap; + Extent_s extent; + ZfsXaction_s *xaction; + Blknum_t length; + SQUAD totalLength = 0; + NINT numRecs; + + ASSERT_MPKNSS_LOCK(); + if (fmap->numRecs <= 1) + { + return status; + } + + if 
(DATA_SHREDDING_ENABLED(beast->vol.zfsVol)) + { + numRecs = fmap->numRecs; + while(fmap->dirExt[numRecs -1].count > fileBlk) + { + if (fmap->dirExt[numRecs -2].count >= fileBlk) + { + extent.poolBlkNum = fmap->dirExt[numRecs - 1].poolBlk; + extent.lengthOfExtent = fmap->dirExt[numRecs -1].count - + fmap->dirExt[numRecs -2].count; + } + else + { + extent.poolBlkNum = fmap->dirExt[numRecs - 1].poolBlk + + (fileBlk - fmap->dirExt[numRecs -2].count); + extent.lengthOfExtent = fmap->dirExt[numRecs -1].count - + fileBlk; + } + DataShred(beast->vol.zfsVol, &extent); + numRecs--; + zASSERT(numRecs != 0); + } + } + + xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE); + + while(fmap->dirExt[fmap->numRecs -1].count > fileBlk) + { + if (fmap->dirExt[fmap->numRecs -2].count >= fileBlk) + { + extent.poolBlkNum = fmap->dirExt[fmap->numRecs - 1].poolBlk; + extent.lengthOfExtent = fmap->dirExt[fmap->numRecs -1].count - + fmap->dirExt[fmap->numRecs -2].count; + + zASSERT(stInfo->fmapDataBlks >= extent.lengthOfExtent); + length = extent.lengthOfExtent; + if (zfsFreeExtent(genMsg, beast->vol.zfsVol, + &extent, xaction) != zOK) + { + status = zFAILURE; + break; + } + + fmap->dirExt[fmap->numRecs -1].count = 0; + fmap->dirExt[fmap->numRecs -1].poolBlk = 0; + fmap->numRecs--; + } + else + { + extent.poolBlkNum = fmap->dirExt[fmap->numRecs - 1].poolBlk + + (fileBlk - fmap->dirExt[fmap->numRecs -2].count); + extent.lengthOfExtent = fmap->dirExt[fmap->numRecs -1].count - + fileBlk; + + zASSERT(stInfo->fmapDataBlks >= extent.lengthOfExtent); + length = extent.lengthOfExtent; + if (zfsFreeExtent(genMsg, beast->vol.zfsVol, + &extent, xaction) != zOK) + { + status = zFAILURE; + break; + } + + fmap->dirExt[fmap->numRecs -1].count = fileBlk; + } + totalLength+=length; + stInfo->fmapDataBlks-=length; + stInfo->nextBlk = fmap->dirExt[fmap->numRecs -1].count; + } + VOL_AdjustUsedUserSpace(&xaction->xaction, beast, + -(totalLength << beast->ROOTblkSizeShift)); + 
DIRQ_AdjustUsedDirSpace(&xaction->xaction, beast->ROOTvolume, beast, + -(totalLength << beast->ROOTblkSizeShift)); + + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + status = COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction); + + EndXlocal(xaction); + + return status; +} + +/**************************************************************************** + * Copy the file map to the snap shot file + *****************************************************************************/ +STATUS CopyTheFileMapToSnapBeast( + GeneralMsg_s *genMsg, + ZfsXaction_s *xaction, + RootBeast_s *beast) +{ + RootBeast_s *snapBeast; + + ASSERT_MPKNSS_LOCK(); + + snapBeast = beast->fileSnapshotBeast; + zASSERT(snapBeast != NULL); + ASSERT_XLATCH(&snapBeast->ROOTbeastLatch); + + snapBeast->storage.zfsInfo->nextBlk = + beast->storage.zfsInfo->nextBlk; + snapBeast->storage.zfsInfo->fmapDataBlks = + beast->storage.zfsInfo->fmapDataBlks; + snapBeast->storage.zfsInfo->fmapTreeBlks = + beast->storage.zfsInfo->fmapTreeBlks; + memcpy(&snapBeast->storage.zfsInfo->fmap, + &beast->storage.zfsInfo->fmap, sizeof(Fmap_s)); + + beast->storage.zfsInfo->nextBlk = 0; + beast->storage.zfsInfo->fmapDataBlks = 0; + beast->storage.zfsInfo->fmapTreeBlks = 0; + bzero(&beast->storage.zfsInfo->fmap, sizeof(Fmap_s)); + + COMN_MARK_BEAST_XLOCAL(snapBeast, &xaction->xaction); + if (COMN_ForceBeastWrite(genMsg, snapBeast, &xaction->xaction) != zOK) + { + return zFAILURE; + } + COMN_MARK_BEAST_XLOCAL(beast, &xaction->xaction); + if (COMN_ForceBeastWrite(genMsg, beast, &xaction->xaction) != zOK) + { + snapBeast->storage.zfsInfo->nextBlk = 0; + snapBeast->storage.zfsInfo->fmapDataBlks = 0; + snapBeast->storage.zfsInfo->fmapTreeBlks = 0; + bzero(&snapBeast->storage.zfsInfo->fmap, sizeof(Fmap_s)); + COMN_MARK_BEAST_XLOCAL(snapBeast, &xaction->xaction); + COMN_ForceBeastWrite(genMsg, snapBeast, &xaction->xaction); + return zFAILURE; + } + return zOK; +} + + 
+/****************************************************************************
+ * Set up beginning of file truncation
+ *
+ * Entry point for truncating a file's block map at fileBlk.
+ *
+ * genMsg  - request context; genMsg->flags is tested for COPY_FMAP_TO_SNAP
+ *           and errors are reported through SetErrno(genMsg, ...)
+ * beast   - file being truncated; the MPKNSS lock and beast->ROOTbeastLatch
+ *           are asserted on entry, and the volume must not be read-only
+ * fileBlk - first file block that is no longer wanted
+ * count   - asserted to be -1; not used otherwise
+ *
+ * When COPY_FMAP_TO_SNAP is set and the volume has zATTR_COW enabled, only
+ * a truncate to block 0 is accepted: used-space accounting is reduced by
+ * the file's data blocks and the whole map is moved to the snapshot beast.
+ * Any other fileBlk fails with zERR_ACCESS_DENIED.
+ *
+ * Otherwise the B-tree map (if fmap->root is set) is truncated first and,
+ * once no tree root remains, the direct extents are truncated.
+ *
+ * Returns zOK on success or when there is nothing to truncate, zFAILURE
+ * otherwise.
+ *****************************************************************************/
+STATUS ZFSVOL_VOL_truncateFile (
+	GeneralMsg_s *genMsg,
+	RootBeast_s *beast,
+	Blknum_t fileBlk,
+	Blkcnt_t count)
+{
+	ZFSStorageInfo_s *stInfo;
+	Fmap_s *fmap;
+	Buffer_s *buf = NULL;
+	IoMsg_s iomsg;
+	FmapNode_s *node;
+	ZfsXaction_s *xaction;
+	STATUS status;
+
+	ASSERT_MPKNSS_LOCK();
+	stInfo = beast->storage.zfsInfo;
+	fmap = &stInfo->fmap;
+
+	zASSERT(count == -1);	/* only count == -1 is expected here */
+	ASSERT_XLATCH( &beast->ROOTbeastLatch);
+	zASSERT(!(beast->ROOTvolume->VOLenabledAttributes & zATTR_READONLY));
+
+	if ((genMsg->flags & COPY_FMAP_TO_SNAP) &&
+		(beast->ROOTvolume->VOLenabledAttributes & zATTR_COW))
+	{
+		/* If snapshotting is enabled on the volume and the file is being
+		 * truncated, we just copy the filemap to the snapshot beast if
+		 * the file is being truncated to zero. Else, the high level routine
+		 * will copy individual blocks to the snapshot beast, before truncating
+		 * the original file
+		 */
+		if (fileBlk == 0)
+		{
+			/*
+			 * Adjust used space now because the truncated blocks are being
+			 * moved over to the snapshot beast, not actually being released.
+			 * To keep the users/directory used block counts correct they
+			 * should be decremented now.
+			 */
+			xaction = BeginXLocal(beast->vol.volume,BXL_DEFAULT|X_CF_OK_TO_THROTTLE);
+			VOL_AdjustUsedUserSpace(&xaction->xaction, beast,
+				-((SQUAD)stInfo->fmapDataBlks << beast->ROOTblkSizeShift));
+			DIRQ_AdjustUsedDirSpace(&xaction->xaction, beast->ROOTvolume, beast,
+				-((SQUAD)stInfo->fmapDataBlks << beast->ROOTblkSizeShift));
+			status = CopyTheFileMapToSnapBeast(genMsg, xaction, beast);
+			EndXlocal(xaction);	/* ended on both success and failure */
+			return status;
+		}
+		else
+		{
+			SetErrno(genMsg, zERR_ACCESS_DENIED);
+			return zFAILURE;
+		}
+	}
+
+	if (fileBlk >= stInfo->nextBlk)
+	{
+		/** the fileBlk to be truncated is beyond EOF **/
+		/** Should this be an error? -- I don't think so -- Vandana **/
+		return zOK;
+	}
+
+	if ((fmap->numRecs <= 1) && (fmap->root == INVALID_BLK_ZERO))
+	{
+		/** File is already zero, and has no blocks associated with it **/
+		/** Should this be an error? -- I don't think so -- Vandana **/
+		return zOK;
+	}
+
+	if (fmap->root)	/* a B-tree map exists: truncate it first */
+	{
+		READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_UPDATE);
+		SET_DEBUG_ID(iomsg, 31);
+		if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL)
+		{
+			return zFAILURE;
+		}
+		node = (FmapNode_s *)buf->pBuf.data;
+		if (node->head.magic != FMAP_BT_ROOT)	/* block at fmap->root is not a root node */
+		{
+			SetErrno(genMsg, zERR_MEDIA_CORRUPTED);
+			ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg);
+			CACHE_RELEASE(buf);
+			buf = NULL;
+			return zFAILURE;
+		}
+		if (btreeFileMapTrunc(genMsg, beast, buf, fileBlk) != zOK)	/* buf is handed over; it is not released here on either outcome */
+		{
+			return zFAILURE;
+		}
+	}
+	if (fmap->root == INVALID_BLK_ZERO)	/* no tree root now: trim the direct extents */
+	{
+		if (directFileMapTrunc(genMsg, beast, fileBlk) != zOK)
+		{
+			return zFAILURE;
+		}
+	}
+	return zOK;
+}
+
+
+/************************************************************************
+ ************************************************************************/
+
+/***************************************************************************
+ *
+ * This returns an array of longs which contains the size (in blocks) of
+ * each logically allocated region of file. There is also an entry
+ * for every hole in the file. For allocated extents, the entries are
+ * positive numbers indicating how many blocks are in the extent. For
+ * holes, the entries are negative numbers, the absolute value of which
+ * indicates how many blocks make up the hole.
 ***************************************************************************/
/*
 * ZFSVOL_VOL_getExtentList
 *
 *	genMsg			- receives the errno on failure.
 *	beast			- file whose map is walked (a latch is asserted).
 *	fileBlk			- file block at which the report starts.
 *	extentListSize	- number of entries *extentList can hold.
 *	extentList		- out: signed run lengths, positive for allocated
 *					  runs and negative for holes.
 *	retNumExtents	- out: number of entries filled in.
 *	retNextBlock	- out: file block at which the walk stopped; preset to
 *					  the file's nextBlk.
 *
 * Returns zFAILURE with zERR_END_OF_FILE when fileBlk is at or past
 * nextBlk, zFAILURE when a map block cannot be read or has a bad magic,
 * and zOK otherwise.
 *
 * NOTE(review): (*extentList)[0] is written before extentListSize is ever
 * compared, so this assumes extentListSize >= 1 -- TODO confirm with the
 * callers.
 */
STATUS ZFSVOL_VOL_getExtentList(
	GeneralMsg_s *genMsg,
	RootBeast_s *beast,
	Blknum_t fileBlk,
	NINT extentListSize,
	Blknum_t (*extentList)[],
	NINT *retNumExtents,
	Blknum_t *retNextBlock)
{
	Buffer_s *buf;
	ZFSStorageInfo_s *stInfo;
	Fmap_s *fmap;
	Blknum_t poolBlk;
	NINT i, index;
	FmapNode_s *node;
	Blknum_t seed;
	Blknum_t retLen = 0;
	IoMsg_s iomsg;
	BOOL fileBlkInDirectMap = FALSE;

	ASSERT_MPKNSS_LOCK();
	ASSERT_LATCH( &beast->ROOTbeastLatch);

	stInfo = beast->storage.zfsInfo;
	fmap = &stInfo->fmap;

	*retNumExtents = 0;
	*retNextBlock = stInfo->nextBlk;

	if (fileBlk >= stInfo->nextBlk)
	{
		SetErrno(genMsg, zERR_END_OF_FILE);
		return zFAILURE;
	}
	i = 0;
	(*extentList)[i] = 0;
	if (fileBlk < fmap->dirExt[fmap->numRecs - 1].count)
	{
		/* Start block lies in the direct map: everything from fileBlk to
		 * the end of the direct map is reported as one allocated run, and
		 * the walk continues in the tree from the end of the direct map.
		 */
		searchDirectMap(fmap, fileBlk, &seed, &retLen, &index);
		(*extentList)[i] += (fmap->dirExt[fmap->numRecs -1].count - fileBlk);
		fileBlk = fmap->dirExt[fmap->numRecs -1].count;
		*retNumExtents = 1;
		fileBlkInDirectMap = TRUE;
	}
	if (fmap->root != 0)
	{
		READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_READ);
		SET_DEBUG_ID(iomsg, 32);
		buf = ZFS_ReadPoolBlk(genMsg, &iomsg);
		if (buf == NULL)
		{
			return zFAILURE;
		}
		node = (FmapNode_s *)buf->pBuf.data;
		if (node->head.magic != FMAP_BT_ROOT)
		{
			SetErrno(genMsg, zERR_MEDIA_CORRUPTED);
			ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg);
			CACHE_RELEASE(buf);
			buf = NULL;
			return zFAILURE;
		}
		/* Descend from the root to the leaf that covers fileBlk, holding
		 * only one buffer at a time.
		 */
		while (!(node->head.state & BT_LEAF))
		{
			poolBlk = searchBranch(node, fileBlk, &index);
			CACHE_RELEASE(buf);
			buf = NULL;
			READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_READ);
			SET_DEBUG_ID(iomsg, 33);
			buf = ZFS_ReadPoolBlk(genMsg, &iomsg);
			if (buf == NULL)
			{
				return zFAILURE;
			}
			node = (FmapNode_s *)buf->pBuf.data;
			if ((node->head.magic != FMAP_BT_LEAF) &&
				(node->head.magic != FMAP_BT_BRANCH))
			{
				SetErrno(genMsg, zERR_MEDIA_CORRUPTED);
				ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg);
				CACHE_RELEASE(buf);
				buf = NULL;
				return zFAILURE;
			}
		}
		poolBlk = searchLeaf(node, fileBlk, &retLen, &index);
		/** If the first extent is in the filemap and is a hole,
		 ** the direct map will not have added an entry and we need to
		 ** do this extra check here, before we go into the for loop.
		 **/
		if (!fileBlkInDirectMap)
		{
			/* Need to handle starting in the middle of an extent that
			 * has not been handled in the direct map code.
			 */
			if (poolBlk == 0)
			{
				/* Hole code */
				(*extentList)[i] = -(node->extent[index].count - fileBlk);
				if (++i == extentListSize)
				{
					*retNextBlock = node->extent[index].count;
					*retNumExtents = i;
					goto returnValues;
				}
				(*extentList)[i] = 0;
			}
			else
			{
				/* Non-hole code */
				(*extentList)[i] += (node->extent[index].count - fileBlk);
			}
			index++;
		}
		/* Walk the leaf chain.  Consecutive allocated records are summed
		 * into the current entry; each hole closes the current entry and
		 * takes an entry of its own.
		 */
		while (1)
		{
			for (;index < node->head.numRecs; index++)
			{
				if (node->extent[index].poolBlk != 0)
				{
					(*extentList)[i] += (node->extent[index].count -
										node->extent[index-1].count);
				}
				else
				{
					/* Close the allocated run; stop if the list is full. */
					if (++i == extentListSize)
					{
						*retNextBlock = node->extent[index-1].count;
						*retNumExtents = i;
						goto returnValues;
					}
					(*extentList)[i] = -(node->extent[index].count -
										node->extent[index-1].count);
					/* Close the hole; stop if the list is full. */
					if (++i == extentListSize)
					{
						*retNextBlock = node->extent[index].count;
						*retNumExtents = i;
						goto returnValues;
					}
					(*extentList)[i] = 0;
				}
			}
			LB_delay(0);
			poolBlk = node->head.leafLink;
			if ((poolBlk == INVALID_BLK_ZERO) ||
				(node->extent[node->head.numRecs -1].count >= stInfo->nextBlk))
			{
				/* No further leaf, or this leaf already reaches EOF:
				 * the entry being accumulated is counted as the last one.
				 */
				*retNextBlock = stInfo->nextBlk;
				*retNumExtents = ++i;
				goto returnValues;
			}
			CACHE_RELEASE(buf);
			buf = NULL;
			READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_READ);
			SET_DEBUG_ID(iomsg, 34);
			buf = ZFS_ReadPoolBlk(genMsg, &iomsg);
			if (buf == NULL)
			{
				return zFAILURE;
			}
			node = (FmapNode_s *)buf->pBuf.data;
			if (node->head.magic != FMAP_BT_LEAF)
			{
				SetErrno(genMsg, zERR_MEDIA_CORRUPTED);
				ZLSSPOOL_MediaIsCorrupt(genMsg, buf, &iomsg);
				CACHE_RELEASE(buf);
				buf = NULL;
				return zFAILURE;
			}
			/* NOTE(review): presumably record 0 of a linked leaf carries
			 * the ending count of the previous leaf, so the deltas start
			 * at record 1 -- confirm against the split code.
			 */
			index = 1;
		}
returnValues:
		CACHE_RELEASE(buf);
		buf = NULL;
	}
	return zOK;
}

	/*
	 * Convert a pool extent to a device number and physical extent.
	 *
	 * The result is appended to the zPhysicalExtent_s array found at
	 * msg->sys.data[MAP_DATA]; msg->body.map.retExtentListCount and
	 * msg->body.map.retEndingOffset are advanced as entries are written.
	 * One pool extent may produce several entries when
	 * ZFSMAL_PhysicalExtent reports a device run shorter than the pool
	 * extent.
	 *
	 * Returns zOK, or zFAILURE with the status set in msg.  The status is
	 * zERR_FINISHED_WITH_EXTENTS when the output array is full.
	 */
STATUS pool2physicalExtent (
	zNSSMsg_s *msg,
	RootBeast_s *beast,
	Extent_s *extent)
{
	zPhysicalExtent_s *physical = (zPhysicalExtent_s *)(uintptr_t)msg->sys.data[MAP_DATA].start;
	NINT maxExtents;
	NINT numExtents;
	STATUS status;
	ADDR deviceID;
	QUAD fileByteOffset;
	QUAD poolByteOffset;
	QUAD deviceByteOffset;
	QUAD poolByteLength;
	QUAD deviceByteLength;

	maxExtents = msg->sys.data[MAP_DATA].length / sizeof(zPhysicalExtent_s);
	numExtents = msg->body.map.retExtentListCount;
	if (numExtents >= maxExtents)
	{
		SetStatus(msg, zERR_FINISHED_WITH_EXTENTS);
		return zFAILURE;
	}
	fileByteOffset = msg->body.map.retEndingOffset;
	poolByteOffset = ((QUAD)(LONG)extent->poolBlkNum) << beast->blkSizeShift;
	poolByteLength = ((QUAD)(LONG)extent->lengthOfExtent) << beast->blkSizeShift;
	if (poolByteOffset == 0)
	{	/*
		 * Hole in file, we just skip past it and don't change the number
		 * of extents we are returning.
		 */
		msg->body.map.retEndingOffset = poolByteLength + fileByteOffset;
		return zOK;
	}
	physical = &physical[numExtents];
	for (;;)
	{
		status = ZFSMAL_PhysicalExtent(beast->vol.zfsVol, poolByteOffset,
							poolByteLength, &deviceID,
							&deviceByteOffset, &deviceByteLength);
		if (status != zOK)
		{
			SetStatus(msg, status);
			return zFAILURE;
		}
		physical->logicalOffset = fileByteOffset;
		physical->poolOffset = poolByteOffset;
		physical->physical.offset = deviceByteOffset;
		physical->physical.deviceID = deviceID;
		++numExtents;
		if (poolByteLength <= deviceByteLength) // Should we limit length by size of file?
		{
			/* The device run covers the rest of the pool extent: done. */
			physical->length = poolByteLength;
			msg->body.map.retEndingOffset = fileByteOffset + poolByteLength;
			msg->body.map.retExtentListCount = numExtents;
			return zOK;
		}
		/* Only part of the pool extent was covered; record that piece and
		 * go around again for the remainder.
		 */
		physical->length = deviceByteLength;
		fileByteOffset += deviceByteLength;
		poolByteOffset += deviceByteLength;
		poolByteLength -= deviceByteLength;
		++physical;
		if (numExtents >= maxExtents)
		{
			msg->body.map.retEndingOffset = fileByteOffset;
			msg->body.map.retExtentListCount = numExtents;
			SetStatus(msg, zERR_FINISHED_WITH_EXTENTS);
			return zFAILURE;
		}
	}
}

	/*
	 * ZFSVOL_doDirectPhysicalExtents - handles the direct physical extents
	 *
	 * Passes every direct-map extent that ends after the starting file
	 * block to pool2physicalExtent.  The first one is trimmed when the
	 * starting block falls inside it.  Returns zFAILURE as soon as
	 * pool2physicalExtent does, otherwise zOK.
	 */
STATUS ZFSVOL_doDirectPhysicalExtents (
	zNSSMsg_s *msg,
	RootBeast_s *beast)
{
	ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo;
	Fmap_s *fmap = &stInfo->fmap;
	Blknum_t fileBlk;
	NINT i;
	STATUS status;
	Extent_s extent;
	Blknum_t diff;

	ASSERT_MPKNSS_LOCK();
	ASSERT_SLATCH( &beast->ROOTbeastLatch);

	/*
	 * retEndingOffset was initialized to the passed in offset.
	 */
	fileBlk = msg->body.map.retEndingOffset >> beast->blkSizeShift;
	/* Record 0 only supplies the starting count for record 1. */
	for (i = 1; i < fmap->numRecs; ++i)
	{	/*
		 * This if lets us skip forward to the correct offset
		 * in file map.
		 */
		if (fileBlk < fmap->dirExt[i].count)
		{
			extent.poolBlkNum = fmap->dirExt[i].poolBlk;
			extent.lengthOfExtent = fmap->dirExt[i].count -
									fmap->dirExt[i-1].count;
			if (fileBlk > fmap->dirExt[i-1].count)
			{
				/* Starting inside this extent: drop the leading part. */
				diff = fileBlk - fmap->dirExt[i-1].count;
				extent.poolBlkNum += diff;
				extent.lengthOfExtent -= diff;
			}
			status = pool2physicalExtent(msg, beast, &extent);
			if (status != zOK)
			{
				return zFAILURE;
			}
		}
	}
	return zOK;
}

	/*
	 * Given a file block number in the beast, descend the
	 * file map tree and find the extent mapping information.
+ */ +Buffer_s *descendFileMapTree (RootBeast_s *beast, Blknum_t fileBlk) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + Fmap_s *fmap = &stInfo->fmap; + GeneralMsg_s genMsg; + IoMsg_s iomsg; + Buffer_s *buf; + FmapNode_s *node; + Blknum_t poolBlk; + NINT index; + + COMN_SETUP_GENERAL_MSG_NO_CONNECTION_RESOLVE( &genMsg); + for (poolBlk = fmap->root; poolBlk != 0;) + { + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_READ); + SET_DEBUG_ID(iomsg, 36); + buf = ZFS_ReadPoolBlk( &genMsg, &iomsg); + if (buf == NULL) + { + return NULL; + } + node = (FmapNode_s *)buf->pBuf.data; + if (node->head.state & BT_LEAF) + { + return buf; + } + poolBlk = searchBranch(node, fileBlk, &index); + CACHE_RELEASE(buf); + } + return NULL; +} + +STATUS scanFileMapLeaf ( + zNSSMsg_s *msg, + RootBeast_s *beast, + FmapNode_s *node) +{ + Blknum_t fileBlk; + NINT index; + NINT numRecs = node->head.numRecs; + NINT i; + STATUS status; + Extent_s extent; + + ASSERT_MPKNSS_LOCK(); + + fileBlk = msg->body.map.retEndingOffset >> beast->blkSizeShift; + + /* + * Find the first extent that contains the file block. + */ + extent.poolBlkNum = searchLeaf(node, fileBlk, &extent.lengthOfExtent, + &index); + for (i = index; i < numRecs; ++i) + { + if (i != index) + { + extent.poolBlkNum = node->extent[i].poolBlk; + extent.lengthOfExtent = node->extent[i].count - + node->extent[i-1].count; + } + status = pool2physicalExtent(msg, beast, &extent); + if (status != zOK) + { + return zFAILURE; + } + } + return zOK; +} + + /* + * ZFSVOL_doTreePhysicalExtents - handles physical extents in the + * file map tree. 
+ */ +STATUS ZFSVOL_doTreePhysicalExtents ( + zNSSMsg_s *msg, + RootBeast_s *beast) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + GeneralMsg_s genMsg; + IoMsg_s iomsg; + FmapNode_s *node; + Buffer_s *buf; + Blknum_t fileBlk; + Blknum_t poolBlk; + STATUS status; + + COMN_SETUP_GENERAL_MSG_NO_CONNECTION_RESOLVE( &genMsg); + + fileBlk = msg->body.map.retEndingOffset >> beast->blkSizeShift; + + buf = descendFileMapTree(beast, fileBlk); + if (buf == NULL) + { + return zOK; + } + for (;;) + { + node = (FmapNode_s *)buf->pBuf.data; + status = scanFileMapLeaf(msg, beast, node); + if (status != zOK) + { + CACHE_RELEASE(buf); + return zFAILURE; + } + PERIODIC_YIELD(); + poolBlk = node->head.leafLink; + if ((poolBlk == INVALID_BLK_ZERO) + || (node->extent[node->head.numRecs-1].count >= stInfo->nextBlk)) + { + CACHE_RELEASE(buf); + return zOK; + } + CACHE_RELEASE(buf); + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_READ); + SET_DEBUG_ID(iomsg, 40); + buf = ZFS_ReadPoolBlk( &genMsg, &iomsg); + if (buf == NULL) + { + SetStatusFromErrno(msg, &genMsg); + return zFAILURE; + } + } +} + +/*************************************************************************** + * ZFSVOL_VOL_getPhysicalExtent - returns a list of physical extents for + * a file. + ***************************************************************************/ +STATUS ZFSVOL_VOL_getPhysicalExtent ( + zNSSMsg_s *msg, + RootBeast_s *beast) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + Blknum_t fileBlk; + Blknum_t nextBlk; + STATUS status; + + ASSERT_MPKNSS_LOCK(); + ASSERT_SLATCH( &beast->ROOTbeastLatch); + + /* + * Initialize the return values. 
They are updated in pool2physicalExtent + */ + msg->body.map.retEndingOffset = msg->body.map.offset; + msg->body.map.retExtentListCount = 0; + + nextBlk = stInfo->nextBlk; + fileBlk = msg->body.map.offset >> beast->blkSizeShift; + + if (fileBlk >= nextBlk) + { + SetStatus(msg, zERR_END_OF_FILE); + return zFAILURE; + } + status = ZFSVOL_doDirectPhysicalExtents(msg, beast); + if (status != zOK) + { + goto finish; + } + status = ZFSVOL_doTreePhysicalExtents(msg, beast); + if (status != zOK) + { + goto finish; + } + return zOK; + +finish: + if (GetStatus(msg) == zERR_FINISHED_WITH_EXTENTS) + { + ClearStatus(msg); + return zOK; + } + return zFAILURE; +} + + +/***************************************************************************** + * Given a fileBlk, ZFS_fileBlk2volBlk converts it to the poolBlk + * address and returns the value. If the fileBlk corresponds to a + * hole in the file, 0 is returned. Input parameter length specifies + * the length of logical file blocks required. As output it is set + * to a value (less than or equal to the input value), specifying the + * number of contiguous poolblocks that correspond to the fileblock + *****************************************************************************/ + +Blknum_t ZFS_fileBlk2volBlk ( + RootBeast_s *beast, + Blknum_t fileBlk, + NINT *length) +{ + GeneralMsg_s genMsg; + Buffer_s *buf; + ZFSStorageInfo_s *stInfo; + Fmap_s *fmap; + Blknum_t poolBlk; + Blknum_t retLen = 0; + Blknum_t seed = 0; + NINT index = 0; + FmapNode_s *node; + IoMsg_s iomsg; + + ASSERT_MPKNSS_LOCK(); + ASSERT_LATCH( &beast->ROOTbeastLatch); + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + + if (fileBlk >= stInfo->nextBlk) + { + poolBlk = 0; + } + else if (fileBlk < fmap->dirExt[fmap->numRecs - 1].count) + { + poolBlk = searchDirectMap(fmap, fileBlk, &seed, &retLen, &index); + } + else if (fmap->root) + { + READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_READ); + SET_DEBUG_ID(iomsg, 
0); + buf = ZFS_ReadPoolBlk(&genMsg, &iomsg); + if (buf == NULL) + { + poolBlk = 0; + goto returnStatus; + } + node = (FmapNode_s *)buf->pBuf.data; + if (node->head.magic != FMAP_BT_ROOT) + { + SetErrno(&genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(&genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + poolBlk = 0; + goto returnStatus; + } + + while (!(node->head.state & BT_LEAF)) + { + poolBlk = searchBranch(node, fileBlk, &index); + CACHE_RELEASE(buf); + buf = NULL; + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_READ); + SET_DEBUG_ID(iomsg, 0); + buf = ZFS_ReadPoolBlk(&genMsg, &iomsg); + if (buf == NULL) + { + poolBlk = 0; + goto returnStatus; + } + node = (FmapNode_s *)buf->pBuf.data; + if ((node->head.magic != FMAP_BT_LEAF) && + (node->head.magic != FMAP_BT_BRANCH)) + { + SetErrno(&genMsg, zERR_MEDIA_CORRUPTED); + ZLSSPOOL_MediaIsCorrupt(&genMsg, buf, &iomsg); + CACHE_RELEASE(buf); + buf = NULL; + poolBlk = 0; + goto returnStatus; + } + } + poolBlk = searchLeaf(node, fileBlk, &retLen, &index); + CACHE_RELEASE(buf); + buf = NULL; + } + else + { + poolBlk = 0; + } +returnStatus: + if (retLen < *length) + *length = retLen; + return poolBlk; +} + + +/************************************************************************** + * This returns TRUE if "blockNum" is physically allocated in the file, + * it returns FALSE if the block is in a SPARSE hole or beyond the end + * of the file. + ***************************************************************************/ +BOOL ZFSVOL_VOL_isBlockInBeast( + RootBeast_s *beast, + Blknum_t fileBlk) +{ + NINT len = 1; + ASSERT_MPKNSS_LOCK(); + + if ((fileBlk << beast->ROOTmycache.bufSizeShift) < beast->ROOTeof) + { + return (ZFS_fileBlk2volBlk(beast, fileBlk, &len) != 0); + } + else + { + return FALSE; + } +} + +/*************************************************************************** + * Read Ahead Pool Blocks. 
+ * Setup IoMsg using READBLK_IO_MSG befor calling this routine + * (This macro will set the fileBlk to be the negative of the specified + * volBlk, as well as set the beast) + * Make sure that the beast is not X_LATCHED when ZFS_ReadAheadPoolBlk + * is called. It can be share latched. + ***************************************************************************/ + +void asyncReadAheadPoolBlkDone(Fsm_s *fsm) +{ + Asyncio_s *asyncio = STRUCT(fsm,Asyncio_s,fsm); + RootBeast_s *beast = STRUCT(asyncio->mycache, RootBeast_s, ROOTmycache); + + ASSERT_MPKNSS_LOCK(); + if (asyncio->buffer) + CACHE_RELEASE(asyncio->buffer); + + COMN_UnlatchAndRelease(&beast, SLATCHED); + + freeAsyncio(asyncio); +} + + +void asyncContinueReadAheadPoolBlk(FsmLite_s *fsmLite) +{ + Asyncio_s *aio = STRUCT(fsmLite,Asyncio_s,fsm.lite); + ASSERT_MPKNSS_LOCK(); + + asyncReadVolBlk(aio, asyncReadAheadPoolBlkDone); + +} + +/** Initialize IoMsg_s using READBLK_IO_MSG before calling this routine **/ +void ZFS_ReadAheadPoolBlk( + IoMsg_s *ioMsg) +{ + Asyncio_s *asyncio; + + ASSERT_MPKNSS_LOCK(); + COMN_USE_BEAST(ioMsg->beast); + + asyncio = getAsyncio(); + INIT_AIO(asyncio, &ioMsg->beast->ROOTmycache, ioMsg->fileBlk, ioMsg->mode); + asyncio->volBlk = ioMsg->volBlk; + + FSM_S_LATCH(&ioMsg->beast->ROOTbeastLatch, &asyncio->fsm.lite, + asyncContinueReadAheadPoolBlk); +} + +/*- (FUNCTION) ----- ZFS_appendFileMap() --------------------------------------- + | + | do not allow this api to be exposed to the public. this api is used + | internal to zlss only and does not have logging, transactions, latching, + | etc. + | IPU uses this to fill in the file map before the NSS volume is real + | and before any external access. 
+ | + +-------------------------------------------------------------------------*/ + +STATUS ipuUpdateSparseLeaf( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Extent_s *extent, + Blknum_t fileBlk, + Blknum_t holeBlkCnt, + Buffer_s *buf, + Buffer_s *bufParent) +{ + ZFSStorageInfo_s *stInfo; + Fmap_s *fmap; + FmapNode_s *node; + Buffer_s *bufSibling = NULL; + IoMsg_s iomsg; + NINT index = 0, pIndex = 0; + + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + node = (FmapNode_s *)buf->pBuf.data; + + if (node->head.numRecs < (FMAP_MAX - 6)) + { + node->extent[node->head.numRecs].count = fileBlk; + node->extent[node->head.numRecs].poolBlk = 0; + node->head.numRecs++; + + node->extent[node->head.numRecs].count = + fileBlk + extent->lengthOfExtent; + node->extent[node->head.numRecs].poolBlk = extent->poolBlkNum; + node->head.numRecs++; + + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + + CACHE_DIRTY_RELEASE(buf); + } + else + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(buf); + return zFAILURE; + } + if (node->head.state & BT_ROOT) + { + zASSERT(bufParent == NULL); + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + Extent_s localExt; + + CACHE_RELEASE(buf); + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, NULL); + + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(NULL, buf, bufParent, &pIndex, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + + splitBtreeLeafSparse(NULL, buf, bufSibling, bufParent, index, + pIndex, extent, (NINT)holeBlkCnt, fileBlk); + + CACHE_DIRTY_RELEASE(buf); + buf = NULL; + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + CACHE_DIRTY_RELEASE(bufSibling); + bufSibling = NULL; + } 
+ return zOK; +} + + +STATUS ipuUpdateSparse( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Extent_s *extent, + Blknum_t fileBlk, + Blknum_t holeBlkCnt) +{ + ZFSStorageInfo_s *stInfo; + Fmap_s *fmap; + Buffer_s *buf = NULL; + Buffer_s *bufParent = NULL; + Buffer_s *bufSibling = NULL; + FmapNode_s *node; + IoMsg_s iomsg; + Blknum_t poolBlk; + NINT tmp; + + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + + if (fmap->root == INVALID_BLK_ZERO) + { + /* Making file sparse the first time, filemap is still direct, + * continuing entries will be in the btree */ + + XALLOCBLK_IO_MSG(iomsg, beast, NULL, CACHE_UPDATE); + if ((buf = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + stInfo->fmapTreeBlks++; + + node = (FmapNode_s *)buf->pBuf.data; + fmap->root = buf->volBlk; + + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = beast->ROOTinternalID; + node->head.fnh_zid = beast->zid; + node->head.state = BT_ROOT | BT_LEAF; + node->head.leafLink = INVALID_BLK_ZERO; + node->head.lsn = 0; + node->extent[0].count = fmap->dirExt[fmap->numRecs -1].count; + node->extent[0].poolBlk = fmap->dirExt[fmap->numRecs -1].poolBlk; + node->head.numRecs = 1; + } + else + { + READBLK_IO_MSG(iomsg, beast, fmap->root, CACHE_UPDATE); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + zASSERT(node->head.magic == FMAP_BT_ROOT); + } + while (!(node->head.state & BT_LEAF)) + { + poolBlk = node->extent[node->head.numRecs-1].poolBlk; + if (node->head.numRecs < (FMAP_MAX - 6)) + { + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + bufParent = buf; + } + else + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(buf); + return zFAILURE; + } + if (node->head.state & BT_ROOT) + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == 
NULL) + { + Extent_s localExt; + + CACHE_RELEASE(buf); + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, NULL); + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(NULL, buf, bufParent, &tmp, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + + syncSplitBtree(NULL, buf, bufSibling, bufParent, extent); + + CACHE_DIRTY_RELEASE(buf); + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + + bufParent = bufSibling; + bufSibling = NULL; + } + buf = NULL; + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + if ((buf = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + return zFAILURE; + } + node = (FmapNode_s *)buf->pBuf.data; + zASSERT((node->head.magic == FMAP_BT_LEAF) || + (node->head.magic == FMAP_BT_BRANCH)); + } + + if (ipuUpdateSparseLeaf(genMsg, beast, extent, fileBlk, + holeBlkCnt, buf, bufParent) != zOK) + { + return zFAILURE; + } + return zOK; +} + + +STATUS ipuUpdateBtreeFileMap ( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Extent_s *extent) +{ + ZFSStorageInfo_s *stInfo; + Buffer_s *bufChild = NULL; + Buffer_s *bufParent = NULL; + Buffer_s *bufSibling = NULL; + FmapNode_s *child; + Fmap_s *fmap; + Blknum_t poolBlk; + IoMsg_s iomsg; + NINT tmp; + + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + poolBlk = fmap->root; + +ContinueScanningTheBtree: + READBLK_IO_MSG(iomsg, beast, poolBlk, CACHE_UPDATE); + if ((bufChild = ZFS_ReadPoolBlk(genMsg, &iomsg)) == NULL) + { + if (bufParent) + { + CACHE_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_RELEASE(bufSibling); + } + return zFAILURE; + } + child = (FmapNode_s *)(bufChild->pBuf.data); + zASSERT( (child->head.magic == FMAP_BT_LEAF) || + (child->head.magic == FMAP_BT_ROOT) || + (child->head.magic == FMAP_BT_BRANCH) ); + + if (child->head.state & BT_LEAF) + { + if 
((child->extent[child->head.numRecs -1].poolBlk + + child->extent[child->head.numRecs -1].count - + child->extent[child->head.numRecs -2].count) == extent->poolBlkNum) + { + child->extent[child->head.numRecs -1].count+=extent->lengthOfExtent; + } + else if (child->head.numRecs < (FMAP_MAX - 6)) + { + child->extent[child->head.numRecs].poolBlk = extent->poolBlkNum; + child->extent[child->head.numRecs].count = + child->extent[child->head.numRecs -1].count+extent->lengthOfExtent; + child->head.numRecs++; + } + else + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(bufChild); + return zFAILURE; + } + if (child->head.state & BT_ROOT) + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + Extent_s localExt; + CACHE_RELEASE(bufChild); + + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, NULL); + + cacheReleaseToss(bufSibling); + + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(NULL, bufChild, bufParent, &tmp, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + stInfo->fmapTreeBlks++; + syncSplitBtree(NULL, bufChild, bufSibling, bufParent, extent); + } + } + else + { + poolBlk = child->extent[child->head.numRecs-1].poolBlk; + if (child->head.numRecs < (FMAP_MAX - 6)) + { + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + } + bufParent = bufChild; + bufChild = NULL; + } + else + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufSibling = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + CACHE_RELEASE(bufChild); + return zFAILURE; + } + if (child->head.state & BT_ROOT) + { + XALLOC_SEED_IO_MSG(iomsg, beast, NULL, 0, CACHE_UPDATE); + if ((bufParent = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + Extent_s localExt; + + 
CACHE_RELEASE(bufChild); + + localExt.poolBlkNum = bufSibling->volBlk; + localExt.lengthOfExtent = 1; + zfsFreeExtent(genMsg, beast->vol.zfsVol, + &localExt, NULL); + cacheReleaseToss(bufSibling); + return zFAILURE; + } + stInfo->fmapTreeBlks++; + syncGrowBtree(NULL, bufChild, bufParent, &tmp, beast); + fmap->root = bufParent->volBlk; + } + zASSERT(bufParent != NULL); + + stInfo->fmapTreeBlks++; + syncSplitBtree(NULL, bufChild, bufSibling, bufParent, extent); + + CACHE_DIRTY_RELEASE(bufChild); + bufChild = NULL; + CACHE_DIRTY_RELEASE(bufParent); + bufParent = NULL; + + bufParent = bufSibling; + bufSibling = NULL; + } + goto ContinueScanningTheBtree; + } + CACHE_DIRTY_RELEASE(bufChild); + if (bufParent) + { + CACHE_DIRTY_RELEASE(bufParent); + } + if (bufSibling) + { + CACHE_DIRTY_RELEASE(bufSibling); + } + return zOK; +} + + +STATUS ipuUpdateDirectFileMap ( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Extent_s *extent) +{ + ZFSStorageInfo_s *stInfo; + Buffer_s *buffer = NULL; + FmapNode_s *node; + Fmap_s *fmap; + Blknum_t poolBlk; + Blknum_t length; + IoMsg_s iomsg; + + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + poolBlk = extent->poolBlkNum; + length = extent->lengthOfExtent; + + if (fmap->numRecs == 1) + { + fmap->dirExt[fmap->numRecs].count = length; + fmap->dirExt[fmap->numRecs].poolBlk = poolBlk; + fmap->numRecs++; + } + else if ((fmap->dirExt[fmap->numRecs -1].poolBlk - + fmap->dirExt[fmap->numRecs -2].count + + fmap->dirExt[fmap->numRecs -1].count) == poolBlk) + { + fmap->dirExt[fmap->numRecs - 1].count += length; + } + else if (fmap->numRecs < MAX_DIRECT) + { + fmap->dirExt[fmap->numRecs].poolBlk = poolBlk; + fmap->dirExt[fmap->numRecs].count = + fmap->dirExt[fmap->numRecs -1].count + length; + fmap->numRecs++; + } + else + { + XALLOCBLK_IO_MSG(iomsg, beast, NULL, CACHE_UPDATE); + if ((buffer = ZFS_AllocPoolBlk(genMsg, &iomsg)) == NULL) + { + return zFAILURE; + } + + stInfo->fmapTreeBlks++; + fmap->root = buffer->volBlk; + + node = 
(FmapNode_s *)(buffer->pBuf.data); + + node->head.magic = FMAP_BT_ROOT; + node->head.fnh_internalID = beast->ROOTinternalID; + node->head.fnh_zid = beast->zid; + node->head.state = BT_ROOT | BT_LEAF; + node->head.numRecs = 0; + node->head.leafLink = 0; + node->head.lsn = 0; + + node->extent[node->head.numRecs].count = + fmap->dirExt[fmap->numRecs - 1].count; + node->extent[node->head.numRecs].poolBlk = + fmap->dirExt[fmap->numRecs -1].poolBlk; + node->head.numRecs++; + + node->extent[node->head.numRecs].count = + fmap->dirExt[fmap->numRecs - 1].count + length; + node->extent[node->head.numRecs].poolBlk = poolBlk; + node->head.numRecs++; + + CACHE_DIRTY_RELEASE(buffer); + } + return zOK; +} + + +/***************************************************************************** + ** This routine is for IPU to call to add specific extents to the end of + ** the file map for the specified beast. + ** + ** This routine does NOT: 1) check for latching + ** 2) do transactions + ** 3) do logging + ** + ** The beast is marked dirty when we are done. It does not change the + ** logical eof. + ** + ** The only error cases that exist in this routine is if there is an error + ** allocating and getting a buffer for a meta block to extend the filemap, + ** or if there is an error reading an existing filemap metadata block. + ** + ** This routine will also take care of extending the file in a sparse manner. + ** For sparse files holeBlkCnt represents the number of 4K blocks that should + ** be sparse (not allocated) before adding the extent to the end of the file + ** map. + ** + ** For non sparse files: holeBlkCnt should be set to 0. 
+ ** + ****************************************************************************/ + +STATUS ZFS_appendFileMap( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + Blknum_t holeBlkCnt, + Extent_s *extent) +{ + ZFSStorageInfo_s *stInfo; + Fmap_s *fmap; + Blknum_t fileBlk; + + stInfo = beast->storage.zfsInfo; + fmap = &stInfo->fmap; + fileBlk = stInfo->nextBlk + holeBlkCnt; + + if ((fileBlk == 0) && (fmap->numRecs == 0)) + { + /** Very first time adding a blk to the file **/ + fmap->numRecs = 1; + } + + if (holeBlkCnt) + { + if (ipuUpdateSparse(genMsg, beast, extent, fileBlk, holeBlkCnt) != zOK) + { + return zFAILURE; + } + } + else if (fmap->root == INVALID_BLK_ZERO) + { + if (ipuUpdateDirectFileMap(genMsg, beast, extent) != zOK) + { + return zFAILURE; + } + } + else + { + if (ipuUpdateBtreeFileMap(genMsg, beast, extent) != zOK) + { + return(zFAILURE); + } + } + stInfo->fmapDataBlks += extent->lengthOfExtent; + stInfo->nextBlk = fileBlk + extent->lengthOfExtent; + + BST_MarkDirty(beast); + return(zOK); +} diff --git a/src/nwnss/zlss/zfsPool.c b/src/nwnss/zlss/zfsPool.c new file mode 100644 index 0000000..2e15fd5 --- /dev/null +++ b/src/nwnss/zlss/zfsPool.c @@ -0,0 +1,7217 @@ +/**************************************************************************** + | + | (C) Copyright 1995 - 2001 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. + | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. 
+ | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |*************************************************************************** + | + | NetWare Advance File Services (NSS) module + | + |--------------------------------------------------------------------------- + | + | $Author: gpachner $ + | $Date: 2008-03-15 03:50:57 +0530 (Sat, 15 Mar 2008) $ + | + | $RCSfile$ + | $Revision: 2301 $ + | + |------------------------------------------------------------------------- + | This module is used to: + | ZFS_Classes: Register all Beast classes used by ZFS. + +-------------------------------------------------------------------------*/ +#include +#include /* NetWare Includes*/ + +#include +#include +#include +#include + +#include +#include "nwreg.h" +#include "nssPubs.h" + +#include "zfs.h" +#include "zfsAsyncio.h" +#include "comnPublics.h" +#include "comnBeastClass.h" +#include "comnAuthorize.h" +#include "zfsXTree.h" +#include "zfsSuperBlk.h" +#include "zlog.h" +#include "hmc.h" +#include "purgeLog.h" +#include "xAdminVolume.h" +#include "beastTree.h" +#include "zlssStartup.h" +#include "purgeTree_if.h" +#include "parse.h" +#include "xaction.h" +#include "cmNSS.h" +#include "zlssLogicalVolume.h" +#include "userTree.h" +#include "zlssUpgrade.h" +#include "searchMap.h" +#include "zfsdefs.h" +#include "zEvent.h" +#include "eventSys.h" +#include "fileHandle.h" +#include "virtualIO.h" +#include "name.h" +#include "xCtype.h" +#include "xmlTags.h" +#include "mgmt.h" +#include "manage.h" +#include "objectIDStore.h" +#include "pssConfig.h" +#include "hardLinkBeast.h" +#include "dir.h" /* For ZFSNAMETREE_smap_cursor_cleanup used in DIR_SCTRL_CLEANUP */ + + /* Generally, we only track ACTIVE state I/O because in MAINTENANCE + * state we use the pool beast for most I/O. This will make us + * think we did a lot of PDB I/Os. 
In addition, we do not track + * in DEACTIVE because the tracking will not be persistent because + * we do not take checkpoints. By having multiple flags we can check + * if we are doing I/O when we have told the clustering software + * we will not. + */ +BOOL gCLPoolTrackActive = TRUE; +BOOL gCLPoolTrackMaintenance = FALSE; +BOOL gCLPoolTrackDeactive = FALSE; + + + /** The link list of all LVs that are being deleted or purged. Must + ** be empty when all pools are deactive. I.E. if the pool an + ** LV is on is not ACTIVE then we do NOT have any information + ** about the LV in this list. + **/ + +ZLSSLVDeleteList_s ZLSSLVDeleteList; +Time_t ZLSSStartUpTime; /* UpTime when ZLSS started. */ + + + /*** Set to TRUE when detailed ZLSS I/O information is needed */ +PoolWriteStatistics_s gZLSSPWS; /* Holds Global ZLSS Write Statisitics */ +PoolReadStatistics_s gZLSSPRS; /* Holds Global ZLSS Read Statisitics */ +LONG gZLSSIOStartUpTime; /* UpTime when global stats set to + * zero. Currently this is when ZLSS + * starts up. In the future, we may + * allow the counters to be reset so + * this time would reflect that time. 
*/


#if ZLSS_IO_GATHER IS_ENABLED

/* State of the pool I/O log, which is used as a circular buffer. */
NINT	gZLSSPoolIOLogSize	= 1024*1024*1;	/* Size passed to LB_malloc for the log */
NINT	gZLSSPoolIOLogNext	= 0;	/* Next free spot */
NINT	gZLSSPoolIOLogStart	= 0;	/* Start of the circular buffer */
NINT	gZLSSPoolIOFull		= FALSE;
NINT	gZLSSPoolIOLogDo	= TRUE;

ZLSSPoolIOHeader_s	*gZLSSPoolIOHeader = NULL;	/* Allocated in ZFSPOOL_Startup */
BYTE	*gZLSSPoolIOLog = NULL;			/* Allocated in ZFSPOOL_Startup */

#endif


#if NSS_DEBUG IS_ENABLED
extern unicode_t gZCL_OnlyPoolName[];		/* Name of POOL to load */
#endif

#if 0
FsmLite_s	gZLSSUserBlocksWorkToDoFsm;
#endif

extern NINT EnterExitEventID;

/**************************************************************************
 * One-time start-up of the ZLSS pool layer.
 *
 * Initializes the global LV delete list (set, beast latch, count, lite
 * FSM and one-shot timer), records the start-up times, optionally
 * allocates the pool I/O log buffers and then calls ZLSS_VFCCreate().
 *
 * genMsg	- message block handed to ZLSS_VFCCreate(); its errno is
 *		  cleared if that call fails.
 *
 * Always returns zOK; a ZLSS_VFCCreate() failure is deliberately
 * swallowed.
 ***************************************************************************/
STATUS ZFSPOOL_Startup(
	GeneralMsg_s	*genMsg)
{
	STATUS status;

	ENTER(TZPOOL, ZFSPOOL_Startup);
	ASSERT_MPKNSS_LOCK();

//	ZIO_RA_Startup();

	SET_INIT( &ZLSSLVDeleteList.ZDL_list );

//	INIT_LATCH( &ZLSSLVDeleteList.ZDL_listLatch );
	INIT_LATCH( &ZLSSLVDeleteList.ZDL_beastLatch );
	ZLSSLVDeleteList.ZDL_cnt = 0;
	FSMLITE_INIT(&ZLSSLVDeleteList.ZDL_fsmLite, "FSM for LV List", 0);
	INIT_ONESHOT( ZLSSLVDeleteList.ZDL_timer );
	/* presumably arms the timer so ZLSSVOL_LVD_DeleteScanTimer runs after
	 * ZLSSVOL_LVD_SCAN_TICKS_FIRST ticks -- confirm against setOneShot() */
	setOneShot(&ZLSSLVDeleteList.ZDL_timer,
			ZLSSVOL_LVD_SCAN_TICKS_FIRST, ZLSSVOL_LVD_DeleteScanTimer);
	ZLSSStartUpTime = GetUpTime();

	gZLSSIOStartUpTime = GetUpTime();
#if ZLSS_IO_GATHER IS_ENABLED
	gZLSSPoolIOHeader = LB_malloc( sizeof(*gZLSSPoolIOHeader) );
	gZLSSPoolIOLog = LB_malloc( gZLSSPoolIOLogSize );
	if ( gZLSSPoolIOLog == NULL || gZLSSPoolIOHeader == NULL )
	{
		/* All or nothing: if either allocation failed, drop both */
		free(gZLSSPoolIOHeader);
		gZLSSPoolIOHeader = NULL;
		free(gZLSSPoolIOLog);
		gZLSSPoolIOLog = NULL;
	} else {
		LB_bzero( gZLSSPoolIOHeader, sizeof(*gZLSSPoolIOHeader) );
		LB_bzero( gZLSSPoolIOLog, gZLSSPoolIOLogSize );
	}
#endif
#if 0
	FSMLITE_INIT( &gZLSSUserBlocksWorkToDoFsm /* Lite FSM */,
			MSGNot("User Blocks Work-To-Do"), 0 /* Instance */ );
#endif
	status = ZLSS_VFCCreate( genMsg );
	if ( status != zOK )
	{
		/* Failure here does not fail start-up; just discard the error */
		ClearErrno( genMsg );
	}

	RTN_STATUS(zOK);
}


/**************************************************************************
 * SHUTDOWN a specific pool and all of its LVs.  The user can not
 * intervene and stop this shutdown.
 *
 * Caller must remove from Master pool lists before calling us.
 *
 * Unlike volume shutdown the caller does not have a use count.
 * This is bad, but we can consider that they are using the
 * 'load' use count.
 *
 * Sequence:
 *	1. Return early if the pool is already flagged VOL_SF_UNLOADING,
 *	   otherwise set that flag.
 *	2. Dequeue and unload every LV on the pool's volume list.
 *	3. Change the pool state to zVOLSTATE_UNKNOWN.
 *	4. Remove the pool's management file, then remove the volume and
 *	   the pool from all lists.
 *	5. Wait for pending WORK, clear the storage pool's back pointer
 *	   and release the two 'load' use counts.
 *
 * Errors from the unload and state-change calls are asserted on and then
 * cleared; nothing is reported to the caller.
 ***************************************************************************/
void ZFSPOOL_ShutdownPool( ZfsPool_s *poolVolume )

{
	STATUS status;
	GeneralMsg_s dummyGenMsg;
	Volume_s *volume;
	ZlssPool_s *zlssPool;
	typedef struct Stack_s {
		unicode_t pName[zMAX_COMPONENT_NAME];
	} Stack_s;
	STACK_ALLOC();

	COMN_SETUP_GENERAL_MSG_NOSA(&dummyGenMsg);
	zlssPool = (ZlssPool_s *)poolVolume->ZFSPOOLvol.v_pool;
	zASSERT( zlssPool != NULL );

	if ( zlssPool->ZLSSPOOLpool.v_statusFlag & VOL_SF_UNLOADING )
	{
#ifdef USER_GPACHNER
		zASSERT("This should not happen too often!!!"==NULL);
#endif
#if NSS_DEBUG IS_ENABLED
		DBG_DebugPrintf(CYAN, MSGNot("Pool already being unloaded\n") );
#endif
		STACK_FREE();
		return;
	}
		/* Set the unload bit so we can catch multiple unloads AND
		 * so the common layer will prevent changing volume
		 * states to higher states.
		 */
	zlssPool->ZLSSPOOLpool.v_statusFlag |= VOL_SF_UNLOADING;

	/* Capture the pool name now; it is used for the message below and
	 * for ZLSS_removePoolMgmtFile() after the pool has been deactivated. */
	COMN_GetPoolName(&dummyGenMsg, &zlssPool->ZLSSPOOLpool, aStack->pName,NELEMS(aStack->pName));
	if ((poolVolume->ZFSPOOLvol.state != zVOLSTATE_DEACTIVE) &&
		(poolVolume->ZFSPOOLvol.state != zVOLSTATE_UNKNOWN))
	{
#ifdef USER_GPACHNER
		aprintf(YELLOW, WHERE );
#endif
		aprintf(YELLOW,MSG("Deactivating pool \"%U\"...\n",884),aStack->pName);
	}
	/*
	 * Go through the linked list of LVs that this pool
	 * owns and shut them down first.
	 */
	for (;;)
	{
		/* The cvsLatch is held across the dequeue and the unload */
		X_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch );
		DQ_DEQ(&zlssPool->ZP_Pool.P_VolumeList,volume,Volume_s,v_poolVolLink);
		if (volume == NULL)
		{
			UNX_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch );
			break;
		}
		/* Take a use count before handing the LV to the unload routine */
		volume->VOLfile.FILEuseCount += 1;
		status = ZLSSVOL_LV_Unload( &dummyGenMsg, (ZfsVolume_s *)volume );
		volume = NULL;
		if ( status != zOK )
		{
			zASSERT(status == zOK);
			ClearErrno( &dummyGenMsg );
		}
		UNX_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch );
////			/* If volume is NOT DEACTIVE or UNKNOWN then display
////			 * that we are deactivating it.
////			 */
////		COMN_GetVolumeName(&dummyGenMsg, volume, vName, NELEMS(vName));
//		status = COMN_ChangeVolumeState(&dummyGenMsg, volume,
//							zVOLSTATE_UNKNOWN, VOLMODE_VERBOSE );
//		zASSERT(status == zOK);
//		WORK_WaitForPending();	/* before freeing the poolVolume structure, wait
//								 * for all pending WORK to complete.  This
//								 * gives a chance for all outstanding
//								 * operations and transactions to complete.*/
//		BST_releaseAndFree(volume);
	}
		/* This will deactivate the ZLSS Pool and its physical volume */
	status = COMN_ChangePoolState(&dummyGenMsg, &zlssPool->ZLSSPOOLpool,
						zVOLSTATE_UNKNOWN, 0 /*VOLMODE_VERBOSE*/);
	if ( status != zOK )
	{
		zASSERT(status == zOK);
		ClearErrno( &dummyGenMsg );
	}

	ZLSS_removePoolMgmtFile( aStack->pName );

		/* By removing from the master volume link list we prevent
		 * this volume from being found.  This prevents any new
		 * operations from starting which is a requirement since
		 * we will be freeing the volume.
		 */
	COMN_RemoveVolumeFromAllLists( &poolVolume->ZFSPOOLvol );
		/* Defect 270987 - Remove the ZlssPool_s from the master pool
		   list.  We do not want the pool found just because a use
		   count is left on it because of a volume use count error. */
	COMN_RemovePoolFromAllLists( &zlssPool->ZP_Pool );

	WORK_WaitForPending();	/* before freeing the poolVolume structure, wait
							 * for all pending WORK to complete.  This
							 * gives a chance for all outstanding
							 * operations and transactions to complete.*/

	zASSERT( poolVolume->storagepool != NULL );	/* This can happen if loadpool
												 * fails really early on */
	if ( poolVolume->storagepool != NULL )
	{		/* Added this to clean up.  Mike's stuff has a pointer
			 * to us (no use count though).  During LV AIPU I re-load
			 * and unload the zfsPool_s several times and would get
			 * confused when looking at freed memory.  E.g. in low level
			 * I/O disable code we have storagepool and if it points to
			 * a zfsPool we use it to see if I/O has been disabled.
			 */
		if ( poolVolume->storagepool->zfspool == poolVolume )
		{	/* Only poke if it still points to us. */
			poolVolume->storagepool->zfspool = NULL;
		}
	}
		/* These are releasing the use counts obtained at 'load' time.
		 * We have these useCounts so that we control when the
		 * pool is freed.  The beasts will be freed when their
		 * useCounts get to 0.
		 */
	COMN_Release(&poolVolume);
	COMN_Release(&zlssPool);
	STACK_FREE();
	return;

}	/* End of ZFSPOOL_ShutdownPool() */


/**************************************************************************
 * Loop through and SHUTDOWN all pools and their LVs.
+ ***************************************************************************/ +void ZFSPOOL_Shutdown(void) +{ +//#ifndef __linux__ // LINUX_Upgrade + STATUS status; + GeneralMsg_s genMsg; +//#endif + ZfsPool_s *poolVolume; + zConPool_s *phypool; +#if NSS_ASSERT IS_ENABLED + NINT count=0; +#endif + ASSERT_MPKNSS_LOCK(); + + ENTER(TZPOOL, ZFSPOOL_Shutdown); + +//#ifndef __linux__ // LINUX_Upgrade + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + ZLSS_UpgradeShutdownNakoma(); + /* Do Manage_NSS cleanup up front because _Admin may be a + * persistent ZLSS volume. + */ + status = ZLSS_VFCDelete( &genMsg ); + if ( status != zOK ) + { + ClearErrno( &genMsg ); + } +//#endif + + while ( DQ_NOT_EMPTY(&ZLSSPhyPoolList) ) + { + DQ_DEQ_NO_CHECK(&ZLSSPhyPoolList, phypool, zConPool_s, pollink); + + if((poolVolume = phypool->pol.zfspool) != NULL) + { + ZFSPOOL_ShutdownPool( poolVolume ); + } + } + + X_LATCH( &ZLSSLVDeleteList.ZDL_beastLatch ); + ZLSSLVDeleteList.ZDL_stop = TRUE; + zASSERT( ZLSSLVDeleteList.ZDL_cnt == 0 ); + CANCEL_ALARM( ZLSSLVDeleteList.ZDL_timer ); + WARN( ZLSSLVDeleteList.ZDL_scheduled == FALSE ); + while ( ZLSSLVDeleteList.ZDL_scheduled == TRUE ) + { + UNX_LATCH( &ZLSSLVDeleteList.ZDL_beastLatch ); + LB_delay( ZLOG_WORK_CHECKPOINT_WAIT_DELAY ); /* 20 millisecs */ + X_LATCH( &ZLSSLVDeleteList.ZDL_beastLatch ); + +#if NSS_ASSERT IS_ENABLED + ++count; /* Not in ASSERT macro just in case of side-effects */ +#endif + /* If we have been doing this for 15 secs then ASSERT */ + zASSERT( count != ((15 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY) ); + /* If we have been doing this for 30 secs then ASSERT */ + zASSERT( count != ((30 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY) ); + } + UNX_LATCH( &ZLSSLVDeleteList.ZDL_beastLatch ); + +#if ZLSS_IO_GATHER IS_ENABLED + free(gZLSSPoolIOHeader); + gZLSSPoolIOHeader = NULL; + free(gZLSSPoolIOLog); + gZLSSPoolIOLog = NULL; +#endif +// ZIO_RA_Shutdown(); + RTN_VOID(); + +} /* End of ZFSPOOL_Shutdown() */ + + 
+/*========================================================================= + *========================================================================= + *========================================================================= + * + * ZFSPOOL BEAST Methods + * + *========================================================================= + *========================================================================= + *=========================================================================*/ + +/************************************************************************** + * ZFS pool beast constructor + ***************************************************************************/ +STATUS ZFSPOOL_BST_Construct( + GeneralMsg_s *genMsg, + void *pool_LX) +{ + ZfsPool_s *pool = (ZfsPool_s *)pool_LX; + + ENTER(TZPOOL, ZFSPOOL_BST_Construct); + ASSERT_MPKNSS_LOCK(); + + /* Assert that packed beast portion of checkpoint is on QUAD boundary */ + zASSERT( offsetof(Checkpoint_s,CP_PackedData) == + ALIGN( offsetof( Checkpoint_s, CP_PackedData ), 8 ) ); + /* Assert that superblock header is on LONG boundary. Checksum + code requires this. */ + zASSERT( sizeof( SuperBlockHeader_s ) == + ALIGN( sizeof( SuperBlockHeader_s ) , 4 ) ); + +/*--------------------------------------------------------------------------- + | Start initing the pool + *-------------------------------------------------------------------------*/ + /* Technically, this is incorrect as it is initialing the ZLSS Volume + * and therefore should be done in the ZLSS Volume constructor. This + * can not be done there because that constructor does not know the pool. + * In LV code the ZLSS Pool's volume is not NEWed until after the pool. + */ + pool->ZFSPOOLpool = pool; + /* Mark this as a special internal volume. This hides it from the user. + * At the same time, unmark it as a logical volume. We also mark + * hidden and system in NSS_Admin volume so a normal dir does not + * reveal the volume. 
+ */ + pool->ZFSPOOLvol.VOLv_statusFlag |= VOL_SF_INTERNAL_VOLUME; + pool->ZFSPOOLvol.VOLv_statusFlag &= ~VOL_SF_LOGICAL_VOLUME; + pool->ZFSPOOLfile.FILEattributes = zFA_SYSTEM|zFA_HIDDEN; + DQ_INIT( &pool->ZP_deleteBlkQ ); + DQ_INIT(&pool->freeDataBlksList); + INIT_LATCH( &pool->ZP_SuperblockHeaderLatch ); + RTN_STATUS(zOK); +} + +/************************************************************************** + * ZFS pool beast destructor + ***************************************************************************/ +void ZFSPOOL_BST_Destruct( + void *pool_LX ) +{ + ZfsPool_s *pool = (ZfsPool_s *)pool_LX; + + ENTER(TZPOOL, ZFSPOOL_Destruct); + ASSERT_MPKNSS_LOCK(); + + /* Note that normally deactivate will have taken care of the + * 4 releaseAndFree() and the free call. This code is here + * just in case we could not deactivate. + */ + BST_releaseAndFree(pool->freeExtent); + BST_releaseAndFree(pool->purgeLogBeast); + BST_releaseAndFree(pool->purgeTree); + BST_releaseAndFree(pool->zfsLogBeast); + /* Same goes here - ZP_super should already be freed */ + free(pool->ZP_super); + pool->ZP_super = NULL; + RTN_VOID(); +} + +/**************************************************************************** + * This function modifies metadata information for a POOL beast object + *****************************************************************************/ + /* This one does hidden volume modifies but not real ZLSS pool modifies*/ +STATUS ZFSPOOL_BST_ModifyInfo( + GeneralMsg_s *genMsg, + RootBeast_s *pool_LX, + ModifyInfoMsg_s *modifyMsg, + Xaction_s *xaction) /* Optional xaction, may be NULL */ +{ + ZfsPool_s *pool = (ZfsPool_s *)pool_LX; + zVolumeInfo_s *volInfo; + statusfunc_t derivedFromModifyInfo; + NINT modified = FALSE; + + + ASSERT_MPKNSS_LOCK(); + zASSERT(!(pool->ZFSPOOLenabledAttributes & zATTR_READONLY)); + +/*--------------------------------------------------------------------------- + * First, call the generic file information routine and modify any + * requested 
generic information. If it fails, do not continue... + *---------------------------------------------------------------------------*/ + derivedFromModifyInfo = COMN_GetNextParentBeastComnOp(pool->ZFSPOOLbeastClass, + COMNOPS_INDEX(BST_modifyInfo),ZFSPOOL_BST_ModifyInfo); + if (derivedFromModifyInfo(genMsg,pool,modifyMsg,xaction) != zOK) + { + return(zFAILURE); + } +/*--------------------------------------------------------------------------- + * Now, if the caller requested, fill in any pool specific information + *---------------------------------------------------------------------------*/ + if ((volInfo = modifyMsg->modifyTypeInfo) == NULL) + { + return(zOK); /* No type specific info to be modified */ + } + + if (modifyMsg->modifyTypeInfoMask) + { + if (pool->ZFSPOOLmayIDoThis(genMsg, pool, modifyMsg->parentZid, +#ifdef NSS_NW60 + MAY_I_DO_EVERYTHING) != zOK) +#else + MAY_I_SUPERVISE) != zOK) +#endif + { + SetErrno(genMsg,zERR_NO_SET_PRIVILEGE); + return(zFAILURE); + } +// if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_ATTRIBUTES) +// { +// pool->ZFSPOOLvolAttributes = volInfo->volumeAttributes; +// modified = TRUE; +// } + + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_LOW_WATER_MARK) + { + if (volInfo->salvage.lowWaterMark > zMAX_LOWWATERMARK) + volInfo->salvage.lowWaterMark = zMAX_LOWWATERMARK; + if (volInfo->salvage.lowWaterMark < zMIN_LOWWATERMARK) + volInfo->salvage.lowWaterMark = zMIN_LOWWATERMARK; + pool->ZFSPOOLlowWaterMark = volInfo->salvage.lowWaterMark; + modified = TRUE; + } + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_HIGH_WATER_MARK) + { + if (volInfo->salvage.highWaterMark > zMAX_HIGHWATERMARK) + volInfo->salvage.highWaterMark = zMAX_HIGHWATERMARK; + if (volInfo->salvage.highWaterMark < zMIN_HIGHWATERMARK) + volInfo->salvage.highWaterMark = zMIN_HIGHWATERMARK; + pool->ZFSPOOLhighWaterMark = volInfo->salvage.highWaterMark; + modified = TRUE; + } + else + { + /* Even if not changing high water mark, force it to be at least + * 2% greater than low 
water mark */ + if (pool->ZFSPOOLhighWaterMark < pool->ZFSPOOLlowWaterMark+2) + { + pool->ZFSPOOLhighWaterMark = pool->ZFSPOOLlowWaterMark+2; + modified = TRUE; + } + + } + if ( modified ) + { /* This is an async call that does not do transactions + * so no warrenty that persistent information is updated. + */ + ZFSVOL_WritePersistentVolumeData(&pool->ZFSPOOLzfsvol); + } + } + return(zOK); +} + +/**************************************************************************** + * Read the LOGFILE beast out of the CheckPoint + *****************************************************************************/ +STATUS ZFSPOOL_LoadLogBeast( + GeneralMsg_s *genMsg, + ZfsPool_s *pool) +{ + ZlogBeast_s *beast; + MediaCheckpoint_s *mcp; + ZnodeHeader_s *packedHeader; + + ENTER(TZPOOL, ZFSPOOL_LoadLogBeast); + ASSERT_MPKNSS_LOCK(); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_LoadLogBeast\n"))); + zASSERT( pool->ZP_super != NULL ); + mcp = &pool->ZP_super->SB_Checkpoint; + + packedHeader = (ZnodeHeader_s *)&mcp->chkPnt.CP_PackedData[0]; + zASSERT(packedHeader->magic == NODE_MAGIC); + zASSERT(packedHeader->zid == ZFSPOOL_LOGFILE_ZID); + beast = BST_new(genMsg,packedHeader->type,pool); + if (beast == NULL) + { + errPrintf(WHERE, Module, 1415, + MSG("Error creating journal beast, status=%d. " + "You may be out of memory.",116), GetErrno(genMsg)); + RTN_STATUS(zFAILURE); + } + beast->ZFSLOGbstState |= BST_STATE_DO_NOT_WRITE; + + if (BST_doUnpack(genMsg,&beast->ZFSLOGroot,&mcp->chkPnt.CP_PackedData[0], + packedHeader->length) != zOK) + { + errPrintf(WHERE, Module, 1416, + MSG("Error unpacking journal beast, status=%d.\n" + "NSS is trying to read the data or a checksum error has occurred. 
" + "Run Verify.", 117), GetErrno(genMsg)); + BST_free(beast); + RTN_STATUS(zFAILURE); + } + + BEASTHASH_Insert(&beast->ZFSLOGroot); + /* we do not want these special system beasts to be in the POOL linked list */ + DQ_RMV(&beast->ZFSLOGroot,volLink); + pool->zfsLogBeast = beast; + + RTN_STATUS(zOK); +} /* End of ZFSPOOL_LoadLogBeast() */ + + +/**************************************************************************** + * This is called to unPACK all of the Beasts from the SuperBlock. This + * routine always unpacks all of the beasts all of the time. + *****************************************************************************/ +STATUS ZFSPOOL_LoadSystemBeasts( + GeneralMsg_s *genMsg, + ZfsPool_s *pool) +{ + NINT i; + Buffer_s *buffer; + Blknum_t blk; + IoMsg_s iomsg; + RootBeast_s *beast; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZPOOL, ZFSPOOL_LoadSystemBeasts); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_LoadSystemBeasts\n"))); + + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + if (SuperBeasts[i].zid == ZFSPOOL_VOLBSTTREE_ZID) + { +/*------------------------------------------------------------------------- + * Read the BEASTTREE beast. We get the block number to read directly + * from the SuperBlock. Then read in the persistent volume data block. 
+ *-------------------------------------------------------------------------*/ + ZfsBeastTreeBeast_s *beastTree; + + /* read in beastTree beast */ + zASSERT( pool->ZP_super != NULL ); + blk = pool->ZFSPOOLzfsvolp.PZV_systemBeastBlkNum; +// blk = pool->ZP_super->SB_Header.hdr.SBH_BeastTreeBlkNum + + zASSERT( blk != 0 ); + READBLK_IO_MSG(iomsg,pool,blk,CACHE_READ); + SET_DEBUG_ID(iomsg, 35); + buffer = ZFS_ReadPoolBlk(genMsg,&iomsg); + if (buffer == NULL) + { + if ( GetErrno(genMsg) == zERR_NO_MEMORY ) + { + errPrintf(WHERE, Module, 1417, + MSG("Out of memory, status=%d.", 118), + GetErrno(genMsg)); + } + else + { + errPrintf(WHERE, Module, 1418, + MSG("Error reading beast tree block %d, status=%d.\n" + "Check the status, you may need to run Rebuild.", 119), + blk, GetErrno(genMsg)); + } +// cacheReleaseToss(buffer); + RTN_STATUS(zFAILURE); + } + beast = ZFSVOL_GetBeastFromBuffer(genMsg,SuperBeasts[i].zid, + &pool->ZFSPOOLvol,buffer); + cacheReleaseToss(buffer); + + if (beast == NULL) + { + if ( GetErrno(genMsg) == zERR_NO_MEMORY ) + { + errPrintf(WHERE, Module, 1419, + MSG("Out of memory, status=%d.\r\n", 120), + GetErrno(genMsg)); + } + else + { + errPrintf(WHERE, Module, 1420, + MSG("Error reading beast tree, status=%d.\n" + "Check the status, you may need to run Rebuild.", 451), + GetErrno(genMsg)); + } + RTN_STATUS(zFAILURE); + } + + /* read and set the persistent volume information */ + beastTree = (ZfsBeastTreeBeast_s *)beast; + } + else if (SuperBeasts[i].zid == ZFSPOOL_LOGFILE_ZID) + { + continue; + } + else + { +/*------------------------------------------------------------------------- + * All other beasts are read out of the BEAST file. 
+ *-------------------------------------------------------------------------*/ + if (SuperBeasts[i].zid == ZFSPOOL_PURGETREE_ZID) + { + zASSERT( pool != NULL ); + zASSERT( pool->ZP_super != NULL ); + if ( pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMajor == 32 ) + { /* Media will be upgraded in place later */ + *(RootBeast_s **)((ADDR)pool+SuperBeasts[i].poolOffset) = NULL; + continue; + } + /* Else we drop through and load up now */ + } + + beast = ZFSVOL_VOL_GetBeastFromVolume(genMsg,SuperBeasts[i].zid, + &pool->ZFSPOOLvol); + if (beast == NULL) + { + if ( GetErrno(genMsg) == zERR_NO_MEMORY ) + { + errPrintf(WHERE, Module, 1421, + MSG("Out of memory, status=%d.\n", 122), + GetErrno(genMsg)); + RTN_STATUS(zFAILURE); + } + else + { + if (SuperBeasts[i].zid == ZFSVOL_USERTREE_ZID || + SuperBeasts[i].zid == ZFSVOL_DIRTREE_ZID) + { + ClearErrno(genMsg); + } + else if (SuperBeasts[i].zid == ZFSVOL_PURGELOG_ZID && + GetErrno(genMsg) == zERR_ZID_NOT_FOUND) + { + LONG version; + + version = pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMajor * 0x100L + + pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMinor; + /* FixFixFix(LV,ACI,11) - should the pool's volume + * purge log be created here? then could remove + * for LV AIPU and not have to worry if someone + * needs the purge log before it is created. 
+ */ + if ( version >= (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_PVPL_DONE) ) + { /* Media still may not have Volume Purge Log */ + errPrintf(WHERE, Module, 1422, + MSG("Error reading system beast (ZID %d), status=%d.\n" + "Check the status, you may need to run Rebuild.", 866), + (NINT)SuperBeasts[i].zid, + GetErrno(genMsg)); + RTN_STATUS(zFAILURE); + } + ClearErrno( genMsg ); + } + else + { + errPrintf(WHERE, Module, 1422, + MSG("Error reading system beast (ZID %d), status=%d.\n" + "Check the status, you may need to run Rebuild.", 124), + (NINT)SuperBeasts[i].zid, + GetErrno(genMsg)); + RTN_STATUS(zFAILURE); + } + } + } +#if NSS_DEBUG IS_ENABLED + else + { + if (SuperBeasts[i].zid == ZFSVOL_DIRTREE_ZID) + { + NINT version; + version = pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMajor + * 0x100L + pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMinor; + zASSERT(version < + (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE)); + } + } +#endif + + } + /* Handle that the Volume Purge Log may not exist */ + if ( beast != NULL ) + { + BEASTHASH_Insert(beast); + /* we do not want these special system beasts to be in the POOL linked list */ + DQ_RMV(beast,volLink); + /* + * If we have just loaded up the purge log then go through its setup + * so that entries can be added in later phases of activation, + * before it has a chance to be played. 
+ */ + if ((SuperBeasts[i].zid == ZFSVOL_PURGELOG_ZID) || + (SuperBeasts[i].zid == ZFSPOOL_PURGELOG_ZID)) + { + ZFSPURGELOG_Setup(genMsg, (ZfsPurgeLogBeast_s *)beast); + } + } + *(RootBeast_s **)((ADDR)pool+SuperBeasts[i].poolOffset) = beast; + } + zASSERT( pool->zfsLogBeast != NULL ); + /* Set up quick pointer to ZLOG Beast (needed to do XACTIONS) */ + pool->ZFSPOOLzfsvolzlog = pool->zfsLogBeast; + + /* All upgrades that do transactioning should be done after the + * XactionRecoverLogicalUndo call in ZFSPOOL_Activate + */ + + RTN_STATUS(zOK); +} + +/**************************************************************************** + * This is called to dirty all of the Beasts from the SuperBlock. This + * is usually needed if we have on-the-fly upgraded the beasts. We cannot + * dirty the beasts until after the volume is active. + *****************************************************************************/ +void ZFSPOOL_DirtySystemBeasts( + GeneralMsg_s *genMsg, + ZfsPool_s *pool) +{ + NINT i; + RootBeast_s *beast; + + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + + beast = BEASTHASH_LookupByZid(genMsg, &pool->ZFSPOOLvol, + SuperBeasts[i].zid, XLATCHED); + + if (beast != NULL) + { + if (beast->beastVersion != CURRENT_BEAST_VERSION) + { + COMN_MARK_BEAST_DIRTY(beast); + } + COMN_UnlatchAndRelease(&beast, XLATCHED); + } + else + { + ClearErrno( genMsg ); + } + } + return; +} + + +/************************************************************************ + * + * This routine activates all the logical volumes for the given pool + * upto the point where they can be used for logical undo. The recovery + * code is the only one that should call this routine. (It assumes that + * the pool's cvsLatch is held). The recovery code will also call the + * cleanup routine corresponding to this routine, once the logical undo + * records are done playing. 
*
 * We first find all the logical volumes by doing a wildcard lookup in
 * the well known directory on the pool's internal volume where the
 * locator beasts for these volumes are kept.
 *
 * Once we find the locator beast we do a BST_new of the lv beast, and
 * put it on a linked list.  Then we load the persistent pool data and
 * the system beasts.
 *
 * genMsg	- error context; errno is cleared on entry and again before
 *		  returning from the error path.
 * pool	- pool whose logical volumes are to be quasi activated.
 *
 * NOTE: both exits return zOK.  On failure the partially set up volume,
 * the locator beast, the directory beast and the name space reference
 * are released, errno is cleared and zOK is still returned (the
 * zFAILURE return is commented out at the bottom of the routine).
 *
 ************************************************************************/
STATUS ZFSPOOL_ActivateAllLVsQuasi(
	GeneralMsg_s	*genMsg,
	ZfsPool_s		*pool )
{
	NamedBeast_s	*directory = NULL;
	NameSpace_s		*nameSpace = NULL;
	ZlssVolumeLocator_s *zvlBeast = NULL;
	ZfsVolume_s		*zVolume = NULL;
// cnt	NINT			nameUniquifier;
	Zid_t			dirZid;
	Zid_t			zvlZid;
	BOOL			cleanupSMAP = FALSE;	/* TRUE while WE own cleanup of the smap */
	typedef struct Stack_s {
		SearchMap_s		smap;
	} Stack_s;
	STACK_ALLOC();

	ClearErrno(genMsg);
	bzero( &aStack->smap, sizeof( SearchMap_s ) );
	/* Start at the root directory of the pool's internal volume */
	directory = (NamedBeast_s *)ZFSVOL_VOL_GetBeastFromVolume(genMsg,
										zROOTDIR_ZID, ZFS_POOL_TO_VOLUME(pool));
	if (directory == NULL)
	{
		goto cleanup_error;
	}

	nameSpace = COMN_NameSpaceIDLookup(genMsg, zNSPACE_LONG);
	if (nameSpace == NULL)
	{
		goto cleanup_error;
	}

	S_LATCH(&directory->NAMEDbeastLatch);

	/* Find the well known directory that holds the volume locator beasts */
	dirZid = ZFSVOL_VOL_LookupByNameInDirectory(genMsg, directory,
					nameSpace, zNTYPE_FILE, ZLSSVOL_DIRECTORY_UNICODE,
					/* zFNU_FIRST_PARENT, &nameUniquifier,*/ NULL);

	UNS_LATCH(&directory->NAMEDbeastLatch);

	if (dirZid == zINVALID_ZID)
	{
		goto cleanup_error;
	}
	BST_free(directory);

	/* Switch from the root directory to the locator directory.  If this
	 * fails, directory is NULL and the cleanup code will not free the
	 * root directory a second time. */
	directory = (NamedBeast_s *)ZFSVOL_VOL_GetBeastFromVolume(genMsg,
								dirZid, ZFS_POOL_TO_VOLUME(pool));
	if (directory == NULL)
	{
		goto cleanup_error;
	}

	aStack->smap.options |= SMAPOPT_matchAllEntries;
/*
	If we stop calling ZFSVOL_VOL_WildcardLookup() before getting an error
	WE must clean up the smap!  This means we need to call
			DIR_SCTRL_CLEANUP(smap);
	so that the SMAP is removed from any lists that it is on.
*/
	while (1)
	{
		S_LATCH(&directory->NAMEDbeastLatch);

		cleanupSMAP = TRUE;
		zvlZid = ZFSVOL_VOL_WildcardLookup(genMsg, directory, nameSpace,
				zNTYPE_FILE, NULL, &aStack->smap, /* cnt &nameUniquifier,*/ NULL);

		UNS_LATCH(&directory->NAMEDbeastLatch);

		if (zvlZid == zINVALID_ZID)
		{
			cleanupSMAP = FALSE;	/* ZFSVOL_VOL_WildcardLookup has cleaned up SMAP */
			if (GetErrno(genMsg) == zERR_NAME_NOT_FOUND_IN_DIRECTORY)
			{
				/* Normal end of the directory scan */
				ClearErrno(genMsg);
				break;
			}
			else
			{
				goto cleanup_error;
			}
		}

		zvlBeast = (ZlssVolumeLocator_s *)ZFSVOL_VOL_GetBeastFromVolume(genMsg,
								zvlZid, ZFS_POOL_TO_VOLUME(pool));
		if (zvlBeast == NULL)
		{
			if (GetErrno(genMsg) == zERR_ZID_NOT_FOUND)
			{
				/* A directory entry without a beast is skipped */
#if NSS_DEBUG IS_ENABLED
				DBG_DebugPrintf(RED, MSGNot("Error finding logicalVolumeBeast in the beast Tree. Error = %ld (%s) \n"), GetErrno(genMsg), GetErrnoSetter(genMsg));
				zASSERT("Logical Volume not found during Quasi Activation." == NULL);
#endif
				ClearErrno(genMsg);
				continue;
			}
			else
			{
				goto cleanup_error;
			}
		}

#if NSS_DEBUG IS_ENABLED
		DBG_DebugPrintf(CYAN,
			MSGNot("LV %U with state %d and substate %d found\n"),
			zvlBeast->ZVL_p.PZVL_volumeName,
			zvlBeast->ZVL_p.PZVL_state,
			zvlBeast->ZVL_p.PZVL_subState);
#endif
		switch (zvlBeast->ZVL_p.PZVL_state)
		{
		case PZVL_S_DELETION:
			if (zvlBeast->ZVL_p.PZVL_subState > PZVL_SS_DELETE_STEP_ACTIVATE)
			{
				break;
			}
			/* If the subState is less than or equal to the above substate
			 * we want to fall into the code below
			 */
			/* FALLTHROUGH */
		case PZVL_S_CREATED:

			zASSERT((zvlBeast->ZVL_p.PZVL_state == PZVL_S_CREATED) ||
					(zvlBeast->ZVL_p.PZVL_state == PZVL_S_DELETION));

#if NSS_DEBUG IS_ENABLED
			DBG_DebugPrintf(CYAN,
				MSGNot("Quasi activation of LV %U. \n"),
				zvlBeast->ZVL_p.PZVL_volumeName);
#endif
			zVolume = ZLSSVOL_NewFakeLVBeast(genMsg, zvlBeast, pool,
					zvlBeast->ZVL_p.PZVL_volumeName);
			if (zVolume == NULL)
			{
				goto cleanup_error;
			}

			/* Bring the LV up in three steps; each failure undoes the
			 * steps already completed before jumping to the cleanup */
			if (ZLSSVOL_LoadPersistentPool(genMsg, zVolume, 0, 0) != zOK)
			{
				goto cleanup_error;
			}
			if (ZFSVOL_Activate(genMsg, zVolume, 0) != zOK)
			{
				ZLSSVOL_UnloadPersistentPool(zVolume, 0);
				goto cleanup_error;
			}
			if (VOL_Activate(genMsg, ZLSS_VOLUME_TO_VOLUME(zVolume), 0) != zOK)
			{
				ZFSVOL_Deactivate(genMsg, zVolume, 0);
				ZLSSVOL_UnloadPersistentPool(zVolume, 0);
				goto cleanup_error;
			}

			/* System Beasts have a pointer to the volume and put a
			 * use Count on the volume */
			zASSERT( zVolume->ZFSVOLfile.FILEuseCount >= 2 );
			--(zVolume->ZFSVOLfile.FILEuseCount);
			/* Success: the cleanup code must no longer touch this LV */
			zVolume = NULL;

			break;

		case PZVL_S_CREATION:
		case PZVL_S_CREATION_DELETE:
			/* This volume has not been created yet, so there should not be
			 * any logical undo records for this volume, we will ignore it
			 * and go on
			 */
			break;
		default:
			zASSERT("Unrecognized Volume Locator Beast state" == NULL);
			break;
		}

		BST_free(zvlBeast);
		zvlBeast = NULL;
	}

	BST_free(directory);
	BST_UNUSE_BEAST(&nameSpace->NSPACEroot);
	STACK_FREE();
	return zOK;


cleanup_error:
	if ( cleanupSMAP )
	{
		/* We stopped the wildcard scan early, so the smap is ours to clean */
		DIR_SCTRL_CLEANUP(&aStack->smap);
		cleanupSMAP = FALSE;
	}

	if (zVolume)
	{
		/* Every path that leaves zVolume set also leaves zvlBeast set,
		 * so the debug print below may dereference it */
#if NSS_DEBUG IS_ENABLED
		DBG_DebugPrintf(CYAN,
				MSGNot("Quasi activation of LV %U failed. \n"),
				zvlBeast->ZVL_p.PZVL_volumeName);
#endif
		zASSERT(QMEMBER(&zVolume->ZLSSVOLvol.masterVolLink));
		SET_RMV(zVolume,ZLSSVOLvol.masterVolLink);

		zASSERT(QMEMBER(&zVolume->ZFSVOLvol.v_poolVolLink));
		DQ_RMV(&zVolume->ZFSVOLvol, v_poolVolLink);

		zASSERT( zVolume->ZFSVOLfile.FILEuseCount == 2 );
		--(zVolume->ZFSVOLfile.FILEuseCount);
		zASSERT( zVolume->ZFSVOLfile.FILEuseCount == 1 );

		COMN_Release(&zVolume);
//		zVolume = NULL;
	}
	if (zvlBeast)
	{
		BST_free(zvlBeast);
	}
	if (directory)
	{
		BST_free(directory);
	}
	if (nameSpace)
	{
		BST_UNUSE_BEAST(&nameSpace->NSPACEroot);
	}
#if NSS_DEBUG IS_ENABLED
	DBG_DebugPrintf(RED, MSGNot("Quasi activation of LVs failed. Error = %ld (%s) \n"), GetErrno(genMsg), GetErrnoSetter(genMsg));
	zASSERT("Error during Quasi Activation." == NULL);
#endif

	/* The error is deliberately dropped: the caller still gets zOK */
	ClearErrno(genMsg);
	STACK_FREE();
	return zOK;
//	ZFSPOOL_DeactivateAllQuasiActiveLVs(pool);
//	return zFAILURE;
}

/************************************************************************
 *
 * This routine will cleanup the volumes that were quasi activated for
 * recovery.  It removes it from the linked list and does a BST_free
 * on the beast for the logical volume.
+ * + ************************************************************************/ + +void ZFSPOOL_DeactivateAllQuasiActiveLVs( + ZfsPool_s *pool) +{ + GeneralMsg_s genMsg; + Volume_s *volume; + ZlssPool_s *zlssPool = ZFS_POOL_TO_ZLSS_POOL(pool); + ZfsVolume_s *zVolume; + + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + + for (;;) + { + DQ_DEQ(&zlssPool->ZP_Pool.P_VolumeList,volume,Volume_s,v_poolVolLink); + if (volume == NULL) + { + break; + } + ++(volume->VOLfile.FILEuseCount); + zASSERT( volume->VOLfile.FILEuseCount >= 2 ); + + VOL_Deactivate(&genMsg, volume, 0); + + zVolume = VOLUME_TO_ZLSS_VOLUME(volume); + ZFSVOL_Deactivate(&genMsg, zVolume, 0); + ZLSSVOL_UnloadPersistentPool(zVolume, 0); + + zASSERT(QMEMBER(&zVolume->ZFSVOLvol.masterVolLink)); + SET_RMV(zVolume,ZFSVOLvol.masterVolLink); + + --(volume->VOLfile.FILEuseCount); + zASSERT( volume->VOLfile.FILEuseCount == 1 ); + COMN_Release(&volume); + } + return; +} + + +/************************************************************************** + * This is called to move a volume from the OFFLINE to the NEARLINE state. 
+ *
+ * Steps, in order: find the newest checkpoint, set the pool block shift
+ * and block size from the storage pool, load the log beast, run the log
+ * recovery phase, set the restart ID, load the system beasts, run
+ * XactionRecoverLogicalUndoPass7() and finally the in-place upgrade.
+ * The first step that does not return zOK ends the routine through the
+ * cleanup_error label with RTN_STATUS(zFAILURE).
+ *
+ * genMsg - passed to every step listed above
+ * pool   - pool being activated
+ * mode   - NSS VOLMODE_ flags; VOLMODE_VERBOSE prints a progress line
+ * zmode  - ZLSS mode; not referenced in this routine
+ ***************************************************************************/
+STATUS ZFSPOOL_Activate(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool,
+	NINT mode,			/* NSS's VOLMODE_ */
+	NINT zmode )		/* ZLSS mode */
+{
+
+	STATUS status;
+	typedef struct Stack_s {
+		unicode_t poolName[zMAX_COMPONENT_NAME];
+		GeneralMsg_s dummyGenMsg;
+	} Stack_s;
+
+	STACK_ALLOC();	/* presumably provides the aStack pointer used below -- TODO confirm */
+
+	ENTER(TZPOOL, ZFSPOOL_Activate);
+	ASSERT_MPKNSS_LOCK();
+	DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_Activate\n")));
+
+/*---------------------------------------------------------------------------
+ | Get the newest checkpoint (Loaded into pool->ZP_super->SB_Checkpoint)
+ *-------------------------------------------------------------------------*/
+	/* Get the pool's name (errors from the lookup go to the dummy genMsg) */
+	COMN_SETUP_GENERAL_MSG_NOSA( &aStack->dummyGenMsg );
+	COMN_GetVolumeName(&aStack->dummyGenMsg,(Volume_s *)pool,aStack->poolName,NELEMS(aStack->poolName));
+
+	status = ZFSPOOL_CheckpointFindNewest( genMsg, pool,
+				&pool->ZP_super->SB_Checkpoint, aStack->poolName );
+	if ( status != zOK )
+	{	/* All errors mean we can not come up */
+		goto cleanup_error;
+	}
+
+	/* These values must be setup before the beasts are read in */
+	pool->ZFSPOOLblockShift = pool->storagepool->poolblkshift;
+	pool->ZFSPOOLblockSize = (1 << pool->ZFSPOOLblockShift);
+/*-------------------------------------------------------------------------
+ * Recovery phases
+ *-------------------------------------------------------------------------*/
+	if (ZFSPOOL_LoadLogBeast(genMsg, pool) != zOK)
+		goto cleanup_error;
+
+	if (ZLOG_RecoveryPhase( genMsg, pool->zfsLogBeast, mode ) != zOK)
+	{
+		goto cleanup_error;
+	}
+/*-------------------------------------------------------------------------
+ * Update the restart count and make new restart count persistent
+ *-------------------------------------------------------------------------*/
+	if (ZFSPOOL_CheckpointSetRestartID(genMsg,pool) != zOK)
+	{
+		goto cleanup_error;
+	}
+
+/*-------------------------------------------------------------------------
+ * Now that recovery is completed, bring up the pool
+ *-------------------------------------------------------------------------*/
+	if ( mode & VOLMODE_VERBOSE )
+	{
+		aprintf(NSS_POOL_COLOR,MSG(" ** Loading system objects\n", 125));
+	}
+	if (ZFSPOOL_LoadSystemBeasts(genMsg,pool) != zOK)
+	{
+		goto cleanup_error;
+	}
+
+//#ifndef __linux__	// LINUX_Upgrade
+	/* Do not do any in place upgrades that do transactioning in this
+	 * routine.
+	 */
+#ifdef NW5X_UPGRADE
+	if (ZFSPOOL_AutoInPlaceUpgradeNoXactions(genMsg, pool) != zOK)
+	{
+		goto cleanup_error;
+	}
+#endif
+//#endif
+	if (XactionRecoverLogicalUndoPass7(genMsg, pool) != zOK)
+	{
+		goto cleanup_error;
+	}
+
+//#ifndef __linux__	// LINUX_Upgrade
+	/* (We can now do XACTIONS) */
+	/* All upgrades that do transactioning can be done after this point */
+
+	if (ZFSPOOL_AutoInPlaceUpgrade(genMsg, pool) != zOK)
+	{
+		goto cleanup_error;
+	}
+//#endif
+
+	STACK_FREE();
+	RTN_STATUS(zOK);
+
+/*-------------------------------------------------------------------------
+ * Error handling.  Only the stack area is freed here; nothing that was
+ * loaded above is unloaded or deactivated by this routine.
+ *-------------------------------------------------------------------------*/
+cleanup_error:
+	STACK_FREE();
+	RTN_STATUS(zFAILURE);
+}
+
+/**************************************************************************
+ * This is called when we are bringing the POOL down
+ *
+ * Not allowed to return an error!
+ *
+ * Always returns zOK, but will set errno to indicate that some
+ * sort of error did occur. The zOK just indicates that we succeeded
+ * in the sense that we have done what we could.
+ *
+ * genMsg - its errno is set from the first flush/checkpoint failure that is
+ *          checked below, and only while it is still zOK
+ * pool   - pool being deactivated
+ * mode   - not referenced by the active code (only by the commented-out
+ *          iVOLMODE_UNDO test)
+ ***************************************************************************/
+STATUS ZFSPOOL_Deactivate(
+	GeneralMsg_s *genMsg,
+	ZfsPool_s *pool,
+	NINT mode )
+{
+	NINT i;
+	ZlogBeast_s *zlogBeast;
+	NINT count = 0;				/* iterations of the wait-for-home loop */
+	GeneralMsg_s dummyGenMsg;	/* scratch errno for the calls made here */
+
+	ENTER(TZPOOL, ZFSPOOL_Deactivate);
+
+	ASSERT_MPKNSS_LOCK();
+
+	COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg );
+	DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_Deactivate\n")));
+	CANCEL_ALARM(pool->ZFSPOOLmycache.agent.timer);
+	/* Removed CANCEL_ALARM(pool->timer) because
+	 * ZFSPOOL_CheckpointTake() below handles the alarm cancel. Make
+	 * sure that the routine is called with the 'LAST' bit set if you
+	 * change this routine.
+	 */
+	zASSERT(DQ_EMPTY(&pool->ZFSPOOLbeastList));
+	/* The *30 is to ENSURE we take enough checkpoints to clear
+	 * the deferred delete queue.
+	 */
+	for ( i=0; i < (CHECKPOINT_NUMBER*30); i++ )
+	{
+/*-------------------------------------------------------------------------
+ * Write all of the SYSTEM beasts
+ *-------------------------------------------------------------------------*/
+		if (ZFSPOOL_DoFlushSystemBeasts(&dummyGenMsg,pool,FALSE) != zOK)
+		{
+			errPrintf(WHERE, Module, 1423,
+				MSG("Unable to flush system beasts, status=%d. Run Verify.", 126),
+				GetErrno(&dummyGenMsg));
+			if ( GetErrno( genMsg ) == zOK )
+			{	/* keep the first error for the caller */
+				SetErrno( genMsg, GetErrno(&dummyGenMsg) );
+			}
+			ClearErrno( &dummyGenMsg );
+			/* continue with the deactivation */
+		}
+		/*
+		 * This code prevents us from taking extra checkpoints when
+		 * trying to clear the deferred delete queue. Here we check
+		 * that there are no more items on the deferred delete queue.
+		 *
+		 * We must clear the deferred delete queue so that we can take
+		 * one last checkpoint that will be clean. If we do not clear
+		 * deferred delete queue 1st then after we take the last
+		 * checkpoint more log records may be generated by blocks
+		 * that are now deletable.
+		 *
+		 * The QUEUE is protected by NetWare being non-pre-emptive.
+		 *
+		 */
+		if (DQ_EMPTY(&pool->ZP_deleteBlkQ))
+		{	/* Nothing on queue */
+			break;
+		}
+		if ( ZFSPOOL_CheckpointTake( &dummyGenMsg, pool,
+				CHECKPOINT_CT_S_DEFERED_DELETE|CHECKPOINT_CT_S_NORMAL) != zOK )
+		{
+			errPrintf(WHERE, Module, 1424,
+				MSG("Error flushing checkpoint, status=%d. Run Verify.", 127),
+				GetErrno(&dummyGenMsg));
+
+			/* continue with the deactivation */
+			if ( GetErrno( genMsg ) == zOK )
+			{
+				SetErrno( genMsg, GetErrno(&dummyGenMsg) );
+			}
+			ClearErrno( &dummyGenMsg );
+		}
+		LB_delay(200);
+	}
+	/* Worst case is that the Free B-Tree will cause MANY delete
+	 * block deletes of its own blocks. Note a delete could cause
+	 * the Free B-Tree to delete one of its nodes. Depending on state
+	 * this delete could then cause another delete of a block in the
+	 * Free B-Tree. 40 seems like more than reasonable number of
+	 * checkpoints to take. If this is WRONG then we will have
+	 * a purge log entry and be forced to do REDO/UNDO and/or replay
+	 * purge log.
+	 * NOTE(review): the loop above is bounded by CHECKPOINT_NUMBER*30;
+	 * confirm that the figure 40 is still what is meant.
+	 */
+#if NSS_DEBUG IS_ENABLED
+	/* We skip ASSERTS if in DISABLE mode because they will occur */
+	if ( !(ZLSS_VOLUME_IO_DISABLED( &pool->ZFSPOOLzfsVol ) ) )
+	{
+		zASSERT( i < CHECKPOINT_NUMBER*30 );
+		zASSERT( DQ_EMPTY(&pool->ZFSPOOLzfsVol.ZV_deleteBlkQ) );
+	}
+#endif
+	/* TOSS any left over blocks. We must do because if we can not write
+	 * checkpoints then we have been unable to properly delete the
+	 * blocks on the delete block queue.
+	 */
+	XDEL_DeactivateVolume(&pool->ZFSPOOLzfsVol);
+
+	/* This code waits for all transactions to home. The system flush
+	 * in the above loop returns when the metadata is flushed. It
+	 * does not wait for home to move. By waiting for all transactions
+	 * to complete we also allow the ASYNC EndXLocal threads to
+	 * clear out of the system.
+	 */
+	zlogBeast = pool->zfsLogBeast;
+	if ( zlogBeast != NULL )
+	{
+		(void)ZFSPOOL_DoFlushSystemBeasts(&dummyGenMsg,pool,FALSE);	/* Bug 308300 */
+		X_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+		while ( (!(DQ_EMPTY( &zlogBeast->ZLB_SeniorityListHead ))) &&
+				(count < ((30 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY)) )
+		{
+			UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );	/* latch is dropped while we delay and flush */
+			LB_delay( ZLOG_WORK_CHECKPOINT_WAIT_DELAY );	/* 20 millisecs */
+			/* Bug 308300
+			 * Now that we do group writes of metadata we must ensure
+			 * defaultFlush is done to all the metadata blocks. Since
+			 * we have only used TRUE (3rd parameter) we may have some
+			 * metadata blocks that got dirty via updating other system
+			 * beasts. We do not want to have to wait up to 45 seconds
+			 * for the metadata group write timer to perform defaultFlush.
+			 * NOTE(review): the calls in this routine up to this point
+			 * pass FALSE as the 3rd parameter; confirm which value the
+			 * sentence above refers to.
+			 */
+			(void)ZFSPOOL_DoFlushSystemBeasts(&dummyGenMsg,pool,FALSE);	/* Bug 308300 */
+			X_LATCH( &zlogBeast->ZFSLOGbeastLatch );	/* re-taken before the list is tested again */
+			++count;
+		}
+		ClearErrno(&dummyGenMsg);	/* Bug 308300 */
+		/* If we did the above for 30 secs then ASSERT */
+		zASSERT( count < ((30 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY) );
+		/* We drop out once count reaches the bound in the while
+		 * condition above (the 30 second wait asserted on above) so we
+		 * do not hang. This means that some items have not homed. This
+		 * will mean we do a REDO/UNDO sequence on the next activate
+		 * even though the user believes they did a clean deactivate.
+		 */
+		UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+
+		/*-----------------------------------------------------------------------
+		 * Write a clean checkpoint
+		 *-----------------------------------------------------------------------*/
+		/* Don't do tosses until after we are done with all the activity on
+		 * the system beasts, because cacheToss has a problem where it
+		 * removes the blocks from the cache HASH even if they still need
+		 * to be flushed.
+		 */
+		ZFSPOOL_DoFlushSystemBeasts(&dummyGenMsg,pool,TRUE);	/* return value is not checked here */
+//		if ( mode & iVOLMODE_UNDO )
+//		{
+//			zASSERT(" Not writing clean checkpoint because iVOLMODE_UNDO set(OK to do go)"==NULL);
+//			DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_CheckpointTake(skipping becuase iVOLMODE_UNDO)\n")));
+//		}
+//		else
+		{
+			DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_CheckpointTake(fake)\n")));
+			if (ZFSPOOL_CheckpointTake(&dummyGenMsg,pool,CHECKPOINT_CT_S_DEACTIVATION|
+						CHECKPOINT_CT_S_CLEAN|CHECKPOINT_CT_S_LAST) != zOK)
+			{
+				errPrintf(WHERE, Module, 1425,
+					MSG("Error flushing checkpoint, status=%d. Run Verify.", 262),
+					GetErrno( &dummyGenMsg));
+				/* continue with the deactivation */
+				if ( GetErrno( genMsg ) == zOK )
+				{
+					SetErrno( genMsg, GetErrno(&dummyGenMsg) );
+				}
+				ClearErrno( &dummyGenMsg );
+			}
+		}
+	}
+	else
+	{
+		ZFSPOOL_DoFlushSystemBeasts(&dummyGenMsg,pool,TRUE);	/* no log beast: flush only, no checkpoint */
+	}
+/*-------------------------------------------------------------------------
+ * Now release all of the SYSTEM beasts
+ *-------------------------------------------------------------------------*/
+	DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_UnloadSystemBeasts(fake)\n")));
+	for(i=0;SuperBeasts[i].beastClass != -1;i++)
+	{
+		/* poolOffset locates the beast pointer inside the pool structure */
+		if (*((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)) != NULL)
+		{
+			BST_releaseAndFree(*((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)));
+			*((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)) = NULL;
+		}
+	}
+	/* The return says the function succeeded in cleaning up
+	 * all resources.  If errno is set then we got some sort
+	 * of error while deactivating.
+	 */
+
+	RTN_STATUS( zOK );
+
+}
+
+
+/**************************************************************************
+ * Compare Master Pool List to find DUP Pool names
+ *
+ * Hands back the status produced by LB_PoolNameAcceptable( name ).
+ ***************************************************************************/
+STATUS ZFSPOOL_CmpName(unicode_t *name)
+{
+	STATUS status;
+
+	ENTER(TZPOOL, ZFSPOOL_CmpName);
+	ASSERT_MPKNSS_LOCK();
+
+	status = LB_PoolNameAcceptable( name );
+	RTN_STATUS(status);
+
+}	/* End of ZFSPOOL_CmpName() */
+
+
+//#ifndef __linux__	// LINUX_Upgrade
+/**************************************************************************
+ * This routine upgrades a user tree with local user IDs to a tree with
+ * GUIDs for user IDs.
+ *
+ * The leaves of the old tree are followed through their nextLeaf links.
+ * Each record's user ID is mapped with COMN_MapNDSIDToGUID() and its used
+ * amount (plus its restriction amount, unless that is
+ * zUSER_NO_RESTRICTIONS) is stored in the new tree inside its own local
+ * transaction.  A record that cannot be mapped or stored is reported with
+ * errPrintf() and skipped.
+ *
+ * Returns zFAILURE when the new user tree beast cannot be created or a
+ * block of the old tree cannot be read; in those cases the old tree is
+ * only unused and freed, not removed from the volume.  Otherwise the old
+ * tree is removed from the volume and freed, and the status of
+ * OID_InitObjectInfo() is returned.
+ *
+ * Only compiled when NW5X_UPGRADE is defined.
+ ***************************************************************************/
+#ifdef NW5X_UPGRADE
+STATUS ZFSPOOL_upgradeUserTree(
+	GeneralMsg_s *genMsg,
+	ZfsVolume_s *volume,
+	ZfsUserTreeBeast_s *oldTree)
+{
+	ZfsUserTreeBeast_s *newTree = NULL;
+	Buffer_s *oldBuffer;
+	OldUserTreeNode_s *oldNode;
+	UserTreeParms_s parms;
+	Blknum_t nextLeaf;
+	ZfsXaction_s *xaction;
+	IoMsg_s iomsg;
+	NINT index;
+	STATUS retStatus;
+	UserID_t newUserID;
+	STATUS status;
+
+	if (oldTree->zfsBtree.p.btRoot == INVALID_BLK)
+	{	/* old tree has no root block: nothing to convert */
+		status = zOK;
+		goto deleteOldTree;
+	}
+
+	newTree = (ZfsUserTreeBeast_s *)ZFSVOL_VOL_GetBeastFromVolume(genMsg,
+					ZFSVOL_USERTREE_ZID, &volume->ZFSVOLvol);
+
+	if (newTree == NULL)
+	{
+		ClearErrno(genMsg);
+		if (UBT_CreateUserTreeBeast(genMsg, volume) != zOK)
+		{
+			status = zFAILURE;
+			zASSERT("Error creating new user tree beast"==0);
+			goto cleanupOldTreeFromMemory;
+		}
+		else
+		{
+			newTree = volume->ZLSSVOLuserTree;
+		}
+	}
+	else
+	{
+		volume->ZLSSVOLuserTree = newTree;
+	}
+
+	/* Remove any existing user entries in the new tree */
+	ZFSVOL_VOL_ResetAllUsers(genMsg, &volume->ZFSVOLvol);
+
+	/* Find the left-most leaf in the old tree */
+	if (oldTree->zfsBtree.p.btLeftMostLeaf == INVALID_BLK)
+	{	/* no left-most leaf -- parse down to the first node */
+		nextLeaf = oldTree->zfsBtree.p.btRoot;
+		for(;;)
+		{
+			READBLK_IO_MSG(iomsg, oldTree, nextLeaf, CACHE_READ);
+			oldBuffer = ZFS_ReadPoolBlk(genMsg, &iomsg);
+			if (oldBuffer == NULL)
+			{
+				zASSERT("Read error parsing tree for left-most leaf"==0);
+				status = zFAILURE;
+				goto cleanupOldTreeFromMemory;
+			}
+			oldNode = (OldUserTreeNode_s *)oldBuffer->pBuf.data;
+			if (UBT_IS_LEAF(oldNode))
+			{	/* nextLeaf now names the left-most leaf */
+				CACHE_RELEASE(oldBuffer);
+				break;
+			}
+			nextLeaf = oldNode->UBRANCH(0).child;	/* descend through branch entry 0 */
+			CACHE_RELEASE(oldBuffer);
+		}
+	}
+	else
+	{
+		/* Start at the left most leaf */
+		nextLeaf = oldTree->zfsBtree.p.btLeftMostLeaf;
+	}
+
+	/*
+	 * Parse the old tree by following the leaf links. Convert the user IDs
+	 * to GUIDs and add the entries to the new tree.
+	 */
+	while(nextLeaf != INVALID_BLK)
+	{
+		READBLK_IO_MSG(iomsg, oldTree, nextLeaf, CACHE_READ);
+		oldBuffer = ZFS_ReadPoolBlk(genMsg, &iomsg);
+		if (oldBuffer == NULL)
+		{
+			zASSERT("Error reading old tree block"==0);
+			status = zFAILURE;
+			goto cleanupOldTreeFromMemory;
+		}
+		oldNode = (OldUserTreeNode_s *)oldBuffer->pBuf.data;
+
+		/* Convert the user IDs and add them to the new user tree */
+		for (index = 0; index < oldNode->numRecs; index++)
+		{
+			if (COMN_MapNDSIDToGUID(&retStatus, oldNode->ULEAF(index).userID,
+					&newUserID) != zOK)
+			{
+				errPrintf(WHERE, Module, 1474,
+					MSG("Unable to convert ID to GUID. Status=%d\n"
+					"The entry for a user has been lost.", 877),
+					retStatus);
+				zASSERT("Error converting to GUID -- Ignoring entry"==0);
+				continue;
+			}
+			else
+			{
+				parms.volume = &volume->ZFSVOLvol;
+				parms.xaction = BeginXLocal(parms.volume, BXL_DEFAULT);	/* one local transaction per record */
+				parms.userTreeBeast = newTree;
+				parms.userID = newUserID;
+				parms.value = oldNode->ULEAF(index).usedAmount;
+				parms.type = UBT_TYPE_USED_AMOUNT;
+				if (UBT_modifyUserValue(genMsg, &parms) != zOK)
+				{
+					EndXlocal(parms.xaction);	/* end the transaction before skipping the record */
+					errPrintf(WHERE, Module, 1475,
+						MSG("Error modifying used amount during convert. Status=%d\n"
+						"User restrictions may be inaccurate", 929),
+						GetErrno(genMsg));
+					zASSERT("Error modifying used amount"==0);
+					continue;
+				}
+				if (oldNode->ULEAF(index).restrictionAmount !=
+						zUSER_NO_RESTRICTIONS)
+				{
+					parms.value = oldNode->ULEAF(index).restrictionAmount;
+					parms.type = UBT_TYPE_RESTRICTION;
+					if (UBT_modifyUserValue(genMsg, &parms) != zOK)
+					{
+						EndXlocal(parms.xaction);
+						errPrintf(WHERE, Module, 1476,
+							MSG("Error modifying restriction amount during convert. Error=%d\n"
+							"User restrictions may be innaccurate", 938),
+							GetErrno(genMsg));
+						zASSERT("Error modifying restriction amount"==0);
+						continue;
+					}
+				}
+				EndXlocal(parms.xaction);
+			}
+		}
+		nextLeaf = oldNode->n.leaf.nextLeaf;	/* read the link before the buffer is released */
+		CACHE_RELEASE(oldBuffer);
+	}
+	status = zOK;
+
+	BST_flush(newTree);
+
+deleteOldTree:
+	/*
+	 * Get rid of the old tree
+	 */
+	UBT_OLD_ResetAllUsers(genMsg, &volume->ZFSVOLvol, oldTree);
+	BST_flush(oldTree);
+	cacheTossAll(&oldTree->ZFSUSERTREEmycache);
+	X_LATCH(&oldTree->ZFSUSERTREEbeastLatch);
+	xaction = BeginXLocal(&volume->ZFSVOLvol, BXL_DEFAULT);
+	ZFSVOL_VOL_RemoveBeastFromVolume(genMsg, &oldTree->ZFSUSERTREEroot,
+			&xaction->xaction);
+	EndXlocal(xaction);
+	UNX_LATCH(&oldTree->ZFSUSERTREEbeastLatch);
+
+cleanupOldTreeFromMemory:
+	/* reached on success and on the zFAILURE paths above */
+	BST_UNUSE_BEAST(&oldTree->ZFSUSERTREEroot);
+	BST_free(&oldTree->ZFSUSERTREEroot);
+	if (status == zOK)
+	{
+		/* Make sure all object IDs in the file system are in the tree */
+		status = OID_InitObjectInfo(genMsg, &volume->ZFSVOLvol);
+	}
+
+	return status;
+}
+#endif
+//#endif
+
+
+/* ZFSPOOL_CheckSharedPool() -
+ *	Performs checks to see if we should change the STATE of a pool.
+ *
+ * Returns -
+ *	zFAILURE - when state change should not be done.
+ *
+ * Notes -
+ *	Must only be called when going to MAINTENANCE or ACTIVE state
+ *	from DEACTIVE state. When going to ACTIVE state the PDB must already
+ *	be in (I.E. ZFSPOOL_LoadPersistentPool has been called).
+ * In MAINTENANCE case this code will read in and they discard the PDB + * if needed. The need is based on if the volume broker is not loaded AND no + * override specified. Note that an error is returned if we can not + * read in the PDB. The user will have to do a SHARED override to change + * state to MAINTENANCE state. + */ + +STATUS ZFSPOOL_CheckSharedPool( + GeneralMsg_s *genMsg, + ZfsPool_s *zfsPool, + NINT mode, + NINT state ) + +{ +// STATUS status; + + if ( COMN_ClusterSoftwareIsCheckingPoolStateChanges() ) + { /* Cluster software is in charge if pool is in use elsewhere. */ +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"Cluster software is checking shared pools\n"); +#endif + } + else + { +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"Cluster software NOT checking for multiple shared pool activations\n"); +#endif + if ( mode & VOLMODE_OVERRIDE_SHARED ) + { +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"Shared override specified\n"); +#endif + } + else + { + ZlssPool_s *zlssPool = ZFS_POOL_TO_ZLSS_POOL( zfsPool ); + QUAD features; + +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"Shared override not requested specified\n"); +#endif +// if ( state == zVOLSTATE_MAINTENANCE ) +// { +// status = ZFSPOOL_LoadPersistentPool( genMsg, zfsPool, mode, 0 ); +// if ( status != zOK ) +// { +// SetErrno(genMsg,zERR_POOL_SHARED_STATE_UNKNOWN); +// return( zFAILURE ); +// } +// } + features = zlssPool->ZLSSPOOLenabledFeatures; + +// if ( state == zVOLSTATE_MAINTENANCE ) +// { +// ZFSPOOL_UnloadPersistentPool( zfsPool, mode ); +// } + + if ( features & zPOOL_FEATURE_SHARED_CLUSTER ) + { + SetErrno(genMsg,zERR_POOL_SHARED_NO_BROKER); + return( zFAILURE ); + } + + } + } + + return( zOK ); + +} /* End of ZFSPOOL_CheckSharedPool() */ + + +/************************************************************************** + * this is called to bring a POOL online, 
the MODE parameter defines
+ * different level of ONLINE
+ *
+ * The parent class's VOL_changeVolumeState operation is looked up and
+ * called first; if it does not return zOK that status is returned and no
+ * pool level work is done.  After that the work is selected by
+ * requestedState (outer switch), sourceState (inner switch) and pass.
+ *
+ * pool_LX is used as a ZfsPool_s pointer.
+ ***************************************************************************/
+STATIC STATUS ZFSPOOL_VOL_ChangeVolumeState(
+	GeneralMsg_s *genMsg,
+	void *pool_LX,
+	NINT sourceState,
+	NINT requestedState,
+	NINT mode,
+	NINT pass )
+{
+	ZfsPool_s *pool = (ZfsPool_s *)pool_LX;
+	STATUS status;
+	statusfunc_t parentFunc;
+	RootBeast_s *beast;
+	ZlssPool_s *zlssPool;
+
+	ASSERT_MPKNSS_LOCK();
+
+	ENTER(TZPOOL, ZFSPOOL_ChangeVolumeState);
+
+	zlssPool = ZFS_POOL_TO_ZLSS_POOL( pool );
+	zASSERT( zlssPool != NULL );
+	zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) );
+
+/*-------------------------------------------------------------------------
+ * Since we know we need to call the ZfsVolume_s code, get the address of
+ * that routine.
+ *-------------------------------------------------------------------------*/
+	parentFunc = COMN_GetNextParentVolumeComnOp(pool->ZFSPOOLbeastClass,
+			COMNVOLOPS_INDEX(VOL_changeVolumeState),ZFSPOOL_VOL_ChangeVolumeState);
+	zASSERT(parentFunc != NULL);
+	status = parentFunc( genMsg, &pool->zfsVol, sourceState,
+					requestedState, mode, pass );
+	if ( status != zOK )
+	{
+		return( status );	/* NOTE(review): plain return after ENTER(); other exits use RTN_STATUS -- confirm */
+	}
+
+	switch (requestedState)
+	{
+
+/*=========================================================================*/
+	case zVOLSTATE_ACTIVE:		/*** Destination State ***/
+		switch( sourceState )
+		{
+		case zVOLSTATE_DEACTIVE:
+			/*** Yes, we want to drop through this case. ***/
+			/*** Yes, we want to drop through this case. ***/
+			/*** Yes, we want to drop through this case. ***/
+		case zVOLSTATE_MAINTENANCE:
+			if ( pass == 1 )
+			{
+				HMC_InvalidatePoolHimemPages( &zlssPool->ZP_Pool );
+				if ( zlssPool->ZP_Version <
+						(AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE) )
+				{	/* We are NOT done with the UPGRADE so the ONLY
+					 * time we allow us to go into ACTIVE state is
+					 * if we are LOADING/UPGRADING the pool. This
+					 * is required because the LV AIPU part of
+					 * upgrade loads and unloads the pool.
+					 */
+					if ( !zlssPool->ZP_Loading )
+					{
+						SetErrno( genMsg, zERR_ZLSSPOOL_UPGRADE_POOL_FIRST );
+						RTN_STATUS( zFAILURE );
+					}
+				}
+				zlssPool->ZP_PoolTrackIO = gCLPoolTrackActive;
+			}
+			if ( pass == ZLSSPOOL_ACTIVATE )
+			{
+				/* When we go from DEACTIVE to ACTIVE we need
+				 * to do our extra SHARED pool checks.  Note
+				 * the check is not needed when going from
+				 * MAINTENANCE to ACTIVE because the check was
+				 * done when we went from DEACTIVE to MAINTENANCE.
+				 */
+				status = ZFSPOOL_LoadPersistentPool( genMsg, pool, mode, 0, sourceState == zVOLSTATE_DEACTIVE, requestedState );
+				if ( status != zOK )
+				{
+					RTN_STATUS(zFAILURE);
+				}
+				/* If we are not in override mode then we need
+				 * to fail going to ACTIVE state if the volume
+				 * is corrupt or was being repaired/checked.  We
+				 * have to wait until now to check because the
+				 * corrupt and repairing bits are stored in the
+				 * persistent volume.  Maybe Volume_s should do
+				 * the check.
+				 */
+				if ( !(mode & VOLMODE_OVERRIDE)
+//					&& (sourceState == zVOLSTATE_DEACTIVE)
+					)
+				{
+					if ( (pool->ZFSPOOLvol.p.stateAttributes & VOLSTATEATTR_CORRUPT) ||
+						 (pool->ZFSPOOLvol.p.stateAttributes & VOLSTATEATTR_REPAIRING) )
+					{
+						DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,
+							MSGNot("Change state converted to a request from %d to %d based on stateAtrribute of 0x%lx\n"),
+							sourceState, zVOLSTATE_MAINTENANCE,
+							pool->ZFSPOOLvol.p.stateAttributes));
+						if ( sourceState == zVOLSTATE_DEACTIVE )
+						{
+							if ( mode & VOLMODE_VERBOSE )
+							{
+								if ( pool->ZFSPOOLvol.p.stateAttributes & VOLSTATEATTR_REPAIRING )
+								{
+									aprintf(LRED,MSG(" ** Volume was being rebuilt so being placed into MAINTENANCE state\n", 286) );
+								}
+								else
+								{
+									aprintf(LRED,MSG(" ** Volume has corruption so being placed into MAINTENANCE state\n", 287) );
+								}
+							}
+							SetErrno(genMsg,zERR_VOLUME_STATE_CHANGE_A_TO_M);
+						}
+						else
+						{	/* Since we are corrupt or were repairing and
+							 * we have not been told to OVERRIDE then we
+							 * will not do this activation request.
+							 */
+							SetErrno(genMsg,zERR_VOLUME_SHOULD_NOT_ACTIVATE);
+						}
+						return( zFAILURE );	/* NOTE(review): plain return, not RTN_STATUS -- confirm */
+					}
+				}
+				status = ZFSPOOL_Activate(genMsg,pool,mode,0);	/* this status is what the final RTN_STATUS reports for this pass */
+			}
+
+			if ( pass == ZLSSVOL_ACTIVATE_TIME )
+			{
+				ZfsPurgeLogBeast_s *holdPurgeLog;
+
+				/*
+				 * The following can be done here and does not need to
+				 * be added to Verify or Repair because we will not
+				 * allow a volume to be repaired or verified that
+				 * has not been already upgraded.
+				 */
+
+
+				/** Replenish the special blk in the free tree to have
+				 ** enough free blocks for the free tree to use when
+				 ** it needs to grow or split.
+				 ** If the special blk does not exist, it is created
+				 ** at this time. (on existing volumes)
+				 **/
+				CheckSpecialBlkAndAdd(pool, NULL,
+						XTREE_SPBLK_XACTION | XTREE_SPBLK_THREAD_SCH);
+
+				/*
+				 * If there is an old version of the user tree then
+				 * upgrade it to the new version. This must be done
+				 * before the purge log is played, as playing the
+				 * purge log may cause modifications to the user tree.
+				 */
+
+				holdPurgeLog =
+					((ZfsVolume_s *)pool)->ZFSVOLvolumePurgeLog;	/* restored at the end of this pass */
+
+				if (holdPurgeLog == NULL)
+				{	/* we have not yet got the volume purge log set up */
+					((ZfsVolume_s *)pool)->ZFSVOLvolumePurgeLog =
+						pool->ZFSPOOLpurgeLogBeast;
+				}
+				beast = ZFSVOL_VOL_GetBeastFromVolume(genMsg,
+						ZFSPOOL_VOLUSERTREE_ZID, &pool->ZFSPOOLvol);
+				if (beast != NULL)
+				{
+					BEASTHASH_Insert(beast);
+					DQ_RMV(beast, volLink);
+//#ifndef __linux__	// LINUX_Upgrade
+#ifdef NW5X_UPGRADE
+					ZFSPOOL_upgradeUserTree(genMsg,
+							&pool->ZFSPOOLzfsVol,
+							(ZfsUserTreeBeast_s *)beast);	/* return value is not checked */
+#endif
+//#endif
+				}
+				else
+				{
+					ClearErrno( genMsg );
+				}
+				((ZfsVolume_s *)pool)->ZFSVOLvolumePurgeLog =
+					holdPurgeLog;
+			}
+
+			if ( pass == ZLSSPOOL_LAST )
+			{
+				/* After the system is up, now process the PURGE log. */
+				if ( mode & VOLMODE_VERBOSE )
+				{
+					aprintf(NSS_POOL_COLOR,MSG(" ** Processing pool purge log\n", 289));
+				}
+				status = ZFSPOOL_PlayPurgeLog(genMsg, pool, &pool->ZFSPOOLzfsvol,
+						pool->purgeLogBeast, mode);
+				if ( status != zOK )
+				{
+					RTN_STATUS( status );
+				}
+
+#if SUPERBLOCK_SB_VM_MEDIA_MAJOR != 43
+#error "This is the place you would add conversion 43 to 44 (if POOL related) look at ZFSPOOL_LoadSystemBeasts as another place"
+#error "If not then you may need to change which pass 32 to 33 conversion takes place in"
+#endif
+//				status = ZFSPOOL_AIPU34To35( genMsg, pool, mode );
+#if 0
+//				if ( zlssPool->ZP_Version ==
+//						(AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE) )
+//				{	/* Schedule LAST part of LV upgrade */
+//					ZLVAIPU_AIPU4006To4007( zlssPool );
+//				}
+//				if ( zlssPool->ZP_Version == ZLSS_MEDIA_VERSION_4007 )
+//				{
+//					(void)MSAP_AIPU4007To4008( zlssPool, genMsg );
+//					ClearErrno( genMsg );
+//					/* No error checking because if MSAP off at server
+//					 * we do not want MSAP to fail a activate. In any
+//					 * case, the error must have been an I/O failure
+//					 * which will be caught on the next I/O.
+//					 */
+//					status = MSAP_MSAPActivate( zlssPool, genMsg );
+//					if ( status != zOK )
+//					{
+//						RTN_STATUS( status );
+//					}
+//				}
+#endif
+//#ifndef __linux__	// LINUX_Upgrade
+				/* Start up the background upgrade thread that
+				 * ensures that the pool gets upgraded into
+				 * Nakoma version sooner or later.
+				 */
+				if ( ZLSS_AIPUNakomaPool_Ready( zlssPool ) )
+				{
+					ZLSS_UpgradeStartupNakoma();
+				}
+//#endif
+				/* Increase the POOL size if a segment has
+				 * been added since last activation OR we
+				 * crashed while adding space.
+				 */
+				status = ZFSPOOL_SetNewPoolSizeDuringActivation( genMsg, pool );
+				if ( status != zOK )
+				{
+					RTN_STATUS( status );
+				}
+				/*
+				 * Dirty the system beasts if needed.
+				 * This must be done just before we
+				 * set the volume state to active so the
+				 * timer does not pop before the volume
+				 * is active
+				 */
+				ZFSPOOL_DirtySystemBeasts(genMsg, pool);
+			}
+			break;
+		case zVOLSTATE_UNKNOWN:
+		case zVOLSTATE_ACTIVE:
+		default:
+			break;
+		}
+		break;
+
+/*=========================================================================*/
+	case zVOLSTATE_DEACTIVE:
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE:
+			if ( pass == 10 )
+			{
+				MSAP_MSAPDeactivate( zlssPool );
+				zlssPool->ZP_PoolTrackIO = gCLPoolTrackDeactive;
+				HMC_InvalidatePoolHimemPages( &zlssPool->ZP_Pool );
+			}
+			if ( pass == ZLSSPOOL_DEACTIVATE_CLEANUP_LV_DELETES )
+			{
+				ZLSSVOL_LVD_DeleteListRemoveAllFromPool( (Volume_s *)pool );
+			}
+			else if ( pass == ZLSSPOOL_DEACTIVATE )
+			{
+				GeneralMsg_s dummyGenMsg;
+				/* We do not care about errors from
+				 * deactivate so pass it a dummy genMsg
+				 * so it does not mess with our errno.
+				 */
+				COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg );
+				ASSERT_MPKNSS_LOCK();
+				ZFSPOOL_Deactivate( &dummyGenMsg, pool, mode );
+				ZFSPOOL_UnloadPersistentPool( pool, mode );
+			}
+			break;
+		case zVOLSTATE_MAINTENANCE:
+			if ( pass == 10 )
+			{
+				MSAP_MSAPDeactivate( zlssPool );
+				zlssPool->ZP_PoolTrackIO = gCLPoolTrackDeactive;
+				HMC_InvalidatePoolHimemPages( &zlssPool->ZP_Pool );
+			}	/* no break: runs on into the cases below, which only break */
+		case zVOLSTATE_DEACTIVE:
+		case zVOLSTATE_UNKNOWN:
+		default:
+			break;
+		}
+		break;
+/*=========================================================================*/
+	case zVOLSTATE_MAINTENANCE:
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE:
+			if ( pass == 1 )
+			{
+				zlssPool->ZP_PoolTrackIO = gCLPoolTrackMaintenance;
+			}
+			if ( pass == ZLSSPOOL_DEACTIVATE_CLEANUP_LV_DELETES )
+			{
+				ZLSSVOL_LVD_DeleteListRemoveAllFromPool( (Volume_s *)pool );
+			}
+			if ( pass == ZLSSPOOL_ACTIVE_TO_MAINTENANCE )
+			{
+				ZFSPOOL_Deactivate( genMsg, pool, mode );	/* caller's genMsg here; the ACTIVE->DEACTIVE path uses a dummy */
+				ZFSPOOL_UnloadPersistentPool( pool, mode );
+			}
+			if ( pass == 10 )
+			{
+				HMC_InvalidatePoolHimemPages( &zlssPool->ZP_Pool );
+			}
+			break;
+		case zVOLSTATE_DEACTIVE:
+			if ( pass == 1 )
+			{
+				HMC_InvalidatePoolHimemPages( &zlssPool->ZP_Pool );
+				zlssPool->ZP_PoolTrackIO = gCLPoolTrackMaintenance;
+			}
+			if ( pass == ZLSSPOOL_ACTIVATE )
+			{
+				status = ZFSPOOL_CheckSharedPool( genMsg, pool, mode, requestedState );
+				if ( status != zOK )
+				{
+					RTN_STATUS( zFAILURE );
+				}
+				status = MSAP_MSAPActivate( zlssPool, genMsg );
+				if ( status != zOK )
+				{
+					RTN_STATUS( zFAILURE );
+				}
+			}
+			break;
+		case zVOLSTATE_MAINTENANCE:
+		case zVOLSTATE_UNKNOWN:
+		default:
+			break;
+		}
+		break;
+/*=========================================================================*/
+	case zVOLSTATE_UNKNOWN:
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE:
+		case zVOLSTATE_DEACTIVE:
+		case zVOLSTATE_MAINTENANCE:
+		case zVOLSTATE_UNKNOWN:
+		default:
+			break;
+		}
+		break;
+	}
+	RTN_STATUS( status );
+
+}
+
+//#if 1
+/*---------------------------------------------------------------------------
+ * This is an internal structure used to manipulate all of the beasts that
+ * are stored
+ *
+ * Each entry is initialized positionally with a beast class (zFTYPE_*), a
+ * *_ZID constant, the offsetof() of a member that holds the beast pointer,
+ * and a wide-string name.  Code in this file walks the table until
+ * beastClass is -1 and uses poolOffset, added to the pool address, to reach
+ * each beast pointer.  The closing {-1} entry presumably sets beastClass
+ * (assumes it is the first member -- TODO confirm against PoolBeastCtrl_s).
+ *---------------------------------------------------------------------------*/
+PoolBeastCtrl_s SuperBeasts[] =
+{
+	/*- do not change the order, some initialization is based on this order -*/
+	{zFTYPE_ZLSS_BEAST_TREE,ZFSPOOL_VOLBSTTREE_ZID,
+		offsetof(ZfsPool_s,ZFSPOOLbeastTree),
+		MSGNot(L"volumeBeastTree")},
+
+	{zFTYPE_ZLSS_FREE_EXTENT,ZFSPOOL_FREETREE_ZID,
+		offsetof(ZfsPool_s,freeExtent),
+		MSGNot(L"freeTree")},
+
+	{zFTYPE_ZLSS_LOG,ZFSPOOL_LOGFILE_ZID,
+		offsetof(ZfsPool_s,zfsLogBeast),
+		MSGNot(L"transactionLog")},
+
+	{zFTYPE_ZLSS_PURGE_LOG,ZFSPOOL_PURGELOG_ZID,
+		offsetof(ZfsPool_s,purgeLogBeast),
+		MSGNot(L"poolPurgeLog")},
+
+	{zFTYPE_ZLSS_PURGE_LOG,ZFSVOL_PURGELOG_ZID,
+		offsetof(ZfsVolume_s,ZLSSVOLvolumePurgeLog),	/* NOTE(review): only entry whose offset is taken from ZfsVolume_s */
+		MSGNot(L"volumePurgeLog")},
+
+	{zFTYPE_ZLSS_SALVAGE,ZFSPOOL_PURGETREE_ZID,
+		offsetof(ZfsPool_s,purgeTree),
+		MSGNot(L"purgeTree")},
+
+	{zFTYPE_ZLSS_NAME_TREE,ZFSPOOL_VOLNAMETREE_ZID,
+		offsetof(ZfsPool_s,ZFSPOOLnameTree),
+		MSGNot(L"volumeNameTree")},
+
+	{zFTYPE_ZLSS_USER_TREE,ZFSVOL_USERTREE_ZID,
+		offsetof(ZfsPool_s,ZFSPOOLuserTree),
+		MSGNot(L"volumeUserTree")},
+
+	{zFTYPE_ZLSS_DIR_TREE,ZFSVOL_DIRTREE_ZID,
+		offsetof(ZfsPool_s,ZFSPOOLdirTree),
+		MSGNot(L"volumeDirTree")},
+
+	{-1}	/* end-of-table marker */
+};
+
+
+/**************************************************************************
+ * This will flush all of the SYSTEM beasts to the disk except the LOGFILE
+ * beast which is stored in CheckPoint. This is called when we are both
+ * deactivating as well as simply flushing the pool. We pass a flag to
+ * say what mode we are in. Make sure that the volume persistent data
+ * is being written. Some code depends on this (i.e.
+ * VOL_UpgradeBeastsOnVolume).
+ ***************************************************************************/ +STATUS ZFSPOOL_DoFlushSystemBeasts( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + BOOL deactivating) /* if TRUE we are deactivating the pool, else flushing*/ +{ + RootBeast_s *beast; + NINT i; + NINT retryCount; + + ENTER(TZPOOL,ZFSPOOL_DoFlushSystemBeasts); + ASSERT_MPKNSS_LOCK(); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_DoFlushSystemBeasts\n"))); +/*------------------------------------------------------------------------- + * First go through and flush all of the system beasts. We will wait for + * them to write. + *-------------------------------------------------------------------------*/ + STOP_SIGNAL_PROPAGATION(&pool->ZFSPOOLagent); + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + beast = *((RootBeast_s **)((ADDR)pool+SuperBeasts[i].poolOffset)); + if(beast != NULL) + { + if (!(beast->bstState & BST_STATE_DO_NOT_WRITE)) + { + bind(&pool->ZFSPOOLagent,&beast->ROOTmycache.agent); + beast->bstState |= BST_STATE_FULL_FLUSH; + cacheFlushMyCache(&beast->ROOTmycache); + } + } + } + START_SIGNAL_PROPAGATION(&pool->ZFSPOOLagent); + defaultFlushWait(&pool->ZFSPOOLagent); +/*------------------------------------------------------------------------- + * After we have made the initial pass through the system beasts, we can + * now write the persistent volume information. 
+ *-------------------------------------------------------------------------*/ +#if NSS_DEBUG IS_ENABLED + if ( !SkipCheckpoints ) +#endif + { + ZlssPool_s *zlssPool; + + ZFSVOL_WritePersistentVolumeData(&pool->ZFSPOOLzfsvol); + zlssPool = (ZlssPool_s *)pool->ZFSPOOLvol.v_pool; + zASSERT( zlssPool != NULL ); + + ZLSSPOOL_WritePersistentPoolData( zlssPool ); + } + if ( (pool->ZFSPOOLroot.bstState & BST_STATE_DIRTY) || + !DQ_EMPTY(&pool->ZFSPOOLroot.mycache.bufList)) + { + pool->ZFSPOOLroot.bstState |= BST_STATE_FULL_FLUSH; + cacheFlushMyCacheBufs(&pool->ZFSPOOLmycache ); + defaultFlushWait(&pool->ZFSPOOLmycache.agent ); + } +/*------------------------------------------------------------------------- + * The act of flushing the system beasts may have caused other system + * beasts to change. We will keep looping and flushing the beasts until + * we can do a pass where none of them are dirty. We only do this when + * we are deactivating, we don't need to do it on a FLUSH. + *-------------------------------------------------------------------------*/ + if (deactivating) + { + for (retryCount=10;;retryCount--) + { + BOOL didFlush; + + didFlush = FALSE; + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + beast = *((RootBeast_s **)((ADDR)pool+SuperBeasts[i].poolOffset)); + if(beast != NULL) + { + if (((beast->bstState & BST_STATE_DIRTY) || + !DQ_EMPTY(&beast->ROOTmycache.bufList)) && + !(beast->bstState & BST_STATE_DO_NOT_WRITE)) + { + didFlush = TRUE; + beast->bstState |= BST_STATE_FULL_FLUSH; + cacheFlushMyCacheBufs(&beast->ROOTmycache); + defaultFlushWait(&beast->ROOTmycache.agent); + cacheTossAll(&beast->ROOTmycache); + } + } + } + if ( (pool->ZFSPOOLroot.bstState & BST_STATE_DIRTY) || + !DQ_EMPTY(&pool->ZFSPOOLroot.mycache.bufList)) + { + didFlush = TRUE; + pool->ZFSPOOLroot.bstState |= BST_STATE_FULL_FLUSH; + cacheFlushMyCacheBufs(&pool->ZFSPOOLmycache ); + defaultFlushWait(&pool->ZFSPOOLmycache.agent ); + cacheTossAll(&pool->ZFSPOOLmycache ); + } + if (!didFlush) 
+ break; + if (retryCount <= 0) + { + errPrintf(WHERE, Module, 1426, + MSG("Unable to flush the system beasts after 10 retries. " + "Run Verify.", 368)); + zASSERT(0); + break; + } + } + } + RTN_STATUS(zOK); +} + +/* + * ZLSSPOOL_PoolVolDataInitialize() - + * Initialize the Pool's internal volume persistent data. + * + */ +void ZLSSPOOL_PoolVolDataInitialize( + ZfsPool_s *zfsPool, + StorPool_s *storagepool) +{ + zASSERT( offsetof( PersistentVolume_s, PV_reserved3 ) == 184 ); + zASSERT( offsetof( PersistentZfsVolume_s, PZV_reserved ) == 88 ); + /* If you hit ANY of the above ASSERTs be sure to add the item + * that changed the size to the initialization code below. + * Then update the ASSERT + */ + ZLSSVOL_InitVDB( ZFS_POOL_TO_ZLSS_VOLUME( zfsPool ), + /* IV allocates and inits LVDB, VDB, and BT fields later */ + INVALID_BLK_ZERO, INVALID_BLK_ZERO, INVALID_BLK_ZERO, + GetUTCTime(), + 0, 0, + FALSE ); + + zASSERT( offsetof( LoggedPersistentVolume_s, LPV_reserved ) == 120 ); + zASSERT( offsetof( LoggedPersistentZfsVolume_s, LPZV_reserved ) == 32 ); + /* If you hit ANY of the above ASSERTs be sure to add the item + * that changed the size to the initialization code below. + * Then update the ASSERT + */ + ZLSSVOL_InitLVDB( ZFS_POOL_TO_ZLSS_VOLUME( zfsPool ), + storagepool->pooltotalblocks, 0, + /* All blocks initially in use during pool create */ + storagepool->pooltotalblocks, FALSE ); + + return; + +} /* End of ZLSSPOOL_PoolVolDataInitialize() */ + + +/*************************************************************************** + * + * This is a common routine to manually initialize a newly created beast. + * (We use it to initilialize the root dir also. 
+ * + ***************************************************************************/ +STATUS ZFSPOOL_InitNewBeast( + GeneralMsg_s *genMsg, + File_s *file, + Zid_t zid, + Zid_t parentZid, + NINT fileAttributes, + unicode_t *name) +{ + file->FILEzid = zid; + file->FILEfirstParentZid = parentZid; + file->FILEattributes = fileAttributes; + file->FILEaccessedTime = + file->FILEcreatedTime = + file->FILEmodifiedTime = + file->FILEmetaDataModifiedTime = GetUTCTime(); + + file->FILEarchivedTime = INVALID_UTC_TIME; + + file->FILEownerID = + file->FILEmodifierID = + file->FILEmetaDataModifierID = zSUPERVISOR_ID; + + file->FILEmetaDataSeqNum = 1; + file->FILEbstState |= BST_STATE_NEW; + + if (name == NULL) + { + return zOK; + } + + if (NAME_doAddName(genMsg, &file->FILEnamed, &file->FILEfirstParent, + 1 << zNSPACE_LONG, name, 0) != zOK) + { + return zFAILURE; + } + file->FILEfirstParent.p.primaryNameSpaceID = zNSPACE_LONG; +// cnt file->FILEfirstParentNameUniquifier = zFNU_FIRST_PARENT; + file->FILEfirstParentNameType = zNTYPE_FILE; + file->FILEnumParents++; + + return zOK; +} + +/*************************************************************************** + * + * Initialize the newly created root dir. + * + * Returns + * zFAILURE - Caller must BST_free beast. + * zOK - xxvol_deactivate will free beast as VOLrootdir has been set. 
+ ***************************************************************************/ +STATUS ZFSPOOL_InitNewRootDir( + GeneralMsg_s *genMsg, + File_s *rootdir, + ZfsVolume_s *zfsVol) +{ + + ASSERT_XLATCH(&rootdir->FILEbeastLatch); + if (ZFSPOOL_InitNewBeast(genMsg, rootdir, zROOTDIR_ZID, zINVALID_ZID, + zFA_SUBDIRECTORY, NULL) != zOK) + { + return zFAILURE; + } + + zfsVol->ZFSVOLrootdir = rootdir; // xxxvol_deactivate will NULL out + if (rootdir->FILEauthModelOps->initVolumeAuthInfo( + genMsg, &zfsVol->vol) != zOK) + { + errPrintf(WHERE, Module, 1429, + MSG("Unable to initialize authentication information on the root " + "directory on the volume, status=%d.", 422), + GetErrno(genMsg)); + zfsVol->ZFSVOLrootdir = NULL; + return zFAILURE; + } + return zOK; +} + + +/*- (FUNCTION) ----- ZFSPOOL_InitNewPool() ---------------------------------- + | + | This is called to setup a new pool. Because ZLSS associates a internal + | volume with the pool this call also creates that internal volume. + | + +-------------------------------------------------------------------------*/ +STATUS ZFSPOOL_InitNewPool( + GeneralMsg_s *genMsg, + StorPool_s *storagepool) +{ + ZfsPool_s *pool; + ZlssPool_s *zlssPool; + ZFSMemorySuperBlk_s *super; + File_s *beast; + File_s *rootdir; + NINT i; + NINT packedSize; + Extent_s seedext; + Extent_s extent; + STATUS status; + ZfsXaction_s *xaction; + ZfsPurgeLogBeast_s *purgeLog = NULL; + ZfsPurgeLogBeast_s *volumePurgeLog = NULL; + ZfsBeastTreeBeast_s *ztree = NULL; + Buffer_s *buffer; + IoMsg_s iomsg; + VolumeID_t volumeID; + zConPool_s *phypool; + NINT sbMinor; + + ENTER(TZPOOL, ZFSPOOL_InitNewPool); + ASSERT_MPKNSS_LOCK(); + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED,"ZFSPOOL_InitNewPool(enter)\n")); + + if ( storagepool->pooltotalblocks < ((10 * 1024 * 1024)/4096) ) + { /* Enforce old 10MB pool/volume rule for pools */ + SetErrno( genMsg, zERR_POOL_NOT_BIG_ENOUGH ); + return( zFAILURE ); + } + + if ( storagepool->pooltotalblocks >= UI64_CONST(0x80000000) ) 
+ { /* Enforce pool size of no larger that 8 Tb - 1 */ + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSize pool size is NEGATIVE\n"))); + SetErrno( genMsg, zERR_POOL_TOO_LARGE ); + return( zFAILURE ); + } + + LB_GUIDGenerate( &volumeID ); + + zlssPool = POOL_BstNew( genMsg, zFTYPE_ZLSS_LOGICAL_POOL, COMN_GetAdminVolume(), + storagepool->poolname, &volumeID, zLSS_ID_ZLSS ); +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LRED,"%s zlssPool address is %lx\n",WHERE,zlssPool); +#endif + if ( zlssPool == NULL ) + { + zASSERT( "POOL_BstNew error"==NULL); + goto error_exit; + } +/*--------------------------------------------------------------------------- + | New the pool beast - Use the special VOLUME beast new call. This + | places the volume into the admin volume directory. The AVFILE_destruct() + | will remove from the admin volume when our pool is freed via + | BST_releaseAndFree(). Note we need to call the release version because + | VOL_BstNew() returns with an inuse count of one. + *-------------------------------------------------------------------------*/ + pool = VOL_BstNew( genMsg, zFTYPE_ZLSS_ZFSPOOL, COMN_GetAdminVolume(), + storagepool->poolname, &volumeID, zLSS_ID_ZLSS ); + if ( pool == NULL ) + { + goto error_pool; + } + +/* Set the beast version of the new pool depending upon whether it is a shared + or local pool. +*/ + if(zlssPool->ZLSSPOOLenabledFeatures & zPOOL_FEATURE_SHARED_CLUSTER) + { + zlssPool->ZLSSPOOLroot.beastVersion = HL_ClusterApprovesUpgrade() ? 
+ HL_NewSharedVolumeBeastVersion : BEAST_VERSION_2; + } + else + { + zlssPool->ZLSSPOOLroot.beastVersion = HL_NewLocalVolumeBeastVersion; + } + if(zlssPool->ZLSSPOOLroot.beastVersion == BEAST_VERSION_3) + { + sbMinor = SUPERBLOCK_SB_VM_MEDIA_MINOR; + } + else + { + sbMinor = 2; + } + + /*- get the storage deposit -*/ + phypool = storagepool->phypool; + /* Set up our cross reference pointer between ZLSS Pool and its physical Volume */ + zlssPool->ZLSSPOOLpool.P_VolumeInternal = &pool->ZFSPOOLvol; + zlssPool->ZLSSPOOLzfsPool = pool; + zlssPool->ZLSSPOOLroot.useCount++; /* Because Internal Volume points to pool */ + pool->ZFSPOOLvol.v_pool = &zlssPool->ZLSSPOOLpool; + + pool->ZFSPOOLinternalID = pool->ZFSPOOLvolumeID; + + zASSERT( LB_GUIDValidate( &zlssPool->ZLSSPOOLmVolumeID ) ); + /* Must match because we use _IV_ to write the POOL's LPDB and PDB */ + zlssPool->ZLSSPOOLmInternalID = pool->ZFSPOOLinternalID; + /* ZP_Version must be filled in before ZLSSPOOL_PoolVolDataInitialize call. + * We init to the values that will be in superblock in a few lines + */ + if ( ZLSS_UpgradeMediaFormat ) + { + zlssPool->ZP_Version = (SUPERBLOCK_SB_VM_MEDIA_MAJOR * 0x100L) + sbMinor; + } + else + { + zlssPool->ZP_Version = (40 * 0x100L) + 8; + } + /* Init MOST of the POOL's persistent fields to default values. + * We do nothing */ + ZLSSPOOL_PoolDataInitialize( zlssPool, + storagepool->pooltotalblocks, + storagepool->poolblkshift, + storagepool->sharedStatus ? zPOOL_FEATURE_SHARED_CLUSTER : 0); + + /* Init MOST of the POOL's internal volume persistent fields + * to their default values. + */ + ZLSSPOOL_PoolVolDataInitialize(pool, storagepool); + +/*--------------------------------------------------------------------------- + | Allocate the super block and initialize it. + | The super block memory is freed in the ZfsPool destructor. 
+ *-------------------------------------------------------------------------*/ + super = (ZFSMemorySuperBlk_s *)zalloc(sizeof(ZFSMemorySuperBlk_s)); + if ( super == NULL ) + { + SetErrno(genMsg,zERR_NO_MEMORY); + goto error_nosuper; + } + +/*--------------------------------------------------------------------------- + | More initing the pool (constructor did some) + *-------------------------------------------------------------------------*/ + pool->ZP_super = super; + pool->storagepool = storagepool; + + /*- header -*/ + super->SB_Header.hdr.SBH_Signature = SUPERBLOCK_SB_S_SIGNATURE; + super->SB_Header.hdr.SBH_VersionMajor = SUPERBLOCK_SB_VM_MAJOR; + super->SB_Header.hdr.SBH_VersionMinor = SUPERBLOCK_SB_VM_MINOR; + if ( ZLSS_UpgradeMediaFormat ) + { + super->SB_Header.hdr.SBH_VersionMediaMajor = SUPERBLOCK_SB_VM_MEDIA_MAJOR; + super->SB_Header.hdr.SBH_VersionMediaMinor = sbMinor; + super->SB_Header.hdr.SBH_VersionMediaMajorCreate = SUPERBLOCK_SB_VM_MEDIA_MAJOR; + super->SB_Header.hdr.SBH_VersionMediaMinorCreate = sbMinor; + } + else + { + super->SB_Header.hdr.SBH_VersionMediaMajor = 40; + super->SB_Header.hdr.SBH_VersionMediaMinor = 8; + super->SB_Header.hdr.SBH_VersionMediaMajorCreate = 40; + super->SB_Header.hdr.SBH_VersionMediaMinorCreate = 8; + } + + zASSERT( zlssPool->ZP_Version == (super->SB_Header.hdr.SBH_VersionMediaMajor * 0x100L) + + super->SB_Header.hdr.SBH_VersionMediaMinor ); + + super->SB_Header.hdr.physSizeUsed = SUPERBLOCK_PHYSICAL_SIZE; + /* FixFixFix6 - what is sized used?? Physical disk space of + super block OR super block header???? 
*/ + super->SB_Header.hdr.sizeUsed = sizeof(ZFSMediaSuperBlk_s); + super->SB_Header.hdr.SBH_PackedSize = sizeof(SuperBlockHeader_s); + super->SB_Header.hdr.superlocation[0] = + ZFSPOOL_SuperBlockMathematicalBlock(0, storagepool->pooltotalblocks); + super->SB_Header.hdr.superlocation[1] = + ZFSPOOL_SuperBlockMathematicalBlock(1, storagepool->pooltotalblocks); + super->SB_Header.hdr.superlocation[2] = + ZFSPOOL_SuperBlockMathematicalBlock(2, storagepool->pooltotalblocks); + super->SB_Header.hdr.superlocation[3] = + ZFSPOOL_SuperBlockMathematicalBlock(3, storagepool->pooltotalblocks); + zlssPool->ZP_MSAPBlock = super->SB_Header.hdr.superlocation[MSAP_SUPERBLOCK_NUMBER] + MSAP_SUPERBLOCK_SECTION; + super->SB_Header.hdr.superTimeStamp = GetUTCTime(); + super->SB_Header.hdr.supersyncid = SUPERBLOCKHEADER_SYNCSTARTVALUE; + + /*- pool info -*/ +// super->SB_Header.hdr.nssMagicNumber = storagepool->magicnumber; + super->SB_Header.hdr.poolClassID = zFTYPE_ZLSS_ZFSPOOL; +// super->SB_Header.hdr.poolID = storagepool->poolid; + super->SB_Header.hdr.createTime = GetUTCTime(); + super->SB_Header.hdr.SBH_VolumeID = pool->ZFSPOOLvolumeID; + super->SB_Header.hdr.SBH_InternalID = pool->ZFSPOOLinternalID; + super->SB_Header.hdr.SBH_PoolID = zlssPool->ZLSSPOOLmVolumeID; + super->SB_Header.hdr.SBH_PoolInternalID = zlssPool->ZLSSPOOLmInternalID; + unicpy(super->SB_Header.hdr.SBH_Name, storagepool->poolname); + + /*- group info -*/ +// super->SB_Header.hdr.numberofseg = gseg->grp.grpsiblingcount; +// super->SB_Header.hdr.groupid = gseg->grp.groupid; + super->SB_Header.hdr.totalblocks = storagepool->pooltotalblocks; +/*--------------------------------------------------------------------------- + | This will loop through and create all of the system beasts. The log + | beast will be placed in the Super block, all other beasts are placed + | in the beastTree including the beastTree itself. 
+ *-------------------------------------------------------------------------*/ + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + /* Don't create the user tree or directory tree by default */ + if (SuperBeasts[i].zid == ZFSVOL_DIRTREE_ZID) + { + continue; + } + + beast = BST_new(genMsg,SuperBeasts[i].beastClass,pool); + if (beast == NULL) + goto error_cleanup; + + *((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)) = beast; + + beast->FILEzid = SuperBeasts[i].zid; + beast->FILEfirstParentZid = zINVALID_ZID; + + /* save the Pool purgelog beast to be use later */ + if (beast->FILEzid == ZFSPOOL_PURGELOG_ZID) + { + purgeLog = (ZfsPurgeLogBeast_s *)beast; + } + + /* save the Volume purgelog beast to be use later */ + if (beast->FILEzid == ZFSVOL_PURGELOG_ZID) + { + volumePurgeLog = (ZfsPurgeLogBeast_s *)beast; + } + + if (beast->FILEzid == ZFSPOOL_VOLBSTTREE_ZID) + { + ztree = (ZfsBeastTreeBeast_s *)beast; + } + + if((SuperBeasts[i].zid == ZFSPOOL_FREETREE_ZID)) + { + /* allocate space to the FREE extent btree */ + /*- this must be the first call to the free extent btree -*/ + InitRootNode(genMsg,(ZfsFreeExtent_s *)beast, + super->SB_Header.hdr.totalblocks / 2 ); + /** Initialize the special block where some free blocks are kept + ** exclusively for use by the free tree for its splits and grows + **/ + InitSpecialNode(genMsg,(ZfsFreeExtent_s *)beast, + (super->SB_Header.hdr.totalblocks / 2) + 1); + + /*- set the total pool size in the free extent btree -*/ + seedext.poolBlkNum = 0; + seedext.lengthOfExtent = pool->ZP_super->SB_Header.hdr.totalblocks; + zfsFreeExtent(genMsg, &pool->ZFSPOOLzfsVol, &seedext, NULL); + zASSERT(seedext.lengthOfExtent == 0); + + /* Remove the space just allocated to the FREE tree */ + seedext.poolBlkNum = super->SB_Header.hdr.totalblocks / 2; + seedext.lengthOfExtent = 2; + zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &seedext, + XTREE_SYSTEM_REQUEST, NULL); + zASSERT(((super->SB_Header.hdr.totalblocks / 2) == + seedext.poolBlkNum) && 
(seedext.lengthOfExtent == 2)); + +// pool->ZP_super->SB_Header.hdr.SBH_FreeTreeBlk = seedext.poolBlkNum; + } + BEASTHASH_Insert(&beast->FILEroot); + /* Do not put any of the SYSTEM beasts into the POOL linked list because + * we need to control when the files are flushed */ + DQ_RMV(beast,FILEvolLink); + + } +/*--------------------------------------------------------------------------- + | Allocate the space used by all four super blocks + *-------------------------------------------------------------------------*/ + /* Super block 1 */ + seedext.poolBlkNum = super->SB_Header.hdr.superlocation[0]; + seedext.lengthOfExtent = SUPERBLOCK_BLKCOUNT; + zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &seedext, + XTREE_SYSTEM_REQUEST, NULL); + zASSERT((super->SB_Header.hdr.superlocation[0] == seedext.poolBlkNum) && + (seedext.lengthOfExtent == SUPERBLOCK_BLKCOUNT)); + + /* Super block 2 */ + seedext.poolBlkNum = super->SB_Header.hdr.superlocation[1]; + seedext.lengthOfExtent = SUPERBLOCK_BLKCOUNT; + zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &seedext, + XTREE_SYSTEM_REQUEST, NULL); + zASSERT((super->SB_Header.hdr.superlocation[1] == seedext.poolBlkNum) && + (seedext.lengthOfExtent == SUPERBLOCK_BLKCOUNT)); + + /* Super block 3 */ + seedext.poolBlkNum = super->SB_Header.hdr.superlocation[2]; + seedext.lengthOfExtent = SUPERBLOCK_BLKCOUNT; + zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &seedext, + XTREE_SYSTEM_REQUEST, NULL); + zASSERT((super->SB_Header.hdr.superlocation[2] == seedext.poolBlkNum) && + (seedext.lengthOfExtent == SUPERBLOCK_BLKCOUNT)); + + /* Super block 4 */ + seedext.poolBlkNum = super->SB_Header.hdr.superlocation[3]; + seedext.lengthOfExtent = SUPERBLOCK_BLKCOUNT; + zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &seedext, + XTREE_SYSTEM_REQUEST, NULL); + zASSERT((super->SB_Header.hdr.superlocation[3] == seedext.poolBlkNum) && + (seedext.lengthOfExtent == SUPERBLOCK_BLKCOUNT)); + 
+/*--------------------------------------------------------------------------- + | Allocate TWO blocks for persistent volume informations. + | One for the LOGGED data and one for the NON-LOGGED data. We + | force the allocation to the middle of the pool (which means + | nothing if multiple drives make up the pool). + | + | Allocate TWO blocks of the persistent pool information. + | One for the LOGGED data and one for the NON-LOGGED data. We + | force the allocation to the middle of the pool (which means + | nothing if multiple drives make up the pool). + | + | Allocate ONE block for system beasts. + | We force the allocation to the middle of the volume (which means + | nothing if multiple drives make up the pool). + *-------------------------------------------------------------------------*/ + seedext.poolBlkNum = (super->SB_Header.hdr.totalblocks / 2) + 2; + seedext.lengthOfExtent = 3+LV_POOL_BLOCKS; + zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &seedext, + XTREE_SYSTEM_REQUEST, NULL); + if (seedext.lengthOfExtent != (3+LV_POOL_BLOCKS)) + { + goto error_cleanup; + } + pool->ZP_super->SB_Header.hdr.SBH_LoggedVolumeDataBlk = seedext.poolBlkNum; + pool->ZFSPOOLzfsvolp.PZV_loggedVolumeDataBlk = seedext.poolBlkNum; + pool->ZP_super->SB_Header.hdr.SBH_VolumeDataBlk = seedext.poolBlkNum+1; + pool->ZFSPOOLzfsvolp.PZV_volumeDataBlk = seedext.poolBlkNum+1; + + pool->ZP_super->SB_Header.hdr.SBH_SystemBeastBlkNum = seedext.poolBlkNum+2; + pool->ZFSPOOLzfsvol.p.PZV_systemBeastBlkNum = seedext.poolBlkNum+2; + ztree->specialSystemBstsBlk = seedext.poolBlkNum+2; + + pool->ZP_super->SB_Header.hdr.SBH_LoggedPoolDataBlk = seedext.poolBlkNum+3; + zlssPool->ZP_p.PZP_loggedPoolDataBlk = seedext.poolBlkNum+3; + pool->ZP_super->SB_Header.hdr.SBH_PoolDataBlk = seedext.poolBlkNum+4; + zlssPool->ZP_p.PZP_poolDataBlk = seedext.poolBlkNum+4; + + READBLK_IO_MSG(iomsg, ztree, ztree->specialSystemBstsBlk, CACHE_WRITE); + buffer = ZFS_ReadPoolBlk(genMsg, &iomsg); + if (buffer == NULL) + { + 
goto error_cleanup; + } + initNode(buffer, BBT_LEAF|BBT_SYSTEM_BSTS_BLK, &pool->ZFSPOOLinternalID); + insertZeroNode((BeastTreeNode_s *)buffer->pBuf.data); + CACHE_DIRTY_RELEASE(buffer); + +/** Allocate FXBT_MINBLKS_SPECIAL blks to be put in the special blk where + ** they will be used by the free tree alone for its metadata blks. As the + ** free tree uses blocks from this place they will be replenished. + **/ + CheckSpecialBlkAndAdd(pool, NULL, 0); + + +#if 0 + { + /* + * We like to have completly zeroed volume data blocks + * when we start. Some of this is required because our + * logging code does not ever update the reserved area + * so to save a little time on writing the logged data + * which is done ALL the time. + * + * As of May 21, 1998 the above is no longer true. We + * now no longer pre-read the volume data blocks mainly + * so we do not have to handle read errors. We still + * zero the reserved areas (via the in-memory structure), + * but we no longer have zeros at the end of the 4k block + * in the area I call UNUSED. + */ + + IoMsg_s iomsg; + Blknum_t blk; + Buffer_s *buffer; + + blk = pool->ZFSPOOLzfsvol.p.PZV_volumeDataBlk; + zASSERT( blk != 0 ); + READBLK_IO_MSG(iomsg,pool,blk,CACHE_WRITE); + SET_DEBUG_ID(iomsg, 36); + buffer = ZFS_ReadPoolBlk(genMsg,&iomsg); + if (buffer == NULL) + { + errPrintf(WHERE, Module, 1427, + MSG("Error reading VolumeData Block %d, status=%d.", 369), + blk, GetErrno(genMsg)); + goto error_cleanup; + } + bzero( buffer->pBuf.data, (1<bufSizeShift) ); + /* Note that the zero's will be replaced at the end of this + * routine when ZFSPOOL_Deactive is called. 
+ */ + CACHE_DIRTY_RELEASE(buffer); + + blk = pool->ZFSPOOLzfsvol.p.PZV_loggedVolumeDataBlk; + zASSERT( blk != 0 ); + READBLK_IO_MSG(iomsg,pool,blk,CACHE_WRITE); + SET_DEBUG_ID(iomsg, 37); + buffer = ZFS_ReadPoolBlk(genMsg,&iomsg); + if (buffer == NULL) + { + errPrintf(WHERE, Module, 1428, + MSG("Error reading VolumeData Block %d, status=%d.", 383), + blk, GetErrno(genMsg)); + goto error_cleanup; + } + bzero( buffer->pBuf.data, (1<bufSizeShift) ); + /* Note that the zero's will be replaced at the end of this + * routine when ZFSPOOL_Deactive is called. + */ + CACHE_DIRTY_RELEASE(buffer); + + } +#endif + +/*--------------------------------------------------------------------------- + | Initialize the LOG file. + *-------------------------------------------------------------------------*/ + /* Set up quick pointer to ZLOG Beast */ + pool->ZFSPOOLzfsvolzlog = pool->zfsLogBeast; // FixFixFix6 - why not in normal place? Where should we NULL at? + + /* Let ZLOG initialize it's log file */ + if ( Zlog_PoolInitialize( genMsg, pool->zfsLogBeast, FALSE ) != zOK ) + { /* ZLOG is not happy - we must stop */ + goto error_cleanup; + } + +/*--------------------------------------------------------------------------- + | Init the purge logs (Pool and Volume) + *-------------------------------------------------------------------------*/ + extent.lengthOfExtent = 2; + extent.poolBlkNum = (super->SB_Header.hdr.totalblocks / 2) + 1 + 2; + if (zfsAllocExtent( genMsg, &pool->ZFSPOOLzfsVol, &extent, + XTREE_SYSTEM_REQUEST, NULL) != zOK) + { + goto error_cleanupRelease; + } + + purgeLog->p.firstBlock = extent.poolBlkNum; + purgeLog->nextUnused.block = extent.poolBlkNum; /* init pointer to first free slot */ + purgeLog->nextUnused.slot = 0; /* init pointer to next free slot */ + purgeLog->freeList.block = 0; /* no free list */ + purgeLog->freeList.slot = 0; + READBLK_IO_MSG(iomsg, purgeLog, extent.poolBlkNum, CACHE_UPDATE); + SET_DEBUG_ID(iomsg, 38); + buffer = ZFS_ReadPoolBlk(genMsg, 
&iomsg); + if (buffer == NULL) + { + goto error_cleanupRelease; + } + ZFSVOL_InitPurgeLogBlock(buffer,&purgeLog->ZFSPURGELOGroot.ROOTinternalID); + CACHE_DIRTY_RELEASE(buffer); + + volumePurgeLog->p.firstBlock = extent.poolBlkNum+1; + volumePurgeLog->nextUnused.block = extent.poolBlkNum+1; /* init pointer to first free slot */ + volumePurgeLog->nextUnused.slot = 0; /* init pointer to next free slot */ + volumePurgeLog->freeList.block = 0; /* no free list */ + volumePurgeLog->freeList.slot = 0; + READBLK_IO_MSG(iomsg, volumePurgeLog, extent.poolBlkNum+1, CACHE_WRITE); + buffer = ZFS_ReadPoolBlk(genMsg, &iomsg); + zASSERT( buffer != NULL ); + if (buffer == NULL) + { + goto error_cleanupRelease; + } + ZFSVOL_InitPurgeLogBlock(buffer,&volumePurgeLog->ZFSPURGELOGroot.ROOTinternalID); + CACHE_DIRTY_RELEASE(buffer); + +/*--------------------------------------------------------------------------- + | We now need to write all of the system files into the beastTree so that + | the space for them will be allocated. Note that we do NOT write the + | logBeast because it is stored in the SuperBlock, or the user and + | directory trees because they only exist on pools during conversion + | from pre-logical volume pools. + *-------------------------------------------------------------------------*/ + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + beast = *((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)); + if (SuperBeasts[i].zid != ZFSPOOL_LOGFILE_ZID && + SuperBeasts[i].zid != ZFSVOL_USERTREE_ZID && + SuperBeasts[i].zid != ZFSVOL_DIRTREE_ZID) + { + X_LATCH(&beast->FILEbeastLatch); + + packedSize = BST_getPackedSize(&beast->FILEroot); + + xaction = BeginXLocal(beast->FILEvolume,BXL_DEFAULT); + status = ZFSVOL_VOL_InsertBeastIntoVolume(genMsg,&beast->FILEroot, + packedSize, &xaction->xaction); + EndXlocal(xaction); + + if (status != zOK) + { +/** Insert beast now does the cleanup on error conditions. 
We have no way + ** to tell if the error was before or after the pack was called. + ** + ** BST_noPackCleanup(&beast->FILEroot); + **/ + goto error_cleanup; + } + +// if (beast->FILEzid == ZFSPOOL_VOLBSTTREE_ZID) +// { +// ZfsBeastTreeBeast_s *btree = (ZfsBeastTreeBeast_s *)beast; +// +// /* Store BEAST B-Tree block number into the ZFS volume */ +// zASSERT( btree->ZFSBEASTTREEbtRoot != 0 ); +// pool->ZFSPOOLzfsvolp.PZV_beastTreeBlkNum = btree->ZFSBEASTTREEbtRoot; +// pool->ZP_super->SB_Header.hdr.SBH_BeastTreeBlkNum = +// btree->ZFSBEASTTREEbtRoot; +// COMN_MARK_BEAST_DIRTY(&btree->ZFSBEASTTREEroot); +// } + UNX_LATCH(&beast->FILEbeastLatch); + } + } + +/*--------------------------------------------------------------------------- + | Create and init the root directory of the POOL/VOLUME + *-------------------------------------------------------------------------*/ + rootdir = (File_s *)BST_new(genMsg,zFTYPE_FILE,pool); + if (rootdir == NULL) + goto error_cleanupRelease; + + X_LATCH(&rootdir->FILEbeastLatch); + /* Do not put any of the SYSTEM beasts into the Volume's list because + * we need to control when the files are flushed. + */ + + if (ZFSPOOL_InitNewRootDir(genMsg, rootdir, &pool->ZFSPOOLzfsvol) != zOK) + { + UNX_LATCH(&rootdir->FILEbeastLatch); + BST_free( rootdir ); + goto error_freeRootdir; + } + + BEASTHASH_Insert(&rootdir->FILEroot); + DQ_RMV(rootdir,FILEvolLink); + + COMN_MARK_BEAST_DIRTY( &rootdir->FILEroot); + UNX_LATCH(&rootdir->FILEbeastLatch); + + X_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch ); + ZLSSVOL_LVD_DeleteListRemoveAllFromPool( (Volume_s *)pool ); + UNX_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch ); +#if VOL_DOES_ROOTDIR + status = VOL_Deactivate( genMsg, (Volume_s *)pool, 0 ); +#else + status = VOL_Deactivate( genMsg, (Volume_s *)pool, 0 ); + if ( status != zOK ) + { + errPrintf(WHERE, Module, 1430, + MSG("Unable to flush the root directory " + "on the volume \"%U\", status=%d.\n" + "An error occurred trying to create a pool. 
" + "You may be out of memory. Read\n" + "the status message and follow the instructions.", 423), + storagepool->poolname,GetErrno(genMsg)); + goto error_cleanupRelease; + } + status = ZFSVOL_Deactivate( genMsg, (ZfsVolume_s *)pool, 0 ); +#endif + if ( status != zOK ) + { + errPrintf(WHERE, Module, 1431, + MSG("Unable to flush the root directory " + "on the volume \"%U\", status=%d.\n" + "An error occurred trying to create a pool. " + "You may be out of memory. Read\n" + "the status message and follow the instructions.", 424), + storagepool->poolname,GetErrno(genMsg)); + goto error_cleanupRelease; + } +/*--------------------------------------------------------------------------- + | Write default patterns into all super blocks. + *-------------------------------------------------------------------------*/ + if (ZFSPOOL_SuperBlockPoolInitialize(genMsg,pool) != zOK) + goto error_cleanupRelease; +/*--------------------------------------------------------------------------- + | Write all checkpoints. + *-------------------------------------------------------------------------*/ + if (ZFSPOOL_CheckpointPoolInitialize(genMsg,pool) != zOK) + goto error_cleanupRelease; +/*--------------------------------------------------------------------------- + | Write the SINGLE MSAP block. + *-------------------------------------------------------------------------*/ + zASSERT( zlssPool->ZP_MSAPBlock == pool->ZP_super->SB_Header.hdr.superlocation[MSAP_SUPERBLOCK_NUMBER] + MSAP_SUPERBLOCK_SECTION ); + status = MSAP_MSAPBlockInitializePhysical( zlssPool, genMsg ); + if ( status != zOK ) + { + goto error_cleanupRelease; + } + +/*--------------------------------------------------------------------------- + | Write all superblock headers. 
+ *-------------------------------------------------------------------------*/ + status = ZFSPOOL_SuperBlockHeaderWriteInitial(genMsg, pool, storagepool->poolname); + if ( status != zOK ) + { + goto error_cleanupRelease; + } + +/*--------------------------------------------------------------------------- + | Flush and Toss all ZFSPOOL system beasts. + | The deactivate also flushes the Volume Data Blocks and Logged + | Volume Data Blocks. + *-------------------------------------------------------------------------*/ + ZFSPOOL_Deactivate( genMsg, pool, 0 ); + ZFSPOOL_UnloadPersistentPool( pool, 0 ); + + /* By removing from the master volume link list we prevent + * this volume from being found. This prevents any new + * operations from starting which is a requirement since + * we will be freeing the volume. + */ + if (QMEMBER(&pool->ZFSPOOLvol.masterVolLink)) + { + SET_RMV(pool,ZFSPOOLvol.masterVolLink); + } + COMN_Release(&pool); + COMN_Release(&zlssPool); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED,"ZFSPOOL_InitNewPool(exit)\n")); + RTN_STATUS(zOK); +/*--------------------------------------------------------------------------- + | Free all of the beasts we just created. 
+ *-------------------------------------------------------------------------*/ +error_freeRootdir: + X_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch ); + ZLSSVOL_LVD_DeleteListRemoveAllFromPool( (Volume_s *)pool ); + UNX_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch ); +#if VOL_DOES_ROOTDIR + (void)VOL_Deactivate( genMsg, (Volume_s *)pool, 0 ); +#else + (void)VOL_Deactivate( genMsg, (Volume_s *)pool, 0 ); + (void)ZFSVOL_Deactivate( genMsg, (ZfsVolume_s *)pool, 0 ); +#endif + +error_cleanupRelease: +error_cleanup: + for(i=0;SuperBeasts[i].beastClass != -1;i++) + { + beast = *((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)); + if (beast != NULL) + { + File_s *tempBeast = beast; + + *((File_s **)((ADDR)pool+SuperBeasts[i].poolOffset)) = NULL; + cacheFlushMyCacheBufs(&beast->FILEmycache); + defaultFlushWait(&beast->FILEmycache.agent); + cacheTossAll(&beast->FILEmycache); + + COMN_Release(&tempBeast); /* so we still have a pointer*/ + BST_free(beast); + } + } + ZFSPOOL_UnloadPersistentPool( pool, 0 ); + +error_nosuper: + //UNX_LATCH(&pool->ZFSPOOLmetadataLatch); + if (QMEMBER(&pool->ZFSPOOLvol.masterVolLink)) + { + SET_RMV(pool,ZFSPOOLvol.masterVolLink); + } + COMN_Release(&pool); +error_pool: + COMN_Release(&zlssPool); +error_exit: + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_InitNewPool(error exit) %ld %s\n"), + GetErrno(genMsg),GetErrnoSetter(genMsg))); + RTN_STATUS(zFAILURE); +} + + + + + +#ifdef USER_GPACHNER +#if NSS_DEBUG IS_ENABLED + +LONG gLVAIPUUnitTest = TRUE; + + +#if zNETWARE +void ReBoot(void); /* Reboots a PC - taken from crashNMIShell.386 */ + +#pragma aux ReBoot = \ + "cli" \ + "mov al, 254", \ + "out 100, al" \ + "jmp *-2" \ + modify exact [EAX]; +#endif /* #if zNETWARE */ + +#endif +#endif + +//#ifndef __linux__ // LINUX_Install +#if zNETWARE +/* + * ZFSPOOL_IsInstallTime() - + * Determine if the server is being installed. This information + * is stored in the NetWare Registry. 
+ * + * Notes - + * This is a copy of the E-Mail that Blair Merrell sent about + * the install bit on Jan 10, 2001. + * + * "... the key in the registry that indicated what mode the + * server is running under is "\My Server\Operational Mode". + * When bit 0 of the DWORD is set to 1 it indicates that it + * is in install mode. The registery can be accessed + * programatically using the attached api set. It also can + * be accessed manually using the CDBE.NLM. Load the nlm, + * type "CDBE EDIT ON" at the console, and use the edit commands." + * + * The following is the documentation of NetWare Registry API + * from Bruce Cutler attached(1/8/2001) in above E-Mail from + * Blair. + * + * "I have attached the page from the Microsoft SDK documentation + * that lists all of their registry API's. Which of these are NOT + * supported in NetWare? (I found these at http://msdn.microsoft.com/ + * library/default.asp?URL=/library/wcedoc/wcesdkr/regapi_7.htm ) + * + * Registry Functions + * Function Description + * RegCloseKey - SUPPORTED + * RegConnectRegistry + * RegCreateKeyEx - SUPPORTED + * RegDeleteKey - SUPPORTED + * RegDeleteValue - SUPPORTED + * RegDisablePredefinedCache + * RegEnumKeyEx - SUPPORTED + * RegEnumValue - SUPPORTED + * RegFlushKey - SUPPORTED + * RegGetKeySecurity + * RegLoadKey NOTE: planned but no time available to work on yet. + * RegNotifyChangeKeyValue NOTE: planned but no time available to work on yet. + * RegOpenCurrentUser + * RegOpenKeyEx - SUPPORTED + * RegOpenUserClassesRoot + * RegOverridePredefKey + * RegQueryInfoKey - SUPPORTED + * RegQueryMultipleValues - SUPPORTED + * RegQueryValueEx - SUPPORTED + * RegReplaceKey + * RegRestoreKey + * RegSaveKey - SUPPORTED (50%) + * RegSetKeySecurity + * RegSetValueEx - SUPPORTED + * RegUnLoadKey + * + * Bruce Cutler + * Lead Engineer, Novell NetWare Migration Wizard team + * Novell, Inc. the leading provider of Net services software. 
+ * bcutler@novell.com" + * + */ + +BOOL ZFSPOOL_IsInstallTime( ) + +{ + + HKEY localKey; + int ccode; + LONG oMode; + LONG dataType; + LONG dataLen; + + MPKNSS_UNLOCK(); + ccode = RegOpenKeyEx(HKEY_ROOT, "My Server", 0, KEY_READ, &localKey); + if (ccode) + { + MPKNSS_LOCK(); + return( FALSE ); + } + dataLen = sizeof( oMode ); + ccode = RegQueryValueEx(localKey, "Operational Mode", NULL, &dataType, &oMode, &dataLen); + RegCloseKey(localKey); + + MPKNSS_LOCK(); + if (ccode) + { + return( FALSE ); + } + + zASSERT( dataType == REG_DWORD ); + aprintf(CYAN, "Operational Mode %lx\n", oMode ); + if ( oMode & 0x01 ) + { /* The install BIT is on so we must be installing */ + return( TRUE ); + } + return( FALSE ); + +} /* End of ZFSPOOL_IsInstallTime() */ +#endif + +#if zLINUX +BOOL ZFSPOOL_IsInstallTime() +{ // Never install time in Linux because NSS does not run at that time. + return FALSE; +} +#endif + +/* + * ZFSPOOL_LoadPool() + * Given the pool object and a group, this will try and load pool into + * memory. + * + * Notes - + * *** Upgrade of NSS 2.x volumes into NSS 3.00 volumes. *** + * During install of the product we do not upgrade into pools and volumes + * unless someone specifically asks (via the command /zlssvolumeupgrade which + * calls us with the loadFlag value of ZLSS_PLF_UPGRADE). We generally + * do not upgrade during install because of the following. + * + * 1) Once a volume is upgraded there is no going back to NetWare 5.x. + * 2) We may crash. + * 3) Volume may be blown away during install. + * 4) Slows down install process (not enough to notice). + * + * We allow specific upgrades from the command line during install + * because WebSphere can be installed on any volume. Therefore, the + * user may wish to upgrade a NSS volume so that they can place + * WebSphere on the volume during install. + * + * On March 28, 2000, Mike and Greg agreed that ZFSPOOL_LoadPool + * would no longer set the poolstatus. The caller of this routine will set. 
+ * + */ + +ZfsPool_s *ZFSPOOL_LoadPool( + GeneralMsg_s *genMsg, + StorPool_s *storagepool, + BOOL verbose, + NINT loadFlag, + BOOL snapshot, /* TRUE when we need to create a snapshot + * of current pool being loaded. I.E. + * convert the current pool into a snapshot + * of some other pool. + */ + unicode_t *snapshotInfo, /* Only used if 'snapshot' is TRUE. Caller + * must ensure that this is less than 64 + * unicode (including the required NULL). + */ + BOOL newGuids ) /* TRUE when we need to generate new GUIDs + * of the pool (and internal volume) being loaded. + * This is done as part of AUTO Pool rename + * which is done if our caller detects a duplicate + * pool name. + */ + +{ + Pool_s *dupPool; + ZfsPool_s *pool = NULL; + ZlssPool_s *zlssPool = NULL; + unicode_t *poolName = NULL; + STATUS status; + BOOL forceState; + BOOL isInstallTime = FALSE; /* OS is being installed. We have + * decided not to upgrade any volumes + * during this time unless specifically + * requested via the 'loadFlag' + */ + NINT state = 0; +#if NSS_DEBUG IS_ENABLED + NINT resetAlready = FALSE; + const GUID_t *guid; + char vbuffer[GUID_FORMAT_SIZE]; +#endif + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED,MSGNot("ZFSPOOL_LoadPool\n"))); + ASSERT_MPKNSS_LOCK(); + ENTER(TZPOOL, ZFSPOOL_LoadPool); + + zASSERT( (loadFlag == ZLSS_PLF_NORMAL) || + (loadFlag == ZLSS_PLF_NO_UPGRADE) || (loadFlag == ZLSS_PLF_UPGRADE) ); + zASSERT( storagepool != NULL ); + zASSERT( storagepool->phypool != NULL ); + zASSERT( storagepool->poolname[0] != '\0' ); + zASSERT( storagepool->zfspool == NULL ); + + zASSERT( sizeof( 1uL ) == 4 ); + zASSERT( sizeof( UI64_CONST(1) ) == 8 ); + + poolName = storagepool->poolname; + if ( verbose ) + { + aprintf(NSS_POOL_COLOR,MSG("Loading pool \"%U\"\n", 885),poolName); + } +//#ifndef __linux__ // LINUX_Install + isInstallTime = ZFSPOOL_IsInstallTime(); +//#endif + +#if NSS_DEBUG IS_ENABLED + if ( isInstallTime ) + { + DBG_DebugPrintf( MAGENTA, + MSGNot("Loading pool %U during INSTALL 
of O.S. Shared bit 0x%lx\n"), + poolName, storagepool->sharedStatus ); + } + else + { + DBG_DebugPrintf( MAGENTA, + MSGNot("Loading pool %U not during install of O.S. Shared bit 0x%lx\n"), + poolName, storagepool->sharedStatus ); + } + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN, "Size of superblock header %d\n",sizeof(SuperBlockHeader_s)); + if ( unicmp( &gZCL_OnlyPoolName[0], L"" ) != 0 ) + { + if ( uniicmp( poolName, &gZCL_OnlyPoolName[0] ) != 0 ) + { + aprintf(LRED,MSGNot(" Not loading pool \"%U\" because /OnlyPoolName=%U switch\n"),poolName,&gZCL_OnlyPoolName[0]); + SetErrno(genMsg,zERR_NO_MEMORY); + RTN_PTR(NULL); + } + } +ResetStart: +#endif + +/*--------------------------------------------------------------------------- + | We now get the Pool's name and Volume ID out of the superblock header. + | To do this we call the Superblock header validate code and then pass + | the name and volume ID to VOL_Bstnew(). At which point we free up + | the memory(4k) used to store the superblock information. This is done + | so to use less memory for pools that are not in the ACTIVE state. + | + | Note - we still use the Pool name from GSEG in some of the code below, + | this should be changed later. + | + | BEWARE + | BEWARE The superblock header is XACTIONed during AIPU. This + | BEWARE causes the values in it to change during pool activate, + | BEWARE therefore do not use any fields without thinking about + | BEWARE how them changing on you during REDO/UNDO!!! For example, + | BEWARE the beast tree special block may change. We handle this + | BEWARE today by NOT using the block until AFTER REDO/UNDO. + | BEWARE + | BEWARE We are doing I/O (writes) to the pool even though it is not + | BEWARE ACTIVE this is technically ILLEGAL in a cluster environment. 
+ | BEWARE + *-------------------------------------------------------------------------*/ + + { + MediaSuperBlockHeader_s *super; + + /* The super block memory is freed in the ZfsPool destructor */ + super = (MediaSuperBlockHeader_s *)zalloc(sizeof(*super)); + if ( super == NULL ) + { + SetErrno(genMsg,zERR_NO_MEMORY); + RTN_PTR(NULL); + } + /* FixFixFix(LV,ACI,10) - Validate and other code here in + * load will WRITE to the VOLUME. This is wrong in a CLUSTER. + * Also reading is wrong if we do not re-read at activate + * time I.E. another server could have changed original data on + * us. Do we need an unload of the pool or should load do less??? + * What happens if someone changes the name of the pool on another + * server in the cluster? + * + * We currently throw out the SB Header after load. We may want + * to unload and reload the pool before ACTIVATION? Or maybe just + * in Allen's cluster scan code he could do unload/load on all + * pools that are NOT ACTIVE or MAINTENANCE. This sounds GOOD! + */ + status = ZFSPOOL_SuperBlockHeaderValidate(genMsg, storagepool, super); + if (status != zOK) + { /* User will have to re-configure group */ + errPrintf(WHERE, Module, 1432, + MSG("Volume \"%U\" in group %ld " + "contains invalid super block headers, status=%d.\n" + "Run Verify.", 426), + poolName, 0/*storagepool->poolid*/ ,GetErrno(genMsg)); + free( super ); + RTN_PTR(NULL); + } + + /** + * This is cheap upgrade code. We fill in the NEW volume ID + * location(SBH_VolumeID) with the old value(SBH_InternalID). + * We do this AIPU here because SBH_VolumeID needs to be correct + * from the get go (e.g. the POOL_BstNew needs (see below)). + * We also fill in the POOL's internal and volume ID. We + * also make reserved2 bigger so we zero out the area. + * + * June 26, 2001 Added check for zero GUID because during + * NW6 install we do not upgrade the pools. This meant + * that on reboot I would give the pool a NEW pool ID. + * The cluster boys did not like this. 
+ */ + if ( (super->hdr.SBH_VersionMediaMajor < AIPU_1ST_6PACK_MEDIA_VERSION) + && (LB_GUIDCompare( &super->hdr.SBH_PoolID, &zZERO_GUID) == 0) ) + { + + /* Copy the internal ID (at offset 16) to the + * new location for the volume ID. + */ + zASSERT( LB_GUIDValidate( &super->hdr.SBH_InternalID ) ); +#ifdef USER_GPACHNER +// zASSERT("Doing internal ID to Volume ID copy in load pool"==NULL); +#endif + super->hdr.SBH_VolumeID = super->hdr.SBH_InternalID; + zASSERT( LB_GUIDValidate( &super->hdr.SBH_VolumeID ) ); + /* The Pool's and Internal Volume's internal IDs must + * match because we write the LPDB and PDB with the + * _IV_ because writes require a volume and not a + * pool. + */ + super->hdr.SBH_PoolInternalID = super->hdr.SBH_InternalID; + LB_GUIDGenerate( &super->hdr.SBH_PoolID ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_PoolID ) ); + /* Fill block 12 of superblocks 0 and 1. These blocks + * can be used for persistent debug information. As + * of 05Jan20001 these two blocks are NOT being used. + */ + if (ZFSPOOL_SuperBlock12Initialize( genMsg, storagepool, super ) != zOK) + { + ClearErrno( genMsg ); + } +#ifdef USER_GPACHNER + aprintf(LRED,"Size of SBH zero reserved area %d\n",sizeof( super->hdr.reserved2 )); +#endif + LB_bzero( &super->hdr.reserved2, sizeof( super->hdr.reserved2 ) ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_PoolID ) ); + /* Write out the changes we have made to the superblock + * header. + */ + status = ZFSPOOL_SuperBlockUpdate2( genMsg, storagepool, super ); + if (status != zOK) + { /* Error message already printed. */ + free( super ); + RTN_PTR(NULL); + } + } + /* If snapshot request AND the the old name is not + * too long then 'take a snapshot.' 
+ */ + if ( snapshot && + ( NELEMS( super->hdr.SBH_SS_OriginalName ) > + unilen( snapshotInfo ) ) ) + { /* Take a snapshot */ + bzero( super->hdr.SBH_SS_OriginalName, + sizeof( super->hdr.SBH_SS_OriginalName ) ); + unicpy( super->hdr.SBH_SS_OriginalName, snapshotInfo ); + super->hdr.SBH_SS_OriginalPoolID = super->hdr.SBH_PoolID; + super->hdr.SBH_SS_CreateTime = GetUTCTime(); + super->hdr.SBH_SS_OriginalVolumeID = super->hdr.SBH_VolumeID; + super->hdr.SBH_SS_Enabled = ZLSS_SBH_SNAPSHOT_ENABLED; + LB_GUIDGenerate( &super->hdr.SBH_SS_Guid ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_SS_Guid ) ); + + LB_GUIDGenerate( &super->hdr.SBH_PoolID ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_PoolID ) ); + LB_GUIDGenerate( &super->hdr.SBH_VolumeID ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_VolumeID ) ); + /* In NetWare 6.x, we will clear the SNAPSHOT's NDS Object Ids. Here we do + * the Pool's NDS Object ID and the Internal Volume's NDS Object ID. + * The Pool's LVs are done in ZLSSVOL_LV_Snapshot(). + */ + status = ZFSVOL_PDBNDSObjectClear( genMsg, storagepool, (Blknum_t)super->hdr.SBH_PoolDataBlk ); + zASSERT( status == zOK ); + if (status != zOK) + { + errPrintf(WHERE, Module, -1, + MSG("Could not clear Pool NDS Object ID in pool \"%U\", status=%d(%s).\n", 605), + poolName, GetErrno(genMsg), GetErrnoSetter(genMsg) ); + ClearErrno( genMsg ); + } + status = ZFSVOL_VDBNDSObjectClear( genMsg, storagepool, (Blknum_t)super->hdr.SBH_VolumeDataBlk ); + zASSERT( status == zOK ); + if (status != zOK) + { + errPrintf(WHERE, Module, -1, + MSG("Could not clear Volume NDS Object ID in pool \"%U\", status=%d(%s).\n", 606), + poolName, GetErrno(genMsg), GetErrnoSetter(genMsg) ); + ClearErrno( genMsg ); + } + status = ZFSPOOL_SuperBlockUpdate2( genMsg, storagepool, super ); + if (status != zOK) + { /* Error message already printed. 
*/ + free( super ); + RTN_PTR(NULL); + } + } + dupPool = COMN_PoolIDLookup( genMsg, &super->hdr.SBH_PoolID, FALSE); + if ( dupPool != NULL ) + { /* A pool with the same GUID is already loaded we + * better change this pool's GUIDs. + */ +#ifdef USER_GPACHNER + zASSERT("Duplicate Pool ID found auto pool id rename will be done"==NULL); +#endif + newGuids = TRUE; + COMN_Release( &dupPool ); + } + else + { + ClearErrno( genMsg ); + } + if ( newGuids ) + { /* Generate new GUIDs for pool and internal volume */ + LB_GUIDGenerate( &super->hdr.SBH_PoolID ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_PoolID ) ); + LB_GUIDGenerate( &super->hdr.SBH_VolumeID ); + zASSERT( LB_GUIDValidate( &super->hdr.SBH_VolumeID ) ); + + status = ZFSPOOL_SuperBlockUpdate2( genMsg, storagepool, super ); + if (status != zOK) + { /* Error message already printed. */ + free( super ); + RTN_PTR(NULL); + } + } + + +/*--------------------------------------------------------------------------- + | New the pool beast - Use the special VOLUME beast new call. This + | places the volume into the admin volume directory. The AVFILE_destruct() + | will remove from the admin volume when our pool is freed via the last + | COMN_Release(pool). Note we need to call the release version because + | VOL_BstNew() returns with an inuse count of one. + | + | Note that we currently never free the pool beast that is created + | via a successful ZFSPOOL_LoadPool(). 
+ *-------------------------------------------------------------------------*/ + { + typedef struct Stack_s { + unicode_t volName[POOL_MAXNAME+5]; + } Stack_s; + NINT len; + + STACK_ALLOC(); + + + zASSERT( LB_GUIDValidate( &super->hdr.SBH_VolumeID ) ); +#if NSS_DEBUG IS_ENABLED + { + guid = &super->hdr.SBH_PoolID; + (void)LB_GUIDToString( guid, sizeof( vbuffer ), vbuffer ); + + DBG_DebugPrintf(LRED,MSGNot(" SBH_PoolID %s\n"), vbuffer ); + + guid = &super->hdr.SBH_PoolInternalID; + (void)LB_GUIDToString( guid, sizeof( vbuffer ), vbuffer ); + + DBG_DebugPrintf(LRED,MSGNot(" SBH_PoolInternalID %s\n"), vbuffer ); + } +#endif + zlssPool = POOL_BstNew( genMsg, zFTYPE_ZLSS_LOGICAL_POOL, COMN_GetAdminVolume(), + super->hdr.SBH_Name, + &super->hdr.SBH_PoolID, zLSS_ID_ZLSS ); + /* ZLSS also creates a physical volume which goes back to the + * days when a ZfsPool_s 'is a' ZfsVolume_s. + */ +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LRED,"%s zlssPool address is %lx inUse is at offset %x\n",WHERE,zlssPool, offsetof( RootBeast_s, useCount) ); +#endif + if ( zlssPool == NULL ) + { +#if NSS_DEBUG IS_ENABLED + zASSERT( "POOL_BstNew error"==NULL); +#endif + free( super ); + STACK_FREE(); + RTN_PTR(NULL); + } + zlssPool->ZP_Loading = TRUE; + + if ( super->hdr.SBH_SS_Enabled == ZLSS_SBH_SNAPSHOT_ENABLED ) + { + zlssPool->ZP_Snapshot = TRUE; + zASSERT( LB_GUIDValidate( &super->hdr.SBH_SS_Guid ) ); + zlssPool->ZP_SnapshotID = super->hdr.SBH_SS_Guid; + zlssPool->ZLSSPOOLenabledFeatures |= zPOOL_FEATURE_SNAPSHOT; + } + else + { + zlssPool->ZP_Snapshot = FALSE; + zlssPool->ZP_SnapshotID = zINVALID_GUID; + zlssPool->ZLSSPOOLenabledFeatures &= ~zPOOL_FEATURE_SNAPSHOT; + } + zlssPool->ZP_MSAPBlock = super->hdr.superlocation[MSAP_SUPERBLOCK_NUMBER] + MSAP_SUPERBLOCK_SECTION; + + unicpy( aStack->volName, super->hdr.SBH_Name ); + len = unilen( aStack->volName ); + unicpy( &aStack->volName[len], L"_IV_" ); + zASSERT( LB_GUIDValidate( &zlssPool->ZLSSPOOLmVolumeID ) ); + 
zlssPool->ZLSSPOOLmInternalID = super->hdr.SBH_PoolInternalID; + + if ( storagepool->sharedStatus ) + { /* Allen says we are shared */ + zlssPool->ZLSSPOOLenabledFeatures |= zPOOL_FEATURE_SHARED_CLUSTER; + } + else + { + zlssPool->ZLSSPOOLenabledFeatures &= ~zPOOL_FEATURE_SHARED_CLUSTER; + } +#ifdef USER_GPACHNER + if ( uniicmp( super->hdr.SBH_Name, L"MSAP" ) == 0 ) + { + aprintf(YELLOW,"Marked MSAP as cluster enabled\n"); + zlssPool->ZLSSPOOLenabledFeatures |= zPOOL_FEATURE_SHARED_CLUSTER; + } +#endif + + +#if NSS_DEBUG IS_ENABLED + { + guid = &super->hdr.SBH_VolumeID; + (void)LB_GUIDToString( guid, sizeof( vbuffer ), vbuffer ); + + DBG_DebugPrintf(LRED,MSGNot(" SBH_VolumeID %s\n"), vbuffer ); + + guid = &super->hdr.SBH_InternalID; + (void)LB_GUIDToString( guid, sizeof( vbuffer ), vbuffer ); + + DBG_DebugPrintf(LRED,MSGNot(" SBH_InternalID %s\n"), vbuffer ); + + guid = &super->hdr.SBH_OldInternalID; + (void)LB_GUIDToString( guid, sizeof( vbuffer ), vbuffer ); + + DBG_DebugPrintf(LRED,MSGNot(" SBH_OldInternalID %s\n"), vbuffer ); + } +#endif + pool = VOL_BstNew( genMsg, zFTYPE_ZLSS_ZFSPOOL, COMN_GetAdminVolume(), + aStack->volName /*super->hdr.SBH_Name*/, + &super->hdr.SBH_VolumeID, zLSS_ID_ZLSS ); + STACK_FREE(); + } + if ( pool != NULL ) + { + pool->ZFSPOOLinternalID = super->hdr.SBH_InternalID; + } + +#ifdef USER_GPACHNER +#if NSS_DEBUG IS_ENABLED + { + extern QUAD gSBD_WriteCrashFailureCount; + SuperBlockDebug_s *sbd; + + /* The super block memory is freed in the ZfsPool destructor */ + zASSERT( sizeof(*sbd) == 4096 ); + sbd = (SuperBlockDebug_s *)zalloc( sizeof(*sbd) ); + zASSERT( sbd != NULL ); + ClearErrno( genMsg ); + (void)ZFSPOOL_SuperBlockReadByBlock( genMsg, storagepool, super, 0, 12, sbd ); + zASSERT( GetErrno( genMsg ) == zOK ); + if ( gLVAIPUUnitTest ) + { + if ( sbd->SBD_WriteCrashFailureCount == 0 ) + { + sbd->SBD_WriteCrashFailureCount = 0x13; + } + else + { + ++sbd->SBD_WriteCrashFailureCount; + } + gLVAIPUUnitTest = FALSE; /* Reset because we do 
reloads + during LV AIPU and do not wish to skip crashing + at a I/O point */ + (void)ZFSPOOL_SuperBlockWriteByBlock( genMsg, storagepool, super, 0, 12, sbd ); + zASSERT( GetErrno( genMsg ) == zOK ); + } +// gSBD_WriteCrashFailureCount = sbd->SBD_WriteCrashFailureCount; + DBG_DebugPrintf(LRED," Current value of gSBD_WriteCrashFailureCount is %Ld\n",gSBD_WriteCrashFailureCount ); + aprintf(LRED," Current value of gSBD_WriteCrashFailureCount is %Ld\n",gSBD_WriteCrashFailureCount ); + free( sbd ); + } +#endif +#endif + + if ( pool == NULL ) + { +#if NSS_DEBUG IS_ENABLED + zASSERT( "VOL_BstNew error"==NULL); +#endif + free( super ); + RTN_PTR(NULL); + } + + /*--------------------------------------------------------------------------- + | Start initing the pool + *-------------------------------------------------------------------------*/ + pool->storagepool = storagepool; + storagepool->zfspool = pool; // FixFixFix6 - where should this be done? + /* Set up our cross reference pointer between ZLSS Pool and its physical Volume */ + zlssPool->ZLSSPOOLpool.P_VolumeInternal = &pool->ZFSPOOLvol; + zlssPool->ZLSSPOOLzfsPool = pool; + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot(" zfs pool is %lx at location %lx\n"), + zlssPool->ZLSSPOOLzfsPool,&zlssPool->ZLSSPOOLzfsPool)); + zlssPool->ZLSSPOOLroot.useCount++; /* Because Internal Volume points to pool */ + pool->ZFSPOOLvol.v_pool = &zlssPool->ZLSSPOOLpool; + /* We store the inode for the cache code so that it can find the inode + * when the cache code only has a MyCache_s. The cache code/LSA is in + * charge of any usecounts, etc that must be kept around. + */ + zlssPool->ZP_Pool.P_Inode = storagepool->phypool->cp_inode; + + /*** Here we load in the POOL's persistent infomarion so + *** that the POOL's features are always available. This + *** is very important for the SHARED cluster bit as many + *** pool commands use when the pool is not ACTIVE. + *** + *** As of Dec 2000, the above is no longer TRUE. 
The + *** shared bit is given to as by the MAL/MM. Although + *** we still load the persistent information to fix + *** SPD 256378. Clustering needs access to the pool's + *** GUID and feature bits when a pool is not ACTIVE. + ***/ + zlssPool->ZP_Version = (super->hdr.SBH_VersionMediaMajor * 0x100L) + + super->hdr.SBH_VersionMediaMinor; + if ( zlssPool->ZP_Version >= (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_PDB_DONE) ) + { + zASSERT( super->hdr.SBH_LoggedPoolDataBlk != 0 ); + zlssPool->ZP_p.PZP_loggedPoolDataBlk = super->hdr.SBH_LoggedPoolDataBlk; + zASSERT( super->hdr.SBH_PoolDataBlk != 0 ); + zlssPool->ZP_p.PZP_poolDataBlk = super->hdr.SBH_PoolDataBlk; + status = ZLSSPOOL_ReadPersistentPoolData( genMsg, zlssPool ); + if ( status != zOK ) + { /* FixFixFix(LV,ACI,11) - We may wish to set a bit that + * the FEATUREs are not REALLY correct? + */ + /* Or in bits here so we do not WRITE over the + * ZLSSPOOL_PERSISTENT_ELSEWHERE bits. + */ + zlssPool->ZLSSPOOLenabledFeatures |= ZLSSPOOL_DEFAULT_ENABLED_FEATURES; + } +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"%s POOL features are 0x%Lx\n",WHERE,zlssPool->ZLSSPOOLenabledFeatures); +#endif + ZFSPOOL_UnloadPersistentPool( pool, 0 ); + } + else + { /* We display pool version only if the pool is not + * up to 40.06. This is done to help in any problems + * we have in the field. + */ + aprintf(NSS_POOL_COLOR,MSG(" Pool layout v%u.%02u\n", 918), + (unsigned int)super->hdr.SBH_VersionMediaMajor, + (unsigned int)super->hdr.SBH_VersionMediaMinor ); +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"%s POOL features set to DEFAULT 0x%Lx\n",WHERE,zlssPool->ZLSSPOOLenabledFeatures); +#endif + /** Not a POOL yet, so use default feature set. + ** Or in bits here so we do not WRITE over the + ** ZLSSPOOL_PERSISTENT_ELSEWHERE bit. 
+ **/ + zlssPool->ZLSSPOOLenabledFeatures |= ZLSSPOOL_DEFAULT_ENABLED_FEATURES; +// zlssPool->ZLSSPOOLtotalBlocks = superHeader->hdr.totalblocks; + } +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf( "ZLSS.Greg.Pool", WHERE, CYAN,"POOL features are 0x%Lx\n",zlssPool->ZLSSPOOLenabledFeatures); +#endif + + free( super ); +#if NSS_DEBUG IS_ENABLED + super = NULL; +#endif + + } + +#if NSS_DEBUG IS_ENABLED +/*--------------------------------------------------------------------------- + | See if they are requesting to RESET the pool. If so then do it now + *-------------------------------------------------------------------------*/ + if ( ResetPools && !resetAlready ) + { + ZFSPOOL_ShutdownPool( pool ); + aprintf(LRED,MSGNot(" ** Reinitializing pool\n")); + if (ZFSPOOL_InitNewPool(genMsg, storagepool) != zOK) + { + aprintf(LRED, MSGNot("Unable to reinitialize the pool \"%U\", status=%d."), + poolName, GetErrno(genMsg)); + RTN_PTR(NULL); + } + /* In the debug system were we allow /reset we must give the + * pool a new GUID because ZLOG does not initialize its file. + * ZLOG detects unwritten blocks when the internalID of the block + * does not match the internalID of the volume. Because the volume's + * volume ID is assigned in VOL_BstNew(), we do another + * VOL_BstNew() so that we get the correct volumeID into + * the pool beast. The internal ID is set to the volume ID below. + */ + resetAlready = TRUE; + goto ResetStart; + + } +#endif +/*--------------------------------------------------------------------------- + | Give control over to NSS + | + | Under normal conditions NSS will change the Pool's state + | to ACTIVE. + | During upgrade or force no upgrade we will override the + | pool's default policy. This is done by telling NSS which + | state we wish to go to. This is done if we are called with + | the ZLSS_PLF_NO_UPGRADE or ZLSS_PLF_UPGRADE option. 
In addition, + | during install of the OS we do not upgrade a volume except if + | asked via the ZLSS_PLF_UPGRADE switch. + *-------------------------------------------------------------------------*/ + forceState = FALSE; + if ( zlssPool->ZP_Version < + (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE) ) + { /* A Pool that needs to be upgraded */ + if ( loadFlag == ZLSS_PLF_NO_UPGRADE) + { /* Loader does not wish to attempt an upgrade */ + forceState = TRUE; + state = zVOLSTATE_DEACTIVE; + if ( verbose ) + { + aprintf(YELLOW, + MSG(" Pool %U auto policies overridden by loader. Pool will be left DEACTIVE.\n",924 ), + poolName ); + } + } + else if (loadFlag == ZLSS_PLF_UPGRADE) + { /* Loader wishes to force an upgrade attempt */ + forceState = TRUE; + state = zVOLSTATE_ACTIVE; + if ( verbose ) + { + aprintf(YELLOW, + MSG(" Pool %U auto policies overridden during upgrade. Activation will be attempted.\n",925 ), + poolName ); + } + } + else if ( TRUE /*isInstallTime*/ ) /* FixFixFix() - Remove before ship when the GUID/NDS issues are resolved. By Having TRUE we force USER to UPGRADE VOLUMES */ + { /* Loader has no specical requests, but it is INSTALL + * time so we will not activate (let upgrade run). + */ + forceState = TRUE; + state = zVOLSTATE_DEACTIVE; + if ( verbose ) + { + aprintf(YELLOW, + MSG(" Pool %U auto policies overridden during install. Pool will be left DEACTIVE.\n",926 ), + poolName ); + } + } + } + + (void)COMN_PoolEvent( genMsg, &zlssPool->ZLSSPOOLpool, poolName, + VOLEVENT_LOAD, verbose, forceState, state ); + + X_LATCH( &pool->ZFSPOOLvol.stateLatch ); + state = pool->ZFSPOOLstate; /* The STATE that the pool got to */ + UNX_LATCH( &pool->ZFSPOOLvol.stateLatch ); + if ( state == zVOLSTATE_ACTIVE ) + { /* We only work if the pool is ACTIVE. */ + if ( zlssPool->ZP_Version < + (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE) ) + { /* Media needs to be upgraded */ + status = ZLVAIPU_AIPU4001To4006( genMsg, pool, verbose ? 
VOLMODE_VERBOSE : 0 ); + /* The AIPU has unloaded the pool (errors or no errors) + * At most ZLVAIPU_AIPU4001To4006 has done one minor LV upgrade. + * The call to ZFSPOOL_LoadPool() below will recursively put + * us back to above routine to do another minor LV upgrade. + * This recursion is limited to Steps 2A, 2B, 3 and 4. Step + * 1 is done with Step 2A since it is a very simple step. + * In otherwords, ZLVAIPU_AIPU4001To4006() gets called FOUR + * times in the normal process of doing a Logical Volume Auto + * In Place Upgrade (LV AIPU). + */ + if ( status != zOK ) + { + errPrintf(WHERE, Module, 1466, + MSG("Volume \"%U\" could not be auto upgraded, status=%d from %s.\n" + "Use the command \"NSS /ZLSSVolumeUpgrade\" to retry upgrade.\n" + "If upgrade does not work use \"NSS /PoolAutoMaintenace=%U\"\n" + "and then \"NSS /PoolRebuild\" to upgrade the volume.", 978), + poolName, GetErrno(genMsg), GetErrnoSetter(genMsg), poolName ); + RTN_PTR(NULL); + } + /** + * A successful upgrade requires us to re-load the + * pool. This is because the upgrade has changed + * persistent block numbers and unloaded the 'old' pool. + * The upgade does not re-load the pool because of complexity + * of returning the new pool pointer. It is cleaner + * for us to re-load the pool here. We pass in VERBOSE + * of FALSE so that we do not get multiple messages that + * says we are upgrading the volume. + */ + pool = ZFSPOOL_LoadPool( genMsg, storagepool, FALSE, loadFlag, FALSE, NULL, FALSE ); + RTN_PTR(pool); + } + +#if NSS_DEBUG IS_ENABLED +/*--------------------------------------------------------------------------- + | See if they are requesting to RESET the pool. If so re-create a LV + *-------------------------------------------------------------------------*/ + if ( ResetPools ) + { /* We only work if the pool is ACTIVE. 
*/ + aprintf(LRED,MSGNot(" ** Re-creating a single LV\n")); + X_LATCH( &pool->ZFSPOOLvol.v_pool->cvsLatch ); + (void)ZLSSVOL_LV_Create( genMsg, (Volume_s *)pool, + poolName, (QUAD)pool->ZFSPOOLtotalBlocks << pool->ZFSPOOLblockShift, + zFTYPE_ZAS_AUTH_MODEL, &zINVALID_GUID, FALSE, NULL, FALSE ); + UNX_LATCH( &pool->ZFSPOOLvol.v_pool->cvsLatch ); + (void)ZLSSVOL_LV_LoadAll( genMsg, pool, TRUE, FALSE, FALSE ); + } +#endif + } + else + { /** We did not activate - which is a problem ONLY if + ** we are trying to upgrade the volume. + **/ + /* Do not give big error messages IF + * 1) We are purposely not upgading OR + * 2) It is install time OR + * 3) The media has already been upgraded + */ + if ( (loadFlag != ZLSS_PLF_NO_UPGRADE) && + (!isInstallTime) && + (zlssPool->ZP_Version < + (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE) ) ) + { /*** LV AIPU is designed to work only at load pool time. + *** The main issue here is that block numbers in the + *** superblock are changed during the upgrade. Therefore + *** if we can not upgrade we must fail the load so that + *** the upgrade can be attempted on the 'next' load. A + *** command will be added to let the user attempt a load. + *** This command is called "NSS /ZLSSPoolScan". + *** Generally, is a server is brought up with /autoact=none + *** then no LV AIPU will take place. + ***/ + switch ( state ) + { + case zVOLSTATE_UNKNOWN: + case zVOLSTATE_DEACTIVE: + errPrintf(WHERE, Module, 1480, + MSG("Volume \"%U\" could not be upgraded because the volume did not\n" + "activate. Use the command \"NSS /ZLSSVolumeUpgrade=%U\"\n" + "to retry upgrade. If upgrade still fails, use \"NSS /PoolRebuild\"\n" + "to rebuild and upgrade the volume.", 979), + poolName, poolName ); + break; + case zVOLSTATE_MAINTENANCE: + errPrintf(WHERE, Module, 1481, + MSG("Volume \"%U\" could not be upgraded because the volume was placed\n" + "into maintenance state. 
Use the command \"NSS /ZLSSVolumeUpgrade=%U\"\n" + "to retry upgrade or \"NSS /PoolRebuild\" to rebuild and upgrade\n" + "the volume.", 980), + poolName, poolName ); + break; + case zVOLSTATE_ACTIVE: + zASSERT("This can not happen"==NULL); + break; + default: + zASSERT("Some one added a STATE without changing the code"==NULL); + break; + } + } + } + + zlssPool->ZP_Loading = FALSE; + { + GeneralMsg_s dummyGenMsg; + STATUS status; + + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + status = ZLSS_addPoolMgmtFile( &dummyGenMsg, poolName); + zASSERT( status == zOK ); + } + RTN_PTR(pool); + +} /* End of ZFSPOOL_LoadPool() */ + + +/*- (FUNCTION) ----- ZFSPOOL_SetNewPoolSize() ------------------------------- + | + | change the size of the pool/group, new segment added to group with zfspool + | make the volume bigger without deactivating + | + | + | Change State Latch must be obtained by caller. The volume must be + | in ACTIVE state (or VERY close to it). + | + +-------------------------------------------------------------------------*/ +STATUS ZFSPOOL_SetNewPoolSize( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + StorPool_s *storagepool ) +{ + + ZFSMemorySuperBlk_s *super; + Extent_s seedext; + SQUAD tempstart; + SQUAD templen; + ZfsXaction_s *xaction; + VolInfoLog_s volLog; + STATUS status; + QUAD neededBlocks; /* Size that Rebuild file needs to be */ + ZlogBeast_s *zlogBeast; + ZlssPool_s *zlssPool; + + + ASSERT_MPKNSS_LOCK(); + zASSERT( pool != NULL ); + zASSERT( storagepool != NULL ); + zASSERT( pool->ZP_super != NULL ); + super = pool->ZP_super; + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED,MSGNot("ZFSPOOL_SetNewPoolSize\n"))); + /*- find what's missing -*/ + if ( storagepool->pooltotalblocks < pool->ZFSPOOLtotalBlocks ) + { /* Something BAD is going on here */ + SetErrno( genMsg, zERR_BUFFER_TOO_SMALL ); + return( zFAILURE ); + } + if ( storagepool->pooltotalblocks >= UI64_CONST(0x80000000) ) + { /* ZLSS/NSS does not support 32-bit block numbers */ + 
DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSize pool size is NEGATIVE\n"))); + return( zOK ); + } + + tempstart = pool->ZFSPOOLtotalBlocks; + templen = storagepool->pooltotalblocks - tempstart; + if ( templen > 0 ) + { /* ZLSS pool has not accounted for segment addition so do it now */ + SQUAD blocksInNewPool; /* Number of blocks that are in the + * pool. + */ + Blknum_t block3n; /* Block number of the first block + * of super block 3n. + */ + Blknum_t block4n; /* Block number of super block 4n */ + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("ZFSPOOL_SetNewPoolSize pool size changed by %Ld\n"), + templen)); + { + // The MM/MAL has been giving us blocks that are too big for the partition. + // We will attempt I/O to the last block and if we get an + // error we will reject the blocks given to us. This prevents + // us from hurting the ZLSS media by adding in blocks to the + // free tree that will only generated I/O errors. + BYTE *buffer; + STATUS status; + + buffer = LB_malloc( PAGE_SIZE ); + if ( buffer == NULL ) + { // We reject the blocks (for now) if we can not get 4K of memory + return( zOK ); + } + // Sync read of the LAST block in the NEW pool. + status = ZFSPOOL_SuperBlockRead( storagepool, buffer, + (Blknum_t)(storagepool->pooltotalblocks-1), 1 ); + if ( status != zOK ) + { + errPrintf( WHERE, Module, -1, + MSG("Pool not grown: Partition read error, status=%d. ", 615 ), status); + LB_free( buffer ); + return( zOK ); + } + // Verify that the blocks are not Read-Only + status = ZFSPOOL_SuperBlockWrite( storagepool, buffer, + (Blknum_t)(storagepool->pooltotalblocks-1), 1 ); + if ( status != zOK ) + { + errPrintf( WHERE, Module, -1, + MSG("Pool not grown: Partition write error, status=%d. 
", 617 ), status); + LB_free( buffer ); + return( zOK ); + } + LB_free( buffer ); + } + /* calculate the block numbers of the new super blocks 3n, 4n */ + blocksInNewPool = storagepool->pooltotalblocks; + block3n = ZFSPOOL_SuperBlockMathematicalBlock( 3-1, blocksInNewPool ); + block4n = ZFSPOOL_SuperBlockMathematicalBlock( 4-1, blocksInNewPool ); + zASSERT( block4n > block3n ); + if ( block3n <= tempstart ) + { /* The add is too small, instead of giving an error and + * preventing activation we just do not grow the pool. + */ + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSize pool size did NOT increased enough to move superblocks\n"))); + return( zOK ); + } + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("ZFSPOOL_SetNewPoolSize pool size increased enough to move superblocks\n"))); + /*- Set up a transaction for this operation -*/ + xaction = BeginXLocal((void *)pool, BXL_DEFAULT); + /*- update the zfspool super block info -*/ + + /*- update the zfspool info -*/ + volLog.delta = templen; + pool->ZFSPOOLtotalBlocks = storagepool->pooltotalblocks; + volLog.action = VOLINFO_CHANGE_TOT_BLOCKS; +#ifdef USER_GPACHNER +#if NSS_DEBUG IS_ENABLED +// zASSERT( "Updating size of ZLSS Pool (this is normal if disk space has been added)"==NULL); +#endif +#endif + ZFSVOL_VOL_WriteVolumeLoggedData(&pool->ZFSPOOLvol, + &xaction->xaction, &volLog); + /* Now we add the number of blocks that the pool is increasing + * by to the 'inUseBlocks'. This is needed because we are + * adding the new space to the pool by freeing the new space. + * When we free the space the Free B-Tree will change the + * 'inUseSpace' back to the correct value. 
+ */ +// pool->ZFSPOOLinUseBlocks += templen; + zlssPool = (ZlssPool_s *)pool->ZFSPOOLvol.v_pool; + zlssPool->ZLSSPOOLbookedInUseBlocks += volLog.delta; + pool->ZFSPOOLbookedInUseBlocks += volLog.delta; + + volLog.delta = templen; + volLog.action = VOLINFO_CHANGE_NUM_BLOCKS; + ZFSVOL_VOL_WriteVolumeLoggedData(&pool->ZFSPOOLvol, + &xaction->xaction, &volLog); + + /* Now we can give the new space to the Free B-Tree by + * simply freeing the blocks that are new. We exclude + * the two super blocks from this area. + */ + /*- update the zfspool free extent size -*/ +#if NSS_DEBUG IS_ENABLED + ClearErrno( genMsg ); /* To many people do not clear error no + for ASSERTs below to be legal */ +#endif + seedext.poolBlkNum = tempstart; + seedext.lengthOfExtent = templen - (blocksInNewPool-block3n); + zfsFreeExtent(genMsg, &pool->ZFSPOOLzfsVol, &seedext, xaction); + /* FixFixFix5( SPD 189303 ) - Need to handle errors. Mike says you + * can not get an error on frees (he will steal a + * block that is being freed if a split occurs). I saw + * error returns when the Free B-Tree gets an + * error from ZFS_ReadPoolBlk. Need to re-adjust some + * of the Volume counts we just updated. 
Maybe we + * should call COMN_AbortXLocal (really undo transaction) + */ + zASSERT(seedext.lengthOfExtent == 0); + zASSERT(GetErrno(genMsg) == zOK); + ClearErrno( genMsg ); + + seedext.poolBlkNum = block3n+SUPERBLOCK_BLKCOUNT; + seedext.lengthOfExtent = block4n - block3n - SUPERBLOCK_BLKCOUNT; + zfsFreeExtent(genMsg, &pool->ZFSPOOLzfsVol, &seedext, xaction); + zASSERT(seedext.lengthOfExtent == 0); + zASSERT(GetErrno(genMsg) == zOK); + ClearErrno( genMsg ); + + seedext.poolBlkNum = block4n+SUPERBLOCK_BLKCOUNT; + seedext.lengthOfExtent = blocksInNewPool - (block4n + SUPERBLOCK_BLKCOUNT); + zfsFreeExtent(genMsg, &pool->ZFSPOOLzfsVol, &seedext, xaction); + zASSERT(seedext.lengthOfExtent == 0); + zASSERT(GetErrno(genMsg) == zOK); + ClearErrno( genMsg ); + + status = ZFSPOOL_SuperBlockMove( genMsg, pool, + storagepool, xaction ); + if ( status != zOK ) + { + /* FixFixFix5( SPD 189303 ) - what to do on errors!!!! See previous + * fix fix fix item above. + */ + zASSERT( "FixFixFix5( SPD 189303 ) - not handling error condition" + == NULL ); + /* Be very carefull on errors because the super blocks have + * been changed and they will use the new blocks in the + * added area. 
+ */ + + + } + EndXlocal(xaction); + + } + else + { /* ZLSS pool has already added space for this segment */ + status = zOK; + } + + zASSERT( pool->zfsLogBeast != NULL ); + zlogBeast = pool->zfsLogBeast; + zASSERT( zlogBeast != NULL ); + if ( zlogBeast != NULL ) + { + neededBlocks = storagepool->pooltotalblocks; + neededBlocks = ((neededBlocks * ZLOG_REBUILD_BYTES_PER_BLOCK) + + (1 << zlogBeast->ZFSLOGblkSizeShift)-1) + >> zlogBeast->ZFSLOGblkSizeShift; + neededBlocks += ZLOG_REBUILD_FUDGE_BLOCKS; +#if NSS_DEBUG IS_ENABLED +#ifdef USER_GPACHNER +neededBlocks *= 4; /* Just to create a rebuild file on 1 gig volume */ +#endif +#endif + status = zOK; + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot(" ZLOG has %ld blocks and Rebuild needs %ld blocks.\n"), + (unsigned long)zlogBeast->ZLB_NumberOfLogBlocks, + (unsigned long)neededBlocks )); + if ( neededBlocks > zlogBeast->ZLB_NumberOfLogBlocks ) + { /* ZLOG file does not have enough blocks */ + neededBlocks = neededBlocks - zlogBeast->ZLB_NumberOfLogBlocks; + + /* Put extra needed blocks in special rebuild file + * This is done because it is much easier to + * extend than the ZLOG file. + */ + status = ZLSSPOOL_RebuildFileExtend( genMsg, pool, + neededBlocks ); + } +#if NSS_DEBUG IS_ENABLED + else + { + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Did not need to grow the Rebuild file as ZLOG file big enough\n"))); + } +#endif + } + if ( (zlogBeast == NULL) || (status != zOK) ) + { /* FixFixFix5( SPD 189303 ) - what to do on errors!!!! + * Unless we move + * everthing that has been allocated by other then we + * should just tell user that re-build may get + * mad at them and that they should run re-build to + * make the file bigger. In which case we should continue + * with the segment add. The re-build could be a simple + * switch that enlarges the rebuild file. 
+ */ + zASSERT( "FixFixFix5( SPD 189303 ) - not handling error condition(OK to do go)" == NULL ); + ClearErrno( genMsg ); + status = zOK; + } + + return( status ); + +} + + + + + +/* + * ZFSPOOL_SetNewPoolSizeDuringActivation - + * Increase the size of the pool if storage size idicates that we + * should. This check is done during activation. The purpose is + * to allow the MAL to add segments when we are not active and + * for us to detect this so that we can use the space. + * + * Caller must - + * Own Change State Latched + * Be going to ACTIVE state. We must be very close to ACTIVE. + * For example, the Free B-Tree must be loaded. + * + */ + +STATUS ZFSPOOL_SetNewPoolSizeDuringActivation( + struct GeneralMsg_s *genMsg, + struct ZfsPool_s *pool ) + +{ + + StorPool_s *storagepool; + STATUS status; + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSizeDuringActivation\n"))); + ASSERT_MPKNSS_LOCK(); + storagepool = pool->storagepool; + zASSERT( storagepool != NULL ); + status = ZFSPOOL_SetNewPoolSize( genMsg, pool, storagepool ); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSizeDuringActivation(exit)\n"))); + return( status ); + +} + + +/* + * ZFSPOOL_SetNewPoolSizeWhileActive - + * Increase the size of the pool if storage size idicates that we + * should. This fucntion will only work if ACTIVE. The purpose is + * to allow the MAL to add segments when we are ACTIVE + * so that we can use the space right away. + * + * Caller must - + * Not Own Change State Latched. 
+ * + */ + +STATUS ZFSPOOL_SetNewPoolSizeWhileActive( + GeneralMsg_s *genMsg, + StorPool_s *storagepool ) + +{ + + ZfsPool_s *pool; + STATUS status; + + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSizeWhileActive\n"))); + ASSERT_MPKNSS_LOCK(); + /*- if zfspool attached, then update the zfspool -*/ + pool = storagepool->zfspool; + if ( pool == NULL ) + { + SetErrno( genMsg, zERR_VOLUME_STATE_NOT_SUPPORTED ); + status = zFAILURE; + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSizeWhileActive(exit)\n"))); + return( status ); + } + + X_LATCH( &pool->ZFSPOOLvol.stateLatch ); + + if ( pool->ZFSPOOLstate == zVOLSTATE_ACTIVE ) + { /* We only work if the pool is active ACTIVE state. */ + status = ZFSPOOL_SetNewPoolSize( genMsg, pool, storagepool ); + } + else + { + SetErrno( genMsg, zERR_VOLUME_STATE_NOT_SUPPORTED ); + status = zFAILURE; + } + + UNX_LATCH( &pool->ZFSPOOLvol.stateLatch ); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("ZFSPOOL_SetNewPoolSizeWhileActive(exit)\n"))); + return( status ); + +} + + +#if 0 + /** + * This is code that attempts to GROUP user buffer writes + * so that we get some better elevator performance. On initial + * testings this actually slowed down Roger's 5000 4K file + * restore. When I have time I will re-address so to find out + * why. My guess is that 'IMPORTANT' I/Os got queued up after + * the users blocks so we actually slowed down. After the + * COPY ON XLATCH code is complete I will try again. Reads will + * still be delayed though. 
+ */ +#define ZLOG_COMBINE 100 +Agent_s *gJunk[ZLOG_COMBINE]; +int gJunkIndex = 0; +int gJunkWait = TRUE; /* Set to true when we should not combine writes*/ + +#define TT_NOT_RUNNING 0 +#define TT_RUNNING 1 +#define TT_ABORT_REQUEST 2 + + +int gNSSUserBlocksState = TT_NOT_RUNNING; +QUAD gNSSUserBlocksCount = 0; +Time_t gNSSUserBlocksTimeLast; +Time_t gNSSUserBlocksTimeAssert = 20; /* Number of seconds before asserting */ +Time_t gNSSUserBlocksTimeMax; /* Max seconds */ +Time_t gNSSUserBlocksTimeMaxTime; /* When max occurred */ + + +void ZFS_UserBlocksWorkToDoRoutine( FsmLite_s *workToDoFsm ) +{ + + WORK_PROCESS_INIT(); +// ++gNSSUserBlocksCount; +// if ( gNSSUserBlocksCount < 2 ) + { +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(RED,MSGNot("User Blocks is running ...\n")); +#endif + } + LB_delay( 300 ); + ZFS_WriteDelayedUsers(); + gNSSUserBlocksState = TT_NOT_RUNNING; + return; + +} /* End of ZFS_UserBlocksWorkToDoRoutine() */ + + +void ZFS_WriteDelayedUsers() + +{ + + int mass; + + ENTER(TBOND, ZFS_WriteDelayedUsers); + if ( gJunkWait == FALSE ) + { + gJunkWait = TRUE; + for ( mass=0; mass < gJunkIndex; mass++ ) + { +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(RED,MSGNot("Timer FIRE User Blocks agent %ld(%d)...\n"),gJunk[mass],mass); +#endif + ZFS_DoBlockWriteFromSignal( gJunk[mass], NULL /*, 1 */); + } + gJunkIndex = 0; + gJunkWait = FALSE; + } + RTN_VOID(); + +} /* End of ZFS_WriteDelayedUsers() */ + +#endif /* #if 0 */ + + +/************************************************************************** + * This is called to actually write a buffer to the media. This routine + * can be called from only one of two different ways: + * + * DefaultFlush: Someone is explicitly requesting that this buffer be + * written now. + * + * DefaultSignal: Someone is signaling us that this before may be written + * now. + * + * The DefaultFlush case always cancels a timer so if a timer is set then + * we don't have to flush now so we always decide not to. 
 *
 * As built, the handler asserts that no one-shot timer is set on the
 * agent and then hands the agent straight to ZFS_DoBlockWriteFromSignal().
 * The "#if 0" section is a disabled experiment that queued user-block
 * agents in gJunk[] and wrote them in batches of ZLOG_COMBINE.
 ***************************************************************************/
void ZFS_BlockSignalHandler(
	Agent_s	*agent)
{
	ENTER(TBOND, ZFS_BlockSignalHandler);
	ASSERT_MPKNSS_LOCK();

	/* A pending one-shot timer on the agent is not expected here */
	zASSERT(!ONESHOT_SET(agent->timer));


#if 0
	/* Disabled: delay user-block writes so they can be issued together */
	if ( gJunkWait == FALSE )
	{

		RootBeast_s *beast;
		Buffer_s *buf;

		/* Recover the buffer that embeds this agent, and its owning beast */
		buf = STRUCT(agent,Buffer_s,agent);
		beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
		/* FixFixFix (may wish to change to if no one is waiting on agent!!! */
		if ( !ZLSS_IS_SYSTEM_BLOCK( beast, buf ) )
		{	// User blocks only
			gJunk[gJunkIndex] = agent;
			cachePrepareToFlush(buf);
#if NSS_DEBUG IS_ENABLED
			DBG_DebugPrintf(RED,MSGNot("DELAY User Blocks agent %ld(%d)...\n"),agent,gJunkIndex);
#endif
			gJunkIndex += 1;
			/* Schedule the delayed-write work item if it is not already running */
			if ( gNSSUserBlocksState == TT_NOT_RUNNING )
			{
				gNSSUserBlocksState = TT_RUNNING;
				WORK_Schedule( &gZLSSUserBlocksWorkToDoFsm,
					ZFS_UserBlocksWorkToDoRoutine, 0);
			}

			/* Queue is full: write every queued agent now */
			if ( gJunkIndex >= ZLOG_COMBINE )
			{
				int mass;
				gJunkWait = TRUE;	/* Stops further queuing while draining */
				for ( mass=0; mass < gJunkIndex; mass++ )
				{
#if NSS_DEBUG IS_ENABLED
					DBG_DebugPrintf(RED,MSGNot("FIRE User Blocks agent %ld(%d)...\n"),gJunk[mass],mass);
#endif
					ZFS_DoBlockWriteFromSignal( gJunk[mass], NULL/*, 1*/ );
				}
				gJunkIndex = 0;
				gJunkWait = FALSE;
				RTN_VOID();
			}
			RTN_VOID();
		}
	}
#endif

	/* Live path: write the signaled buffer immediately */
	ZFS_DoBlockWriteFromSignal(agent,NULL /*,0*/);
	RTN_VOID();
}


/**************************************************************************
 * This will do a flush of the system beasts.
+ ***************************************************************************/ +STATUS ZFSPOOL_VOL_FlushSystemBeasts( + GeneralMsg_s *genMsg, + void *pool_LX) +{ + ZfsPool_s *pool = (ZfsPool_s *)pool_LX; + STATUS status; + STATUS status2; + statusfunc_t parentFunc; + ASSERT_MPKNSS_LOCK(); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_FlushSystemBeasts\n"))); + parentFunc = COMN_GetNextParentVolumeComnOp(pool->ZFSPOOLbeastClass, + COMNVOLOPS_INDEX(VOL_flushSystemBeasts),ZFSPOOL_VOL_FlushSystemBeasts); + + zASSERT(parentFunc != NULL); + + // Is pool correct or do we have to go to volume (not zfsvolume)? + status = parentFunc( genMsg, pool ); + if ( status != zOK ) + { /* Remember the REAL error */ + status = GetErrno( genMsg ); + ClearErrno( genMsg ); + } + + status2 = ZFSPOOL_DoFlushSystemBeasts(genMsg,pool,FALSE); + /* If our flush did not get and error, but the parent call + * did then return the parent error. + */ + if ( (status2 == zOK) && (status != zOK) ) + { + SetErrno( genMsg, status ); + return( zFAILURE ); + } + /* Return our error or zOK if use and parents both returned + * zOK. + */ + return( status ); + +} + +#if SUPERBLOCK_SB_VM_MEDIA_MAJOR != 43 +#error "You must determine how new media will be supported" +#error "Media Changes may also require changes to Rebuild and Verify!!!" +#endif +/* + * ZFSPOOL_PoolLoadSupported() - + * Determines if a pool is supported by load. Note this also + * defines the pools that rebuild can load. + * + * Returns - + * TRUE if the pool has a media version that ZLSS supports. + * + * Notes - + * Media versions 37 and 38 are not in the list below + * because they do not exist. If they are added to + * COBRA/MOAB Service Pack then 6Pack/Nakoma will need + * to add support. + * Media versions 41 and 42 are not in the list below + * because they do not exist. If they are added to a 6Pack + * Service Pack then Nakoma will need to add support. 
+ * 6Pack can only use media version 40.09 to 40.49 if + * the shipping version of Nakoma can handle. Otherwise, + * 6Pack needs to change to 40.50 through 40.83. + * Nakoma is using 40.84 through 40.88. + */ +BOOL ZFSPOOL_PoolLoadSupported( MediaSuperBlockHeader_s *superHeader ) +{ + + if ( (superHeader->hdr.SBH_VersionMediaMajor == 34) || + (superHeader->hdr.SBH_VersionMediaMajor == 35) || + (superHeader->hdr.SBH_VersionMediaMajor == 36) || + (superHeader->hdr.SBH_VersionMediaMajor == 39) || + (superHeader->hdr.SBH_VersionMediaMajor == AIPU_NAKOMA_MEDIA_MAJOR) || + ( (superHeader->hdr.SBH_VersionMediaMajor == AIPU_LV_MEDIA_MAJOR) && + ( (superHeader->hdr.SBH_VersionMediaMinor < 50) || + (superHeader->hdr.SBH_VersionMediaMinor == 84) || + (superHeader->hdr.SBH_VersionMediaMinor == 85) || + (superHeader->hdr.SBH_VersionMediaMinor == 86) || + (superHeader->hdr.SBH_VersionMediaMinor == 87) || + (superHeader->hdr.SBH_VersionMediaMinor == 88) + ) + ) + ) + { + return( TRUE ); + } + + return( FALSE ); + +} /* End of ZFSPOOL_PoolLoadSupported() */ + + +/* + * ZFSPOOL_PoolVerifySupported() - + * Verify supports a sub-set of the pools that ZLSS can load. + * + * Returns - + * TRUE if verify can verify the pool. + */ +BOOL ZFSPOOL_PoolVerifySupported( MediaSuperBlockHeader_s *superHeader ) +{ + + if ( (superHeader->hdr.SBH_VersionMediaMajor == AIPU_NAKOMA_MEDIA_MAJOR) || + ( (superHeader->hdr.SBH_VersionMediaMajor == AIPU_LV_MEDIA_MAJOR) && + (superHeader->hdr.SBH_VersionMediaMinor >= AIPU_LV_STEP_4_DONE) + ) + ) + { + return( TRUE ); + } + + return( FALSE ); + +} /* End of ZFSPOOL_PoolVerifySupported() */ + + +/************************************************************************** + * ZFSPOOL_LoadPersistentPool() - + * This is the routine that is called to load and validate the + * superblocks, LPDB, PDB, VDB and LVDB of a pool. In some cases + * we will attempt to repair the superblocks. 
 *
 * Notes -
 *	Before we repair the superblocks we will ensure that the pool
 *	is not SHARED (part of a cluster).  This is ONLY done if the clustering
 *	software is not present.  If the clustering software is running then
 *	it would have rejected the Change Pool State if this pool is being
 *	used by another server in the cluster.
 *
 * Parameters -
 *	genMsg            - receives the error number on failure.
 *	pool              - pool being loaded.  On success pool->ZP_super
 *	                    points at the in-memory super block allocated here.
 *	mode              - VOLMODE_xxx bits; VOLMODE_VERBOSE prints the pool
 *	                    layout version.
 *	flags             - LPP_FLAGS_xxx bits; LPP_FLAGS_NO_VERSION_MESSAGES
 *	                    suppresses the unsupported-media messages.
 *	ioStateTransition - when TRUE, ZFSPOOL_CheckSharedPool() and
 *	                    MSAP_MSAPActivate() are run before the volume data
 *	                    is written.
 *	requestedState    - passed through to ZFSPOOL_CheckSharedPool().
 *
 * Returns -
 *	zOK on success, zFAILURE otherwise.  On every failure after the
 *	super block memory was allocated, that memory is freed and
 *	ZFSPOOL_UnloadPersistentPool() is called before returning.
 ***************************************************************************/
STATUS ZFSPOOL_LoadPersistentPool(
	GeneralMsg_s *genMsg,
	ZfsPool_s	*pool,
	NINT mode,					/* Must be VOLMODE_xxx define */
	NINT flags,					/* Must be LPP_FLAGS_xxx define (or 0) */
	BOOL ioStateTransition,
	NINT requestedState )

{
	MediaSuperBlockHeader_s *superHeader;
	STATUS status;
	StorPool_s *storagepool;
	ZFSMemorySuperBlk_s *super;
	ZlssPool_s *zlssPool;
	/* Large locals, reached through aStack after STACK_ALLOC() */
	typedef struct Stack_s {
		GeneralMsg_s dummyGenMsg;
		unicode_t poolName[zMAX_COMPONENT_NAME];
	} Stack_s;

	STACK_ALLOC();

	COMN_SETUP_GENERAL_MSG_NOSA( &aStack->dummyGenMsg );

	ASSERT_MPKNSS_LOCK();
	ENTER(TZPOOL, ZFSPOOL_LoadPersistentPool);
	zlssPool = (ZlssPool_s *)pool->ZFSPOOLvol.v_pool;
	zASSERT( zlssPool != NULL );

	/* If either of these ASSERTs fires, someone added/removed an
	 * item to the structure(s) without adjusting the reserved area
	 * to ensure that these structures are 256 bytes long.
	 */
	zASSERT( sizeof( PersistentPool_s ) == 256 );
	zASSERT( sizeof( LoggedPersistentPool_s ) == 256 );
	zASSERT( sizeof( PersistentZlssPool_s ) == 256 );
	zASSERT( sizeof( LoggedPersistentZlssPool_s ) == 256 );
	DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_LoadPersistentPool\n")));
/*---------------------------------------------------------------------------
 |	Get the ZLSS volume's data block
 *-------------------------------------------------------------------------*/

	/* The super block is normally freed in ZFSPOOL_UnloadPersistentPool */
	super = (ZFSMemorySuperBlk_s *)zalloc(sizeof(ZFSMemorySuperBlk_s));
	if ( super == NULL )
	{
		SetErrno(genMsg,zERR_NO_MEMORY);
		STACK_FREE();
		RTN_STATUS( zFAILURE );
	}
	superHeader = &super->SB_Header;
	storagepool = pool->storagepool;
	zASSERT( storagepool != NULL );
	zASSERT( storagepool->phypool != NULL );
	zASSERT( storagepool->poolname[0] != '\0' );

	status = ZFSPOOL_SuperBlockHeaderValidate(genMsg, pool->storagepool, superHeader);
	if (status != zOK)
	{	/* User will have to re-configure group */

		ASSERT_MPKNSS_LOCK();
		/* Get the pool's name */
		COMN_GetVolumeName(&aStack->dummyGenMsg,(Volume_s *)pool,aStack->poolName,NELEMS(aStack->poolName));

		errPrintf(WHERE, Module, 1444,
			MSG("Volume \"%U\" in pool %ld "
			"contains invalid super block headers, status=%d.\n"
			"Run Verify.", 438),
			aStack->poolName, 0/*storagepool->poolid*/ ,GetErrno(genMsg));
		/* super is not yet attached to the pool, so free it ourselves */
		free( super );
		ZFSPOOL_UnloadPersistentPool( pool, mode );

		STACK_FREE();
		RTN_STATUS( zFAILURE );
	}
	/**
	 *	This is an attempt to catch SNAPSHOT being turned
	 * on/off from another cluster.  In the end, it would be much
	 * better for the POOL to be shutdown and reloaded so
	 * that the snapshot and the shared bit are correct without
	 * having special cluster code in ZLSS.  I.E. if someone
	 * in the cluster modifies the snapshot or shared features
	 * then the other servers should shutdown and reload the pool.
	 */
	if ( superHeader->hdr.SBH_SS_Enabled == ZLSS_SBH_SNAPSHOT_ENABLED )
	{
		zlssPool->ZP_Snapshot = TRUE;
		zASSERT( LB_GUIDValidate( &superHeader->hdr.SBH_SS_Guid ) );
		zlssPool->ZP_SnapshotID = superHeader->hdr.SBH_SS_Guid;
		zlssPool->ZLSSPOOLenabledFeatures |= zPOOL_FEATURE_SNAPSHOT;
	}
	else
	{
		zlssPool->ZP_Snapshot = FALSE;
		zlssPool->ZP_SnapshotID = zINVALID_GUID;
		zlssPool->ZLSSPOOLenabledFeatures &= ~zPOOL_FEATURE_SNAPSHOT;
	}
	zlssPool->ZP_MSAPBlock = superHeader->hdr.superlocation[MSAP_SUPERBLOCK_NUMBER] + MSAP_SUPERBLOCK_SECTION;

/*---------------------------------------------------------------------------
 |	See if we understand the media format that is stored in super block header.
 |	FixFixFix6 - we may wish to do sooner (I.E. in validate above)
 *-------------------------------------------------------------------------*/
	if ( mode & VOLMODE_VERBOSE )
	{
		aprintf(NSS_POOL_COLOR,MSG(" ** Pool layout v%u.%02u\n", 434),
			(unsigned int)superHeader->hdr.SBH_VersionMediaMajor,
			(unsigned int)superHeader->hdr.SBH_VersionMediaMinor );
#if NSS_DEBUG IS_ENABLED
		aprintf(CYAN,MSGNot(" ** (DEBUG) Pool media version %2d.%02d(when pool original created)\n"),
			(unsigned int)superHeader->hdr.SBH_VersionMediaMajorCreate,
			(unsigned int)superHeader->hdr.SBH_VersionMediaMinorCreate );
		DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,
			MSGNot(" ** (DEBUG) Pool media version %2d.%02d(when pool original created)\n"),
			(unsigned int)superHeader->hdr.SBH_VersionMediaMajorCreate,
			(unsigned int)superHeader->hdr.SBH_VersionMediaMinorCreate ));
		{
			char buffer[40];

			aprintf(CYAN,MSGNot(" ** (DEBUG) LV AIPU time %s\n"),
				UTCTime2Str(superHeader->hdr.SBH_PoolToLVEndUTC,&buffer[0]) );
			DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,
				MSGNot(" ** (DEBUG) LV AIPU time %s\n"),
				UTCTime2Str(superHeader->hdr.SBH_PoolToLVEndUTC,&buffer[0]) ));
		}
#endif
	}
	if ( !ZFSPOOL_PoolLoadSupported( superHeader ) )
	{	/* ZLSS does not support the Pool Media Version */
		/* In verbose mode the layout version was already printed above */
		if ( !(mode & VOLMODE_VERBOSE) && !(flags & LPP_FLAGS_NO_VERSION_MESSAGES) )
		{
			aprintf(NSS_POOL_COLOR,MSG(" ** Volume layout v%u.%02u\n", 435),
				(unsigned int)superHeader->hdr.SBH_VersionMediaMajor,
				(unsigned int)superHeader->hdr.SBH_VersionMediaMinor );
		}
		if ( !(flags & LPP_FLAGS_NO_VERSION_MESSAGES) )
		{
			COMN_GetVolumeName(&aStack->dummyGenMsg,(Volume_s *)pool,aStack->poolName,NELEMS(aStack->poolName));

			aprintf(LRED,MSG(" ** Volume \"%U\" not activated.\n", 867), aStack->poolName);
			errPrintf(WHERE, Module, 1445,
				MSG("ZLSS supports volume layout v%u.%02u, \"%U\" is v%u.%02u.\n"
				"Run the correct NSS version, or recreate this volume.\n", 437),
				SUPERBLOCK_SB_VM_MEDIA_MAJOR, SUPERBLOCK_SB_VM_MEDIA_MINOR,
				aStack->poolName,
				(unsigned int)superHeader->hdr.SBH_VersionMediaMajor,
				(unsigned int)superHeader->hdr.SBH_VersionMediaMinor );
		}
		free( super );
		ZFSPOOL_UnloadPersistentPool( pool, mode );
		SetErrno(genMsg,zERR_SUPERBLOCK_UNSUPPORTED_MEDIA);
		STACK_FREE();
		RTN_STATUS( zFAILURE );
	}
#if NSS_DEBUG IS_ENABLED
	if ( !(mode & VOLMODE_VERBOSE) )
	{
		aprintf(CYAN,MSGNot(" ** (Debug)Pool layout v%u.%02u\n"),
			(unsigned int)superHeader->hdr.SBH_VersionMediaMajor,
			(unsigned int)superHeader->hdr.SBH_VersionMediaMinor );
		DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,
			MSGNot(" ** (Debug)Pool layout v%u.%02u\n"),
			(unsigned int)superHeader->hdr.SBH_VersionMediaMajor,
			(unsigned int)superHeader->hdr.SBH_VersionMediaMinor ));
	}
#endif

	/* The super blocks checked out OK, so clear the repair's
	 * flag.
	 */
	pool->ZP_RepairFlags &= ~ZP_REPAIRFLAGS_REBUILD_POOL_SUPER_BLOCKS;

	/* Copy the data block locations from the super block header into
	 * both the persistent (PZV_) and in-memory (ZV_) volume fields.
	 */
	zASSERT( superHeader->hdr.SBH_LoggedVolumeDataBlk != 0 );
	pool->ZFSPOOLzfsvolp.PZV_loggedVolumeDataBlk = superHeader->hdr.SBH_LoggedVolumeDataBlk;
	pool->ZFSPOOLzfsvol.ZV_loggedVolumeDataBlk = superHeader->hdr.SBH_LoggedVolumeDataBlk;
	zASSERT( superHeader->hdr.SBH_VolumeDataBlk != 0 );
	pool->ZFSPOOLzfsvolp.PZV_volumeDataBlk = superHeader->hdr.SBH_VolumeDataBlk;
	pool->ZFSPOOLzfsvol.ZV_volumeDataBlk = superHeader->hdr.SBH_VolumeDataBlk;
	zASSERT( superHeader->hdr.SBH_SystemBeastBlkNum != 0 );
	pool->ZFSPOOLzfsvolp.PZV_systemBeastBlkNum = superHeader->hdr.SBH_SystemBeastBlkNum;
	pool->ZFSPOOLzfsvol.ZV_systemBeastBlkNum = superHeader->hdr.SBH_SystemBeastBlkNum;

	/* Set up OLD volume ID because it is only stored in the
	 * superblock.  This is helpful in that rebuild never
	 * blows away the superblock unlike the Pool's Pool Data
	 * Blocks (LPDB and PDB) and Volume Data Blocks (LVDB and VDB). */
	pool->ZFSPOOLoldInternalID = superHeader->hdr.SBH_OldInternalID;


	/**
	 *	Pool data blocks do not exist until media 40.03 so skip
	 * reading pool blocks until then.  To get around a low space
	 * ALERT when upgrading the media to 40.01 the total block
	 * item is initialized to the value stored in the super block.
	 */

	/* Version is packed as (major * 0x100) + minor */
	zlssPool->ZP_Version = (superHeader->hdr.SBH_VersionMediaMajor * 0x100L) +
								superHeader->hdr.SBH_VersionMediaMinor;
#if NSS_DEBUG IS_ENABLED
	DBG_DebugPrintf(CYAN,"%s Pool media version is 0x%lx\n",WHERE,zlssPool->ZP_Version);
#endif
	if ( zlssPool->ZP_Version >= (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_PDB_DONE) )
	{
		zASSERT( superHeader->hdr.SBH_LoggedPoolDataBlk != 0 );
		zlssPool->ZP_p.PZP_loggedPoolDataBlk = superHeader->hdr.SBH_LoggedPoolDataBlk;
		zASSERT( superHeader->hdr.SBH_PoolDataBlk != 0 );
		zlssPool->ZP_p.PZP_poolDataBlk = superHeader->hdr.SBH_PoolDataBlk;
		status = ZLSSPOOL_ReadPersistentPoolData( genMsg, zlssPool );
		if ( status != zOK )
		{
			free( super );
			ZFSPOOL_UnloadPersistentPool( pool, mode );
			STACK_FREE();
			RTN_STATUS( zFAILURE );
		}
	}
	else
	{
		zlssPool->ZLSSPOOLtotalBlocks = superHeader->hdr.totalblocks;
	}

/*---------------------------------------------------------------------------
 |	Now get the ZLSS volume's persistent data
 *-------------------------------------------------------------------------*/
	status = ZFSVOL_ReadPersistentVolumeData( genMsg, &pool->zfsVol );
	if ( status != zOK )
	{
		free( super );
		ZFSPOOL_UnloadPersistentPool( pool, mode );
		STACK_FREE();
		RTN_STATUS( zFAILURE );
	}
	pool->zfsVol.ZFSVOLactivationCount += 1;

#if NSS_DEBUG IS_ENABLED
	aprintf(CYAN,MSGNot(" ** (DEBUG) Internal volume layout v%lu.%02lu\n"),
		pool->zfsVol.ZLSSVOLmediaFormatMajor,
		pool->zfsVol.ZLSSVOLmediaFormatMinor );
	aprintf(CYAN,MSGNot(" ** (DEBUG) Internal volume creation layout v%lu.%02lu\n"),
		pool->zfsVol.ZLSSVOLmediaFormatMajorCreate,
		pool->zfsVol.ZLSSVOLmediaFormatMinorCreate );
	DBG_DebugPrintf(CYAN, MSGNot("%s activation count %ld of volume 0x%lx\n"),
		WHERE, pool->zfsVol.ZFSVOLactivationCount, &pool->zfsVol);
#endif
	zASSERT( LB_GUIDValidate( &pool->zfsVol.ZLSSVOLvolumeID ) );
//	zASSERT( LB_GUIDCompare( &pool->zfsVol.p.PZV_volumeID, &pool->zfsVol.ZLSSVOLvolumeID ) == 0 );
	/* Ensure that the duplicate copy of the GUID is in the VDB.
	 * Specifically this helps the LV delete code because the delete
	 * code does not update the GUID in the VDB.  Note that the GUID
	 * in the VDB is really a backup that re-link could use.  Really
	 * not an issue with Internal Volumes as they do not get deleted
	 * in the same sense.  Have here to be consistent.
	 */
	pool->zfsVol.p.PZV_volumeID = pool->zfsVol.ZLSSVOLvolumeID;


#if SUPERBLOCK_SB_VM_MEDIA_MAJOR != 43
#error "Check to see if we still need to do this conversion"
#endif
	if (superHeader->hdr.SBH_VersionMediaMajor < AIPU_1ST_6PACK_MEDIA_VERSION)
	{
#if ZLSS_DEFAULT_ENABLED_ATTRIBUTES != (zATTR_SALVAGE | \
		zATTR_EXTENDED_ATTRIBUTES | zATTR_DATA_STREAMS | zATTR_DOS_METADATA | \
		zATTR_NETWARE_METADATA | zATTR_MAC_METADATA | zATTR_UNIX_METADATA | \
		zATTR_TRANSACTION | zATTR_SPARSE_FILES | \
		zATTR_VERIFY | zATTR_REBUILD | zATTR_PHYSICAL_EOF | zATTR_DIRECT_IO | \
		zATTR_PERSISTENT_ATTRIBUTES | zATTR_EXTENDED_MAC_NAMESPACE)
#error Check this code out to see if your new define needs to be ORed out like zATTR_SALVAGE
#endif
		if (pool->ZFSPOOLenabledAttributes == 0)
		{
			/* Supply defaults for all but SALVAGE, ... */
			pool->ZFSPOOLenabledAttributes = (ZLSS_DEFAULT_ENABLED_ATTRIBUTES &
				~zATTR_SALVAGE );

			/* Get SALVAGE from old attributes.  No other attribute bits were
			 * valid in the old volAttributes, and in fact there was a bug where
			 * the other bits could have been total GARBAGE.  That is why we
			 * mask out the salvage bit and ignore the rest of the old bits. */
			/* In SP4 we fixed the volume to not have garbage bits and set
			 * the state attributes bit VOLSTATEATTR_FIXEDATTR after fixing
			 * the volume.  Copy all bits from oldVolAttributes if the volume
			 * has been fixed, otherwise copy only the salvage bit
			 */
			if (pool->ZFSPOOLvol.p.stateAttributes & VOLSTATEATTR_FIXEDATTR)
			{
				pool->ZFSPOOLenabledAttributes |=
					pool->ZFSPOOLoldVolAttributes;
			}
			else
			{
				pool->ZFSPOOLenabledAttributes |=
					(pool->ZFSPOOLoldVolAttributes & zATTR_SALVAGE);
			}

			/* Zero the old LONG so in the future we can reuse it for something
			 * else. */
			pool->ZFSPOOLoldVolAttributes = 0;
		}
	}
	/**
	 *	This is cheap upgrade code.  We fill in the VDB copy
	 * of the volume ID from the superblock.  The superblock
	 * volume ID was made correct in ZFSPOOL_LoadPool.
	 */
	if ( !LB_GUIDValidate( &pool->ZFSPOOLzfsvolp.PZV_volumeID ) )
	{	/* As long as the VDB GUID is invalid place the
		 * super block one into it.  We do this as we don't
		 * go out of our way to write the PZV so if we
		 * crash the PZV may not have been made persistent.
		 */
		pool->ZFSPOOLzfsvolp.PZV_volumeID = superHeader->hdr.SBH_PoolID;
	}
	pool->ZFSPOOLsupportedAttributes = ZLSS_SUPPORTED_ATTRIBUTES;
	pool->ZFSPOOLmaximumFileSize = UI64_CONST(0xFFFFFFFFFFFFFFFF);
	zASSERT( LB_GUIDValidate( &pool->ZFSPOOLvolumeID ) );
	zASSERT( LB_GUIDValidate( &pool->ZFSPOOLzfsvolp.PZV_volumeID ) );

	if ( ioStateTransition )
	{
		/* Shared-pool check first, then MSAP activation; either failure aborts the load */
		status = ZFSPOOL_CheckSharedPool( genMsg, pool, mode, requestedState );
		if ( status != zOK )
		{
			free( super );
			ZFSPOOL_UnloadPersistentPool( pool, mode );
			STACK_FREE();
			RTN_STATUS(zFAILURE);
		}
		status = MSAP_MSAPActivate( zlssPool, genMsg );
		if ( status != zOK )
		{
			free( super );
			ZFSPOOL_UnloadPersistentPool( pool, mode );
			STACK_FREE();
			RTN_STATUS( zFAILURE );
		}
	}
	/**
	 *	We must write the volume data information before playing
	 * the purge log because if we crash we do not want to
	 * mistakenly play an old purge item.  Also if our activation
	 * count does not get written then we will re-use the same one
	 * if we crash.  This would cause us to not play PL entries
	 * that should be played.  See the purge log play code for
	 * information about how it uses the activation count.
	 */
	/* In rebuild, we are loading the pool manually and don't play the purge log,
	 * we can't write the data blocks out yet because we have not yet mapped the
	 * blocks in the system, but we will write it later after we map the blocks
	 * and know which blocks we can use for the pool's data blocks
	 */
	if( !(pool->ZP_RepairFlags & ZP_REPAIRFLAGS_MANUAL_POOL_LOAD) )
	{
		ZFSVOL_WritePersistentVolumeData(&pool->zfsVol);
	}

	/* Point POOL to super block's memory space */
	pool->ZP_super = super;
	STACK_FREE();
	RTN_STATUS( zOK );

}

/*
 * ZFSPOOL_UnloadPersistentPool() -
 *	Flushes and tosses the cache buffers of both the ZFS pool and its
 *	ZLSS pool, frees the in-memory super block and zeroes the
 *	persistent/logged structures.  The ZLSS pool's enabled features
 *	are preserved across the zeroing.  The mode parameter is not used
 *	by this routine.
 *
 *	Not allowed to return an error.  FixFixFix5( SPD 189303 )(Greg,Today) - needs to tell caller
 *	of ERROR but needs to clean up anyway (free all resources).
 */

void ZFSPOOL_UnloadPersistentPool(
	ZfsPool_s	*pool,
	NINT mode )

{
	QUAD		features;		/* Saved copy of the enabled features */
	ZlssPool_s	*zlssPool;

	ENTER(TZPOOL, ZFSPOOL_UnloadPersistentPool);
	ASSERT_MPKNSS_LOCK();

	zlssPool = (ZlssPool_s *)pool->ZFSPOOLvol.v_pool;
	zASSERT( zlssPool != NULL );
	DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSPOOL_UnloadPersistentPool\n")));
	ASSERT_MPKNSS_LOCK();
	/* Start the flush under the beast latch, wait for it and toss the buffers after releasing it */
	X_LATCH(&pool->ZFSPOOLbeastLatch);
	pool->ZFSPOOLroot.bstState |= BST_STATE_FULL_FLUSH;
	cacheFlushMyCacheBufs(&pool->ZFSPOOLmycache );
	UNX_LATCH(&pool->ZFSPOOLbeastLatch);
	defaultFlushWait(&pool->ZFSPOOLmycache.agent );
	cacheTossAll(&pool->ZFSPOOLmycache );

	/* The ZLSS Pool is used to do I/O on the logged and non-logged
	 * pool data blocks.
	 */
	X_LATCH(&zlssPool->ZLSSPOOLbeastLatch);
	zlssPool->ZLSSPOOLroot.bstState |= BST_STATE_FULL_FLUSH;
	cacheFlushMyCacheBufs(&zlssPool->ZLSSPOOLmycache );
	UNX_LATCH(&zlssPool->ZLSSPOOLbeastLatch);
	defaultFlushWait(&zlssPool->ZLSSPOOLmycache.agent );
	cacheTossAll(&zlssPool->ZLSSPOOLmycache );

	/* Free works if passed a NULL pointer */
	free( pool->ZP_super );
	pool->ZP_super = NULL;	// Required because of ZfsPool destructor

	/*
	 *	pool->zfsVol.p.PZV_loggedVolumeDataBlk = 0;
	 *	pool->zfsVol.p.PZV_volumeDataBlk = 0;
	 *	zlssPool->ZP_p.PZP_loggedPoolDataBlk = 0;
	 *	zlssPool->ZP_p.PZP_poolDataBlk = 0;
	 *
	 * Zero all the persistent data - required for four commented
	 * out lines above, but a very good idea for all other fields.
	 * Note that we will not write either volume data block if
	 * their block number is 0.
	 *
	 * As of September 22, 2000 we always have the POOL's feature
	 * available no matter the STATE of the pool.  Therefore, save
	 * off and restore around the bzeros below.
	 */


	features = zlssPool->ZLSSPOOLenabledFeatures;

	/* ZfsVolume_s persistent information */
	bzero( &pool->ZFSPOOLzfsvolp, sizeof( pool->ZFSPOOLzfsvolp) );
	bzero( &pool->ZFSPOOLzfsvollogged, sizeof( pool->ZFSPOOLzfsvollogged) );
	/* Volume_s persistent information */
	bzero( &pool->ZFSPOOLvol.p, sizeof( pool->ZFSPOOLvol.p) );
	bzero( &pool->ZFSPOOLvol.logged, sizeof( pool->ZFSPOOLvol.logged) );

	/* ZlssPool_s persistent information */
	bzero( &zlssPool->ZP_p, sizeof( zlssPool->ZP_p ) );
	bzero( &zlssPool->ZP_logged, sizeof( zlssPool->ZP_logged ) );
	/* Persistent information of the embedded ZLSSPOOLpool member */
	bzero( &zlssPool->ZLSSPOOLpool.p, sizeof( zlssPool->ZLSSPOOLpool.p ) );
	bzero( &zlssPool->ZLSSPOOLpool.logged, sizeof( zlssPool->ZLSSPOOLpool.logged ) );

	/* Restore the features saved before the zeroing */
	zlssPool->ZLSSPOOLenabledFeatures = features;

	RTN_VOID( );

}


/* ZFSPOOL_VOL_CommandFunction() -
 *	This is the Internal Volume's version of VOL_commandFunction.
+ * We simply say we support no functions at this time. This is + * very important as we do not want the internal volume to be renamed + * OR deleted. + * + */ + +STATUS ZFSPOOL_VOL_CommandFunction( + GeneralMsg_s *genMsg, + void *beast_LX, /* We inherit useCount */ + NINT functionNumber, + VCO_VolumeCommonOps_s *pCD, /*parsedCommandData */ + NINT parmLen, + utf8_t *parm, + NINT dataLen, + BYTE *commandData, + NINT offset, + NINT retBufLen, + BYTE *retBuf, + NINT *retLen) + +{ + ZfsPool_s *beast = (ZfsPool_s *)beast_LX; + statusfunc_t parentFunc; + STATUS status; + + zASSERT( COMN_IsDerivedFrom(beast, zFTYPE_ZLSS_ZFSPOOL) ); +#ifdef USER_GPACHNER + aprintf(LRED,"%s\n",WHERE); +#endif + switch( functionNumber ) + { + case VCO_VOLUME_CREATE_NUMBER: + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + break; + case VCO_VOLUME_RENAME_NUMBER: + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + break; + case VCO_VOLUME_DELETE_NUMBER: /*** Never let the Internal Volume support this ***/ + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + break; + case VCO_VOLUME_DELETE_ACTION_NUMBER: + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + break; + case VCO_VOLUME_GET_INFO_NUMBER: + /* Even though we have no data to return our derived from objects may */ + ASSERT_LATCH( &beast->ZFSPOOLvol.stateLatch ); + COMN_USE_BEAST( &beast->ZFSPOOLroot ); /* Get a use count for parent */ + parentFunc = COMN_GetNextParentVolumeComnOp( beast->ZFSPOOLbeastClass, + COMNVOLOPS_INDEX(VOL_commandFunction), + ZFSPOOL_VOL_CommandFunction ); + status = parentFunc( genMsg, beast, functionNumber, + pCD, parmLen, parm, dataLen, commandData, + offset, retBufLen, retBuf, retLen ); + break; + default: + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + zASSERT("Not implemented"==NULL); + break; + } + COMN_Release( &beast ); + return( status ); + +} /* End of ZFSPOOL_VOL_CommandFunction() */ + + 
/**************************************************************************
 * ZFSPOOL COMMON BEAST OPERATIONS definition
 ***************************************************************************/

/*-------------------------------------------------------------------------
 * Defines all of the ZFS pool beast operations
 *
 * The initializer is positional: each entry is labelled with the slot it
 * fills.  A NULL entry means this class supplies no function of its own
 * for that slot.
 * NOTE(review): the entry order presumably has to match the member order
 * of CommonBeastOps_s, which is declared elsewhere - confirm before
 * adding or removing an entry.
 *-------------------------------------------------------------------------*/
CommonBeastOps_s ZFSPOOL_ComnBeastOps =
{
	ZFSPOOL_BST_Construct,		/* construct */
	ZFSPOOL_BST_Destruct,		/* destruct */

// cnt	NULL,				/* BST_getNameUniquifier */
	NULL,				/* BST_setupNameTypeSpecificInfo */
	NULL,				/* BST_lookupByNameInDirectory*/
	NULL,				/* BST_isDirectoryEmpty*/
	NULL,				/* BST_addNameToDirectory*/
	NULL,				/* BST_removeNameFromDirectory*/
	NULL,				/* BST_modifyNameSpaceMaskInDirectory*/
	NULL,				/* BST_setMatchAttributesInDirectory*/
	NULL,				/* BST_wildcardLookup*/

	NULL,				/* BST_truncateFile*/
	NULL,				/* BST_getStorageInfo*/
	NULL,				/* BST_getExtentList*/
	NULL,				/* BST_getPhysicalExtent*/
	NULL,				/* BST_isBlockInBeast*/

	NULL,				/* BST_asyncReadFileBlk*/
	NULL,				/* BST_getFileBlk*/
	NULL,				/* BST_dioReadUnits*/
	NULL,				/* BST_dioWriteUnits*/

	NULL,				/* BST_getZID*/
	NULL,				/* BST_beastNotify*/
	NULL,				/* BST_getInfo*/
	ZFSPOOL_BST_ModifyInfo,		/* BST_modifyInfo*/
	NULL,				/* BST_getInfoXML */
	NULL,				/* BST_modifyInfoXML */
};


/*-------------------------------------------------------------------------
 * Volume Operations
 *
 * Positional initializer for the internal volume of a ZFS pool.  Only the
 * state change, maintenance, system beast flush and command slots are
 * filled in here; every other slot is NULL.
 * NOTE(review): the entry order presumably has to match the member order
 * of CommonVolumeOps_s, which is declared elsewhere - confirm before
 * adding or removing an entry.
 *-------------------------------------------------------------------------*/
CommonVolumeOps_s ZFSPOOL_ComnVolOps =
{
// cnt	NULL,				/* VOL_getNameUniquifier */
	NULL,				/* VOL_setupNameTypeSpecificInfo */
	NULL,				/* VOL_lookupByNameInDirectory */
	NULL,				/* VOL_isDirectoryEmpty */
	NULL,				/* VOL_addNameToDirectory */
	NULL,				/* VOL_removeNameFromDirectory */
	NULL,				/* VOL_modifyNameSpaceMaskInDirectory */
	NULL,				/* VOL_setMatchAttributesInDirectory */
	NULL,				/* VOL_wildcardLookup */

	NULL,				/* VOL_truncateFile */
	NULL,				/* VOL_getStorageInfo */
	NULL,				/* VOL_getExtentList */
	NULL,				/* VOL_getPhysicalExtent */
	NULL,				/* VOL_isBlockInBeast */

	NULL,				/* VOL_asyncReadFileBlk */
	NULL,				/* VOL_getFileBlk */
	NULL,				/* VOL_dioReadUnits */
	NULL,				/* VOL_dioWriteUnits */

	NULL,				/* VOL_beginXLocal */
	NULL,				/* VOL_endXLocal */
	NULL,				/* VOL_addPurgeLogEntry */
	NULL,				/* VOL_removePurgeLogEntry */
	NULL,				/* VOL_writeVolumeLoggedData */

	ZFSPOOL_VOL_ChangeVolumeState,	/* VOL_changeVolumeState */
	ZFSREPAIR_VOL_CheckRepair,	/* VOL_volumeMaintenance */
	NULL,				/* VOL_allocStorageInfo */
	NULL,				/* VOL_freeStorageInfo */
	NULL,				/* VOL_getBeastFromVolume */
	NULL,				/* VOL_updateBeastToVolume */
	NULL,				/* VOL_insertBeastIntoVolume */
	NULL,				/* VOL_removeBeastFromVolume */
	NULL,				/* VOL_allocateZIDs */
	ZFSPOOL_VOL_FlushSystemBeasts,	/* VOL_flushSystemBeasts */
	NULL,				/* VOL_makeVolumeFreeSpace */
	NULL,				/* VOL_browseBeastsInVolume */

	NULL,				/* VOL_getUserSpaceInfo */
	NULL,				/* VOL_setUserSpaceRestriction */
	NULL,				/* VOL_adjustUsedUserSpace */
	NULL,				/* VOL_removeUser */
	NULL,				/* VOL_ResetAllUsers */
	NULL,				/* VOL_browseUsersInVolume */

	NULL,				/* VOL_insertMFLEntry */
	NULL,				/* VOL_deleteMFLEntry */
	NULL,				/* VOL_lookupMFLEntry */
	NULL,				/* VOL_enumerateMFL */
	NULL,				/* VOL_administerMFL */
	ZFSPOOL_VOL_CommandFunction,	/* VOL_commandFunction */

	NULL,				/* VOL_getDirQuotaInfo */
	NULL,				/* VOL_setDirQuota */
	NULL,				/* VOL_adjustUsedDirSpace */
	NULL,				/* VOL_removeDirectory */
	NULL,				/* VOL_resetAllDirEntries */
	NULL,				/* VOL_browseDirsInVolume */

	NULL,				/* VOL_getObjectName */
	NULL,				/* VOL_insertObjectName */
	NULL,				/* VOL_resetAllObjects */
	NULL,				/* VOL_modifyObjectName */

	NULL,				/* VOL_insertEFLEntry */
	NULL,				/* VOL_deleteEFLEntry */
	NULL,				/* VOL_lookupEFLEntry */
	NULL,				/* VOL_enumerateEFL */
	NULL,				/* VOL_administerEFL */
	NULL,				/* VOL_resetEFL */

	NULL,				/* VOL_FCNTL */
};


/*
 * On success
the beast is latched. + */ + +File_s *ZLSSPOOL_RebuildFileCreate( + GeneralMsg_s *genMsg, + ZfsPool_s *pool ) + +{ + + File_s *rebuildBeast; + STATUS status; + NINT packedSize; + ZfsXaction_s *xaction; + + ASSERT_MPKNSS_LOCK(); + rebuildBeast = BST_new(genMsg,zFTYPE_FILE,pool); + if (rebuildBeast == NULL) + { +#if NSS_DEBUG IS_ENABLED + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("Rebuild file BST_new() failed\n"))); + zASSERT( "Rebuild file BST_new() failed" == NULL ); +#endif + return( NULL ); + } + + rebuildBeast->FILEzid = ZFSPOOL_REBUILDFILE_ZID; + rebuildBeast->FILEfirstParentZid = zINVALID_ZID; + rebuildBeast->FILEaccessedTime = + rebuildBeast->FILEcreatedTime = + rebuildBeast->FILEmodifiedTime = + rebuildBeast->FILEmetaDataModifiedTime = GetUTCTime(); + + rebuildBeast->FILEarchivedTime = INVALID_UTC_TIME; + + rebuildBeast->FILEownerID = + rebuildBeast->FILEmodifierID = + rebuildBeast->FILEmetaDataModifierID = zSUPERVISOR_ID; + + rebuildBeast->FILEmetaDataSeqNum = 1; +// rebuildBeast->FILEbstState |= BST_STATE_NEW; +// COMN_MARK_BEAST_DIRTY( &rebuildBeast->FILEroot); + + + BEASTHASH_Insert(&rebuildBeast->FILEroot); + DQ_RMV(&rebuildBeast->FILEroot, volLink); + /* Now write into the beastTree so that the space for them + * will be allocated. + */ + X_LATCH(&rebuildBeast->FILEbeastLatch); + packedSize = BST_getPackedSize(&rebuildBeast->FILEroot); + xaction = BeginXLocal(rebuildBeast->FILEvolume,BXL_DEFAULT); + status = ZFSVOL_VOL_InsertBeastIntoVolume(genMsg,&rebuildBeast->FILEroot, + packedSize, &xaction->xaction); + EndXlocal(xaction); + + if (status != zOK) + { + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("Rebuild file insert into BT failed\n"))); + zASSERT( "Rebuild file insert into BT failed" == NULL ); +/** Insert beast now does the cleanup on error conditions. + ** We have no way to tell if the error was before or after the pack was + ** called. 
+ ** + ** BST_noPackCleanup(&rebuildBeast->FILEroot); + **/ + COMN_UnlatchAndRelease(&rebuildBeast, XLATCHED); + return( NULL ); + } + return( rebuildBeast ); + +} + + +/* + * ZLSSPOOL_RebuildFileExtend + * This function extends the Rebuild File (Well known ZID 7) so + * to meet the the rebuild requirement of having about .3% of a + * pool usable to rebuild the Beast Tree. If the file does not + * already exist then the file is also created. + * + * totalBlocks + * Total number of blocks that should be in the rebuild file. If + * this number is greater than the current size then we increase + * the current size of the file otherwise we leave the file alone. + */ + +STATUS ZLSSPOOL_RebuildFileExtend( + GeneralMsg_s *genMsg, + ZfsPool_s *pool, + Blknum_t totalBlocks ) +{ + + Blknum_t blocksToAdd; + Blknum_t blkNum; + File_s *rebuildBeast; + File_s *tempBeast; + STATUS status = zOK; + + ASSERT_MPKNSS_LOCK(); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Rebuild file being extended\n"))); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Looking for rebuild file\n"))); + rebuildBeast = COMN_LookupByZid(genMsg, &pool->ZFSPOOLvol, + ZFSPOOL_REBUILDFILE_ZID, XLATCHED, TRUE); + /* On success the rebuildBeast is XLATCHED */ + if ( rebuildBeast == NULL ) + { + ClearErrno( genMsg ); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Rebuild file not found so being created\n"))); + rebuildBeast = ZLSSPOOL_RebuildFileCreate( genMsg, pool ); + /* On success the rebuildBeast is XLATCHED */ + if ( rebuildBeast == NULL ) + { + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("Rebuild file create failed\n"))); + zASSERT( "Unit Test Assert - Create rebuild file failed" == NULL ); + return( zFAILURE ); + } + } + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Rebuild file is open\n"))); + + /*** The rebuildBeast is XLATCHED ***/ + blkNum = rebuildBeast->FILEroot.storage.zfsInfo->nextBlk; + + if ( blkNum < totalBlocks ) + { + IoMsg_s io; + Buffer_s *buffer; + + blocksToAdd = 
totalBlocks - blkNum; + FILEBLK_IO_MSG_FLAG(io, rebuildBeast, blkNum, blocksToAdd, + CACHE_WRITE, ALLOC_NO_ZERO_FILL); + buffer = COMN_GetFileBlk(genMsg, &io); + COMN_MARK_BEAST_DIRTY( &rebuildBeast->FILEroot); + if (buffer == NULL) + { /* Note - the file has been partially extended + * in some error cases. This is the reason we + * require the caller to pass in how big should + * we be VERSES how big a segement was added. + */ + zASSERT( "Unit Test Assert - Extend rebuild file failed (ok to do go)" == NULL ); + status = zFAILURE; + goto error_closeFile; + } + CACHE_RELEASE(buffer); + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Rebuild file has been extended\n"))); + } + +error_closeFile: + /* Full flush will cause beast to be flushed */ + rebuildBeast->FILEbstState |= BST_STATE_FULL_FLUSH; + cacheFlushMyCacheBufs(&rebuildBeast->FILEmycache); + UNX_LATCH(&rebuildBeast->FILEbeastLatch); + defaultFlushWait(&rebuildBeast->FILEmycache.agent ); + cacheTossAll(&rebuildBeast->FILEmycache); + tempBeast = rebuildBeast; + COMN_Release(&tempBeast); /* so we still have a pointer*/ + BST_free(rebuildBeast); /* Toss from memory since we will not use */ + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(CYAN, + MSGNot("Rebuild file has been closed\n"))); + return( status ); + +} + + +#if ZLSS_IO_GATHER IS_ENABLED + +/************************************************************************** + * Copy the ZLSS I/O log to SYS: + ***************************************************************************/ + +void ZFSPOOL_CopyLogFileToVolume( + char *volumeName, + int flags ) /* 0 - not used currently */ +{ + char logFilePath[32]; /* Big enough for max legacy volume and 8.3 name */ + NINT volNameLen; + char retPathString[32]; + LONG volume, pathBase, pathCount; + LONG handle; + LONG dummy; + LONG startOffset; + NINT saveLogState; + LONG result; + + ENTER(TZPOOLIO, ZFSPOOL_CopyLogFileToVolume); + if ( gZLSSPoolIOLog == NULL) return; + 
+/*--------------------------------------------------------------------------- + * Build a length preceded path string containing the volume name and + * the name of the log file. Then convert the string to Component format + * as required by the Legacy File System APIs. + *---------------------------------------------------------------------------*/ + aprintf(CYAN,MSGNot(" ** Copying ZLSS IO Statistics file to \"%s\"\n"), volumeName); + volNameLen = strlen( volumeName ); + strcpy(&logFilePath[1], volumeName); + if (logFilePath[volNameLen] != ':') + { + strcat(&logFilePath[1],MSGNot(":")); + volNameLen++; + } + + strcat(&logFilePath[1],MSGNot("ZLSS.ZSF")); + logFilePath[0]=strlen(&logFilePath[1]); + + ZOS_ConvertPathString(result,0,0,(BYTE *)logFilePath, + &volume,&pathBase,(BYTE *)retPathString,&pathCount); + if (result != zOK) + { + WARN("Bad volume name in CopyLogFileToVolume"==NULL); + RTN_VOID(); + } + + ZOS_CreateFile(result,0,NSS_TASK,volume,pathBase, + (BYTE *)retPathString,pathCount,DOSNameSpace, 0, + DELETE_FILE_ON_CREATE_BIT, PrimaryDataStream, &handle, + &dummy, (void **)&dummy) + if ( result != zOK ) + { + WARN("Creating Destination file in CopyLogFileToVolume"==NULL); + RTN_VOID(); + } + + + startOffset = 0; + /* Turn off IO logging. We can not have the circular pointers + * or data change on us when we are writing. 
+ */ + + saveLogState = gZLSSPoolIOLogDo; + gZLSSPoolIOLogDo = FALSE; + + gZLSSPoolIOHeader->ZPIOH_SignatureMajor = 0x58595451; + gZLSSPoolIOHeader->ZPIOH_SignatureMinor = 0x58303030; + gZLSSPoolIOHeader->ZPIOH_VersionMajor = 1; + gZLSSPoolIOHeader->ZPIOH_VersionMinor = 0; + gZLSSPoolIOHeader->ZPIOH_Prs = sizeof( *gZLSSPoolIOHeader ); + gZLSSPoolIOHeader->ZPIOH_Pws = gZLSSPoolIOHeader->ZPIOH_Prs + sizeof( gZLSSPRS ); + gZLSSPoolIOHeader->ZPIOH_DataOffset = gZLSSPoolIOHeader->ZPIOH_Pws + sizeof( gZLSSPWS ); +// gZLSSPoolIOHeader->ZPIOH_Servername; +// gZLSSPoolIOHeader->ZPIOH_UTCTime= ; + gZLSSPoolIOHeader->ZPIOH_Start= gZLSSPoolIOLogStart; + gZLSSPoolIOHeader->ZPIOH_Next = gZLSSPoolIOLogNext; + gZLSSPoolIOHeader->ZPIOH_Size = gZLSSPoolIOLogSize; + gZLSSPoolIOHeader->ZPIOH_ZSTOREMajorVersion = ZSTORE_VersionInfo.majorVersion; + gZLSSPoolIOHeader->ZPIOH_ZSTOREMinorVersion = ZSTORE_VersionInfo.minorVersion; + gZLSSPoolIOHeader->ZPIOH_ZSTORESubVersion = ZSTORE_VersionInfo.subVersion; + gZLSSPoolIOHeader->ZPIOH_ZSTOREBuildNumber = ZSTORE_VersionInfo.buildNumber; + // The HEADER + ZOS_WriteFile(result,0,handle,startOffset, sizeof(*gZLSSPoolIOHeader), gZLSSPoolIOHeader ); + if ( result != zOK ) + { + WARN("Writing Destination file in CopyLogFileToVolume"==NULL); + } + startOffset += sizeof(*gZLSSPoolIOHeader); + // The gZLSSPWS statistics (WRITE) + ZOS_WriteFile(result,0,handle, startOffset, sizeof(gZLSSPRS), &gZLSSPRS ); + if ( result != zOK ) + { + WARN("Writing Destination file in CopyLogFileToVolume"==NULL); + } + startOffset += sizeof(gZLSSPRS); + // The gZLSSPRS statistics (READ) + ZOS_WriteFile(result,0,handle, startOffset, sizeof(gZLSSPWS), &gZLSSPWS ); + if ( result != zOK ) + { + WARN("Writing Destination file in CopyLogFileToVolume"==NULL); + } + startOffset += sizeof(gZLSSPWS); + // Recent IO History + ZOS_WriteFile( result, 0, handle, startOffset, gZLSSPoolIOLogSize, gZLSSPoolIOLog); + if ( result != zOK) + { + WARN("Writing Destination file in 
CopyLogFileToVolume"==NULL); + } + startOffset += gZLSSPoolIOLogSize; + /* Close the log file on the legacy volume */ + ZOS_CommitFile( 0, handle ); + gZLSSPoolIOLogDo = saveLogState; + ZOS_CloseFile( 0, NSS_TASK, handle); + RTN_VOID(); + +} /* End of ZFSPOOL_CopyLogFileToVolume() */ +#endif /* #if ZLSS_IO_GATHER IS_ENABLED */ + +#if ZLSS_IO_GATHER IS_ENABLED +STATUS doZLSSIOStatsSave( + PCLSwitchDef_s *switchDef, + NINT parseOptions, + void *userParm) +{ + MPKNSS_LOCK(); + UNUSED_PARAM(switchDef); + UNUSED_PARAM(parseOptions); + UNUSED_PARAM(userParm); + + ZFSPOOL_CopyLogFileToVolume( "SYS", 0 ); + + MPKNSS_UNLOCK(); + return zOK; +} +#endif + +LangEnabledStruct_s ZFSPOOL_ZidToNameLong[ZLSS_PRS_SB_COUNT] = { + StructMSG("Volume Data Blocks",454), + StructMSG("Beast Tree",455), + StructMSG("Free Tree",490), + StructMSG("Journal File",525), + StructMSG("Purge Log(Pool)",526), /* Zid 4 */ + StructMSG("Salvage Tree",527), + StructMSG("Name Tree",570), + StructMSG("Rebuild File",576), + StructMSG("User Space Tree(Pool)",578), /* Zid 8 */ + StructMSG("Purge Log(Volume)",592), + StructMSG("User Space Tree(Volume)",660), + StructMSG("MFL Tree",770), + StructMSG("Directory Space Tree",838), /* Zid 12 */ + StructMSG("EFL Tree",579), + StructMSG("Zid 14",840), + StructMSG("Zid 15 to 127",841) +}; + + +/* + * ZLSSPOOL_DisplayIOStatistics() - + * This function displays ZLSS Physical I/O statistics to the screen. + * + * Notes - + * A QUAD is 19 digits at most in decimal. 
+ */ + +void ZLSSPOOL_DisplayIOStatistics( + unicode_t *poolName, + PoolReadStatistics_s *read, + PoolWriteStatistics_s *write, + Time_t time ) + +{ + + NINT i; + char timeStr[64]; + QUAD totalReads = 0; + QUAD totalWrites = 0; + QUAD dioRead; + QUAD dioWrite; + + aprintf(LMAGENTA, MSG("ZLSS Physical I/O Statistics(%U)\n",881),poolName); + aprintf(CYAN, MSGNot("%25.25s %19s %19s\n"), + MSG("Object",928), + MSG("Reads",971), + MSG("Writes",972)); + for ( i=0; i < ZLSS_PRS_SB_COUNT; ++i ) + { /* Only display system beast information for beasts with I/Os */ + if ( (read->PRS_SystemBeast[i] != 0) || + ( write->PWS_SystemBeast[i] != 0 ) ) + { + aprintf(LGREEN, MSGNot("%25.25s %19Lu %19Lu\n"), + StructGetMSGStr(ZFSPOOL_ZidToNameLong[i]), + read->PRS_SystemBeast[i], + write->PWS_SystemBeast[i]); + totalReads += read->PRS_SystemBeast[i]; + totalWrites += write->PWS_SystemBeast[i]; + } + } + if ( (read->PRS_SuperBlock != 0) || ( write->PWS_SuperBlock != 0 ) ) + { + aprintf(LGREEN, MSGNot("%25.25s %19Lu %19Lu\n"), + MSG("Superblocks",973), + read->PRS_SuperBlock, + write->PWS_SuperBlock); + totalReads += read->PRS_SuperBlock; + totalWrites += write->PWS_SuperBlock; + } + aprintf(YELLOW, MSGNot("%25.25s %19Lu %19Lu\n"), + MSG("System Blocks",974), + totalReads, + totalWrites ); + /* Convert DIO units into 4K units with rounding up. 
*/ +#if ((1<PRS_DirectIOBlock + ((1<PWS_DirectIOBlock + ((1<PRS_UserBlock, + write->PWS_UserBlock ); + aprintf(YELLOW, MSGNot("%25.25s %19Lu %19Lu\n"), + MSG("User Blocks",975), + read->PRS_UserBlock + dioRead, + write->PWS_UserBlock + dioWrite ); + + aprintf(YELLOW, MSGNot("%25.25s %19Lu %19Lu\n"), + MSG("All Blocks",976), + totalReads + read->PRS_UserBlock + dioRead, + totalWrites + write->PWS_UserBlock + dioWrite ); + + if ( write->PWS_BarrierIO != 0 ) + { + aprintf(LGREEN, MSGNot("%25.25s %19Lu %19Lu\n"), + MSG("Barrier Flushes",0), + (QUAD)0, + write->PWS_BarrierIO ); + } + + secondsDiffToStr( time, GetUTCTime(), timeStr ); + aprintf(LMAGENTA, MSG(" Statistics time period: %s\n",977), timeStr ); + +} /* End of ZLSSPOOL_DisplayIOStatistics() */ + + +void ZLSSPOOL_DisplayIOStatisticsGlobal( ) + +{ + + ZLSSPOOL_DisplayIOStatistics( L"Summary since boot", &gZLSSPRS, &gZLSSPWS, UpTimeToUTCTime(gZLSSIOStartUpTime) ); + +} /* End of ZLSSPOOL_DisplayIOStatisticsGlobal() */ + + +void ZLSSPOOL_DisplayIOStatisticsPool( unicode_t *poolName, ZlssPool_s *zlssPool ) + +{ + + ZLSSPOOL_DisplayIOStatistics( poolName, &zlssPool->ZP_PRS, + &zlssPool->ZP_PWS, zlssPool->ZP_StatisticsResetUTCTime ); + +} /* End of ZLSSPOOL_DisplayIOStatisticsPool() */ + + +/* + * ZLSSPOOL_ResetIOStatistics() - + * This function resets the ZLSS Physical I/O statistics. 
+ * + */ + +void ZLSSPOOL_ResetIOStatistics( + unicode_t *poolName, + PoolReadStatistics_s *read, + PoolWriteStatistics_s *write, + Time_t *time, + BOOL utcTime ) + +{ + + aprintf(YELLOW, MSGNot("ZLSS Physical I/O Statistics(%U) reset\n"),poolName); + bzero( read, sizeof(*read) ); + bzero( write, sizeof(*write) ); + if ( utcTime ) + { + *time = GetUTCTime(); + } + else + { + *time = GetUpTime(); + } + + +} /* End of ZLSSPOOL_ResetIOStatistics() */ + + +void ZLSSPOOL_ResetIOStatisticsGlobal( ) + +{ + + ZLSSPOOL_ResetIOStatistics( L"Summary since boot", &gZLSSPRS, &gZLSSPWS, &gZLSSIOStartUpTime, FALSE ); + +} /* End of ZLSSPOOL_ResetIOStatisticsGlobal() */ + + +void ZLSSPOOL_ResetIOStatisticsPool( unicode_t *poolName, ZlssPool_s *zlssPool ) + +{ + + ZLSSPOOL_ResetIOStatistics( poolName, &zlssPool->ZP_PRS, + &zlssPool->ZP_PWS, &zlssPool->ZP_StatisticsResetUTCTime, TRUE ); + +} /* End of ZLSSPOOL_ResetIOStatisticsPool() */ + + +/* + * This function supports the hidden command to display ZLSS Physical + * I/O statistics. This command is used internally to gather I/O + * information to help in making performance changes to ZLSS. + * + * These statistics are for ALL pools that the ZLSS owns. 
+ */ + +STATUS doZLSSPoolIOStatistics( + PCLSwitchDef_s *switchDef, + NINT parseOptions, + void *userParm) + +{ + Pool_s *pool; +// STATUS status; + GeneralMsg_s dummyGenMsg; + + MPKNSS_LOCK(); + UNUSED_PARAM(parseOptions); + UNUSED_PARAM(userParm); + + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + if ( uniicmp( (unicode_t *)switchDef->ret_value, MSGNot(L"_Summary") ) == 0 ) + { + ZLSSPOOL_DisplayIOStatisticsGlobal( ); + MPKNSS_UNLOCK(); + return( zOK ); + } + pool = COMN_PoolNameLookup( &dummyGenMsg, (unicode_t *)switchDef->ret_value, + FALSE /*NOT Active Only*/, NULL ); + if (pool != NULL) + { + ZlssPool_s *zlssPool; + + if ( !COMN_IsDerivedFrom(pool, zFTYPE_ZLSS_LOGICAL_POOL) ) + { /* Not a ZLSS pool so exit */ + printf(MSGNot("Pool not a ZLSS pool.\n")); + COMN_Release( &pool ); + MPKNSS_UNLOCK(); + return zOK; + } + zlssPool = (ZlssPool_s *)pool; +// status = COMN_PoolActiveLock( &dummyGenMsg, pool ); +// if ( status != zOK ) +// { /* Not DEACTIVE - really O.K. to give stats but +// * we will not as not totally correct in a cluster +// * environment. +// */ +// printf(MSGNot("Pool not in the ACTIVE state.\n")); +// COMN_Release( &pool ); +// MPKNSS_UNLOCK(); +// return zOK; +// } + +// X_LATCH( &zlssPool->ZLSSPOOLpool.stateLatch ); We do not do as UseCount will prevent pool from being freed on us. + ZLSSPOOL_DisplayIOStatisticsPool( (unicode_t *)switchDef->ret_value, zlssPool ); +// UN_XLATCH( &zlssPool->ZLSSPOOLpool.stateLatch ); + +// COMN_PoolActiveUnlock( pool ); + COMN_Release( &pool ); + } + else + { + printf(MSGNot("Pool \"%U\" not found.\n"), + (unicode_t *)switchDef->ret_value ); + } + MPKNSS_UNLOCK(); + return zOK; + +} /* End of doZLSSPoolIOStatistics() */ + + +/* + * This function supports the hidden command to display ZLSS Physical + * I/O statistics. This command is used internally to gather I/O + * information to help in making performance changes to ZLSS. + * + * These statistics are for ALL pools that the ZLSS owns. 
+ */ + +STATUS doZLSSPoolIOReset( + PCLSwitchDef_s *switchDef, + NINT parseOptions, + void *userParm) + +{ + Pool_s *pool; + GeneralMsg_s dummyGenMsg; + + MPKNSS_LOCK(); + UNUSED_PARAM(parseOptions); + UNUSED_PARAM(userParm); + + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + if ( uniicmp( (unicode_t *)switchDef->ret_value, MSGNot(L"_Summary") ) == 0 ) + { + ZLSSPOOL_ResetIOStatisticsGlobal( ); + MPKNSS_UNLOCK(); + return( zOK ); + } + pool = COMN_PoolNameLookup( &dummyGenMsg, (unicode_t *)switchDef->ret_value, + FALSE /*NOT Active Only*/, NULL ); + if (pool != NULL) + { + ZlssPool_s *zlssPool; + + if ( !COMN_IsDerivedFrom(pool, zFTYPE_ZLSS_LOGICAL_POOL) ) + { /* Not a ZLSS pool so exit */ + printf(MSGNot("Pool not a ZLSS pool.\n")); + COMN_Release( &pool ); + MPKNSS_UNLOCK(); + return zOK; + } + zlssPool = (ZlssPool_s *)pool; +// X_LATCH( &zlssPool->ZLSSPOOLpool.stateLatch ); + ZLSSPOOL_ResetIOStatisticsPool( (unicode_t *)switchDef->ret_value, zlssPool ); + /* Indicate when the reset command was done. This + * is used to see if should UNPACK the statistics information + * at pool activate time(in zlog_CommonCreateAndOpen). + */ + + zlssPool->ZP_StatisticsResetCommandUTCTime = zlssPool->ZP_StatisticsResetUTCTime; +// UN_XLATCH( &zlssPool->ZLSSPOOLpool.stateLatch ); + COMN_Release( &pool ); + } + else + { + printf(MSGNot("Pool \"%U\" not found.\n"), + (unicode_t *)switchDef->ret_value ); + } + MPKNSS_UNLOCK(); + return zOK; + +} /* End of doZLSSPoolIOReset() */ + + +/* + * ZLSSPOOL_VolumeUpgrade() - + * This function upgrades a specific ZLSS Volume from NSS 2.x to + * NSS 3.00. + * + * Note: This function releases the pool object. 
+ */ + +#ifdef NW5X_UPGRADE +void ZLSSPOOL_VolumeUpgrade( + GeneralMsg_s *genMsg, + Pool_s *pool) + +{ + ZfsPool_s *zfsPool; + NINT state; + ZlssPool_s *zlssPool; + BOOL upgrade; + BOOL cluster; + typedef struct { + unicode_t name[zMAX_COMPONENT_NAME]; + BYTE threadName[MAX_NAME_LENGTH + 1]; + } AllocStuff_s; + AllocStuff_s *as; + + as = zalloc( sizeof( *as ) ); + if ( as == NULL ) + { + printf( MSG("Out of memory. Volume upgrade did not complete.\n",4) ); + COMN_Release( &pool ); + return; + } + + COMN_GetPoolName(genMsg, pool, as->name, NELEMS(as->name)); + + if ( !COMN_IsDerivedFrom(pool, zFTYPE_ZLSS_LOGICAL_POOL) ) + { /* Not a ZLSS pool so exit */ + printf( MSG("Volume %U not a ZLSS pool.\n",981), as->name ); + COMN_Release( &pool ); + free( as ); + return; + } + + zlssPool = (ZlssPool_s *)pool; + if ( zlssPool->ZP_Version >= (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_4_DONE) ) + { + printf( MSG("Volume %U has already been upgraded.\n",982), as->name ); + COMN_Release( &pool ); + free( as ); + return; + } + X_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch ); + state = zlssPool->ZLSSPOOLstate; + UNX_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch ); + if ( state != zVOLSTATE_DEACTIVE ) + { + printf( MSG("Volume %U must be in the DEACTIVE state to upgrade. Use the\n" + "command \"NSS /poolDeactivate=pool name\" to deactivate the volume.\n",518), + as->name); + COMN_Release( &pool ); + free( as ); + return; + } + + /* This is a kludge so that the Cluster Team does not have to + * change their scripts (to call a non prompting upgrade). + * We look for their thread name and if them we do not prompt. 
+ */ + cluster = FALSE; + if ( kGetThreadName(kCurrentThread(), as->threadName, sizeof(as->threadName) ) == kSUCCESS ) + { + if ( strcmp( MSGNot("CRM_RESOURCE_THREAD"), as->threadName ) == 0 ) + { + cluster = TRUE; + } + } + + if ( cluster ) + { /* If CLUSTER software is asking us to do the upgrade then + * we do as we are not allowed to ask questions as they + * are running in a script. + */ + upgrade = TRUE; + } + else + { +// ZOS_UnformattedOutputWithAttribute(COMN_Resource.consoleScreenID, YELLOW, + aprintf( YELLOW, + MSG("Before proceeding, make sure that all processes relating to the\n" + "NetWare 65 upgrade have completed. For more information see the\n" + "Novell Storage Services Administration Guide at\n" + "http://www.novell.com/documentation/lg/nw65. Upgrading before the\n" + "server is ready can cause loss of trustees.\n", 550) ); + // Why not stdin which is wStdio which is COMN_Resource.stdio + upgrade = LB_wPromptForYesOrNo(stdin, FALSE, FALSE, + MSG("\nDo you wish to continue the upgrade now? ", 551)); +#error The above is not legal in LINUX + } + + zfsPool = ZLSS_POOL_TO_ZFS_POOL( zlssPool ); + COMN_Release( &pool ); /* Upgrade uses the 'load' useCount */ + /* Attempt to upgrade the POOL */ + + if (upgrade) + { + (void)ZLSSCON_Upgrade( genMsg, zfsPool ); + } + zfsPool = NULL; /* Pool has been unloaded and reloaded so + * can not use our old pointer. + */ + free( as ); + return; +} /* End of ZLSSPOOL_VolumeUpgrade() */ +#endif + +/* + * doZLSSVolumeUpgrade() - + * This function supports the command to upgrade a specific ZLSS Volume + * (or all volumes) from NSS 2.x to NSS 3.00. This is needed if LV AIPU fails. + * + * Notes - + * Since the command is upgrading a NSS 2.x ZLSS media we refer + * to the object as a volume. Although it really is still a + * pool/volume. 
+ */ + +#ifdef NW5X_UPGRADE +STATUS doZLSSVolumeUpgrade( + PCLSwitchDef_s *switchDef, + NINT parseOptions, + void *userParm) + +{ + Pool_s *pool; + GeneralMsg_s genMsg; + + MPKNSS_LOCK(); + + COMN_SETUP_GENERAL_MSG_NOSA(&genMsg); + +/*------------------------------------------------------------------------- + * See if doing ALL volumes + *-------------------------------------------------------------------------*/ + if (uniicmp((unicode_t *)switchDef->ret_value, MSGNot(L"all")) == 0) + { + SET_FOREACHBLOCKING(&NSSMasterPoolList, pool, Pool_s, masterPoolLink) + { /*** You MUST NOT use continue in this loop because the macro + *** SET_FOREACHBLOCKINGEND (at end of for loop) must be called + *** every time through the loop. Technically, you + *** can use a continue BEFORE any blocking calls. + ***/ + COMN_USE_BEAST(&pool->POOLroot); + ZLSSPOOL_VolumeUpgrade(&genMsg, pool); + ClearErrno(&genMsg); + SET_FOREACHBLOCKINGEND(&NSSMasterPoolList, pool, Pool_s, + masterPoolLink); + } + } + else + { + pool = COMN_PoolNameLookup(&genMsg, (unicode_t *)switchDef->ret_value, + FALSE /* NOT only active */, NULL); + if (pool != NULL) + { + ZLSSPOOL_VolumeUpgrade(&genMsg, pool); + } + else + { + printf(MSG("Volume \"%U\" not found. If the name is correct use \"NSS\n" + "/ZLSSPoolScan\" to load the volume.\n",984), + (unicode_t *)switchDef->ret_value ); + } + + } + MPKNSS_UNLOCK(); + return zOK; +} +#endif + + +/* + * ZFSPOOL_Snapshot() - + * Take a snapshot of a pool. We use when we do DEBUG pool restores. + * The caller must not use zfsPool after we return. + * + * Note - + * As called by the command line this code only executes the Snapshot + * API. Therefore no real 'snapshot' is taken. I.E. no one is making + * a 'COPY' of a pool. We are just renaming all the volumes and changing + * the volume's IDs. This is very useful when you restore a pool from a + * customer that has volume names that match the server. Specificially, the + * volume name SYS appears to be used a lot. 
 */

STATUS ZFSPOOL_Snapshot(
	GeneralMsg_s *genMsg,
	ZfsPool_s *zfsPool )

{
	StorPool_s *storagePool;

	/* Without a storage pool there is nothing to reload from */
	storagePool = zfsPool->storagepool;
	if ( storagePool == NULL )
	{
		SetErrno( genMsg, zERR_ZLSSPOOL_NO_PHYSICAL_POOL );
		return( zFAILURE );
	}

	/* Shut the pool down, then load it again from the same storage pool.
	 * zfsPool must not be referenced past this point.
	 */
	ZFSPOOL_ShutdownPool( zfsPool );
	storagePool->zfspool = NULL;

	storagePool->zfspool = ZFSPOOL_LoadPool(genMsg, storagePool,
			TRUE, ZLSS_PLF_NORMAL, TRUE, L"ZSBPOOL", FALSE );
	if ( storagePool->zfspool == NULL )
	{
		/* Reload failed: the storage pool no longer has a ZFS pool */
		storagePool->poolstatus = POL_STAT_ORPHAN;
		return(zFAILURE);
	}

	return zOK;

}	/* End of ZFSPOOL_Snapshot() */


/*
 * doZLSSPoolSnapshot() -
 *	Command line switch handler that snapshots the pool named by
 *	switchDef->ret_value. We use this when we do DEBUG pool restores.
 *	The pool must be a ZLSS pool in the DEACTIVE state; otherwise a
 *	message is printed and nothing is done.
 *
 * Returns -
 *	Always zOK. The status of ZFSPOOL_Snapshot() is discarded.
 */

STATUS doZLSSPoolSnapshot(
	PCLSwitchDef_s *switchDef,
	NINT parseOptions,
	void *userParm)

{
	ZfsPool_s *zfsPool;
	Pool_s *pool;
	NINT state;
	GeneralMsg_s dummyGenMsg;


	MPKNSS_LOCK();
	UNUSED_PARAM(parseOptions);
	UNUSED_PARAM(userParm);

	COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg );
	pool = COMN_PoolNameLookup( &dummyGenMsg, (unicode_t *)switchDef->ret_value,
			FALSE /*NOT Active Only*/, NULL );
	if (pool != NULL)
	{
		ZlssPool_s *zlssPool;

		if ( !COMN_IsDerivedFrom(pool, zFTYPE_ZLSS_LOGICAL_POOL) )
		{	/* Not a ZLSS pool so exit */
			printf( MSGNot("Pool not a ZLSS pool.\n") );
			COMN_Release( &pool );
			MPKNSS_UNLOCK();
			return zOK;
		}
		zlssPool = (ZlssPool_s *)pool;
		/* Sample the pool state under the latch */
		X_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch );
		state = zlssPool->ZLSSPOOLstate;
		UNX_LATCH( &zlssPool->ZLSSPOOLpool.cvsLatch );
		if ( state != zVOLSTATE_DEACTIVE )
		{
			printf( MSGNot("Pool must be in the DEACTIVE state to snap. Use the\n"
					"command \"NSS /poolDeactivate=%U\" to deactivate the pool.\n"),
					(unicode_t *)switchDef->ret_value );
			COMN_Release( &pool );
			MPKNSS_UNLOCK();
			return zOK;
		}
		zfsPool = ZLSS_POOL_TO_ZFS_POOL( zlssPool );
		COMN_Release( &pool );	/* Snap uses the 'load' useCount */
		/* Snapshot the pool; the status is deliberately ignored */
		(void)ZFSPOOL_Snapshot( &dummyGenMsg, zfsPool );
		zfsPool = NULL;			/* Pool has been unloaded and reloaded so
								 * can not use our old pointer.
								 */
	}
	else
	{
		printf( MSGNot("Pool \"%U\" not found. If the name is correct use \"NSS\n"
				"/ZLSSPoolScan\" to load the pool.\n"),
				(unicode_t *)switchDef->ret_value );
	}
	MPKNSS_UNLOCK();
	return zOK;

}	/* End of doZLSSPoolSnapshot() */


/*
 * doZLSSPoolIOState() -
 *	This function supports the hidden command to set the pool STATES
 *	that I/O statistics are tracked in. The globals have already
 *	been updated by the command line processor. We just need to
 *	go through ALL the ZLSS pools to update their tracking state.
 *
 *	These statistics are for ALL pools that the ZLSS owns.
 *
 * Returns -
 *	Always zOK. None of the three parameters is referenced.
 */

STATUS doZLSSPoolIOState(
	PCLSwitchDef_s *switchDef,
	NINT parseOptions,
	void *userParm)

{
	ZlssPool_s *zlssPool;
	Pool_s *pool;

	MPKNSS_LOCK();

	/* Uses a hidden key to find the 'next' pool */
	SET_FOREACHBLOCKING(&NSSMasterPoolList, pool, Pool_s, masterPoolLink)
	{	/*** You MUST NOT USE continue in this loop because the macro
		 *** SET_FOREACHBLOCKINGEND (at end of for loop) must be called
		 *** every time through the loop. Technically, you
		 *** can use a continue BEFORE any blocking calls.
		 ***/
		if ( !COMN_IsDerivedFrom(pool, zFTYPE_ZLSS_LOGICAL_POOL) )
		{	/* Not a ZLSS pool so skip it (no use count taken yet) */
			zASSERT("Wow we have non ZLSS pools(ok to do a 'go')"==NULL);
			goto cantUseContinue;
		}
		COMN_USE_BEAST( &pool->POOLroot );
		zlssPool = (ZlssPool_s *)pool;
		X_LATCH( &zlssPool->ZLSSPOOLpool.stateLatch );
		/* Copy the command line global that matches the current state */
		switch (pool->state)
		{
		case zVOLSTATE_UNKNOWN:
		case zVOLSTATE_DEACTIVE:
			zlssPool->ZP_PoolTrackIO = gCLPoolTrackDeactive;
			break;
		case zVOLSTATE_MAINTENANCE:
			zlssPool->ZP_PoolTrackIO = gCLPoolTrackMaintenance;
			break;
		case zVOLSTATE_ACTIVE:
			zlssPool->ZP_PoolTrackIO = gCLPoolTrackActive;
			break;
		default:
			/* Any other state leaves the tracking setting untouched */
			break;
		}
		UNX_LATCH( &zlssPool->ZLSSPOOLpool.stateLatch );
		COMN_Release( &pool );
cantUseContinue:
		/* Resets us to the beginning of the list */
		SET_FOREACHBLOCKINGEND(&NSSMasterPoolList, pool, Pool_s, masterPoolLink)
	}
	MPKNSS_UNLOCK();
	return zOK;

}	/* End of doZLSSPoolIOState() */


/****************************************************************************
 * CONSTRUCTOR for ZLSS Pools
 *
 *	Sets the ZLSS specific fields of a new ZlssPool_s to their initial
 *	values. genMsg is not referenced. Always returns zOK.
 *****************************************************************************/
STATUS ZLSSPOOL_BST_Construct(
	GeneralMsg_s *genMsg,
	void *zlssPool_LX)
{
	ZlssPool_s *zlssPool = (ZlssPool_s *)zlssPool_LX;

	ENTER(TZPOOL, ZLSSPOOL_BST_Construct);
	FSMLITE_INIT(&zlssPool->ZP_MakeFreeSpaceFsm,
			MSGNot("ZLSS POOL Make Free Space"), 0);
	zlssPool->ZP_UnusableFreeBlkCnt = 0;
#if NSS_DEBUG IS_ENABLED
	DBG_DebugPrintf(RED,MSGNot("A ZlssPool_s is at %lx with inuse set to %d\n"),zlssPool,zlssPool->ZLSSPOOLroot.useCount);
	DBG_DebugPrintf(RED,MSGNot("The inUse is at %lx\n"),&zlssPool->ZLSSPOOLroot.useCount);

#endif
	zlssPool->ZLSSPOOLsupportedFeatures = ZLSSPOOL_SUPPORTED_FEATURES;
	/* Statistics start counting from construction time */
	zlssPool->ZP_StatisticsResetUTCTime = GetUTCTime();
	zlssPool->ZP_StatisticsResetCommandUTCTime = (Time_t)0;
	zlssPool->ZLSSPOOLfreeBlockAdjustment = ZFS_RESERVED_BLOCKS_FOR_INTERNAL_ALLOCATION;
	bzero( &zlssPool->ZP_PWS, sizeof( zlssPool->ZP_PWS ) );
	bzero( &zlssPool->ZP_PRS, sizeof( zlssPool->ZP_PRS ) );
	zlssPool->ZP_PoolTrackIO = gCLPoolTrackDeactive;
	zlssPool->ZP_Loading = FALSE;
	zlssPool->ZP_Snapshot = FALSE;
	zlssPool->ZLSSPOOLroot.beastVersion = BEAST_VERSION_2;

	RTN_STATUS(zOK);
}	/* End of ZLSSPOOL_BST_Construct */

/****************************************************************************
 * DESTRUCTOR for ZLSS Pools
 *
 *	Nothing to tear down; only the ENTER/RTN trace pair is executed.
 *****************************************************************************/
void ZLSSPOOL_BST_Destruct(
	void *pool)
{
	ENTER(TZPOOL, ZLSSPOOL_BST_Destruct);
	RTN_VOID();
}	/* End of ZLSSPOOL_BST_Destruct */


/**************************************************************************
 *	This will do a flush of the POOL's system beasts. We simply have the
 *	internal volume do the work and return its status.
 *
 * Notes -
 *	As of 12/18/2000 no one ever calls this volume op for a pool.
 ***************************************************************************/
STATUS ZLSSPOOL_VOL_FlushSystemBeasts(
	GeneralMsg_s *genMsg,
	void *zlssPool_LX)
{
	ZlssPool_s *zlssPool = (ZlssPool_s *)zlssPool_LX;
	STATUS status;

	ENTER(TZPOOL, ZLSSPOOL_VOL_FlushSystemBeasts);
	status = ZFSPOOL_VOL_FlushSystemBeasts( genMsg, zlssPool->ZLSSPOOLzfsPool );
	RTN_STATUS(status);

}	/* End of ZLSSPOOL_VOL_FlushSystemBeasts() */


/**************************************************************************
 *
 *	This changes the state of a ZLSS Pool.
 *
 *	It is called 10 times for each common layer change state request. See
 *	COMN_ChangeVolumeState for details on the states. The Pool states
 *	are the same as the Volume states.
 *
 *	Since ZLSS Pool creates a ZfsPool_s to do much of the work for the pool
 *	we pass our work to our ZfsPool_s in pass 1. This leaves ZlssPool_s
 *	to only have to handle the persistent pool items.
 *
 *	Notes - LV loading and unloading is done by the ZLSS Pool Event handler.
 *		This was done as it is called after we know the pool STATE change
 *		worked and after the stateLatch is released. See
 *		ZLSSPOOL_ChangePoolStateExitCallBack() for details.
 *
 *	Returns - the parent op's status if it fails, the status of the
 *		internal volume's state change in pass 1 if that fails, or
 *		zFAILURE with zERR_VOLUME_STATE_CHANGE_A_TO_M when an ACTIVE
 *		request left the internal volume in MAINTENANCE. Otherwise zOK.
 *
 ***************************************************************************/
STATIC STATUS ZLSSPOOL_VOL_ChangeVolumeState(
	GeneralMsg_s *genMsg,
	void *zlssPool_LX,
	NINT sourceState,
	NINT requestedState,
	NINT mode,
	NINT pass )
{
	ZlssPool_s *zlssPool = (ZlssPool_s *)zlssPool_LX;
	STATUS status = zOK;
	statusfunc_t parentFunc;

	ENTER(TZPOOL, ZLSSPOOL_VOL_ChangeVolumeState);
/*-------------------------------------------------------------------------
 *	Since we know we need to call the Pool_s code, get the address of
 *	that routine.
 *-------------------------------------------------------------------------*/
	parentFunc = COMN_GetNextParentVolumeComnOp(zlssPool->ZLSSPOOLbeastClass,
			COMNVOLOPS_INDEX(VOL_changeVolumeState),ZLSSPOOL_VOL_ChangeVolumeState);
	zASSERT(parentFunc != NULL);
	/* The parent class runs first, on every pass */
	status = parentFunc( genMsg, &zlssPool->ZLSSPOOLpool, sourceState,
			requestedState, mode, pass );
	if ( status != zOK )
	{
		RTN_STATUS( status );
	}

	/** At this time we do all the real work in a ZlssPool_s
	 ** state change by telling our internal volume that it
	 ** needs to do a STATE change. We do this in our first
	 ** pass. This in turn forces our LVs to the correct
	 ** STATE. I.E. if we are going to a lower number STATE
	 ** then the LVs will be placed in that STATE (if they
	 ** were in a higher STATE).
	 **/
	if ( pass == 1 )
	{
		if (zlssPool->ZLSSPOOLzfsPool->storagepool != NULL)
		{	/* ???Code for if clustering is changing things on us??? */
			/* Mirror the storage pool's shared status into the
			 * enabled features before the state change.
			 */
			if ( zlssPool->ZLSSPOOLzfsPool->storagepool->sharedStatus )
			{	/* Allen says we are shared */
				zlssPool->ZLSSPOOLenabledFeatures |=
						zPOOL_FEATURE_SHARED_CLUSTER;
			}
			else
			{
				zlssPool->ZLSSPOOLenabledFeatures &=
						~zPOOL_FEATURE_SHARED_CLUSTER;
			}
		}

		status = COMN_ChangeVolumeState( genMsg, zlssPool->ZLSSPOOLzfsPool, requestedState, mode & VOLMODE_LEGAL_BITS);
		if ( status != zOK )
		{
			RTN_STATUS( status );
		}
		/** This check handles the case where the internal volume
		 ** destination state got changed from ACTIVE to MAINTENANCE
		 ** because of the zERR_VOLUME_STATE_CHANGE_A_TO_M error.
		 ** When this error occurs the change volume code changes
		 ** the destination state to MAINTENANCE and if that
		 ** state is reached NO ERROR is returned.
		 **/
		if ( (requestedState == zVOLSTATE_ACTIVE) &&
				(zlssPool->ZLSSPOOLzfsPool->ZFSPOOLstate == zVOLSTATE_MAINTENANCE ) )
		{	/* Since internal volume destination state got changed
			 * we need to change the pools. We simply return the
			 * special error code and let change pool state handle.
			 */
			DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(GREEN,
					MSGNot("ZLSS POOL - Change state converted to a request from %d to %d\n"),
					requestedState, zVOLSTATE_MAINTENANCE ));
			SetErrno( genMsg, zERR_VOLUME_STATE_CHANGE_A_TO_M );
			RTN_STATUS( zFAILURE );
		}
	}
	RTN_STATUS(status);

}	/* End of ZLSSPOOL_VOL_ChangeVolumeState() */


/* ZLSSPOOL_VOL_CommandFunction -
 *	Implements volume create. Implemented here because only the
 *	pool beast is known when a volume is created.
 *
 * Notes -
 *	We must release the useCount on the 'beast'.
 */

STATUS ZLSSPOOL_VOL_CommandFunction(
	GeneralMsg_s *genMsg,
	void *beast_LX,					/* We inherit useCount */
	NINT functionNumber,
	VCO_VolumeCommonOps_s *pCD,		/*parsedCommandData */
	NINT parmLen,
	utf8_t *parm,
	NINT dataLen,
	BYTE *commandData,
	NINT offset,
	NINT retBufLen,
	BYTE *retBuf,
	NINT *retLen)

{
	ZlssPool_s *beast = (ZlssPool_s *)beast_LX;
	STATUS status;

	zASSERT( COMN_IsDerivedFrom(beast, zFTYPE_ZLSS_LOGICAL_POOL) );
#ifdef USER_GPACHNER
	aprintf(LRED,"%s\n",WHERE);
#endif
	/* Only volume create is implemented here; every other function
	 * number fails with zERR_NOT_SUPPORTED.
	 */
	switch( functionNumber )
	{
	case VCO_VOLUME_CREATE_NUMBER:
		status = ZLSSVOL_LV_CreateAPI( genMsg,
				pCD->u.create.poolName,
				pCD->u.create.volumeName,
				pCD->u.create.authModelID,
				pCD->u.create.quota,
				&pCD->u.create.guid,
				pCD->u.create.virtualFileFlag );
		break;

		/*** Note that the rest of the VCO_VOLUME_... commands
		 *** are handled by ZFSVOL_VOL_CommandFunction() because
		 *** the commands have volumes that exist.
		 ***/
	case VCO_VOLUME_RENAME_NUMBER:
		status = zFAILURE;
		SetErrno( genMsg, zERR_NOT_SUPPORTED );
		break;
	case VCO_VOLUME_DELETE_NUMBER:
		status = zFAILURE;
		SetErrno( genMsg, zERR_NOT_SUPPORTED );
		break;
	case VCO_VOLUME_DELETE_ACTION_NUMBER:
		status = zFAILURE;
		SetErrno( genMsg, zERR_NOT_SUPPORTED );
		break;
	default:
		status = zFAILURE;
		SetErrno( genMsg, zERR_NOT_SUPPORTED );
		break;
	}
	/* Drop the inherited use count on every path */
	COMN_Release( &beast );
	return( status );

}	/* End of ZLSSPOOL_VOL_CommandFunction() */


/*
 * ZLSSPOOL_BST_ModifyInfo() -
 *	Modifies metadata information for a ZLSS POOL beast object.
 *	The parent class's modifyInfo op runs first; then the pool
 *	attributes (only the MSAP feature bit is changeable here), the NDS
 *	object ID and the low/high water marks are applied as selected by
 *	modifyMsg->modifyInfoMask. Changed values are written with
 *	ZLSSPOOL_WritePersistentPoolData().
 *
 * Returns -
 *	zOK, or zFAILURE when the parent op fails, the caller lacks the
 *	required privilege, or a non-changeable feature was requested.
 *
 * Notes -
 *	Asserts that the caller holds the MPKNSS lock and has the beast
 *	latch XLATCHed. The water mark values in modifyMsg->modifyInfo are
 *	clamped in place.
 */

STATUS ZLSSPOOL_BST_ModifyInfo(
	GeneralMsg_s *genMsg,
	RootBeast_s *zlssPool_LX,
	ModifyInfoMsg_s *modifyMsg,
	Xaction_s *xaction)		/* Optional xaction, may be NULL */

{
	ZlssPool_s *zlssPool = (ZlssPool_s *)zlssPool_LX;
	zPoolInfo_s *poolInfo;
	zVolumeInfo_s *volInfo;
	statusfunc_t derivedFromModifyInfo;
	BOOL modified = FALSE;
	BOOL someAttrsChanged = FALSE;
	BOOL someAttrsNotChanged = FALSE;

	ASSERT_MPKNSS_LOCK();
	ASSERT_XLATCH( &zlssPool->ZLSSPOOLfile.FILEbeastLatch );

/*---------------------------------------------------------------------------
 *	First, call the generic file information routine and modify any
 *	requested generic information. If it fails, do not continue...
 *---------------------------------------------------------------------------*/
	derivedFromModifyInfo = COMN_GetNextParentBeastComnOp(
			zlssPool->ZLSSPOOLbeastClass, COMNOPS_INDEX( BST_modifyInfo ),
			ZLSSPOOL_BST_ModifyInfo);

	if (derivedFromModifyInfo(genMsg,zlssPool,modifyMsg,xaction) != zOK)
	{
		return(zFAILURE);
	}

	/* Nothing pool specific was requested */
	if ((modifyMsg->modifyInfoMask == 0) || (modifyMsg->modifyInfo == NULL))
	{
		return zOK;
	}

	poolInfo = &modifyMsg->modifyInfo->pool;
	volInfo = &modifyMsg->modifyInfo->vol;

	/* Attribute and NDS object ID changes need an authorization check */
	if (modifyMsg->modifyInfoMask &
			(zMOD_POOL_ATTRIBUTES | zMOD_POOL_NDS_OBJECT_ID))
	{
		zASSERT(!(zlssPool->ZLSSPOOLenabledFeatures & zPOOL_FEATURE_READ_ONLY));

		if (zlssPool->ZLSSPOOLauth.mayIDoThis(genMsg, zlssPool,
#ifdef NSS_NW60
				modifyMsg->parentZid, MAY_I_DO_EVERYTHING) != zOK)
#else
				modifyMsg->parentZid, MAY_I_SUPERVISE) != zOK)
#endif
		{
			SetErrno(genMsg,zERR_NO_SET_PRIVILEGE);
			return(zFAILURE);
		}
	}


	if (modifyMsg->modifyInfoMask & zMOD_POOL_ATTRIBUTES)
	{
		if ((poolInfo->features.enableModMask &
				ZLSSPOOL_NON_CHANGEABLE_FEATURES) !=
			(((poolInfo->features.enabled & poolInfo->features.enableModMask) &
				ZLSSPOOL_NON_CHANGEABLE_FEATURES)))
		{
			/* One of the bits requested to be changed is a
			 * non-changeable feature */
			someAttrsNotChanged = TRUE;
		}

		/*
		 * LOCAL
		 */
		if ((poolInfo->features.enableModMask & zPOOL_FEATURE_MSAP) &&
			((poolInfo->features.enabled & zPOOL_FEATURE_MSAP) !=
			(zlssPool->ZLSSPOOLenabledFeatures & zPOOL_FEATURE_MSAP)))
		{
			/* Local is being changed */
			zlssPool->ZLSSPOOLenabledFeatures &= ~zPOOL_FEATURE_MSAP;
			zlssPool->ZLSSPOOLenabledFeatures |=
				(poolInfo->features.enabled & zPOOL_FEATURE_MSAP);
			someAttrsChanged = TRUE;
		}

		if (someAttrsChanged)
		{
			/* Some attributes did change, so notify the common layer */
/*			COMN_PoolAttributesChanged(genMsg, (Volume_s *)zlssVolume); We do not support at this time */
			if (someAttrsNotChanged)
			{
				/* Some attrs did not change, so set the return error */
				SetErrno(genMsg,zERR_SOME_ATTRS_NOT_CHANGED);
			}
			modified = TRUE;
		}
		else if (someAttrsNotChanged)
		{
			/* No attrs were changed */
			SetErrno(genMsg,zERR_ALL_ATTRS_NOT_CHANGED);
		}
#if 0
// Shared is a non-changeable (by the user) feature of the pool.
//		/* Shared - even though this bit is in the persistent
//		 * field it is not persistent!!! The partition segment
//		 * is the keeper of this bit!!!
//		 */
//
//		if ((poolInfo->features.enableModMask &
//				zPOOL_FEATURE_SHARED_CLUSTER) &&
//			((poolInfo->features.enabled &
//				zPOOL_FEATURE_SHARED_CLUSTER) !=
//			(zlssPool->ZLSSPOOLenabledFeatures &
//				zPOOL_FEATURE_SHARED_CLUSTER)))
//		{
//			/* shared cluster feature is being changed
//			 * this bit is persistently stored by the partition segment
//			 * so we will call NSSIDK/MM to save this information
//			 */
//
//			if (ChangeSharedClusterFeatureOfPool(genMsg, zlssPool,
//					poolInfo) != zOK)
//			{
//				ClearErrno( genMsg );
//				someAttrsNotChanged = TRUE;
//			}
//		}
#endif
	}
	if (modifyMsg->modifyInfoMask & zMOD_POOL_NDS_OBJECT_ID)
	{
		zlssPool->ZLSSPOOLndsObjectID = poolInfo->ndsObjectID;
		OID_SaveObjectID((Volume_s *)zlssPool->ZP_ZfsPool,
				&zlssPool->ZLSSPOOLndsObjectID);
		modified = TRUE;
	}


	/* Water marks are clamped to their legal range before being stored */
	if (modifyMsg->modifyInfoMask & zMOD_VOL_LOW_WATER_MARK)
	{
		if (volInfo->salvage.lowWaterMark > zMAX_LOWWATERMARK)
			volInfo->salvage.lowWaterMark = zMAX_LOWWATERMARK;
		if (volInfo->salvage.lowWaterMark < zMIN_LOWWATERMARK)
			volInfo->salvage.lowWaterMark = zMIN_LOWWATERMARK;
		zlssPool->ZLSSPOOLlowWaterMark = volInfo->salvage.lowWaterMark;
		modified = TRUE;
	}

	if (modifyMsg->modifyInfoMask & zMOD_VOL_HIGH_WATER_MARK)
	{
		if (volInfo->salvage.highWaterMark > zMAX_HIGHWATERMARK)
			volInfo->salvage.highWaterMark = zMAX_HIGHWATERMARK;
		if (volInfo->salvage.highWaterMark < zMIN_HIGHWATERMARK)
			volInfo->salvage.highWaterMark = zMIN_HIGHWATERMARK;
		zlssPool->ZLSSPOOLhighWaterMark = volInfo->salvage.highWaterMark;
		modified = TRUE;
	}
/* Even if not changing high water mark, force it to be at least
*  2% greater than low water mark */
	if (zlssPool->ZLSSPOOLhighWaterMark < zlssPool->ZLSSPOOLlowWaterMark+2)
	{
		zlssPool->ZLSSPOOLhighWaterMark = zlssPool->ZLSSPOOLlowWaterMark+2;
		modified = TRUE;
	}

	if (modified)
	{
		ZLSSPOOL_WritePersistentPoolData(zlssPool);
	}

	/* NOTE(review): this sets zERR_ALL_ATTRS_NOT_CHANGED even when
	 * zERR_SOME_ATTRS_NOT_CHANGED was set above because some attributes
	 * did change. Whether the earlier value survives depends on how
	 * SetErrno treats an already-set error -- confirm.
	 */
	if (someAttrsNotChanged)
	{
		SetErrno(genMsg,zERR_ALL_ATTRS_NOT_CHANGED);
		return zFAILURE;
	}
	return zOK;
}	/* End of ZLSSPOOL_BST_ModifyInfo() */


/**************************************************************************
 * ZLSS POOL COMMON BEAST OPERATIONS definition
 ***************************************************************************/

/*-------------------------------------------------------------------------
 *	Defines all of the ZLSS pool beast operations. The initializers are
 *	positional: each entry must stay in the slot named by its comment.
 *	NULL entries are not implemented by this class.
 *-------------------------------------------------------------------------*/
CommonBeastOps_s ZLSSPOOL_ComnBeastOps =
{
	ZLSSPOOL_BST_Construct,			/*construct*/
	ZLSSPOOL_BST_Destruct,			/*destruct*/

//	cnt NULL,						/* BST_getNameUniquifier */
	NULL,							/* BST_setupNameTypeSpecificInfo */
	NULL,							/* BST_lookupByNameInDirectory*/
	NULL,							/* BST_isDirectoryEmpty*/
	NULL,							/* BST_addNameToDirectory*/
	NULL,							/* BST_removeNameFromDirectory*/
	NULL,							/* BST_modifyNameSpaceMaskInDirectory*/
	NULL,							/* BST_setMatchAttributesInDirectory*/
	NULL,							/* BST_wildcardLookup*/

	NULL,							/* BST_truncateFile*/
	NULL,							/* BST_getStorageInfo*/
	NULL,							/* BST_getExtentList*/
	NULL,							/* BST_getPhysicalExtent*/
	NULL,							/* BST_isBlockInBeast*/

	NULL,							/* BST_asyncReadFileBlk*/
	NULL,							/* BST_getFileBlk*/
	NULL,							/* BST_dfsReadUnits*/
	NULL,							/* BST_dfsWriteUnits*/

	NULL,							/* BST_getZID*/
	NULL,							/* BST_beastNotify*/
	NULL,							/* BST_getInfo*/
	ZLSSPOOL_BST_ModifyInfo,		/* BST_modifyInfo*/
	NULL,							/* BST_getInfoXML*/
	NULL,							/* BST_modifyInfoXML*/
};


/*-------------------------------------------------------------------------
 *	Volume Operations. Positional, like the beast operations above.
 *-------------------------------------------------------------------------*/
CommonVolumeOps_s ZLSSPOOL_ComnVolOps =
{
//	cnt NULL,						/* VOL_getNameUniquifier */
	NULL,							/* VOL_setupNameTypeSpecificInfo */
	NULL,							/* VOL_lookupByNameInDirectory */
	NULL,							/* VOL_isDirectoryEmpty */
	NULL,							/* VOL_addNameToDirectory */
	NULL,							/* VOL_removeNameFromDirectory */
	NULL,							/* VOL_modifyNameSpaceMaskInDirectory */
	NULL,							/* VOL_setMatchAttributesInDirectory */
	NULL,							/* VOL_wildcardLookup */

	NULL,							/* VOL_truncateFile */
	NULL,							/* VOL_getStorageInfo */
	NULL,							/* VOL_getExtentList */
	NULL,							/* VOL_getPhysicalExtent */
	NULL,							/* VOL_isBlockInBeast */

	NULL,							/* VOL_asyncReadFileBlk */
	NULL,							/* VOL_getFileBlk */
	NULL,							/* VOL_dfsReadUnits */
	NULL,							/* VOL_dfsWriteUnits */

	NULL,							/* VOL_beginXLocal */
	NULL,							/* VOL_endXLocal */
	NULL,							/* VOL_addPurgeLogEntry */
	NULL,							/* VOL_removePurgeLogEntry */
	NULL,							/* VOL_writeVolumeLoggedData */

	ZLSSPOOL_VOL_ChangeVolumeState,	/* VOL_changeVolumeState */
	ZLSSPOOL_VOL_VolumeMaintenance,	/* VOL_volumeMaintenance */
	NULL,							/* VOL_allocStorageInfo */
	NULL,							/* VOL_freeStorageInfo */
	NULL,							/* VOL_getBeastFromVolume */
	NULL,							/* VOL_updateBeastToVolume */
	NULL,							/* VOL_insertBeastIntoVolume */
	NULL,							/* VOL_removeBeastFromVolume */
	NULL,							/* VOL_allocateZIDs */
	ZLSSPOOL_VOL_FlushSystemBeasts,	/* VOL_flushSystemBeasts */
	ZLSSPOOL_VOL_MakeVolumeFreeSpace,	/* VOL_makeVolumeFreeSpace */
	NULL,							/* VOL_browseBeastsInVolume */

	NULL,							/* VOL_getUserSpaceInfo */
	NULL,							/* VOL_setUserSpaceRestriction */
	NULL,							/* VOL_adjustUsedUserSpace */
	NULL,							/* VOL_removeUser */
	NULL,							/* VOL_ResetAllUsers */
	NULL,							/* VOL_browseUsersInVolume */

	NULL,							/* VOL_insertMFLEntry */
	NULL,							/* VOL_deleteMFLEntry */
	NULL,							/* VOL_lookupMFLEntry */
	NULL,							/* VOL_enumerateMFL */
	NULL,							/* VOL_administerMFL */
	ZLSSPOOL_VOL_CommandFunction,	/* VOL_commandFunction */

	NULL,							/* VOL_getDirQuotaInfo */
	NULL,							/* VOL_setDirQuota */
	NULL,							/* VOL_adjustUsedDirSpace */
	NULL,							/* VOL_removeDirectory */
	NULL,							/* VOL_resetAllDirEntries */
	NULL,							/* VOL_browseDirsInVolume */

	NULL,							/* VOL_getObjectName */
	NULL,							/* VOL_insertObjectName */
	NULL,							/* VOL_resetAllObjects */
	NULL,							/* VOL_modifyObjectName */

	NULL,							/* VOL_insertEFLEntry */
	NULL,							/* VOL_deleteEFLEntry */
	NULL,							/* VOL_lookupEFLEntry */
	NULL,							/* VOL_enumerateEFL */
	NULL,							/* VOL_administerEFL */
	NULL,							/* VOL_resetEFL */

	NULL,							/* VOL_FCNTL */
};


/* ZLSSPOOL_MediaIsCorrupt() -
 *	Handles the actions that must be done when ZLSS detects that
 *	a system block is corrupt. Currently, the actions are as follows.
 *		1) Disables the volume.
 *		2) Sends NetWare alerts.
 *		3) Displays detailed error messages
 *
 * Returns -
 *	Standard STATUS - currently only zOK.
 *
 * Notes -
 *	Routine will detect that verify or rebuild is running. In which
 *	case no actions are taken as verify and rebuild must handle it.
 *	genMsg is only read (GetErrno / GetErrnoSetter); the alert itself
 *	is raised with a local GeneralMsg_s.
 */

STATUS ZLSSPOOL_MediaIsCorrupt(
	GeneralMsg_s *genMsg,
	Buffer_s *corruptedBuffer,
	IoMsg_s *iomsg )

{
	RootBeast_s *beast;		/* The beast the corruption was found in */
	Volume_s *volume;		/* The volume that the buffer is on. Note
							 * that this is not the 'owning' volume if
							 * the beast is a volume because the ADMIN
							 * volume owns all volumes.
							 */


	beast = STRUCT(corruptedBuffer->pBuf.mycache, RootBeast_s, ROOTmycache);
	ZLSS_VOLUME_GET( beast, volume );
	zASSERT( volume != NULL );
	/* If not /VERIFY or /REBUILD then DISABLE the
	 * volume. The repair code handles errors so we
	 * do not wish to DISABLE the volume. Note that
	 * we do not simply check the volume's state because
	 * we would be required to obtain the stateLatch and
	 * if we did that we could easily deadlock.
	 */
	if ( !ZLSS_IS_MAINTENANCE_IO( volume ) )
	{
		GeneralMsg_s dummyGenMsg;
		NINT flags;

		COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg );

		zASSERT("We have found corruption on a NSS volume (if possible contact NSS)."==NULL);
		/* Now tell the common layer that we wish to disable this
		 * pool. This is an ASYNC call. We require because we
		 * can not block. CVA_NON_IO is set because we did not
		 * get an I/O read we just have a bad MAGIC.
		 */
		flags = CVA_SYSTEM_DATA | CVA_POOL_DISABLE | CVA_POOL_ALERT | CVA_NON_IO;
		if ( !COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_ZFSPOOL) )
		{	/* Set Volume alert for Logical Volumes only. I.E. do
			 * NOT say that the internal volume is a volume.
			 * This prevents the internal volume name from being
			 * seen.
			 */
			flags |= CVA_VOLUME_DISABLE | CVA_VOLUME_ALERT;
		}
		if ( iomsg->mode == CACHE_READ )
		{	/* No sense doing DISABLE on CACHE_READ as error
			 * is easily handled by caller.
			 */
			flags &= ~(CVA_POOL_DISABLE | CVA_VOLUME_DISABLE);
		}
		(void)COMN_VolumeAlert( &dummyGenMsg, beast, volume,
				corruptedBuffer, corruptedBuffer->pBuf.fileBlk,
				corruptedBuffer->volBlk, GetErrno(genMsg), GetErrnoSetter(genMsg), flags );
		ClearErrno( &dummyGenMsg );
	}
	else
	{
#if NSS_DEBUG IS_ENABLED
		errPrintf( GetErrnoSetter(genMsg), Module, 1468,
				MSGNot("Corruption detected on Volume \"???\". Run verify\n"
				"File block %Ld(volume block %Ld) of object %Lx is corrupt\n"),
				(QUAD)corruptedBuffer->pBuf.fileBlk,
				(QUAD)corruptedBuffer->volBlk,
				beast->zid );
#endif
		zASSERT("We have found corruption on a NSS volume"==NULL);
	}
	return( zOK );

}	/* End of ZLSSPOOL_MediaIsCorrupt() */


/*
 * ZFSPOOL_SuperBlockUpdate() -
 *	Writes the 4 super block headers.
 *
 * Returns -
 *	zOK on success else could not write all four super block headers.
+ * When an error is returned some super blocks headers may have been + * written. + */ + +STATUS ZFSPOOL_SuperBlockUpdate( GeneralMsg_s *genMsg, ZfsPool_s *pool ) + +{ + STATUS status; + int superblock; + + /* Note that the checksum will be updated by the SuperBlock write + * header code below. Having the checksum correct in the + * ZP_Super (memory version of Super Block) is not REQUIRED because + * we always calculate it when writing the header. + */ + for ( superblock=0; superblock < SUPERBLOCK_NUMBER; superblock++ ) + { + zASSERT( pool->storagepool != NULL ); + status = ZFSPOOL_SuperBlockHeaderWrite(pool->storagepool, + &pool->ZP_super->SB_Header, + pool->ZP_super->SB_Header.hdr.superlocation[superblock] + + SUPERBLOCKHEADER_SECTION, SUPERBLOCKHEADER_BLKCOUNT ); + if (status != zOK) + { + SetErrno( genMsg, status ); + errPrintf(WHERE, Module, 1414, + MSG("Could not write superblock header, status=%d.\n" + "Run Verify.\n", 460), status); + return(zFAILURE); + } + } + return( zOK ); + +} /* End of ZFSPOOL_SuperBlockUpdate() */ + + +/* + * ZFSPOOL_SuperBlockUpdate2() - + * Writes the 4 super block headers. + * + * Returns - + * zOK on success else could not write all four super block headers. + * When an error is returned some super blocks headers may have been + * written. + */ + +STATUS ZFSPOOL_SuperBlockUpdate2( + GeneralMsg_s *genMsg, + StorPool_s *storagepool, + MediaSuperBlockHeader_s *super ) + +{ + STATUS status; + int superblock; + + /* Note that the checksum will be updated by the SuperBlock write + * header code below. Having the checksum correct in the + * ZP_Super (memory version of Super Block) is not REQUIRED because + * we always calculate it when writing the header. 
+ */ + for ( superblock=0; superblock < SUPERBLOCK_NUMBER; superblock++ ) + { + status = ZFSPOOL_SuperBlockHeaderWrite(storagepool, + super, + super->hdr.superlocation[superblock] + + SUPERBLOCKHEADER_SECTION, SUPERBLOCKHEADER_BLKCOUNT ); + if (status != zOK) + { + SetErrno( genMsg, status ); + errPrintf(WHERE, Module, 1488, + MSG("Could not write superblock header, status=%d.\n" + "Run Verify.\n", 930), status); + return(zFAILURE); + } + } + return( zOK ); + +} /* End of ZFSPOOL_SuperBlockUpdate2() */ + + +/* + * ZFSPOOL_SuperBlockUpdateXaction() - + * Writes the 4 super block headers as part of a transaction. The caller + * must have the ZP_SuperblockHeaderLatch XLATCHed and must keep the latch + * until the transaction ends. + * + * Returns - + * zOK on success else could not write all four super block headers. + * When an error is returned some super blocks headers may have been + * written. + * + * Notes - + * The ZP_SuperblockHeaderLatch must be owned until after the + * xaction is ended. This is required because on REDO/UNDO of the + * superblock header is physically REDOne and UNDOne. I.E. is some + * one else changed and managed to end their xaction before us then + * their changes will be lost via a UNDO of the non-ended XACTION. This + * is TRUE only since Logical Undo has been added. Before then we would + * UNDO committed xactions so not to have someones xaction partially undone. 
+ */ + +STATUS ZFSPOOL_SuperBlockUpdateXaction( + GeneralMsg_s *genMsg, + ZfsPool_s *zfsPool, + ZfsXaction_s *xaction ) + +{ + SNINT superblock; + SuperBlockHeader_s *superblockHeader; + IoMsg_s iomsg; + Buffer_s *sbhBuf[SUPERBLOCK_NUMBER]; + ZfsXasRecovery_s *logBuffer; + BlockInfo_s *poolBlks; + SbhLogRecord_s *logRecord; + + zASSERT( xaction != NULL ); + zASSERT( zfsPool->ZP_super != NULL ); + ASSERT_XLATCH( &zfsPool->ZP_SuperblockHeaderLatch ); + superblockHeader = &zfsPool->ZP_super->SB_Header.hdr; + + /* Grab the buffers before ZLOG_ObtainRecord because + * ZFS_ReadPoolBlk can block. + */ + for ( superblock=0; superblock < SUPERBLOCK_NUMBER; superblock++ ) + { + zASSERT( zfsPool->storagepool != NULL ); + /* CACHE_UPDATE because we need to get 'old' data */ + READBLK_IO_MSG( iomsg, zfsPool, + zfsPool->ZP_super->SB_Header.hdr.superlocation[superblock] + + SUPERBLOCKHEADER_SECTION, CACHE_UPDATE ); + sbhBuf[superblock] = ZFS_ReadPoolBlk(genMsg, &iomsg); + if ( sbhBuf[superblock] == NULL ) + { + for ( --superblock; superblock >= 0; superblock-- ) + { + CACHE_RELEASE(sbhBuf[superblock]); + } +// UNX_LATCH( &zfsPool->ZP_SuperblockHeaderLatch ); +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(LRED,"%s Logging superblock header ERROR\n",WHERE); +#endif + return zFAILURE; + } + } + +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf(CYAN,"%s Logging superblock header\n",WHERE); +#endif + ZLOG_ObtainRecord( xaction, + ZLOG_BLOCK_INFO_SIZE(SUPERBLOCK_NUMBER) + sizeof(*logRecord) ); + + /* create the log record */ + ZLOG_INIT_LOG_RECORD(XFUNC_SB_HEADER, xaction, logBuffer, SUPERBLOCK_NUMBER, + poolBlks, logRecord); + + zASSERT( sizeof(logRecord->SLR_ValuesOld) == sizeof(logRecord->SLR_ValuesNew) ); + memcpy( logRecord->SLR_ValuesOld, sbhBuf[0]->pBuf.data, sizeof(logRecord->SLR_ValuesOld)); + + /* We must update checksum as we are placing superblock header + * into a cache buffer (I.E. 
ZFSPOOL_SuperBlockHeaderWrite will + * not be used to write it so ZFSPOOL_SuperBlockHeaderWrite can not + * update checksum for us). + */ + superblockHeader->SBH_Lsn = logBuffer->ZXR_Lsn; + superblockHeader->SBH_VersionMajor = SUPERBLOCK_SB_VM_MAJOR; + superblockHeader->SBH_VersionMinor = SUPERBLOCK_SB_VM_MINOR; + superblockHeader->SBH_PackedSize = sizeof(SuperBlockHeader_s); + superblockHeader->SBH_Checksum = 0; + superblockHeader->SBH_Checksum = 0 - + ZFSPOOL_SuperBlockHeaderCalculateChecksum( + (LONG *)superblockHeader, + sizeof( SuperBlockHeader_s ) / 4 ); + for ( superblock=0; superblock < SUPERBLOCK_NUMBER; superblock++ ) + { +#ifdef USER_GPACHNER +aprintf(LRED,"SBH %d LSN %Lx Log rec old val size %d superblockhr size %d\n", superblock,((SuperBlockHeader_s *)(sbhBuf[superblock]->pBuf.data))->SBH_Lsn,sizeof(logRecord->SLR_ValuesOld),sizeof(SuperBlockHeader_s)); +#endif + ZLOG_ASSIGN_BLOCK_INFO( poolBlks[superblock], + sbhBuf[superblock]->volBlk, + ((MediaSuperBlockHeader_s *)(sbhBuf[superblock]->pBuf.data))->hdr.SBH_Lsn, + sbhBuf[superblock], xaction, superblock/*, bbtCompare*/); + memcpy( sbhBuf[superblock]->pBuf.data, superblockHeader, sizeof(*superblockHeader)); + zASSERT( SUPERBLOCK_SECTION_SIZE == 4096 ); /* Must not be different than cache size */ + bzero( sbhBuf[superblock]->pBuf.data+sizeof(*superblockHeader), SUPERBLOCK_SECTION_SIZE-sizeof(*superblockHeader) ); + ZLOG_BIND(xaction, sbhBuf[superblock]); + } + zASSERT( sizeof(superblockHeader) <= sizeof(logRecord->SLR_ValuesOld) ); + memcpy( &logRecord->SLR_ValuesNew, sbhBuf[0]->pBuf.data, sizeof(logRecord->SLR_ValuesNew)); + + ZLOG_ReleaseRecord(xaction); + for ( superblock=0; superblock < SUPERBLOCK_NUMBER; superblock++ ) + { + CACHE_DIRTY_RELEASE( sbhBuf[superblock] ); + } + return( zOK ); + +} /* End of ZFSPOOL_SuperBlockUpdateXaction() */ + + +/* + * ZLSSPOOL_RecoverySuperblockHeader() - + * REDO/UNDO routine for XFUNC_SB_HEADER function. 
This routine
+ * is aware that we ALREADY have an IN MEMORY copy of the superblock header.
+ * I.E. it will update the physical block and IN MEMORY copy of it.
+ *
+ * Notes -
+ *	XFUNC_SB_HEADER is only used by ZFSPOOL_SuperBlockUpdateXaction
+ *	which is only used during LV AIPU.
+ *
+ * Parameters -
+ *	genMsg    - passed to ZFS_ReadPoolBlk when each superblock is read.
+ *	pool      - pool whose superblock header copies are replayed; its
+ *	            in-memory ZP_super header is refreshed as well.
+ *	logBuffer - recovery record supplying the pool block list, the
+ *	            old/new header images and the record's LSN.
+ *	action    - X_REDO applies SLR_ValuesNew; any other value takes the
+ *	            undo path and applies SLR_ValuesOld.
+ *
+ * Returns -
+ *	zFAILURE as soon as a superblock copy cannot be read, otherwise zOK.
+ */
+
+STATUS ZLSSPOOL_RecoverySuperblockHeader(
+	struct GeneralMsg_s *genMsg,
+	struct ZfsPool_s *pool,
+	struct ZfsXasRecovery_s *logBuffer,
+	NINT action)
+
+{
+	int superblock;
+	SbhLogRecord_s *logRecord;
+	Buffer_s *buffer[SUPERBLOCK_NUMBER];
+	MediaSuperBlockHeader_s *block;
+	IoMsg_s iomsg;
+	BlockInfo_s *poolBlks;
+	ZlssPool_s *zlssPool;
+
+	ENTER(TZVOL,ZLSSPOOL_RecoverySuperblockHeader);
+
+	zlssPool = ZLSS_VOLUME_TO_ZLSS_POOL( &pool->ZFSPOOLzfsvol );
+	zASSERT( zlssPool != NULL );
+	poolBlks = ZLOG_START_OF_POOL_BLOCKS(logBuffer);
+
+	/* Visit every superblock copy that the log record names */
+	for ( superblock=0; superblock < SUPERBLOCK_NUMBER; superblock++ )
+	{
+		if (ZLOG_VALID_BLOCK(poolBlks[superblock]))
+		{	/* Must be a CACHE_UPDATE because ZLOG_ALREADY_DONE
+			 * references LSN in block.
+			 */
+			READBLK_IO_MSG(iomsg, pool, poolBlks[superblock].blkNum, CACHE_UPDATE)
+			buffer[superblock] = ZFS_ReadPoolBlk( genMsg, &iomsg );
+			if (buffer[superblock] == NULL)
+			{
+				/* NOTE(review): leaves through a plain return() although
+				 * ENTER() was used above and the normal exit uses
+				 * RTN_STATUS() - confirm this is intended.
+				 */
+				return( zFAILURE );
+			}
+			block = (MediaSuperBlockHeader_s *)buffer[superblock]->pBuf.data;
+			zASSERT(block->hdr.SBH_Signature == SUPERBLOCK_SB_S_SIGNATURE);
+			if (ZLOG_ALREADY_DONE(pool, logBuffer, block->hdr.SBH_Lsn, action))
+			{
+				/* Nothing to apply to this copy, release it unmodified */
+				CACHE_RELEASE(buffer[superblock]);
+			}
+			else
+			{
+				logRecord = ZLOG_START_OF_LOG_RECORD(logBuffer);
+				if (action == X_REDO)
+				{
+#ifdef USER_GPACHNER
+					zASSERT("REDO of superblock header"==NULL);
+#endif
+					zASSERT( block->hdr.SBH_Lsn == poolBlks[superblock].previousLsn);
+					/* Overwrite the on-disk image with the NEW values and
+					 * zero the remainder of the superblock section.
+					 */
+					memcpy( block, logRecord->SLR_ValuesNew, sizeof(logRecord->SLR_ValuesNew));
+					zASSERT( SUPERBLOCK_SECTION_SIZE == 4096 ); /* Must not be different than cache size */
+					bzero( ((BYTE *)block)+sizeof(logRecord->SLR_ValuesNew), SUPERBLOCK_SECTION_SIZE-sizeof(logRecord->SLR_ValuesNew) );
+					zASSERT( block->hdr.SBH_Lsn == logBuffer->ZXR_Lsn );
+					/* Reload NEW info into memory copy of super block header */
+					zASSERT( pool->ZP_super != NULL );
+#if NSS_DEBUG IS_ENABLED
+					DBG_DebugPrintf(LRED,"1) Zlss pool version is now %ld (long value)\n", zlssPool->ZP_Version );
+#endif
+					memcpy( &pool->ZP_super->SB_Header.hdr, logRecord->SLR_ValuesNew, sizeof(logRecord->SLR_ValuesNew));
+					/* Version is kept as (media major * 0x100) + media minor */
+					zlssPool->ZP_Version = (pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMajor * 0x100L) +
+							pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMinor;
+#if NSS_DEBUG IS_ENABLED
+					DBG_DebugPrintf(LRED,"2) Zlss pool version is now %ld (long value)\n", zlssPool->ZP_Version );
+#endif
+				}
+				else
+				{
+#ifdef USER_GPACHNER
+					zASSERT("UNDO of superblock header"==NULL);
+#endif
+					zASSERT( block->hdr.SBH_Lsn == logBuffer->ZXR_Lsn );
+					/* Overwrite the on-disk image with the OLD values and
+					 * zero the remainder of the superblock section.
+					 */
+					memcpy( block, logRecord->SLR_ValuesOld, sizeof(logRecord->SLR_ValuesOld));
+					zASSERT( SUPERBLOCK_SECTION_SIZE == 4096 ); /* Must not be different than cache size */
+					bzero( ((BYTE *)block)+sizeof(logRecord->SLR_ValuesOld), SUPERBLOCK_SECTION_SIZE-sizeof(logRecord->SLR_ValuesOld) );
+					zASSERT( block->hdr.SBH_Lsn == poolBlks[superblock].previousLsn);
+					/* Reload OLD info into memory */
+#if NSS_DEBUG IS_ENABLED
+					DBG_DebugPrintf(LRED,"3) Zlss pool version is now %ld (long value)\n", zlssPool->ZP_Version );
+#endif
+					zASSERT( pool->ZP_super != NULL );
+					memcpy( &pool->ZP_super->SB_Header.hdr, logRecord->SLR_ValuesOld, sizeof(logRecord->SLR_ValuesOld));
+					/* Version is kept as (media major * 0x100) + media minor */
+					zlssPool->ZP_Version = (pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMajor * 0x100L) +
+							pool->ZP_super->SB_Header.hdr.SBH_VersionMediaMinor;
+#if NSS_DEBUG IS_ENABLED
+					DBG_DebugPrintf(LRED,"4) Zlss pool version is now %ld (long value)\n", zlssPool->ZP_Version );
+#endif
+				}
+				/* Not needed because the memcpy above sets the LSN
+				 * to the correct value
+				 */
+/*				ZLOG_SET_LSN(logBuffer, block->ZLPB_zlssPool.LPZP_lsn, poolBlks[superblock], action); */
+				CACHE_DIRTY_RELEASE(buffer[superblock]);
+			}
+		}
+	}
+	RTN_STATUS(zOK);
+
+} /* End of ZLSSPOOL_RecoverySuperblockHeader() */
diff --git a/src/nwnss/zlss/zfsVol.c b/src/nwnss/zlss/zfsVol.c
new file mode 100644
index 0000000..5dc4add
--- /dev/null
+++ b/src/nwnss/zlss/zfsVol.c
@@ -0,0 +1,1979 @@
+/****************************************************************************
+ |
+ | (C) Copyright 1995-2003 Novell, Inc.
+ | All Rights Reserved.
+ |
+ | This program is free software; you can redistribute it and/or
+ | modify it under the terms of version 2 of the GNU General Public
+ | License as published by the Free Software Foundation.
+ |
+ | This program is distributed in the hope that it will be useful,
+ | but WITHOUT ANY WARRANTY; without even the implied warranty of
+ | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ | GNU General Public License for more details.
+ |
+ | You should have received a copy of the GNU General Public License
+ | along with this program; if not, contact Novell, Inc.
+ |
+ | To contact Novell about this file by physical or electronic mail,
+ | you may find current contact information at www.novell.com
+ |
+ |***************************************************************************
+ |
+ | NetWare Advance File Services (NSS) module
+ |
+ |---------------------------------------------------------------------------
+ |
+ | $Author: cteerlink $
+ | $Date: 2008-04-18 03:39:54 +0530 (Fri, 18 Apr 2008) $
+ |
+ | $RCSfile$
+ | $Revision: 2319 $
+ |
+ |---------------------------------------------------------------------------
+ | This module is used to:
+ | ZFS_Classes: Register all Beast classes used by ZFS.
+ +-------------------------------------------------------------------------*/
+#include /* NSS Library */
+#include
+#include
+#include
+#include
+#include
+
+#include "zfs.h"
+#include "zParams.h"
+#include "comnPublics.h"
+#include "beastTree.h"
+#include "nameSpace.h"
+#include "comnBeastClass.h"
+#include "comnAuthorize.h"
+#include "msgIO.h"
+#include "dfsIO.h"
+#include "zfsdefs.h"
+#include "xaction.h"
+#include "purgeLog.h"
+#include "xAdminVolume.h"
+#include "zlog.h"
+#include "zlssStartup.h"
+#include "purgeTree_if.h"
+#include "userTree.h"
+#include "zlssLogicalVolume.h"
+#include "dirTree.h"
+
+#include "uxaction.h"
+#include "fsm.h"
+
+#include "mfl_if.h"
+#include "eflTree.h"
+#include "zlssUpgrade.h"
+
+void ZFSVOL_UnLoadSystemBeasts(ZfsVolume_s *vol);
+
+/* After we make the volume's read ahead number changeable, we should
+ * move it somewhere else
+ */
+
+/****************************************************************************
+ * ZFS volume beast constructor
+ *
+ * Initializes the non-persistent fields of a ZfsVolume_s passed in as
+ * vol_LX.  genMsg is not referenced.  The volume's pool pointer is NOT
+ * set here (see the comment near the end of the routine).
+ * Always returns zOK.
+ *****************************************************************************/
+STATIC STATUS ZFSVOL_BST_Construct(
+	GeneralMsg_s *genMsg,
+	void *vol_LX)
+{
+	ZfsVolume_s *vol = (ZfsVolume_s *)vol_LX;
+
+	ASSERT_MPKNSS_LOCK();
+	vol->ZFSVOLvol.VOLsupportedAttributes = ZLSS_SUPPORTED_ATTRIBUTES;
+	vol->ZFSVOLvol.VOLmaximumFileSize = UI64_CONST(0xFFFFFFFFFFFFFFFF);
+
+	/* The persistent volume layouts are expected to be exactly 128 bytes */
+	zASSERT( sizeof( PersistentZfsVolume_s ) == 128 );
+	zASSERT( sizeof( LoggedPersistentZfsVolume_s ) == 128 );
+
+	/**
+	 * We set up many of ZLSS volume's non-persistent items here.
+	 */
+	vol->ZFSVOLmaxBeastSize = MAX_BEAST_SIZE;
+	vol->ZFSVOLfreeBlockAdjustment = ZFS_RESERVED_BLOCKS_FOR_INTERNAL_ALLOCATION;
+
+	/* NOTE(review): these two fields were already set to the same
+	 * values at the top of this routine - the repeat looks redundant.
+	 */
+	vol->ZFSVOLvol.VOLsupportedAttributes = ZLSS_SUPPORTED_ATTRIBUTES;
+	vol->ZFSVOLvol.VOLmaximumFileSize = UI64_CONST(0xFFFFFFFFFFFFFFFF);
+
+	vol->ZFSVOLvol.readAheadBlocks = ZLSS_DEFAULT_READ_AHEAD;
+	vol->ZFSVOLroot.beastVersion = BEAST_VERSION_2;
+
+	/* Mark as a Logical Volume - If this is the pool's internal
+	 * volume then the ZfsPool_s constructor will turn this bit off
+	 */
+	vol->ZFSVOLvol.v_statusFlag |= VOL_SF_LOGICAL_VOLUME;
+	DQ_INIT( &vol->ZV_deleteBlkQ );
+	INIT_LATCH( &vol->ZV_vdbLatch );
+	vol->unusableFreeBlkCnt = 0;
+	vol->ZV_stopPurge = FALSE;
+
+/*
+ * This line should be here, but we do not know which pool we are in.
+ * Therefore this is actually done in ZLSSVOL_LV_Load or ZLSSVOL_LV_Create
+ * right after the ZLSS volume has been NEWed. The ZfsPool_s constructor
+ * sets it for ZLSS POOLs.
+ *
+ *	vol->ZFSVOLpool = pool;
+ */
+
+#if NSS_DEBUG IS_ENABLED
+	/* NOTE(review): vol is a pointer and offsetof() yields a size_t, but
+	 * the format uses %lx and %x - confirm this is safe on every target.
+	 */
+	DBG_DebugPrintf(LRED,"%s zfsvol address is %lx inUse is at offset %x\n",WHERE,vol, offsetof( RootBeast_s, useCount) );
+#endif
+	return zOK;
+}
+
+/****************************************************************************
+ * ZFS vol beast destructor
+ *
+ * Releases the volume's beast tree and name tree and drops the volume's
+ * reference on its pool.
+ *****************************************************************************/
+STATIC void ZFSVOL_BST_Destruct(
+	void *vol_LX)
+{
+	ZfsVolume_s *vol = (ZfsVolume_s *)vol_LX;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZVOL, ZFSVOL_Destruct);
+
+	BST_releaseAndFree(vol->beastTree);
+	BST_releaseAndFree(vol->nameTree);
+	COMN_Release( &vol->ZLSSVOLv_pool );
+
+	RTN_VOID();
+}
+
+/**************************************************************************
+ * Activate a ZLSS volume.
+ *
+ * Copies the owning pool's restart count into the volume and then loads
+ * the volume's system beasts.  The mode parameter is not referenced.
+ * Returns whatever ZFSVOL_LoadSystemBeasts returns.
+ ***************************************************************************/
+STATUS ZFSVOL_Activate(
+	GeneralMsg_s *genMsg,
+	ZfsVolume_s *zfsVol,
+	NINT mode)
+{
+
+	ASSERT_MPKNSS_LOCK();
+	/* The next statement is located in the load of the Volume's
+	 * persistent data blocks (VDB and LVDB). This was done because
+	 * rebuild does not activate a volume.
+	 */
+// zfsVol->zv_bookedInUseBlocks = zfsVol->ZFSVOLinUseBlocks;
+
+	DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSVOL_Activate\n")));
+
+// This is now done the first time we read persistent volume data, because
+// purge log entries can be generated before this. eg AIPU and logical undo
+// zfsVol->ZFSVOLactivationCount += 1;
+// /**
+// * We must write the volume data information before playing
+// * the purge log because if we crash we do not want to
+// * mistakenly play an old purge item. Also if our activation
+// * count does not get written then we will re-use the same one
+// * if we crash. This would cause us to not play PL entries
+// * that should be played. See the purge log play code for
+// * information about how it uses the activation count.
+// */
+// ZFSVOL_WritePersistentVolumeData(zfsVol);
+
+	/* Trace the counts before the restart count is refreshed */
+	DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,
+			MSGNot("%s 0x%lx Activation count is %d restart count id %d\n"),
+			WHERE, zfsVol, zfsVol->ZFSVOLactivationCount,
+			zfsVol->ZFSVOLvol.v_restartCount ));
+	/**
+	 * Since Internal volume's restart count is changed each time
+	 * the internal volume is activated we need to update the
+	 * LV restart count when the LV is activated (versus loaded).
+	 */
+	zfsVol->ZLSSVOLvol.v_restartCount = zfsVol->ZFSVOLpool->ZFSPOOLvol.v_restartCount;
+	/* Same trace again, now showing the refreshed restart count */
+	DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,
+			MSGNot("%s 0x%lx Activation count is %d restart count id %d\n"),
+			WHERE, zfsVol, zfsVol->ZFSVOLactivationCount,
+			zfsVol->ZFSVOLvol.v_restartCount ));
+	zASSERT( zfsVol->ZFSVOLvol.v_restartCount != 0 );
+	return ( ZFSVOL_LoadSystemBeasts( genMsg, zfsVol ) );
+}
+
+
+/**************************************************************************
+ *
+ * In a ZLSS Volume there is no difference between deactivate and going
+ * from ACTIVATE to MAINTENANCE.  Therefore this routine is called
+ * in both cases.
+ *
+ * The flush and checkpoint calls run against a private dummyGenMsg; the
+ * first error seen is copied into genMsg (only while genMsg still holds
+ * zOK) and deactivation carries on regardless.  The mode parameter is
+ * not referenced.  Always returns zOK.
+ ***************************************************************************/
+STATUS ZFSVOL_Deactivate(
+	GeneralMsg_s *genMsg,
+	ZfsVolume_s *zfsVol,
+	NINT mode)
+{
+	GeneralMsg_s dummyGenMsg;
+	NINT i;
+
+	ASSERT_MPKNSS_LOCK();
+	COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg );
+
+	zASSERT( zfsVol->zv_bookedInUseBlocks == zfsVol->ZFSVOLinUseBlocks);
+	DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,
+			(CYAN,MSGNot("CVS@ZFSVOL_Deactivate\n")));
+	/* The *10 (i.e. 40) is to ENSURE we take enough checkpoints to clear
+	 * the deferred delete queue.
+	 * LV - When LVs were added I increased the loop to write up to
+	 *		120 checkpoints.  This with the delay I added in the
+	 *		loop causes us to take checkpoints for up to a minute before
+	 *		giving up.  With LVs home may not be moving because of other LVs.
+	 *		We only flush our system beasts and not everyone's.  So
+	 *		we better wait a little while.  Note if we have nothing on
+	 *		our delete queue then we do not do a delay.
+	 */
+	for ( i=0; i < 120; i++ )
+	{
+/*-------------------------------------------------------------------------
+ *	Write all of the SYSTEM beasts
+ *-------------------------------------------------------------------------*/
+		if ( ZFSVOL_DoFlushSystemBeasts( &dummyGenMsg, zfsVol, FALSE ) != zOK )
+		{
+			errPrintf(WHERE, Module, 1423,
+					MSG("Unable to flush system beasts, status=%d. Run Verify.", 425),
+					GetErrno(&dummyGenMsg));
+			/* Keep only the first error for the caller */
+			if ( GetErrno( genMsg ) == zOK )
+			{
+				SetErrno( genMsg, GetErrno(&dummyGenMsg) );
+			}
+			ClearErrno( &dummyGenMsg );
+			/* continue with the deactivation */
+		}
+		/*
+		 * This code prevents us from taking extra checkpoints when
+		 * trying to clear the deferred delete queue.  Here we check
+		 * that there are no more items on the deferred delete queue.
+		 *
+		 * We must clear the deferred delete queue so that we can take
+		 * one last checkpoint that will be clean.  If we do not clear
+		 * the deferred delete queue 1st then after we take the last
+		 * checkpoint more log records may be generated by blocks
+		 * that are now deletable.
+		 *
+		 * The QUEUE is protected by NetWare being non-pre-emptive.
+		 *
+		 */
+		if (DQ_EMPTY(&zfsVol->ZV_deleteBlkQ))
+		{	/* Nothing on queue */
+			break;
+		}
+		LB_delay( 500 ); /* 1/2 second */
+		DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,
+				MSGNot("Doing %d checkpoint in ZFSVOL_Deactivate because ZV_deleteBlkQ is not empty\n"),
+				i+1));
+		if ( ZFSPOOL_CheckpointTake( &dummyGenMsg, ZLSS_VOLUME_TO_ZFS_POOL( zfsVol ),
+				CHECKPOINT_CT_S_DEFERED_DELETE|CHECKPOINT_CT_S_NORMAL) != zOK )
+		{
+			errPrintf(WHERE, Module, 1424,
+					MSG("Error flushing checkpoint, status=%d. Run Verify.", 436),
+					GetErrno(&dummyGenMsg));
+			/* continue with the deactivation */
+			if ( GetErrno( genMsg ) == zOK )
+			{
+				SetErrno( genMsg, GetErrno(&dummyGenMsg) );
+			}
+			ClearErrno( &dummyGenMsg );
+		}
+	}
+	/* Volumes do not have the pool problem of worst case Free B-Tree
+	 * cascading because the Free B-Tree is owned by the Pool.  See
+	 * comments in ZFSPOOL_Deactivate.  If this is WRONG then we will have
+	 * a purge log entry and be forced to do REDO/UNDO and/or replay
+	 * purge log WHICH in the case of LVs cause a system block to be
+	 * freed too soon!!!
+	 */
+#if NSS_DEBUG IS_ENABLED
+	{
+		ZfsPool_s *zfsPool;
+
+		zfsPool = ZLSS_VOLUME_TO_ZFS_POOL( zfsVol );
+		zASSERT( zfsPool != NULL );
+		/* We skip ASSERTS if in DISABLE mode because they will occur */
+		/* NOTE(review): as written the asserts run unless BOTH the pool
+		 * and the volume have VOL_IOF_DISABLE set; if they should be
+		 * skipped when EITHER is disabled this needs && - confirm.
+		 * NOTE(review): the loop above allows 120 passes but the first
+		 * assert trips at CHECKPOINT_NUMBER*10 - confirm that is intended.
+		 */
+		if ( !(zfsPool->ZFSPOOLvol.v_ioFlag & VOL_IOF_DISABLE) ||
+			 !(zfsVol->ZFSVOLvol.v_ioFlag & VOL_IOF_DISABLE) )
+		{
+			zASSERT( i < CHECKPOINT_NUMBER*10 );
+			zASSERT( DQ_EMPTY(&zfsVol->ZV_deleteBlkQ) );
+		}
+	}
+#endif
+	/**
+	 * We need to clean up deleted system blocks that have
+	 * not yet been physically freed.  The XDEL_ call will
+	 * toss the list of blocks, but the purge log is aware of them
+	 * so they will be freed on our next activation when the LV's
+	 * purge log is played.  Tossing the list here leaves us open
+	 * to freeing a block during a quick activation that should
+	 * not be physically freed(yet).  This is highly unlikely with
+	 * the checkpoint code above, but if we do manage to free and
+	 * then allocate and then crash the result could be a corruption
+	 * of the media.
+	 */
+	XDEL_DeactivateVolume(zfsVol);
+
+	/* Don't do tosses until after we are done with all the activity on
+	 * the system beasts, because cacheToss has a BUG where it
+	 * removes the blocks from the cache HASH even if they still need
+	 * to be flushed.
+	 */
+	(void)ZFSVOL_DoFlushSystemBeasts( &dummyGenMsg, zfsVol, TRUE );
+
+	ZFSVOL_UnLoadSystemBeasts( zfsVol );
+
+	return( zOK );
+}
+
+/**************************************************************************
+ * This is called to load ZLSS volume specific beasts.
+ *
+ * Currently this means to load the root dir.  When we have logical
+ * volumes we will have to load the logical volumes NAME and BEAST
+ * B-Trees.
+ *
+ * Returns zFAILURE when ZLSSVOL_LoadSystemBeasts fails or the root
+ * directory cannot be read, otherwise zOK.
+ ***************************************************************************/
+STATIC STATUS ZFSVOL_LoadSystemBeasts(
+	GeneralMsg_s *genMsg,
+	ZfsVolume_s *vol)
+{
+	File_s *rootdir;
+	typedef struct Stack_s {
+		unicode_t volname[zMAX_COMPONENT_NAME];
+	} Stack_s;
+	STACK_ALLOC();
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZVOL, ZFSVOL_LoadSystemBeasts);
+	DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSVOL_LoadSystemBeasts\n")));
+	/* Volumes that are not derived from the ZFSPOOL class load their
+	 * own system beasts first.
+	 */
+	if ( !COMN_IsDerivedFrom(vol, zFTYPE_ZLSS_ZFSPOOL) )
+	{
+		STATUS status;
+		status = ZLSSVOL_LoadSystemBeasts( genMsg, vol );
+		if ( status != zOK )
+		{
+			STACK_FREE();
+			RTN_STATUS(zFAILURE);
+		}
+	}
+
+	zASSERT(vol->beastTree != NULL);
+	zASSERT(vol->nameTree != NULL);
+	zASSERT(vol->pool->zfsLogBeast != NULL);
+
+	/* Set up quick pointer to ZLOG Beast */
+	vol->zv_zfsLogBeast = vol->pool->zfsLogBeast; // FixFixFix6 - Why not in constructor??
+
+#if !VOL_DOES_ROOTDIR
+/*-------------------------------------------------------------------------
+ *	Load the ROOT directory
+ *-------------------------------------------------------------------------*/
+	zASSERT( vol->ZFSVOLrootdir == NULL );
+	/* The zVOL_PSTATE_CREATED check was added when LV delete
+	 * was implemented.  During LV delete (and create) the
+	 * rootdir may not exist.  This is O.K. so we just
+	 * never load in those cases.
+	 */
+	/* The if is just defensive code.  The ROOTDIR should never
+	 * be loaded when this routine is called.  But less bad things
+	 * will happen if we check and do not re-load.
+	 */
+	if ( (vol->ZFSVOLvol.VOLpState == zVOL_PSTATE_CREATED) &&
+		 (vol->ZFSVOLrootdir == NULL) )
+	{
+		rootdir = BST_read(genMsg,zROOTDIR_ZID,&vol->ZFSVOLvol);
+		if (rootdir == NULL)
+		{
+			/* Fetch the volume name (under the beast latch) for the
+			 * error message.
+			 */
+			X_LATCH(&vol->ZFSVOLbeastLatch);
+			COMN_GetNameFromBeast(genMsg, vol,
+/* cnt				vol->ZFSVOLvol.VOLfirstParentNameUniquifier, */
+					zNSPACE_LONG, NELEMS(aStack->volname),aStack->volname,NULL);
+			UNX_LATCH(&vol->ZFSVOLbeastLatch);
+			errPrintf(WHERE, Module, 1448,
+					MSG("Unable to read ROOTDIR from the "
+					"volume \"%U\", status=%d.\n"
+					"You may be out of memory. Run Verify.", 439),
+					aStack->volname,GetErrno(genMsg));
+			STACK_FREE();
+			RTN_STATUS(zFAILURE);
+		}
+		else
+		{
+			vol->ZFSVOLrootdir = rootdir;
+			BEASTHASH_Insert(&rootdir->FILEroot);
+			/* We do not store any system beasts on volLink */
+			DQ_RMV(&rootdir->FILEroot,volLink);
+		}
+	}
+#endif
+
+#if NSS_DEBUG IS_ENABLED
+	{
+		/* Debug only: show the volume's persistent UTC time stamps */
+		char buffer[40];
+		DBG_ScreenAPrintf( "ZLSS.Greg.Volume.FYI", WHERE, CYAN,MSGNot(" ** (DEBUG) Create UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_createTimeUTC,&buffer[0]) );
+		DBG_ScreenAPrintf( "ZLSS.Greg.Volume.FYI", WHERE, CYAN,MSGNot(" ** (DEBUG) Last Activate UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_activationTimeUTC,&buffer[0]) );
+		DBG_ScreenAPrintf( "ZLSS.Greg.Volume.FYI", WHERE, CYAN,MSGNot(" ** (DEBUG) Last verify UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_verifyTimeUTC,&buffer[0]) );
+		DBG_ScreenAPrintf( "ZLSS.Greg.Volume.FYI", WHERE, CYAN,MSGNot(" ** (DEBUG) Last rebuild UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_rebuildTimeUTC,&buffer[0]) );
+
+		DBG_DebugPrintf(LGREEN,MSGNot("Create UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_createTimeUTC,&buffer[0]) );
+		DBG_DebugPrintf(LGREEN,MSGNot("Last Activate UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_activationTimeUTC,&buffer[0]) );
+		DBG_DebugPrintf(LGREEN,MSGNot("Last verify UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_verifyTimeUTC,&buffer[0]) );
+		DBG_DebugPrintf(LGREEN,MSGNot("Last rebuild UTC time %s\n"),
+				UTCTime2Str(vol->ZFSVOLvol.p.PV_rebuildTimeUTC,&buffer[0]) );
+	}
+#endif
+
+	STACK_FREE();
+	RTN_STATUS(zOK);
+}
+
+/**************************************************************************
+ * This is called to unload ZFS volume specific beasts.
+ *
+ * Currently this means to unload the root dir.  When we have logical
+ * volumes we MAY have to unload the logical volumes NAME and BEAST
+ * B-Trees.
+ *
+ * Note this routine can not assume that it is being called when
+ * the volume is ACTIVE.  We are sometimes called to UNDO a failed
+ * attempt to go to the ACTIVE state.  This means that we can not
+ * assume that our beasts are loaded.
+ *
+ * This routine must succeed.
+ ***************************************************************************/
+
+void ZFSVOL_UnLoadSystemBeasts(
+	ZfsVolume_s *vol)
+{
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZVOL, ZFSVOL_UnLoadSystemBeasts);
+	DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSVOL_UnLoadSystemBeasts\n") ));
+
+#if !VOL_DOES_ROOTDIR
+	/*
+	 * Even though our load inserted into the beast hash we do
+	 * not need to remove from hash, because the root destructor does.
+	 */
+/*-------------------------------------------------------------------------
+ *	UnLoad the ROOT directory if it is loaded.
+ *-------------------------------------------------------------------------*/
+	if (vol->ZFSVOLrootdir != NULL)
+	{
+		BST_releaseAndFree( vol->ZFSVOLrootdir );
+		vol->ZFSVOLrootdir = NULL;
+	}
+#endif
+
+	/* Volumes not derived from the ZFSPOOL class unload their own beasts */
+	if ( !COMN_IsDerivedFrom(vol, zFTYPE_ZLSS_ZFSPOOL) )
+	{
+		ZLSSVOL_UnloadSystemBeasts( vol );
+	}
+	RTN_VOID();
+
+}
+
+
+/**************************************************************************
+ * this is called to bring a VOLUME online, the MODE parameter defines
+ * different level of ONLINE
+ *
+ * The parent class's VOL_changeVolumeState handler is run first; only if
+ * it returns zOK is the ZLSS specific work below performed.  That work
+ * is keyed on (requestedState, sourceState, pass):
+ *	to ACTIVE from DEACTIVE or MAINTENANCE
+ *		pass ZLSSVOL_ACTIVATE - for volumes not derived from the
+ *			ZFSPOOL class, load the persistent pool data and refuse
+ *			corrupt/repairing volumes unless VOLMODE_OVERRIDE is set;
+ *			then call ZFSVOL_Activate.
+ *		pass ZLSSVOL_LAST - check zid 0, start the Nakoma upgrade if
+ *			the volume is ready, initialize data shredding when
+ *			enabled and play the volume purge log.
+ *	to DEACTIVE from ACTIVE, pass ZLSSVOL_DEACTIVATE
+ *		call ZFSVOL_Deactivate.
+ *	to MAINTENANCE from ACTIVE, pass ZLSSVOL_ACTIVE_TO_MAINTENANCE
+ *		call ZFSVOL_Deactivate.
+ * Every other combination does nothing beyond the parent call.
+ ***************************************************************************/
+STATIC STATUS ZFSVOL_VOL_ChangeVolumeState(
+	GeneralMsg_s *genMsg,
+	void *vol_LX,
+	NINT sourceState, /* This is normally vol->state, but when we
+						 * are doing VOLMODE_UNDO then this is the
+						 * state that we had previously tried to
+						 * go to.
+						 */
+	NINT requestedState,
+	NINT mode,
+	NINT pass )
+{
+	ZfsVolume_s *vol = (ZfsVolume_s *)vol_LX;
+	STATUS status;
+	statusfunc_t parentFunc;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZVOL, ZFSVOL_ChangeVolumeState);
+
+/*-------------------------------------------------------------------------
+ *	Since we know we need to call the Volume_s code, get the address of
+ *	that routine.
+ *-------------------------------------------------------------------------*/
+	parentFunc = COMN_GetNextParentVolumeComnOp( vol->ZFSVOLbeastClass,
+			COMNVOLOPS_INDEX(VOL_changeVolumeState),
+			ZFSVOL_VOL_ChangeVolumeState );
+	zASSERT(parentFunc != NULL);
+	status = parentFunc( genMsg, &vol->vol, sourceState,
+			requestedState, mode, pass );
+	if ( status != zOK )
+	{
+		/* NOTE(review): this and one later exit use a plain return()
+		 * although ENTER() was used above and the other exits use
+		 * RTN_STATUS() - confirm this is intended.
+		 */
+		return( status );
+	}
+
+	switch (requestedState)
+	{
+
+/*=========================================================================*/
+	case zVOLSTATE_ACTIVE: /* Destination State */
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE: /* Source State */
+			break;
+		/*
+		 * The DEACTIVE and MAINTENANCE code should
+		 * normally be run as VOLMODE_PRE, but we still have
+		 * the ZFSPOOL load the NAME and BEAST tree.  If
+		 * we change this then we can go back to being PRE
+		 * functions.
+		 */
+		case zVOLSTATE_DEACTIVE: /* Source State */
+		case zVOLSTATE_MAINTENANCE: /* Source State */
+			// This is done by Volume_s now so that ALL volumes have
+// if ( pass == ZLSSVOL_ACTIVATE_TIME )
+// {
+// DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(LRED,
+// MSGNot("(DEBUG)Setting the activation time field\n")));
+// vol->ZFSVOLvol.p.PV_activationTimeUTC = GetUTCTime();
+// }
+			if ( pass == ZLSSVOL_ACTIVATE )
+			{
+				if ( !COMN_IsDerivedFrom(vol, zFTYPE_ZLSS_ZFSPOOL) )
+				{
+					status = ZLSSVOL_LoadPersistentPool( genMsg, vol, mode, 0 );
+					if ( status != zOK )
+					{
+						RTN_STATUS(zFAILURE);
+					}
+					/** Same code is in POOL CVS make this a function **/
+					/* If we are not in override mode then we need
+					 * to fail going to ACTIVE state if the volume
+					 * is corrupt or was being repaired/checked.  We
+					 * have to wait until now to check because the
+					 * corrupt and repairing bits are stored in the
+					 * persistent volume.  Maybe Volume_s should do
+					 * the check.
+					 */
+					if ( !(mode & VOLMODE_OVERRIDE) )
+					{
+						if ( (vol->ZLSSVOLvol.p.stateAttributes & VOLSTATEATTR_CORRUPT) ||
+							 (vol->ZLSSVOLvol.p.stateAttributes & VOLSTATEATTR_REPAIRING) )
+						{
+							DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,
+									MSGNot("Change state converted to a request from %d to %d based on stateAtrribute of 0x%lx\n"),
+									sourceState, zVOLSTATE_MAINTENANCE, vol->ZLSSVOLvol.p.stateAttributes ));
+							if ( sourceState == zVOLSTATE_DEACTIVE )
+							{
+								if ( mode & VOLMODE_VERBOSE )
+								{
+									if ( vol->ZLSSVOLvol.p.stateAttributes & VOLSTATEATTR_REPAIRING )
+									{
+										aprintf(LRED,MSG(" ** Volume was being rebuilt so being placed into MAINTENANCE state\n", 868) );
+									}
+									else
+									{
+										aprintf(LRED,MSG(" ** Volume has corruption so being placed into MAINTENANCE state\n", 869) );
+									}
+								}
+								SetErrno(genMsg,zERR_VOLUME_STATE_CHANGE_A_TO_M);
+							}
+							else
+							{	/* Since we are corrupt or were repairing and
+								 * we have not been told to OVERRIDE then we
+								 * will not do this activation request.
+								 */
+								SetErrno(genMsg,zERR_VOLUME_SHOULD_NOT_ACTIVATE);
+							}
+							return( zFAILURE );
+						}
+					}
+				}
+				status = ZFSVOL_Activate( genMsg, vol, mode );
+			}
+			if ( pass == ZLSSVOL_LAST )
+			{
+				ZFSVOL_CheckZeroZid(&vol->ZFSVOLvol); /* make sure the beast tree has zid 0 */
+//#ifndef __linux__ // LINUX_Upgrade
+				if ( ZLSS_AIPUNakomaVolume_Ready( vol ) )
+				{	/* Volume is ready to be upgraded so ensure that
+					 * the upgrade thread is running.
+					 */
+					ZLSS_UpgradeStartupNakoma( );
+				}
+//#endif
+
+				/* After the system is up, now process the Volume Purge Log. */
+				if ( mode & VOLMODE_VERBOSE )
+				{
+					aprintf(NSS_POOL_COLOR,MSG(" ** Processing volume purge log\n", 870));
+				}
+				if (DATA_SHREDDING_ENABLED(vol))
+				{
+					/* Any DataShredInit failure is reported as out of memory */
+					if (DataShredInit(vol->ZFSVOLshredCount) != zOK)
+					{
+						SetErrno(genMsg, zERR_NO_MEMORY);
+						RTN_STATUS(zFAILURE);
+					}
+				}
+				status = ZFSVOL_PlayPurgeLog(genMsg, vol,
+						vol->ZFSVOLvolumePurgeLog, mode);
+				if ( status != zOK )
+				{
+					RTN_STATUS(zFAILURE);
+				}
+			}
+
+			break;
+		case zVOLSTATE_UNKNOWN: /* Source State */
+			break;
+		default:
+			break;
+		}
+		break;
+
+/*=========================================================================*/
+	case zVOLSTATE_DEACTIVE: /*** Destination State ***/
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE: /*** Source State ***/
+			if ( pass == ZLSSVOL_DEACTIVATE )
+			{
+				status = ZFSVOL_Deactivate( genMsg, vol, mode );
+			}
+			break;
+
+		case zVOLSTATE_DEACTIVE: /*** Source State ***/
+		case zVOLSTATE_MAINTENANCE: /*** Source State ***/
+		case zVOLSTATE_UNKNOWN: /*** Source State ***/
+		default:
+			break;
+		}
+		break;
+/*=========================================================================*/
+	case zVOLSTATE_MAINTENANCE: /*** Destination State ***/
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE: /*** Source State ***/
+			if ( pass == ZLSSVOL_ACTIVE_TO_MAINTENANCE )
+			{
+				status = ZFSVOL_Deactivate( genMsg, vol, mode );
+			}
+			break;
+
+		case zVOLSTATE_DEACTIVE: /*** Source State ***/
+		case zVOLSTATE_MAINTENANCE: /*** Source State ***/
+		case zVOLSTATE_UNKNOWN: /*** Source State ***/
+		default:
+			break;
+		}
+		break;
+/*=========================================================================*/
+	case zVOLSTATE_UNKNOWN:
+		switch( sourceState )
+		{
+		case zVOLSTATE_ACTIVE:
+		case zVOLSTATE_DEACTIVE:
+		case zVOLSTATE_MAINTENANCE:
+		case zVOLSTATE_UNKNOWN:
+		default:
+			break;
+		}
+		break;
+	}
+	RTN_STATUS( status );
+
+}
+
+
+/************************************************************************
+ * Allocate and init the storage info structure.
Also set the blk size + * shift for the file. + *************************************************************************/ +STATIC STATUS ZFSVOL_VOL_AllocStorageInfo( + GeneralMsg_s *genMsg, + RootBeast_s *beast) +{ + ZfsVolume_s *zvol = beast->vol.zfsVol; + ZFSStorageInfo_s *stInfo; + statusfunc_t parentFunc; + STATUS status; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZVOL, ZFSVOL_AllocStorageInfo); + + stInfo = (ZFSStorageInfo_s *)zalloc(sizeof(ZFSStorageInfo_s)); + if (stInfo == NULL) + { + SetErrno(genMsg,zERR_NO_MEMORY); + RTN_STATUS(zFAILURE); + } + beast->storage.zfsInfo = stInfo; + stInfo->rebuildMagic = REBUILD_FILE_IS_GOOD; + + parentFunc = COMN_GetNextParentVolumeComnOp(zvol->ZFSVOLbeastClass, + COMNVOLOPS_INDEX(VOL_allocStorageInfo),ZFSVOL_VOL_AllocStorageInfo); + + status = parentFunc(genMsg,beast); + + zASSERT(beast->blkSizeShift == ZFS_BLOCK_SHIFT); + RTN_STATUS(status); +} + +/************************************************************************ + * Get the storage info structure in a generic way. + *************************************************************************/ +STATIC STATUS ZFSVOL_VOL_GetStorageInfo( + GeneralMsg_s *genMsg, + RootBeast_s *beast, + GetStorageInfo_s *getStorageInfo) +{ + ZFSStorageInfo_s *stInfo = beast->storage.zfsInfo; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZVOL, ZFSVOL_GetStorageInfo); + + getStorageInfo->physicalEOF = + (((QUAD)stInfo->nextBlk) << beast->ROOTblkSizeShift); + getStorageInfo->filePhysSize = + (((QUAD)stInfo->fmapDataBlks) << beast->ROOTblkSizeShift); + getStorageInfo->metaDataPhysSize = + (((QUAD)stInfo->fmapTreeBlks) << beast->ROOTblkSizeShift); + + RTN_STATUS(zOK); +} + +/**************************************************************************** + * This will allocate and return the next range of ZIDs for this volume. 
+ * The return parameter is count of how many we got + *****************************************************************************/ +STATIC NINT ZFSVOL_VOL_AllocateZIDs( + GeneralMsg_s *genMsg, + Volume_s *vol_LX, + NINT howMany, + Zid_t *retZID, + Xaction_s *xaction) +{ + ZfsVolume_s *vol = (ZfsVolume_s *)vol_LX; + VolInfoLog_s volLog; + GeneralMsg_s dummyGenMsg; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZVOL, ZFSVOL_AllocateZID); + + ASSERT_XLATCH(&vol->ZFSVOLbeastLatch); + zASSERT(!(vol->ZFSVOLenabledAttributes & zATTR_READONLY)); + zASSERT((howMany > 0) && (howMany <= 4096)); + if (howMany > 0) + { + *retZID = vol->ZFSVOLnextZid; + if (*retZID >= UI64_CONST(0xFFFFFFFF)) + { + SetErrno(genMsg, zERR_ZID_GREATER_THAN_32BITS); + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + COMN_VolumeAlert(&dummyGenMsg, NULL, &vol->vol, NULL, 0, 0, + GetErrno(genMsg), GetErrnoSetter(genMsg), + CVA_VOLUME_ALERT | CVA_REZID_NEEDED); + RTN_NINT(0); + } + if ((*retZID+howMany) >= UI64_CONST(0xFFFFFFFF)) + { + howMany = UI64_CONST(0xFFFFFFFF) - *retZID; + } + vol->ZFSVOLnextZid += howMany; + volLog.delta = howMany; + + DEBUG_PRINTF(TZVOL, DBG_INDENT,(YELLOW, MSGNot("ZFSVOL_AllocateZID: retZID=%d nextZid=%d\n"), + (LONG)*retZID, vol->ZFSVOLnextZid)); + + volLog.action = VOLINFO_CHANGE_NEXT_ZID; + /* Store off actual value for debugging */ + volLog.VIL_value = vol->ZFSVOLnextZid; + ZFSVOL_VOL_WriteVolumeLoggedData(&vol->vol, xaction, &volLog); + } + RTN_NINT(howMany); +} + + +/* ZLSS_VolumeIDLookupRecovery() - + * This function is used to locate a ZLSS volume by its internal GUID and + * lock the volume into memory(increases inUse count) at recovery time. + * If this function succeeds, the volume must be released from memory + * by calling COMN_Release. + * + * Internal volumes are included on the list of volumes to match. + * Volumes not active will also be found. 
+ * Internal Volumes include the pool's volume and deleted + * volumes (they are volumes that have been marked as internal so that the + * end user does not see them). + * + * The caller owns the cvsLatch on the poolVolume. + * This routine must only be called during recovery. + */ + +ZfsVolume_s *ZLSS_VolumeIDLookupRecovery( + GeneralMsg_s *genMsg, + GUID_t *internalID, + ZfsPool_s *poolVolume ) /* Pool's internal volume */ +{ + ZfsVolume_s *zVolume; + Volume_s *volume; + Pool_s *pool; + + zASSERT( LB_GUIDValidate( internalID ) ); + zASSERT( poolVolume != NULL ); + pool = poolVolume->ZFSPOOLvol.v_pool; + zASSERT( pool != NULL ); + /* + * Go through the link list of LVs that this pool + * owns and find the correct volume. + */ + POOL_FOR_EACH_LOADED_VOLUME( pool, volume ) + { + zVolume = (ZfsVolume_s *)volume; + zASSERT( LB_GUIDValidate( &zVolume->ZLSSVOLinternalID ) ); + if ( LB_GUIDCompare( &zVolume->ZLSSVOLinternalID, internalID ) == 0 ) + { /* This is the correct volume */ + COMN_USE_BEAST( &volume->VOLroot ); + return( zVolume ); + } + } + /* Did not match a Logical Volume, so check internal volume */ + zASSERT( LB_GUIDValidate( &poolVolume->ZFSPOOLinternalID ) ); + if ( LB_GUIDCompare( &poolVolume->ZFSPOOLinternalID, internalID ) == 0 ) + { /* This is the correct volume */ + volume = &poolVolume->ZFSPOOLvol; + zVolume = (ZfsVolume_s *)volume; + COMN_USE_BEAST( &volume->VOLroot ); + return( zVolume ); + } + SetErrno( genMsg, zERR_VOLUME_NOT_FOUND ); + return( NULL ); + +} /* End of ZLSS_VolumeIDLookupRecovery() */ + + +/* ZLSS_VolumeIDLookup() - + * This function is used to locate a ZLSS volume by its internal GUID and + * lock the volume into memory(increases inUse count). If this function + * succeeds, the volume must be released from memory by calling COMN_Release. + * + * Notes - + * ZLSS volumes on different pools can have the same internal ID. This + * comes about because of 'breaking a mirror'. This is why the ZfsPool_s + * is a required parameter. 
+ * Internal volumes are inlcuded on the list of volumes to match + * against only if genMsg has had the COMN_INCLUDE_INTERNAL_VOLUMES macro + * applied to it. Internal Volumes include the pool's volume and deleted + * volumes (they are volumes that have been marked as internal so that the + * end user does not see them). + * + * Warnings - + * The caller must not own the cvsLatch on the poolVolume. cvsLatch + * requirement removed June 2001 because the only caller at the time sometimes + * had the pool locked active. This is the same as owning the cvsLatch as + * we deadlocked because Change Pool State owned cvsLatch and was waiting for + * use to release active count. We had active count, but where waiting for + * cvsLatch. I simply removed getting the cvsLatch because we own the + * NSS Lock and because we do not block. This means that the list can not + * change on us. In the long run we may wish to change the list to a SET + * so that no latches are needed anywhere. This will be helpful because + * debug code may cause us to BLOCK. For example the ASSERT. + */ + +ZfsVolume_s *ZLSS_VolumeIDLookup( + GeneralMsg_s *genMsg, + GUID_t *internalID, + BOOL activeOnly, /* If true, only ACTIVE volumes will be returned*/ + ZfsPool_s *poolVolume ) /* Pool's internal volume that volume is in */ + +{ + ZfsVolume_s *zVolume; + Volume_s *volume; + Pool_s *pool; + + zASSERT( LB_GUIDValidate( internalID ) ); + zASSERT( poolVolume != NULL ); + pool = poolVolume->ZFSPOOLvol.v_pool; + zASSERT( pool != NULL ); + /* + * Go through the link list of LVs that this pool + * owns and find the correct volume. 
+ */ + + POOL_FOR_EACH_LOADED_VOLUME_NO_BLOCKING( pool, volume ) + { +// volume->VOLfile.FILEuseCount += 1; We don't do because no yields + zVolume = (ZfsVolume_s *)volume; + zASSERT( LB_GUIDValidate( &zVolume->ZLSSVOLinternalID ) ); + if ( LB_GUIDCompare( &zVolume->ZLSSVOLinternalID, internalID ) == 0 ) + { /* This is the correct volume */ + if ( ( (!activeOnly) || ((volume->state == zVOLSTATE_ACTIVE) && + (!(volume->v_statusFlag & VOL_SF_LEAVING_ACTIVE_STATE_CLEANUP)) + ) + ) && VOL_ACCESSIBLE2(volume, genMsg) /* Needed because deleted LVs are hidden */ + ) + { /* It matches all conditions */ + COMN_USE_BEAST( &volume->VOLroot ); + return( zVolume ); + } + /* Volume is not in the correct 'state' for user */ + SetErrno( genMsg, zERR_VOLUME_NOT_FOUND ); + return( NULL ); + } + } + /* Did not match a Logical Volume, so check internal volume */ + zASSERT( LB_GUIDValidate( &poolVolume->ZFSPOOLinternalID ) ); + if ( LB_GUIDCompare( &poolVolume->ZFSPOOLinternalID, internalID ) == 0 ) + { /* This is the correct volume */ + volume = &poolVolume->ZFSPOOLvol; + zVolume = (ZfsVolume_s *)volume; + if ( ( (!activeOnly) || ((volume->state == zVOLSTATE_ACTIVE) && + (!(volume->v_statusFlag & VOL_SF_LEAVING_ACTIVE_STATE_CLEANUP)) + ) + ) && VOL_ACCESSIBLE2(volume, genMsg) /* Needed because the _IV_ is hidden */ + ) + { /* It matches all conditions */ + COMN_USE_BEAST( &volume->VOLroot ); + return( zVolume ); + } + } + SetErrno( genMsg, zERR_VOLUME_NOT_FOUND ); + return( NULL ); + +} /* End of ZLSS_VolumeIDLookup() */ + + +/* ZLSS_VolumeIDLookupID - + * Given a pool and a volume's internal ID this routine will find + * the volume's Volume ID. + * + * Notes - + * ZLSS volumes on different pools can have the same internal ID. This + * comes about because of 'breaking a mirror'. This is why the ZfsPool_s + * is a required parameter. 
+ */ + +STATUS ZLSS_VolumeIDLookupID( + GeneralMsg_s *genMsg, + GUID_t *internalID, + ZfsPool_s *poolVolume, + VolumeID_t *retVolumeID) + +{ + ZfsVolume_s *zVolume=NULL; + Volume_s *volume; + Pool_s *pool; + + zASSERT( LB_GUIDValidate( internalID ) ); + zASSERT( poolVolume != NULL ); + pool = poolVolume->ZFSPOOLvol.v_pool; + zASSERT( pool != NULL ); + /* + * Go through the link list of LVs that this pool + * owns and find the correct volume. If not found + * then check to see if caller is looking for the + * pool's internal volume. If not then return an error. + */ + POOL_FOR_EACH_LOADED_VOLUME_NO_BLOCKING( pool, volume ) + { + zVolume = (ZfsVolume_s *)volume; +// volume->VOLfile.FILEuseCount += 1; We don't do because no yields + zASSERT( LB_GUIDValidate( &zVolume->ZLSSVOLinternalID ) ); + zVolume = (ZfsVolume_s *)volume; + if ( LB_GUIDCompare( &zVolume->ZLSSVOLinternalID, internalID ) == 0 ) + { /* This is the correct volume */ + zASSERT( LB_GUIDValidate( &zVolume->ZLSSVOLvolumeID ) ); + *retVolumeID = zVolume->ZLSSVOLvolumeID; + return( zOK ); + } + } + /* Not a Logical Volume, so check internal volume */ + zASSERT( LB_GUIDValidate( &poolVolume->ZFSPOOLinternalID ) ); + if ( LB_GUIDCompare( &poolVolume->ZFSPOOLinternalID, internalID ) == 0 ) + { /* This is the correct volume */ + zASSERT( LB_GUIDValidate( &zVolume->ZLSSVOLvolumeID ) ); + *retVolumeID = poolVolume->ZFSPOOLvolumeID; + return( zOK ); + } + SetErrno( genMsg, zERR_VOLUME_NOT_FOUND ); + return( zFAILURE ); + +} /* End of ZLSS_VolumeIDLookupID() */ + + +#if !VOL_DOES_ROOTDIR +/************************************************************************** + * This will flush all of the ZFSVOL's SYSTEM beasts to the disk. This + * is called when we are both deactivating as well as simply flushing + * the volume. We pass a flag to say what mode we are in. + * + * The ZFSVOL's only system beast is the rootdir. 
+ ***************************************************************************/ +STATUS ZFSVOL_DoFlushSystemBeasts( + GeneralMsg_s *genMsg, + ZfsVolume_s *vol, + BOOL deactivating) /* if TRUE we are deactivating the pool, else flushing*/ +{ + RootBeast_s *beast; + ASSERT_MPKNSS_LOCK(); +#if VOL_HAS_SYSTEM_BEASTS + statusfunc_t parentFunc; +#endif + + ENTER(TCOMMON, ZFSVOL_DoFlushSystemBeasts); + DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSVOL_DoFlushSystemBeasts\n"))); + +#if VOL_HAS_SYSTEM_BEASTS + parentFunc = BST_GetNextParentMethod(vol->ZFSVOLbeastClass, + COMNOPS_INDEX(flushSystemBeasts),ZFSVOL_FlushSystemBeasts); + + zASSERT( parentFunc != NULL ); + parentFunc( genMsg, (Volume_s *)vol ); + /* FixFixFix6 - This code is not executed in MOAB release */ +#endif + +/*------------------------------------------------------------------------- + * First go through and flush all of the system beasts. We will wait for + * them to write. + *-------------------------------------------------------------------------*/ + STOP_SIGNAL_PROPAGATION(&vol->ZFSVOLagent); + beast = (RootBeast_s *)vol->ZFSVOLrootdir; + if ( (beast != NULL) && !(beast->bstState & BST_STATE_DO_NOT_WRITE)) + { + bind(&vol->ZFSVOLagent,&beast->ROOTmycache.agent); + beast->bstState |= BST_STATE_FULL_FLUSH; + cacheFlushMyCache(&beast->ROOTmycache); + } + START_SIGNAL_PROPAGATION(&vol->ZFSVOLagent); + defaultFlushWait(&vol->ZFSVOLagent); + + if ( !COMN_IsDerivedFrom(vol, zFTYPE_ZLSS_ZFSPOOL) ) + { + GeneralMsg_s dummyGenMsg; + + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + (void)ZLSSVOL_DoFlushSystemBeasts( &dummyGenMsg, vol, deactivating ); + } + + RTN_STATUS( zOK ); +} + +/************************************************************************** + * This will do a flush of the system beasts of the volume (i.e. 
rootdir) + ***************************************************************************/ +STATUS ZFSVOL_VOL_FlushSystemBeasts( + GeneralMsg_s *genMsg, + void *vol_LX) +{ + ZfsVolume_s *vol = (ZfsVolume_s *)vol_LX; + + DEBUG_PRINTF(TLVOLUMES,DBG_NOINDENT,(CYAN,MSGNot("CVS@ZFSVOL_FlushSystemBeasts\n"))); + ASSERT_MPKNSS_LOCK(); + return ZFSVOL_DoFlushSystemBeasts(genMsg,vol,FALSE); +} +#endif /* #if !VOL_DOES_ROOTDIR */ + + +/**************************************************************************** + * This function returns metadata information for a file beast object + *****************************************************************************/ +STATUS ZFSVOL_BST_GetInfo( + GeneralMsg_s *genMsg, + RootBeast_s *volume_LX, + GetInfoMsg_s *infoMsg) +{ + ZfsVolume_s *volume = (ZfsVolume_s *)volume_LX; + zVolumeInfo_s *volInfo; + statusfunc_t derivedFromGetInfo; + ASSERT_MPKNSS_LOCK(); +/*--------------------------------------------------------------------------- + * First, call the generic file information routine and get any + * requested generic information. If it fails, do not continue... 
+ *---------------------------------------------------------------------------*/ + derivedFromGetInfo = COMN_GetNextParentBeastComnOp(volume->ZFSVOLbeastClass, + COMNOPS_INDEX(BST_getInfo),ZFSVOL_BST_GetInfo); + if (derivedFromGetInfo(genMsg,volume,infoMsg) != zOK) + return(zFAILURE); +/*--------------------------------------------------------------------------- + * Now, if the caller requested, fill in any volume specific information + *---------------------------------------------------------------------------*/ + if ((volInfo = infoMsg->ret_getTypeInfo) == NULL) + { + return(zOK); /* No type specific info to be modified */ + } + + if (infoMsg->sizeRetGetTypeInfo < sizeof(zVolumeInfo_s)) + { + SetErrno(genMsg,zERR_BUFFER_TOO_SMALL); + return(zFAILURE); + } + + if (infoMsg->getTypeInfoMask) + { + if (infoMsg->getTypeInfoMask & zGET_VOL_SALVAGE_INFO) + { + /* The default volume fills in the all salvage info except the + * oldestDeletedTime.*/ + + volInfo->salvage.oldestDeletedTime = getOldestDeletedTime(genMsg, volume); + } + } + + return(zOK); +} + + +/**************************************************************************** + * This function modifies metadata information for a ZLSSVOL beast object + *****************************************************************************/ +STATUS ZFSVOL_BST_ModifyInfo( + GeneralMsg_s *genMsg, + RootBeast_s *zlssVolume_LX, + ModifyInfoMsg_s *modifyMsg, + Xaction_s *xaction) /* Optional xaction, may be NULL */ +{ + ZfsVolume_s *zlssVolume = (ZfsVolume_s *)zlssVolume_LX; + zVolumeInfo_s *volInfo; + statusfunc_t derivedFromModifyInfo; + BOOL modified = FALSE; + BOOL someAttrsChanged = FALSE; + BOOL someAttrsNotChanged = FALSE; + + ASSERT_MPKNSS_LOCK(); + zASSERT(!(zlssVolume->ZFSVOLenabledAttributes & zATTR_READONLY) || + (modifyMsg->modifyTypeInfoMask & zMOD_VOL_ATTRIBUTES)); + +/*--------------------------------------------------------------------------- + * First, call the generic file information routine and modify any + 
* requested generic information. If it fails, do not continue... + *---------------------------------------------------------------------------*/ + derivedFromModifyInfo = COMN_GetNextParentBeastComnOp( + zlssVolume->ZFSVOLbeastClass, + COMNOPS_INDEX(BST_modifyInfo), + ZFSVOL_BST_ModifyInfo); + + if (derivedFromModifyInfo(genMsg,zlssVolume,modifyMsg,xaction) != zOK) + return(zFAILURE); +/*--------------------------------------------------------------------------- + * Now, if the caller requested, fill in any volume specific information + *---------------------------------------------------------------------------*/ + if ((volInfo = modifyMsg->modifyTypeInfo) == NULL) + { + return(zOK); /* No type specific info to be modified */ + } + + if (modifyMsg->modifyTypeInfoMask) + { + zASSERT(!(zlssVolume->ZFSVOLenabledAttributes & zATTR_READONLY) || + ((modifyMsg->modifyTypeInfoMask & zMOD_VOL_ATTRIBUTES) && + (!(volInfo->features.enabled & zATTR_READONLY)))); + + if (zlssVolume->ZFSVOLmayIDoThis(genMsg, zlssVolume, modifyMsg->parentZid, + MAY_I_DO_EVERYTHING) != zOK) + { + SetErrno(genMsg,zERR_NO_SET_PRIVILEGE); + return(zFAILURE); + } + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_DATA_SHREDDING_COUNT) + { + zASSERT((volInfo->dataShreddingCount >= 1) && + (volInfo->dataShreddingCount <= MAX_DATA_SHRED_PATTERNS)); + + if (volInfo->dataShreddingCount < 1) + { + volInfo->dataShreddingCount = 1; + } + else if (volInfo->dataShreddingCount > MAX_DATA_SHRED_PATTERNS) + { + volInfo->dataShreddingCount = MAX_DATA_SHRED_PATTERNS; + } + if (DataShredInit(volInfo->dataShreddingCount) != zOK) + { + someAttrsNotChanged = TRUE; + } + else + { + zlssVolume->ZFSVOLshredCount = volInfo->dataShreddingCount; + someAttrsChanged = TRUE; + } + } + /* FixFixFix6(Later, Logical Volumes) - when logical volumes + * are added then this code must only set attributes + * that belong to the logical volume. Items setable + * by the pool only need to be moved to the Pool modify + * code (i.e. 
ZFSPOOL_BST_ModifyInfo). + */ + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_ATTRIBUTES) + { + if ((volInfo->features.enableModMask & + ZLSS_NON_CHANGABLE_ATTRIBUTES) != + ((volInfo->features.enabled & volInfo->features.enableModMask) & + ZLSS_NON_CHANGABLE_ATTRIBUTES)) + { + /* One or more of these bits are specified to be changed. + * Today, we don't allow any of these bits to be modified. */ + someAttrsNotChanged = TRUE; + } + + /* + * Salvage + */ + if ((volInfo->features.enableModMask & zATTR_SALVAGE) && + ((volInfo->features.enabled & zATTR_SALVAGE) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_SALVAGE))) + { + /* Salvage is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_SALVAGE; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_SALVAGE); + someAttrsChanged = TRUE; + } + + /* + * Atime + */ + +// The compiler has trouble with 64 bit masking and bit compares. +// This code variation seems to work. + +// if ((volInfo->features.enableModMask & zATTR_NO_ATIME) && +// ((volInfo->features.enabled & zATTR_NO_ATIME) != +// (zlssVolume->ZFSVOLenabledAttributes & zATTR_NO_ATIME))) +// { +// zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_NO_ATIME; +// zlssVolume->ZFSVOLenabledAttributes |= +// (volInfo->features.enabled & zATTR_NO_ATIME); +// someAttrsChanged = TRUE; +// } + + { + QUAD c1,c2,c3; + c1 = volInfo->features.enableModMask; + c1 &= zATTR_NO_ATIME; + c2 = volInfo->features.enabled; + c2 &= zATTR_NO_ATIME; + c3 = zlssVolume->ZFSVOLenabledAttributes; + c3 &= zATTR_NO_ATIME; + + if(c1) + { + if(c2 != c3) + { + /* Atime is being changed */ + c1 = ~zATTR_NO_ATIME; + zlssVolume->ZFSVOLenabledAttributes &= c1; + c1 = (volInfo->features.enabled & zATTR_NO_ATIME); + zlssVolume->ZFSVOLenabledAttributes |= c1; + someAttrsChanged = TRUE; + } + } + } + /* + * Hardlinks + */ + if ((volInfo->features.enableModMask & zATTR_HARD_LINKS) && + ((volInfo->features.enabled & zATTR_HARD_LINKS) != + (zlssVolume->ZFSVOLenabledAttributes 
& zATTR_HARD_LINKS))) + { + /* check for need to upgrade first */ + if((zlssVolume->ZLSSVOLmediaFormatMajor < 38) || + ((zlssVolume->ZLSSVOLmediaFormatMajor == 38) && (zlssVolume->ZLSSVOLmediaFormatMinor < 5))) + { + SetErrno(genMsg, zERR_MUST_UPGRADE_TO_LINK); + } + else + { + /* Hardlinks is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_HARD_LINKS; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_HARD_LINKS); + someAttrsChanged = TRUE; + } + } + + /* + * Extended Mac Namespace i.e. up to 255 unicode chars + */ + if ((volInfo->features.enableModMask & zATTR_EXTENDED_MAC_NAMESPACE) && + ((volInfo->features.enabled & zATTR_EXTENDED_MAC_NAMESPACE) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_EXTENDED_MAC_NAMESPACE))) + { + /* Extended Mac Namespace attribute is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_EXTENDED_MAC_NAMESPACE; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_EXTENDED_MAC_NAMESPACE); + someAttrsChanged = TRUE; + } + + + /* + * Migration + */ + if ((volInfo->features.enableModMask & zATTR_MIGRATION) && + ((volInfo->features.enabled & zATTR_MIGRATION) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_MIGRATION))) + { + /* Migration is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_MIGRATION; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_MIGRATION); + someAttrsChanged = TRUE; + + } + + if ((volInfo->features.enableModMask & zATTR_USER_TRANSACTION) && + ((volInfo->features.enabled & zATTR_USER_TRANSACTION) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_USER_TRANSACTION))) + { + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_USER_TRANSACTION; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_USER_TRANSACTION); + + someAttrsChanged = TRUE; + + /* If we enabled TTS make it active */ + if (volInfo->features.enabled & zATTR_USER_TRANSACTION) + { +#if NSS_DEBUG IS_ENABLED + static 
NINT fsmInstance; +#endif + + FsmLite_s *fsm = malloc(sizeof(FsmLite_s)); + if (fsm == NULL) + { + SetErrno(genMsg, zERR_NO_MEMORY); + errPrintf(WHERE, Module, 673, + MSG("Unable to start process to activate user transactions\n", 429)); + someAttrsNotChanged = TRUE; + } + else + { + FSMLITE_INIT(fsm, "FSM for user transaction activation", + ++fsmInstance); + + if (COMN_LockVolumeActive(genMsg, (Volume_s *)zlssVolume, FALSE) == zOK) + { + WORK_Schedule(fsm, UXactionActivateThread, (ADDR)zlssVolume); + } else { free(fsm); /* lock failed: fsm was never scheduled, so release it here instead of leaking it. NOTE(review): assumes FSMLITE_INIT does not register fsm elsewhere - confirm */ } + } + } + } + + /* + * User space restrictions + */ + if ((volInfo->features.enableModMask & + zATTR_USER_SPACE_RESTRICTIONS) && + ((volInfo->features.enabled & zATTR_USER_SPACE_RESTRICTIONS) != + (zlssVolume->ZFSVOLenabledAttributes & + zATTR_USER_SPACE_RESTRICTIONS))) + { + BOOL errorFlag = FALSE; + + /* User Space Restriction status is being changed */ + if (volInfo->features.enabled & zATTR_USER_SPACE_RESTRICTIONS) + { /* + * If we are enabling restriction make sure the tree + * has been created + */ + if (zlssVolume->ZLSSVOLuserTree == NULL) + { + if (UBT_CreateUserTreeBeast(genMsg, + zlssVolume) != zOK) + { + errorFlag = TRUE; + someAttrsNotChanged = TRUE; + } + } + /* Initialize the User Space database */ + if (VOL_InitUserSpaceRestrictionData(genMsg, + &zlssVolume->ZFSVOLvol) != zOK) + { + errorFlag = TRUE; + someAttrsNotChanged = TRUE; + } + } + if (!errorFlag) + { + zlssVolume->ZFSVOLenabledAttributes &= + ~zATTR_USER_SPACE_RESTRICTIONS; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & + zATTR_USER_SPACE_RESTRICTIONS); + someAttrsChanged = TRUE; + } + } + + /* + * Directory quotas + */ + if ((volInfo->features.enableModMask & zATTR_DIR_QUOTAS) && + ((volInfo->features.enabled & zATTR_DIR_QUOTAS) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_DIR_QUOTAS))) + { + BOOL errorFlag = FALSE; + + /* Directory quota status is being changed */ + if (volInfo->features.enabled & zATTR_DIR_QUOTAS) + { /* + * If we are enabling restriction make 
sure the tree + * beast has been created + */ + if (zlssVolume->ZLSSVOLdirTree == NULL) + { + if (DBT_CreateDirTreeBeast(genMsg, + zlssVolume) != zOK) + { + errorFlag = TRUE; + someAttrsNotChanged = TRUE; + } + } + } + if (!errorFlag) + { + zlssVolume->ZFSVOLenabledAttributes &= + ~zATTR_DIR_QUOTAS; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & + zATTR_DIR_QUOTAS); + someAttrsChanged = TRUE; + } + } + + /* + * Read-only + */ + if ((volInfo->features.enableModMask & zATTR_READONLY) && + ((volInfo->features.enabled & zATTR_READONLY) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_READONLY))) + { + /* Read-only status is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_READONLY; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_READONLY); + someAttrsChanged = TRUE; + } + + /* + * Encryption - you can't modify it + */ + if ((volInfo->features.enableModMask & zATTR_ENCRYPTED) && + ((volInfo->features.enabled & zATTR_ENCRYPTED) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_ENCRYPTED))) + { + someAttrsNotChanged = TRUE; + } + + if ((volInfo->features.enableModMask & zATTR_COMPRESSION) && + ((volInfo->features.enabled & zATTR_COMPRESSION) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_COMPRESSION))) + { + /* compression is being changed */ + /* don't allow compression to be turned ON to OFF */ + if (! 
(zlssVolume->ZFSVOLenabledAttributes & zATTR_COMPRESSION)) + { + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_COMPRESSION); + someAttrsChanged = TRUE; + } + else + { + someAttrsNotChanged = TRUE; + } + } + + /* + * Modified File List + */ + if ((volInfo->features.enableModMask & zATTR_MFL) && + ((volInfo->features.enabled & zATTR_MFL) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_MFL))) + { + BOOL errorFlag = FALSE; + + /* MFL feature is being changed */ + if (volInfo->features.enabled & zATTR_MFL) + { /* + * If we are enabling MFL make sure the tree + * has been created + */ + if (zlssVolume->ZLSSVOLMFL == NULL) + { + if (ZFSVOL_VOL_createMFL(genMsg, zlssVolume) != zOK) + { + errorFlag = TRUE; + someAttrsNotChanged = TRUE; + } + } + } + else /* Disabling MFL maintenance; destroy existing MFL */ + { + if ((zlssVolume->ZLSSVOLMFL != NULL) && + (ZFSVOL_VOL_destroyMFL(genMsg, zlssVolume) != zOK)) + { + errorFlag = TRUE; + someAttrsNotChanged = TRUE; + } + } + if (!errorFlag) + { + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_MFL; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_MFL); + someAttrsChanged = TRUE; + } + } + + /* + * Data Shredding + */ + + if ((volInfo->features.enableModMask & zATTR_SHRED_DATA) && + ((volInfo->features.enabled & zATTR_SHRED_DATA) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_SHRED_DATA))) + { + BOOL errorFlag = FALSE; + + if (volInfo->features.enabled & zATTR_SHRED_DATA) + { + zASSERT((zlssVolume->ZFSVOLshredCount >= 1) && + (zlssVolume->ZFSVOLshredCount <= MAX_DATA_SHRED_PATTERNS)); + + if (DataShredInit(zlssVolume->ZFSVOLshredCount) != zOK) + { + errorFlag = TRUE; + someAttrsNotChanged = TRUE; + } + } + if (!errorFlag) + { + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_SHRED_DATA; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_SHRED_DATA); + someAttrsChanged = TRUE; + } + } + /* + * High Integrity + */ + if 
((volInfo->features.enableModMask & zATTR_HIGH_INTEGRITY) && + ((volInfo->features.enabled & zATTR_HIGH_INTEGRITY) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_HIGH_INTEGRITY))) + { + /* high integrity is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_HIGH_INTEGRITY; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_HIGH_INTEGRITY); + someAttrsChanged = TRUE; + + } + /* + * CFS_MASTER + */ + if ((volInfo->features.enableModMask & zATTR_CFS_MASTER) && + ((volInfo->features.enabled & zATTR_CFS_MASTER) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_CFS_MASTER))) + { + /* CFS_MASTER is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_CFS_MASTER; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_CFS_MASTER); + someAttrsChanged = TRUE; + + } + /* + * COW + */ + if ((volInfo->features.enableModMask & zATTR_COW) && + ((volInfo->features.enabled & zATTR_COW) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_COW))) + { + /* Cow is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_COW; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_COW); + someAttrsChanged = TRUE; + } + /* + * DONT BACKUP + */ + if ((volInfo->features.enableModMask & zATTR_DONT_BACKUP) && + ((volInfo->features.enabled & zATTR_DONT_BACKUP) != + (zlssVolume->ZFSVOLenabledAttributes & zATTR_DONT_BACKUP))) + { + /* Don't backup is being changed */ + zlssVolume->ZFSVOLenabledAttributes &= ~zATTR_DONT_BACKUP; + zlssVolume->ZFSVOLenabledAttributes |= + (volInfo->features.enabled & zATTR_DONT_BACKUP); + someAttrsChanged = TRUE; + } + + if (someAttrsChanged) + { + /* Some attributes did change, so notify the common layer */ + COMN_VolumeAttributesChanged(genMsg, (Volume_s *)zlssVolume); + if (someAttrsNotChanged) + { + /* Some attrs did not change, so set the return error */ + SetErrno(genMsg,zERR_SOME_ATTRS_NOT_CHANGED); + } + modified = TRUE; + } + else if 
(someAttrsNotChanged) + { + /* No attrs were changed */ + SetErrno(genMsg,zERR_ALL_ATTRS_NOT_CHANGED); + } + } + /* We set these items because each logical volume can + * have their own salvage min and max settings. + */ + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_MIN_KEEP_SECONDS) + { + zlssVolume->ZFSVOLminKeepSeconds = volInfo->salvage.minKeepSeconds; + modified = TRUE; + } + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_MAX_KEEP_SECONDS) + { + zlssVolume->ZFSVOLmaxKeepSeconds = volInfo->salvage.maxKeepSeconds; + modified = TRUE; + } + + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_NDS_OBJECT_ID) + { + /*- FixFixFix6 this needs to be moved to the admin vol (mike) -*/ + zlssVolume->ZLSSVOLndsObjectID = volInfo->ndsObjectID; + /*- end of fix for mike to move later -*/ + modified = TRUE; + } + if (modifyMsg->modifyTypeInfoMask & zMOD_VOL_QUOTA) + { + NINT shift = zlssVolume->ZLSSVOLblockShift; + zlssVolume->ZLSSVOLtotalBlocks = + (volInfo->totalSpaceQuota + (1 << shift) - 1) >> shift; + modified = TRUE; + } + + if (modifyMsg->modifyTypeInfoMask & zMOD_READ_AHEAD_BLOCKS) + { + zlssVolume->p.PZV_readAheadBlocks = zlssVolume->vol.readAheadBlocks; + modified = TRUE; + } + + if ( modified ) + { /* This is an async call that does not do transactions + * so no warrenty that persistent information is updated. + */ + ZFSVOL_WritePersistentVolumeData(zlssVolume); + } + + } + + if (someAttrsNotChanged) + { + /* The error code zERR_SOME_ATTRS_NOT_CHANGED indicates a partial + * success, but we still return an error indicating not all + * attributes were changed as requested. */ + return(zFAILURE); + + } + return(zOK); +} + +/* ZFSVOL_VOL_CommandFunction - + * Implements volume delete, rename and delete action. Does not implement + * volume create because the ZlssPool_s object does (i.e. there is no + * ZfsVolume_s beast to call through). + * + * Notes - + * We must release the useCount on the 'beast'. 
+ */ + +STATUS ZFSVOL_VOL_CommandFunction( + GeneralMsg_s *genMsg, + void *beast_LX, /* We inherit useCount */ + NINT functionNumber, + VCO_VolumeCommonOps_s *pCD, /*parsedCommandData */ + NINT parmLen, + utf8_t *parm, + NINT dataLen, + BYTE *commandData, + NINT offset, + NINT retBufLen, + BYTE *retBuf, + NINT *retLen) + +{ + ZfsVolume_s *beast = (ZfsVolume_s *)beast_LX; + statusfunc_t parentFunc; + STATUS status; + + zASSERT( COMN_IsDerivedFrom(beast, zFTYPE_ZLSS_VOL) ); +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf("ZLSS.Greg.Volume", WHERE, + LRED, "ZFSVOL_VOL_CommandFunction(enter)\n"); +#endif + switch( functionNumber ) + { + /*** Note that the VCO_VOLUME_CREATE_NUMBER command + *** is handle by ZLSSPOL_VOL_CommandFunction() because + *** we only have a pool beast when a volume is created. + ***/ + case VCO_VOLUME_CREATE_NUMBER: + COMN_Release( &beast ); + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + break; + case VCO_VOLUME_RENAME_NUMBER: + COMN_Release( &beast ); + status = ZLSSVOL_LVR_RenameAPI( genMsg, + pCD->u.rename.volumeName, + pCD->u.rename.volumeNameNew ); + break; + case VCO_VOLUME_DELETE_NUMBER: + COMN_Release( &beast ); + status = ZLSSVOL_LVD_DeleteAPI( genMsg, + pCD->u.delete.volumeName, + 0 /* We do not own POOL CVS Latch */ ); + break; + case VCO_VOLUME_DELETE_ACTION_NUMBER: + COMN_Release( &beast ); /* Delete API assumes calling thread + * does not own. 
+ */ + status = ZLSSVOL_LVD_DeleteActionAPI( genMsg, + pCD->u.deleteAction.volumeName, + pCD->u.deleteAction.action ); + break; + case VCO_VOLUME_GET_INFO_NUMBER: +// S_LATCH( &beast->ZLSSVOLvol.stateLatch ); + ASSERT_LATCH( &beast->ZLSSVOLvol.stateLatch ); + COMN_USE_BEAST( &beast->ZFSVOLroot ); /* Get a use count for parent */ + parentFunc = COMN_GetNextParentVolumeComnOp( beast->ZFSVOLbeastClass, + COMNVOLOPS_INDEX(VOL_commandFunction), + ZFSVOL_VOL_CommandFunction ); + zASSERT(parentFunc != NULL); + status = parentFunc( genMsg, beast, functionNumber, + pCD, parmLen, parm, dataLen, commandData, + offset, retBufLen, retBuf, retLen ); + if ( status != zOK ) + { +// UNS_LATCH( &beast->ZLSSVOLvol.stateLatch ); + COMN_Release( &beast ); /* Release OUR inherited use count */ + return( status ); + } + zASSERT( pCD->u.VCO_getInfo.VGI_action == 0 ); + status = ZLSSVOL_LV_GetInfoAPI( genMsg, + pCD->u.VCO_getInfo.VGI_volumeName, + pCD->u.VCO_getInfo.VGI_action, + offset, + retBufLen, + retBuf, + retLen ); +// UNS_LATCH( &beast->ZLSSVOLvol.stateLatch ); + COMN_Release( &beast ); /* Release OUR inherited use count */ + break; + default: + COMN_Release( &beast ); + status = zFAILURE; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + break; + } +#if NSS_DEBUG IS_ENABLED + DBG_ScreenAPrintf("ZLSS.Greg.Volume", WHERE, + LRED, "ZFSVOL_VOL_CommandFunction(exit) status %d(%s)\n", + GetErrno(genMsg), GetErrnoSetter(genMsg) ); +#endif + return( status ); + +} /* End of ZFSVOL_VOL_CommandFunction() */ + + +/**************************************************************************** + * ZFSVOL COMMON BEAST OPERATIONS definition + *****************************************************************************/ + +/*--------------------------------------------------------------------------- + * Defines all of the ZFS volume beast operations + *---------------------------------------------------------------------------*/ +CommonBeastOps_s ZFSVOL_ComnBeastOps = +{ + ZFSVOL_BST_Construct, /* 
construct */ + ZFSVOL_BST_Destruct, /* destruct */ + +// cnt NULL, /* BST_getNameUniquifier */ + NULL, /* BST_setupNameTypeSpecificInfo */ + NULL, /* BST_lookupByNameInDirectory*/ + NULL, /* BST_isDirectoryEmpty*/ + NULL, /* BST_addNameToDirectory*/ + NULL, /* BST_removeNameFromDirectory*/ + NULL, /* BST_modifyNameSpaceMaskInDirectory*/ + NULL, /* BST_setMatchAttributesInDirectory */ + NULL, /* BST_wildcardLookup*/ + + NULL, /* BST_truncateFile*/ + NULL, /* BST_getStorageInfo*/ + NULL, /* BST_getExtentList*/ + NULL, /* BST_getPhysicalExtent*/ + NULL, /* BST_isBlockInBeast*/ + + NULL, /* BST_asyncReadFileBlk*/ + NULL, /* BST_getFileBlk*/ + NULL, /* BST_dioReadUnits*/ + NULL, /* BST_dioWriteUnits*/ + + NULL, /* BST_getZID*/ + NULL, /* BST_beastNotify*/ + ZFSVOL_BST_GetInfo, /* BST_getInfo*/ + ZFSVOL_BST_ModifyInfo, /* BST_modifyInfo*/ + NULL, /* BST_getInfoXML*/ + NULL, /* BST_modifyInfoXML*/ +}; + +extern STATUS ZFSREPAIR_VOL_CheckRepair(GeneralMsg_s *genMsg, void *volPool,NINT maintenanceType,NINT flag, void *userInfo); + +/*------------------------------------------------------------------------- + * Volume Operations + *-------------------------------------------------------------------------*/ +CommonVolumeOps_s ZFSVOL_ComnVolOps = +{ +// cnt NULL, /* VOL_getNameUniquifier */ + NULL, /* VOL_setupNameTypeSpecificInfo */ + ZFSVOL_VOL_LookupByNameInDirectory, /* VOL_lookupByNameInDirectory */ + ZFSVOL_VOL_IsDirectoryEmpty, /* VOL_isDirectoryEmpty */ + ZFSVOL_VOL_AddNameToDirectory, /* VOL_addNameToDirectory */ + ZFSVOL_VOL_RemoveNameFromDirectory, /* VOL_removeNameFromDirectory */ + ZFSVOL_VOL_ModifyNameSpaceMaskInDirectory, /* VOL_modifyNameSpaceMaskInDirectory */ + ZFSVOL_VOL_SetMatchAttributesInDirectory, /* VOL_setMatchAttributesInDirectory */ + ZFSVOL_VOL_WildcardLookup, /* VOL_wildcardLookup */ + + ZFSVOL_VOL_truncateFile, /* VOL_truncateFile */ + ZFSVOL_VOL_GetStorageInfo, /* VOL_getStorageInfo */ + ZFSVOL_VOL_getExtentList, /* VOL_getExtentList */ + 
ZFSVOL_VOL_getPhysicalExtent, /* VOL_getPhysicalExtent */ + ZFSVOL_VOL_isBlockInBeast, /* VOL_isBlockInBeast */ + + ZFSVOL_VOL_asyncReadFileBlk, /* VOL_asyncReadFileBlk */ + ZFSVOL_VOL_getFileBlk, /* VOL_getFileBlk */ + ZFSVOL_VOL_dioReadUnits, /* VOL_dioReadUnits */ + ZFSVOL_VOL_dioWriteUnits, /* VOL_dioWriteUnits */ + + ZFSVOL_VOL_beginXLocal, /* VOL_beginXLocal */ + ZFSVOL_VOL_endXLocal, /* VOL_endXLocal */ + ZFSVOL_VOL_AddPurgeLogEntry, /* VOL_addPurgeLogEntry */ + ZFSVOL_VOL_RemovePurgeLogEntry, /* VOL_removePurgeLogEntry */ + ZFSVOL_VOL_WriteVolumeLoggedData, /* VOL_writeVolumeLoggedData */ + + ZFSVOL_VOL_ChangeVolumeState, /* VOL_changeVolumeState */ + ZFSREPAIR_VOL_CheckRepair, /* VOL_volumeMaintenance */ + ZFSVOL_VOL_AllocStorageInfo, /* VOL_allocStorageInfo */ + NULL, /* VOL_freeStorageInfo */ + ZFSVOL_VOL_GetBeastFromVolume, /* VOL_getBeastFromVolume */ + ZFSVOL_VOL_UpdateBeastToVolume, /* VOL_updateBeastToVolume */ + ZFSVOL_VOL_InsertBeastIntoVolume, /* VOL_insertBeastIntoVolume */ + ZFSVOL_VOL_RemoveBeastFromVolume, /* VOL_removeBeastFromVolume */ + ZFSVOL_VOL_AllocateZIDs, /* VOL_allocateZIDs */ +#if !VOL_DOES_ROOTDIR + ZFSVOL_VOL_FlushSystemBeasts, /* VOL_flushSystemBeasts*/ +#else + NULL, /* VOL_flushSystemBeasts*/ +#endif + ZFSVOL_VOL_MakeVolumeFreeSpace, /* VOL_makeVolumeFreeSpace */ + ZFSVOL_VOL_BrowseBeastsInVolume, /* VOL_browseBeastsInVolume */ + + ZFSVOL_VOL_GetUserSpaceInfo, /* VOL_getUserSpaceInfo */ + ZFSVOL_VOL_SetUserSpaceRestriction, /* VOL_setUserSpaceRestriction */ + ZFSVOL_VOL_AdjustUsedUserSpace, /* VOL_adjustUsedUserSpace */ + ZFSVOL_VOL_RemoveUser, /* VOL_removeUser */ + ZFSVOL_VOL_ResetAllUsers, /* VOL_resetAllUsers */ + ZFSVOL_VOL_BrowseUsersInVolume, /* VOL_browseUsersInVolume */ + + ZFSVOL_VOL_insertMFLEntry, /* VOL_insertMFLEntry */ + ZFSVOL_VOL_deleteMFLEntry, /* VOL_deleteMFLEntry */ + ZFSVOL_VOL_lookupMFLEntry, /* VOL_lookupMFLEntry */ + ZFSVOL_VOL_enumerateMFL, /* VOL_enumerateMFL */ + ZFSVOL_VOL_administerMFL, /* 
VOL_administerMFL */ + ZFSVOL_VOL_CommandFunction, /* VOL_commandFunction */ + + ZFSVOL_VOL_GetDirQuotaInfo, /* VOL_getDirQuotaInfo */ + ZFSVOL_VOL_SetDirQuota, /* VOL_setDirQuota */ + ZFSVOL_VOL_AdjustUsedDirSpace, /* VOL_adjustUsedDirSpace */ + ZFSVOL_VOL_RemoveDirectory, /* VOL_removeDirectory */ + ZFSVOL_VOL_ResetAllDirEntries, /* VOL_resetAllDirEntries */ + ZFSVOL_VOL_BrowseDirsInVolume, /* VOL_browseDirsInVolume */ + + ZFSVOL_VOL_GetObjectName, /* VOL_getObjectName */ + ZFSVOL_VOL_InsertObjectName, /* VOL_insertObjectName */ + ZFSVOL_VOL_ResetAllObjects, /* VOL_resetAllObjects */ + ZFSVOL_VOL_ModifyObjectName, /* VOL_modifyObjectName */ + + ZFSVOL_VOL_insertEFLEntry, /* VOL_insertEFLEntry */ + ZFSVOL_VOL_deleteEFLEntry, /* VOL_deleteEFLEntry */ + ZFSVOL_VOL_lookupEFLEntry, /* VOL_lookupEFLEntry */ + ZFSVOL_VOL_enumerateEFL, /* VOL_enumerateEFL */ + ZFSVOL_VOL_administerEFL, /* VOL_administerEFL */ + ZFSVOL_VOL_resetEFL, /* VOL_resetEFL */ + + ZFSVOL_VOL_FCNTL, /* VOL_FCNTL */ +}; + + +/**************************************************************************** + * EXTENT BASED storage pack routine + *****************************************************************************/ +STATIC NINT ZFSVOL_PackedSize( + void *beast_LX) +{ + ASSERT_MPKNSS_LOCK(); + ENTER(TZVOL, ZFSVOL_PackedSize); + + RTN_NINT(0); /*sizeof(PersistentZfsVolume_s);*/ +} + +/**************************************************************************** + * EXTENT BASED storage pack routine + *****************************************************************************/ +STATIC BYTE *ZFSVOL_Pack( + void *beast_LX, + BYTE *storeBuffer) +{ + ASSERT_MPKNSS_LOCK(); + ENTER(TZVOL, ZFSVOL_Pack); + + //memcpy(storeBuffer,&beast->p,sizeof(PersistentZfsVolume_s)); + //return (storeBuffer /*+ sizeof(PersistentZfsVolume_s)*/); + RTN_PTR(storeBuffer); +} + +/**************************************************************************** + * EXTENT BASED storage unpack routines + 
*****************************************************************************/ +STATIC BYTE *ZFSVOL_Unpack( + GeneralMsg_s *genMsg, + void *beast_LX, + BYTE *storeBuffer) +{ + ASSERT_MPKNSS_LOCK(); + ENTER(TZVOL, ZFSVOL_Unpack); + + //memcpy(&beast->p,storeBuffer,sizeof(PersistentZfsVolume_s)); + //return (storeBuffer /*+ sizeof(PersistentZfsVolume_s)*/); + RTN_PTR(storeBuffer); +} + +/*--------------------------------------------------------------------------- + * file beast STORAGE ops definition + *---------------------------------------------------------------------------*/ +LSSSpecificPackUnpackOps_s ZFSVOL_lssOps[] = +{ + {zLSS_ID_ZLSS,ZFSVOL_PackedSize,ZFSVOL_Pack,NULL,ZFSVOL_Unpack}, + {zLSS_ID_INVALID} +}; diff --git a/src/nwnss/zlss/zio.c b/src/nwnss/zlss/zio.c new file mode 100644 index 0000000..81bf82e --- /dev/null +++ b/src/nwnss/zlss/zio.c @@ -0,0 +1,2762 @@ +/**************************************************************************** + | + | (C) Copyright 1995-2001 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. + | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. 
+ | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |*************************************************************************** + | + | NetWare Advance File Services (NSS) module + | + |--------------------------------------------------------------------------- + | + | $Author: mvijai $ + | $Date: 2008-09-11 11:46:56 +0530 (Thu, 11 Sep 2008) $ + | + | $RCSfile$ + | $Revision: 2470 $ + | + |--------------------------------------------------------------------------- + | This module is used to: + | Defines routines to manage pool block io. + +-------------------------------------------------------------------------*/ +#if !defined(NSS_USERSPACE) +#include +#include +#include +#include +#else +struct block_device; +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "comnBeasts.h" +#include "comnBeastClass.h" +#include "msgIO.h" +#include "hmc.h" +#include "zfs.h" +#include "zfsXTree.h" +#include "comnPublics.h" + +#include + +#include +#include "zfsAsyncio.h" +#include "comnPublics.h" + +#include "zfsSuperBlk.h" +#include "zlog.h" +#include "zlssStartup.h" +#include "zlssLogicalVolume.h" + +#include "z_aes.h" + +#if defined(NSS_USERSPACE) +#ifndef READ +#define READ 0 +#endif +#ifndef WRITE +#define WRITE 1 +#endif +#endif + +#if defined(__linux__) && !defined(NSS_USERSPACE) +#if zLINUX_2_6 +#include +#endif +#endif + +#if LOG_TEST IS_ENABLED +#include "zlog.h" /* For debug code only */ +#include "zParams.h" +#include "inst.h" +#include "comnIO.h" +#include "msgIO.h" +#include "beastTree.h" +#include "comnBeastClass.h" +#include "comnBeasts.h" +#include "zlssStartup.h" +#include "zfsAsyncio.h" +#include "zfsSuperBlk.h" +#endif + + +#ifdef USER_GPACHNER +#if NSS_DEBUG IS_ENABLED + +#ifndef __linux__ +void ReBoot(void); /* Reboots a PC - taken from crashNMIShell.386 */ + +#pragma aux ReBoot = \ + "cli" \ + "mov al, 254", \ + "out 100, al" \ 
	"jmp *-2"			\
	modify exact [EAX];
#endif

#endif
#endif

/*-------------------------------------------------------------------------
 * Global Variables
 *-------------------------------------------------------------------------*/


/*
 * ZIO_ErrorLog -
 *	Tracks physical I/Os Error information.  Used to quickly tell
 *	if I/O errors are occurring within a ZLSS pool.  The command
 *	'NSS /ZlssPoolIOEror' will display most of the information in
 *	this array.
 *
 * ZIO_ErrorLogCount -
 *	Number of valid entries in ZIO_ErrorLog.
 *
 * IOsInst -
 *
 * gZLSSGatherDetailedIOInformation -
 *	When TRUE a summary of physical I/Os per pool is tracked.  This
 *	information is stored persistently in the ZLOG beast.  The command
 *	'NSS /ZlssPoolIOStatistics=PoolName' will display the I/Os for a
 *	given pool.  If PoolName is _summary then a summary of I/Os since
 *	ZLSS was loaded will be displayed.
 */

IOErrorLog_s ZIO_ErrorLog[100];
int ZIO_ErrorLogCount = 0;
ZLSSIOsInst_s IOsInst;
NINT gZLSSGatherDetailedIOInformation = TRUE;


#if NSS_DEBUG IS_ENABLED
	/* Read debug globals */
NINT RWTestCnt = 0;
NINT RWErrTestSeed = 0;
NINT RWErrTestInterval = 1;
	/* Write debug globals */
NINT WRTestCnt = 0;		/* Number of writes that we have done while
				 * WRErrTestSeed is non-zero.
				 */
NINT WRErrTestSeed = 0;		/* The write number to start failing on.
				 * The value zero indicates we will not
				 * simulate write failures.
				 */
NINT WRErrTestInterval = 1;	/* Frequency of which write calls will
				 * fail.  Note that ZLSS attempts to
				 * never do another REAL write attempt
				 * after detecting a failure.
				 */

QUAD gSBD_WriteCrashFailureCount = 0;	/* This is filled in at pool load time */

#endif

/* ListHead for threads waiting on an encryption buffer */
CIRhead_t EncryptedBufWaitHead = NULL;
#if NSS_DEBUG IS_ENABLED
	/* Debug-only counters; presumably one per kind of I/O that had to wait
	 * for an encryption buffer -- confirm against the code that updates them.
	 */
NINT EncryptedBufWaitWrite = 0;
NINT EncryptedBufWaitRead = 0;
NINT EncryptedBufWaitReadAsync = 0;
#endif

/*- current write count of outstanding write IOs -*/
NINT CurrentWriteCount = 0;

/************************************************************************************************************************
 ************************************************************************************************************************
 *
 * The next section of APIs deal with encrypted volumes. Rough theory of operation is that a volume may be initially
 * created as an encrypted volume, which becomes a volume attribute. Once created, a volume may not switch from encrypted
 * to non-encrypted or vice-versa. Encryption is a create time attribute only. If a volume is created encrypted, a password
 * of up to 16 characters (unicode) must be supplied. The password is not stored, but is used to wrap a generated key which
 * is stored along with other crypto data in the volume's persistent logical data block and the volume locator beast. The
 * key is also stored in memory to be used during later volume activations, while the password is discarded. When a volume
 * is enabled, if a key is already present, it is used for further volume operations. If no key is present, a password is
 * required. If no volume password is present, the activate fails and upper layer software will prompt for a password and
 * try again. Once the password is present, it is used to unwrap the key stored in the VDB.
When non-system data blocks
 * (as defined by the ZLSS_IS_SYSTEM_BLOCK macro) are read or written to an encrypted volume, the data are converted from
 * plain text to crypto just before the physical write, and decrypted on read before they go into the cache. A list of
 * dedicated buffers is used to hold the crypto data for the time it is being used. Because the list of volume/key pairs
 * is kept by the system for the duration of up-time, the keys are available to allow clusters to pass keys around. Whenever
 * a new volume key is saved, an attempt is made to notify the cluster NLM of the new key.
 *
 * The actual crypto work is done by NICI via a library linked with COMN. This library does all the necessary checking for
 * proper (read legal) requirements to abide by export laws and avoid crypto-in-a-hole problems. Because NICI loads after
 * NSS, the encryption code cannot do its work until after NICI loads. This necessitates that encrypted volumes cannot
 * load until late in the boot process, and the SYS volume can never be encrypted.
+ * + ************************************************************************************************************************** + **************************************************************************************************************************/ + +/**************************************************************************** + * encrypt buf->pBuf.data into buf->eData in COMN's crypto context + *****************************************************************************/ +static STATUS ZFS_E_Buf( + Volume_s *vol, + Buffer_s *buf) +{ + BYTE ivBuf[16]; + STATUS status; + AES_KEY aesEncKey; + LONG bufLen = 1<<(buf->bufSizeShift); + + zASSERT(buf->eData != NULL); + if(buf->eData == NULL) + { + return zERR_NICI_SUPPORT; + } + + status = AES_set_encrypt_key(vol->v_key.key, 128, &aesEncKey); + zASSERT(status == zOK); + if (status != zOK) + { + return status; + } + + /* Copy the already-encrypted volume initialization vector */ + memcpy(ivBuf, &vol->v_key.iv[0], 16); + + /* Encrypt the user data from the cache buffer to the write buffer */ + mapBufferPage(buf); + zASSERT((buf->pBuf.data != NULL) && (buf->eData != NULL)); + AES_cbc_encrypt(buf->pBuf.data, (unsigned char *)buf->eData, bufLen, + &aesEncKey, ivBuf, AES_ENCRYPT); + unmapBufferPage(buf); + + return(zOK); +} + +/**************************************************************************** + * Decrypt data in buf->eData into buf->pBuf.data + *****************************************************************************/ +static STATUS ZFS_D_Buf( + Volume_s *vol, + Buffer_s *buf) +{ + BYTE ivBuf[16]; + STATUS status; + AES_KEY aesDecKey; + LONG bufLen = 1<<(buf->bufSizeShift); + + zASSERT(buf->eData != NULL); + if(buf->eData == NULL) + { + return zERR_NICI_SUPPORT; + } + + status = AES_set_decrypt_key(vol->v_key.key, 128, &aesDecKey); + zASSERT(status == zOK); + if (status != zOK) + { + return status; + } + + /* Copy the already-encrypted volume initialization vector */ + memcpy(ivBuf, &vol->v_key.iv[0], 
16); + + /* Decrypt the user data from the read buffer to the cache buffer */ + mapBufferPage(buf); + zASSERT((buf->pBuf.data != NULL) && (buf->eData != NULL)); + AES_cbc_encrypt((unsigned char *)buf->eData, buf->pBuf.data, bufLen, + &aesDecKey, ivBuf, AES_DECRYPT); + unmapBufferPage(buf); + + return(zOK); +} + +/*********************************************************************** + * ZLSS Routines to copy data to himem and copy data from himem + ***********************************************************************/ + +typedef struct ZLSS_Himem_s { + QUAD ZH_ZLSSCacheCopyFromHimem; /* Times we have called ZLSSCacheCopyFromHimem() */ + QUAD ZH_CacheCopiedFromHimem; /* Number of times HMC_CopyDataFromHimem copied a block */ + QUAD ZH_CacheCopyFromHimem; /* Number of times we called HMC_CopyDataFromHimem */ + QUAD ZH_CacheMissFromHimem; /* Wanted to copy from himem, but not present in himem */ + + QUAD ZH_ZLSSCacheCopyToHimem; /* Times we have called ZLSSCacheCopyToHimem() */ + QUAD ZH_CacheCopyToHimem; /* Number of times we called HMC_CopyDataToHimem */ + + QUAD ZH_IsJournal; /* Number of times ZLSS_StoreBeastInHimem saw a journal block */ + QUAD ZH_IsStoreBlock; /* Number of times ZLSS_StoreBeastInHimem saw a storable block */ + QUAD ZH_IsUserBlock; /* Number of times ZLSS_StoreBeastInHimem saw a user block */ + QUAD ZH_CacheRejects; /* Cache code rejects the use of himem */ + + QUAD ZH_NoFromInode; /* Number of times ZLSS_CopyDataFromHimem had an invalid inode */ + QUAD ZH_NoFromPool; /* Number of times ZLSS_CopyDataFromHimem had an invalid v_pool */ + QUAD ZH_NoToInode; /* Number of times ZLSS_CopyDataToHimem had an invalid inode */ + QUAD ZH_NoToPool; /* Number of times ZLSS_CopyDataToHimem had an invalid v_pool */ +} ZLSS_Himem_s; + +ZLSS_Himem_s ZLSS_Himem; + +/* + ZLSS_IsUserBlock() - + Determines if a Buffer_s contains a user data block. + + Notes - + System objects always have ZIDs < zFIRST_ALLOCATABLE_ZID. 
Not all + blocks in a file's mycache are data blocks. File map blocks are stored in + the file's mycache. File map blocks will have a negative fileBlk number + while user data blocks are positive. A 'fileBlk' of zero is special as it + may be pool block zero (negative of 0 is 0) or the 1st logical block in a + file. This code assumes that volBlk zero is never given to users, which + is true for ZLSS (which is why code is not in COMN layer). Pool block + 0 is always the 1st block in the 1st ZLSS superblock. + + The 'volBlk == 0' is here because of a previous fix to ZFS_ReadPoolBlk + that detects if a block is a user block. The fix was needed because + verify/rebuild would read pool block zero on a file beast thinking it + was a file map block. This would cause an abend because a meta-data + block was in a Linux user page. I do not recall the exact location of the + abend. + */ +static inline BOOL ZLSS_IsUserBlock(const RootBeast_s *root, const Buffer_s *buffer) +{ + if ((buffer->pBuf.fileBlk < 0) || + (buffer->volBlk == 0) || + (root->ROOTzid < zFIRST_ALLOCATABLE_ZID)) + { + return FALSE; + } + return TRUE; +} + + +/* + ZLSS_StoreBeastInHimem() - + Returns TRUE if this object should be stored in Linux Himem. + + Notes - + User data uses 'Linux Pages' which are not limited to low memory. I.E. we + have no reason to store in himem. PLUS if we did we would have a stale data + bug related to the fact that cache.c sometimes uses NSS meta-data cache to + store user blocks:-( See cache.c for more information. +*/ +static BOOL ZLSS_StoreBeastInHimem( const RootBeast_s *root, const Buffer_s *buffer ) +{ + if ( !HMC_UseHimem(buffer) ) { + ++ZLSS_Himem.ZH_CacheRejects; + return FALSE; + } + if (root->zid == ZFSPOOL_LOGFILE_ZID) + { /* Skip Journal as we only write to it (except in redo/undo phase). */ + ++ZLSS_Himem.ZH_IsJournal; + return FALSE; + } + if (ZLSS_IsUserBlock(root, buffer)) + { /* Cache code requires that we do not attempt to store user blocks + into himem. 
We do the check in ZLSS because only the LSS knows + which blocks are user blocks. + */ + ++ZLSS_Himem.ZH_IsUserBlock; + return FALSE; + } + ++ZLSS_Himem.ZH_IsStoreBlock; + return TRUE; +} + +/* + ZLSS_CopyDataToHimem() - + Copies the data of a Buffer_s into himem. The copy is only done if the + data meets specific requirements. See ZLSS_StoreBeastInHimem(). + */ +static void ZLSS_CopyDataToHimem( + Buffer_s *buffer, + RootBeast_s *root) +{ + struct inode *inode; + + ++ZLSS_Himem.ZH_ZLSSCacheCopyToHimem; + if (ZLSS_StoreBeastInHimem(root, buffer)) + { + if (!root->vol.volume->v_pool) + { + ++ZLSS_Himem.ZH_NoToPool; + return; + } + inode = root->vol.volume->v_pool->P_Inode; + if (!inode) + { + ++ZLSS_Himem.ZH_NoToInode; + return; + } + ++ZLSS_Himem.ZH_CacheCopyToHimem; + HMC_CopyDataToHimem(&root->vol.volume->v_pool->P_MDHimemList, buffer, inode); + } + return; +} + +/* + ZLSS_CopyDataFromHimem() - + Copies from himem into a Buffer_s's data block. The copy is only done + if the data meets specific requirements. See ZLSS_StoreBeastInHimem(). + In addition, the copy can only occur if the data is in himem. + + Returns - + TRUE if we copied the data from himem. 
+ */ +static BOOL ZLSS_CopyDataFromHimem( + Buffer_s *buffer, + RootBeast_s *root) +{ + BOOL found = FALSE; + struct inode *inode; + + ++ZLSS_Himem.ZH_ZLSSCacheCopyFromHimem; + if (ZLSS_StoreBeastInHimem(root, buffer)) + { + if (!root->vol.volume->v_pool) + { + ++ZLSS_Himem.ZH_NoFromPool; + return found; + } + inode = root->vol.volume->v_pool->P_Inode; + if (!inode) + { + ++ZLSS_Himem.ZH_NoFromInode; + return found; + } + ++ZLSS_Himem.ZH_CacheCopyFromHimem; + found = HMC_CopyDataFromHimem(buffer, inode); + if (found) { + ++ZLSS_Himem.ZH_CacheCopiedFromHimem; + } else { + ++ZLSS_Himem.ZH_CacheMissFromHimem; + } + } + return found; +} + + +/*- (FUNCTION) ----- ZFS_AllocPoolBlk() ------------------------------------- + | + +-------------------------------------------------------------------------*/ +Buffer_s *ZFS_AllocPoolBlkSpecialWithFlags( + GeneralMsg_s *genMsg, + IoMsg_s *iomsg, + Buffer_s **specialBuffer, + NINT allocFlags) +{ + RootBeast_s *beast = iomsg->beast; + Extent_s extent; + Buffer_s *buf; + STATUS status; + + ASSERT_MPKNSS_LOCK(); + + /*---- SPECIAL NOTE: + | Please note that this routine will allocate blocks from the system + | reserved area so don't use it to allocate blocks for user data. + +-----------------------------------------------------------------------*/ + + extent.poolBlkNum = iomsg->volBlk; + extent.lengthOfExtent = iomsg->allocNumBlks; + if (specialBuffer != NULL) + { + status = zfsAllocExtentFromSpecialBlk(genMsg, beast->vol.zfsVol, + &extent, iomsg->xaction, specialBuffer); + } + else + { + status = zfsAllocExtent(genMsg, beast->vol.zfsVol, + &extent, allocFlags|XTREE_SYSTEM_REQUEST, iomsg->xaction); + } + + if (status != zOK) + { + return NULL; + } + if (iomsg->fileBlk == INVALID_BLK) + { /* + * This must be an indirect block we are allocating + * Use negative of volBlk number. 
+ */ + iomsg->fileBlk = -extent.poolBlkNum; + } + zASSERT(extent.poolBlkNum != 0); + cacheTossIfThere(&beast->ROOTmycache, iomsg->fileBlk); + + buf = cacheAllocBuffer( &beast->ROOTmycache, + iomsg->fileBlk, extent.poolBlkNum, + ZFS_BlockSignalHandler, STAT_CACHE_ALLOCATE); + if (iomsg->mode != CACHE_WRITE) + { + bzero(buf->pBuf.data, (1 << buf->bufSizeShift)); + } + zASSERT(buf != NULL); + return buf; + +} /* End of ZFS_AllocPoolBlk() */ + +/* + ZFS_WritePoolBlk() - + Wrapper around unitwritewait to ensure that ZLSS_CopyDataToHimem is called + properly. + */ + + +STATUS ZFS_WritePoolBlk( + GeneralMsg_s *genMsg, + zConPool_s *phypool, + Buffer_s *buffer ) +{ + STATUS status; + RootBeast_s *root; + + root = STRUCT(buffer->pBuf.mycache, RootBeast_s, mycache); + ZLSS_CopyDataToHimem(buffer, root); + + status = zlssBioIOBuffer(WRITE, phypool->ZCP_dev, buffer); + + SetErrno( genMsg, status ); + return status; +} + + +#if LOG_TEST IS_ENABLED +/* + * ZIO_D_LogTest() - + * This is code we used to test REDO/UNDO routines without having + * to crash the server. This may or may not work in 6Pack with logical + * undo code added by Vandana. + */ +Buffer_s *ZIO_DBG_LogTest( + IoMsg_s *iomsg ) + +{ + ZfsPool_s *zfsPool; + ZlogBeast_s *zlogBeast; + NINT index; + Buffer_s *buffer; + + zASSERT( LogTest ); + zASSERT( iomsg->beast != NULL ); + if ( COMN_IsDerivedFrom( iomsg->beast, zFTYPE_ZLSS_ZFSPOOL) && + (((ZfsPool_s *)iomsg->beast)->zfsLogBeast != NULL) ) + { /* Is the pool - Pool only used for undo and redo!!!! 
 */
		zfsPool = (ZfsPool_s *)iomsg->beast;
		zlogBeast = zfsPool->zfsLogBeast;
		zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );

		if ( zlogBeast->ZLB_DebugState == ZLOG_ZB_DS_LOG_TEST )
		{
			/* Search the log-test table for a buffer holding the
			 * requested pool block.
			 */
			for ( index = 0; index < LOG_TEST_MAX; ++index )
			{
				buffer = zlogBeast->ZLB_LT[index].LT_Buffer;
				if ( buffer != NULL )
				{
					if ( buffer->volBlk == iomsg->volBlk )
					{
						return( buffer );
					}
				}
			}
			/* Always-false expression: forces an assertion failure
			 * when the block is not in the table.
			 */
			zASSERT("ZFS_ReadPoolBlk() could not find block"==NULL);
		}
	}
	return( NULL );

}	/* End of ZIO_DBG_LogTest() */
#endif


#if NSS_DEBUG IS_ENABLED
/*
 * ZIO_DBG_ErrorSimulateRead() -
 *	Debug helper that turns a successful read into a simulated failure
 *	when iomsg->debugID equals ReadPoolBlkErrDebug.  On a simulated
 *	failure *status becomes zFAILURE, genMsg's errno is set to
 *	zERR_READ_FAILURE, the event is printed, and ReadPoolBlkErrDebug is
 *	advanced when IncReadPoolBlkErrDebug is set.
 */
void ZIO_DBG_ErrorSimulateRead(
	STATUS *status,			/* Input/output */
	GeneralMsg_s *genMsg,
	IoMsg_s *iomsg )

{

	if ( (*status == zOK) && (iomsg->debugID == ReadPoolBlkErrDebug) )
	{
		*status = zFAILURE;
		SetErrno( genMsg, zERR_READ_FAILURE);
		errPrintf(WHERE, Module, 1454,
			MSGNot("Simulated Error Return from ZFS_ReadPoolBlk: DebugID = %d"),
			ReadPoolBlkErrDebug);
		DBG_DebugPrintf(MAGENTA,
			MSGNot("Simulated Error Return from ZFS_ReadPoolBlk: DebugID = %d\n"),
			ReadPoolBlkErrDebug);
		if (IncReadPoolBlkErrDebug)
		{
			ReadPoolBlkErrDebug++;
		}
	}

}	/* End of ZIO_DBG_ErrorSimulateRead() */
#endif

/*
 * ZFS_ReadPoolBlk_ErrorHandler() -
 *	Handles the error code path for ZFS_ReadPoolBlk when
 *	ZFSMAL_ReadBlk() errors out.
 *
 *	Stores the error from genMsg in buf->agent.status, calls DOWN_LATCH
 *	on the buffer latch for CACHE_READ requests, raises an asynchronous
 *	volume/pool alert when a SYSTEM block was being pre-read for an
 *	update outside of maintenance I/O, and finally releases the buffer.
 *	The buffer must not be used by the caller afterwards.
 *
 */

void ZFS_ReadPoolBlk_ErrorHandler (GeneralMsg_s *genMsg, IoMsg_s *iomsg, Buffer_s *buf)
{
	RootBeast_s *beast;	/* The beast read was done for */

	beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
	buf->agent.status = GetErrno(genMsg);

	if (iomsg->mode == CACHE_READ)
	{
		DOWN_LATCH(&buf->agent.latch);
	}
	/* If this is a pre-read of a SYSTEM buffer that will
	 * be written then the error must DISABLE the volume.
	 *
	 * Feb 20, 2001
	 *	We came close to removing this disable code
	 *	because some people do updates just in case they
	 *	do a write.  We left in, because numerous places
	 *	do not do COMN abort Xaction when errors are
	 *	returned during a XACTION.
	 */
	if ( (iomsg->mode == CACHE_UPDATE) &&
	     ZLSS_IS_SYSTEM_BLOCK( beast, buf ) )
	{	/* This is a SYSTEM Block being pre-read for a write */
		Volume_s *volume;	/* The volume that the read was done
					 * on.  Note that this is not the
					 * 'owning' volume if the beast is a
					 * volume because the ADMIN volume
					 * owns all volumes.
					 */

		volume = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ?
				(Volume_s *)beast :
				beast->vol.volume );
		zASSERT( volume != NULL );
		/* If not /VERIFY or /REBUILD then DISABLE the
		 * volume.  The repair code handles errors so we
		 * do not wish to DISABLE the volume.  Note that
		 * we do not simply check the volume's state because
		 * we would be required to obtain the stateLatch and
		 * if we did that we could easily deadlock.
		 */
		if ( !ZLSS_IS_MAINTENANCE_IO( volume ) )
		{
			GeneralMsg_s dummyGenMsg;
			NINT flags;

			COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg );

			/* The pool is always disabled and alerted; the volume
			 * flags are added only when 'volume' is not itself the
			 * ZFS pool object.
			 */
			flags = CVA_SYSTEM_DATA | CVA_POOL_DISABLE | CVA_POOL_ALERT;
			if ( !COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_ZFSPOOL) )
			{
				flags |= CVA_VOLUME_DISABLE | CVA_VOLUME_ALERT;
			}
			/* Now tell the common layer that we wish to disable this
			 * volume.  This is an ASYNC call.  We require because we
			 * can not block.
			 */
			(void)COMN_VolumeAlert( &dummyGenMsg, beast, volume,
					buf, buf->pBuf.fileBlk, buf->volBlk,
					buf->agent.status, WHERE, flags );
		}

	}

	CACHE_RELEASE(buf);
	return;
}



/*
 * ZFS_ReadPoolBlk() -
 *	Allocates a buffer and places it in the myCache of the beast.  If
 *	!CACHE_WRITE then we read the disk block into buffer->pBuf.data.  This
 *	is a SYNC routine.
+ * + */ + +Buffer_s *ZFS_ReadPoolBlk (GeneralMsg_s *genMsg, IoMsg_s *iomsg) +{ + MyCache_s *mycache = &iomsg->beast->ROOTmycache; + Buffer_s *buf; + + ASSERT_MPKNSS_LOCK(); + zASSERT(iomsg->volBlk != 0); + + if (iomsg->fileBlk == INVALID_BLK) + { + iomsg->fileBlk = -iomsg->volBlk; + } +#if LOG_TEST IS_ENABLED + if ( LogTest ) + { + buf = ZIO_DBG_LogTest( iomsg ); + if ( buf != NULL ) + { + return( buf ); + } + } +#endif + buf = cacheLookup(mycache, iomsg->fileBlk, iomsg->mode); + if (buf != NULL) + { + Inst.cache.ioHitSystem++; + { + RootBeast_s *root; + struct inode *inode; + + root = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache); + + if (ZLSS_StoreBeastInHimem(root, buf)) + { + if (root->vol.volume->v_pool && + (inode = root->vol.volume->v_pool->P_Inode)) + { + (void)HMC_MarkNewestIfCached( inode, buf->volBlk ); + } + } + } + return buf; + } + if (iomsg->volBlk == INVALID_BLK) + { + SetErrno(genMsg, zERR_INVALID_BLOCK); + return NULL; + } + + if ((iomsg->fileBlk < 0) || (iomsg->volBlk == 0) || + (iomsg->beast->ROOTzid < zFIRST_ALLOCATABLE_ZID)) + { + buf = cacheAllocBuffer(mycache, + iomsg->fileBlk, iomsg->volBlk, + ZFS_BlockSignalHandler, iomsg->mode); + } + else + { + buf = cacheAllocBufferForUserData(mycache, + iomsg->fileBlk, iomsg->volBlk, + ZFS_BlockSignalHandler, iomsg->mode); + } + + /* If after allocating a cache buffer, the page found in linux cache + * still has valid data, then no need to read the data from disk again. + */ + if (buf->state & CACHE_DATA_VALID) + { + buf->state &= ~CACHE_DATA_VALID; + if (iomsg->mode == CACHE_READ) + { + DOWN_LATCH(&buf->agent.latch); + } + return buf; + } + + if (iomsg->mode != CACHE_WRITE) + { + STATUS status; + + status = ZFSMAL_ReadBlk(genMsg, buf); +//#if NSS_DEBUG IS_ENABLED +// ZIO_DBG_ErrorSimulateRead( &status, genMsg, iomsg ); +//#endif + if (status != zOK) + { + /* + * 407920 - Moved the code to this routine to save stack space. + * Helps quiet a bit on 64-bit architecture. 
+ */ + ZFS_ReadPoolBlk_ErrorHandler(genMsg, iomsg, buf); + return NULL; + } + + if (iomsg->mode == CACHE_READ) + { + DOWN_LATCH(&buf->agent.latch); + } + } + return buf; + +} /* End of ZFS_ReadPoolBlk() */ + + +/*- (FUNCTION) ----- ZFSMAL_PhysicalExtent() -------------------------------- + | + | Allen needs to fill this code in. + | + +-------------------------------------------------------------------------*/ +STATUS ZFSMAL_PhysicalExtent( + ZfsVolume_s *zfsVol, + QUAD poolOffset, + QUAD poolLength, + ADDR *deviceID, + QUAD *deviceOffset, + QUAD *deviceLength ) +{ + zConPool_s *phyPool = zfsVol->pool->storagepool->phypool; + +#if 1 + if ( phyPool == NULL ) + { + zASSERT("Pool is no longer valid" != NULL); + return( zERR_POOL_NOT_ACCESSIBLE ); + } + + *deviceID = (ADDR)phyPool->ZCP_dev; + *deviceOffset = poolOffset; + *deviceLength = poolLength; +#else + /* Paul says - I'm making the code look like many devices so I can + * test out logical extents being broken into multiple physical extents. + */ + *deviceID = (ADDR)phyPool->ZCP_dev + (volumeOffset >> 3); + *offset = volumeOffset & MASK(0, 3); + *length = (1 << 3) - *offset; +#endif + + return zOK; +} /* End of ZFSMAL_PhysicalExtent() */ + + +/* + * ZIO_ErrorNewEntry() - + * Returns the 'last' free entry in a ZIO_ErrorLog. If none are + * free then throws out the first entry and returns the last entry. + * + */ + +IOErrorLog_s *ZIO_ErrorNewEntry( Volume_s *volume ) + +{ + if ( ZIO_ErrorLogCount < NELEMS( ZIO_ErrorLog ) ) + { + ++ZIO_ErrorLogCount; + } + else + { /* Move errors down to make space for new one */ + memmove( &ZIO_ErrorLog[0], &ZIO_ErrorLog[1], + NELEMS( ZIO_ErrorLog ) * sizeof( ZIO_ErrorLog[0] ) ); + } + return( &ZIO_ErrorLog[ZIO_ErrorLogCount - 1] ); + +} /* End of ZIO_ErrorNewEntry() */ + + +/* + * doZLSSPoolIOErrors() - + * This function supports the hidden command to display ZLSS Physical + * I/O Errors. 
This command is used internally to gather I/O
 *	information to help in making performance changes to ZLSS.
 *
 *	These statistics are for ALL pools that the ZLSS owns.
 *
 *	Prints one row per valid ZIO_ErrorLog entry (with a header when the
 *	log is not empty).  None of the three parameters is used.  Takes and
 *	releases the MPKNSS lock around the output.  Always returns zOK.
 */

STATUS doZLSSPoolIOErrors(
	PCLSwitchDef_s *switchDef,
	NINT parseOptions,
	void *userParm)

{
	int i;
	IOErrorLog_s *entry;
	char *sysString;
	char *ioString;
	char buffer[64];	/* Receives the text form of the entry's time */

	MPKNSS_LOCK();
	UNUSED_PARAM(switchDef);
	UNUSED_PARAM(parseOptions);
	UNUSED_PARAM(userParm);

	wPause(stdout, -1);
	if ( ZIO_ErrorLogCount != 0 )
	{
		aprintf( CYAN,
			"%-15.15s %-4.4s %-5.5s %-5.5s %-8.8s %-9.9s %-27.27s\n",
			"Pool Name", "Type", "Stat1", "Stat2",
			"Block", "Object", "Time" );
		/* Separator row: exactly one dash string per column (seven).
		 * Each conversion's precision trims the dashes to the column
		 * width.  An eighth, unconsumed argument used to be passed.
		 */
		aprintf( CYAN,
			"%-15.15s %-4.4s %-5.5s %-5.5s %-8.8s %-9.9s %-27.27s\n",
			"-----------------------------------",
			"-----------------------------------",
			"-----------------------------------",
			"-----------------------------------",
			"-----------------------------------",
			"-----------------------------------",
			"-----------------------------------" );
	}
	for ( i = 0; i < ZIO_ErrorLogCount; ++i )
	{
		/* Because of limited room we do not print
		 *
		 *	IOEL_FileBlock AND IOEL_Data
		 *
		 * and only 5 digits of
		 *
		 *	IOEL_Status AND IOEL_ErrorMediaManager
		 *
		 * and only 9 hex digits of
		 *
		 *	IOEL_Zid
		 */
		entry = &ZIO_ErrorLog[i];
		ioString = entry->IOEL_Read ? "R" : "W";	/* Read or Write */
		sysString = entry->IOEL_System ? "S" : "U";	/* System or User block */
		UTCTime2Str( entry->IOEL_UTCTime, &buffer[0] );
		aprintf( LGREEN,
			"%-15.15U %2.2s%-2.2s %5d %5d %8lx %9Lx %-27.27s\n",
			entry->IOEL_PoolName, ioString, sysString, entry->IOEL_Status,
			entry->IOEL_ErrorMediaManager, entry->IOEL_VolumeBlock,
			entry->IOEL_Zid, buffer );
	}
	wPause(stdout, 0);
	MPKNSS_UNLOCK();
	return zOK;

}	/* End of doZLSSPoolIOErrors() */


/*
 * ZIO_ErrorRecord() -
 *	Routine that record I/O errors so that they can be viewed.
I
 *	added because if the error scrolls off of the console and we disable
 *	the pool/volume then we need to be able to say that it was caused
 *	by an I/O error.
 *
 *	buf		- buffer whose I/O failed; supplies the blocks and,
 *			  through its mycache, the owning beast.
 *	consumerRetCode	- stored in IOEL_ErrorMediaManager.
 *	status		- the failure status (must not be zOK).
 *	state		- ZPIOLH_STATE_WRITE_COMPLETE or
 *			  ZPIOLH_STATE_READ_COMPLETE; any other value asserts
 *			  and nothing is recorded.
 */

void ZIO_ErrorRecord(
	Buffer_s *buf,
	LONG consumerRetCode,
	STATUS status,
	int state )

{
	RootBeast_s *beast;
	Volume_s *volume;
	ZfsVolume_s *zVolume;
	ZfsPool_s *zfsPool;
	IOErrorLog_s *ioErrorEntry;
	unicode_t *poolName;
	BOOL read;		/* If true READ else WRITE */

	zASSERT( status != zOK );

	/***
	 ***
	 ***	This can be called on a FAST WORK TO DO (I.E. no blocking!)
	 ***
	 ***/
	switch ( state )
	{
	case ZPIOLH_STATE_WRITE_COMPLETE:
		read = FALSE;
		break;
	case ZPIOLH_STATE_READ_COMPLETE:
		read = TRUE;
		break;

	case ZPIOLH_STATE_WRITE_START:
	case ZPIOLH_STATE_READ_START:
	case ZPIOLH_STATE_READ_ASYNC_START:
	default:
		/* Only completion states can carry an error */
		zASSERT("Illegal state"==NULL);
		return;
	}

	beast = STRUCT( buf->pBuf.mycache, RootBeast_s, ROOTmycache );
	zASSERT( beast != NULL );
	ZLSS_VOLUME_GET( beast, volume );
	zASSERT( volume != NULL );
	zASSERT( COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_VOL) );
	zVolume = VOLUME_TO_ZLSS_VOLUME( volume );
	zfsPool = ZLSS_VOLUME_TO_ZFS_POOL( zVolume );
	zASSERT( zfsPool != NULL );

	/* Claim a log slot (the oldest entry is discarded when the log is
	 * full) and fill it in.
	 */
	ioErrorEntry = ZIO_ErrorNewEntry( volume );
	zASSERT( ioErrorEntry != NULL );
	ioErrorEntry->IOEL_UTCTime = GetUTCTime();
	ioErrorEntry->IOEL_Read = read;
	ioErrorEntry->IOEL_VolumeBlock = buf->volBlk;
	ioErrorEntry->IOEL_FileBlock = buf->pBuf.fileBlk;
	ioErrorEntry->IOEL_ErrorMediaManager = consumerRetCode;
	ioErrorEntry->IOEL_Status = status;
	ioErrorEntry->IOEL_Zid = beast->zid;
	if ( zfsPool->storagepool != NULL )
	{
		poolName = zfsPool->storagepool->poolname;
	}
	else
	{
		poolName = L"";
	}
	/* NOTE(review): unicpy() is given no length; presumably IOEL_PoolName
	 * is sized for the longest pool name -- confirm against IOErrorLog_s.
	 */
	unicpy( ioErrorEntry->IOEL_PoolName, poolName );

	if ( ZLSS_IS_SYSTEM_BLOCK( beast, buf ) )
	{
		ioErrorEntry->IOEL_System = TRUE;
#if NSS_DEBUG IS_ENABLED
		if ( read )
		{
			volume->v_stats.IO_system_read_failure++;
		}
		else
		{
			volume->v_stats.IO_system_write_failure++;
		}
#endif
	}
	else
	{
		ioErrorEntry->IOEL_System = FALSE;
#if NSS_DEBUG IS_ENABLED
		if ( read )
		{
			volume->v_stats.IO_user_read_failure++;
		}
		else
		{
			volume->v_stats.IO_user_write_failure++;
		}
#endif
	}

}	/* End of ZIO_ErrorRecord() */


/* Dead code kept for reference: it is compiled out and is not valid C as
 * it stands (it has no enclosing function, and the default case below has
 * unbalanced parentheses).
 */
#if 0
	Volume_s *volume;
	ZfsPool_s *zfsPool;
	ZlssPool_s *zlssPool;
	IOErrorLog_s *ioErrorEntry;

	zASSERT( status != zOK );
	ZLSS_VOLUME_GET( beast, volume );
	zASSERT( volume != NULL );
	zASSERT( COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_VOL) );
	zlssPool = ZLSS_VOLUME_TO_ZLSS_POOL( (ZfsVolume_s *)volume );
	zASSERT( zlssPool != NULL );
	zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) );
	zfsPool = ZLSS_POOL_TO_ZFS_POOL( zlssPool );
	zASSERT( zfsPool != NULL );
	switch ( status )
	{
	case zERR_READ_FAILURE_POSTPONE:
		DBG_DebugPrintf( LRED,
			"%s postpone has exceeded the retry count\n",
			read ? "Read" : "Write" );
		break;
	case zERR_READ_FAILURE_MEDIA:
		DBG_DebugPrintf( LRED,
			"%s has exceeded the retry count\n",
			read ? "Read" : "Write" );
		break;
	default:
		DBG_DebugPrintf(LRED,MSGNot(
			"Unknown %s error has exceeded the retry count\n",
			read ? "read" : "write" );
		break;
	}
	if ( zfsPool->storagepool != NULL )
	{	/* Mike changed this from errPrintf in file r1.37 because
		 * errPrintf would cause an ABEND when we switched from
		 * P1 to P0.  Was in a FAST WORK TO DO path and blocked!!!
		 * We will let the ALERT system report errors.
		 */
		DBG_DebugPrintf(LRED,
			" Consumer status=%d at block: 0x%x file block: 0x%x on pool: %U.\n",
			(consumerRetCode & 0x0000ffff), buf->volBlk,
			buf->pBuf.fileBlk, zfsPool->storagepool->poolname );
	}
#endif

#if NSS_DEBUG IS_ENABLED

/*
 * ZIO_DBG_HistoryDisplay() -
 *	Displays detailed information of ALL I/O into the =d buffer.  This
 *	occurs only if TZPOOLIO is on in DBG_DebugFlag or ?DBG_TraceFlag?
+ */ + +void ZIO_DBG_HistoryDisplay( + Buffer_s *buf, + int state ) + +{ + + char *modeString; + char v; + char stuff[5]; + RootBeast_s *beast; + Volume_s *volume; + + /*** + *** + *** This can be called on a FAST WORK TO DO (I.E. no blocking!) + *** + ***/ + switch ( state ) + { + case ZPIOLH_STATE_WRITE_COMPLETE: + modeString = "WC "; + break; + case ZPIOLH_STATE_READ_COMPLETE: + modeString = "RC "; + break; + case ZPIOLH_STATE_WRITE_START: + modeString = "WS "; + break; + case ZPIOLH_STATE_READ_START: + modeString = "RS "; + break; + case ZPIOLH_STATE_READ_ASYNC_START: + modeString = "ARS"; + break; + default: + zASSERT("Illegal state"==NULL); + modeString = "---"; + break; + } + + beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache); + ZLSS_VOLUME_GET( beast, volume ); + zASSERT( volume != NULL ); + v = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ? 'V' : ' '); + if ( ZLSS_IS_SYSTEM_BLOCK( beast, buf ) ) + { + memcpy( stuff, buf->pBuf.data, 4 ); + stuff[4] = NULL; + DEBUG_PRINTF(TZPOOLIO,DBG_INDENT, + ( GREEN, + MSGNot(" s%s: VZ=%ld Z=%ld%c FBlk=%ld VBlk=%ld D=%08lx *D=%s\n"), + modeString, + (unsigned long)volume->VOLzid, + (unsigned long)beast->zid,v, + buf->pBuf.fileBlk, + buf->volBlk, + buf->pBuf.data, + stuff + )); + } + else + { + DEBUG_PRINTF(TZPOOLIO,DBG_INDENT, + ( GREEN, + MSGNot(" u%s: VZ=%ld Z=%ld%c FBlk=%ld VBlk=%ld D=%08lx *D=%lx\n"), + modeString, + (unsigned long)volume->VOLzid, + (unsigned long)beast->zid,v, + buf->pBuf.fileBlk, + buf->volBlk, + buf->pBuf.data, + *((LONG *)buf->pBuf.data) + )); + } +} /* End of ZIO_DBG_HistoryDisplay() */ +#endif + + +#if ZLSS_IO_GATHER IS_ENABLED + +/* + * ZIO_DBG_GatherEventComplete() - + * Logs detailed COMPLETE information of ALL I/O into memory. See comments + * of ZLSS_IO_GATHER in zfs.h for details how utility to procees this + * log. 
 */

/*
 * Parameters -
 *	buf   - buffer whose I/O just completed.
 *	state - must be ZPIOLH_STATE_WRITE_COMPLETE or
 *	        ZPIOLH_STATE_READ_COMPLETE; anything else asserts.
 *
 * Appends one ZLSSPoolIOLog_Complete_s record to the in-memory log
 * gZLSSPoolIOLog at offset gZLSSPoolIOLogNext, wrapping to offset 0 when
 * the next offset would pass gZLSSPoolIOLogSize - ZLSS_POOL_IO_LOG_SPACE.
 */
void ZIO_DBG_GatherEventComplete(
	Buffer_s *buf,
	WORD state )

{

	RootBeast_s *beast;
	Volume_s *volume;
	ZLSSPoolIOLog_Complete_s *log;

	/***
	 ***
	 ***  This can be called on a FAST WORK TO DO (I.E. no blocking!)
	 ***
	 ***/
	zASSERT( (gZLSSPoolIOLog != NULL) || !gZLSSPoolIOLogDo );
	switch ( state )
	{
	case ZPIOLH_STATE_WRITE_COMPLETE:
	case ZPIOLH_STATE_READ_COMPLETE:
		break;
	case ZPIOLH_STATE_WRITE_START:
	case ZPIOLH_STATE_READ_START:
	case ZPIOLH_STATE_READ_ASYNC_START:
	default:
		/* Always-false expression: forces the assertion to fire. */
		zASSERT("Illegal state"==NULL);
	}

	/* The record slot is taken BEFORE the wrap test below, so this
	 * record lands at the old offset even when Next is reset to 0.
	 */
	log = (ZLSSPoolIOLog_Complete_s *)(&gZLSSPoolIOLog[gZLSSPoolIOLogNext]);

	gZLSSPoolIOLogNext += sizeof( *log );
	if ( gZLSSPoolIOLogNext > (gZLSSPoolIOLogSize - ZLSS_POOL_IO_LOG_SPACE) )	// FixFixFix
	{
		gZLSSPoolIOFull = TRUE;
		gZLSSPoolIOLogStart = 0;
		gZLSSPoolIOLogNext = 0;
	}
	if ( gZLSSPoolIOFull )
	{
		/* Once the log has wrapped, advance Start record by record
		 * (using each record's stored size) until it is no longer
		 * behind Next.
		 */
		while ( gZLSSPoolIOLogNext > gZLSSPoolIOLogStart )
		{
			gZLSSPoolIOLogStart += ((ZLSSPoolIOLogHeader_s *)(&gZLSSPoolIOLog[gZLSSPoolIOLogStart]))->ZPIOLH_Size;
			if ( gZLSSPoolIOLogStart > (gZLSSPoolIOLogSize - ZLSS_POOL_IO_LOG_SPACE) )	// FixFixFix
			{
				gZLSSPoolIOLogStart = 0;
				break;
			}
		}
	}
	beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
	ZLSS_VOLUME_GET( beast, volume );
//	volume = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ?
//							(Volume_s *)beast :
//							beast->vol.volume );
	/* NOTE(review): unlike ZIO_DBG_GatherEventStart(), volume is not
	 * asserted non-NULL before volume->VOLzid is read below.
	 */
	gZLSSPoolIOHeader->ZPIOH_UTCTime = GetUTCTime();
	log->ZPIOL_C_Header.ZPIOLH_Size = sizeof( *log );
	log->ZPIOL_C_Header.ZPIOLH_Format = ZPIOLH_FORMAT_STANDARD;
	log->ZPIOL_C_Header.ZPIOLH_State = state;
	log->ZPIOL_C_Header.ZPIOLH_ZidVolume = (LONG)volume->VOLzid;
	log->ZPIOL_C_Header.ZPIOLH_Time = microSecondTimer();
	log->ZPIOL_C_BlockPool = buf->volBlk;
	/* First LONG of the block's data is stored with the record. */
	log->ZPIOL_C_Data = *((LONG *)buf->pBuf.data);

} /* End of ZIO_DBG_GatherEventComplete() */


/*
 * ZIO_DBG_GatherEventStart() -
 *	Logs detailed START information of ALL I/O into memory.
See comments + * of ZLSS_IO_GATHER in zfs.h for details how utility to procees this + * log. + */ + +void ZIO_DBG_GatherEventStart( + Buffer_s *buf, + WORD state ) + +{ + + RootBeast_s *beast; + Volume_s *volume; + ZLSSPoolIOLog_Start_s *log; + + zASSERT( (gZLSSPoolIOLog != NULL) || !gZLSSPoolIOLogDo ); + switch ( state ) + { + case ZPIOLH_STATE_WRITE_START: + case ZPIOLH_STATE_READ_START: + case ZPIOLH_STATE_READ_ASYNC_START: + break; + case ZPIOLH_STATE_WRITE_COMPLETE: + case ZPIOLH_STATE_READ_COMPLETE: + default: + zASSERT("Illegal state"==NULL); + } + + log = (ZLSSPoolIOLog_Start_s *)(&gZLSSPoolIOLog[gZLSSPoolIOLogNext]); + gZLSSPoolIOLogNext += sizeof( *log ); + if ( gZLSSPoolIOLogNext > (gZLSSPoolIOLogSize - ZLSS_POOL_IO_LOG_SPACE) ) // FixFixFix + { + gZLSSPoolIOFull = TRUE; + gZLSSPoolIOLogStart = 0; + gZLSSPoolIOLogNext = 0; + } + if ( gZLSSPoolIOFull ) + { + while ( gZLSSPoolIOLogNext > gZLSSPoolIOLogStart ) + { + gZLSSPoolIOLogStart += ((ZLSSPoolIOLogHeader_s *)(&gZLSSPoolIOLog[gZLSSPoolIOLogStart]))->ZPIOLH_Size; + if ( gZLSSPoolIOLogStart > (gZLSSPoolIOLogSize - ZLSS_POOL_IO_LOG_SPACE) ) // FixFixFix + { + gZLSSPoolIOLogStart = 0; + break; + } + } + } + beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache); + ZLSS_VOLUME_GET( beast, volume ); +// volume = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ? 
+// (Volume_s *)beast : +// beast->vol.volume ); + zASSERT( volume != NULL ); + gZLSSPoolIOHeader->ZPIOH_UTCTime = GetUTCTime(); + log->ZPIOL_S_Header.ZPIOLH_Size = sizeof( *log ); + log->ZPIOL_S_Header.ZPIOLH_Format = ZPIOLH_FORMAT_STANDARD; + log->ZPIOL_S_Header.ZPIOLH_State = state; + log->ZPIOL_S_Header.ZPIOLH_ZidVolume = (LONG)volume->VOLzid; + log->ZPIOL_S_Header.ZPIOLH_Time = microSecondTimer(); + log->ZPIOL_S_ZidBeast = (LONG)beast->zid; + log->ZPIOL_S_BlockFile = buf->pBuf.fileBlk; + log->ZPIOL_S_BlockPool = buf->volBlk; + +} /* End of ZIO_DBG_GatherEventStart() */ + + +/* + * ZIO_DBG_GatherEvent() - + * Logs detailed information of ALL I/O into memory. See comments + * of ZLSS_IO_GATHER in zfs.h for details how utility to procees this + * log. + */ + +void ZIO_DBG_GatherEvent( Buffer_s *buf, WORD state ) + +{ + + zASSERT( (gZLSSPoolIOLog != NULL) || !gZLSSPoolIOLogDo ); + switch ( state ) + { + case ZPIOLH_STATE_WRITE_START: + case ZPIOLH_STATE_READ_START: + case ZPIOLH_STATE_READ_ASYNC_START: + ZIO_DBG_GatherEventStart( buf, state ); + break; + case ZPIOLH_STATE_WRITE_COMPLETE: + case ZPIOLH_STATE_READ_COMPLETE: + ZIO_DBG_GatherEventComplete( buf, state ); + break; + deafult: + zASSERT("Illegal state"==NULL); + } + +} /* End of ZIO_DBG_GatherEvent() */ +#endif + + +/* + * ZIO_GatherDetailedSummaryInformation() - + * This tracks the number of IOs done to ZLSS pools. It + * tracks LVDB and all system beast writes as separate counts. In addition + * it counts all user writes. + * + * Notes - + * The command "NSS /ZLSSPoolIOStatistics=PoolName" can be used to + * view this information. This information is persistently tracked. + * The command "NSS /ZLSSPoolIOReset=PoolName" can be used to reset + * the counts. If PoolName is _Summary then the command displays + * the total of IOs to all pools since the ZLSS has been up or + * a reset on _Summary had been done. 
 *
 */

/*
 * Parameters -
 *	buf   - buffer whose I/O completed.
 *	state - ZPIOLH_STATE_READ_COMPLETE or ZPIOLH_STATE_WRITE_COMPLETE;
 *	        any other value asserts and returns without counting.
 *
 * Each counted I/O bumps both the global totals (gZLSSPRS / gZLSSPWS) and
 * the per-pool totals (zlssPool->ZP_PRS / ZP_PWS), and only when
 * ZLSS_POOL_IO_TRACK() is true for the pool.
 */
void ZIO_GatherDetailedSummaryInformation(
	Buffer_s *buf,
	int state )

{
	RootBeast_s *beast;
	Volume_s *volume;
	ZlssPool_s *zlssPool;
	BOOL read;

	/***
	 ***
	 ***  This can be called on a FAST WORK TO DO (I.E. no blocking!)
	 ***
	 ***/
	switch ( state )
	{
	case ZPIOLH_STATE_WRITE_COMPLETE:
		read = FALSE;
		break;
	case ZPIOLH_STATE_READ_COMPLETE:
		read = TRUE;
		break;
	case ZPIOLH_STATE_READ_START:
	case ZPIOLH_STATE_READ_ASYNC_START:
	case ZPIOLH_STATE_WRITE_START:
	default:
		zASSERT("Illegal state"==NULL);
		return;
	}

	beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
	ZLSS_VOLUME_GET( beast, volume );
//	volume = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ?
//							(Volume_s *)beast :
//							beast->vol.volume );
//	zASSERT( volume != NULL );
//	zASSERT( COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_VOL) );
	zlssPool = ZLSS_VOLUME_TO_ZLSS_POOL( (ZfsVolume_s *)volume );
	zASSERT( zlssPool != NULL );
	zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) );

	if ( ZLSS_POOL_IO_TRACK( zlssPool ) )
	{
		if ( read )
		{	/* A Read I/O */
			if ( ZLSS_IS_SYSTEM_BLOCK( beast, buf ) )
			{
				if (beast->bstState & BST_STATE_IS_VOLUME_OR_POOL)
				{	/* Volume's do not have proper ZIDs */
					++gZLSSPRS.PRS_SystemBeast[0];
					++zlssPool->ZP_PRS.PRS_SystemBeast[0];
				}
				else
				{
					/* Slots are indexed by zid; every zid at or
					 * past the last slot shares that last slot.
					 */
					if ( beast->zid < (ZLSS_PRS_SB_COUNT - 1) )
					{
						++gZLSSPRS.PRS_SystemBeast[(unsigned int)(beast->zid)];
						++zlssPool->ZP_PRS.PRS_SystemBeast[(unsigned int)(beast->zid)];
					}
					else
					{
						++gZLSSPRS.PRS_SystemBeast[ZLSS_PRS_SB_COUNT-1];
						++zlssPool->ZP_PRS.PRS_SystemBeast[ZLSS_PRS_SB_COUNT-1];
					}
				}
			}
			else
			{
				++gZLSSPRS.PRS_UserBlock;
				++zlssPool->ZP_PRS.PRS_UserBlock;
			}
		}
		else
		{	/* A Write I/O */
			if ( ZLSS_IS_SYSTEM_BLOCK( beast, buf ) )
			{
				if (beast->bstState & BST_STATE_IS_VOLUME_OR_POOL)
				{	/* Volume's do not have proper ZIDs */
					++gZLSSPWS.PWS_SystemBeast[0];
					++zlssPool->ZP_PWS.PWS_SystemBeast[0];
				}
				else
				{
					/* NOTE(review): the write table is bounded with
					 * the read-side constant ZLSS_PRS_SB_COUNT;
					 * presumably both tables have the same size -
					 * confirm against the structure definitions.
					 */
					if ( beast->zid < (ZLSS_PRS_SB_COUNT-1) )
					{
						++gZLSSPWS.PWS_SystemBeast[(unsigned int)(beast->zid)];
						++zlssPool->ZP_PWS.PWS_SystemBeast[(unsigned int)(beast->zid)];
					}
					else
					{
						++gZLSSPWS.PWS_SystemBeast[ZLSS_PRS_SB_COUNT-1];
						++zlssPool->ZP_PWS.PWS_SystemBeast[ZLSS_PRS_SB_COUNT-1];
					}
				}
			}
			else
			{
				++gZLSSPWS.PWS_UserBlock;
				++zlssPool->ZP_PWS.PWS_UserBlock;
			}
		}
	}

} /* End of ZIO_GatherDetailedSummaryInformation() */


/*
 * ZIO_GatherDetailedSummaryInformationDIO -
 *	Added March 1, 2002 to track Direct I/O.  Paul wants the
 *	I/Os to be tracked as Direct I/O and not in the User I/O
 *	element.  This works out well as DIO does not use 4K
 *	I/O unit size.
 *
 *	state must be ZPIOLH_STATE_DIO_READ_START or
 *	ZPIOLH_STATE_DIO_WRITE_START; any other value asserts and returns.
 *
 *	NOTE(review): dioUnits is accepted but never read; each call adds
 *	exactly one to the DirectIOBlock counters regardless of its value.
 *	Confirm whether the counters were meant to advance by dioUnits.
 */
void ZIO_GatherDetailedSummaryInformationDIO(
	RootBeast_s *beast,
	int state,
	LONG dioUnits )		/* Each unit is DIO_UNIT_SIZE bytes */
{
	Volume_s *volume;
	ZlssPool_s *zlssPool;
	BOOL read;

	switch ( state )
	{
	case ZPIOLH_STATE_DIO_WRITE_START:
		read = FALSE;
		break;
	case ZPIOLH_STATE_DIO_READ_START:
		read = TRUE;
		break;
	default:
		zASSERT("Illegal state"==NULL);
		return;
	}
	ZLSS_VOLUME_GET( beast, volume );
	zlssPool = ZLSS_VOLUME_TO_ZLSS_POOL( (ZfsVolume_s *)volume );
	zASSERT( zlssPool != NULL );
	zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) );

	if ( ZLSS_POOL_IO_TRACK( zlssPool ) )
	{
		if ( read )
		{
			++gZLSSPRS.PRS_DirectIOBlock;
			++zlssPool->ZP_PRS.PRS_DirectIOBlock;
		}
		else
		{
			++gZLSSPWS.PWS_DirectIOBlock;
			++zlssPool->ZP_PWS.PWS_DirectIOBlock;
		}
	}
}

/* For Linux compiler , this needs to be prototyped before calling it */
extern void retEDataBuf(Buffer_s *buf);

/*- (FUNCTION) ----- ZFSMAL_ReadBlkDone() -----------------------------------
 |
 |	MAL Callback when the Async Read has been completed.
 |
 |	bioReq is the request embedded in an Asyncio_s.  On success the
 |	block is decrypted when needed, read statistics are updated and the
 |	waiting FSM is readied.  On failure the read is reissued through
 |	ZFSMAL_asyncReadBlk() until buf->ioRetryCount runs out, after which
 |	the error is recorded and the FSM is readied with aio->status set.
 |
 +-------------------------------------------------------------------------*/
void ZFSMAL_ReadBlkDone(BioReq_s *bioReq)
{
	Asyncio_s *aio = STRUCT(bioReq, Asyncio_s, bioReq);
	Buffer_s *buf = aio->buffer;
	RootBeast_s *beast;
//	ZfsPool_s *ioObj;
	Volume_s *volume;
	STATUS rc;

	/***
	 ***
	 ***  This can be called on a FAST WORK TO DO (I.E. no blocking!)
	 ***
	 ***/
	ENTER(TZPOOLIO, ZFSMAL_ReadBlkDone);
	ASSERT_MPKNSS_LOCK();
	DEBUG_PRINTF(TZPOOLIO,DBG_INDENT,(LRED,MSGNot("Read IO Completed on %08x\n"),buf));

	beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
	ZLSS_VOLUME_GET( beast, volume);
	++IOsInst.readOut;	/*- total read io count of requests at call back -*/
	if ( gZLSSGatherDetailedIOInformation )
	{
		ZIO_GatherDetailedSummaryInformation( buf, ZPIOLH_STATE_READ_COMPLETE );
	}
#if NSS_DEBUG IS_ENABLED
	ZIO_DBG_HistoryDisplay( buf, ZPIOLH_STATE_READ_COMPLETE );
#endif
#if ZLSS_IO_GATHER IS_ENABLED
	/* NOTE(review): this guard is also true when the log pointer is
	 * NULL and gZLSSPoolIOLogDo is FALSE, yet the gather routines index
	 * gZLSSPoolIOLog unconditionally - confirm the intended condition.
	 */
	if ( gZLSSPoolIOLog != NULL || !gZLSSPoolIOLogDo )
	{
		ZIO_DBG_GatherEvent( buf, ZPIOLH_STATE_READ_COMPLETE );
	}
#endif

	if (bioReq->br_status == zOK)
	{
		/* Do the decryption if necessary */
		if((volume->VOLenabledAttributes & zATTR_ENCRYPTED) &&
			(volume->v_statusFlag & VOL_SF_KEYPRESENT) &&
			(buf->eData != NULL))
		{
			rc = ZFS_D_Buf(volume, buf);
			zASSERT(rc == zOK);
			/* The encrypt page is handed back whether or not the
			 * decrypt succeeded.
			 */
			retEDataBuf(buf);
			if(rc != zOK)
			{
				aio->status = rc;
				goto returnError;
			}
		}

		/* get IO stats */
//		ioObj = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ?
//			(ZfsPool_s *)beast : beast->vol.zfsVol->pool);
		if (beast->beastClass->classID <= zFTYPE_FILE)
		{
			volume->v_stats.IO_user_read_success++;
			volume->v_stats.IO_user_readSize += 4096;
		}
#if NSS_DEBUG IS_ENABLED
		else
		{
			volume->v_stats.IO_system_read_success++;
			volume->v_stats.IO_system_readSize += 4096;
		}
#endif

		ZLSS_CopyDataToHimem(buf, beast);

		FSM_READY(&aio->fsm);
		RTN_VOID();
	}
	else
	{
		if((volume->VOLenabledAttributes & zATTR_ENCRYPTED) &&
			(volume->v_statusFlag & VOL_SF_KEYPRESENT) &&
			(buf->eData != NULL))
		{
			retEDataBuf(buf);
		}
		/* Retry until the per-buffer budget is used up. */
		if (--buf->ioRetryCount <= 0)
		{
			aio->status = bioReq->br_status;
			goto returnError;
		}
		ZFSMAL_asyncReadBlk(aio);
		RTN_VOID();
	}

	/* NOTE(review): both branches above leave through RTN_VOID() or the
	 * goto, so this assignment looks unreachable (assuming RTN_VOID()
	 * returns).
	 */
	aio->status = bioReq->br_status;

returnError:
	/***                                                    ***/
	/***  Real error exit here.  Retries exit elsewhere     ***/
	/***                                                    ***/

	zASSERT( aio->status != zOK );
	ZIO_ErrorRecord( buf, bioReq->br_status,
				aio->status, ZPIOLH_STATE_READ_COMPLETE );
	FSM_READY(&aio->fsm);

	RTN_VOID();

} /* End of ZFSMAL_ReadBlkDone() */

/*- (FUNCTION) ----- ZFSMAL_asyncReadBlk() ----------------------------------
 |
 |	MAL Read - use call back when read completes (non-blocking)
 |
 |	Completion, including every early-out below, is reported by readying
 |	aio->fsm; failures are reported through aio->status.  When no
 |	encrypt page is available the request is queued on
 |	EncryptedBufWaitHead with this routine as its action.
 |
 +-------------------------------------------------------------------------*/
void ZFSMAL_asyncReadBlk(Asyncio_s *aio)
{
	RootBeast_s *root;
	Buffer_s *buf = aio->buffer;
	ZfsPool_s *ioObj;
	zConPool_s *phypool;
	Volume_s *vol;
	BOOL cryptRead;

	ENTER(TZPOOLIO, ZFSMAL_asyncReadBlk);

	ASSERT_MPKNSS_LOCK();
	zASSERT(buf->volBlk != 0);

	root = STRUCT(aio->mycache, RootBeast_s, ROOTmycache);
	{	/* This is the special code to handle disabling a
		 * volume.  The LSS is required to stop all physical reads
		 * if the volume's ioFlag indicates the volume is
		 * disabled.  In addition, the Volume's pool is checked.
		 */

		ZLSS_VOLUME_GET( root, vol );
		if ( (ZLSS_VOLUME_IO_DISABLED( (ZfsVolume_s *)vol ) ) )
		{
			DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED,
				MSGNot("Faking a ASYNC READ error (BLK %Ld.) (VOL 0x%x) because in DISABLE volume/pool mode\n"),
				(SQUAD)buf->volBlk, (unsigned long)vol ));
			aio->status = zERR_POOL_DISABLING;
			FSM_READY(&aio->fsm);
			RTN_VOID();
		}
	}
	/* A true return means no physical read is issued. */
	if (ZLSS_CopyDataFromHimem(buf, root))
	{
		FSM_READY(&aio->fsm);
		RTN_VOID();
	}
	ZLSS_INTERNAL_VOLUME_GET( root, ioObj );

	/*- get the storage deposit to access for this volume -*/
	if((phypool = ioObj->storagepool->phypool) == NULL)
	{
		/* NOTE(review): a string literal is never NULL, so this
		 * assertion can never fire; other forced assertions in this
		 * file use ==.  Possibly disabled on purpose - confirm.
		 */
		zASSERT("Pool is no longer valid" != NULL);
		aio->status = zERR_POOL_NOT_ACCESSIBLE;
		FSM_READY(&aio->fsm);
		RTN_VOID();
	}

#if NSS_DEBUG IS_ENABLED
	/* Debug-only fault injection driven by RWErrTestSeed and
	 * RWErrTestInterval.
	 */
	if (RWErrTestSeed)
	{
		RWTestCnt++;
		if (RWTestCnt >= RWErrTestSeed)
		{
			if (RWTestCnt % RWErrTestInterval == 0)
			{
				aio->status = zERR_READ_FAILURE;
				FSM_READY(&aio->fsm);
				RTN_VOID();
			}
		}
	}
#endif

	/*- only set the retry count if zero or > than max. -*/
	/* A retry re-enters here from ZFSMAL_ReadBlkDone() with the
	 * already-decremented count, which must be left alone.
	 */
	if((buf->ioRetryCount == 0) ||
		(buf->ioRetryCount > MAX_IO_RETRY_COUNT))
	{
		buf->ioRetryCount = MAX_IO_RETRY_COUNT;
	}

	/* Determine if this read is for a user data block on an encrypted volume,
	 * or just a regular read
	 */
	cryptRead = ((vol->VOLenabledAttributes & zATTR_ENCRYPTED) &&
				(vol->v_statusFlag & VOL_SF_KEYPRESENT) &&
				(!ZLSS_IS_SYSTEM_BLOCK( root, buf )));
	if (cryptRead)
	{
		zASSERT(buf->eData == NULL);
		STK_POP(COMN_Encrypted_Page_Head, buf->eData, EncryptedBufPage_s, link);
		if (buf->eData == NULL)
		{
			/* No encrypt page free: park this request so that
			 * retEDataBuf() restarts it later.
			 */
			aio->fsm.lite.action = ZFSMAL_asyncReadBlk;
			CIR_ENQ(EncryptedBufWaitHead, aio, fsm.lite.link);
#if NSS_DEBUG IS_ENABLED
			EncryptedBufWaitReadAsync++;
#endif
			RTN_VOID();
		}
	}
	else
	{
		buf->eData = NULL;
	}

	/*- total read io count of requests sent to storage objects -*/
	++IOsInst.readIn;
	/*
		Here the lock is released in the lower level calls where the thread
		leaves NSS domain
	 */

#if NSS_DEBUG IS_ENABLED
	ZIO_DBG_HistoryDisplay( buf, ZPIOLH_STATE_READ_ASYNC_START );
#endif
#if ZLSS_IO_GATHER IS_ENABLED
	if ( gZLSSPoolIOLog != NULL || !gZLSSPoolIOLogDo )
	{
		ZIO_DBG_GatherEvent( buf, ZPIOLH_STATE_READ_ASYNC_START );
	}
#endif

	INIT_BIO_REQ(&aio->bioReq, ZFSMAL_ReadBlkDone);
	zlssBioIOBufferAsync(READ, phypool->ZCP_dev, buf, &aio->bioReq);

	RTN_VOID();
} /* End of ZFSMAL_asyncReadBlk() */

/****************************************************************************
 * Wait record used by a blocking reader that needs an encrypt page.
 * fsm must stay the first member: encryptedAllocBufferContinue() casts
 * the FsmLite_s pointer straight back to this structure.
 *****************************************************************************/
typedef struct EBufWait_s
{
	FsmLite_s fsm;
	ADDR nextThread;	/* thread passed to Continue() on wake-up */
} eBufWait_s;

/****************************************************************************
 * FSM action installed by waitForEncryptedBuffer(): resumes the thread
 * recorded in the wait record.
 *****************************************************************************/
void encryptedAllocBufferContinue (FsmLite_s *fsm)
{
	eBufWait_s *wait = (eBufWait_s *)fsm;
	ASSERT_MPKNSS_LOCK();
	Continue(wait->nextThread);
}

/****************************************************************************
 * NO free buffers so we have to wait.
 *
 * Queues a stack-resident wait record on EncryptedBufWaitHead and blocks
 * in Wait() until encryptedAllocBufferContinue() resumes this thread.
 *****************************************************************************/
void waitForEncryptedBuffer (void)
{
	eBufWait_s wait;

	ENTER(TCACHE, waitForEncryptedBuffer);
	ASSERT_MPKNSS_LOCK();

	FSMLITE_INIT( &wait.fsm, MSGNot("waitForEncryptedBuffer"), 0);

	wait.fsm.action = encryptedAllocBufferContinue;
	wait.nextThread = ThreadId();
	CIR_ENQ(EncryptedBufWaitHead, &wait, fsm.link);
	Wait();
#if NSS_DEBUG IS_ENABLED
	EncryptedBufWaitRead++;
#endif
	RTN_VOID();
}

/****************************************************************************
 * check for an encryption buffer to be returned, if found, return it and wake
 * any threads that may have been waiting for the buffer.
 *
 * Note that the wake-up below runs on every call, even when buf held no
 * encrypt page.
 *****************************************************************************/
void retEDataBuf(Buffer_s *buf)
{
	if(buf->eData != NULL)
	{
		buf->eData->link = NULL;
		STK_PUSH(COMN_Encrypted_Page_Head, buf->eData, link);
		buf->eData = NULL;

	}
	FSM_READYALL(EncryptedBufWaitHead);	/* Wake anyone that is waiting on a buffer */
}

/*- (FUNCTION) ----- ZFSMAL_ReadBlk() ---------------------------------------
 |
 |	MAL Read - return when read completes (blocking)
 |
 |	Returns zOK on success.  On failure returns zFAILURE with the error
 |	code stored in genMsg through SetErrno().
 |
 +-------------------------------------------------------------------------*/
STATUS ZFSMAL_ReadBlk(
	GeneralMsg_s *genMsg,
	Buffer_s *buf)
{
	STATUS status;
	RootBeast_s *root;
	ZfsPool_s *ioObj;
	zConPool_s *phypool;

	Volume_s *vol;
	BOOL cryptRead;

	ENTER(TZPOOLIO, ZFSMAL_ReadBlk);
	ASSERT_MPKNSS_LOCK();
	zASSERT(buf->volBlk != 0);

	root = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);

	{
		/* This is the special code to handle disabling a
		 * volume.  The LSS is required to stop all physical reads
		 * if the volume's ioFlag indicates the volume is
		 * disabled.  In addition, the Volume's pool is checked.
		 */

		ZLSS_VOLUME_GET( root, vol );
		if ( (ZLSS_VOLUME_IO_DISABLED( (ZfsVolume_s *)vol ) ) )
		{
			DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED,
				MSGNot("Faking a READ error (BLK %Ld.) (VOL 0x%x) because in DISABLE volume/pool mode\n"),
				(SQUAD)buf->volBlk, (unsigned long)vol ));
			SetErrno(genMsg, zERR_POOL_DISABLING);
			RTN_STATUS(zFAILURE);
		}
	}
	/* A true return means no physical read is issued. */
	if (ZLSS_CopyDataFromHimem(buf, root))
	{
		RTN_STATUS(zOK);
	}

	ZLSS_INTERNAL_VOLUME_GET( root, ioObj );
	/*- get the storage deposit to access for this volume -*/
	if((phypool = ioObj->storagepool->phypool) == NULL)
	{
		/* NOTE(review): a string literal is never NULL, so this
		 * assertion can never fire - possibly disabled on purpose.
		 */
		zASSERT("Pool is no longer valid" != NULL);
		SetErrno(genMsg, zERR_POOL_NOT_ACCESSIBLE);
		RTN_STATUS(zFAILURE);
	}

#if NSS_DEBUG IS_ENABLED
	/* Debug-only fault injection driven by RWErrTestSeed and
	 * RWErrTestInterval.
	 */
	if (RWErrTestSeed)
	{
		RWTestCnt++;
		if (RWTestCnt >= RWErrTestSeed)
		{
			if (RWTestCnt % RWErrTestInterval == 0)
			{
				SetErrno(genMsg, zERR_READ_FAILURE);
				RTN_STATUS(zFAILURE);
			}
		}
	}
#endif

	cryptRead = ((vol->VOLenabledAttributes & zATTR_ENCRYPTED) &&
				(vol->v_statusFlag & VOL_SF_KEYPRESENT) &&
				(!ZLSS_IS_SYSTEM_BLOCK( root, buf )));
	if (cryptRead)
	{
		zASSERT(buf->eData == NULL);
		/* Blocking path: sleep until an encrypt page is free. */
		while(STK_EMPTY(COMN_Encrypted_Page_Head))
		{
			waitForEncryptedBuffer();
		}
		STK_POP(COMN_Encrypted_Page_Head, buf->eData, EncryptedBufPage_s, link);
	}
	else
	{
		buf->eData = NULL;
	}

	for ( buf->ioRetryCount = 0;
			buf->ioRetryCount <= MAX_IO_RETRY_COUNT; buf->ioRetryCount++ )
	{
#if NSS_DEBUG IS_ENABLED
		ZIO_DBG_HistoryDisplay( buf, ZPIOLH_STATE_READ_START );
#endif
#if ZLSS_IO_GATHER IS_ENABLED
		if ( gZLSSPoolIOLog != NULL || !gZLSSPoolIOLogDo )
		{
			ZIO_DBG_GatherEvent( buf, ZPIOLH_STATE_READ_START );
		}
#endif
		++IOsInst.readWIn;	/* IO count of requests sent to storage objects */


		status = zlssBioIOBuffer(READ, phypool->ZCP_dev, buf);

		/* If this was an encrypted read, decrypt it into the regular buffer,
		 * and free the encrypt buffer
		 */
		if (cryptRead)
		{
			if (status == 0)
			{
				zASSERT(buf->eData != NULL);

				status = ZFS_D_Buf(vol, buf);
				zASSERT(status == zOK);
				if(status != zOK)
				{
					/* on error, we have to return the page to the list,
					 * on success this happens in ZFS_D_Buf */
					SetErrno( genMsg, zERR_NICI_SUPPORT );
					/* arrange for an early out */
					buf->ioRetryCount = MAX_IO_RETRY_COUNT;
				}
			}
			/* NOTE(review): this clears buf->eData on every pass,
			 * including a failed read that is about to be retried,
			 * and no new page is popped before the next pass.  A
			 * retried encrypted read would then reach the
			 * eData != NULL assertion above with eData == NULL -
			 * confirm how zlssBioIOBuffer()/ZFS_D_Buf() treat that.
			 */
			retEDataBuf(buf);
		}

		++IOsInst.readWOut;	/* IO count of requests finished */
		if ( gZLSSGatherDetailedIOInformation )
		{
			ZIO_GatherDetailedSummaryInformation( buf, ZPIOLH_STATE_READ_COMPLETE );
		}
#if NSS_DEBUG IS_ENABLED
		ZIO_DBG_HistoryDisplay( buf, ZPIOLH_STATE_READ_COMPLETE );
#endif
#if ZLSS_IO_GATHER IS_ENABLED
		if ( (gZLSSPoolIOLog != NULL) || !gZLSSPoolIOLogDo )
		{
			ZIO_DBG_GatherEvent( buf, ZPIOLH_STATE_READ_COMPLETE );
		}
#endif
		if ( status == zOK )
		{	/* Read was successful */
			if (root->beastClass->classID <= zFTYPE_FILE)
			{
				((Volume_s *)ioObj)->v_stats.IO_user_read_success++;
				((Volume_s *)ioObj)->v_stats.IO_user_readSize += phypool->pol.poolblocksize;
			}
#if NSS_DEBUG IS_ENABLED
			else
			{
				((Volume_s *)ioObj)->v_stats.IO_system_read_success++;
				((Volume_s *)ioObj)->v_stats.IO_system_readSize += phypool->pol.poolblocksize;
			}
#endif

			ZLSS_CopyDataToHimem(buf, root);

			RTN_STATUS(zOK);
		}
	}	/* End of retry FOR statement */

	/* NOTE(review): this overwrites any errno set inside the loop
	 * (e.g. zERR_NICI_SUPPORT) with the last raw status.
	 */
	SetErrno( genMsg, status );
	ZIO_ErrorRecord( buf, status, GetErrno(genMsg), ZPIOLH_STATE_READ_COMPLETE );
#if NSS_DEBUG IS_ENABLED
	(root->beastClass->classID <= zFTYPE_FILE) ?
		((Volume_s *)ioObj)->v_stats.IO_user_read_failure++ :
		((Volume_s *)ioObj)->v_stats.IO_system_read_failure++;
#endif
	RTN_STATUS(zFAILURE);

} /* End of ZFSMAL_ReadBlk() */


#if NSS_DEBUG IS_ENABLED
/*
 * ZIO_DBG_WriteGuidVerify() -
 *	Verifies that the GUID is correct in the block being written.  All
 *	system beasts have the volume's internal GUID at offset 16 in their block.
 *	This code was added to help find a volume corruption bug.
 *
 * Notes -
 *	We do not do this check when rebuild/verify runs because rebuild uses
 *	the ZLOG file for extra storage.  When rebuild does this it does have
 *	the GUID in the data.
 *
 *	We do not do this during REDO/UNDO because all blocks are written under
 *	the pool because the volumes are not up.
 *
 *	The check is skipped unless the volume is zVOLSTATE_ACTIVE and
 *	buf->pBuf.fileBlk is negative.  On a mismatch the GUIDs and block
 *	details are printed and an assertion fires.
 */

void ZIO_DBG_WriteGuidVerify(
	Buffer_s *buf,
	int state )

{

	Volume_s *volume;
	ZfsVolume_s *zVolume;
	ZfsPool_s *zfsPool;
	RootBeast_s *beast;
	char vBuffer[GUID_FORMAT_SIZE];

	/* Only ZPIOLH_STATE_READ_START and unknown values are rejected. */
	switch ( state )
	{
	case ZPIOLH_STATE_WRITE_START:
	case ZPIOLH_STATE_WRITE_COMPLETE:
	case ZPIOLH_STATE_READ_COMPLETE:
	case ZPIOLH_STATE_READ_ASYNC_START:
		break;
	case ZPIOLH_STATE_READ_START:
	default:
		zASSERT("Illegal state"==NULL);
		return;
	}

	beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
	ZLSS_VOLUME_GET( beast, volume );
	zVolume = VOLUME_TO_ZLSS_VOLUME( volume );
	/**
	 * Looking at the VOLstate is illegal without owning
	 * the state latch.  We do here as
	 *	1) Debug code.
	 *	2) Only care if not in maintenance or REDO/UNDO.  Which
	 *	   is TRUE when zVOLSTATE_ACTIVE is set.
	 */
	if ( (volume->VOLstate != zVOLSTATE_ACTIVE) || ( buf->pBuf.fileBlk >= 0 ) )
	{
		return;
	}
	/* Block GUID (at data offset 16) matches the volume: all good. */
	if ( LB_GUIDCompareWithBlock( (GUIDWithBlock_t *)(buf->pBuf.data+16), &zVolume->ZV_internalID) == 0 )
	{
		return;
	}
	if ( COMN_IsDerivedFrom(zVolume, zFTYPE_ZLSS_ZFSPOOL) )
	{	/** This pool may have been upgraded with
		 * a LV which means some of its OLD metadata
		 * blocks will have an old GUID.
		 */
		zfsPool = (ZfsPool_s *)zVolume;
		if ( LB_GUIDCompareWithBlock( (GUIDWithBlock_t *)(buf->pBuf.data+16),
									&zfsPool->ZFSPOOLoldInternalID) == 0 )
		{
			return;
		}
		/* Display Pool's old GUID */
		LB_GUIDToString( &zfsPool->ZFSPOOLoldInternalID,
						sizeof( vBuffer ), vBuffer );
		DBG_DebugPrintf(LRED,
			MSGNot("zfsPool->ZFSPOOLoldInternalID GUID %s\n"), vBuffer );
	}
	/* Display volume's current GUID */
	LB_GUIDToString( &zVolume->ZV_internalID, sizeof( vBuffer ), vBuffer );
	DBG_DebugPrintf(LRED,MSGNot("zvioObj->ZV_internalID GUID %s\n"), vBuffer );
	/* Display block information */
	LB_GUIDToString( (GUID_t *)(buf->pBuf.data+16),
						sizeof( vBuffer ), vBuffer );
	DBG_DebugPrintf(LRED,"Buffer 0x%lx Block 0x%lx *Data 0x%lx Guid %s\n",
		buf, buf->volBlk, *(LONG *)(buf->pBuf.data), vBuffer );
	zASSERT("GUIDs do not match at write"==NULL);

} /* End of ZIO_DBG_WriteGuidVerify() */
#endif

#if NSS_DEBUG IS_ENABLED
/*
 * ZIO_DBG_ErrorSimulateWrite() -
 *	Debug-only write fault injection.  When WRErrTestSeed is non-zero,
 *	every call bumps WRTestCnt; once the count reaches the seed, every
 *	WRErrTestInterval-th call overwrites *consumerRetCode with
 *	MAL_IO_UNDEFINED_ERROR and forces buf->ioRetryCount to 1.
 *
 *	NOTE(review): WRErrTestInterval is used as a modulus with no zero
 *	check - presumably always set alongside the seed; confirm.
 */
void ZIO_DBG_ErrorSimulateWrite( Buffer_s *buf, LONG *consumerRetCode )

{

	if (WRErrTestSeed)
	{
		RootBeast_s *rootD;

		/* rootD is only consumed by the disabled filter below. */
		rootD = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
//		if ( !ZLSS_IS_SYSTEM_BLOCK( rootD, buf ) )
		{
			WRTestCnt++;
			DBG_DebugPrintf(CYAN,"WRTestCnt - %d\n", WRTestCnt );
			if (WRTestCnt >= WRErrTestSeed)
			{
				if ( (WRTestCnt-WRErrTestSeed) % WRErrTestInterval == 0)
				{
					*consumerRetCode = MAL_IO_UNDEFINED_ERROR;
					/* Set retry count so we FAIL this IO instead of
					 * causing a retry.
					 */
					buf->ioRetryCount = 1;
					DBG_DebugPrintf(LRED,"Generated Fake I/O Error at write count %d\n", WRTestCnt );
				}
			}
		}
	}
} /* End of ZIO_DBG_ErrorSimulateWrite() */
#endif

#ifdef USER_GPACHNER
FsmLite_s gDelaySignalWorkToDoFsm;
BOOL gEatSignal = FALSE;	/* Set to TRUE to delay ONE FT write.
							 * Used to lock HOME into one place so
							 * that ZLOG file will get FULL.
							 */
BOOL gSignalDelay = TRUE;	/* Set to FALSE to UN delay the FT
							 * write.
Used so that the POOL
							 * DISABLE that a full ZLOG generates
							 * can complete.
							 */
NINT gEaten;		/* non-zero once one signal has been held back */
Agent_s *gAgent;	/* agent of the held-back write */
Buffer_s *gBuf;		/* buffer of the held-back write */

/*
 * Work-to-do routine scheduled from ZFS_WriteBlkDone(): polls with
 * LB_delay( 1000 ) until gSignalDelay is cleared, then delivers the
 * held-back completion for gAgent / gBuf.  workToDoFsm is not used.
 */
void ZIO_DelaySignalWorkToDoRoutine( FsmLite_s *workToDoFsm )
{

	WORK_PROCESS_INIT();
	ASSERT_MPKNSS_LOCK();
	do
	{
		LB_delay( 1000 );
	} while (gSignalDelay);

	defaultSignal(gAgent);
	CACHE_SIGNAL_RELEASE(gBuf);
	return;

} /* End of ZIO_DelaySignalWorkToDoRoutine() */
#endif


void ZFS_FsmDoBlockWriteFromSignal (Agent_s *agent);
/*- (FUNCTION) ----- ZFS_WriteBlkDone() -------------------------------------
 |
 |	MAL Callback when the Write has been completed.
 |	NON-BLOCKING - Can be called on FastWorkToDo
 |
 |	bioReq is the request embedded in a Buffer_s.  On success the buffer
 |	is cleaned and its waiters are signalled.  On failure the write is
 |	reissued until buf->ioRetryCount runs out; then the error is
 |	recorded, a volume/pool alert is raised and the buffer is tossed.
 |
 +-------------------------------------------------------------------------*/
void ZFS_WriteBlkDone(BioReq_s *bioReq)
{
	Buffer_s *buf = STRUCT(bioReq, Buffer_s, bioReq);
	Agent_s *agent = &buf->agent;
	STATUS status;
	RootBeast_s *beast;

	ENTER(TZPOOLIO, ZFS_WriteBlkDone);
	ASSERT_MPKNSS_LOCK();

	zASSERT(buf->state & CACHE_DIRTY);

	/* now return the encrypted page to the avail pool if its present, even if
	 * the write completion fails, a new encrypt buf and processing will take
	 * place when the write is sent back to DoBlockWriteFromSignal.
	 */
	retEDataBuf(buf);

	beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
	if ( gZLSSGatherDetailedIOInformation )
	{
		ZIO_GatherDetailedSummaryInformation( buf, ZPIOLH_STATE_WRITE_COMPLETE );
	}
#if NSS_DEBUG IS_ENABLED
	/* May replace br_status with an injected error. */
	ZIO_DBG_ErrorSimulateWrite( buf, &bioReq->br_status );
	ZIO_DBG_HistoryDisplay( buf, ZPIOLH_STATE_WRITE_COMPLETE );
#endif
#if ZLSS_IO_GATHER IS_ENABLED
	if ( gZLSSPoolIOLog != NULL || !gZLSSPoolIOLogDo )
	{
		ZIO_DBG_GatherEvent( buf, ZPIOLH_STATE_WRITE_COMPLETE );
	}
#endif

	/*- total write io count of requests received from storage objects -*/
	--CurrentWriteCount;

	/*- writeOut has no waiters, writeOutXdata has dependent waiters -*/
	if (NO_SIGNALS( &(buf->agent)))
	{
		++IOsInst.writeOut;
	}
	else
	{
		++IOsInst.writeOutXdata;
	}

	if (bioReq->br_status == zOK)
	{
//		RootBeast_s *root;
//		ZfsPool_s *ioObj;
		Volume_s *volume;

		CACHE_CLEAN(buf);
		/* if a writeDone routine is defined, call it instead of doing
		 * the default done processing
		 */
		beast = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache);
		ZLSS_VOLUME_GET( beast, volume );
//		ioObj = ((beast->bstState & BST_STATE_IS_VOLUME_OR_POOL) ?
//			(ZfsPool_s *)beast : beast->vol.zfsVol->pool);
		/* get IO stats */
		if (beast->beastClass->classID <= zFTYPE_FILE)
		{
			volume->v_stats.IO_user_write_success++;
			volume->v_stats.IO_user_writeSize += 4096;
		}
#if NSS_DEBUG IS_ENABLED
		else
		{
			volume->v_stats.IO_system_write_success++;
			volume->v_stats.IO_system_writeSize += 4096;
		}
#endif
		if (buf->writeDone != NULL)
		{
			buf->writeDone(agent);
		}
		else
		{
#ifdef USER_GPACHNER
			/* Developer test hook: hold back exactly one completion
			 * for a system block whose first LONG is FXLEAF_MAGIC.
			 */
			if ( (gEatSignal) && (gEaten == 0) &&
				(ZLSS_IS_SYSTEM_BLOCK(beast, buf )) &&
				( *((LONG *)buf->pBuf.data) == FXLEAF_MAGIC) )
			{
				++gEaten;
				zASSERT("Ate a signal"==NULL);
				gAgent = agent;
				gBuf = buf;
				FSMLITE_INIT( &gDelaySignalWorkToDoFsm /* Lite FSM */,
							MSGNot("Eat signal delay"), 0 /* Instance */ );
				WORK_Schedule( &gDelaySignalWorkToDoFsm,
								ZIO_DelaySignalWorkToDoRoutine, 0);
			}
			else
			{
				defaultSignal(agent);
				CACHE_SIGNAL_RELEASE(buf);
			}
#else
			defaultSignal(agent);
			CACHE_SIGNAL_RELEASE(buf);
#endif
		}

		RTN_VOID();
	}
	else
	{
		/* Retry until the per-buffer budget is used up. */
		if (--buf->ioRetryCount <= 0)
		{
			agent->status = bioReq->br_status;
			goto errorReturn;
		}
		ZFS_FsmDoBlockWriteFromSignal(agent);
		RTN_VOID();
	}

	/* NOTE(review): both branches above leave through RTN_VOID() or the
	 * goto, so this assignment looks unreachable (assuming RTN_VOID()
	 * returns).
	 */
	agent->status = bioReq->br_status;

errorReturn:
	/* Only writes that failed are passing through the following
	 * error handling code.
	 */

	ZIO_ErrorRecord( buf, bioReq->br_status,
				agent->status, ZPIOLH_STATE_WRITE_COMPLETE );
	status = agent->status;
	zASSERT( status != zOK );
	/* We will disable the volume on all serious IO errors to
	 * SYSTEM blocks.
	 *
	 * This paragraph is no longer TRUE.  See next paragraph.  The
	 * exception is that we do not do this in MAINTENANCE state.
	 * The logic is that the /verify or /rebuild code needs to
	 * handle.  For the MAINTENANCE code to catch errors
	 * they have to have their own signal handlers.  The ZLOG code
	 * does this.  It would be very difficult for /Rebuild
	 * because the /Rebuild code uses normal ZLSS code to do
	 * much of the rebuild!!!
	 *
	 * Since /Rebuild does not use its own signal handlers we
	 * will not treat MAINTENANCE state special yet.  A change
	 * volume state is sent to it that indicates we are disabling
	 * the volume.  The ZLSS /rebuild code could use to detect
	 * that a serious error occurred.  FixFixFix6.
	 *
	 * Serious IO error on USER blocks will cause an ALERT to
	 * be sent.
	 */

	if ( ZLSS_IS_SYSTEM_BLOCK(beast, buf ) )
	{	/* This is a SYSTEM Block */
		GeneralMsg_s dummyGenMsg;
		Volume_s *volume;
		NINT flags;

		/* Now tell the common layer that we wish to disable this pool.
		 * This is an ASYNC call.  We require because we can not block.
		 */
		COMN_SETUP_GENERAL_MSG_NOSA(&dummyGenMsg);
		ZLSS_VOLUME_GET( beast, volume );
		flags = CVA_SYSTEM_DATA | CVA_POOL_DISABLE | CVA_POOL_ALERT;
		/* A non-pool volume is additionally disabled and alerted. */
		if ( !COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_ZFSPOOL) )
		{
			flags |= CVA_VOLUME_DISABLE | CVA_VOLUME_ALERT;
		}
		(void)COMN_VolumeAlert( &dummyGenMsg, beast, volume, buf,
				buf->pBuf.fileBlk, buf->volBlk,
				status, WHERE, flags );
	}
	else
	{	/* This is a USER Block */
		GeneralMsg_s dummyGenMsg;
		Volume_s *volume;
		NINT flags;

		/* Now tell the common layer about this error.
		 * This is an ASYNC call.  We require because we can not block.
		 */
		COMN_SETUP_GENERAL_MSG_NOSA(&dummyGenMsg);
		ZLSS_VOLUME_GET( beast, volume );
		if ( COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_ZFSPOOL) )
		{
			flags = CVA_POOL_ALERT;
		}
		else
		{
			flags = CVA_VOLUME_ALERT;
		}
		(void)COMN_VolumeAlert( &dummyGenMsg, beast, volume, buf,
				buf->pBuf.fileBlk, buf->volBlk, status, WHERE,
				flags );
	}

	CACHE_CLEAN(buf);
	if (buf->writeDone != NULL)
	{
		buf->writeDone(agent);
	}
	else
	{
		defaultSignal(agent);
		CACHE_SIGNAL_RELEASE_TOSS(buf);	/* invalidate the cache block */
	}
	RTN_VOID();

} /* End of ZFS_WriteBlkDone() */

/*- (FUNCTION) ----- ZFS_WriteBlkDoneNoWrite() ---------------------------
 |
 |	FSM action for a buffer that needs no physical write: cleans the
 |	buffer and runs the same completion signalling as a successful
 |	write.  fsm is the FSM embedded in the buffer's agent.
 |
 +-------------------------------------------------------------------------*/

void ZFS_WriteBlkDoneNoWrite(FsmLite_s *fsm)
{
	Agent_s *agent;
	Buffer_s *buf;

	agent = STRUCT(fsm,Agent_s,fsm);
	buf = STRUCT(agent,Buffer_s,agent);

	CACHE_CLEAN(buf);
	if (buf->writeDone != NULL)
	{
		buf->writeDone(agent);
	}
	else
	{
		defaultSignal(agent);
		CACHE_SIGNAL_RELEASE(buf);
	}

} /* End of ZFS_WriteBlkDoneNoWrite() */


/*- (FUNCTION) ----- ZFS_WriteBlkDoneForError() ---------------------------
 |
 |	FSM action that copies agent->status into the buffer's bioReq and
 |	then runs the normal write-completion path on it.
 |
 +-------------------------------------------------------------------------*/

void ZFS_WriteBlkDoneForError(FsmLite_s *fsm)
{
	Agent_s *agent;
	Buffer_s *buffer;

	agent = STRUCT(fsm,Agent_s,fsm);
	buffer = STRUCT(agent, Buffer_s, agent);

	buffer->bioReq.br_status = agent->status;

	ZFS_WriteBlkDone(&buffer->bioReq);
}



/*- (FUNCTION) ----- ZFS_DoBlockWriteFromSignal() ---------------------------
 |
 |	This is the actual routine that issues a write on a data block.  The
 |	user can pass in the address of a routine that is called by the
 |	"ZFS_WriteBlkDone" routine.
 |
 |	Called as an FSM?  (ZLOG does)
 |
 |	This routine can be called with the agent == NULL.  This will test if any
 |	available requests have been queued.
+ | + | All return values go to the DBWFSQueLoop to check que or exit. + | + | Summary of the paths below: a buffer that is not CACHE_DIRTY, or whose + | volume has I/O disabled, is completed through ZFS_WriteBlkDoneNoWrite; + | a CACHE_TOSS buffer is cleaned and signalled inline; otherwise the + | block is (optionally encrypted and) written asynchronously with + | ZFS_WriteBlkDone as the completion routine, or, when no physical pool + | is available, the failure is reported through COMN_VolumeAlert(). + +-------------------------------------------------------------------------*/ +void ZFS_FsmDoBlockWriteFromSignal (Agent_s *agent) +{ + RootBeast_s *root; + Buffer_s *buf; + ZfsPool_s *ioObj; + zConPool_s *phypool; + AgentSignalFunc_t writeDoneHandler; + Volume_s *vol; + STATUS rc; + BOOL cryptWrite; + + + /* NOTE(review): ENTER names ZFS_DoBlockWriteFromSignal, not this function - confirm intent */ ENTER(TBOND, ZFS_DoBlockWriteFromSignal); + ASSERT_MPKNSS_LOCK(); + + buf = STRUCT(agent, Buffer_s, agent); + writeDoneHandler = buf->writeDone; + + if (!(buf->state & CACHE_DIRTY)) + { /* + * Since this buffer is not dirty, we don't have + * to write the buffer but we still need to process + * the signals. + * + * We schedule an FSM to process the signals, because we + * can run into cases where the copy cache buffer code can + * end up creating a long chain of copied/not dirtied buffers and + * signaling back through the list could overflow the stack. + */ + FSM_ACTIVATE(&agent->fsm, ZFS_WriteBlkDoneNoWrite); + RTN_VOID(); + } + if (buf->state & CACHE_TOSS) + { /* + * Since this buffer is being tossed, we don't have + * to write the buffer but we still need to process + * the signals. + */ + CACHE_CLEAN(buf); + if (writeDoneHandler != NULL) + { + writeDoneHandler(agent); + } + else + { + defaultSignal(agent); + CACHE_SIGNAL_RELEASE(buf); + } + RTN_VOID(); + } + + +#if NSS_DEBUG IS_ENABLED + /* + * The 'CrashPools' global is used when we wish to fake a crash by + * not writing most blocks. This has been modified to skip + * all blocks except zlog buffers (during logging). Note that + * checkpoints are in the superblock which is also not affected + * by the CrashPools global. We also now use CrashPools with + * the /nodata switch to stop writes, but not exit right away. + * + * The gZCL_SkipWrites will prevent even log buffers from + * being written. This is usually used so we can run through + * the same recovery setup multiple times. 
+ */ + if ( (CrashPools && (writeDoneHandler == NULL)) || (gZCL_SkipWrites) ) + { /* + * This is the code that ZFSPOOL_WritePoolBlkDone would + * do after a call back for the lower levels. + */ + CACHE_CLEAN(buf); + if (writeDoneHandler != NULL) + { + writeDoneHandler(agent); + } + else + { + defaultSignal(agent); + CACHE_SIGNAL_RELEASE(buf); + } + RTN_VOID(); + } +#endif + + zASSERT(buf->state & CACHE_DIRTY); + + root = STRUCT(buf->pBuf.mycache, RootBeast_s, ROOTmycache); + ZLSS_INTERNAL_VOLUME_GET( root, ioObj ); + zASSERT(buf->volBlk != 0); + + { + /* This is the special code to handle disabling a + * volume. The LSS is required to stop all writes + * if the volume's ioFlag indicates the volume is + * disabled. In addition, the Volume's pool is checked. + * + * NOTE(review): the debug message below prints an + * unsigned long argument with a 0x%x conversion - confirm + * the format matches on the target platform. + */ + + ZLSS_VOLUME_GET( root, vol ); + if ( (ZLSS_VOLUME_IO_DISABLED( (ZfsVolume_s *)vol ) ) ) + { + DEBUG_PRINTF(TPOOL,DBG_NOINDENT,(LRED, + MSGNot("Faking a write (BLK %Ld.) to (VOL 0x%x) because in DISABLE volume mode\n"), + (SQUAD)buf->volBlk, (unsigned long)vol )); +// agent->status = ioObj->ZFSPOOLvol.v_disableVolumeStatus; + agent->status = zERR_POOL_DISABLING; + FSM_ACTIVATE(&agent->fsm, ZFS_WriteBlkDoneNoWrite); + RTN_VOID(); + } + } + +#if NSS_DEBUG IS_ENABLED + ZIO_DBG_WriteGuidVerify( buf, ZPIOLH_STATE_WRITE_START ); +#endif + +#if NSS_DEBUG IS_ENABLED + ZIO_DBG_HistoryDisplay( buf, ZPIOLH_STATE_WRITE_START ); +#endif +#if ZLSS_IO_GATHER IS_ENABLED + if ( gZLSSPoolIOLog != NULL || !gZLSSPoolIOLogDo ) + { + ZIO_DBG_GatherEvent( buf, ZPIOLH_STATE_WRITE_START ); + } +#endif + + /*- get the storage deposit to access for this volume -*/ + phypool = ioObj->storagepool->phypool; + if( phypool != NULL ) + { + /*- only set the retry count if zero or > than max. -*/ + if((buf->ioRetryCount == 0) || + (buf->ioRetryCount > MAX_IO_RETRY_COUNT)) + { + buf->ioRetryCount = MAX_IO_RETRY_COUNT; + } + /* We should never be writing the sparse buffer because + * it does not contain correct block information. 
+ */ + zASSERT( buf != &CACHE_SparseBuffer); + + /* See if this is an encrypted buffer write, if so, + * we need to allocate a buffer and encrypt the user data. + * Only non-system blocks of a volume that has the encrypted + * attribute and a key present are encrypted. */ + cryptWrite = ((vol->VOLenabledAttributes & zATTR_ENCRYPTED) && + (vol->v_statusFlag & VOL_SF_KEYPRESENT) && + (!ZLSS_IS_SYSTEM_BLOCK( root, buf ))); + + if (cryptWrite) + { + zASSERT(buf->eData == NULL); + STK_POP(COMN_Encrypted_Page_Head, buf->eData, EncryptedBufPage_s, link); + if (buf->eData == NULL) + { /* No encryption page available: queue the agent on EncryptedBufWaitHead with this routine as its FSM action. */ + agent->fsm.action = ZFS_FsmDoBlockWriteFromSignal; + CIR_ENQ(EncryptedBufWaitHead, agent, fsm.link); +#if NSS_DEBUG IS_ENABLED + EncryptedBufWaitWrite++; +#endif + RTN_VOID(); + } + /* encrypt buf->pBuf.data into buf->eData in COMN's context */ + rc = ZFS_E_Buf(vol, buf); + if(rc != zOK) + { + agent->status = zERR_NICI_SUPPORT; + retEDataBuf(buf); + goto cryptFail; + } + } + else + { + buf->eData = NULL; + } + + /*- writeIn no waiters, writeInXdata has dependent waiters -*/ + if (NO_SIGNALS( &(buf->agent))) + { + ++IOsInst.writeIn; + } + else + { + ++IOsInst.writeInXdata; + } + /*- total io count of requests sent to storage objects -*/ + ++CurrentWriteCount; + + ZLSS_CopyDataToHimem(buf, root); + + INIT_BIO_REQ(&buf->bioReq, ZFS_WriteBlkDone); + zlssBioIOBufferAsync(WRITE, phypool->ZCP_dev, buf, &buf->bioReq); + RTN_VOID(); + +cryptFail: /* encryption failed: finish through ZFS_WriteBlkDoneForError on the FSM */ + FSM_ACTIVATE(&agent->fsm, ZFS_WriteBlkDoneForError); + + +#if NSS_DEBUG IS_ENABLED + (root->beastClass->classID <= zFTYPE_FILE) ? 
+ ((Volume_s *)ioObj)->v_stats.IO_user_write_failure++ : + ((Volume_s *)ioObj)->v_stats.IO_system_write_failure++; +#endif + RTN_VOID(); + } + + /* Reached only when phypool is NULL: assert, log, and raise an alert. */ zASSERT("Pool is no longer valid" == NULL); + errPrintf(WHERE, Module, 1443, + MSG("Physical pool \"%U\" is unavailable - Write Request failed at " + "pool block: 0x%x file block: 0x%x", 430), + ioObj->storagepool->poolname, + buf->volBlk, + buf->pBuf.fileBlk); + + agent->status = zERR_POOL_NOT_ACCESSIBLE; + + /* See comments that are in ZFS_WriteBlkDone */ + if ( ZLSS_IS_SYSTEM_BLOCK(root, buf ) ) + { /* This is a SYSTEM Block */ + GeneralMsg_s dummyGenMsg; + Volume_s *volume; + NINT flags; + + /* Now tell the common layer that we wish to disable this volume. + * This is an ASYNC call. We require because we can not block. + */ + COMN_SETUP_GENERAL_MSG_NOSA(&dummyGenMsg); + ZLSS_VOLUME_GET( root, volume ); + flags = CVA_SYSTEM_DATA | CVA_POOL_DISABLE | CVA_POOL_ALERT; + if ( !COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_ZFSPOOL) ) + { + flags |= CVA_VOLUME_DISABLE | CVA_VOLUME_ALERT; + } + (void)COMN_VolumeAlert( &dummyGenMsg, root, volume, buf, + buf->pBuf.fileBlk, buf->volBlk, + agent->status, WHERE, flags ); + } + else + { /* This is a USER Block */ + GeneralMsg_s dummyGenMsg; + Volume_s *volume; + + /* Now tell the common layer about this error. + * This is an ASYNC call. We require because we can not block. 
+ */ + COMN_SETUP_GENERAL_MSG_NOSA(&dummyGenMsg); + ZLSS_VOLUME_GET( root, volume ); + (void)COMN_VolumeAlert( &dummyGenMsg, root, volume, buf, + buf->pBuf.fileBlk, buf->volBlk, + agent->status, WHERE, + CVA_VOLUME_ALERT | CVA_POOL_ALERT ); + } + + CACHE_CLEAN(buf); + if (writeDoneHandler != NULL) + { + writeDoneHandler(agent); + } + else + { + defaultSignal(agent); + CACHE_SIGNAL_RELEASE_TOSS(buf); + } + RTN_VOID(); + +} /* End of ZFS_FsmDoBlockWriteFromSignal() */ + + + +/* Entry point for issuing a block write: prepares the buffer for flushing, stores the caller's completion handler in buf->writeDone and then runs ZFS_FsmDoBlockWriteFromSignal() on the buffer's agent. */ void ZFS_DoBlockWriteFromSignal(Agent_s *agent, AgentSignalFunc_t writeDoneHandler) +{ + Buffer_s *buf; + + ENTER(TBOND, ZFS_DoBlockWriteFromSignal); + ASSERT_MPKNSS_LOCK(); + + buf = STRUCT(agent, Buffer_s, agent); + /* This removes the buffer from the LRU */ + cachePrepareToFlush(buf); + buf->writeDone = writeDoneHandler; + + ZFS_FsmDoBlockWriteFromSignal(&buf->agent); + RTN_VOID(); +} + + +/*********************************************************************** + * ZLSS_AsyncMetadataReadAhead + * ZLSS_AsyncMetaDataReadDone + * + * Used to readahead metadata blocks. 
+ ***********************************************************************/ +void ZLSS_AsyncMetaDataReadDone( + Asyncio_s *asyncio) +{ + if (asyncio->status != zOK) + { + asyncio->buffer->agent.status = asyncio->status; + } + CACHE_RELEASE(asyncio->buffer); + asyncio->buffer = NULL; + + freeAsyncioRA(asyncio); +} + +void ZLSS_AsyncMetadataReadAhead( + MyCache_s *mycache, + Blknum_t *blockList, + NINT blockCount) +{ + int i; + Buffer_s *buffer; + Asyncio_s *asyncio = NULL; + Blknum_t raBlock; +// int numRA = 0; + + for (i = 0; i < blockCount; i++) + { + raBlock = *blockList++; + if (cacheFind(mycache, -raBlock)) + { + continue; + } + asyncio = getAsyncioNoWaitRA(); + if (asyncio == NULL) + { +// printk("<1> Error Actual RA Requested numBlocks = %d\n", numRA); + return; + } + buffer = cacheAllocBufferForCopy(mycache, -raBlock, raBlock, + ZFS_BlockSignalHandler, FALSE, CACHE_READ); + if (buffer == NULL) + { + freeAsyncio(asyncio); +// printk("<1> Error Actual RA Requested numBlocks = %d\n", numRA); + return; + } + INIT_AIO(asyncio, mycache, -raBlock, CACHE_READ); + asyncio->volBlk = raBlock; + asyncio->buffer = buffer; + + FSM_PUSH(&asyncio->fsm, ZLSS_AsyncMetaDataReadDone); + ZFSMAL_asyncReadBlk(asyncio); +// printk("<1> ReadAhead Block = %d\n", raBlock); +// numRA++; + } +// if (numRA > 0) +// printk("<1> Actual RA Requested numBlocks = %d\n", numRA); +} diff --git a/src/nwnss/zlss/zlog.c b/src/nwnss/zlss/zlog.c new file mode 100644 index 0000000..55f5e03 --- /dev/null +++ b/src/nwnss/zlss/zlog.c @@ -0,0 +1,5866 @@ +/**************************************************************************** + | + | (C) Copyright 1995-2000, 2007 Novell, Inc. + | All Rights Reserved. + | + | This program is free software; you can redistribute it and/or + | modify it under the terms of version 2 of the GNU General Public + | License as published by the Free Software Foundation. 
+ | + | This program is distributed in the hope that it will be useful, + | but WITHOUT ANY WARRANTY; without even the implied warranty of + | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + | GNU General Public License for more details. + | + | You should have received a copy of the GNU General Public License + | along with this program; if not, contact Novell, Inc. + | + | To contact Novell about this file by physical or electronic mail, + | you may find current contact information at www.novell.com + | + |*************************************************************************** + | + | The ZFS Log (ZLOG) component of ZFS + | + |--------------------------------------------------------------------------- + | + | $Author: gpachner $ + | $Date: 2007-06-07 02:25:28 +0530 (Thu, 07 Jun 2007) $ + | + | $RCSfile$ + | $Revision: 2044 $ + | + |--------------------------------------------------------------------------- + | This module is used to: + | All ZLOG code. + | For detailed information about ZLOG see the ZLOG Specification. 
+ | + +-------------------------------------------------------------------------*/ +#if zLINUX && !defined(NSS_USERSPACE) +#include +#endif +#if defined(NSS_USERSPACE) +#include +#ifndef EOPNOTSUPP +#define EOPNOTSUPP 95 +#endif +extern void abort(void); +static void nssZlssUserspaceAbend(const char *message) +{ + (void)message; + abort(); +} +#define Abend(_message) nssZlssUserspaceAbend(_message) +#endif + +#include +#include /* NSS Library */ +#include +#include +#include +#include +#include /* For ZLOG debug */ +#include +#include + +#include "zParams.h" +#include "zstoreConfig.h" +#include "zfsAsyncio.h" +#include "zfsSuperBlk.h" +#include "zfs.h" +#include "fsm.h" +#include "zlog.h" +#include "comnIO.h" +#include "comnPublics.h" +#include "zfsXTree.h" +#include "pssDebug.h" +#include "zlssStartup.h" +#include "nssOSAPIs.h" +#include "zlssLogicalVolume.h" +#include "virtualIO.h" +#include "pssConfig.h" + + /* + * This is the value of the minimun number of extents that the Log File + * will use. This is to allow the IO's to be spread across the pool + * space. If the pool space constructed with multiple drives then this + * will allow spindle striping. Do not change this value without + * changing the seed[] code Zlog_PoolInitialize(). + */ +#define ZLOG_EXTENT_MIN 8 + + +#if NSS_DEBUG IS_ENABLED +#if LOG_TEST IS_ENABLED + + /* + * Set these in the debugger to specify a range of undo/redo functions + * that will be tested. By default we test all undo/redo functions + * when /logtest is supplied on the command line. Xaction.h has the + * defines for all of the undo/redo function numbers. + */ +NINT xfirst = 0; +NINT xlast = XFUNC_MAX; +#endif + +#if NSS_ASSERT IS_ENABLED +int ZLOGRecoveryGetCount; +#endif + +#if ZLOG_TEST IS_ENABLED +BOOL ZlogUnitTest = FALSE; /* Default is not to run unit test */ +#endif + +#endif + +/* + * FixFixFix6(Cleanup) Remove #if 0's and 1's. Update comments with spec + * comments. General clean-up before release. 
+ */ + +/* FixFixFix6(Neal) seems like we are missing a toss/flush on deactivate or +shutdown because if I do /poolcompare then I get cache mycache mad at me. I +believe this is on the POOL mycache since the test uses it!!! Also if I +do a load unss /copy=test /poolr=test then I get an assert because +?ZLOG? cache is not empty. */ /* Neal says I need to toss ZFSPOOL after use, +I say seems like deactivate or something else should handle for a long term +solution. */ + +/* + * + * The following comments explain some important implementation facts + * that will be helpful in understanding and modifying the ZLOG code. + * + * *** ZLOG Specification + * + * There is a design/implementation specification for ZLOG. It is + * stored on the inner WEB at + * + * http://137.65.73.185/docs/pssdev/secure/transactions/ + * + * Start by viewing 'zlog1st.html' and then either 'zlogweb.html' or + * 'zlogweb.doc'. + * + * *** Read/Write error handling + * + * ZLOG adheres to the NSS read/write design. This design is as + * follows. + * 1) Write errors will cause low level routines to start a + * de-activation of the pool. Higher level routines + * can ignore write errors as long as ignoring the + * error does not corrupt the pool. Otherwise the + * error must be handled and passed up. In addition, + * no further writes will be allowed to the + * pool. These future writes will return errors or zOK + * (but the write will not be done)? I vote for errors. + * 2) If all checkpoints can not be read then activation must fail, + * otherwise activation must continue. + * 3) A read error during recovery needs to be passed up. The + * activation code will then place the pool in read only mode. + * 4) Read errors while activated must be handled and passed up. + * + * + * *** ZLOG's latch use for serialization and access. + * + * ZLOG uses the beastLatch to serialize ZLOG Beast Access. 
The ZLOG + * beast is different from most other beasts in that it uses the + * beastLatch to serialize operations. + * + * *** Function and variable naming conventions. + * + * All routines and variables associated with ZLOG start with some form of ZLOG. + * + * ZLOG_ - ZSTORE external (ZLOG specified API or routine) + * Zlog_ - NSS internal (NSS required API (mostly beast functions)) + * zlog_ - ZLOG internal + * + * FSM routine names. All related helper FSM routine names start with + * the same 6 or so letters. The form is zlog_XYZ... where XYZ are the + * first letters of the routine name that is running. I have asked the + * debugger team to allow setting breakpoints with wild cards. When + * this is done, all FSM routines in a specific "family" can have + * breakpoints set on them with one command. See ZLOG_TransactionHomed() + * and this will be clear. + * + * FSM routines always follow the routine that calls them. Normally, + * they follow immediately after. This makes following the logic + * pretty easy because the routine that is next in the FSM is just + * a few lines after the current routine. This helps in setting + * breakpoints because the routine start will be in view. To do + * this all ZLOG functions are prototyped in zlog.h. + * + * + */ + + +/* + * + * + * The following are the execution states that are supported in NSS. The + * callers of a routine have requirements as to how a routine executes. + * These requirements are noted in all routines. + * + * Execution Options + * Asynchronous - May complete work after returning. Usually the + * caller is notified when work completes via a callback. + * Synchronous - Must be done with work before returning + * + * Blocking - Allowed to block to accomplish work. + * Non-Blocking - Not allowed to block to accomplish work. + * + * FSM - Asynchronous and Non-Blocking routine that calls routine on + * top of FSM stack when work has completed. A single thread + * called an FSM Engine can process many FSMs. 
+ * Thread - Async or Sync and Blocking or Non-blocking code. + * + */ + + +/**************************************************************************** + * ZFSLOG COMMON BEAST OPERATIONS definition + *****************************************************************************/ + +/*--------------------------------------------------------------------------- + * Defines all of the ZFS log beast operations + * + * All operations are inherited from parent if set to NULL. The + * construct and destruct operations are never inherited. + * + * Only the constructor (Zlog_Construct) and destructor (Zlog_Destruct) + * are supplied in this table; every other slot is NULL. + * + *---------------------------------------------------------------------------*/ +CommonBeastOps_s ZlogComnBeastOps = +{ + Zlog_Construct, /* construct */ + Zlog_Destruct, /* destruct */ + +// cnt NULL, /* BST_getNameUniquifier */ + NULL, /* BST_setupNameTypeSpecificInfo */ + NULL, /* BST_lookupByNameInDirectory*/ + NULL, /* BST_isDirectoryEmpty*/ + NULL, /* BST_addNameToDirectory*/ + NULL, /* BST_removeNameFromDirectory*/ + NULL, /* BST_modifyNameSpaceMaskInDirectory*/ + NULL, /* BST_setMatchAttributesInDirectory*/ + NULL, /* BST_wildcardLookup*/ + + NULL, /* BST_truncateFile*/ + NULL, /* BST_getStorageInfo*/ + NULL, /* BST_getExtentList*/ + NULL, /* BST_getPhysicalExtent*/ + NULL, /* BST_isBlockInBeast*/ + + NULL, /* BST_asyncReadFileBlk*/ + NULL, /* BST_getFileBlk*/ + NULL, /* BST_dfsReadUnits*/ + NULL, /* BST_dfsWriteUnits*/ + + NULL, /* BST_getZID*/ + NULL, /* BST_beastNotify*/ + NULL, /* BST_getInfo*/ + NULL /* BST_modifyInfo*/ +}; + + +#if ZLOG_DEBUG IS_ENABLED +ZlogBeast_s *gZlogBeast; +LONG gZlogAssertBlock = 0xffffffffuL; +#endif + + /* We assume that ZLOG_HISTOGRAM_CONSTRUCT is called before + * ZLOG_HISTOGRAM_INIT. 
+ */ +/* Resets a histogram: zeroes ZH_BucketCount buckets, sized as LONG when ZH_F_LONG is set and as QUAD otherwise, and clears the persistent current-event and event-count fields. Asserts that exactly one of ZH_F_LONG / ZH_F_QUAD is set. */ void zlog_HistogramInitialize( ZlogHistogram_s *histogram ) + +{ + + zASSERT( histogram->ZH_Flags & (ZH_F_LONG|ZH_F_QUAD) ); + zASSERT( (histogram->ZH_Flags & (ZH_F_LONG|ZH_F_QUAD)) != (ZH_F_LONG|ZH_F_QUAD)); + if ( histogram->ZH_Flags & ZH_F_LONG ) + { + bzero( histogram->ZH_Bucket, + histogram->ZH_BucketCount * sizeof(LONG) ); + } + else + { + bzero( histogram->ZH_Bucket, + histogram->ZH_BucketCount * sizeof(QUAD) ); + } + histogram->ZH_P->ZHP_CurrentEvent = 0; + histogram->ZH_P->ZHP_EventCount = 0; + +} /* End of zlog_HistogramInitialize() */ + +/* + * Binds a histogram descriptor to its storage: records the bucket count, + * bucket size, flags, persistent block and bucket array in the structure. + * Nothing is zeroed here; zlog_HistogramInitialize() does that. + */ +void zlog_HistogramConstruct( + ZlogHistogram_s *histogram, + LONG count, + LONG size, + LONG flags, + ZlogHistogramPersistent_s *persistent, + void *bucket ) +{ + + histogram->ZH_BucketCount = count; + histogram->ZH_BucketSize = size; + histogram->ZH_Flags = flags; + histogram->ZH_P = persistent; + histogram->ZH_Bucket = bucket; + +} /* End of zlog_HistogramConstruct() */ + + + /* We assume that zlog_HistoryConstruct is called before + * ZLOG_HISTORY_INIT. 
+ */ +/* Zeroes the ZH_WatermarkCount watermark slots of a history. Asserts that the count is greater than 0 and less than 50. */ void zlog_HistoryInitialize( + ZlogHistory_s *history ) +{ + + zASSERT( history->ZH_WatermarkCount > 0 ); + zASSERT( history->ZH_WatermarkCount < 50 ); + bzero( history->ZH_Slots, + sizeof( (*(history->ZH_Slots)) ) * history->ZH_WatermarkCount ); +} /* End of zlog_HistoryInitialize() */ + + +/* Binds a history descriptor to its slot array: records the slot count and the slot pointer. Nothing is zeroed here. */ void zlog_HistoryConstruct( + ZlogHistory_s *history, + LONG historyCount, + ZlogHighWatermarkSlot_s *historySlots ) +{ + + history->ZH_WatermarkCount = historyCount; + history->ZH_Slots = historySlots; + +} /* End of zlog_HistoryConstruct() */ + + +/* Resets the ZLOG statistics of a beast: re-initializes its histograms and histories, zeroes the persistent read and write statistics, and stamps the reset time with the current UTC time. */ void zlog_initAllPersistentStatistics( + ZlogBeast_s *zlogBeast ) + +{ + NINT i; + + zlog_HistogramInitialize( &zlogBeast->ZLB_ReferenceBlockCountHistogram ); + zlog_HistogramInitialize( &zlogBeast->ZLB_FunctionHistogram ); + zlog_HistogramInitialize( &zlogBeast->ZLB_SizeHistogram ); + zlog_HistogramInitialize( &zlogBeast->ZLB_BlockInuseCountHistogram ); + zlog_HistogramInitialize( &zlogBeast->ZLB_RecoveryTimeHistogram ); + zlog_HistogramInitialize( &zlogBeast->ZLB_DeferredWritesHistogram ); + + zlog_HistoryInitialize( &zlogBeast->ZLB_ReferenceBlockCountHistory ); + zlog_HistoryInitialize( &zlogBeast->ZLB_BlockInuseCountHistory ); + zlog_HistoryInitialize( &zlogBeast->ZLB_RecoveryTimeHistory ); + zlog_HistoryInitialize( &zlogBeast->ZLB_DeferredWritesHistory ); + + for ( i = 0; i < NELEMS( zlogBeast->ZLB_History ); i++ ) + { + zlog_HistoryInitialize( &zlogBeast->ZLB_History[i] ); + } +// for ( i = 0; i < NELEMS( zlogBeast->ZLB_P.ZLBP_Count ); i++ ) +// { +// ZLOG_COUNT_INIT( &zlogBeast->ZLB_P.ZLBP_Count[i], 0 ); +// } + + bzero( &zlogBeast->ZLB_P.ZLBP_StatisticsRead, + sizeof( zlogBeast->ZLB_P.ZLBP_StatisticsRead ) ); + bzero( &zlogBeast->ZLB_P.ZLBP_StatisticsWrite, + sizeof( zlogBeast->ZLB_P.ZLBP_StatisticsWrite ) ); + zlogBeast->ZLB_P.ZLBP_StatisticsResetUTCTime = (LONG)GetUTCTime(); + +} /* End of zlog_initAllPersistentStatistics() */ + + + +/* + * + * zlog_CalculateChecksum() - Calculate a checksum on a buffer of LONGs. 
+ * + * Adds up numberOfLONGs LONG values starting at buffer and returns that + * sum plus one, so that a zeroed block does not produce a checksum of 0. + * + */ + + +LONG zlog_CalculateChecksum( LONG *buffer, NINT numberOfLONGs ) +{ + LONG checksum; + +#if ZLOG_CHECKSUM_LOG_RECORDS IS_ENABLED +/* FixFixFix(Performance) - look at several BYTEs every 512 bytes (media change or maybe if mismatch then do old checksum method?) */ +#endif + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, zlog_CalculateChecksum); + checksum = 0; + for ( ; numberOfLONGs != 0 ; --numberOfLONGs ) + { + checksum += *buffer; + ++buffer; + } + /* We do not want a checksum of a zeroed block to be 0 */ + RTN_LONG( checksum + 1 ); + +} /* End of zlog_CalculateChecksum */ + + +/************************************************************************** + * This is called when someone wishes to take an asynchronous checkpoint. + * + * ZLOG beastLatch exclusively latched on entry. + * (NOTE(review): the ASSERT_XLATCH check in the body is commented out, + * so this precondition is not enforced here - confirm with callers.) + * + * Resets the blocks-filled-in-since-checkpoint counter, and, unless a + * checkpoint work item is already scheduled, records 'state' in + * ZLB_WorkToDoParameter1 and schedules zlog_CheckpointTakeWorkToDoRoutine. + * + * Asynchronous and non-blocking + * No notification of completion + ***************************************************************************/ +void zlog_CheckpointTakeSchedule( + ZlogBeast_s *zlogBeast, + NINT state ) + +{ + + ASSERT_MPKNSS_LOCK(); +// ZfsPool_s *zfsPool; + + ENTER(TZLOG, zlog_CheckpointTakeSchedule); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL ); +// ASSERT_XLATCH( &zlogBeast->ZFSLOGbeastLatch ); + + DEBUG_PRINTF(TZLOG,DBG_INDENT,(CYAN,MSGNot("Scheduling a checkpoint take\n"))); + + + /* + * Reset counter - we do here because this checkpoint code + * is async and may take a while before it resets the count. In + * the mean time we do not wish to execute this code again. + */ + zlogBeast->ZLB_LogBlockFilledInSinceCheckpointCount = 0; + /* Tell Our checkpointing thread to do its thing */ + /* See if a checkpoint is already scheduled */ + if ( zlogBeast->ZLB_CheckpointTakeWorkToDoScheduled ) + { /* Already scheduled so just return. 
ScheduleWork() does + not handle multiple schedules of a work to do item. */ + + DEBUG_PRINTF(TZLOG,DBG_INDENT, (CYAN, + MSGNot("Asynchronous checkpoint already scheduled\n"))); + RTN_VOID(); + } + /* + * Mark that we are scheduling our work to do. We use this flag + * to prevent multiple schedules and to prevent unloading of the + * beast with a work-to-do scheduled (see Zlog_Destruct()). + */ + zlogBeast->ZLB_CheckpointTakeWorkToDoScheduled = TRUE; + + zlogBeast->ZLB_WorkToDoParameter1 = state; +#if NSS_DEBUG IS_ENABLED + zlogBeast->ZLB_CheckpointTakeWorkToDoTiming = GetUTCTime(); +#endif + /* Schedule a thread to do the work later (so we will not block) */ +#if ZLOG_USE_SYSTEMS_WORK_TO_DO + ScheduleWork( &zlogBeast->ZLB_CheckpointTakeWorkToDoStructure ); +#else + WORK_Schedule( &zlogBeast->ZLB_CheckpointTakeWorkToDoFsm, + zlog_CheckpointTakeWorkToDoRoutine, 0); +#endif + RTN_VOID(); + +} /* End of zlog_CheckpointTakeSchedule() */ + + +/************************************************************************** + * This is scheduled as a work to do and actually writes the checkpoint + * + * Synchronous and blocking + * + * NOTE + * This code assumes that the zWorkProc_s is not needed by the system + * when this function returns. Which is what I was told Netware does. This + * assumption comes about because this routine re-sets the checkpoint + * work to do flag. In a pre-emptive system, after we release the ZLOG + * beast latch we could schedule another work to do item with the same + * zWorkProc_s item. We don't care because this routine is done using + * the zWorkProc_ item, but the system better also be done. 
+ ***************************************************************************/ +/* Work item body: takes the MPKNSS lock, locates the ZlogBeast_s that owns the work item, calls ZFSPOOL_CheckpointTake() with the state saved in ZLB_WorkToDoParameter1 (its result is ignored), clears ZLB_CheckpointTakeWorkToDoScheduled and returns with the lock released. */ void zlog_CheckpointTakeWorkToDoRoutine( +#if ZLOG_USE_SYSTEMS_WORK_TO_DO + zWorkProc_s *work) +#else + FsmLite_s *workToDoFsm ) +#endif +{ + + GeneralMsg_s genMsg; + ZlogBeast_s *zlogBeast; + + WORK_PROCESS_INIT(); + MPKNSS_LOCK(); + ENTER(TZLOG, zlog_CheckpointTakeWorkToDoRoutine); + + COMN_SETUP_GENERAL_MSG_NOSA( &genMsg ); + DEBUG_PRINTF(TZLOG,DBG_INDENT,(CYAN,MSGNot("CheckpointTaking...."))); +#if ZLOG_USE_SYSTEMS_WORK_TO_DO + zlogBeast = work->info; +#else + zlogBeast = STRUCT( workToDoFsm, ZlogBeast_s, + ZLB_CheckpointTakeWorkToDoFsm ); +#endif + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL ); +#if NSS_DEBUG IS_ENABLED + { /* Debug only: report when this work item ran 3 or more seconds after it was scheduled. */ + Time_t cTime; + + cTime = GetUTCTime(); + + if ( (zlogBeast->ZLB_CheckpointTakeWorkToDoTiming + 3) <= cTime ) + { + DBG_DebugPrintf( TZLOG_COLOR, + MSGNot("\"%U\" Checkpoint schedule delayed for %lu seconds(This is BAD).\n"), + zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLroot.name, + (unsigned long)cTime - + (unsigned long)zlogBeast->ZLB_CheckpointTakeWorkToDoTiming ); + aprintf(CYAN, + MSGNot("\"%U\" Checkpoint schedule delayed for %lu seconds(This is BAD).\n"), + zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLroot.name, + (unsigned long)cTime - + (unsigned long)zlogBeast->ZLB_CheckpointTakeWorkToDoTiming ); + } + } +#endif + (void)ZFSPOOL_CheckpointTake( &genMsg, + zlogBeast->ZFSLOGroot.vol.zfsVol->pool, + zlogBeast->ZLB_WorkToDoParameter1 ); + /* + * We ignore errors above based on read/write design within + * NSS. A low level routine has started de-activation or read + * only mode. + */ + DEBUG_PRINTF(TZLOG,DBG_INDENT, (CYAN,MSGNot("... Done"))); + + /* Mark that we are 'almost' done running. 
*/ + zlogBeast->ZLB_CheckpointTakeWorkToDoScheduled = FALSE; + + + /**********************************************************/ + /*** No more using 'work', because we said we are done! ***/ + /**********************************************************/ + RTN_MPKUNLOCK_VOID(); +} /* End of zlog_CheckpointTakeWorkToDoRoutine() */ + + +/* + * This is common code between 'open' and 'create'. It is called after + * the persistent part of the ZLOG beast has been correctly initialized. + * + * Derives the log sizing values (number of log blocks, throttle + * thresholds, blocks filled in before a checkpoint), copies the saved + * read/write statistics into the ZLSS pool when no later statistics + * reset command is recorded on the pool, and stamps the running ZSTORE + * version numbers into the persistent area. + * + * Returns zOK, or zERR_ZLOG_FILE_TOO_SMALL (also stored with SetErrno) + * when the log file eof is below ZLOG_FILE_SIZE_MINIMUM. + * + * ZLOG Beast latch owned + */ + + +STATUS zlog_CommonCreateAndOpen( + GeneralMsg_s *genMsg, + ZlogBeast_s *zlogBeast ) + +{ + ZfsPool_s *zfsPool; + ZlssPool_s *zlssPool; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, zlog_CommonCreateAndOpen); + /* + * We assume at least 4096 byte blocks. I.E. we document that + * we can log ZLOG_MAXIMUM_RECORD_SIZE which is about 4096 bytes. + */ + zASSERT( zlogBeast->ZFSLOGblkSizeShift >= 12 ); + + zfsPool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool; + zASSERT( zfsPool != NULL ); + zASSERT( COMN_IsDerivedFrom(zfsPool, zFTYPE_ZLSS_ZFSPOOL) ); + + zlssPool = ZFS_POOL_TO_ZLSS_POOL( zfsPool ); + + zASSERT( zlssPool != NULL ); + zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) ); + + /* + * Install is supposed to create with at least 1024 original blocks. + * This is a requirement of the ZLOG specification. + */ + WARN( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + /* */ + if ( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ) + { /* + * Calculate number of log blocks (originals and duplicates) in + * the pool log file. 
+ */ + zlogBeast->ZLB_NumberOfLogBlocks = zlogBeast->ZFSLOGroot.eof / + (1 << zlogBeast->ZFSLOGblkSizeShift); + } + else + { /* The file is just too small for logging to occur successfully */ + zASSERT("Please get Greg (or Vandana)"==NULL); + SetErrno( genMsg, zERR_ZLOG_FILE_TOO_SMALL ); + RTN_STATUS(zERR_ZLOG_FILE_TOO_SMALL); + } + /* + * Calculate number of log blocks that must be in use before we + * throttle back and throttle full. This formula is + * specified by the ZLOG specification. + */ + zlogBeast->ZLB_P.ZLBP_FileThrottleFull = zlogBeast->ZLB_NumberOfLogBlocks * + zlogBeast->ZLB_P.ZLBP_FileThrottleFullPercent / 100u; + zlogBeast->ZLB_P.ZLBP_FileThrottleBack = zlogBeast->ZLB_NumberOfLogBlocks * + zlogBeast->ZLB_P.ZLBP_FileThrottleBackPercent / 100u; + /* + * Calculate number of log blocks that will be filled in prior to + * taking a new checkpoint. This formula is specified by the ZLOG + * specification. + */ + if ( zlogBeast->ZLB_NumberOfLogBlocks > + (ZLOG_CHECKPOINT_BLOCK_FILLED_IN_MAXIMUM * + ZLOG_CHECKPOINTS_BEFORE_WARP_MINIMUM( + zlogBeast->ZLB_P.ZLBP_FileThrottleBackPercent)) ) + { /* The most blocks before taking a checkpoint is 500 */ + zlogBeast->ZLB_LogBlockFilledInMaximum = + ZLOG_CHECKPOINT_BLOCK_FILLED_IN_MAXIMUM; + } + else + { /* We must take at least ZLOG_CHECKPOINTS_BEFORE_WARP_MINIMUM + * checkpoints before wrapping. + */ + zlogBeast->ZLB_LogBlockFilledInMaximum = + zlogBeast->ZLB_NumberOfLogBlocks / + ZLOG_CHECKPOINTS_BEFORE_WARP_MINIMUM( + zlogBeast->ZLB_P.ZLBP_FileThrottleBackPercent); + } + + /* + * Blast in the saved persistent counts if the user has not + * done a statistics RESET since last load. This logic + * allows the user to reset the statistics when the pool is + * non-active. 
+ */ + if ( zlssPool->ZP_StatisticsResetCommandUTCTime <= zlogBeast->ZLB_P.ZLBP_StatisticsResetUTCTime ) + { + memcpy( &zlssPool->ZP_PRS, &zlogBeast->ZLB_P.ZLBP_StatisticsRead, + sizeof( zlssPool->ZP_PRS ) ); + memcpy( &zlssPool->ZP_PWS, &zlogBeast->ZLB_P.ZLBP_StatisticsWrite, + sizeof( zlssPool->ZP_PWS ) ); + zlssPool->ZP_StatisticsResetUTCTime = zlogBeast->ZLB_P.ZLBP_StatisticsResetUTCTime; + } + + /* + * Update NSS version numbers here so that ZLOG contains + * the version of the running system versus the version + * of the system running when the pool was initialized. + */ + zlogBeast->ZLB_P.ZLBP_ZSTOREMajorVersion = ZSTORE_VersionInfo.majorVersion; + zlogBeast->ZLB_P.ZLBP_ZSTOREMinorVersion = ZSTORE_VersionInfo.minorVersion; + zlogBeast->ZLB_P.ZLBP_ZSTORESubVersion = ZSTORE_VersionInfo.subVersion; + zlogBeast->ZLB_P.ZLBP_ZSTOREBuildNumber = ZSTORE_VersionInfo.buildNumber; + RTN_STATUS(zOK); + +} /* End of zlog_CommonCreateAndOpen */ + +/**************************************************************************** + * ZFS log beast constructor + * + * When a beast is 'created' only its constructor gets called. When + * a beast is 'opened' the constructor is called then the unpack routine. + * This means that the constructor must initialize as though the + * unpack routine will not be called. Note it is common for the unpack + * routine to re-initialize non-persistent information. + * + * Execution Requirements/Assumptions + * Synchronous and blocking + * + ****************************************************************************/ +STATUS Zlog_Construct( + GeneralMsg_s *genMsg, + void *zlogBeast_LX) +{ + ZlogBeast_s *zlogBeast = (ZlogBeast_s *)zlogBeast_LX; + int i; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_Construct); + +#if ZLOG_DEBUG IS_ENABLED + gZlogBeast = zlogBeast; +#endif + + /* + * Latch to help find bugs. None of our other code should be + * getting called before the beast is created, but better safe + * than sorry. 
+ */ +// X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + + /*** Assert some basic stuff the code assumes ***/ + + /* Verify that PRE is 0 (otherwise next ASSERT test is invalid) */ + zASSERT( ZLOG_ZB_DS_CONSTRUCT_PRE == 0 ); + /* Verify no one calls twice */ + zASSERT( zlogBeast->ZLB_DebugState == ZLOG_ZB_DS_CONSTRUCT_PRE ); + /* Mark that we made it to the constructor */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_CONSTRUCT_START; + + /* These asserts are not in debug code because someone may change + one of these basic assumptions when ZLOG is not being debugged. */ + + /* Our LONGs mean exactly 4 bytes (bogus, but a Novell fact of life) */ + zASSERT( sizeof( LONG ) == 4 ); + /* Our QUADs mean exactly 8 bytes */ + zASSERT( sizeof( QUAD ) == 8 ); + /* Best to have ZlogBeastPersistent evenly divisible by 8 */ + zASSERT( sizeof( ZlogBeastPersistent_s ) == + ALIGN( sizeof( ZlogBeastPersistent_s ), 8 )); + /* Best to have ZlogBeast evenly divisible by 8 */ +// zASSERT( sizeof( ZlogBeast_s ) == ALIGN( sizeof( ZlogBeast_s ), 8 )); + /* Code assumes that ZLOGRecordHeader is evenly divisible by 8 */ + zASSERT( sizeof( ZLOGRecordHeader_s ) == + ALIGN( sizeof( ZLOGRecordHeader_s ), 8 )); + /* Code assumes that ZLOGBlockHeader is evenly divisible by 8 */ + zASSERT( sizeof( ZLOGBlockHeader_s ) == + ALIGN( sizeof( ZLOGBlockHeader_s ), 8 )); + /* + * Verify documented record size is still smaller than or equal + * to actual maximum record size. If this ASSERT appears then we + * need to change the documented record size and deal with the side + * effects of reducing the allowabe log record size on our other + * ZFS components. 
+ */ + zASSERT( ZLOG_MAXIMUM_RECORD_SIZE <= zlog_MAXIMUM_RECORD_SIZE ); + +#if ZLOG_TEST IS_ENABLED + if ( ZlogUnitTest ) + { + DEBUG_PRINTF(TZLOG,DBG_INDENT,(LGREEN,MSGNot("ZLOG - Documented Maximum Record Size %d\n"), + ZLOG_MAXIMUM_RECORD_SIZE)); + DEBUG_PRINTF(TZLOG,DBG_INDENT,(LGREEN,MSGNot("ZLOG - Actual Maximum Record Size %d\n"), + zlog_MAXIMUM_RECORD_SIZE)); + DEBUG_PRINTF(TZLOG,DBG_INDENT,(LGREEN,MSGNot("ZLOG - Block Header Size %d\n"), + sizeof(ZLOGBlockHeader_s))); + DEBUG_PRINTF(TZLOG,DBG_INDENT,(LGREEN,MSGNot("ZLOG - Record Header Size %d\n"), + sizeof(ZLOGRecordHeader_s))); + DEBUG_PRINTF(TZLOG,DBG_INDENT,(LGREEN,MSGNot("ZLOG - ZLOG Beast Size %d\n"), + sizeof(ZlogBeast_s))); + DEBUG_PRINTF(TZLOG,DBG_INDENT,(LGREEN,MSGNot("ZLOG - ZLOG Beast Persistent Size %d\n"), + sizeof(ZlogBeastPersistent_s))); + } +#endif + + /*** Initialize all non-persistent ZLOG beast items ***/ + + /* + * Flag that this beast can not be written to the beast Tree + */ + zlogBeast->ZFSLOGroot.bstState |= BST_STATE_DO_NOT_WRITE; + + /* + * Note that the beast is initialized to zero by the beast + * sub-system and therefore we just comment out items that need + * to be zeroed. + */ + + zlogBeast->ZLB_Signature = ZLOG_ZLB_S_SIGNATURE; + /* Start with no log blocks filled in since last checkpoint. */ +/* zlogBeast->ZLB_LogBlockFilledInSinceCheckpointCount = 0; */ +/* zlogBeast->ZLB_LogBlockFilledInCount = 0; */ +/* zlogBeast->ZLB_LogRecordFilledInCount = 0; */ + /* + * To simplify logic only ZLOG_ObtainRecord() ever reads a block. + * So we just need to NULL buffer to indicate no current block. + */ +/* zlogBeast->ZLB_Buffer = NULL; */ + zlogBeast->ZLB_RedoUndoStatus = zOK; + zlogBeast->ZLB_RedoUndoStatusSetter = WHERE; + zlogBeast->ZLB_ActiveHasBeenRead = FALSE; + /* + * Have DQ system initialize our ZfsXaction_s seniority DQ head. + * DQ of all active transactions order by oldest to youngest. 
Used to + * determine when the home pointer and home pointer LSN can be moved + * forward. Transactions get placed on list by ZLOG_ObtainRecord() and + * removed by ZLOG_TransactionHomed(). + */ + DQ_INIT( &zlogBeast->ZLB_SeniorityListHead ); + /* + * Have DQ system initialize our Buffer_s seniority DQ head. + * DQ of all not flushed buffers order by oldest to youngest. Used to + * determine when we can forward the signal to bound DATA buffers. + * Buffers get placed on list by ZLOG_ObtainRecord() and + * removed by Zlog_FlushBlockEndSignalHandler(). + */ + DQ_INIT( &zlogBeast->ZLB_CachedBufferListHead ); + /* + * Currently - initialing a latch zeros/nulls out some fields. It + * does not seem worth the possible trouble to comment out the + * init just in case it changes (I.E. Latching is not documented so + * who knows if it must be able to take a ZEROed latch). + */ + INIT_LATCH( &zlogBeast->ZLB_RecoveryLatch ); + INIT_LATCH( &zlogBeast->ZLB_FileFullLatch ); + + zlog_HistogramConstruct( &zlogBeast->ZLB_FunctionHistogram, + NELEMS(zlogBeast->ZLB_P.ZLBP_FunctionBucket), + ZLOG_FB_BUCKET_SIZE /* Bucket size */, + ZLOG_FB_BUCKET_TYPE, + &zlogBeast->ZLB_P.ZLBP_FunctionHistogramPersistent, + &zlogBeast->ZLB_P.ZLBP_FunctionBucket ); + + zlog_HistogramConstruct( &zlogBeast->ZLB_SizeHistogram, + NELEMS(zlogBeast->ZLB_P.ZLBP_SizeBucket), + ZLOG_SB_BUCKET_SIZE /* Bucket size */, + ZLOG_SB_BUCKET_TYPE, + &zlogBeast->ZLB_P.ZLBP_SizeHistogramPersistent, + &zlogBeast->ZLB_P.ZLBP_SizeBucket ); + + zlog_HistogramConstruct( &zlogBeast->ZLB_ReferenceBlockCountHistogram, + NELEMS(zlogBeast->ZLB_P.ZLBP_ReferenceBlockCountBucket), + ZLOG_RBCB_BUCKET_SIZE /* Bucket size */, + ZLOG_RBCB_BUCKET_TYPE, + &zlogBeast->ZLB_P.ZLBP_ReferenceBlockCountHistogramPersistent, + &zlogBeast->ZLB_P.ZLBP_ReferenceBlockCountBucket ); + zlog_HistoryConstruct( &zlogBeast->ZLB_ReferenceBlockCountHistory, + NELEMS(zlogBeast->ZLB_P.ZLBP_ReferenceBlockCountHistorySlots), + 
zlogBeast->ZLB_P.ZLBP_ReferenceBlockCountHistorySlots ); + + zlog_HistogramConstruct( &zlogBeast->ZLB_BlockInuseCountHistogram, + NELEMS(zlogBeast->ZLB_P.ZLBP_BlockInuseCountBucket), + ZLOG_BICB_BUCKET_SIZE /* Bucket size */, + ZLOG_BICB_BUCKET_TYPE, + &zlogBeast->ZLB_P.ZLBP_BlockInuseCountHistogramPersistent, + &zlogBeast->ZLB_P.ZLBP_BlockInuseCountBucket ); + zlog_HistoryConstruct( &zlogBeast->ZLB_BlockInuseCountHistory, + NELEMS(zlogBeast->ZLB_P.ZLBP_BlockInuseCountHistorySlots), + zlogBeast->ZLB_P.ZLBP_BlockInuseCountHistorySlots ); + + zlog_HistogramConstruct( &zlogBeast->ZLB_RecoveryTimeHistogram, + NELEMS(zlogBeast->ZLB_P.ZLBP_RecoveryTimeBucket), + ZLOG_RTB_BUCKET_SIZE /* Bucket size */, + ZLOG_RTB_BUCKET_TYPE, + &zlogBeast->ZLB_P.ZLBP_RecoveryTimeHistogramPersistent, + &zlogBeast->ZLB_P.ZLBP_RecoveryTimeBucket ); + zlog_HistoryConstruct( &zlogBeast->ZLB_RecoveryTimeHistory, + NELEMS(zlogBeast->ZLB_P.ZLBP_RecoveryTimeHistorySlots), + zlogBeast->ZLB_P.ZLBP_RecoveryTimeHistorySlots ); + + zlog_HistogramConstruct( &zlogBeast->ZLB_DeferredWritesHistogram, + NELEMS(zlogBeast->ZLB_P.ZLBP_DeferredWritesBucket), + ZLOG_DWB_BUCKET_SIZE /* Bucket size */, + ZLOG_DWB_BUCKET_TYPE, + &zlogBeast->ZLB_P.ZLBP_DeferredWritesHistogramPersistent, + &zlogBeast->ZLB_P.ZLBP_DeferredWritesBucket ); + zlog_HistoryConstruct( &zlogBeast->ZLB_DeferredWritesHistory, + NELEMS(zlogBeast->ZLB_P.ZLBP_DeferredWritesHistorySlots), + zlogBeast->ZLB_P.ZLBP_DeferredWritesHistorySlots ); + + for ( i = 0; i < NELEMS( zlogBeast->ZLB_History ); ++i ) + { + zlog_HistoryConstruct( &zlogBeast->ZLB_History[i], + ZLOG_HISTORY_SLOTS, + &zlogBeast->ZLB_P.ZLBP_HistorySlots[i*ZLOG_HISTORY_SLOTS] ); + } + +#if ZLOG_MODE_DEFAULT != 0 + zlogBeast->ZLB_P.ZLBP_Mode == ZLOG_MODE_DEFAULT; +#endif +// zlogBeast->ZLB_FileFullWaiters = 0; +// zlogBeast->ZLB_CheckpointTakeWorkToDoScheduled = FALSE; +#if ZLOG_ZB_S_DEFAULT_STATE != 0 + zlogBeast->ZLB_State = ZLOG_ZB_S_DEFAULT_STATE; +#endif +#if 
ZLOG_USE_SYSTEMS_WORK_TO_DO + fillInWork( &zlogBeast->ZLB_CheckpointTakeWorkToDoStructure, + zlog_CheckpointTakeWorkToDoRoutine, zlogBeast); +#else + FSMLITE_INIT( &zlogBeast->ZLB_CheckpointTakeWorkToDoFsm /* Lite FSM */, + MSGNot("ZLOG Checkpoint Work-To-Do"), 0 /* Instance */ ); +#endif + SQ_INIT( &zlogBeast->ZLB_BarrierListHead ); + zlogBeast->ZLB_Barrier = NULL; + zlogBeast->ZLB_BarrierWorkToDoScheduled = FALSE; + FSMLITE_INIT( &zlogBeast->ZLB_BarrierWorkToDoFsm, + MSGNot("ZLSS Barrier Work-To-Do"), 0 /* Instance */ ); + + /* Start off writing to checkpoint #0 all the time */ +/* zlogBeast->ZLB_NextCheckpoint = 0; */ + INIT_ONESHOT( zlogBeast->ZLB_CheckpointTakeWorkToDoTimer ); + /* Mark that we made it successfully through the constructor */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_CONSTRUCT_END; + +// UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + +#if LOG_TEST IS_ENABLED + for (i = 0; i < LOG_TEST_MAX; ++i) + { + zlogBeast->ZLB_LT[i].LT_Copy = + zalloc(1 << zlogBeast->ZFSLOGblkSizeShift ); + zASSERT( zlogBeast->ZLB_LT[i].LT_Copy != NULL ); + } +#endif + RTN_STATUS(zOK); + +} /* End of ZlogConstruct */ + +/**************************************************************************** + * ZFS log beast destructor + * + * + * Specific Requirements/Assumptions + * Only called if ZlogConstruct returned zOK. + * + * Execution Requirements/Assumptions + * Synchronous and blocking + *****************************************************************************/ + +void Zlog_Destruct( + void *zlogBeast_LX ) +{ + ZlogBeast_s *zlogBeast = (ZlogBeast_s *)zlogBeast_LX; + NINT count = 0; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_Destruct); + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_DESTRUCT_START; + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + + /* I placed this code in because DESTRUCT was being called + * with items on the list. 
I have since made changes to + * pool deactivate which also does the same code. See + * pool deactivate for details. The bottom line is that + * this code should not be needed anymore, but it does + * no harm so I left it in. + */ + while ( (!(DQ_EMPTY( &zlogBeast->ZLB_SeniorityListHead )) ) && + ( count < ((15 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY)) ) + { +#if NSS_ASSERT IS_ENABLED + aprintf(LRED,MSGNot("ZLOG delay for senior list\n")); +#endif + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + LB_delay( ZLOG_WORK_CHECKPOINT_WAIT_DELAY ); /* 20 millisecs */ + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + ++count; /* Not in ASSERT macro just in case of side-effects */ + } + /* If we have been doing this for 15 secs then ASSERT */ + zASSERT( count < ((15 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY) ); + + /* + * Can not have next ASSERT because if we get an error while + * creating a pool the signature is not filled in. The above + * ASSERT will have to be good enough. + */ +// zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + /* Cancel checkpoint timer (this works even if not scheduled) */ + CANCEL_ALARM( zlogBeast->ZLB_CheckpointTakeWorkToDoTimer ); + /* We can still have a work-to-do scheduled. If so wait + * for it to complete before we allow the ZLOG beast to + * disappear on the work-to-do. + */ + WARN( !zlogBeast->ZLB_CheckpointTakeWorkToDoScheduled ); + WARN( !zlogBeast->ZLB_BarrierWorkToDoScheduled ); + while ( zlogBeast->ZLB_CheckpointTakeWorkToDoScheduled || + zlogBeast->ZLB_BarrierWorkToDoScheduled ) + { /* + * Delay to give Work-to-do time to run. We do not do a yield + * because no documentation on work-to-do priority and no info + * on if yield lets 'lower' (less important) threads run. We + * could add cancel code, but this code will work and cancel + * could return a error anyway. We unlatch because the work-to-do + * will need to obtain to do a checkpoint. 
+ */ + + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + LB_delay( ZLOG_WORK_CHECKPOINT_WAIT_DELAY ); /* 20 millisecs */ + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + +#if NSS_ASSERT IS_ENABLED + ++count; /* Not in ASSERT macro just in case of side-effects */ +#endif + /* If we have been doing this for 15 secs then ASSERT */ + zASSERT( count < ((15 * 1000)/ZLOG_WORK_CHECKPOINT_WAIT_DELAY) ); + } + + /* + * If we ever see this ASSERT then we can .. + * LB_delay() until the work to do runs and/or + * cancel the work to do. + */ + + /* Should only be called if our constructor returned zOK */ + zASSERT( zlogBeast->ZLB_DebugState >= ZLOG_ZB_DS_CONSTRUCT_END); + /* Ensure that our last buffer has been flushed. And that we have not + started logging more transactions since then. */ + WARN( zlogBeast->ZLB_Buffer == NULL ); +#if NSS_ASSERT IS_ENABLED + count = 0; +#endif + while ( zlogBeast->ZLB_Buffer != NULL ) + { /* + * Delay to give buffer time to flush. We unlatch because the + * flush handler will need to obtain to complete the flush. + */ + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + LB_delay( ZLOG_WORK_FLUSH_WAIT_DELAY ); /* 100 millisecs */ + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); +#if NSS_ASSERT IS_ENABLED + ++count; /* Not in ASSERT macro just in case of side-effects */ +#endif + /* If we have been doing this for 15 secs then ASSERT */ + zASSERT( count < ((20 * 1000)/ZLOG_WORK_FLUSH_WAIT_DELAY) ); + } + /* + * Verify that both DQ's are empty. + * If ZLB_SeniorityListHead asserts then some transactions + * never got homed? + * If ZLB_CachedBufferListHead asserts then some ZLOG buffers + * got tossed instead of flushed? 
+ */ + zASSERT( DQ_EMPTY( &zlogBeast->ZLB_SeniorityListHead ) ); + zASSERT( DQ_EMPTY( &zlogBeast->ZLB_CachedBufferListHead ) ); +#if LOG_TEST IS_ENABLED + { + int i; + + for (i = 0; i < LOG_TEST_MAX; ++i) + { + zASSERT( zlogBeast->ZLB_LT[i].LT_Copy != NULL ); + free( zlogBeast->ZLB_LT[i].LT_Copy ); + } + } +#endif + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_DESTRUCT_END; + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + RTN_VOID(); +} /* End of ZlogDestruct */ + + +/************************************************************************** + * Called as fast work to do thread when a ZLOG buffer has been written + * successfully to the media. + * + * Any errors are located in agent->status. We ignore all because + * the NSS read/write design says that if the pool gets ONE write + * error no other writes will be allowed. + * + ***************************************************************************/ +void Zlog_FlushBlockDoneSignalHandler( + Agent_s *agent) + +{ + Buffer_s *oldestBuffer; + Buffer_s *flushedBuffer; + ZlogBeast_s *zlogBeast; + NINT count; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_FlushBlockDoneSignalHandler); + flushedBuffer = STRUCT( agent,Buffer_s,agent ); + zlogBeast = STRUCT(flushedBuffer->pBuf.mycache, ZlogBeast_s, ZFSLOGmycache); + + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + ASSERT_SLATCH( &flushedBuffer->agent.latch ); + + /* Now remove the oldest buffer as long as it has been flushed. 
*/ + count = 0; + for ( ;; ) + { + + DQ_PEEK( &zlogBeast->ZLB_CachedBufferListHead, oldestBuffer, + Buffer_s, signalLink ); + if ( (oldestBuffer == NULL) || + (oldestBuffer->state & CACHE_DIRTY ) ) + { /* No more buffers or oldest buffer not yet flushed */ + ZLOG_HISTOGRAM_EVENT( &zlogBeast->ZLB_DeferredWritesHistogram, + (QUAD)count ); + ZLOG_HISTORY_EVENT( &zlogBeast->ZLB_DeferredWritesHistory, + (QUAD)count ); + + RTN_VOID(); + } + ++count; + /* Remove from list */ + DQ_RMV( oldestBuffer, signalLink ); +#if ZLOG_DEBUG IS_ENABLED + { + ZLOGBlockHeader_s *logBlockHeader; + + logBlockHeader = (ZLOGBlockHeader_s *)oldestBuffer->pBuf.data; + zASSERT( logBlockHeader->ZLBH_Signature == ZLBH_S_SIGNATURE ); + /* Assert if this is the BLOCK someone wishes to be + * told when written. This can be used to crash + * the machine where a ZLOG record is written and the + * meta data block not written. + */ + zASSERT( (logBlockHeader->ZLBH_FileBlockNumber != gZlogAssertBlock) + || (gZlogAssertBlock == 0xfffffffful) ); + } +#endif + ASSERT_SLATCH( &oldestBuffer->agent.latch ); + /* Pass the signal through */ + defaultSignal( &oldestBuffer->agent ); + /* The above routine unlatched our shared latch on the buffer */ + /* + * We release with push because we will not need because we only + * do CACHE_WRITE which means we do not read the current data + * of the file. + */ + CACHE_SIGNAL_RELEASE_PUSH( oldestBuffer ); + } +} /* End of Zlog_FlushBlockDoneSignalHandler() */ + +extern NINT GCL_ZLSSJournal; + +/************************************************************************** + * Called as FSM when a ZLOG buffer needs to be flushed to the media. 
+ ***************************************************************************/ +void Zlog_FlushBlockStartSignalHandler( + Agent_s *agent) +{ + ZlogBeast_s *zlogBeast; + Buffer_s *buffer; + ZLOGBlockHeader_s *logBlockHeader; +#if ZLOG_DEBUG IS_ENABLED + LONG *unusedArea; +#endif + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_FlushBlockStartSignalHandler); + + buffer = STRUCT(agent,Buffer_s,agent); + zlogBeast = STRUCT(buffer->pBuf.mycache, ZlogBeast_s, ZFSLOGmycache); + + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + ASSERT_SLATCH( &buffer->agent.latch ); + + + /* Are we flushing the current active ZLOG buffer? */ + if ( buffer == zlogBeast->ZLB_Buffer ) + { /* Yes - then we need to zap ZLOG's buffer pointer */ + zlogBeast->ZLB_Buffer = NULL; + } + + + /*** Fill in last of Block Header and then write log block ***/ + + /* + * FixFixFix6(Future,If reading ZLOG file supported) + * Technically we need to have an exclusive latch on buffer + * when updating it. We do not do because we have no readers + * of the ZLOG file while ZLOG is running. If this changes then + * we will need to up the latch (with correct FSM coding). + * + * UP_LATCH( &agent->latch ); + */ + + logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data; + zASSERT( logBlockHeader->ZLBH_Signature == ZLBH_S_SIGNATURE ); + /* Turn off active bit - checksum is now correct (when latch released)*/ + logBlockHeader->ZLBH_Status &= ~ZLBH_S_CHECKSUM; + +#if ZLOG_DEBUG IS_ENABLED + /* Debug. Zero all of the unused longs */ + if ( logBlockHeader->ZLBH_NumberOfUnusedLongs != 0 ) + { /* + * My doc does not say what memset() does if length is zero + * so best not to find out. 
+ */ + unusedArea = ((LONG *)logBlockHeader) + + (((1 << zlogBeast->ZFSLOGblkSizeShift)/4) - + logBlockHeader->ZLBH_NumberOfUnusedLongs); + bzero( unusedArea, + ( (size_t)logBlockHeader->ZLBH_NumberOfUnusedLongs) * + ( sizeof(LONG)/sizeof( unsigned char) ) ); + } +#endif + logBlockHeader->ZLBH_RebuildCount = zlogBeast->ZFSLOGrebuildCount; + logBlockHeader->ZLBH_Reserved = 0; + logBlockHeader->ZLBH_TimeEnd = GetUTCTime(); + /* Calculate checksum on Log Block Header only */ + logBlockHeader->ZLBH_Checksum = 0; + /* The checksum we store is the value that we make a good + block's checksum be zero. */ + logBlockHeader->ZLBH_Checksum = 0 - zlog_CalculateChecksum( + (LONG *)logBlockHeader, + ((1 << zlogBeast->ZFSLOGblkSizeShift)/4) - + logBlockHeader->ZLBH_NumberOfUnusedLongs ); + /* + * FixFixFix6(Future,If reading ZLOG file supported) + * + * DOWN_LATCH( &agent->latch ); + */ + + if ( GCL_ZLSSJournal == 8615960 ) + { /* This is for performance testing. Added for Ben so + * that we can test how much ZLOG writes slow down + * our performance. If the above global is set to + * the magic number then we will SKIP all + * the ZLOG buffer writes. This is not for the customer + * to use as if we crash we will have a corrupt volume. + */ + cachePrepareToFlush(buffer); + CACHE_CLEAN(buffer); + Zlog_FlushBlockDoneSignalHandler(agent); + RTN_VOID(); + } + + ZFS_DoBlockWriteFromSignal( agent, &Zlog_FlushBlockDoneSignalHandler /*,0 */); + RTN_VOID(); + +} /* End of Zlog_FlushBlockStartSignalHandler */ + + +/* + * ZIO_GatherDetailedSummaryInformationBarrierIOs - + * Added March 1, 2007 to track Barrier I/Os. 
+ * + */ +void ZIO_GatherDetailedSummaryInformationBarrierIOs( + struct zConPool_s *phyPool ) +{ + ZlssPool_s *zlssPool; + + if ( phyPool->pol.zfspool == NULL ) { + return; + } + zlssPool = ZFS_POOL_TO_ZLSS_POOL( phyPool->pol.zfspool ); + if ( zlssPool == NULL ) { + return; + } + zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) ); + if ( ZLSS_POOL_IO_TRACK( zlssPool ) ) + { + ++gZLSSPWS.PWS_BarrierIO; + ++zlssPool->ZP_PWS.PWS_BarrierIO; + } + return; +} + + +ObjCache_s ZlssBarrierObjCache; + /* Object Cache used to allocate ZlssBarrier_s objects. These + are the barrier agent primary object. */ +BOOL ZlssBarrierObjCacheInited = FALSE; +/*int ZLSSBarrierWaitSecs = 1;*/ + // TODO(Perf): allow less than a second + /* Number of seconds before a group write should occur + on journal blocks. We get better performance by grouping + journal block writes, but increasing this global causes + more metadata changes to be 'lost' after a crash. */ + + +void ZlssBarrierSignalRelease(ZlssBarrier_s *barrier); +void Zlog_BarrierFlushBlockStartSignalHandler( Agent_s *agent ); + +/* + ZLSSDoBarrierWriteIfRequired() - + Flush the drives write back cache if the drive supports it AND the + user has not overridden this feature. + + Some I/O within ZLSS is order dependent. ZLSS assumes that when a + write callback is called that the write has occured (or will even if the + power goes out). Many low end devices with write back caches do not ensure + this. On these devices we must flush the cache. + + Notes - + Can block and release NSS spinlock. 
+ */ +void ZLSSDoBarrierWriteIfRequired( struct zConPool_s *phyPool ) +{ + if ( phyPool == NULL ) + { + return; + } + if ( phyPool->ZCP_BarrierWritesRequired ) + { + int flushRet; + + if ( gZLSSGatherDetailedIOInformation ) + { + ZIO_GatherDetailedSummaryInformationBarrierIOs( phyPool ); + } +#if defined(NSS_BLOCK_IO) + MPKNSS_UNLOCK(); + flushRet = blkdev_issue_flush( phyPool->ZCP_dev, NULL); + if (flushRet == -EOPNOTSUPP) + { + phyPool->ZCP_BarrierWritesRequired = FALSE; + } + MPKNSS_LOCK(); +#else + flushRet = -EOPNOTSUPP; + phyPool->ZCP_BarrierWritesRequired = FALSE; + (void)flushRet; +#endif + } + return; +} + + +/* + ZlssBarrier_s_InitOnce() - + Iniitializes items in a ZlssBarrier_s that only need to be + initialized once. +*/ +void ZlssBarrier_s_InitOnce(ZlssBarrier_s *barrier) +{ + initAgent(&barrier->ZB_Agent, + Zlog_BarrierFlushBlockStartSignalHandler, "ZLOG Barrier"); + barrier->ZB_Signature = ZLSS_BARRIER_SIGNATURE; + NULLIFY(&barrier->ZB_BarrierLink); + return; +} + + +/* + ZLSS_BarrierStart() - + Initialize resources used by ZLSS barrier write (device flush) system. + Called via ZSTORE_GlobalStartup when ZLSS is loaded. + +*/ +STATUS ZLSS_BarrierStart( GeneralMsg_s *genMsg ) +{ + STATUS retCode; + + retCode = objCacheCreate(&ZlssBarrierObjCache, "ZLSS Barriers", + sizeof(ZlssBarrier_s), ZlssBarrier_s_InitOnce); + if ( retCode != zOK ) + { + SetErrno( genMsg, retCode ); + return(zFAILURE); + } + ZlssBarrierObjCacheInited = TRUE; + return zOK; +} + + +/* + ZLSS_BarrierStop() - + Cleans up resources used by Barrier agent code. Must only be called + at ZLSS shutdown time. +*/ +void ZLSS_BarrierStop( ) +{ + if ( ZlssBarrierObjCacheInited ) { + objCacheDestroy(&ZlssBarrierObjCache); + ZlssBarrierObjCacheInited = FALSE; + } + return; +} + + +/* + zlog_BarrierWorkToDoRoutine() - + Work-to-do that does a barrier write for all barrier agents that + have been signaled (or are signaled before the thread completes). 
+ + Synchronous and blocking + */ +void zlog_BarrierWorkToDoRoutine( + FsmLite_s *workToDoFsm ) +{ + GeneralMsg_s genMsg; + ZlssBarrier_s *barrier; + ZlogBeast_s *zlogBeast; + ZfsPool_s *zfsPool; + zConPool_s *phyPool; + SQhead_t barrierHead; + int sets = 0; + int items; + +// WORK_PROCESS_INIT(); +// MPKNSS_LOCK(); + COMN_SETUP_GENERAL_MSG_NOSA( &genMsg ); +// printk("Barrier....\n"); + zlogBeast = STRUCT( workToDoFsm, ZlogBeast_s, ZLB_BarrierWorkToDoFsm ); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + zfsPool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool; + phyPool = zfsPool->storagepool->phypool; + + SQ_INIT( &barrierHead ); +MoreMayHaveBeenAddedDuringBarrierWrite: + /* Grab current signaled barriers because barrier write + can block and release NSS spinlock. Only writes + that are done prior to a barrier write are ensured + to be flush with a barrier write. */ + SQ_APPEND( &barrierHead, &zlogBeast->ZLB_BarrierListHead ); + SQ_DEQ( &barrierHead, barrier, ZlssBarrier_s, ZB_BarrierLink ); + if ( barrier != NULL ) { + items = 0; + /* We can do a single barrier write for all barriers + that have already been signaled:-) */ + ZLSSDoBarrierWriteIfRequired( phyPool ); + /* Above can block and release NSS spinlock */ + do { + defaultSignal(&barrier->ZB_Agent); + ZlssBarrierSignalRelease( barrier ); + SQ_DEQ(&barrierHead, barrier, ZlssBarrier_s, ZB_BarrierLink); + ++items; + } while ( barrier != NULL ); + ++sets; +// printk("<3> Barrier (%u,%u)\n",sets,items); + /* Any new barrier signals require a new barrier write:-( */ + goto MoreMayHaveBeenAddedDuringBarrierWrite; + } +// printk("<3>... Done\n"); + zlogBeast->ZLB_BarrierWorkToDoScheduled = FALSE; +// MPKNSS_UNLOCK(); + return; +} /* End of zlog_BarrierWorkToDoRoutine() */ + + +/* + This is called when someone wishes to take an asynchronous barrier. + + Asynchronous and non-blocking, no notification of completion. 
+ */ +static void zlog_BarrierSchedule( + ZlssBarrier_s *barrier ) +{ + ZlogBeast_s *zlogBeast; + ASSERT_MPKNSS_LOCK(); + + zlogBeast = barrier->ZB_Zlog; + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + SQ_ENQ( &zlogBeast->ZLB_BarrierListHead, barrier, ZB_BarrierLink ); + /* See if a barrier is already scheduled */ + if ( zlogBeast->ZLB_BarrierWorkToDoScheduled ) + { /* Already scheduled, thread will take us off + ZLB_BarrierListHead before terminating. */ +// printk("<2>Queued barrier %p\n",barrier); + return; + } + zlogBeast->ZLB_BarrierWorkToDoScheduled = TRUE; + /* Schedule a thread to do the work later (we must not block) */ + WORK_Schedule_HIGH( &zlogBeast->ZLB_BarrierWorkToDoFsm, //TODO(Perf,now): Should it be WORK_Schedule of WORK_Schedule_HIGH? + zlog_BarrierWorkToDoRoutine, 0 ); + return; + +} /* End of zlog_BarrierSchedule() */ + + +/* + ZFSPOOL_BarrierTimerPop() - + This is called when the TIMER pop occurs. It ensures that a Group + Write on journal blocks occur at least every ZLSSBarrierWaitSecs. + */ +void ZFSPOOL_BarrierTimerPop( + OneShot_s *alarm) +{ + Agent_s *agent = STRUCT(alarm, Agent_s, timer); + + defaultFlush(agent); + return; +} + + +static inline void ZLSSBarrierBind( ZlssBarrier_s *barrier, Buffer_s *buffer ) +{ + bind(&barrier->ZB_Agent, &buffer->agent); + return; +} + + +/* + ZlssBarrier_s_Construct() - + Returns a new X_LATCHed instance of a ZlssBarrier_s object. +*/ +ZlssBarrier_s *ZlssBarrier_s_Construct( ZlogBeast_s *zlogBeast ) +{ + ZlssBarrier_s *barrier; + + barrier = objCacheAlloc( &ZlssBarrierObjCache ); + INIT_X_LATCH(&barrier->ZB_Agent.latch); /* Init latch to be X_LATCHed */ + barrier->ZB_Zlog = zlogBeast; + /* No 'use count' as zlogBeast waits for all journal stuf in its + destructor. This is the same as pointer to zlogBeast in + ZfsXaction_s. 
*/ +// setOneShot( &barrier->ZB_Agent.timer, SEC2TICK(ZLSSBarrierWaitSecs), ZFSPOOL_BarrierTimerPop); + setOneShot( &barrier->ZB_Agent.timer, SEC2TICK(Config.sec.journalGroupWriteTime), ZFSPOOL_BarrierTimerPop); + return barrier; +} + + +/* + Can be called in timer pop so must not block. +*/ +void ZlssBarrier_s_Destruct( ZlssBarrier_s *barrier ) +{ + objCacheFree(barrier); + return; +} + + +/* + This routine MUST only be called from Signal Handlers. + */ +/* + Can be called in timer pop so must not block. +*/ +void ZlssBarrierSignalRelease(ZlssBarrier_s *barrier) +{ + if ( LATCH_FREE( &(barrier->ZB_Agent.latch)) ) + { + ZlssBarrier_s_Destruct( barrier ); + } + return; +} + + +/* + Zlog_BarrierFlushBlockStartSignalHandler() - + Flush handler for barrier write. The barrier agent sits between + the journal blocks and the metadata blocks. It gives us a nice place + to flush the journal blocks before homing any metadata blocks. It + also allows us to group write journal blocks connected with the barrier + by doing a defaultFlush on the barrier agent. + + This routine was added Feb 2007 to greatly reduce the number of + barriers NSS does on devices that caches writes, but lie about when + a write completes. + + Modeled after Zlog_FlushBlockStartSignalHandler + +*/ +void Zlog_BarrierFlushBlockStartSignalHandler( Agent_s *agent ) +{ + ZlogBeast_s *zlogBeast; + ZlssBarrier_s *barrier; + ZfsPool_s *zfsPool; + zConPool_s *phyPool; + + ASSERT_MPKNSS_LOCK(); + barrier = STRUCT(agent,ZlssBarrier_s,ZB_Agent); + zASSERT( barrier->ZB_Signature == ZLSS_BARRIER_SIGNATURE ); + zlogBeast = barrier->ZB_Zlog; + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + CANCEL_ALARM(barrier->ZB_Agent.timer); + /* Are we flushing the current active barrier? */ + if ( barrier == zlogBeast->ZLB_Barrier ) + { /* Yes - then we need to zap the pointer so a new barrier object + is created. 
We must not allow more binds to occur on our + barrier because they would not be ensured to be written by + our barrier write below. */ + zlogBeast->ZLB_Barrier = NULL; + } + + zfsPool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool; + phyPool = zfsPool->storagepool->phypool; + if ( !phyPool->ZCP_BarrierWritesRequired ) + { /* No Barrier - so just let the metadata blocks go now. Barrier + code is not useless in this case, defaultFlush of the barrier + implements group write of journal blocks. */ + defaultSignal(&barrier->ZB_Agent); + ZlssBarrierSignalRelease( barrier ); + return; + } +// printk("<2>Timer: Scheduling barrier write with barrier %p\n",barrier); + zlog_BarrierSchedule( barrier ); + return; +} /* End of Zlog_BarrierFlushBlockStartSignalHandler */ + + +NINT zlog_GetNewestCheckpointIndex( ZlogBeast_s *zlogBeast ) + +{ + + NINT checkpoint; + NINT newestCheckpoint; + + ASSERT_MPKNSS_LOCK(); + newestCheckpoint = 0; + for ( checkpoint = 1; checkpoint < CHECKPOINT_NUMBER; checkpoint++ ) + { + if ( zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[newestCheckpoint] < + zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[checkpoint] ) + { + newestCheckpoint = checkpoint; + } + } + return newestCheckpoint; +} /* End of zlog_GetNewestCheckpointIndex() */ + + +NINT zlog_GetOldestCheckpointIndex( ZlogBeast_s *zlogBeast ) + +{ + + NINT checkpoint; + NINT oldestCheckpoint; + + ASSERT_MPKNSS_LOCK(); + oldestCheckpoint = 0; + for ( checkpoint = 1; checkpoint < CHECKPOINT_NUMBER; checkpoint++ ) + { + if ( zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[oldestCheckpoint] > + zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[checkpoint] ) + { + oldestCheckpoint = checkpoint; + } + } + return oldestCheckpoint; +} /* End of zlog_GetOldestCheckpointIndex() */ + +/* + * + */ +void zlog_HistogramEvent(ZlogHistogram_s *histogram, QUAD event) +{ + + LONG bucket; + ZlogHistogramPersistent_s *p; + + ASSERT_MPKNSS_LOCK(); + p = histogram->ZH_P; + p->ZHP_CurrentEvent = event; + ++p->ZHP_EventCount; + bucket = 
event/histogram->ZH_BucketSize; + zASSERT( histogram->ZH_BucketCount > 0 ); + if ( bucket >= histogram->ZH_BucketCount ) + { + bucket = histogram->ZH_BucketCount - 1; + } + if ( histogram->ZH_Flags & ZH_F_LONG ) + { + ((LONG *)histogram->ZH_Bucket)[bucket] += 1; + } + else + { + if ( histogram->ZH_Flags & ZH_F_QUAD ) + { + ((QUAD *)histogram->ZH_Bucket)[bucket] += 1; + } + } + +} /* End of zlog_HistogramEvent() */ + + +//void zlog_HistogramEvent (ZlogHistogram_s *histogram, QUAD event) +//{ +// +// LONG bucket; +// unsigned int index; +// +// ASSERT_MPKNSS_LOCK(); +// histogram->ZH_CurrentEvent = event; +// ++histogram->ZH_EventCount; +// bucket = event/histogram->ZH_BucketSize; +// zASSERT( histogram->ZH_BucketCount > 0 ); +// if ( bucket >= histogram->ZH_BucketCount ) +// { +// bucket = histogram->ZH_BucketCount - 1; +// } +// if ( histogram->ZH_Flags & ZH_F_LONG ) +// { +// ((LONG *)histogram->ZH_Bucket)[bucket] += 1; +// } +// else +// { +// if ( histogram->ZH_Flags & ZH_F_QUAD ) +// { +// ((QUAD *)histogram->ZH_Bucket)[bucket] += 1; +// } +// } +// +// if ( (histogram->ZH_Flags & ZH_F_HIGH_WATERMARK) && +// (event >= histogram->ZH_Watermarks.ZHW_Bucket[0].ZHWB_Event) ) +// { +// /* Lowest to higest ordered list sorted by event value */ +// for ( index = 0; index < ZLOG_HWB_DEFAULT-1; index++ ) +// { +// if ( event < +// histogram->ZH_Watermarks.ZHW_Bucket[index+1].ZHWB_Event ) +// { +// break; +// } +// histogram->ZH_Watermarks.ZHW_Bucket[index] = +// histogram->ZH_Watermarks.ZHW_Bucket[index+1]; +// } +// histogram->ZH_Watermarks.ZHW_Bucket[index].ZHWB_Event = event; +// histogram->ZH_Watermarks.ZHW_Bucket[index].ZHWB_Time = GetUTCTime(); +// } +//} + + +void zlog_HistoryEvent(ZlogHistory_s *history, QUAD event) +{ + + unsigned int index; + ZlogHighWatermarkSlot_s *slots; + + ASSERT_MPKNSS_LOCK(); + zASSERT( history->ZH_WatermarkCount > 0 ); + zASSERT( history->ZH_WatermarkCount < 50 ); + slots = history->ZH_Slots; + if ( event >= slots[0].ZHWB_Event ) + { + 
/* Lowest to higest ordered list sorted by event value */ + for ( index = 0; index < history->ZH_WatermarkCount-1; index++ ) + { + if ( event < slots[index+1].ZHWB_Event ) + { + break; + } + slots[index].ZHWB_Event = slots[index+1].ZHWB_Event; + slots[index].ZHWB_Time = slots[index+1].ZHWB_Time; + } + slots[index].ZHWB_Event = event; + slots[index].ZHWB_Time = GetUTCTime(); + } +} /* End of zlog_HistoryEvent() */ + + +#if NSS_DEBUG IS_ENABLED + +void zlog_LogRecordDump( ZLOGRecordHeader_s *logRecordHeader, + char *headerMsg, NINT headerColor ) + +{ + + NINT i; + BlockInfo_s *blkInfo; + ZfsXasRecovery_s *logRec; + + ASSERT_MPKNSS_LOCK(); + if ( DBG_DebugFlag & (TZLOG | TZLOG2) ) \ + { + DEBUG_PRINTF(TZLOG, DBG_BOTH_NOINDENT, (headerColor, headerMsg) ); + DEBUG_PRINTF(TZLOG, DBG_BOTH_NOINDENT, (TZLOG_COLOR, + MSGNot("LSN:%lx%08lx Fun:%-2d Xid:%lx%08lx State:%04x Len:%-4d Time:%d Blks:%d\n"), + (unsigned long)(logRecordHeader->ZLRH_Zxr.ZXR_Lsn >> 32), + (unsigned long)(logRecordHeader->ZLRH_Zxr.ZXR_Lsn), + (unsigned int)(logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex), + (unsigned long)(logRecordHeader->ZLRH_Zxr.ZXR_LocalXid.restartCount), + (unsigned long)(logRecordHeader->ZLRH_Zxr.ZXR_LocalXid.unique), + (unsigned int)(logRecordHeader->ZLRH_Zxr.ZXR_TransactionState), + (unsigned int)(logRecordHeader->ZLRH_LongLength * 4), + (unsigned int)(logRecordHeader->ZLRH_EndTime - + logRecordHeader->ZLRH_StartTime), + (unsigned int)(logRecordHeader->ZLRH_Zxr.ZXR_PoolBlockCount) ) ) ; + logRec = &logRecordHeader->ZLRH_Zxr; + blkInfo = ZLOG_START_OF_POOL_BLOCKS(logRec); + for (i = 0; i < logRec->ZXR_PoolBlockCount; ++i) + { + DEBUG_PRINTF(TZLOG, DBG_BOTH_NOINDENT, (TZLOG_COLOR, + MSGNot(" Index:%d Blk:%08lx LState:%04x RState:%04x PLSN:%lx%08lx\n"), + i, + (unsigned long)blkInfo[i].blkNum, + (unsigned int)blkInfo[i].logState, + (unsigned int)blkInfo[i].replayState, + (unsigned long)(blkInfo[i].previousLsn >> 32), + (unsigned long)blkInfo[i].previousLsn ) ); + } + DEBUG_HEX_DUMP( 
TZLOG2, TZLOG2_COLOR, DHD_DM_BYTE | DHD_DM_ASCII, + (void *)(blkInfo + logRec->ZXR_PoolBlockCount), + (int)((logRecordHeader->ZLRH_LongLength * 4) - + sizeof( ZLOGRecordHeader_s ) - + ZLOG_BLOCK_INFO_SIZE( logRec->ZXR_PoolBlockCount)) ); + } + return; +} /* End of zlog_LogRecordDump() */ +#endif + + +#if LOG_TEST IS_ENABLED +NINT ZLOGDoLogAsserts = 1; /* Maximum number of asserts per data buffer */ + +STATUS zlog_LogTestCompare( + BYTE *oSrc1, + BYTE *oSrc2, + NINT len ) + +{ + + NINT mismatches, checked; + BYTE *src1, *src2; + + ASSERT_MPKNSS_LOCK(); + /* Just in case someone wants to look at original pointers */ + src1 = oSrc1; + src2 = oSrc2; + + mismatches = 0; + /* We have lots of code here so people can stick + breakpoints on different 'facts'. */ + for( checked = 0; checked < len; checked++, src1++, src2++ ) + { + if ( *src1 != *src2 ) + { + ++mismatches; + if ( mismatches <= ZLOGDoLogAsserts ) + { + zASSERT("Someone is not doing undo/redo correctly?" == NULL); + } + } + } + if ( mismatches == 0 ) + { + return( zOK ); + } + else + { + return( zFAILURE ); + } + +} + +#endif + +extern void cacheFlushMetadataBuffers(void); +LONG CacheFlushAtJournalPercent = 25; + +/* Thread Routine + * + * STATUS ZLOG_ObtainRecord(GeneralMsg_s *genMsg, NINT recordSize, + * RootBeast_s *rootBeast, ZfsXaction_s *zlogXaction) + * + * genMsg Contains thread specific error information. ZLOG will set the + * errno field when zOK is not returned. + * recordSize The number of bytes needed in the data portion of the pool + * log record. This number must be less than or equal to + * ZLOG_MAXIMUM_RECORD_SIZE. ZLOG will always allocate on + * QUAD boundaries. An ASSERT is done if the recordSize is + * bigger than ZLOG_MAXIMUM_RECORD_SIZE. + * rootBeast Beast that is being logged. Used to find the ZLOG Beast + * connected with the current volume and the beast class + * identifier so that it can be stored into the log record. 
+ * zlogXaction The transaction structure that this log record will be part of. + * + * This function obtains a log record for the transaction associated with the + * zlogXaction. The caller then fills in the log record. The caller then + * releases the log record with a call to ZLOG_ReleaseRecord(). + * + * The transaction's state must be correct when ZLOG_ObtainRecord() is called. + * + * The error return value zOK indicates the call succeeded and that the other + * return values can be used. All other returns indicate an error in obtaining + * a log record. + * + * ZLOG_ObtainRecord() returns the following information in the ZLOG beast. + * + * ZLB_CommitAgent Type of Agent_s. ZLOG's Commit Agent used by callers to + * bind with ZLOG. ZLOG will use it to signal that the Log + * Record has been flushed. This will be a define that + * points to the Agent_s that resides in the Buffer_s that + * contains the log record. + * ZLB_ZfsXasRecovery Pointer to a ZfsXasRecovery_s. See page 21 for a + * description of the ZfsXasRecovery_s structure. The + * caller's beast-specific information can be stored at + * zlogBeast->ZLB_ZfsXasRecovery+1. + * + * The ZLOG beast is pointed to by element zfsLogBeast within the ZfsPool_s + * structure. In addition, it is pointed to by element zfsLogBeast within + * the ZfsVolume_s structure. All beasts contain a pointer to the volume + * the beast resides on. Also, ZfsXaction_s has an element called ZX_zlogBeast + * that points to the ZLOG beast. This field is only valid after a successful + * return from ZLOG_ObtainRecord(). + * + * The caller must call ZLOG_ReleaseRecord() after each call to + * ZLOG_ObtainRecord(). The caller must not block the FSM between these calls + * because ZLOG is serialized per pool during this period. + * + * Function Implementation Details + * + * ZLOG will store homing information into the transaction structure whenever + * the transaction's state is XAS_XR_TS_START. 
/*
 * ZLOG_ObtainRecord
 *
 * Reserve room for one log record in the pool log on behalf of
 * 'zlogXaction' and hand it back through the ZLOG beast.
 *
 *	zlogXaction	Transaction the record belongs to.  Its ZX_zlogBeast
 *			field is read on entry to locate the ZLOG beast.
 *	recordSize	Requested number of data bytes.  It is rounded up with
 *			ALIGN( recordSize, sizeof(QUAD) ) and must not exceed
 *			zlog_MAXIMUM_RECORD_SIZE.
 *
 * Returns nothing.  The record is published by zlog_ORCommon(), which is
 * called on both successful exits.
 *
 * Latching: on the two abend exits (record too large, logging disallowed
 * by the pool repair flags) the function returns before any latch is
 * taken.  Otherwise ZFSLOGbeastLatch is X latched at zlog_ORTryAgain and is
 * still held when the function returns; the only UNX_LATCH calls on it are
 * in the file-full path, which loops back to zlog_ORTryAgain.
 *
 * NOTE(review): the older description that precedes this function still
 * documents a STATUS return value and genMsg/rootBeast parameters; the
 * signature below has neither.  genMsg is now a local dummy.
 */
void ZLOG_ObtainRecord(
	ZfsXaction_s *zlogXaction,
	NINT recordSize )
{
	ZlogBeast_s *zlogBeast;
	Buffer_s *buffer;
	ZLOGBlockHeader_s *logBlockHeader;
	NINT neededLongs;
//	IoMsg_s ioMsg;
	GeneralMsg_s genMsg;	/* This is a dummy genMsg */
	LONG journalCount;
	int i;

	ASSERT_MPKNSS_LOCK();
	ENTER(TZLOG, ZLOG_ObtainRecord);
	COMN_SETUP_GENERAL_MSG_NOSA( &genMsg );
	/* We round users request to a QUAD boundary */
	recordSize = ALIGN( recordSize, sizeof(QUAD) );
	/* Verify that record size can fit in a single log block */
	zASSERT( recordSize <= zlog_MAXIMUM_RECORD_SIZE );
	if (recordSize > zlog_MAXIMUM_RECORD_SIZE )
	{	/* Same check as the zASSERT, enforced in every build */
		ZOS_Abend(MSGNot("More information is being logged than can be put in a log record."));
		ZOS_Abend(MSGNot("Quit from the Debugger or your pool will be corrupted."));
		RTN_VOID();
	}
	/* Warn if record size greater than documented allowable size */
//	WARN( recordSize <= ZLOG_MAXIMUM_RECORD_SIZE );
	/*
	 * Number of LONGs needed.  Note that both values being added
	 * below are evenly divisible by a QUAD(8).  We do not keep in
	 * QUAD counts because not enough compilers support QUADs
	 */
	neededLongs = recordSize / 4 + ( sizeof( ZLOGRecordHeader_s )/4 );
	zlogBeast = zlogXaction->ZX_zlogBeast;
	zASSERT(!(zlogBeast->ZLB_State & ZLOG_ZB_S_NEEDTODO_COMPENSATION));
	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
	/* Record the total record size in bytes (header included) */
	ZLOG_HISTOGRAM_EVENT( &zlogBeast->ZLB_SizeHistogram, neededLongs*4u );

	zASSERT(!(zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLrepairFlags &
			ZP_REPAIRFLAGS_NO_LOGGING_ALLOWED));
	if (zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLrepairFlags &
			ZP_REPAIRFLAGS_NO_LOGGING_ALLOWED)
	{	/* Same check as the zASSERT, enforced in every build */
		ZOS_Abend(MSGNot("During rebuild logging is not allowed."));
		ZOS_Abend(MSGNot("Quit from the Debugger or your pool will be corrupted."));
		RTN_VOID();
	}
/***************/
zlog_ORTryAgain:;
/***************/
	/*
	 * Get exclusive latch.  This latch is not released until the user
	 * calls ZLOG_ReleaseRecord().  The only exception is we will release
	 * on a full file before we loop back here.
	 */
	X_LATCH( &zlogBeast->ZFSLOGbeastLatch );
#if NSS_ASSERT IS_ENABLED
	/* The unit test is evil so skip this test if unit test is running. */
	if ( !ZlogUnitTest )
	{	/* No one can call until after we start Process Transaction */
		zASSERT( zlogBeast->ZLB_DebugState >= ZLOG_ZB_DS_PT_START );
	}
#endif
	/*
	 * Check if someone forgot to call ZLOGReleaseRecord which sets
	 * to NULL.  Only works if they also released beastLatch because
	 * otherwise we are hung in the X_LATCH above.
	 */
	zASSERT( zlogBeast->ZLB_ZfsXasRecovery == NULL );
	/* Make local copy */
	if ( zlogBeast->ZLB_ActiveHasBeenRead == FALSE )
	{	/*
		 * This test is only TRUE the first time Obtain is called.  At
		 * which time ZLB_Buffer is also NULL (which is required).
		 */
		zlogBeast->ZLB_ActiveHasBeenRead = TRUE;
		goto GetCurrentBlock;
	}
	/*
	 * Now check if defaultFlush (caused by timer pop or a DATA buffer
	 * wanting to go out or a beast flush) has been called on the
	 * buffer we are currently using.  If any of the above have occurred
	 * our flush block start handler NULLs out our buffer pointer.  We
	 * use MPKNSS_LOCK() for protection because our handler can not
	 * get a latch on the ZLOG beast to protect ZLB_Buffer.  Because
	 * our signal handler has a shared latch on the buffers agent latch
	 * and a MPKNSS_LOCK() we can use either latch method to protect
	 * ZLB_Buffer.
	 */

#if NSS_DEBUG IS_ENABLED
	if ( zlogBeast->ZLB_P.ZLBP_Mode & ZLOG_THROTTLE_BACK )
	{	/* Track number of obtains we have while in throttle back mode.
		 * Only done in debug mode because not worth the cycles in
		 * the production system.
		 */
		++zlogBeast->ZLB_P.ZLBP_FileThrottleSkipCount;
	}
#endif
	if ( zlogBeast->ZLB_Buffer == NULL )
	{	/*
		 * defaultFlush() has been called on our buffer.  Just go get
		 * the next block.
		 */

		goto GetNextBlock;
	}
	else
	{
		NINT waitFlag;

		buffer = zlogBeast->ZLB_Buffer;
		/*
		 * We will use the XLATCH code that will not block if we can
		 * not get an X Latch.
		 */
		X_NOWAIT( &buffer->agent.latch, waitFlag );
		/* Is the latch owned by someone else? */
		if ( waitFlag == WAITED )
		{	/*
			 * Yes - then we will just go to the next buffer.  Note that
			 * this works because we have marked the buffer dirty and
			 * filled in one record in it when we first grabbed it.  The
			 * timer in it will defaultFlush it at some point and our
			 * signal handler will checksum it.
			 */
			zlogBeast->ZLB_Buffer = NULL;

			goto GetNextBlock;
		}

		zASSERT( zlogBeast->ZLB_Buffer != NULL );
	}

	/*
	 * It is O.K. to use the current ZLOG buffer!!!
	 */
	/* O.K. to give out log record */
	logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data;
	if ( (NINT)logBlockHeader->ZLBH_NumberOfUnusedLongs >= neededLongs )
	{	/* Current log block has enough room for request */
		if ( zlogBeast->ZLB_Barrier == NULL )
		{	/* defaultFlush() has been called on our barrier. Get a new one. */
			zlogBeast->ZLB_Barrier = ZlssBarrier_s_Construct(zlogBeast);
			ZLSSBarrierBind( zlogBeast->ZLB_Barrier, zlogBeast->ZLB_Buffer );
		} else {
			NINT waitFlag;

			X_NOWAIT( &zlogBeast->ZLB_Barrier->ZB_Agent.latch, waitFlag );
			/* Is the latch owned by someone else?  Should indicate
			   that a defaultFlush is being done on barrier. */
			if ( waitFlag == WAITED )
			{	/* Yes - then we will just go to another barrier. */
				zlogBeast->ZLB_Barrier = ZlssBarrier_s_Construct(zlogBeast);
				ZLSSBarrierBind( zlogBeast->ZLB_Barrier, zlogBeast->ZLB_Buffer );
			}
			zASSERT( zlogBeast->ZLB_Barrier != NULL );
		}

		/* First successful exit: record goes into the current block */
		zlog_ORCommon( neededLongs, zlogBeast, logBlockHeader, zlogXaction );
		RTN_VOID();
	}
	/* There is not enough space in the current log block */
	/* Mark why we are writing and ensure debug bit set correctly */
	logBlockHeader->ZLBH_Status = ZLBH_S_TOOBIG | ZLBH_S_DEBUG
									| ZLBH_S_CHECKSUM;
	/*
	 * Indicate that we are done with the buffer, before we allow
	 * shared access which would allow our signal handler access to
	 * ZLB_Buffer.  We do not need MPKNSS_LOCK() here because we have the
	 * ZLB_Buffer latched.
	 */
	zlogBeast->ZLB_Buffer = NULL;
	/*
	 * We are now done with this buffer, unlatch and then flush it.
	 */
	CACHE_UNXLATCH( buffer );
	/*
	 * To write we simply note that we would like to do a flush of
	 * our buffer.
	 * NOTE(review): the defaultFlush() call is commented out here, so
	 * this path only unlatches the buffer.
	 */
//	defaultFlush( &buffer->agent );
	/*
	 * Write errors show up in our done signal handler.
	 */

	/*** Get the next log block in the file ***/
/************/
GetNextBlock:;
/************/
	if (zlogBeast->ZLB_State & ZLOG_ZB_S_DOING_COMPENSATION)
	{
		/* We are in compensation writing mode.  At this time we do not
		 * want to take checkpoints.  We are also making the assumption
		 * that there are only very few undo records and even fewer logical
		 * undo records, that the compensation records that are created
		 * are not enough to fill the log.  Also when we start compensation
		 * records, we have moved home to the beginning of where the logical
		 * undo records were copied.  Thus, at this time we have the whole
		 * log available for compensation records
		 */

		/* Accounting */
		zlogBeast->ZLB_LogBlockFilledInCount++;
		/* Track blocks used for checkpointing determination */
		zlogBeast->ZLB_LogBlockFilledInSinceCheckpointCount++;

		goto SkipChkPtAndZlogFullTest;
	}
	/*
	 * We are back the buffer may or may not be flushed, but we
	 * don't care.
	 */

	/*
	 * The ZLOG code is very dependent that this is the ONLY place a log
	 * block is obtained for transaction logging.
	 *
	 * Verify that we are not about to give out a block that could be
	 * needed if we crash!
	 *
	 * Is the file 'almost' full (I.E. only one block free)?
	 */
	if ( zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber ==
			zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber )
	{	/* Yes - The file is full */
#if NSS_DEBUG IS_ENABLED
		DBG_DebugPrintf( TZLOG_COLOR, MSGNot("\"%U\" ZLOG file full\n"),
			zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLroot.name );
		aprintf( CYAN, MSGNot("\"%U\" ZLOG file full\n"),
			zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLroot.name );
		WARN(zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber !=
			zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber );
#endif
/* FixFixFix6(Debug) -
 * Leave next line in until we find out why checkpoints are not being taken.
 */
		zASSERT( "ZLOG file is FULL!!! This may be temporary, but it still should not occur" == NULL );
		if ( zlogBeast->ZLB_FileFullWaiters == 0 )
		{	/* First waiter: take the checkpoints, then wait for room */
			X_LATCH( &zlogBeast->ZLB_FileFullLatch );
			zlogBeast->ZLB_FileFullWaiters = 1;
			UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );
			/*
			 * By taking a checkpoint here we are hoping to move the
			 * pre oldest home pointer.  This will work until all four
			 * checkpoints have the same home pointer.  At which point,
			 * file full can only be resolved by home moving.  We will
			 * take four checkpoints here so to max our chances of
			 * pre home moving and thus getting us out of file full
			 * condition.  More than likely we are full because home
			 * has not moved in a long time so this code will not move
			 * the pre oldest home.  This code will only help if we have
			 * not been taking enough checkpoints.
			 */
			/* Reset counter */
			zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL );
			DEBUG_PRINTF(TZLOG,DBG_INDENT,(CYAN,MSGNot("Attempting four checkpoint miracle\n")));
			/*
			 * Take synchronous checkpoints; the loop runs
			 * CHECKPOINT_NUMBER times.  The result reported through
			 * genMsg is not examined.
			 */
			{
				NINT checkpoint;

				for ( checkpoint = 0; checkpoint < CHECKPOINT_NUMBER;
						++checkpoint )
				{
					ZFSPOOL_CheckpointTake( &genMsg,
						zlogBeast->ZFSLOGroot.vol.zfsVol->pool,
						CHECKPOINT_CT_S_NORMAL|CHECKPOINT_CT_S_MIRACLE );
				}
			}
#if NSS_DEBUG IS_ENABLED
			if ( zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber ==
				zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber )
			{	/* Yes - The file is still full */
				zASSERT( "ZLOG file is FULL!!! This may be temporary, but it still should not occur" == NULL );
			}
#endif
			/*
			 * Wait for room in the file - I.E. home or pre home moving.
			 * If our four checkpoints moved pre home then we will rip
			 * through the S_BARRIER.
			 */
			S_BARRIER( &zlogBeast->ZLB_FileFullLatch );
			goto zlog_ORTryAgain;
		}
		/* Someone else is already handling file full: just wait */
		UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );

		S_BARRIER( &zlogBeast->ZLB_FileFullLatch );
		goto zlog_ORTryAgain;
	}

	if ( !(zlogBeast->ZLB_P.ZLBP_Mode & ZLOG_THROTTLE_BACK) )
	{	/* In throttle full mode */
		NINT blocksInuse;

		if ( zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber <=
				zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber )
		{	/* Do wrap math calculation */
			blocksInuse = (NINT)zlogBeast->ZLB_NumberOfLogBlocks -
				(NINT)zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber +
				(NINT)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber;
			zASSERT( zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber !=
				zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber );
		}
		else
		{	/* No wrap -- do normal math */
			blocksInuse = (NINT)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber -
				(NINT)zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber;
		}

		/*
		 * Flush metadata buffers when usage is exactly at one of the
		 * thresholds i = CacheFlushAtJournalPercent, +5, ... below 50.
		 * The test is for equality, so a threshold is only acted on by
		 * a call that observes exactly that block count.
		 * NOTE(review): presumably ZLBP_FileThrottleBack is half the
		 * log, which would make journalCount i percent of the log --
		 * confirm.
		 */
		for (i = CacheFlushAtJournalPercent; i < 50; i+=5)
		{
			journalCount = (zlogBeast->ZLB_P.ZLBP_FileThrottleBack * 2 * i)/100;
			if (blocksInuse == journalCount)
			{
//				printk("<1> blocksInuse at %d percent\n", i);
				cacheFlushMetadataBuffers();
			}
		}
		if ( blocksInuse >= zlogBeast->ZLB_P.ZLBP_FileThrottleBack )
		{	/* Yes - The file is 'too' full */
			/* Go into throttle back mode */
//			printk("<1> blocksInuse at 50 percent\n");
			cacheFlushMetadataBuffers();
			zlogBeast->ZLB_P.ZLBP_Mode |= ZLOG_THROTTLE_BACK;
			++zlogBeast->ZLB_P.ZLBP_FileThrottleBackCount;
			zlogBeast->ZLB_P.ZLBP_FileThrottleTimeStart = GetUTCTime();
#if NSS_DEBUG IS_ENABLED
			{
				/* Shadows the function-level 'buffer' inside this block */
				char buffer[40];


				DBG_DebugPrintf( TZLOG_COLOR,
					MSGNot("\"%U\" Entering transaction throttle back mode at %s\n"),
					zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLroot.name,
					UTCTime2Str(
						zlogBeast->ZLB_P.ZLBP_FileThrottleTimeStart,
						&buffer[0]) );
				DBG_ScreenAPrintf("ZLSS.Greg.Pool.Volume", WHERE,
					CYAN,
					MSGNot("\"%U\" Entering transaction throttle back mode at %s\n"),
					zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZFSPOOLroot.name,
					UTCTime2Str(
						zlogBeast->ZLB_P.ZLBP_FileThrottleTimeStart,
						&buffer[0]) );

			}
#endif
			/*
			 * We schedule a checkpoint when we enter throttle back
			 * mode.  If checkpoints have not been taken in a while
			 * then this can free up ZLOG space and thus allow us to
			 * quickly exit throttle back mode.  With current 1/12
			 * checkpoint rate the lack of taking checkpoints should
			 * not be causing the throttle back condition.  We take a
			 * checkpoint here just because it can't hurt.
			 */
			/* Async, non-blocking routine to schedule a checkpoint */
			zlog_CheckpointTakeSchedule( zlogBeast,
				CHECKPOINT_CT_S_NORMAL|CHECKPOINT_CT_S_THROTTLE_BACK );
		}
	}
	/* Disabled: delay-based back-off while already in throttle back mode */
#if 0
	else
	{
		NINT blocksInuse;
		NINT percent;
		SNINT delayAmount;	/* In milliseconds */

		++zlogBeast->ZLB_P.ZLBP_FileThrottleWaitCount;

		if ( zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber <=
				zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber )
		{	/* Do wrap math calculation */
			blocksInuse = (NINT)zlogBeast->ZLB_NumberOfLogBlocks -
				(NINT)zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber +
				(NINT)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber;
			zASSERT(zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber !=
				zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber );
		}
		else
		{	/* No wrap -- do normal math */
			blocksInuse = (NINT)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber -
				(NINT)zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber;
		}
		if ( blocksInuse >= zlogBeast->ZLB_P.ZLBP_FileThrottleBack )
		{	/* Yes - The file is 'too' full */
			percent = (100u * blocksInuse) /
				zlogBeast->ZLB_NumberOfLogBlocks;
			if ( percent < 55u )
			{
				delayAmount = 1000u/10u;	/* .1 sec */
				++zlogBeast->ZLB_P.ZLBP_FileThrottleWait1Count;
			}
			else
			{
				if ( percent < 60u )
				{
					delayAmount = 1000u/2u;	/* .5 sec */
					++zlogBeast->ZLB_P.ZLBP_FileThrottleWait2Count;
				}
				else
				{
					if ( percent < 70u )
					{
						delayAmount = 1000u*1u;	/* 1 sec */
						++zlogBeast->ZLB_P.ZLBP_FileThrottleWait3Count;
					}
					else
					{
						if ( percent < 80u )
						{
							delayAmount = 1000u*10u;	/* 10 sec */
							++zlogBeast->ZLB_P.ZLBP_FileThrottleWait4Count;
						}
						else
						{
							if ( percent < 90u )
							{
								delayAmount = 1000u*100u;	/* 100 sec */
								++zlogBeast->ZLB_P.ZLBP_FileThrottleWait5Count;
							}
							else
							{
								delayAmount = 1000u*1000u;	/* 1000 sec */
							}
						}
					}
				}
			}
			UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );
			/* The specification requires us not to sleep for
			 * a long time so that if the throttle mode changes
			 * our thread gets to run 'right' away.  We therefore
			 * delay 0.1 seconds and then see if still in throttle
			 * back mode.  We also let run when thread has waited
			 * required amount(remember this prevents deadlocks).
			 */
			do {
				LB_delay( 1000u/10u );
				delayAmount -= 1000u/10u;
				/* Note that we normally are in a MPKNSS_LOCK region
				 * when looking at the throttle bit.  We ignore here
				 * because it will not cause any serious problems.
				 */
			} while ( (delayAmount > 0) &&
				(zlogBeast->ZLB_P.ZLBP_Mode & ZLOG_THROTTLE_BACK) );

			goto zlog_ORTryAgain;
		}
	}
#endif

	/* Accounting */
	zlogBeast->ZLB_LogBlockFilledInCount++;
	/* Track blocks used for checkpointing determination */
	zlogBeast->ZLB_LogBlockFilledInSinceCheckpointCount++;
	/* Have we written enough blocks to warrant a checkpoint? */
	if (zlogBeast->ZLB_LogBlockFilledInSinceCheckpointCount >=
			zlogBeast->ZLB_LogBlockFilledInMaximum )
	{	/* Yes - then schedule a checkpoint */
		/*
		 * It is interesting that taking checkpoints when home has not
		 * moved is just a waste of time!  We have one exception to this
		 * which is the 'miracle four checkpoints' which attempt to move
		 * pre oldest and not home.  Moving home reduces recovery time
		 * while moving pre oldest home frees space in the ZLOG file.
		 */
		/* Async, non-blocking routine to schedule a checkpoint */
		zlog_CheckpointTakeSchedule( zlogBeast,
			CHECKPOINT_CT_S_NORMAL|CHECKPOINT_CT_S_ACTIVE_MOVEMENT );
	}

/***************/
SkipChkPtAndZlogFullTest:;
/***************/

	/* Update to next log block number */
	ZLOG_NEXT_POOL_LOG_BLOCK( zlogBeast,
		zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber );
/***************/
GetCurrentBlock:;
/***************/
	/*
	 * Because we always write the full block we do not need to do
	 * a pre-read.  So we are only getting a buffer from the cache system.
	 * We will have this buffer X latched so our signal handler will not be
	 * trying to poke ZLB_Buffer so we do not need MPKNSS_LOCK here.
	 */
	/*
	 * Must look on mycache list because we do not toss ALL the time.
	 *	1). Recovery.
	 *	2). Other people reading file.
	 */
	buffer = cacheLookup(&zlogBeast->ZFSLOGmycache,
			zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber,
			CACHE_WRITE);
	if ( buffer == NULL )
	{
		Blknum_t volBlk = 0;
		Fmap_s *fmap = &zlogBeast->ZFSLOGzfsStInfo->fmap;
		Blknum_t fileBlk = zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber;
		NINT i;		/* Shadows the function-level 'int i' */
//		RootBeast_s
		zASSERT(fmap->numRecs > 1);

		/*
		 * Translate the file block number to a pool block using the
		 * beast's extent map.
		 * NOTE(review): the scan starts at index 1 and uses
		 * dirExt[i-1].count as the base of extent i, so 'count' looks
		 * like a running end offset with dirExt[0] as a leading entry
		 * -- confirm against the Fmap_s definition.
		 */
		for (i = 1; i < fmap->numRecs; i++)
		{
			if (fmap->dirExt[i].count > fileBlk)
			{
				volBlk = fmap->dirExt[i].poolBlk +
						(fileBlk - fmap->dirExt[i-1].count);
				break;
			}
		}
		/* volBlk stays 0 when no extent covers fileBlk */
		zASSERT(volBlk != 0);
		/*
		 * We must know when the cache system flushes our buffer so we
		 * insert our own handler into the flush path versus the standard
		 * signal handler.
		 */
		buffer = cacheAllocBuffer( &zlogBeast->ZFSLOGmycache,
				zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber,
				volBlk, &Zlog_FlushBlockStartSignalHandler,
				CACHE_WRITE);
		zASSERT( buffer != NULL );
	}
	else
	{
		/*
		 * The poke is needed just in case the buffer was originally
		 * obtained by our recovery code or some one reading the file.
		 * In these cases the signal handler is the default block handler.
		 */
		Inst.cache.ioHitSystem++;
		POKE_SIGNAL_HANDLER( buffer, &Zlog_FlushBlockStartSignalHandler );
	}

#if 1
	if ( zlogBeast->ZLB_Barrier == NULL )
	{	/* defaultFlush() has been called on our barrier. Get a new one. */
		zlogBeast->ZLB_Barrier = ZlssBarrier_s_Construct(zlogBeast);
	} else {
		NINT waitFlag;

		X_NOWAIT( &zlogBeast->ZLB_Barrier->ZB_Agent.latch, waitFlag );
		/* Is the latch owned by someone else?  Should indicate
		   that a defaultFlush is being done on barrier. */
		if ( waitFlag == WAITED )
		{	/* Yes - then we will just go to another barrier. */
			zlogBeast->ZLB_Barrier = ZlssBarrier_s_Construct(zlogBeast);
		}
		zASSERT( zlogBeast->ZLB_Barrier != NULL );
	}
	/* Bind the new block's buffer to whichever barrier is now current */
	ZLSSBarrierBind( zlogBeast->ZLB_Barrier, buffer );
#endif

	/*
	 * Note that we have an exclusive latch on the buffer since we
	 * will be writing to it.
	 */
	zASSERT( zlogBeast->ZLB_Buffer == NULL );
	zlogBeast->ZLB_Buffer = buffer;
	/*
	 * Mark buffer as dirty.  This also starts a flush timer so that
	 * this buffer is not held too long.  Our flush handler (set above)
	 * will be called when the timer goes off (or a bound DATA block
	 * indicates it needs to go out (or someone flushed the ZLOG beast))
	 * via calls to defaultFlush().
	 */
	CACHE_MARK_DIRTY( buffer );
	/*
	 * Order cached buffers by oldest(lowest LSN) to the youngest
	 * (highest LSN).  The current buffer is the youngest (highest LSN)
	 * so we put at the end of the cached buffer list.  Zlog_WriteBlkEnd()
	 * will use and then remove from list.
	 */

	DQ_ENQ( &zlogBeast->ZLB_CachedBufferListHead, buffer, signalLink );


	logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data;
#if NSS_ASSERT IS_ENABLED || ZLOG_DEBUG IS_ENABLED
	/*
	 * Fill complete block with a known pattern.  This puts it in the
	 * padding of log records so each log record can be found and allows
	 * for ASSERTs if someone writes past their log record end.
	 * NOTE(review): the original comment called the fill "explanation
	 * points" (presumably exclamation points), but the fill byte is
	 * 0x44, which is ASCII 'D'.
	 */
	memset( logBlockHeader, 0x44, 1 << zlogBeast->ZFSLOGblkSizeShift );
#endif
	logBlockHeader->ZLBH_TimeStart = GetUTCTime();
	logBlockHeader->ZLBH_Signature = ZLBH_S_SIGNATURE;
	zASSERT( LB_GUIDValidate( &zlogBeast->ZFSLOGroot.ROOTinternalID ) );
	logBlockHeader->ZLBH_InternalID = zlogBeast->ZFSLOGroot.ROOTinternalID;
#if ZLOG_DEBUG IS_ENABLED
	logBlockHeader->ZLBH_FileBlockNumber =
		(LONG)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber;
#endif
	logBlockHeader->ZLBH_Status = ZLBH_S_TIME_OR_BOND | ZLBH_S_DEBUG
									| ZLBH_S_CHECKSUM;
	/* Set to zero because routine will increment */
	logBlockHeader->ZLBH_NumberOfRecords = 0;
	/* Set to initial size: LONGs in a block minus LONGs in the header */
	logBlockHeader->ZLBH_NumberOfUnusedLongs =
		((1 << zlogBeast->ZFSLOGblkSizeShift)/4) -
		( sizeof(ZLOGBlockHeader_s ) / sizeof(LONG) );
	/* Fill in this log block header, log record header, and ... */
	/* Second successful exit: record is the first one in a new block */
	zlog_ORCommon( neededLongs, zlogBeast, logBlockHeader, zlogXaction );
	RTN_VOID();

}	/* End of ZLOG_ObtainRecord */
+ * + */ + +void zlog_ORCommon( + NINT neededLongs, + ZlogBeast_s *zlogBeast, + ZLOGBlockHeader_s *logBlockHeader, + ZfsXaction_s *zlogXaction ) + +{ + + ZLOGRecordHeader_s *logRecordHeader; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, zlog_ORCommon); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + logRecordHeader = (ZLOGRecordHeader_s *)((LONG *)logBlockHeader + + (((1 << zlogBeast->ZFSLOGblkSizeShift)/4) - + logBlockHeader->ZLBH_NumberOfUnusedLongs)); +#if ZLOG_DEBUG IS_ENABLED + logRecordHeader->ZLRH_StartTime = (LONG)microSecondTimer(); +#endif + logRecordHeader->ZLRH_LongLength = (LONG)neededLongs; + /* Get and update LSN */ + logRecordHeader->ZLRH_Zxr.ZXR_Lsn = + (zlogBeast->ZLB_P.ZLBP_ActivePointerLsn)++; + /* Place XAS specific items into Pool Log Record Header */ + logRecordHeader->ZLRH_Zxr.ZXR_LocalXid = + zlogXaction->localXid; + +#if ZLOG_DEBUG IS_ENABLED + /* Callers must initialize after return form this funciton. */ + logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex = 0xBEEFBEEFuL; + logRecordHeader->ZLRH_Zxr.ZXR_PoolBlockCount = 0xBEEFu; +#endif +// logRecordHeader->ZLRH_Zxr.ZXR_BeastClass = +// rootBeast->beastClass->classID; + logRecordHeader->ZLRH_Zxr.ZXR_TransactionState = zlogXaction->xstate; + /* Update the Pool Log Block Header */ + logBlockHeader->ZLBH_NumberOfUnusedLongs -= (LONG)neededLongs; + logBlockHeader->ZLBH_NumberOfRecords += 1; + zlogBeast->ZLB_LogRecordFilledInCount += 1; + + if ( zlogXaction->xstate & XAS_XR_TS_START ) + { /* Store LSN and Block number to make moving home easy */ + /* + * If already on seniority list then assume a NESTED + * transaction and skip most START items. 
+ */ + if ( !QMEMBER( &zlogXaction->ZX_seniorityList ) ) + { /* Not NESTED */ + zlogXaction->ZX_lsn = logRecordHeader->ZLRH_Zxr.ZXR_Lsn; + zlogXaction->ZX_logBlockNumber = + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + /* + * Store ZLOG Beast so we can directly access when + * ZLOGTransactionHomed gets called. + */ + zlogXaction->ZX_zlogBeast = zlogBeast; + /* + * Order transactions by oldest(lowest LSN) to the + * youngest(highest LSN). The current transaction + * is the youngest (highest LSN) so we put at the + * end of the transaction list. ZLOG_TransactionHomed() will + * use and then remove from list. + */ +// NULLIFY( &zlogXaction->ZX_seniorityList ); + DQ_ENQ( &zlogBeast->ZLB_SeniorityListHead, zlogXaction, + ZX_seniorityList ); + } + + /* + * API definition requires us to turn off the start bit. Note + * that we have already stored into log record header. It is + * possible that end bit is set so do not destory because + * ZLOG_ReleaseRecord needs + */ + zlogXaction->xstate &= ~XAS_XR_TS_START; + } + /* Set up user return values */ + zlogBeast->ZLB_ZfsXasRecovery = &logRecordHeader->ZLRH_Zxr; + /* All other return values have been previously set */ + RTN_VOID(); + +} /* End of zlog_ORCommon */ + + +/************************************************************************** + * + * This routine makes copies of logical undo records, tags them as being + * copies, and gives them newer LSNs. + * + * The code for this routine has been copied from ZLOG_ObtainRecord, + * zlog_ORCommon and ZLOG_ReleaseRecord, and the unwanted pieces + * deleted. Any changes to ZLOG_ObtainRecord zlog_ORCommon ZLOG_ReleaseRecord + * need to be considered and checked to see if they need to be made in this + * routine also. 
+ * + * Parameters + * zlogBeast - ZLOG beast whose active log block receives the copy. + * zxr - recovery record of the source log record; the enclosing + * ZLOGRecordHeader_s is recovered from it with STRUCT(). + * + * Summary of the visible code: the routine selects the current log block + * (or moves to / allocates the next one), copies ZLRH_LongLength LONGs of + * the source record to the first unused LONG of that block, assigns the + * next ZLBP_ActivePointerLsn, sets XAS_XR_TS_UNDO_LOGICAL_COPY, updates the + * block and beast counters, recomputes the record checksum and releases + * the X latch on ZLB_Buffer. + * + *****************************************************************************/ +void ZLOG_CopyLogicalUndoRecord( + ZlogBeast_s *zlogBeast, + ZfsXasRecovery_s *zxr) +{ + Buffer_s *buffer; + ZLOGRecordHeader_s *srcLogRecordHeader; + ZLOGRecordHeader_s *dstLogRecordHeader; + ZLOGBlockHeader_s *logBlockHeader; + NINT neededLongs; + + ASSERT_MPKNSS_LOCK(); + + srcLogRecordHeader = STRUCT(zxr, ZLOGRecordHeader_s, ZLRH_Zxr); + + neededLongs = srcLogRecordHeader->ZLRH_LongLength; + + /* Make local copy */ + if ( zlogBeast->ZLB_ActiveHasBeenRead == FALSE ) + { /* + * This test is only TRUE the first time Obtain is called. At + * which time ZLB_Buffer is also NULL (which is required). + */ + zlogBeast->ZLB_ActiveHasBeenRead = TRUE; + goto GetCurrentBlock; + } + /* + * Now check if defaultFlush (caused by timer pop or a DATA buffer + * wanting to go out or a beast flush) has been called on the + * buffer we are currently using. If any of the above have occurred + * our flush block start handler NULLs out our buffer pointer. We + * use MPKNSS_LOCK() for protection because our handler can not + * get a latch on the ZLOG beast to protect ZLB_Buffer. Because + * our signal handler has a shared latch on the buffers agent latch + * and a MPKNSS_LOCK() we can use either latch method to protect + * ZLB_Buffer. + */ + + if ( zlogBeast->ZLB_Buffer == NULL ) + { /* + * defaultFlush() has been called on our buffer. Just go get + * the next block. + */ + + goto GetNextBlock; + } + else + { + NINT waitFlag; + + buffer = zlogBeast->ZLB_Buffer; + /* + * We will use the XLATCH code that will not block if we can + * not get an X Latch. + */ + X_NOWAIT( &buffer->agent.latch, waitFlag ); + /* Is the latch owned by someone else? */ + if ( waitFlag == WAITED ) + { /* + * Yes - then we will just go to the next buffer. Note that + * this works because we have marked the buffer dirty and + * filled in one record in it when we first grabbed it.
The + * timer in it will defaultFlush it at some point and our + * signal handler will checksum it. + */ + zlogBeast->ZLB_Buffer = NULL; + + goto GetNextBlock; + } + + zASSERT( zlogBeast->ZLB_Buffer != NULL ); + } + /* + * It is O.K. to use the current ZLOG buffer!!! + */ + /* O.K. to give out log record */ + logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data; + if ( (NINT)logBlockHeader->ZLBH_NumberOfUnusedLongs >= neededLongs ) + { /* Current log block has enough room for request */ + goto doTheActualCopy; + } + /* There is not enough space in the current log block */ + /* Mark why we are writing and ensure debug bit set correctly */ + logBlockHeader->ZLBH_Status = ZLBH_S_TOOBIG | ZLBH_S_DEBUG + | ZLBH_S_CHECKSUM; + /* + * Indicate that we are done with the buffer, before we allow + * shared access which would allow our signal handler access to + * ZLB_Buffer. We do not need MPKNSS_LOCK() here because we have the + * ZLB_Buffer latched. + */ + zlogBeast->ZLB_Buffer = NULL; + /* + * We are now done with this buffer, unlatch and then flush it. + */ + CACHE_UNXLATCH( buffer ); + /* + * To write we simply note that we would like to do a flush of + * our buffer. + */ + defaultFlush( &buffer->agent ); + /* Why is there no barrier in this code like in ZLOG_ObtainRecord? + * Because: This code creates new journal blocks that are on a + * timer. When we return from this code, it will result in + * ZLOG_ZB_S_NEEDTODO_COMPENSATION. And XactionRecover will take + * 4 clean checkpoints. A clean checkpoint causes the journal to + * be flushed, and checkpointtake will do the barrier flush. + */ + /* + * Write errors show up in our done signal handler. + */ + + /*** Get the next log block in the file ***/ +/************/ +GetNextBlock:; +/************/ + /* + * We are back; the buffer may or may not be flushed, but we + * don't care. + */ + + /* + * The ZLOG code is very dependent that this is the ONLY place a log + * block is obtained for transaction logging.
+ * + */ + /* Accounting */ + zlogBeast->ZLB_LogBlockFilledInCount++; + + /* Update to next log block number */ + ZLOG_NEXT_POOL_LOG_BLOCK( zlogBeast, + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber ); +/***************/ +GetCurrentBlock:; +/***************/ + /* + * Because we always write the full block we do not need to do + * a pre-read. So we are only getting a buffer from the cache system. + * We will have this buffer X latched so our signal handler will not be + * trying to poke ZLB_Buffer so we do not need MPKNSS_LOCK here. + */ + /* + * Must look on mycache list because we do not toss ALL the time. + * 1). Recovery. + * 2). Other people reading file. + */ + buffer = cacheLookup(&zlogBeast->ZFSLOGmycache, + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber, + CACHE_WRITE); + if ( buffer == NULL ) + { + Blknum_t volBlk = 0; + Fmap_s *fmap = &zlogBeast->ZFSLOGzfsStInfo->fmap; + Blknum_t fileBlk = zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + NINT i; + zASSERT(fmap->numRecs > 1); + + /* + * Translate the file block number into a pool block: dirExt[i].count + * is used as the cumulative block count through extent i, so the + * first extent whose count exceeds fileBlk holds the block and the + * offset into it is fileBlk minus the previous cumulative count. + */ + for (i = 1; i < fmap->numRecs; i++) + { + if (fmap->dirExt[i].count > fileBlk) + { + volBlk = fmap->dirExt[i].poolBlk + + (fileBlk - fmap->dirExt[i-1].count); + break; + } + } + zASSERT(volBlk != 0); + /* + * We must know when the cache system flushes our buffer so we + * insert our own handler into the flush path versus the standard + * signal handler. + */ + buffer = cacheAllocBuffer( &zlogBeast->ZFSLOGmycache, + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber, + volBlk, &Zlog_FlushBlockStartSignalHandler, + CACHE_WRITE); + zASSERT( buffer != NULL ); + } + else + { + /* + * The poke is needed just in case the buffer was originally + * obtained by our recovery code or someone reading the file. + * In these cases the signal handler is the default block handler. + */ + Inst.cache.ioHitSystem++; + POKE_SIGNAL_HANDLER( buffer, &Zlog_FlushBlockStartSignalHandler ); + } + /* + * Note that we have an exclusive latch on the buffer since we + * will be writing to it.
+ */ + zASSERT( zlogBeast->ZLB_Buffer == NULL ); + zlogBeast->ZLB_Buffer = buffer; + /* + * Mark buffer as dirty. This also starts a flush timer so that + * this buffer is not held too long. Our flush handler (set above) + * will be called when the timer goes off (or a bound DATA block + * indicates it needs to go out (or someone flushed the ZLOG beast)) + * via calls to defaultFlush(). + */ + CACHE_MARK_DIRTY( buffer ); + /* + * Order cached buffers by oldest(lowest LSN) to the youngest + * (highest LSN). The current buffer is the youngest (highest LSN) + * so we put at the end of the cached buffer list. Zlog_WriteBlkEnd() + * will use and then remove from list. + */ + + DQ_ENQ( &zlogBeast->ZLB_CachedBufferListHead, buffer, signalLink ); + + + logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data; +#if NSS_ASSERT IS_ENABLED || ZLOG_DEBUG IS_ENABLED + /* + * Fill complete block with a 0x44 fill pattern. This puts the + * pattern in the padding of log records so each log record can be found + * and allows for ASSERTs if someone writes past their log record end. + */ + memset( logBlockHeader, 0x44, 1 << zlogBeast->ZFSLOGblkSizeShift ); +#endif + /* + * NOTE(review): ZLBH_RebuildCount is not assigned in this routine, + * yet zlog_RAPProcessSingleLogBlock() compares it with + * ZFSLOGrebuildCount - confirm it is filled in elsewhere (for + * example by the flush handler) before the block reaches the media. + */ + logBlockHeader->ZLBH_TimeStart = GetUTCTime(); + logBlockHeader->ZLBH_Signature = ZLBH_S_SIGNATURE; + zASSERT( LB_GUIDValidate( &zlogBeast->ZFSLOGroot.ROOTinternalID ) ); + logBlockHeader->ZLBH_InternalID = zlogBeast->ZFSLOGroot.ROOTinternalID; +#if ZLOG_DEBUG IS_ENABLED + logBlockHeader->ZLBH_FileBlockNumber = + (LONG)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; +#endif + logBlockHeader->ZLBH_Status = ZLBH_S_TIME_OR_BOND | ZLBH_S_DEBUG + | ZLBH_S_CHECKSUM; + /* Set to zero because routine will increment */ + logBlockHeader->ZLBH_NumberOfRecords = 0; + /* Set to initial size */ + logBlockHeader->ZLBH_NumberOfUnusedLongs = + ((1 << zlogBeast->ZFSLOGblkSizeShift)/4) - + ( sizeof(ZLOGBlockHeader_s ) / sizeof(LONG) ); + /* Fill in this log block header, log record header, and ...
*/ +/**************/ +doTheActualCopy: +/**************/ + /* + * The copy lands on the first unused LONG of the block: block size in + * LONGs minus ZLBH_NumberOfUnusedLongs. + * NOTE(review): on the new-block path there is no check that + * neededLongs fits in the freshly initialised block - presumably a + * record never exceeds one block; confirm. + */ + + dstLogRecordHeader = (ZLOGRecordHeader_s *)((LONG *)logBlockHeader + + (((1 << zlogBeast->ZFSLOGblkSizeShift)/4) - + logBlockHeader->ZLBH_NumberOfUnusedLongs)); + + memcpy(dstLogRecordHeader, srcLogRecordHeader, neededLongs * sizeof(LONG)); +#if ZLOG_DEBUG IS_ENABLED + dstLogRecordHeader->ZLRH_StartTime = (LONG)microSecondTimer(); +#endif + /* Get and update LSN */ + dstLogRecordHeader->ZLRH_Zxr.ZXR_Lsn = + (zlogBeast->ZLB_P.ZLBP_ActivePointerLsn)++; + + /* Tag the copy so ZLOG_RecoveryGet() can skip it */ + dstLogRecordHeader->ZLRH_Zxr.ZXR_TransactionState |= + XAS_XR_TS_UNDO_LOGICAL_COPY; + + /* Update the Pool Log Block Header */ + logBlockHeader->ZLBH_NumberOfUnusedLongs -= (LONG)neededLongs; + logBlockHeader->ZLBH_NumberOfRecords += 1; + zlogBeast->ZLB_LogRecordFilledInCount += 1; + + zlogBeast->ZLB_P.ZLBP_ActivePointerReferenceBlockCount += + dstLogRecordHeader->ZLRH_Zxr.ZXR_PoolBlockCount; + +#if ZLOG_DEBUG IS_ENABLED + /** Fill in end time **/ + dstLogRecordHeader->ZLRH_EndTime = (LONG)microSecondTimer(); +#endif + + /* + * Calculate checksum on all bytes in log record. This includes + * the bytes that are used to align us to the proper boundary. This + * is done because ZLOG does not track how many of these bytes + * are present in each log record. In addition, our checksum + * routine works on LONGs so as to be fast. + */ + dstLogRecordHeader->ZLRH_Checksum = 0; + /* + * The checksum we store is the value that makes a good + * block's checksum equal to zero. + */ +#if ZLOG_CHECKSUM_LOG_RECORDS IS_ENABLED + dstLogRecordHeader->ZLRH_Checksum = 0 - zlog_CalculateChecksum( + (LONG *)dstLogRecordHeader, + (dstLogRecordHeader->ZLRH_LongLength) ); +#endif + /** Release the buffer (this allows it to be flushed) **/ + CACHE_UNXLATCH( zlogBeast->ZLB_Buffer ); + + return; +} + + +/* + * + * This is really 'Create.' Our constructor has already been called. All + * 'is a' guys do not get called? Of course, our unpack routine will not + * be called.
+ * + * This routine is called by ZFSPOOL_InitNewPool after the five super + * beasts have been created. ZFSPOOL_InitNewPool does not initialize + * the ZLOG beast file portion so we do that here. + * + * Execution Requirements/Assumptions + * Synchronous and blocking + * + */ + + +/* + * Zlog_PoolInitialize() - + * Initializes ALL persistent fields of the ZLOG beast. + * + * Notes - + * When called by rebuild some statistical fields are left as + * passed in. This is because rebuild does unpack a ZLOG beast + * before calling us. We still MUST reset LSN related items as + * rebuild ZAPs all LSNs when fixing a pool. + * + * Parameters - + * genMsg - handed to zfsAllocExtent() and zlog_CommonCreateAndOpen(). + * zlogBeast - the ZLOG beast to initialize. + * rebuild - when TRUE the log file extents are not (re)allocated and + * the statistics counters are left untouched; a rebuild + * time stamp is recorded instead. + * + * Returns - + * zOK on success, zFAILURE when zfsAllocExtent() fails, otherwise the + * status reported by zlog_CommonCreateAndOpen(). + */ +STATUS Zlog_PoolInitialize( + GeneralMsg_s *genMsg, + ZlogBeast_s *zlogBeast, + BOOL rebuild ) + +{ + QUAD zlogFileSize; + ZfsPool_s *pool; + Blknum_t blocksInLogFile; + STATUS status; + Extent_s extent; + NINT len; + NINT dirIndex; + NINT checkpoint; + NINT extentsize; + Blknum_t seed[8]; +#if NSS_DEBUG IS_ENABLED + NINT temp; +#endif + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_PoolInitialize); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + + /* Get to the pool beast that we belong with */ + zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL ); + pool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool; + zASSERT( pool->zfsLogBeast == zlogBeast ); + /* We create a ZLOG file with between 200 and 2500 log blocks. The + * exact number is determined by using 0.5% of the pool's blocks for + * the ZLOG file. If this number is not within range then we force + * it into range. + */ +#if NSS_DEBUG IS_ENABLED + /* + * This is real dirty and does not work real well if more than one + * pool, but we turn off write prevention so we can init the ZLOG + * file.
+ */ + temp = CrashPools; + CrashPools = FALSE; +#endif + + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + + if (!rebuild) + { + /* FixFixFix6(Design) - we need a better formula and/or to be configurable */ + zlogFileSize = ( (QUAD)pool->ZFSPOOLtotalBlocks + << zlogBeast->ZFSLOGblkSizeShift ) / + ZLOG_FILE_SIZE_FRACTION; + if ( zlogFileSize > ZLOG_FILE_SIZE_CROSS_OVER ) + { /* Rebuild may need to have a larger ZLOG file + * than what ZLOG requires. If so make the ZLOG + * file bigger. + */ + zlogFileSize = (QUAD)pool->ZFSPOOLtotalBlocks * + ZLOG_REBUILD_BYTES_PER_BLOCK; + if ( zlogFileSize < ZLOG_FILE_SIZE_CROSS_OVER ) + { + zlogFileSize = ZLOG_FILE_SIZE_CROSS_OVER; + } + } + else + { /* Force to at least minimum size */ + if ( zlogFileSize < ZLOG_FILE_SIZE_MINIMUM ) + { + zlogFileSize = ZLOG_FILE_SIZE_MINIMUM_AT_CREATE; + } + } + zlogFileSize += ZLOG_REBUILD_FUDGE_BLOCKS * 4096; + /* A little round up so Rebuild can + * have two areas that can both be + * rounded to 4K. + */ +#if NSS_DEBUG IS_ENABLED + /* On debug systems with pools over 256MB make the Journal + * at least 128MB for debugging. + * NOTE(review): the test compares ZFSPOOLtotalBlocks (a block + * count) with 256*1024*1024 - confirm blocks, not bytes, are meant. + */ + if ( (zlogFileSize < (128*1024*1024uL)) && (pool->ZFSPOOLtotalBlocks > (256*1024*1024uL) ) ) + { + zlogFileSize = 128*1024*1024uL; + } +#endif + zASSERT( zlogBeast->ZFSLOGblkSizeShift != 0 ); + /* Round the byte size up to a whole number of log blocks */ + blocksInLogFile = (zlogFileSize+(1 << zlogBeast->ZFSLOGblkSizeShift)-1) + >> zlogBeast->ZFSLOGblkSizeShift; + /* + * Because this is create time for the pool we have to use special + * code to allocate the blocks of the ZLOG file. Once the blocks + * are allocated we can use COMN_... routines as long as we do + * not extend the file (which ZLOG does not do). + */ + len = blocksInLogFile; + extentsize = (len + ZLOG_EXTENT_MIN - 1)/ ZLOG_EXTENT_MIN; + + /* TODO(Perf): Should we still split journal into 8 sections?
+ We split the file because Ben required us so that he would + see all the lights on in an 8 drive non-RAID system:-( */ + /* seed[0] is block 1; seed[k] works out to k * seed[1] for k = 2..7 */ + seed[0] = 1; + seed[1] = pool->ZFSPOOLtotalBlocks / ZLOG_EXTENT_MIN; + seed[2] = seed[1] << 1; + seed[3] = seed[1] + seed[2]; + seed[4] = seed[1] << 2; + seed[5] = seed[1] + seed[4]; + seed[6] = seed[2] + seed[4]; + seed[7] = seed[1] + seed[6]; + + /* dirExt[0] is a zero sentinel; later entries hold cumulative counts */ + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.dirExt[0].count = 0; + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.dirExt[0].poolBlk = 0; + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.numRecs = 1; + dirIndex = 1; + while (len) + { + zASSERT(dirIndex < MAX_DIRECT); + + /*- make a min. extent count of 8 to spread the log activity -*/ + /* + * NOTE(review): seed[] has 8 entries but the only bound checked is + * dirIndex < MAX_DIRECT - TODO confirm that at most 8 full-size + * extents are ever requested. + */ + if (len >= extentsize) + { + extent.lengthOfExtent = extentsize; + extent.poolBlkNum = seed[dirIndex - 1]; + } + else + { + extent.lengthOfExtent = len; + extent.poolBlkNum = 0; + } + + if ( zfsAllocExtent(genMsg, &pool->ZFSPOOLzfsVol, &extent, + XTREE_SYSTEM_REQUEST, NULL) != zOK ) + { +#if NSS_DEBUG IS_ENABLED + CrashPools = temp; +#endif + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + RTN_STATUS(zFAILURE); + } + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.dirExt[dirIndex].count = + extent.lengthOfExtent + + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.dirExt[dirIndex-1].count; + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.dirExt[dirIndex].poolBlk = + extent.poolBlkNum; + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmap.numRecs++; + len -= extent.lengthOfExtent; + ++dirIndex; + } + zlogBeast->ZFSLOGroot.storage.zfsInfo->nextBlk = blocksInLogFile; + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmapDataBlks = blocksInLogFile; + zlogBeast->ZFSLOGroot.storage.zfsInfo->fmapTreeBlks = 0; + } +// COMN_MARK_BEAST_DIRTY( &zlogBeast->ZFSLOGroot ); Don't do as checkpoints write. + + /* + * Update EOF - We use number of blocks and not zlogFileSize + * because of possible rounding error in file size to block conversion.
+ */ + zlogBeast->ZFSLOGroot.eof = (QUAD)zlogBeast->ZFSLOGroot.storage.zfsInfo->fmapDataBlks << + zlogBeast->ZFSLOGblkSizeShift; + + zlogBeast->ZLB_P.ZLBP_Signature = ZLOG_ZLBP_S_SIGNATURE; + zlogBeast->ZLB_P.ZLBP_Signature2 = ZLOG_ZLBP_S_SIGNATURE; + zlogBeast->ZLB_P.ZLBP_VersionMajor = ZLOG_ZLBP_VM_MAJOR; + zlogBeast->ZLB_P.ZLBP_VersionMinor = ZLOG_ZLBP_VM_MINOR; + zlogBeast->ZLB_P.ZLBP_FileVersion = ZLOG_ZLBP_FV_VERSION; + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn = 1; + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber = 0; + zlogBeast->ZLB_P.ZLBP_HomePointerLsn = 1; + zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber = 0; + for ( checkpoint = 0 ; checkpoint < CHECKPOINT_NUMBER; checkpoint++ ) + { + zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[checkpoint] = 1; + zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[checkpoint] = 0; + } + zlogBeast->ZLB_P.ZLBP_FileThrottleTimeStart = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleTimeEnd = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleFullPercent = ZLOG_THROTTLE_FULL_PERCENT; + zlogBeast->ZLB_P.ZLBP_FileThrottleBackPercent = ZLOG_THROTTLE_BACK_PERCENT; + zlogBeast->ZLB_P.ZLBP_Mode = ZLOG_MODE_DEFAULT; + bzero( zlogBeast->ZLB_P.ZLBP_Padding, sizeof( zlogBeast->ZLB_P.ZLBP_Padding ) ); + zlogBeast->ZLB_P.ZLBP_PreHomePointerReferenceBlockCount = 0; + zlogBeast->ZLB_P.ZLBP_HomePointerReferenceBlockCount = 0; + zlogBeast->ZLB_P.ZLBP_ActivePointerReferenceBlockCount = 0; + zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn = 0; + zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn = 0; + zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum = 0; + zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum = 0; + bzero( zlogBeast->ZLB_P.ZLBP_Reserved, sizeof( zlogBeast->ZLB_P.ZLBP_Reserved ) ); +// zlogBeast->ZLB_P.ZLBP_Pad2 = 0; + + /* Don't reset counters when called by rebuild */ + if (!rebuild) + { /* Non-rebuild initialization of statistics */ + zlogBeast->ZLB_P.ZLBP_FileThrottleWaitCount = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleWait1Count = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleWait2Count = 0; +
zlogBeast->ZLB_P.ZLBP_FileThrottleWait3Count = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleWait4Count = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleWait5Count = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleFullCount = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleCheckpointCount = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleSkipCount = 0; + zlogBeast->ZLB_P.ZLBP_FileThrottleBackCount = 0; + zlogBeast->ZLB_P.ZLBP_TimeActivation = 0; + zlogBeast->ZLB_P.ZLBP_TimeDeactivation = 0; + zlogBeast->ZLB_P.ZLBP_TimePoolCreate = GetUTCTime(); + zlogBeast->ZLB_P.ZLBP_RecoveryRecoverCount = 0; + zlogBeast->ZLB_P.ZLBP_RecoveryActivationCount = 0; + zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeStart = 0; + zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeEnd = 0; + zlogBeast->ZLB_P.ZLBP_LastRecoveryActivationCount = 0; + zlog_initAllPersistentStatistics( zlogBeast ); + } + + if (rebuild) + { /* Rebuild specific initialization + * - Track last 16 rebuild times. + */ + ZLOG_HISTORY_EVENT( &zlogBeast->ZLB_History[ZH16_REBUILD_UTC], + (QUAD)(GetUTCTime()) ); + } + status = zlog_CommonCreateAndOpen( genMsg, zlogBeast ); + if ( status != zOK ) + { +#if NSS_DEBUG IS_ENABLED + CrashPools = temp; +#endif + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + RTN_STATUS(status); + } + /* + * Wait until here for Pre Oldest because the MACRO uses items + * that are filled in by common create and open. + * NOTE(review): the macro is applied twice and this routine does not + * visibly assign ZLBP_PreOldestHomePointerBlockNumber beforehand - + * confirm its starting value and that two steps back are intended. + */ + ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber ); + ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber ); +// COMN_MARK_BEAST_DIRTY( &zlogBeast->ZFSLOGroot ); Don't do as checkpoints write. + +#if NSS_DEBUG IS_ENABLED + CrashPools = temp; +#endif + /* Since recovery is not needed on create we can say we are done + with recovery. */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_END; + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + + /* Wait for the LOG buffers to write, then toss everything.
Since + * we no longer INIT zlog blocks this should not be needed. But + * since there are no blocks on cache there is no HARM in leaving + * in code. + */ + cacheFlushMyCache( &zlogBeast->ZFSLOGmycache ); + X_BARRIER( &zlogBeast->ZFSLOGbeastLatch ); + cacheTossAll( &zlogBeast->ZFSLOGmycache ); + /* + * We do not take a checkpoint because our caller will do so when + * 'create pool' is done. This is good because if we crash before + * finishing create then there will not be an incomplete pool. Note + * that the caller will write all CHECKPOINT_NUMBER checkpoints when done. + */ + RTN_STATUS(status); + +} /* End of Zlog_PoolInitialize() */ + + +/* + * + * zlog_RAPProcessSingleLogBlock() + * + * Part of Recover Active Pointer logic. + * + * Reads the next log block. Verifies the block and then sees if the + * block is active or not. + * + * Returns + * zOK - If no errors. + * zFAILURE - A fatal error in processing log block. + * + * Additional Returns (via dereference of supplied pointers) + * 'done' - If zOK then 'done' has been set. + * TRUE - Active Pointer has been found and set. + * FALSE - Active Pointer has not yet been found. + * 'genMsg' - If zFAILURE the 'genMsg' has been set. + * '.errno' - On errors the 'errno' field is set.
+ * + * Inputs taken from the beast + * ZLB_RecoveryActivePointerBlockNumber - log block that is read. + * ZLB_RecoveryActivePointerLsn - LSN the block is compared with. + * Both are advanced when the block is accepted as still active. + * + */ + +STATUS zlog_RAPProcessSingleLogBlock( + GeneralMsg_s *genMsg, + ZlogBeast_s *zlogBeast, + NINT *done ) + +{ + + Buffer_s *buffer; + ZLOGBlockHeader_s *logBlockHeader; + ZLOGRecordHeader_s *logRecordHeader; + ZfsPool_s *pool; + Lsn_t specialAdd; + LONG checksum; + STATUS status; + IoMsg_s ioMsg; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, zlog_RAPProcessSingleLogBlock); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL ); + pool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool; + FILEBLK_IO_MSG(ioMsg, zlogBeast, + zlogBeast->ZLB_RecoveryActivePointerBlockNumber, + 1, CACHE_READ ); + buffer = COMN_GetFileBlk(genMsg, &ioMsg); + /* + * Note that COMN_GetFileBlk() has obtained a shared latch on + * the buffer for us since we will only be reading it. We + * release this latch right after processing the block. This + * latch is only obtained if a buffer is returned. + */ + /* Did get block work? */ + if ( buffer == NULL ) + { /* NO - An error occurred we just leave genMsg alone. */ + WARN( "LOG BLOCK - get block error" == NULL ); + goto zlog_RAPGBerror1; + } + /*** Setup log block header pointer ***/ + logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data; + + /* + * Better verify length field used for checksumming. All log blocks + * must have a log block header. Log blocks do not have to have a log + * record header because install does not fill in. The ZLOG + * specification says that checksum errors can only occur if we did + * a partial write of a log block. A partial write of a log block + * indicates that we have found our active pointer.
+ */ + if ( logBlockHeader->ZLBH_NumberOfUnusedLongs > + (((1 << zlogBeast->ZFSLOGblkSizeShift) - sizeof(ZLOGBlockHeader_s) )/4)) + { +// WARN( "LOG BLOCK - Pre-checksum error assuming EOF" == NULL ); +// status = zERR_ZLOG_BAD_CHECKSUM; + goto zlog_RAPGBFoundActivePointer; + } + + /* Calculate the checksum. Checksum will equal zero if correct. */ + checksum = zlog_CalculateChecksum( (LONG *)logBlockHeader, + ((1 << zlogBeast->ZFSLOGblkSizeShift)/4) - + logBlockHeader->ZLBH_NumberOfUnusedLongs ); + if ( checksum != 0 ) + { /* Checksum error on Log Block Header */ +// WARN( "LOG BLOCK - Checksum error" == NULL ); +// status = zERR_ZLOG_BAD_CHECKSUM; + goto zlog_RAPGBFoundActivePointer; + } + + zASSERT( LB_GUIDValidate( &zlogBeast->ZFSLOGroot.ROOTinternalID ) ); +// zASSERT( LB_GUIDValidate( &logBlockHeader->ZLBH_VolumeID ) ); + + if ( LB_GUIDCompare( &zlogBeast->ZFSLOGroot.ROOTinternalID, + &logBlockHeader->ZLBH_InternalID ) != 0 ) + { /* Not a block we wrote so assume we found active pointer */ +// zASSERT( "LOG RBLOCK - GUID mismatch (Ok to go, but tell Greg you hit this)" == NULL ); + goto zlog_RAPGBFoundActivePointer; + } + if ( logBlockHeader->ZLBH_RebuildCount != zlogBeast->ZFSLOGrebuildCount ) + { /* At best this is an old block so EOF */ + goto zlog_RAPGBFoundActivePointer; + } + + /* Verify that this is a log block */ + if ( logBlockHeader->ZLBH_Signature != ZLBH_S_SIGNATURE ) + { /* Incorrect signature */ +// zASSERT( "LOG BLOCK - Signature error" == NULL ); +// status = zERR_ZLOG_BAD_BLOCK_SIGNATURE; +// goto zlog_RAPGBerror2; + goto zlog_RAPGBFoundActivePointer; + } + /*** Checksum is good and this is a ZLOG block ***/ + if ( logBlockHeader->ZLBH_Status & ZLBH_S_INSTALL ) + { /* Install wrote so no LSN. Which means this will be + our active pointer block.
*/ + goto zlog_RAPGBFoundActivePointer; + } + /*** Checksum is good and this is a ZLOG block and not install + written ***/ + if ( logBlockHeader->ZLBH_NumberOfRecords < 1 ) + { + WARN( "LOG BLOCK - Log Record count error" == NULL ); + status = zERR_ZLOG_BAD_RECORD_COUNT; + goto zlog_RAPGBerror2; + } + /*** Start looking at 1st log record within block ***/ + logRecordHeader = (ZLOGRecordHeader_s *)(logBlockHeader + 1); + + if ( logRecordHeader->ZLRH_LongLength > + ( ( (1 << zlogBeast->ZFSLOGblkSizeShift) - + sizeof(ZLOGBlockHeader_s) )/4 ) ) + { + WARN( "LOG RECORD - Pre-checksum error" == NULL ); +// status = zERR_ZLOG_BAD_CHECKSUM; + goto zlog_RAPGBFoundActivePointer; + } +#if ZLOG_CHECKSUM_LOG_RECORDS IS_ENABLED + checksum = zlog_CalculateChecksum( + (LONG *)logRecordHeader, + logRecordHeader->ZLRH_LongLength ); + if ( checksum != 0 ) + { + WARN( "LOG RECORD - Checksum error" == NULL ); +// status = zERR_ZLOG_BAD_CHECKSUM; + goto zlog_RAPGBFoundActivePointer; + } +#endif + /*** Checksum is good - Finally lets look at the LSN ***/ + zASSERT( logRecordHeader->ZLRH_Zxr.ZXR_Lsn != 0 ); + /* + * Does this block have a bigger LSN? Note the > part of check is + * needed because of our magic big add to get a new LSN at recovery + * time. + */ + if ( (logRecordHeader->ZLRH_Zxr.ZXR_Lsn + + logBlockHeader->ZLBH_NumberOfRecords) >= + zlogBeast->ZLB_RecoveryActivePointerLsn ) + { /* YES - Then this is NOT the end of the log file */ + /* + * Update LSN to next expected LSN. This may look weird, + * but every log record WITHIN A LOG BLOCK gets assigned a LSN + * one higher than the previous so we can just add the number of + * log records in the log block. Note our magic add a lot + * to the previous LSN only occurs at recovery and ONLY + * between LSNs in different log blocks.
+ */ + zlogBeast->ZLB_RecoveryActivePointerLsn = + logRecordHeader->ZLRH_Zxr.ZXR_Lsn + + logBlockHeader->ZLBH_NumberOfRecords; + /* + * Release shared access on buffer --- buffer, logBlockHeader + * and logRecordHeader are now invalid to access. + */ + CACHE_RELEASE( buffer ); + /* Update to next block number */ + ZLOG_NEXT_POOL_LOG_BLOCK( zlogBeast, + zlogBeast->ZLB_RecoveryActivePointerBlockNumber ); + *done = FALSE; + RTN_STATUS(zOK); + } + else + { /* + * NO - Then we have found the first log block that does not + * need to be recovered. We will use this log block as the + * location to set the current active pointer to. + */ +/***************************/ +zlog_RAPGBFoundActivePointer:; +/***************************/ + /* + * Release shared access on buffer --- buffer, logBlockHeader + * and logRecordHeader are now invalid to access. + */ + CACHE_RELEASE( buffer ); +#if ZLOG_DEBUG IS_ENABLED + /* ZAP pointers that point into buffer */ + buffer = NULL; + logRecordHeader = NULL; + logBlockHeader = NULL; +#endif + /* + * Update Active Pointer block and LSN. + * + * From Specification - ZLOG will add a gigabyte to this + * LSN to get the new current LSN. This is done as a proactive + * measure to reduce chances of duplicate LSNs. There really + * is no reason not to add such a number because it is only ~1 + * second if LSN increases once per nanosecond (which wraps + * a QUAD in 586 years). Also we must add a gigabyte for + * each checkpoint that could not be read. This will prevent + * us from using a LSN that one of the BAD checkpoints indicated. + * + */ + if ( zlogBeast->ZLB_P.ZLBP_ActivePointerLsn > + zlogBeast->ZLB_RecoveryActivePointerLsn ) + { /* The checkpoint has a higher LSN than what we found. */ +#if 0 + /* This just means we wrote a checkpoint before a log + * buffer made it to the media so I removed.
+ */ + zASSERT( "Active pointer LSN greater than ZLOG EOF LSN" == NULL ); +#endif + zlogBeast->ZLB_RecoveryActivePointerLsn = + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn; + } + + zASSERT( pool->ZP_BadCheckpoints < CHECKPOINT_NUMBER ); + zASSERT( pool->ZP_BadCheckpoints >= 0 ); + /* + * Defensive code - we really do not want to add + * too much to our LSN. The unit added is 0x40000000 per step. + * NOTE(review): the assert above uses < CHECKPOINT_NUMBER while + * this test uses > CHECKPOINT_NUMBER; when the two are equal both + * branches produce the same value. + */ + if ( (pool->ZP_BadCheckpoints < 0) || + (pool->ZP_BadCheckpoints > CHECKPOINT_NUMBER) ) + { /* Defensive case */ + specialAdd = (Lsn_t)0x40000000ul * (1 + CHECKPOINT_NUMBER); + } + else + { /* Normal case */ + specialAdd = (Lsn_t)0x40000000ul * (1 + pool->ZP_BadCheckpoints); + } + /* + * Now that we are happy with the active pointer found + * by recovery we can set the real active pointer to the found + * values. + */ + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber = + zlogBeast->ZLB_RecoveryActivePointerBlockNumber; + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn = + zlogBeast->ZLB_RecoveryActivePointerLsn + specialAdd; + /*** This is a happy recovery EXIT ***/ + *done = TRUE; + RTN_STATUS(zOK); + } +/***************/ +zlog_RAPGBerror2:; +/***************/ + /* Release shared access on log block buffer */ + CACHE_RELEASE( buffer ); + SetErrno( genMsg, status ); +/***************/ +zlog_RAPGBerror1:; +/***************/ + RTN_STATUS(zFAILURE); + +} /* End of zlog_RAPProcessSingleLogBlock */ + + + +/* Synchronous Thread + * + * STATUS zlog_RecoverActivePointer( + * GeneralMsg_s *genMsg, + * ZlogBeast_s *zlogBeast ) + * + * The active pointer information read from the checkpoint is no longer + * current if more log records were written after the checkpoint was + * taken. Therefore, we need to scan the ZLOG beast to find the current + * active pointer (LSN and Block number). + * + * Note life is not always simple. We do not force log blocks + * out in sequential order. Even if we did we could get a read + * error and get the same problem. This means that we can have holes + * in the log file (blocks that have old data).
These holes can + * appear anywhere after the home pointer. Technically, the holes + * (non-media related holes) + * can only appear after the commit pointer, but we do not track + * in this version of ZLOG. Any hole past the active pointer will + * cause us to mark it as the new active pointer block number. This + * is fine as long as we ensure that the active pointer LSN will + * not duplicate any values that may appear in blocks after the + * hole. This is done by adding a large value to the first LSN + * in the log block previous to the hole. But we MUST take another + * checkpoint BEFORE starting to log more transactions (this + * checkpoint is needed to record our new active pointer LSN). + * Otherwise, we could crash before new log block 1 is written, but + * after log block 2 is written. In which case, we would generate + * duplicate LSNs on the next recovery. + * + * Should we Limit LSNs and tell user to cleanly shut down so we + * can re-use LSNs? Can we re-use on a clean shutdown? + * + * Input + * ZLOG's beastLatch exclusively owned. + * ZLOG's ZLB_RecoveryLatch exclusively owned. + * + * Returns + * zOK - If no errors. + * zFAILURE - A fatal error in processing log block. + * + * Additional Returns (via dereference of supplied pointers) + * 'genMsg' - If zFAILURE the 'genMsg' has been set. + * '.errno' - On errors the 'errno' field is set. + * + */ + + +STATUS zlog_RecoverActivePointer( + GeneralMsg_s *genMsg, + ZlogBeast_s *zlogBeast ) + +{ + STATUS status; + NINT done; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, zlog_RecoverActivePointer); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + /* + * Mark that we made it to recovery - we waited until + * we got our exclusive access to the ZLOG system.
+ */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RAP_START; + /* Examine one log block per pass until the helper reports done */ + do + { + status = zlog_RAPProcessSingleLogBlock( genMsg, zlogBeast, &done ); + if ( status != zOK ) + { + /* Helper failed: record the error state and pass its status up */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RAP_ERROR; + RTN_STATUS(status); + } + } while ( !done ); + /*** This is a happy recovery EXIT ***/ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RAP_END; + RTN_STATUS(status); + +} /* End of zlog_RecoverActivePointer */ + + +/* + * void ZLOG_RecoveryClose( ZlogRecoveryKey_s *zlogKey ) + * + * zlogKey Pointer to key that user is done using. + * + * This function releases resources that ZLOG has associated with the key. + * This function must be called for each ZLOG_RecoveryOpen() call. This + * call notifies ZLOG that the caller is done with the key. After calling + * this function, the caller must not pass the key into any ZLOG recovery + * functions. + */ + +void ZLOG_RecoveryClose( + ZlogRecoveryKey_s *zlogKey ) + +{ + + ZlogBeast_s *zlogBeast; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, ZLOG_RecoveryClose); + zASSERT( zlogKey != NULL ); + zASSERT( zlogKey->ZRK_Signature == ZLOG_ZRK_S_SIGNATURE ); + zASSERT( zlogKey->ZRK_Flag == ZLOG_ZRK_F_KEY_VALID ); + + zlogBeast = zlogKey->ZRK_ZlogBeast; + zASSERT( zlogBeast != NULL ); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + /* Drop the log block buffer still held by the key, if any */ + if ( zlogKey->ZRK_Buffer != NULL ) + { + CACHE_RELEASE( zlogKey->ZRK_Buffer ); + zlogKey->ZRK_Buffer = NULL; + } + +#if NSS_ASSERT IS_ENABLED + --ZLOGRecoveryGetCount; +#endif + +#if ZLOG_DEBUG IS_ENABLED + /* ZAP whole structure (0xAA fill) for extra test */ + memset( zlogKey, 0xAA, sizeof( *zlogKey ) ); +#endif + RTN_VOID(); +} /* End of ZLOG_RecoveryClose() */ + + +/* + * STATUS ZLOG_RecoveryGet(GeneralMsg_s *genMsg, + * ZlogRecoveryKey_s *zlogRecoveryKey, NINT direction ) + * + * genMsg Contains thread specific error information.
+ * ZLOG will set the errno field when zOK is not + * returned. The errno value of + * zERR_ZLOG_NO_MORE_RECORDS indicates there are no + * log records. + * zlogRecoveryKey Pointer to key information for get. On + * successful returns, the ZRK_ZfsXasRecovery + * pointer will contain recovery information for + * the last recovery log record. This key must of + * been initialized with a single call to + * ZLOG_RecoveryOpen(). + * direction Indicates if this is a next or previous. Set to + * ZLOG_RG_NEXT for next log record. Set to + * ZLOG_RG_PREVIOUS for previous log record. Previous + * indicates a 'backwards' traversal of log records. + * + * This function returns the recovery information associated with the + * 'direction' recovery log record. This information is valid as long has no + * other ZLOG_RecoveryGet() functions are called. This function is used to + * traverse backwards or forwards through the recovery log records. + * + * The status return value of zOK indicates the call succeeded and that the + * other return values can be used. All other returns indicate an error in + * obtaining the previous recovery log record. The genMsg field errno + * contains extended error information. The genMsg errno value of + * zERR_ZLOG_NO_MORE_RECORDS indicates there are no log records. + * + * ZLOG_RecoveryGet() returns the following information in the ZLOG + * recovery key. + * + * ZRK_ZfsXasRecovery Pointer to a ZfsXasRecovery_s. See above + * for a description of the ZfsXasRecovery_s + * structure. The caller€s beast specific + * information is at zlogRecoveryKey- + * >ZRK_ZfsXasRecovery+1. + */ + +STATUS ZLOG_RecoveryGet( + GeneralMsg_s *genMsg, + ZlogRecoveryKey_s *zlogKey, + NINT direction ) +{ + STATUS status; + + /* Ignore the log records that are tagged as records that were copied + * for logical undo's. 
+ */ + do + { + status = ZLOG_InternalRecoveryGet(genMsg, zlogKey, direction); + } while ((status == zOK) && + (zlogKey->ZRK_ZfsXasRecovery->ZXR_TransactionState & + XAS_XR_TS_UNDO_LOGICAL_COPY)); + return status; +} + +STATUS ZLOG_RecoveryGetLogicalOnly( + GeneralMsg_s *genMsg, + ZlogRecoveryKey_s *zlogKey, + NINT direction ) +{ + STATUS status; + + /* Ignore the log records that are tagged as records that were copied + * for logical undo's. + */ + do + { + status = ZLOG_InternalRecoveryGet(genMsg, zlogKey, direction); + zASSERT(zlogKey->ZRK_ZfsXasRecovery->ZXR_TransactionState & + XAS_XR_TS_UNDO_LOGICAL_COPY); + } while ((status == zOK) && + (!(zlogKey->ZRK_ZfsXasRecovery->ZXR_TransactionState & + XAS_XR_TS_UNDO_LOGICAL_COPY))); + return status; +} + +STATUS ZLOG_InternalRecoveryGet( + GeneralMsg_s *genMsg, + ZlogRecoveryKey_s *zlogKey, + NINT direction ) +{ + + ZlogBeast_s *zlogBeast; +#if NSS_DEBUG IS_ENABLED + ZLOGRecordHeader_s *logRecordHeader; + NINT skip = 0; +#endif + STATUS status; + + ASSERT_MPKNSS_LOCK(); + zlogBeast = zlogKey->ZRK_ZlogBeast; + zASSERT( zlogBeast != NULL ); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + /* + * This loop ensures that we only redo/undo log records that have + * not had their METADATA homed. On most crashes the home block + * will have some log records that do not need to be played back. + * This code skips the log records whose LSN indicate that they + * have been homed. + * Note that it would be O.K. to redo/undo these + * log records because they redo routines should say that they have + * already been homed. + * Note that now that we have an undo pointer concept for the + * purge block code we MUST NOT redo/undo any log records that + * we have said will not be recovered! 
September 2, 1997 + */ + do + { +#if NSS_DEBUG IS_ENABLED + if ( skip > 0 ) + { + if ( DBG_DebugFlag & (TZLOG | TZLOG2) ) + { + logRecordHeader = STRUCT( zlogKey->ZRK_ZfsXasRecovery, + ZLOGRecordHeader_s, ZLRH_Zxr ); + zlog_LogRecordDump( logRecordHeader, MSGNot("Skipped "), LRED ); + } + /* This confusses our code if left on (and for good reason) */ +// logRecordHeader->ZLRH_Zxr.ZXR_TransactionState &= ~X_REDO_THEN_UNDO; + } + ++skip; +#endif + if ( direction == ZLOG_RG_NEXT ) + { + status = zlog_RGNReal( genMsg, zlogKey ); + } + else + { + status = zlog_RGPReal( genMsg, zlogKey ); + } + } while ( (status == zOK) && + (zlogKey->ZRK_ZfsXasRecovery->ZXR_Lsn < zlogKey->ZRK_LsnFirst)); +#if NSS_DEBUG IS_ENABLED + if ( status == zOK ) + { + logRecordHeader = STRUCT( zlogKey->ZRK_ZfsXasRecovery, + ZLOGRecordHeader_s, ZLRH_Zxr ); + if ( DBG_DebugFlag & (TZLOG | TZLOG2) ) + { + zlog_LogRecordDump( logRecordHeader, MSGNot("Recover "), LRED ); + } + /* This confusses our code if left on (and for good reason) */ + logRecordHeader->ZLRH_Zxr.ZXR_TransactionState &= ~X_REDO_THEN_UNDO; + } +#endif + return( status ); + +} /* End of ZLOG_RecoveryGet() */ + +/* + * Does the actual work of most of the ZLOG_RecoveryGet calls. + * + * If zlogKey->ZRK_Buffer != NULL then the log record is assumed to be + * within the buffer. Otherwise, the log block ZRK_LogBufferNumber is + * read and then the log record is retrieved from it. 
+ */ + +STATUS zlog_RecoveryGet( + GeneralMsg_s *genMsg, + ZlogBeast_s *zlogBeast, + ZlogRecoveryKey_s *zlogKey ) + +{ + + int currentRecord; + ZLOGBlockHeader_s *logBlockHeader; + ZLOGRecordHeader_s *logRecordHeader; + + ASSERT_MPKNSS_LOCK(); + zASSERT( zlogBeast != NULL ); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + zASSERT( zlogKey->ZRK_Signature == ZLOG_ZRK_S_SIGNATURE ); + zASSERT( zlogKey->ZRK_Flag == ZLOG_ZRK_F_KEY_VALID ); + + if ( zlogKey->ZRK_Buffer == NULL ) + { /* Read and Verify the current block */ + zlogKey->ZRK_Buffer = zlog_RGReadAndVerifyBlock( genMsg, zlogBeast, + zlogKey->ZRK_LogBufferNumber ); + /* + * Note that a NON-NULL return also obtains an shared latch on + * the buffer for us. We release this latch when we go to + * the next block or at done. + * + */ + if ( zlogKey->ZRK_Buffer == NULL ) + { + RTN_STATUS( zFAILURE ); + } + } + + /*** We have a buffer now go find the correct log record ***/ + + logBlockHeader = (ZLOGBlockHeader_s *)zlogKey->ZRK_Buffer->pBuf.data; + if ( zlogKey->ZRK_LogRecordNumber == -1 ) + { /* -1 means the last record number */ + zlogKey->ZRK_LogRecordNumber = logBlockHeader->ZLBH_NumberOfRecords - 1; + } + zASSERT(zlogKey->ZRK_LogRecordNumber < logBlockHeader->ZLBH_NumberOfRecords); + logRecordHeader = (ZLOGRecordHeader_s *)(logBlockHeader + 1); + for ( currentRecord = 0; currentRecord < zlogKey->ZRK_LogRecordNumber; + currentRecord++ ) + { + /* Point to next log record within block */ + logRecordHeader = (ZLOGRecordHeader_s *)( ((LONG *)logRecordHeader) + + logRecordHeader->ZLRH_LongLength); + zASSERT( ((BYTE *)(logRecordHeader + 1)) <= + (((BYTE *)logBlockHeader) + (1 << zlogBeast->ZFSLOGblkSizeShift)) ); + } + /*** Set up return parameter for user ***/ + zlogKey->ZRK_ZfsXasRecovery = &logRecordHeader->ZLRH_Zxr; + RTN_STATUS(zOK); + +} /* End of 
zlog_RecoveryGet() */ + +/* + * ZLOG_RecoveryInfoGet() + * + * This function must only be used by FTEST. Used so FTEST demo can + * display how many redo(s) and undo(s) occured during recovery. + * + * Returns + * TRUE - Recovery Information filled in. + * FALSE - Volume not found + */ + +NINT ZLOG_RecoveryInfoGet( + unicode_t *volumeName, /* Input - Must be a ZLSS volume */ + ZlogRecoveryInfo_s *info ) /* Output */ + +{ + + Volume_s *volume; + GeneralMsg_s dummyGenMsg; + VolumeID_t retVolumeID; + + ASSERT_MPKNSS_LOCK(); + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + + volume = COMN_VolumeNameLookup( &dummyGenMsg, volumeName, TRUE, + &retVolumeID); + + if ( volume != NULL ) + { + ZfsPool_s *pool; + ZlogBeast_s *zlog; + + if ( !COMN_IsDerivedFrom(volume, zFTYPE_ZLSS_VOL) ) + { /* Not a ZLSS volume so exit */ + COMN_Release( &volume ); + return( FALSE ); + } + pool = ((ZfsVolume_s *)volume)->pool; + zlog = pool->zfsLogBeast; + /* Fill in XACTION related information */ + info->ZRI_Zxs = zlog->ZLB_Ftest; /*** Structure copy ***/ + /* Fill in activation time which is not store in zlog ZLB_Ftest */ + info->ZRI_ActivationTime = zlog->ZLB_P.ZLBP_TimeActivation; + COMN_Release( &volume ); + return( TRUE ); + } + return( FALSE ); + +} + +/* + * void ZLOG_RecoveryOpen( NINT getMode, ZfsPool_s *zfsPool, + * ZlogRecoveryKey_s *zlogRecoveryKey ) + * + * getMode - Location to start log record retrievals from. + * Must be either ZLOG_RECOVERY_BEGIN or + * ZLOG_RECOVERY_END. + * - 'Get' area to do retrival from. Must be either + * ZLOG_RECOVERY_NORMAL or ZLOG_RECOVERY_EXPANDED. + * zfsPool Pointer to ZFS pool that the log file belongs to. + * zlogRecoveryKey Pointer to key information. The internal items + * in this structure are initialized by this call. + * + * This function opens the ZLOG file for recovery record retrieval. The + * define ZLOG_RECOVERY_BEGIN indicates that retrieval will start at the + * beginning of the recovery area. 
The define ZLOG_RECOVERY_END indicates + * that retrieval will start at the end of the recovery area. + * + * For every call to ZLOG_RecoveryOpen() there must be a call to + * ZLOG_RecoveryClose() with the same key. Before ZLOG_RecoveryOpen() can + * be called with a used key the key must be closed with a call to + * ZLOG_RecoveryClose(). + */ + +void ZLOG_RecoveryOpen( + NINT getMode, + ZfsPool_s *zfsPool, + ZlogRecoveryKey_s *zlogKey ) + +{ + + ZlogBeast_s *zlogBeast; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, ZLOG_RecoveryOpen); + /* Make sure legal values */ + zASSERT( zfsPool != NULL ); + zlogBeast = zfsPool->zfsLogBeast; + zASSERT( zlogBeast != NULL ); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + +#if NSS_ASSERT IS_ENABLED + zlogKey->ZRK_Flag = ZLOG_ZRK_F_KEY_VALID; + zlogKey->ZRK_Signature = ZLOG_ZRK_S_SIGNATURE; +#endif +#if ZLOG_DEBUG IS_ENABLED + zlogKey->ZRK_ZfsXasRecovery = NULL; +#endif + zlogKey->ZRK_Buffer = NULL; + zlogKey->ZRK_ZlogBeast = zlogBeast; + if ( getMode & ZLOG_RECOVERY_EXPANDED ) + { + NINT checkpointIndex; + SNINT count; + + checkpointIndex = zlog_GetNewestCheckpointIndex( zlogBeast ); + /* The number of checkpoints we go back is related + * to the number of checkpoints that are bad. + * + * Bad Checkpoints Previous checkpoint to use + * 0 2 + * 1 1 + * 2 0 (newest/current) + * 3+ 0 (newest/current) + * + * This algorithm is needed so that we NEVER include log + * blocks that have been promised not to be recovered. This + * is required because the purge block code is told when a + * purged block is guaranteed not to appear in any recovery + * area. Currently the purge code is told this when ALL + * checkpoints are past the purged block. This is in spite + * of the fact we generally only will go back to the 3rd oldest + * checkpoint when doing recovery. 
Currently, the only exception + * to this is when we there are 3+ bad checkpoints. + */ + zASSERT( zfsPool->ZP_BadCheckpoints >= 0 ); + for ( count = 0; count < (2 - zfsPool->ZP_BadCheckpoints); ++count ) + { + CHECKPOINT_GET_PREVIOUS( &checkpointIndex ); + } + zASSERT( checkpointIndex >= 0 ); + zASSERT( checkpointIndex < CHECKPOINT_NUMBER ); + zlogKey->ZRK_LsnFirst = + zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[checkpointIndex]; + zlogKey->ZRK_BlockFirst = + zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[checkpointIndex]; + zlogKey->ZRK_BlockLast = + zlogBeast->ZLB_RecoveryActivePointerBlockNumber; + } + else if ( getMode & ZLOG_RECOVERY_LOGICAL ) + { + zlogKey->ZRK_BlockFirst = zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum; + zlogKey->ZRK_LsnFirst = zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn; + zlogKey->ZRK_BlockLast = zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum; + } + else + { + zlogKey->ZRK_BlockFirst = zlogBeast->ZLB_RecoveryAreaFirstBlockNumber; + zlogKey->ZRK_LsnFirst = zlogBeast->ZLB_P.ZLBP_HomePointerLsn; + zlogKey->ZRK_BlockLast = + zlogBeast->ZLB_RecoveryActivePointerBlockNumber; + } + if ( getMode & ZLOG_RECOVERY_BEGIN ) + { + zlogKey->ZRK_LogBufferNumber = zlogKey->ZRK_BlockFirst; + zlogKey->ZRK_LogRecordNumber = 0; + } + else + { /* + * The last block is the first block that is past + * the recovery area. To get the last block within the area get + * the previous block. + */ + zlogKey->ZRK_LogBufferNumber = zlogKey->ZRK_BlockLast; + ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, zlogKey->ZRK_LogBufferNumber ); + /* A -1 as a record number means the last record */ + zlogKey->ZRK_LogRecordNumber = -1; + } +#if NSS_ASSERT IS_ENABLED + /* Track number of times ZLOG_RecoveryClose should be called */ + ++ZLOGRecoveryGetCount; +#endif + RTN_VOID(); + +} /* End of ZLOG_RecoveryOpen() */ + + + +/* Synchronous Thread + * + * ZLOGRecoveryPhase - This is ZLOG's recovery phase main routine. The + * ZLOG specification has a section on the recovery phase. 
+ * + * Notes - + * Persistently, the following blocks have been read in BEFORE + * this routine is called. + * + * Superblock Header + * LVDB(Internal Volume) + * LPDB + * VDB(Internal Volume) + * PDB + * + * The first THREE of these can be logged. This is VERY IMPORTANT as + * this means that the REDO/UNDO code may change them persistently from + * want we have in memory. The superblock header REDO/UNDO updates the memory + * copy while this routine handles the LVDB and LPDB. + * + * One must be VERY CAREFUL with how the contents of the first three + * blocks have been used up to this point because we may be changing + * their contents here. For example, we could be changing the media + * version and the beast tree special block number. + */ + +STATUS ZLOG_RecoveryPhase( + GeneralMsg_s *genMsg, + ZlogBeast_s *zlogBeast, + NINT mode) +{ + + STATUS status; + NINT oldestIndex; + NINT checkpoint; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, ZLOG_RecoveryPhase); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + + /* Serialize recovery code */ + X_LATCH( &zlogBeast->ZLB_RecoveryLatch ); + /* Serialize ZLOG system */ + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + /* This implies we will allow several recovery attempts */ + zASSERT( zlogBeast->ZLB_DebugState <= ZLOG_ZB_DS_RECOVERY_END ); + zASSERT( zlogBeast->ZLB_DebugState >= ZLOG_ZB_DS_CONSTRUCT_END ); + + /* + * Mark that we made it to recovery - we waited until + * we got our exclusive access to the ZLOG system. + */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_START; + + /* Set here as called on every recovery attempt */ + zlogBeast->ZLB_RedoUndoStatus = zOK; + zlogBeast->ZLB_RedoUndoStatusSetter = WHERE; + /* Update number of times we have been called */ + zlogBeast->ZLB_P.ZLBP_RecoveryActivationCount += 1; + /* + * Initialize Recovery's active pointer. 
+ */ + zlogBeast->ZLB_RecoveryActivePointerBlockNumber = + zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber; + zlogBeast->ZLB_RecoveryActivePointerLsn = + zlogBeast->ZLB_P.ZLBP_HomePointerLsn; + /* + * Calculate the pre oldest home pointer. + */ + oldestIndex = zlog_GetOldestCheckpointIndex( zlogBeast ); + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber = + zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[ oldestIndex ]; + ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber ); + ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber ); + +#ifndef __linux__ // LINUX_ZlogDebug +#if NSS_DEBUG IS_ENABLED + if (GCL_ZlogFileCopy[0]) + { + zlog_CopyLogFileToVolume( genMsg, zlogBeast,GCL_ZlogFileCopy, 1 ); + GCL_ZlogFileCopy[0] = '\0'; + } + if (GCL_ZlogBeastCopy[0]) + { + zlog_CopyLogFileToVolume( genMsg, zlogBeast,GCL_ZlogBeastCopy, 0); + GCL_ZlogBeastCopy[0] = '\0'; + } + if ( PoolSave[0] ) + { + zlog_PoolSave( genMsg, zlogBeast->ZFSLOGroot.vol.zfsVol->pool ); + PoolSave[0] = 0; + } + if ( PoolCompare[0] ) + { + zlog_PoolCompare( genMsg, zlogBeast->ZFSLOGroot.vol.zfsVol->pool ); + PoolCompare[0] = 0; + } + if ( PoolRestoreImage[0] ) + { + zlog_PoolRestoreImage( genMsg, zlogBeast->ZFSLOGroot.vol.zfsVol->pool ); + /* We bomb here so user has to start over. Too many things + have been initialized already. */ + aprintf(LRED,MSGNot("Activation is being prevented because of switch /PoolRestoreImage.\n")); + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_ERROR; + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + UNX_LATCH( &zlogBeast->ZLB_RecoveryLatch ); + PoolRestoreImage[0] = 0; + RTN_STATUS( zFAILURE ); + } +#endif +#endif + /* + * Persistently store the current time. This time will also be + * used is we have to do recovery. 
+ */ + zlogBeast->ZLB_P.ZLBP_TimeActivation = GetUTCTime(); + /* Recover the active pointer - synchronously */ + status = zlog_RecoverActivePointer( genMsg, zlogBeast ); + /* Did we get an error? */ + if ( status != zOK) + { /* YES - We were unable to update the active pointer */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_ERROR; + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + UNX_LATCH( &zlogBeast->ZLB_RecoveryLatch ); + RTN_STATUS( status ); + } + + /* + * We have just crossed a very intersting point in our lives. + * Before this point transaction logging is illegal, now + * we must allow it so components doing recovery can + * log changes. + */ + + /* We start recovery at the home pointer */ + zlogBeast->ZLB_RecoveryAreaFirstBlockNumber = + zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber; + /* Are we already at the first block not to recover? */ + if ( (zlogBeast->ZLB_RecoveryActivePointerLsn == + zlogBeast->ZLB_P.ZLBP_HomePointerLsn ) +#if NSS_DEBUG IS_ENABLED + || (SkipRecovery) +#endif + ) + { /* + * Yes - No blocks to recover. Should be normal code path + * if ZSTORE was shutdown properly. Note that we can get a + * checksum error on first block when looking for current + * active pointer that would make us think we did a clean + * shutdown when we really did not. + */ + if ( zlogBeast->ZLB_RecoveryActivePointerLsn == + zlogBeast->ZLB_P.ZLBP_HomePointerLsn ) + { + if ( mode & VOLMODE_VERBOSE ) aprintf(NSS_POOL_COLOR,MSG(" ** Previous clean shutdown detected " + "(consistency check OK)\n", 341)); + DEBUG_PRINTF(TZLOG, DBG_NOINDENT, (TZLOG_COLOR, MSGNot("Previous clean shutdown detected (ZLOG recovery not needed).\n")) ); + /* + * Start checkpoint timer here, because we will not be + * taking any checkpoints in this code path. Taking a + * checkpoint is the other way this routine starts the + * checkpoint timer. 
+ */ + setOneShot(&zlogBeast->ZLB_CheckpointTakeWorkToDoTimer, + ZstoreConfig.tick.chkpt, ZFSPOOL_CheckpointTimerPop); + status = zOK; +#if NSS_DEBUG IS_ENABLED + if (SkipRecovery) + { + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + status = zFAILURE; + } +#endif + } + else + { + aprintf(LRED,MSG(" ** Skipping ZLOG recovery per your command.\n", 871)); + DEBUG_PRINTF(TZLOG, DBG_NOINDENT, (TZLOG_COLOR, MSGNot("Skipping ZLOG recovery per your command.\n")) ); + SetErrno( genMsg, zERR_NOT_SUPPORTED ); + status = zFAILURE; + } + /* + * We do not have to do a checkpoint because the current + * checkpoint is good enough (excluding possible checksum + * error). + */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_END; + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + UNX_LATCH( &zlogBeast->ZLB_RecoveryLatch ); + /* + * Update the ZLOG file version if recovery is not needed. + * Required when the file version changes after a clean + * shutdown. + */ + zlogBeast->ZLB_P.ZLBP_FileVersion = ZLOG_ZLBP_FV_VERSION; + RTN_STATUS( status ); + } + + /*** Recovery is needed ***/ + if ( mode & VOLMODE_VERBOSE ) aprintf(NSS_POOL_COLOR,MSG(" ** Processing journal\n", 872)); + + /* See if the ZLOG File format is what we understand? */ + if ( zlogBeast->ZLB_P.ZLBP_FileVersion != ZLOG_ZLBP_FV_VERSION ) + { /* Opps - we do not understand ZLOG File Format */ +#if NSS_DEBUG IS_ENABLED + aprintf(LRED,MSGNot(" ** ZLOG file format has changed. 
Please /reset your pool or\n")); + aprintf(LRED,MSGNot(" ** Do a clean shutdown with your previous NSS.NLM\n")); + aprintf(LRED,MSGNot(" ** The system tends to get mad when Recovery returns an error so hang on ...\n")); +#endif + SetErrno( genMsg, zERR_ZLOG_UNSUPPORTED_FILE_VERSION ); + RTN_STATUS( zFAILURE ); + } + + DEBUG_PRINTF(TZLOG, DBG_NOINDENT, (TZLOG_COLOR, MSGNot("Recovery started.\n")) ); + +// if (zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn == 0) +// { +// zlogBeast->ProcessCompensationRecords = FALSE; +// } +// else +// { +// zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn != 0); +// zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn != 0); +// zlogBeast->ProcessCompensationRecords = TRUE; +// } + + zlogBeast->ZLB_RecoveryInitialActivePointerLsn = + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn; + zlogBeast->ZLB_RecoveryInitialActivePointerBlockNumber = + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + + /* Update number of time we had to do recovery */ + zlogBeast->ZLB_P.ZLBP_RecoveryRecoverCount += 1; + zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeStart = + zlogBeast->ZLB_P.ZLBP_TimeActivation; + + zlogBeast->ZLB_P.ZLBP_LastRecoveryActivationCount = + zlogBeast->ZLB_P.ZLBP_RecoveryActivationCount; + zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL ); + /* Call Transaction Recovery Engine */ + if (zlogBeast->ZFSLOGroot.vol.zfsVol->pool->ZP_super-> + SB_Header.hdr.SBH_VersionMediaMajor <= AIPU_1ST_6PACK_MEDIA_VERSION) + { + status = XactionRecover( genMsg, + zlogBeast->ZFSLOGroot.vol.zfsVol->pool, mode ); + } + else + { + status = XactionRecoverLogicalUndoPass1To6( genMsg, + zlogBeast->ZFSLOGroot.vol.zfsVol->pool, mode ); + } + +#ifdef USER_GPACHNER +// status = zFAILURE; +// SetErrno( genMsg, 23456 ); +#endif + + if ( status == zOK) + { /* + * Update home pointer using active pointer. The active pointer + * was updated by our recover active pointer code. I.E. the + * call to zlog_RecoverActivePointer() at the start of + * this routine. 
The active pointer has had our magic very big + * number added to it. + */ + if (zlogBeast->ZLB_State & ZLOG_ZB_S_NEEDTODO_COMPENSATION) + { + zASSERT(zlogBeast->ZLB_P.ZLBP_ActivePointerLsn != + zlogBeast->ZLB_RecoveryInitialActivePointerLsn); + + /* We did at least one logical undo copy + * Before taking a checkpoint set: + * LogicalUndoPtr (LUP) = IAP + * CompensationPtr (COMP) = AP + * Home Pointer = LogicalUndoPtr (LUP) + */ + zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn = + zlogBeast->ZLB_RecoveryInitialActivePointerLsn; + zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum = + zlogBeast->ZLB_RecoveryInitialActivePointerBlockNumber; + + /* The COMP pointer block number is set to be 1 greater than the + * ACT pointer because we are now going to flush the ZLOG file, + * and the block that the active pointer currently points to will + * get flushed, and the next time somebody does an obtain record, + * the record will be in the next block. We want COMP to point to + * this next block + */ + zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn = + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn; + zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum = + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + ZLOG_NEXT_POOL_LOG_BLOCK(zlogBeast, + zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum); + + zlogBeast->ZLB_P.ZLBP_HomePointerLsn = + zlogBeast->ZLB_RecoveryInitialActivePointerLsn; + zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber = + zlogBeast->ZLB_RecoveryInitialActivePointerBlockNumber; + + /* This checkpoint taken by ZFSPOOL_CheckpointTake waits until + * the ZLOG blocks to which the + * logical undo records are written are flushed. + * It does a defaultFlushWait on the zlogbeast. + * This will cause the dirty buffers of the zlog file to be written + * including the blocks to which we just copied the new logical undo + * records. It then waits for the buffers to be flushed. 
+ */ + } + else + { + zASSERT(zlogBeast->ZLB_P.ZLBP_ActivePointerLsn == + zlogBeast->ZLB_RecoveryInitialActivePointerLsn); + + zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn == 0); + zASSERT(zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum == 0); + zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn == 0); + zASSERT(zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum == 0); + + + /* We set HOME = ACT only if there were no Logical Undo's. + * If we had logical undo records/compensation records the + * the ZLOG_ZB_S_NEEDTODO_COMPENSATION will be set in the + * the ZLB_State + */ + zlogBeast->ZLB_P.ZLBP_HomePointerLsn = + zlogBeast->ZLB_P.ZLBP_ActivePointerLsn; + zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber = + zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + } + zASSERT( zlogBeast->ZFSLOGroot.vol.zfsVol->pool != NULL ); + /* + * Take a clean checkpoint. This puts a checkpoint with a + * home pointer that will prevent us from recovery buffers + * that we have already recovered. If we crash before the + * checkpoint is written we will have to recover the items + * we just recovered. This is a clean checkpoint because + * the transaction system flushes all buffers before returning + * to us. This is a requirement. + * + * We take four checkpoints here so that if the crash occurred + * at ZLOG File Full Condition we will move past the condition. + * This is required because ZLOG_ObtainRecord() does not handle + * a file full condition on the VERY FIRST call to it. + * + * The purge block code also requires us to take four checkpoints. + * This moves our 'undo pointer' far enough that the purge + * code can release on the purge blocks in its log into the + * Free B-Tree. The purge log started doing this September 1997. + * + */ + +#if NSS_DEBUG IS_ENABLED + /* Need so obtain record will not ASSERT */ + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_END; +#endif + /* Can not own ZLOG latch when taking a checkpoint. 
*/ + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + for ( checkpoint=0; checkpoint < CHECKPOINT_NUMBER; ++checkpoint ) + { + STATUS tempStatus; + + tempStatus = ZFSPOOL_CheckpointTake( genMsg, + zlogBeast->ZFSLOGroot.vol.zfsVol->pool, + CHECKPOINT_CT_S_CLEAN|CHECKPOINT_CT_S_RECOVERY); + if ( (tempStatus != zOK) && (status ==zOK) ) + { /* Report first error only, but try to write rest + * of checkpoints. Zrepair will have to make it so + * we can write the checkpoint before we will ever + * succeed at recovery. Note that if user tries + * to re-activate the pool it will succeed as long + * as one checkpoint got written. This is caused + * by the fact that we are happy if we can read one + * checkpoint. And since it is a clean checkpoint + * recovery will not be needed. Therefore we will + * not try to write the checkpoint we got an error + * on. + */ + status = tempStatus; + } + } + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + + /* We now re-load the volumes persistent data because REDO/UNDO + * may have updated the persistent copy. The same is now + * true with the pool's persistent data. + */ + if ( status == zOK ) + { + status = ZFSVOL_ReadPersistentVolumeData( genMsg, + zlogBeast->ZFSLOGroot.vol.zfsVol ); + } + if ( status == zOK ) + { /* Pool blocks do not exist until media 40.03 so skip + * reading pool blocks until then. 
+ */ + + ZfsPool_s *zfsPool; + Pool_s *pool; + ZlssPool_s *zlssPool; + LONG version; + + zfsPool = (ZfsPool_s *)zlogBeast->ZFSLOGroot.vol.zfsVol; + pool = zfsPool->ZFSPOOLvol.v_pool; + zlssPool = (ZlssPool_s *)pool; + + zASSERT( zlssPool != NULL ); + zASSERT( zfsPool->ZP_super != 0 ); + + version = (zfsPool->ZP_super->SB_Header.hdr.SBH_VersionMediaMajor * 0x100L) + + zfsPool->ZP_super->SB_Header.hdr.SBH_VersionMediaMinor; + + if ( version >= (AIPU_LV_MEDIA_MAJOR*0x100L+AIPU_LV_STEP_PDB_DONE) ) + { + status = ZLSSPOOL_ReadPersistentPoolData( genMsg, zlssPool ); + } + } + } + else + { + zlogBeast->ZLB_RedoUndoStatus = GetErrno( genMsg ); + zlogBeast->ZLB_RedoUndoStatusSetter = GetErrnoSetter( genMsg ); +#if NSS_DEBUG IS_ENABLED + DBG_DebugPrintf( LRED, MSGNot("REDU/UNDO error %ld set at %s\n"), + zlogBeast->ZLB_RedoUndoStatus,zlogBeast->ZLB_RedoUndoStatusSetter ); + zASSERT("REDO/UNDO has failed"==NULL); +#endif +// GeneralMsg_s dummyGenMsg; +// +// (void)ZFSVOL_ReadPersistentVolumeData( &dummyGenMsg, +// zlogBeast->ZFSLOGroot.vol.zfsVol ); + } + + /* Store off the recovery end time */ + zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeEnd = GetUTCTime(); + /* We round up the recovery time */ + ZLOG_HISTOGRAM_EVENT( &zlogBeast->ZLB_RecoveryTimeHistogram, + (QUAD)(zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeEnd - + zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeStart + 1u) ); + ZLOG_HISTORY_EVENT( &zlogBeast->ZLB_RecoveryTimeHistory, + (QUAD)(zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeEnd - + zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeStart + 1u) ); + /* Log time_t of recovery start (we do start as part + * of a log is the event time which will be very close + * to the recovery end time). + */ + ZLOG_HISTORY_EVENT( (ZlogHistory_s *)&zlogBeast->ZLB_History[ZH16_RECOVERY_UTC], + (QUAD)zlogBeast->ZLB_P.ZLBP_LastRecoveryTimeStart ); +#if 0 + /* Store off error code */ + zlogBeast->ZLB_P.ZLBP_LastRecoveryStatus = status; +#endif + /* Did we get an error? 
*/ + if ( status != zOK) + { /* YES - We were unable to process log file */ + errPrintf(WHERE, Module, 1456, + MSG("** System verification failed.\n" + "You may have to rebuild your volume. If this happens again,\n" + "contact your Novell Technical Support Provider.", 873)); + + DEBUG_PRINTF(TZLOG, DBG_NOINDENT, (TZLOG_COLOR, MSGNot("Recovery complete - failure.\n")) ); + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_ERROR; + } + else + { /* NO */ + if ( mode & VOLMODE_VERBOSE ) aprintf(LGREEN,MSG(" ** System verification completed\n", 874)); + DEBUG_PRINTF(TZLOG, DBG_NOINDENT, (TZLOG_COLOR, MSGNot("Recovery complete - success.\n")) ); + zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_RECOVERY_END; + } + + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + UNX_LATCH( &zlogBeast->ZLB_RecoveryLatch ); + + RTN_STATUS(status); + +} /* End of ZLOG_RecoveryPhase */ + + +/* + * + * void ZLOG_ReleaseRecord( ZfsXaction_s *zlogXaction ) + * + * Informs ZLOG logging subsystem that the caller has completed the following + * requirements. + * + * Filled in the data portion of the pool log record. + * Created all bonds required by the Transaction System. + * + * ZLOG uses this call to determine when a log block can be written to the + * log file. The caller must ensure that the time between ZLOG_ObtainRecord() + * and ZLOG_ReleaseRecord() is only used to perform the two requirements above. 
+ *
+ */
+
+
+
+Time_t ZLOG_RecoveryTimeGet(
+	ZlogRecoveryKey_s *zlogKey )
+
+{
+	/*
+	 * Returns the ZLBH_TimeStart field of the log block header found in
+	 * the buffer currently held by the recovery key.  The key must
+	 * already hold a buffer (asserted below).
+	 */
+	ZlogBeast_s *zlogBeast;
+	ZLOGBlockHeader_s *logBlockHeader;
+
+	ASSERT_MPKNSS_LOCK();
+	zlogBeast = zlogKey->ZRK_ZlogBeast;
+	zASSERT( zlogBeast != NULL );
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+
+	zASSERT( zlogKey->ZRK_Buffer != NULL );
+	logBlockHeader = (ZLOGBlockHeader_s *)zlogKey->ZRK_Buffer->pBuf.data;
+	return( logBlockHeader->ZLBH_TimeStart );
+
+}
+
+/*
+ * ZLOG_ReleaseRecord - Finish the log record belonging to 'zlogXaction'
+ * and drop the latches held on the ZLOG beast.
+ *
+ * The caller must own the beast latch and ZLB_ZfsXasRecovery must be
+ * non-NULL (both are asserted).  In order, the routine:
+ *	- records the record's function index in the function histogram and
+ *	  adds its pool block count to the active pointer reference block
+ *	  count,
+ *	- ORs the transaction's xstate into the record's transaction state,
+ *	- zeroes the record checksum and, when ZLOG_CHECKSUM_LOG_RECORDS is
+ *	  enabled, stores 0 minus the checksum of the record,
+ *	- releases the barrier agent latch, the log buffer latch and the
+ *	  beast latch, in that order,
+ *	- when AGENT_WAIT_FOR_COMMIT is set on the transaction's agent, clears
+ *	  the flag, binds a local agent to the barrier agent before the
+ *	  latches are released and calls defaultFlushWait() on it afterwards.
+ *
+ * The log record must not be touched once the latches have been released.
+ */
+void ZLOG_ReleaseRecord(
+	ZfsXaction_s *zlogXaction )
+
+{
+
+	ZlogBeast_s *zlogBeast;
+	ZLOGRecordHeader_s *logRecordHeader;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, ZLOG_ReleaseRecord);
+	/* Get the ZLOG beast that this transaction is being done on */
+	zlogBeast = zlogXaction->ZX_zlogBeast;
+	zASSERT( zlogBeast != NULL );
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+
+	/* Caller must own beastLatch (via ZLOG_ObtainRecord) */
+	ASSERT_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+	/* Must have called ZLOG_ObtainRecord which sets to non-NULL */
+	zASSERT( zlogBeast->ZLB_ZfsXasRecovery != NULL );
+	/*
+	 * The ZLOG record header contains a ZfsXasRecovery_s structure,
+	 * therefore we can obtain the ZLOG record header from the stored
+	 * address of the ZfsXasRecovery_s structure.
+	 */
+	logRecordHeader = STRUCT( zlogBeast->ZLB_ZfsXasRecovery,
+						ZLOGRecordHeader_s, ZLRH_Zxr );
+#if NSS_ASSERT IS_ENABLED
+	{	/*
+		 * Walk the ZLBH_NumberOfUnusedLongs LONGs at the very end of
+		 * the log block and assert that each still holds 0x44444444.
+		 */
+		NINT len;
+		ZLOGBlockHeader_s *logBlockHeader;
+		LONG *checkArea;
+
+		logBlockHeader = (ZLOGBlockHeader_s *)zlogBeast->ZLB_Buffer->pBuf.data;
+		len = logBlockHeader->ZLBH_NumberOfUnusedLongs;
+		checkArea = ((LONG *)logBlockHeader) +
+				(( (1 << zlogBeast->ZFSLOGblkSizeShift))/4) - len;
+		for( ; len > 0; --len )
+		{
+			if ( *checkArea != 0x44444444uL )
+			{
+				zASSERT("Someone is writing past log record end" == NULL);
+				break;
+			}
+			checkArea += 1;
+		}
+	}
+#endif
+#if NSS_ASSERT IS_ENABLED
+	/*
+	 * We ZAP so if caller uses after release they may crash.  Note
+	 * that if they use between someone else's ZLOG_ObtainRecord() and
+	 * ZLOG_ReleaseRecord THEY will not be caught.  Note we must zero
+	 * out if ASSERTs are in our code.
+	 */
+	zlogBeast->ZLB_ZfsXasRecovery = NULL;
+#endif
+#if ZLOG_DEBUG IS_ENABLED
+	/*
+	 * Callers must initialize after return from ZLOG_ObtainRecord()
+	 * and before call to this function.
+	 */
+	zASSERT( logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex != 0xBEEFBEEFuL );
+	zASSERT( logRecordHeader->ZLRH_Zxr.ZXR_PoolBlockCount != 0xBEEFu );
+	/* Verify index is within range */
+	zASSERT( logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex < XFUNC_MAX );
+#endif
+	ZLOG_HISTOGRAM_EVENT( &zlogBeast->ZLB_FunctionHistogram,
+			logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex );
+	zlogBeast->ZLB_P.ZLBP_ActivePointerReferenceBlockCount +=
+			logRecordHeader->ZLRH_Zxr.ZXR_PoolBlockCount;
+	/*
+	 * OR in current states so we do not lose states added in obtain.
+	 * Note this OR adds in extra debug information.
+	 */
+	logRecordHeader->ZLRH_Zxr.ZXR_TransactionState |= zlogXaction->xstate;
+#if ZLOG_DEBUG IS_ENABLED
+	/** Fill in end time **/
+	logRecordHeader->ZLRH_EndTime = (LONG)microSecondTimer();
+#endif
+
+#if LOG_TEST IS_ENABLED
+	if ( LogTest && (logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex >= xfirst) &&
+			(logRecordHeader->ZLRH_Zxr.ZXR_FunctionIndex <= xlast) )
+	{	/*
+		 * Log self test: snapshot every pool block buffer of the
+		 * transaction, run the record's redo and undo handlers (in
+		 * the order selected by X_REDO_THEN_UNDO), then hand each
+		 * snapshot and the live buffer to the transaction's compare
+		 * routine.
+		 */
+		XRecovery_f func;
+		GeneralMsg_s genMsg;
+		ZfsPool_s *zfsPool;
+		ZfsXasRecovery_s *logRec;
+		STATUS status;
+		BlockInfo_s *blkInfo;
+		NINT i;
+		BYTE oldState;
+
+		COMN_SETUP_GENERAL_MSG_NOSA( &genMsg );
+		zfsPool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool;
+		zASSERT( zfsPool != NULL );
+		logRec = &logRecordHeader->ZLRH_Zxr;
+
+		blkInfo = ZLOG_START_OF_POOL_BLOCKS(logRec);
+		zASSERT( logRec->ZXR_PoolBlockCount <= LOG_TEST_MAX );
+		for (i = 0; i < logRec->ZXR_PoolBlockCount; ++i)
+		{
+			if ((blkInfo[i].blkNum != 0) && (zlogXaction->ZX_Buffers[i] != 0))
+			{
+				blkInfo[i].replayState = 0;
+				/* Turn on bit that some routines look at to determine
+				   if we should really toss a buffer, read a buffer, ... */
+				zlogXaction->ZX_Buffers[i]->state |= CACHE_LOG_TEST;
+				zASSERT( zlogBeast->ZLB_LT[i].LT_Copy != NULL );
+				zASSERT( zlogXaction->ZX_Buffers[i]->pBuf.data != NULL );
+				ASSERT_LATCH( &zlogXaction->ZX_Buffers[i]->agent.latch );
+				/* Copy buffer address into ZLOG Beast for
+				   ZFS_ReadPoolBlk(). */
+				zlogBeast->ZLB_LT[i].LT_Buffer = zlogXaction->ZX_Buffers[i];
+				/* Make a copy of meta DATA buffer */
+				memcpy( zlogBeast->ZLB_LT[i].LT_Copy /*dest*/,
+					zlogXaction->ZX_Buffers[i]->pBuf.data /* Src */,
+					1 << zlogBeast->ZFSLOGblkSizeShift /* size */ );
+//				zlogBeast->ZLB_LT[i].LT_Buffer = blkInfo[i].buffer;
+
+			}
+		}
+		oldState = zlogBeast->ZLB_DebugState;
+		zlogBeast->ZLB_DebugState = ZLOG_ZB_DS_LOG_TEST;
+		if ( zlogXaction->xstate & X_REDO_THEN_UNDO )
+		{
+			/* Do REDO */
+			func = XRecoveryTable[ logRec->ZXR_FunctionIndex].redo;
+			status = func(&genMsg, zfsPool, logRec, X_REDO);
+			zASSERT( status == zOK );
+			/* Do UNDO */
+			func = XRecoveryTable[ logRec->ZXR_FunctionIndex].undo;
+			status = func(&genMsg, zfsPool, logRec, X_UNDO);
+			zASSERT( status == zOK );
+		}
+		else
+		{	/* Default order: UNDO first, then REDO */
+			func = XRecoveryTable[ logRec->ZXR_FunctionIndex].undo;
+			status = func(&genMsg, zfsPool, logRec, X_UNDO);
+			zASSERT( status == zOK );
+			func = XRecoveryTable[ logRec->ZXR_FunctionIndex].redo;
+			status = func(&genMsg, zfsPool, logRec, X_REDO);
+			zASSERT( status == zOK );
+		}
+		zlogBeast->ZLB_DebugState = oldState;
+
+		for (i = 0; i < logRec->ZXR_PoolBlockCount; ++i)
+		{
+			if ((blkInfo[i].blkNum != 0) && (zlogXaction->ZX_Buffers[i] != 0))
+			{	/* Put things back to normal */
+
+				void *src1;
+				void *src2;
+				NINT len;
+
+				/* Verify undo/redo operations */
+				src1 = zlogBeast->ZLB_LT[i].LT_Copy;
+				zASSERT( src1 != NULL );
+				src2 = (LONG *)zlogXaction->ZX_Buffers[i]->pBuf.data;
+				zASSERT( src2 != NULL );
+				len = 1 << zlogBeast->ZFSLOGblkSizeShift;
+				/* The compare routine's result is deliberately ignored */
+				(void)zlogXaction->ZX_Compare[i]( src1, src2, len );
+				/* Put things back to normal */
+				blkInfo[i].replayState = 0;
+				zlogXaction->ZX_Buffers[i]->state &= ~CACHE_LOG_TEST;
+				zlogBeast->ZLB_LT[i].LT_Buffer = NULL;
+			}
+		}
+
+	}
+	/*
+	 * Turn off the bit that causes several routines to act special
+	 * so that our undo/redo tests can work.
+	 */
+	zlogXaction->xstate &= ~X_REDO_THEN_UNDO;
+#endif
+	/*
+	 * Calculate checksum on all bytes in log record.  This includes
+	 * the bytes that are used to align us to the proper boundary.  This
+	 * is done because ZLOG does not track how many of these bytes
+	 * are present in each log record.  In addition, our checksum
+	 * routine works on LONGs so as to be fast.
+	 */
+	logRecordHeader->ZLRH_Checksum = 0;
+	/*
+	 * The checksum we store is the value that makes a good
+	 * block's checksum equal to zero.
+	 */
+#if ZLOG_CHECKSUM_LOG_RECORDS IS_ENABLED
+	logRecordHeader->ZLRH_Checksum = 0 - zlog_CalculateChecksum(
+					(LONG *)logRecordHeader,
+					(logRecordHeader->ZLRH_LongLength) );
+#endif
+
+#if NSS_DEBUG IS_ENABLED
+	if ( DBG_DebugFlag & (TZLOG | TZLOG2) )
+	{
+		zlog_LogRecordDump( logRecordHeader, MSGNot("Release "), GREEN );
+	}
+#endif
+
+	if (zlogXaction->xaction.agent.state & AGENT_WAIT_FOR_COMMIT)
+	{
+		Agent_s waitAgent;
+
+		zlogXaction->xaction.agent.state &= ~AGENT_WAIT_FOR_COMMIT;
+
+		initAgent(&waitAgent, NULL, "Commit Agent");
+		bind(&waitAgent, &zlogBeast->ZLB_Barrier->ZB_Agent);
+
+		UNX_LATCH( &zlogBeast->ZLB_Barrier->ZB_Agent.latch );
+		/** Release the buffer (this allows it to be flushed) **/
+		CACHE_UNXLATCH( zlogBeast->ZLB_Buffer );
+		/** Release our exclusive latch over the ZLOG system. **/
+		UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+		/** No more playing with the log record because Buffer_s that it
+			is in can now be released by other ZLOG code. **/
+
+		/* Since UNX_LATCH are non blocking calls we know nobody can get the
+		 * ZLB_Buffer before we yield. So I can call defaultFlushWait here
+		 * on ZLB_Buffer if I want to wait for the xaction to commit
+		 */
+		defaultFlushWait(&waitAgent);
+	}
+	else
+	{
+
+		UNX_LATCH( &zlogBeast->ZLB_Barrier->ZB_Agent.latch );
+		/** Release the buffer (this allows it to be flushed) **/
+		CACHE_UNXLATCH( zlogBeast->ZLB_Buffer );
+		/** Release our exclusive latch over the ZLOG system. **/
+		UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+		/** No more playing with the log record because Buffer_s that it
+			is in can now be released by other ZLOG code. **/
+	}
+	RTN_VOID();
+
+}	/* End of ZLOG_ReleaseRecord() */
+
+/*
+	zlog_RGNReal - Does the high level work for a 'get next'.
+
+	See comments for ZLOG_RecoveryGet() above for details.
+
+	With no buffer in the key this acts as a 'get first'.  Otherwise the
+	record number is advanced and, once it equals the block's record
+	count, the buffer is released and the key moves to the next log
+	block.  zERR_ZLOG_NO_MORE_RECORDS is set and zFAILURE handed to
+	RTN_STATUS when ZRK_LogBufferNumber equals ZRK_BlockLast; otherwise
+	the status of zlog_RecoveryGet() is passed on.
+
+*/
+
+STATUS zlog_RGNReal(
+	GeneralMsg_s *genMsg,
+	ZlogRecoveryKey_s *zlogKey )
+
+{
+
+	ZlogBeast_s *zlogBeast;
+	ZLOGBlockHeader_s *logBlockHeader;
+	STATUS status;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, ZLOG_RecoveryGetNext);
+	zlogBeast = zlogKey->ZRK_ZlogBeast;
+	zASSERT( zlogBeast != NULL );
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+
+	if ( zlogKey->ZRK_Buffer == NULL )
+	{	/* We have no buffer so this is really a Get First */
+		/* Are we at the end of the recovery area? */
+		if ( zlogKey->ZRK_LogBufferNumber == zlogKey->ZRK_BlockLast )
+		{	/* Yes - then return our special error code */
+			SetErrno( genMsg, zERR_ZLOG_NO_MORE_RECORDS );
+			RTN_STATUS(zFAILURE);
+		}
+		status = zlog_RecoveryGet( genMsg, zlogBeast, zlogKey );
+		RTN_STATUS(status);
+	}
+	else
+	{	/* This is the true get next */
+		zlogKey->ZRK_LogRecordNumber++;
+	}
+
+	zASSERT( zlogKey->ZRK_Buffer != NULL );
+	logBlockHeader = (ZLOGBlockHeader_s *)zlogKey->ZRK_Buffer->pBuf.data;
+	zASSERT(zlogKey->ZRK_LogRecordNumber <=
+				logBlockHeader->ZLBH_NumberOfRecords );
+	if (zlogKey->ZRK_LogRecordNumber == logBlockHeader->ZLBH_NumberOfRecords )
+	{	/* Need to go to the first record of the next log block */
+		/* Release shared access on current log block buffer */
+		CACHE_RELEASE( zlogKey->ZRK_Buffer );
+		/* zlog_RecoveryGet requires the next two lines */
+		zlogKey->ZRK_Buffer = NULL;
+		zlogKey->ZRK_LogRecordNumber = 0;
+		ZLOG_NEXT_POOL_LOG_BLOCK( zlogBeast, zlogKey->ZRK_LogBufferNumber );
+		/* Are we at the end of the recovery area? */
+		if ( zlogKey->ZRK_LogBufferNumber == zlogKey->ZRK_BlockLast )
+		{	/* Yes - then return our special error code */
+			SetErrno( genMsg, zERR_ZLOG_NO_MORE_RECORDS );
+			RTN_STATUS(zFAILURE);
+		}
+	}
+	status = zlog_RecoveryGet( genMsg, zlogBeast, zlogKey );
+	RTN_STATUS(status);
+
+}	/* End of zlog_RGNReal() */
+
+
+/*
+	zlog_RGPReal - Does the high level work for a 'get previous'.
+
+	See comments for ZLOG_RecoveryGet() above for details.
+
+	With no buffer in the key this acts as a 'get last'.  Otherwise the
+	record number is decremented and, once it reaches -1, the buffer is
+	released and the key moves to the previous log block.
+	zERR_ZLOG_NO_MORE_RECORDS is set and zFAILURE handed to RTN_STATUS
+	when ZRK_LogBufferNumber equals the block preceding ZRK_BlockFirst;
+	otherwise the status of zlog_RecoveryGet() is passed on.
+
+*/
+
+STATUS zlog_RGPReal(
+	GeneralMsg_s *genMsg,
+	ZlogRecoveryKey_s *zlogKey )
+
+{
+
+	ZlogBeast_s *zlogBeast;
+	ZLOGBlockHeader_s *logBlockHeader;
+	STATUS status;
+	Blknum_t beforeFirst;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, ZLOG_RecoveryGetPrevious);
+	zlogBeast = zlogKey->ZRK_ZlogBeast;
+	zASSERT( zlogBeast != NULL );
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+
+	/* A NULL buffer means this is really get last */
+	if ( zlogKey->ZRK_Buffer == NULL )
+	{
+		beforeFirst = zlogKey->ZRK_BlockFirst;
+		ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, beforeFirst );
+		/* Are we at the beginning of the recovery area? */
+		if ( zlogKey->ZRK_LogBufferNumber == beforeFirst )
+		{	/* Yes - then return our special error code */
+			SetErrno( genMsg, zERR_ZLOG_NO_MORE_RECORDS );
+			RTN_STATUS(zFAILURE);
+		}
+		status = zlog_RecoveryGet( genMsg, zlogBeast, zlogKey );
+		RTN_STATUS(status);
+	}
+	else
+	{	/* This is the true get previous */
+		--zlogKey->ZRK_LogRecordNumber;
+	}
+	zASSERT( zlogKey->ZRK_Buffer != NULL );
+	logBlockHeader = (ZLOGBlockHeader_s *)zlogKey->ZRK_Buffer->pBuf.data;
+
+	zASSERT(zlogKey->ZRK_LogRecordNumber >= -1 );
+	if (zlogKey->ZRK_LogRecordNumber == -1 )
+	{	/* Need to go to the Last record of the previous log block */
+		/* Release shared access on current log block buffer */
+		CACHE_RELEASE( zlogKey->ZRK_Buffer );
+		/* zlog_RecoveryGet requires the next two lines */
+		zlogKey->ZRK_Buffer = NULL;
+//		zlogKey->ZRK_LogRecordNumber = -1;
+		/* Are we at the beginning of the recovery area?
+		 *
+		 * NOTE(review): this test is made BEFORE ZRK_LogBufferNumber
+		 * is stepped back, whereas zlog_RGNReal steps forward first
+		 * and tests afterwards.  Confirm that testing the current
+		 * block (rather than the block about to be read) against
+		 * 'beforeFirst' is intended.
+		 */
+		beforeFirst = zlogKey->ZRK_BlockFirst;
+		ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, beforeFirst );
+		if ( zlogKey->ZRK_LogBufferNumber == beforeFirst )
+		{	/* Yes - then return our special error code */
+			SetErrno( genMsg, zERR_ZLOG_NO_MORE_RECORDS );
+			RTN_STATUS(zFAILURE);
+		}
+		ZLOG_PREVIOUS_POOL_LOG_BLOCK( zlogBeast, zlogKey->ZRK_LogBufferNumber );
+	}
+	status = zlog_RecoveryGet( genMsg, zlogBeast, zlogKey );
+	RTN_STATUS(status);
+
+}	/* End of zlog_RGPReal() */
+
+/*
+ * Reads a log block and verifies the log block and all log records
+ * within the log block.
+ *
+ * Returns
+ *		Buffer or NULL
+ *		If Buffer then the shared latch on the buffer is still owned.
+ *		If NULL then genMsg filled in and no Latch.
+ *
+ * Error codes set by this routine itself:
+ *		zERR_ZLOG_BAD_CHECKSUM        - unused-LONG count larger than a block
+ *		zERR_ZLOG_BAD_BLOCK_SIGNATURE - wrong signature, or the block was
+ *		                                written by install (a "hole")
+ *		zERR_ZLOG_BAD_RECORD_COUNT    - record count outside the legal range
+ *		When COMN_GetFileBlk() or zlog_RGVerifyRecords() fails no errno
+ *		is set here; presumably those routines have filled in genMsg.
+ */
+
+Buffer_s *zlog_RGReadAndVerifyBlock(
+	GeneralMsg_s *genMsg,
+	ZlogBeast_s *zlogBeast,
+	Blknum_t blockToRead )
+
+{
+
+	Buffer_s *buffer;
+	ZLOGBlockHeader_s *logBlockHeader;
+//	LONG checksum;
+	STATUS status;
+	NINT recordNumber;
+	IoMsg_s ioMsg;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, zlog_RGReadAndVerifyBlock);
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+
+	FILEBLK_IO_MSG(ioMsg, zlogBeast, blockToRead, 1, CACHE_READ );
+	buffer = COMN_GetFileBlk(genMsg, &ioMsg);
+	/*
+	 * Note that COMN_GetFileBlk() has obtained a shared latch on
+	 * the buffer for us since we will only be reading it.  The latch
+	 * is released below on every error path; on success it is handed
+	 * back to the caller together with the buffer.  The latch is only
+	 * held if a non-NULL buffer was returned.
+	 */
+	/* Did our read succeed? */
+	if (buffer == NULL)
+	{	/* NO - An error occurred.  Note buffer latch is not owned. */
+		WARN( "LOG BLOCK - get block error" == NULL );
+		RTN_PTR(NULL);
+	}
+
+	/*** Setup log block header pointer ***/
+
+	/*
+	 * FixFixFix6(Future,Performance,Paul,Neal) - many of these
+	 * tests were done in RAP, do we really need to do them again?
+	 * This comes down to the ZSTORE rule of: if I read a block and
+	 * verify, then read again (no writes to blocks in between),
+	 * do I have to re-verify?
+	 *
+	 * As of September 1997, we have no problems with the recovery time
+	 * requirement of less than 15 seconds.
+	 */
+	logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data;
+	/*
+	 * Better verify length field used for checksumming. All log blocks
+	 * must have a log block header.  Log blocks do not have to have a log
+	 * record header because install does not fill in.
+	 */
+	if ( logBlockHeader->ZLBH_NumberOfUnusedLongs >
+			( ( (1 << zlogBeast->ZFSLOGblkSizeShift) -
+				sizeof(ZLOGBlockHeader_s) )/4) )
+	{
+		zASSERT( "LOG BLOCK - Pre-checksum error" == NULL );
+		status = zERR_ZLOG_BAD_CHECKSUM;
+		goto zlogRGRAVBStopProcessing2;
+	}
+	/*
+	 * Calculate the checksum.  The checksum code was removed because
+	 * the Transaction System changes the logging state element
+	 * of log records during recovery.  Not doing checksums
+	 * here is O.K. because the buffers have already made it
+	 * past one checksum when the log buffer was read in during
+	 * the Recover Active Pointer pass.
+	 */
+
+	/* Verify that this is a log block */
+	if ( logBlockHeader->ZLBH_Signature != ZLBH_S_SIGNATURE )
+	{	/* Incorrect signature */
+		zASSERT( "LOG BLOCK - Signature error" == NULL );
+		status = zERR_ZLOG_BAD_BLOCK_SIGNATURE;
+		goto zlogRGRAVBStopProcessing2;
+	}
+	/*** This is a ZLOG block (checksum is not recomputed, see above) ***/
+	if ( logBlockHeader->ZLBH_Status & ZLBH_S_INSTALL )
+	{	/*
+		 * Install wrote which means we must be done processing
+		 * the log file.  Note that the only time we should see an
+		 * install log block is if we have a "hole" in between
+		 * the home pointer and active pointer that was stored in the
+		 * checkpoint.  A "hole" is caused by some log blocks not
+		 * being written before we crash.  This hole implies that
+		 * we did not recover ALL metadata (although we do not know
+		 * if we failed on committed or non-committed data recovery).
+		 */
+		zASSERT( "LOG BLOCK - hole detected" == NULL );
+		status = zERR_ZLOG_BAD_BLOCK_SIGNATURE;
+		goto zlogRGRAVBStopProcessing2;
+	}
+	/*** This is a ZLOG block and it was not written by install;
+	 *		now range check the record count
+	 ***/
+	recordNumber = logBlockHeader->ZLBH_NumberOfRecords;
+	if ( ( recordNumber < 1 ) ||
+		 (recordNumber > ((zlog_MAXIMUM_RECORD_SIZE/sizeof(ZLOGRecordHeader_s)+1)) ))
+	{
+		zASSERT( "LOG BLOCK - Log Record count error" == NULL );
+		status = zERR_ZLOG_BAD_RECORD_COUNT;
+		goto zlogRGRAVBStopProcessing2;
+	}
+
+	/*** Verify all log records in the log block ***/
+
+	/*
+	 * Call our code that verifies all the log records in a single
+	 * log block
+	 */
+	status = zlog_RGVerifyRecords( genMsg, zlogBeast, buffer, recordNumber );
+	if ( status != zOK )
+	{
+		/* Release shared access on log block buffer */
+		CACHE_RELEASE( buffer );
+		buffer = NULL;
+	}
+	RTN_PTR(buffer);
+
+/*************************/
+zlogRGRAVBStopProcessing2:;
+/*************************/
+	/* Release shared access on log block buffer */
+	CACHE_RELEASE( buffer );
+	SetErrno( genMsg, status );
+	RTN_PTR(NULL);
+
+}	/* End of zlog_RGReadAndVerifyBlock() */
+
+
+/*
+ *
+ *	Verify ALL log records of the current log block.
+ *
+ *	Latches owned - ZLOG beast latch
+ *					ZLB_RecoveryLatch
+ *					Buffer's latch
+ *
+ *	Returns
+ *		zOK - If no errors.
+ *		zFAILURE - A fatal error in processing log block.
+ *
+ *	Additional Returns (via dereference of supplied pointers)
+ *		'genMsg' - If zFAILURE the 'genMsg' has been set.
+ *			'.errno' - On errors the 'errno' field is set.
+ *
+ *	Each record must be at least as long as a record header, the records
+ *	together must fit inside one log block, and every record's LSN must
+ *	be non-zero and below the active pointer LSN.  The buffer latch is
+ *	never released by this routine.
+ *
+ */
+
+STATUS zlog_RGVerifyRecords(
+	GeneralMsg_s *genMsg,
+	ZlogBeast_s *zlogBeast,
+	Buffer_s *buffer,
+	NINT recordNumber )
+
+{
+
+	ZLOGBlockHeader_s *logBlockHeader;
+	ZLOGRecordHeader_s *logRecordHeader;
+//	LONG checksum;
+	STATUS status;
+	NINT processedBytes;
+
+	ASSERT_MPKNSS_LOCK();
+	/* Shared buffer latch owned */
+	/* ZLOG's beastLatch owned */
+
+	ENTER(TZLOG, zlog_RGVerifyRecords);
+	zASSERT( buffer != NULL );
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+
+	/*** Setup log block header pointer ***/
+	logBlockHeader = (ZLOGBlockHeader_s *)buffer->pBuf.data;
+	zASSERT( logBlockHeader != 0 );
+#if ZLOG_DEBUG IS_ENABLED
+	if ( logBlockHeader->ZLBH_Signature != ZLBH_S_SIGNATURE )
+	{	/* Incorrect signature even though previous O.K. !!! */
+		zASSERT( "LOG BLOCK - Signature error" == NULL );
+		status = zERR_ZLOG_BAD_BLOCK_SIGNATURE;
+		goto zlogRGVRStopProcessing2;
+	}
+#endif
+	zASSERT( recordNumber >= 0 );
+
+	/* Records start right after the log block header */
+	processedBytes = sizeof( ZLOGBlockHeader_s );
+
+	for ( ; recordNumber > 0; recordNumber-- )
+	{
+		/* Point to next log record within block */
+		logRecordHeader = (ZLOGRecordHeader_s *)( ((BYTE *)logBlockHeader) +
+							processedBytes);
+		/* Verify that length is not too short for even just the header */
+		if ( logRecordHeader->ZLRH_LongLength < (sizeof(ZLOGRecordHeader_s)/4) )
+		{
+			zASSERT( "LOG RECORD - Pre-checksum error" == NULL );
+			status = zERR_ZLOG_BAD_RECORD_SIZE;
+			goto zlogRGVRStopProcessing2;
+		}
+		/* Add length (stored in LONGs, 4 bytes each) into total processed for this log block */
+		processedBytes += logRecordHeader->ZLRH_LongLength * 4;
+		/* Verify that we don't have more longs than a log block contains */
+		if ( processedBytes > (1 << zlogBeast->ZFSLOGblkSizeShift) )
+		{	/* Too many LONGs */
+			zASSERT( "LOG RECORD - Pre-checksum error" == NULL );
+			status = zERR_ZLOG_BAD_CHECKSUM;
+			goto zlogRGVRStopProcessing2;
+		}
+
+		/*
+		 * Calculate the checksum.  The checksum code was removed because
+		 * the Transaction System changes the logging state element
+		 * of log records during recovery.  Not doing checksums
+		 * here is O.K. because the buffers have already made it
+		 * past one checksum when the log buffer was read in during
+		 * the Recover Active Pointer pass.
+		 */
+
+		/*** Finally lets look at the LSN ***/
+
+		if ( logRecordHeader->ZLRH_Zxr.ZXR_Lsn == 0 )
+		{
+			zASSERT( "LOG RECORD - LSN value error" == NULL );
+			status = zERR_ZLOG_BAD_LSN;
+			goto zlogRGVRStopProcessing2;
+		}
+		/* The LSN in the log record must be less than the active
+			pointer LSN.  If not we stop. */
+		if ( logRecordHeader->ZLRH_Zxr.ZXR_Lsn >=
+						zlogBeast->ZLB_P.ZLBP_ActivePointerLsn )
+		{	/* LSN is too large to be legal */
+			zASSERT( "LOG RECORD - LSN value error" == NULL );
+			status = zERR_ZLOG_BAD_LSN;
+			goto zlogRGVRStopProcessing2;
+		}
+	}
+	RTN_STATUS(zOK);
+
+/**************************/
+zlogRGVRStopProcessing2:;
+/**************************/
+	/*
+	 * The shared latch on the log block buffer is NOT released here;
+	 * only the error code is recorded.  zlog_RGReadAndVerifyBlock()
+	 * releases the buffer when this routine does not return zOK.
+	 */
+	SetErrno( genMsg, status );
+	RTN_STATUS(zFAILURE);
+
+}	/* End of zlog_RGVerifyRecords() */
+/*
+ * zlog_THSerialized - Second half of ZLOG_TransactionHomed().
+ *
+ * ZLOG_TransactionHomed() passes this routine to FSM_X_LATCH together
+ * with the beast latch; the routine releases that latch itself with
+ * UNX_LATCH before it finishes.  The homed ZfsXaction_s is recovered
+ * from 'fsmLite' (the FsmLite_s embedded at xaction.agent.fsm).
+ *
+ * When the xstate test below passes, the routine
+ *	- bumps the homed transaction counters,
+ *	- removes the transaction from the beast's seniority list,
+ *	- if it was the oldest entry, moves the home pointer to the new
+ *	  oldest transaction, or to the active pointer when the list is now
+ *	  empty (skipped in NSS_DEBUG builds while CrashPools is set),
+ *	- schedules a checkpoint when the home pointer is more than
+ *	  ZLB_LogBlockFilledInMaximum blocks past the home pointer recorded
+ *	  for the newest checkpoint.
+ *
+ * In all cases the beast latch is released, the transaction's agent is
+ * signalled and the transaction is freed.
+ */
+void zlog_THSerialized( FsmLite_s *fsmLite )
+
+{
+	ZfsXaction_s *oldestZXaction;
+	ZfsXaction_s *homedZXaction;
+	ZlogBeast_s *zlogBeast;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, zlog_THSerialized);
+	homedZXaction = STRUCT( fsmLite, ZfsXaction_s, xaction.agent.fsm );
+	zlogBeast = homedZXaction->ZX_zlogBeast;
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+	/*
+	 * Not all transactions have log records.  So only check the
+	 * seniority list if we made it past start.
+	 */
+	if ( !(homedZXaction->xstate & XAS_XR_TS_START) )
+	{	/* This transaction has a log record */
+		zlogBeast->ZLB_TransactionsHomedSinceCheckpointCount++;
+		zlogBeast->ZLB_TransactionsHomedCount++;
+		/* Peek at the oldest transaction */
+		DQ_PEEK( &zlogBeast->ZLB_SeniorityListHead, oldestZXaction,
+					ZfsXaction_s, ZX_seniorityList );
+		zASSERT( oldestZXaction != NULL );
+		/*
+		 * Remove homed transaction from ZLOG Beast seniority
+		 * list (ZLB_SeniorityListHead)
+		 */
+		DQ_RMV( homedZXaction, ZX_seniorityList );
+		/* Is the homed transaction also the oldest transaction? */
+		if ( oldestZXaction == homedZXaction )
+		{	/*
+			 * YES - The homed transaction was the oldest transaction so
+			 * the home pointer can be moved.
+			 */
+			/* Get the new oldest transaction */
+			DQ_PEEK( &zlogBeast->ZLB_SeniorityListHead, oldestZXaction,
+						ZfsXaction_s, ZX_seniorityList );
+#if NSS_DEBUG IS_ENABLED
+			/*
+			 * For /nodata to work we must not update home otherwise
+			 * we will not do recovery on the metadata blocks that we
+			 * did not really write.
+			 */
+			if ( !CrashPools )
+			{
+#endif
+			/* Is there a new oldest transaction? */
+			if ( oldestZXaction == NULL )
+			{	/* NO -- update home pointer using active pointer */
+				zlogBeast->ZLB_P.ZLBP_HomePointerLsn =
+								zlogBeast->ZLB_P.ZLBP_ActivePointerLsn;
+				zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber =
+								zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber;
+				/*
+				 * I thought about taking a checkpoint whenever we
+				 * have no transactions left on our list.  This would
+				 * get a checkpoint out every time we were 'clean',
+				 * but this may occur too often during the normal
+				 * running case and would only gain us less recovery
+				 * time if we crash.  We will just drop through to
+				 * code that will do a checkpoint if home moved a
+				 * lot since the last checkpoint.
+				 */
+			}
+			else
+			{	/* YES, update home pointer using oldest Xaction info */
+				zlogBeast->ZLB_P.ZLBP_HomePointerLsn =
+								oldestZXaction->ZX_lsn;
+				zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber =
+								oldestZXaction->ZX_logBlockNumber;
+			}
+#if NSS_DEBUG IS_ENABLED
+			}
+#endif
+			{
+				/*
+				 * If the home pointer has moved a lot since our last
+				 * checkpoint OR if we are in throttle back mode
+				 * then schedule another checkpoint.  Note that
+				 * checkpoint schedule code will not schedule a
+				 * checkpoint if one is already scheduled.
+				 *
+				 * 'blocks' is the distance from the home pointer
+				 * block recorded for the newest checkpoint to the
+				 * current home pointer block; ZLB_NumberOfLogBlocks
+				 * is added when the current block number is the
+				 * smaller of the two.
+				 */
+				NINT checkpoint;
+				Blknum_t blocks;
+
+				checkpoint = zlog_GetNewestCheckpointIndex( zlogBeast );
+				if ( zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[checkpoint] >
+						zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber )
+				{
+					blocks = (zlogBeast->ZLB_NumberOfLogBlocks -
+								zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[checkpoint]) +
+								zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber;
+				}
+				else
+				{
+					blocks = zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber -
+								zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[checkpoint];
+				}
+
+				if ( blocks > zlogBeast->ZLB_LogBlockFilledInMaximum )
+				{
+					/* Async, non-blocking routine to schedule a checkpoint */
+					if (!(zlogBeast->ZLB_State & ZLOG_ZB_S_DOING_COMPENSATION))
+					{
+						zlog_CheckpointTakeSchedule( zlogBeast,
+							CHECKPOINT_CT_S_NORMAL|CHECKPOINT_CT_S_HOME_MOVEMENT );
+					}
+#if NSS_ASSERT IS_ENABLED
+					else
+					{
+						zASSERT("ZLOG file should not get filled up during Pass 7 of Redo/Undo" == NULL);
+					}
+#endif
+				}
+			}
+			/*
+			 * There is no reason to wake up waiters here because for them to
+			 * stay awake the pre oldest home pointer block must move.  This
+			 * block ONLY moves when checkpoints are taken.  Therefore the check
+			 * that is in the checkpoint code for waiters is all that is needed.
+			 * Noticed this when adding 'too' full waiting code on June 17, 1997.
+			 */
+		}
+	}
+	/** Release our exclusive latch over the ZLOG system **/
+	UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+	/*
+	 * Instead of calling transaction system to let it finish processing
+	 * transaction ZLOG finishes for it.
+	 */
+	defaultSignal( &homedZXaction->xaction.agent);
+	freeXaction( homedZXaction);
+	RTN_VOID();
+
+}	/* End of zlog_THSerialized */
+
+/* Standard FSM Rules
+ *
+ * void ZLOG_TransactionHomed( ZfsXaction_s *xaction )
+ *
+ *
+ *	Used to tell ZLOG that it can move the home pointer forward.  The
+ *  Transaction System calls off a pointer in the ZfsXaction_s structure
+ *  every time a transaction is homed.  ZLOG has placed this routine
+ *  into the pointer (in ZLOG_ObtainRecord).  ZLOG must check to see if
+ *  the ZLOG home pointer can be moved.
+ *
+ *	The Transaction System allows ZLOG to use the FsmLite_s within the home
+ *  agent within the XAS.
+ *
+ *	This routine only requests the beast latch through FSM_X_LATCH, naming
+ *	zlog_THSerialized() as the routine to run; the home pointer work and
+ *	the release of the latch are done there.
+ *
+ */
+
+void ZLOG_TransactionHomed( ZfsXaction_s *xaction )
+{
+
+	FsmLite_s *fsmLite;
+	ZlogBeast_s *zlogBeast;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, ZLOG_TransactionHomed);
+	fsmLite = &xaction->xaction.agent.fsm;
+	zlogBeast = xaction->ZX_zlogBeast;
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+	/* Serialize the ZLOG system for this beast */
+	FSM_X_LATCH( &zlogBeast->ZFSLOGbeastLatch, fsmLite, zlog_THSerialized );
+	RTN_VOID();
+
+}	/* End of ZLOG_TransactionHomed */
+
+
+
+
+/*===========================================================================
+ *===========================================================================
+ *
+ *	ZFS LOG BEAST OPERATIONS Functions
+ *
+ *===========================================================================
+ *===========================================================================*/
+
+/*
+ * See comnBeast.h for more information about Storage System operations.
+ * + * Note - Unlike Common and Auth operations we do NOT inherit storage + * pool operations from our parent. + */ + +/**************************************************************************** + * ZFS BASED storage pack routine + *****************************************************************************/ + + /* Requirements: Synchronous and non-blocking + Entry state: Shared lock on beast latch + Zlog_ZfsPackedSize - + + */ +STATIC NINT Zlog_ZfsPackedSize( + void *zlogBeast_LX) +{ + ZlogBeast_s *zlogBeast = (ZlogBeast_s *)zlogBeast_LX; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_ZfsPackedSize); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); +// ASSERT_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + RTN_NINT(sizeof( zlogBeast->ZLB_P )); +} /* End of Zlog_ZfsPackedSize */ + +/* + * Requirements: Synchronous and non-blocking + * Entry state: Shared lock on beast latch + * + * Zlog_ZfsPack - We assume this gets called only when a checkpoint is + * being taken. 
+ *
+ *	Before copying, the beast's persistent area (ZLB_P) is brought up to
+ *	date: the old-home-pointer slot of the next checkpoint, the reference
+ *	block counts and the pool's physical read/write statistics.
+ *
+ * Return -
+ *		Pointer to the next free location within storeBuffer, i.e.
+ *		storeBuffer advanced by sizeof(ZLB_P).  This routine takes no
+ *		genMsg and has no failure return of its own.
+ *
+ */
+
+STATIC BYTE *Zlog_ZfsPack(
+	void *zlogBeast_LX,
+	BYTE *storeBuffer)
+{
+	ZlogBeast_s *zlogBeast = (ZlogBeast_s *)zlogBeast_LX;
+	ZfsPool_s *zfsPool;
+	ZlssPool_s *zlssPool;
+	NINT nextCheckpoint;
+	MediaCheckpoint_s *mcp;
+
+	ASSERT_MPKNSS_LOCK();
+	ENTER(TZLOG, Zlog_ZfsPack);
+	zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE );
+	zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE );
+	zASSERT( zlogBeast->ZLB_P.ZLBP_VersionMajor == ZLOG_ZLBP_VM_MAJOR );
+
+//	ASSERT_LATCH( &zlogBeast->ZFSLOGbeastLatch );
+	zfsPool = zlogBeast->ZFSLOGroot.vol.zfsVol->pool;
+	zASSERT( zfsPool != NULL );
+	zASSERT( COMN_IsDerivedFrom(zfsPool, zFTYPE_ZLSS_ZFSPOOL) );
+
+	zlssPool = ZFS_POOL_TO_ZLSS_POOL( zfsPool );
+
+	zASSERT( zlssPool != NULL );
+	zASSERT( COMN_IsDerivedFrom(zlssPool, zFTYPE_ZLSS_LOGICAL_POOL) );
+
+	mcp = &zfsPool->ZP_super->SB_Checkpoint;
+	/*
+	 * Update the ZLOG beast's copy of the four checkpoints' home
+	 * pointers.  This can be done now (before we know checkpoint makes
+	 * it to the media) because our code that uses the old home pointer
+	 * only executes if the checkpoint gets written (see
+	 * ZFSPOOL_CheckpointTake()).  These values are also used when we
+	 * come up in the recovery code.  By updating here the checkpoint
+	 * will have the latest information when we come up next time.
+	 *
+	 * The slot used is the one after ZP_NextCheckpoint, wrapping to 0
+	 * at CHECKPOINT_NUMBER.
+	 */
+	nextCheckpoint = zfsPool->ZP_NextCheckpoint + 1;
+	if ( nextCheckpoint >= CHECKPOINT_NUMBER )
+	{
+		nextCheckpoint = 0;
+	}
+	/*
+	 * Feed the block-in-use histogram/history with the number of log
+	 * blocks between the old home pointer block of the slot about to be
+	 * overwritten and the active pointer block.  ZLB_NumberOfLogBlocks
+	 * is added when the active pointer block number is the smaller of
+	 * the two.  This must happen before the slot is overwritten below.
+	 */
+	{
+		LONG diff;
+
+		if ( !(mcp->chkPnt.CP_State & CHECKPOINT_CT_S_DEBUG) )
+		{	/* Do not update histogram if this is a debug checkpoint.  In
+			 * the debug system we take 2 checkpoints each time
+			 * checkpoint take is called.  The debug checkpoint messes
+			 * up our histogram information.
+			 */
+			if ( zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[nextCheckpoint]
+					<= zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber )
+			{
+				diff = zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber -
+						zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[nextCheckpoint];
+			}
+			else
+			{
+				zASSERT( zlogBeast->ZLB_NumberOfLogBlocks > 0 );
+
+				diff = zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber +
+						zlogBeast->ZLB_NumberOfLogBlocks -
+						zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[nextCheckpoint];
+			}
+			ZLOG_HISTOGRAM_EVENT( &zlogBeast->ZLB_BlockInuseCountHistogram,
+					diff );
+			ZLOG_HISTORY_EVENT( &zlogBeast->ZLB_BlockInuseCountHistory,
+					diff );
+		}
+	}
+#if NSS_DEBUG IS_ENABLED
+	/* Store the time we packed the checkpoint
+	 */
+	zlogBeast->ZLB_CheckpointTime[nextCheckpoint] = GetUTCTime();
+#endif
+	zlogBeast->ZLB_P.ZLBP_OldHomePointerLsn[nextCheckpoint] =
+						zlogBeast->ZLB_P.ZLBP_HomePointerLsn;
+	zlogBeast->ZLB_P.ZLBP_OldHomePointerBlockNumber[nextCheckpoint] =
+						zlogBeast->ZLB_P.ZLBP_HomePointerBlockNumber;
+	/*
+	 * Record/track information about how many blocks are being referenced
+	 * from the previous home pointer to the active pointer.  This
+	 * tracks the 'normal' worst case block references that recovery
+	 * will have to access.  I.E. if we crashed now recovery would
+	 * start at the previous home pointer (it is the newest copy
+	 * on the media) and stop at the current active pointer (if
+	 * the log blocks have made it to the media already).  This is the
+	 * 'normal' worst case because we assume that we are able to
+	 * use the newest checkpoint when we do recovery.
+	 */
+	if ( !(mcp->chkPnt.CP_State & CHECKPOINT_CT_S_DEBUG) )
+	{	/* Debug checkpoints are skipped, as for the histogram above */
+		ZLOG_HISTOGRAM_EVENT( &zlogBeast->ZLB_ReferenceBlockCountHistogram,
+				zlogBeast->ZLB_P.ZLBP_ActivePointerReferenceBlockCount -
+				zlogBeast->ZLB_P.ZLBP_PreHomePointerReferenceBlockCount );
+		ZLOG_HISTORY_EVENT( &zlogBeast->ZLB_ReferenceBlockCountHistory,
+				zlogBeast->ZLB_P.ZLBP_ActivePointerReferenceBlockCount -
+				zlogBeast->ZLB_P.ZLBP_PreHomePointerReferenceBlockCount );
+	}
+	zlogBeast->ZLB_P.ZLBP_PreHomePointerReferenceBlockCount =
+				zlogBeast->ZLB_P.ZLBP_HomePointerReferenceBlockCount;
+	zlogBeast->ZLB_P.ZLBP_HomePointerReferenceBlockCount =
+				zlogBeast->ZLB_P.ZLBP_ActivePointerReferenceBlockCount;
+
+	/* Now grab the ZLSS POOL's physical I/O statistics and
+	 * throw them into ZLOG.  This way we get fairly persistent
+	 * information in ZLOG.
+	 */
+	zASSERT( sizeof( zlssPool->ZP_PRS ) == sizeof(zlogBeast->ZLB_P.ZLBP_StatisticsRead) );
+	memcpy( &zlogBeast->ZLB_P.ZLBP_StatisticsRead,
+			&zlssPool->ZP_PRS,
+			sizeof ( zlogBeast->ZLB_P.ZLBP_StatisticsRead ) );
+	zASSERT( sizeof( zlssPool->ZP_PWS ) == sizeof(zlogBeast->ZLB_P.ZLBP_StatisticsWrite) );
+	memcpy( &zlogBeast->ZLB_P.ZLBP_StatisticsWrite,
+			&zlssPool->ZP_PWS,
+			sizeof ( zlogBeast->ZLB_P.ZLBP_StatisticsWrite ) );
+	zlogBeast->ZLB_P.ZLBP_StatisticsResetUTCTime = (LONG)zlssPool->ZP_StatisticsResetUTCTime;
+
+	/*** Do the pack now that we have finished updating the beast ***/
+	memcpy(storeBuffer,&zlogBeast->ZLB_P, sizeof( zlogBeast->ZLB_P ) );
+	RTN_PTR(storeBuffer + sizeof( zlogBeast->ZLB_P ));
+
+}	/* End of Zlog_ZfsPack */
+
+
+/*
+ * Requirements: Synchronous
+ * Entry state: No lock on beast latch
+ *	Zlog_ZfsUnpack -
+ *
+ * Return -
+ *		Non-NULL - Pointer to next free location within storeBuffer
+ *		NULL indicates an error - errno set in genMsg
+ *
+ * Note -
+ *	No latch is owned on the beastLatch for performance reasons.
This + * works because the beast has not yet been fully created so it is + * not on any lists. I.E. we do not have the possiblity of concurrent + * access yet. + * + */ + +STATIC BYTE *Zlog_ZfsUnpack( + GeneralMsg_s *genMsg, + void *zlogBeast_LX, + BYTE *storeBuffer) +{ + ZlogBeast_s *zlogBeast = (ZlogBeast_s *)zlogBeast_LX; + ZlogBeastPersistent_s *zlogPForV = (ZlogBeastPersistent_s *)storeBuffer; + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Zlog_ZfsUnpack); + + /* The persistent size of the 1.0 ZLOG beast is 3220 */ + zASSERT( 3320 == sizeof(ZlogBeastPersistent_1Dot0_s ) ); + /* If the next item changes then the unpack routine may need to change */ + zASSERT( 3448 == sizeof(ZlogBeastPersistent_s ) ); + switch ( zlogPForV->ZLBP_VersionMajor ) + { + case 1: + { + ZlogBeastPersistent_1Dot0_s *zlogP1Dot0 = (ZlogBeastPersistent_1Dot0_s *)storeBuffer; + + /** Copy ALL the items at the beginning of the persistent + ** areas as they match in both versions 1.0 and 3.0. + ** After this we MUST assign items as they no longer match + ** up(i.e. offsets within structures are differrent). + **/ + + if ( zlogP1Dot0->ZLBP_1Dot0_Signature2 != ZLOG_ZLBP_S_SIGNATURE ) + { + SetErrno( genMsg, zERR_ZLOG_BAD_BEAST_SIGNATURE ); + RTN_PTR(NULL); + } + if ( zlogP1Dot0->ZLBP_1Dot0_Signature != ZLOG_ZLBP_S_SIGNATURE ) + { + SetErrno( genMsg, zERR_ZLOG_BAD_BEAST_SIGNATURE ); + RTN_PTR(NULL); + } + memcpy( &zlogBeast->ZLB_P, storeBuffer, + offsetof(ZlogBeastPersistent_1Dot0_s, + ZLBP_1Dot0_PreHomePointerReferenceBlockCount) ); + + /** Init new persistnent histograms... + **/ + zlog_initAllPersistentStatistics( zlogBeast ); + + /** Assign and convert any histograms that we care + ** about. We do the following histograms. + ** + ** 1) Function Histogram. 
+ **/ + + zlogBeast->ZLB_P.ZLBP_FunctionHistogramPersistent.ZHP_CurrentEvent = + zlogP1Dot0->ZLBP_1Dot0_FunctionHistogram.ZH_1Dot0_CurrentEvent; + zlogBeast->ZLB_P.ZLBP_FunctionHistogramPersistent.ZHP_EventCount = + zlogP1Dot0->ZLBP_1Dot0_FunctionHistogram.ZH_1Dot0_EventCount; + zASSERT(sizeof( zlogP1Dot0->ZLBP_1Dot0_FunctionBucket ) <= + sizeof( zlogBeast->ZLB_P.ZLBP_FunctionBucket ) ); + memcpy( &zlogBeast->ZLB_P.ZLBP_FunctionBucket[0], + &zlogP1Dot0->ZLBP_1Dot0_FunctionBucket[0], + sizeof( zlogP1Dot0->ZLBP_1Dot0_FunctionBucket ) ); + + /** These four items only exist in some early (pre-October + ** 2000 versions of Six-Pack media). We handle so that + ** these six-pack volumes will not have to be destroyed. + ** On MOAB/Cobra system this lines just copy some + ** zeros around. + **/ + zlogBeast->ZLB_P.ZLBP_CompensationPtrLsn = zlogP1Dot0->ZLBP_1Dot0_CompensationPtrLsn; + zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrLsn = zlogP1Dot0->ZLBP_1Dot0_LogicalUndoPtrLsn; + zlogBeast->ZLB_P.ZLBP_CompensationPtrBlkNum = zlogP1Dot0->ZLBP_1Dot0_CompensationPtrBlkNum; + zlogBeast->ZLB_P.ZLBP_LogicalUndoPtrBlkNum = zlogP1Dot0->ZLBP_1Dot0_LogicalUndoPtrBlkNum; + + +// bzero( &zlogBeast->ZLB_P.ZLBP_Reserved, sizeof(zlogBeast->ZLB_P.ZLBP_Reserved) ); + /** Assign the last TWO items of the persistent + ** areas as they do NOT match in versions 1.0 and 3.0. + **/ + zlogBeast->ZLB_P.ZLBP_Signature = zlogP1Dot0->ZLBP_1Dot0_Signature; +// zlogBeast->ZLB_P.ZLBP_Pad2 = zlogP1Dot0->ZLBP_1Dot0_Pad2; + + /* Now that we have unpacked, update version so + * that it will be persistent on next PACK. 
+ */ + zlogBeast->ZLB_P.ZLBP_VersionMajor = ZLOG_ZLBP_VM_MAJOR; + storeBuffer += sizeof( *zlogP1Dot0 ); + break; + } + case ZLOG_ZLBP_VM_MAJOR: + { + ZlogBeastPersistent_s *zlogP = (ZlogBeastPersistent_s *)storeBuffer; + + if ( zlogP->ZLBP_Signature2 != ZLOG_ZLBP_S_SIGNATURE ) + { + SetErrno( genMsg, zERR_ZLOG_BAD_BEAST_SIGNATURE ); + RTN_PTR(NULL); + } + if ( zlogP->ZLBP_Signature != ZLOG_ZLBP_S_SIGNATURE ) + { + SetErrno( genMsg, zERR_ZLOG_BAD_BEAST_SIGNATURE ); + RTN_PTR(NULL); + } + memcpy(&zlogBeast->ZLB_P,storeBuffer, sizeof( zlogBeast->ZLB_P )); + storeBuffer += sizeof( *zlogP ); + break; + } + default: + { + zASSERT("In Zlog_ZfsUnpack for unknown ZLOG beasts"==NULL); + SetErrno( genMsg, zERR_ZLOG_UNSUPPORTED_BEAST_VERSION ); + RTN_PTR(NULL); + } + } + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + /* + * Restore persistent items back to default values. Usaully the items + * being retored have only been placed in the persistent area so + * that they can be viewed by the user or developer. + */ + zlogBeast->ZLB_P.ZLBP_Mode = ZLOG_MODE_DEFAULT; + /* + * Initialize common non-persistent items that are dependent on + * persistent information. 
+ */ + if ( zlog_CommonCreateAndOpen( genMsg, zlogBeast ) != zOK ) + { + zASSERT("Please get Greg (or Vandana)"==NULL); + RTN_PTR( NULL ); + } + zASSERT( zlogBeast->ZLB_ReferenceBlockCountHistogram.ZH_Bucket == + &zlogBeast->ZLB_P.ZLBP_ReferenceBlockCountBucket ); + zASSERT( zlogBeast->ZLB_FunctionHistogram.ZH_Bucket == + &zlogBeast->ZLB_P.ZLBP_FunctionBucket ); + zASSERT( zlogBeast->ZLB_SizeHistogram.ZH_Bucket == + &zlogBeast->ZLB_P.ZLBP_SizeBucket ); + zASSERT( zlogBeast->ZLB_BlockInuseCountHistogram.ZH_Bucket == + &zlogBeast->ZLB_P.ZLBP_BlockInuseCountBucket ); + zASSERT( zlogBeast->ZLB_RecoveryTimeHistogram.ZH_Bucket == + &zlogBeast->ZLB_P.ZLBP_RecoveryTimeBucket ); + zASSERT( zlogBeast->ZLB_DeferredWritesHistogram.ZH_Bucket == + &zlogBeast->ZLB_P.ZLBP_DeferredWritesBucket ); + RTN_PTR(storeBuffer); + +} /* End of Zlog_ZfsUnpack() */ + + +/*--------------------------------------------------------------------------- + * ZFS Log Beast(ZLOG) STORAGE ops definition + *---------------------------------------------------------------------------*/ +LSSSpecificPackUnpackOps_s ZLOG_lssOps[] = +{ + /* + * State on entry + * + * beastLatch + * PackedSize Shared(Owned) + * Pack Shared(Owned) + * Unpack None + * + * Requirements placed on code + * + * Synchronous Non-Blocking + * PackedSize Yes Yes + * Pack Yes Yes + * Unpack Yes No + * + */ + + {zLSS_ID_ZLSS,Zlog_ZfsPackedSize,Zlog_ZfsPack,NULL,Zlog_ZfsUnpack}, + {zLSS_ID_INVALID} + +}; + + +void Zlog_Throttle( Volume_s *volume, ZfsXaction_s *zlogXaction ) + +{ + ZlogBeast_s *zlogBeast; + + + ASSERT_MPKNSS_LOCK(); + ENTER(TZLOG, Throttle); + zASSERT( zlogXaction != NULL ); + zlogBeast = zlogXaction->ZX_zlogBeast; + zASSERT( zlogBeast != NULL ); + zASSERT( zlogBeast->ZLB_Signature == ZLOG_ZLB_S_SIGNATURE ); + zASSERT( zlogBeast->ZFSLOGroot.eof >= ZLOG_FILE_SIZE_MINIMUM ); + zASSERT( zlogBeast->ZLB_P.ZLBP_Signature == ZLOG_ZLBP_S_SIGNATURE ); + /* + * Verify that we are not about to give out a block when in + * throttle 
mode. If in throttle mode we do not allow any NEW + * common layer transactions obtain a first log record. + * + * Is the file 'too' full? + */ + + if ( zlogBeast->ZLB_P.ZLBP_Mode & ZLOG_THROTTLE_BACK ) + { /* In throttle back mode */ + NINT blocksInuse; + NINT percent; + SNINT delayAmount = 0; /* In milliseconds */ + + ++zlogBeast->ZLB_P.ZLBP_FileThrottleWaitCount; + + X_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + + if ( zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber <= + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber ) + { /* Do wrap math calculation */ + blocksInuse = (NINT)zlogBeast->ZLB_NumberOfLogBlocks - + (NINT)zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber + + (NINT)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber; + zASSERT(zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber != + zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber ); + } + else + { /* No wrap -- do normal math */ + blocksInuse = (NINT)zlogBeast->ZLB_P.ZLBP_ActivePointerBlockNumber - + (NINT)zlogBeast->ZLB_P.ZLBP_PreOldestHomePointerBlockNumber; + } + if ( blocksInuse >= zlogBeast->ZLB_P.ZLBP_FileThrottleBack ) + { /* Yes - The file is 'too' full */ + /** August 9, 2001 - + ** A recent fix to rebuild causes the ZLOG to + ** throttle rebuild. On my volume I saw that this + ** throttle was TOO extreme because we did not + ** take enough checkpoints to move PRE oldest home + ** pointer. On a 1GB volume with 7000 files we + ** would wait for the 30 second timer to move the + ** fourth checkpoint (the one that would make the + ** ZLOG file less than 1/2 full). In the long run + ** we may wish to remove that logic that says the + ** ZLOG fullness is measured from ACTIVE to OLDEST HOME. + ** This is no longer TRUE since we no longer UNDO + ** committed transactions. I.E. we never have to go + ** past the 1st checkpoint. BUT BUT BUT if we can not + ** read a checkpoint then the previous checkpoint(s) + ** need to still have a valid home. 
+ **/ + zlog_CheckpointTakeSchedule( zlogBeast, + CHECKPOINT_CT_S_NORMAL|CHECKPOINT_CT_S_THROTTLE_BACK ); + percent = (100u * blocksInuse) / + zlogBeast->ZLB_NumberOfLogBlocks; + if ( percent < 55u ) + { + delayAmount = 1000u/10u; /* .1 sec */ + ++zlogBeast->ZLB_P.ZLBP_FileThrottleWait1Count; + } + else + { + if ( percent < 60u ) + { + delayAmount = 1000u/2u; /* .5 sec */ + ++zlogBeast->ZLB_P.ZLBP_FileThrottleWait2Count; + } + else + { + if ( percent < 70u ) + { + delayAmount = 1000u*1u; /* 1 sec */ + ++zlogBeast->ZLB_P.ZLBP_FileThrottleWait3Count; + } + else + { + if ( percent < 80u ) + { + delayAmount = 1000u*10u; /* 10 sec */ + ++zlogBeast->ZLB_P.ZLBP_FileThrottleWait4Count; + } + else + { + if ( percent < 90u ) + { + delayAmount = 1000u*100u; /* 100 sec */ + ++zlogBeast->ZLB_P.ZLBP_FileThrottleWait5Count; + } + else + { + /** On March 1, 2001 + * Paul, Vandana have decided that + * we should DISBALE pools that have a full + * ZLOG file. We do this at 90% so to + * leave room for logical undo (it needs + * to copy logical undo records at + * REDO/UNDO time). + */ + GeneralMsg_s dummyGenMsg; + LONG flags; + + flags = CVA_SYSTEM_DATA | CVA_NON_IO | CVA_POOL_DISABLE | CVA_POOL_ALERT; + COMN_SETUP_GENERAL_MSG_NOSA( &dummyGenMsg ); + (void)COMN_VolumeAlert( &dummyGenMsg, NULL, + volume, NULL, 0, 0, zERR_ZLOG_FILE_FULL, + WHERE, flags ); + } + } + } + } + } + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + /* The specificaiton requires us not to sleep for + * a long time so that if the throttle mode changes + * our thread gets to run 'right' away. We therefore + * delay 0.1 seconds and then see if still in throttle + * back mode. We also let run when thread has waited + * required amount(remember this prevents deadlocks). + */ + do { + LB_delay( 1000u/10u ); + delayAmount -= 1000u/10u; + /* Note that we normally are in a MPKNSS_LOCK region + * when looking at the throttle bit. We ignore here + * because it will not cause any serious problems. 
+ */ + } while ( (delayAmount > 0) && + (zlogBeast->ZLB_P.ZLBP_Mode & ZLOG_THROTTLE_BACK) ); + RTN_VOID(); + } + + UNX_LATCH( &zlogBeast->ZFSLOGbeastLatch ); + RTN_VOID(); + } + + RTN_VOID(); + +}