git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@482 0109f412-320b-0410-ab79-c3e0c5ffbbe6
436 lines
10 KiB
C++
436 lines
10 KiB
C++
//------------------------------------------------------------------------------
|
|
// Desc: Index collation routines
|
|
//
|
|
// Tabs: 3
|
|
//
|
|
// Copyright (c) 1991-2006 Novell, Inc. All Rights Reserved.
|
|
//
|
|
// This program is free software; you can redistribute it and/or
|
|
// modify it under the terms of version 2 of the GNU General Public
|
|
// License as published by the Free Software Foundation.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, contact Novell, Inc.
|
|
//
|
|
// To contact Novell about this file by physical or electronic mail,
|
|
// you may find current contact information at www.novell.com
|
|
//
|
|
// $Id$
|
|
//------------------------------------------------------------------------------
|
|
|
|
#include "flaimsys.h"
|
|
|
|
// Forward declaration of the file-local text formatter used by
// KYCollateValue; the definition appears later in this file.

FSTATIC RCODE KYFormatUTF8Text(
	IF_PosIStream *	pIStream,
	FLMUINT				uiFlags,
	FLMUINT				uiCompareRules,
	F_DynaBuf *			pDynaBuf);
|
|
|
|
/****************************************************************************
|
|
Desc: Build a collated key value piece.
|
|
****************************************************************************/
|
|
RCODE KYCollateValue(
|
|
FLMBYTE * pucDest,
|
|
FLMUINT * puiDestLen,
|
|
IF_PosIStream * pIStream,
|
|
FLMUINT uiDataType,
|
|
FLMUINT uiFlags,
|
|
FLMUINT uiCompareRules,
|
|
FLMUINT uiLimit,
|
|
FLMUINT * puiCollationLen,
|
|
FLMUINT * puiLuLen,
|
|
FLMUINT uiLanguage,
|
|
FLMBOOL bFirstSubstring,
|
|
FLMBOOL bDataTruncated,
|
|
FLMBOOL * pbDataTruncated,
|
|
FLMBOOL * pbOriginalCharsLost)
|
|
{
|
|
RCODE rc = NE_SFLM_OK;
|
|
FLMUINT uiDestLen;
|
|
IF_BufferIStream * pBufferIStream = NULL;
|
|
FLMUINT uiCharLimit;
|
|
FLMUINT uiLength;
|
|
FLMBYTE * pucTmpDest;
|
|
FLMUINT uiBytesRead;
|
|
FLMBOOL bHaveData = TRUE;
|
|
FLMUNICODE uChar;
|
|
FLMBYTE ucDynaBuf[ 64];
|
|
F_DynaBuf dynaBuf( ucDynaBuf, sizeof( ucDynaBuf));
|
|
|
|
if (puiLuLen)
|
|
{
|
|
*puiLuLen = 0;
|
|
}
|
|
|
|
if ((uiDestLen = *puiDestLen) == 0)
|
|
{
|
|
rc = RC_SET( NE_SFLM_KEY_OVERFLOW);
|
|
goto Exit;
|
|
}
|
|
|
|
if (uiDataType != SFLM_STRING_TYPE)
|
|
{
|
|
if( !pIStream->remainingSize())
|
|
{
|
|
bHaveData = FALSE;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
FLMUINT64 ui64SavePosition = pIStream->getCurrPosition();
|
|
|
|
if( RC_BAD( rc = f_readUTF8CharAsUnicode(
|
|
pIStream, &uChar)))
|
|
{
|
|
if (rc == NE_SFLM_EOF_HIT)
|
|
{
|
|
bHaveData = FALSE;
|
|
rc = NE_SFLM_OK;
|
|
}
|
|
else
|
|
{
|
|
goto Exit;
|
|
}
|
|
}
|
|
|
|
if( RC_BAD( rc = pIStream->positionTo( ui64SavePosition)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
|
|
// The text is expected to be 0-terminated UTF-8
|
|
|
|
if ((uiFlags & ICD_ESC_CHAR) ||
|
|
(uiCompareRules &
|
|
(FLM_COMP_COMPRESS_WHITESPACE |
|
|
FLM_COMP_NO_WHITESPACE |
|
|
FLM_COMP_NO_UNDERSCORES |
|
|
FLM_COMP_NO_DASHES |
|
|
FLM_COMP_WHITESPACE_AS_SPACE |
|
|
FLM_COMP_IGNORE_LEADING_SPACE |
|
|
FLM_COMP_IGNORE_TRAILING_SPACE)))
|
|
{
|
|
dynaBuf.truncateData( 0);
|
|
if (RC_BAD( rc = KYFormatUTF8Text( pIStream,
|
|
uiFlags, uiCompareRules, &dynaBuf)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
|
|
if( RC_BAD( rc = FlmAllocBufferIStream( &pBufferIStream)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
|
|
if (RC_BAD( rc = pBufferIStream->open(
|
|
(const char *)dynaBuf.getBufferPtr(), dynaBuf.getDataLength())))
|
|
{
|
|
goto Exit;
|
|
}
|
|
pIStream = pBufferIStream;
|
|
}
|
|
|
|
uiCharLimit = uiLimit ? uiLimit : ICD_DEFAULT_LIMIT;
|
|
|
|
if( (uiLanguage >= FLM_FIRST_DBCS_LANG ) &&
|
|
(uiLanguage <= FLM_LAST_DBCS_LANG))
|
|
{
|
|
if( RC_BAD( rc = f_asiaUTF8ToColText( pIStream, pucDest, &uiDestLen,
|
|
(uiCompareRules & FLM_COMP_CASE_INSENSITIVE)
|
|
? TRUE
|
|
: FALSE,
|
|
puiCollationLen, puiLuLen,
|
|
uiCharLimit, bFirstSubstring,
|
|
bDataTruncated, pbDataTruncated)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if( RC_BAD( rc = flmUTF8ToColText( pIStream, pucDest, &uiDestLen,
|
|
(uiCompareRules & FLM_COMP_CASE_INSENSITIVE)
|
|
? TRUE
|
|
: FALSE,
|
|
puiCollationLen, puiLuLen,
|
|
uiLanguage, uiCharLimit, bFirstSubstring,
|
|
bDataTruncated,
|
|
pbOriginalCharsLost, pbDataTruncated)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
}
|
|
}
|
|
|
|
// TRICKY: uiDestLen could be set to zero if text and no value.
|
|
|
|
if (!bHaveData || !uiDestLen)
|
|
{
|
|
uiDestLen = 0;
|
|
goto Exit;
|
|
}
|
|
|
|
switch (uiDataType)
|
|
{
|
|
case SFLM_STRING_TYPE:
|
|
break;
|
|
|
|
case SFLM_NUMBER_TYPE:
|
|
{
|
|
FLMBYTE ucTmpBuf [FLM_MAX_NUM_BUF_SIZE];
|
|
|
|
uiLength = (FLMUINT)pIStream->remainingSize();
|
|
|
|
flmAssert( uiLength <= sizeof( ucTmpBuf));
|
|
|
|
if (RC_BAD( rc = pIStream->read( ucTmpBuf, uiLength, &uiBytesRead)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
flmAssert( uiBytesRead == uiLength);
|
|
if (RC_BAD( rc = flmStorageNum2CollationNum( ucTmpBuf,
|
|
uiBytesRead, pucDest, &uiDestLen)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
break;
|
|
}
|
|
|
|
case SFLM_BINARY_TYPE:
|
|
{
|
|
uiLength = (FLMUINT)pIStream->remainingSize();
|
|
pucTmpDest = pucDest;
|
|
|
|
if (uiLength >= uiLimit)
|
|
{
|
|
uiLength = uiLimit;
|
|
bDataTruncated = TRUE;
|
|
}
|
|
|
|
// We don't want any single key piece to "pig out" more
|
|
// than 256 bytes of the key
|
|
|
|
if (uiDestLen > 256)
|
|
{
|
|
uiDestLen = 256;
|
|
}
|
|
|
|
if (uiLength > uiDestLen)
|
|
{
|
|
|
|
// Compute length so will not overflow
|
|
|
|
uiLength = uiDestLen;
|
|
bDataTruncated = TRUE;
|
|
}
|
|
else
|
|
{
|
|
uiDestLen = uiLength;
|
|
}
|
|
|
|
// Store as is.
|
|
|
|
if (RC_BAD( rc = pIStream->read( pucTmpDest, uiDestLen, &uiBytesRead)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
|
|
if (bDataTruncated && pbDataTruncated)
|
|
{
|
|
*pbDataTruncated = TRUE;
|
|
}
|
|
break;
|
|
}
|
|
|
|
default:
|
|
{
|
|
rc = RC_SET( NE_SFLM_CANNOT_INDEX_DATA_TYPE);
|
|
break;
|
|
}
|
|
}
|
|
|
|
Exit:
|
|
|
|
if( pBufferIStream)
|
|
{
|
|
pBufferIStream->Release();
|
|
}
|
|
|
|
*puiDestLen = uiDestLen;
|
|
return( rc);
|
|
}
|
|
|
|
/****************************************************************************
|
|
Desc: Format text removing leading and trailing spaces. Treat
|
|
underscores as spaces. As options, remove all spaces and dashes.
|
|
Ret: NE_SFLM_OK always. WIll truncate so text will fill SFLM_MAX_KEY_SIZE.
|
|
Allocate 8 more than SFLM_MAX_KEY_SIZE for psDestBuf.
|
|
Visit: Pass in uiLimit and pass back a truncated flag when the
|
|
string is truncated. This was not done because we will have
|
|
to get the exact truncated count that is done in f_tocoll.cpp
|
|
and that could introduce some bugs.
|
|
****************************************************************************/
|
|
FSTATIC RCODE KYFormatUTF8Text(
|
|
IF_PosIStream * pIStream,
|
|
FLMUINT uiFlags, // ICD flags
|
|
FLMUINT uiCompareRules, // ICD compare rules
|
|
F_DynaBuf * pDynaBuf)
|
|
{
|
|
RCODE rc = NE_SFLM_OK;
|
|
FLMUINT uiFirstSpaceCharPos = FLM_MAX_UINT;
|
|
FLMUNICODE uChar;
|
|
FLMUINT uiSize;
|
|
FLMUINT uiStrSize = 0;
|
|
FLMBYTE * pucTmp;
|
|
|
|
if( !pIStream->remainingSize())
|
|
{
|
|
pDynaBuf->truncateData( 0);
|
|
goto Exit;
|
|
}
|
|
|
|
for (;;)
|
|
{
|
|
if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
|
|
{
|
|
if (rc == NE_SFLM_EOF_HIT)
|
|
{
|
|
rc = NE_SFLM_OK;
|
|
break;
|
|
}
|
|
goto Exit;
|
|
}
|
|
if ((uChar = f_convertChar( uChar, uiCompareRules)) == 0)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
if (uChar == ASCII_SPACE)
|
|
{
|
|
if (uiCompareRules &
|
|
(FLM_COMP_COMPRESS_WHITESPACE |
|
|
FLM_COMP_IGNORE_TRAILING_SPACE))
|
|
{
|
|
|
|
// Remember the position of the first space.
|
|
// When we come to the end of the spaces, we may reset
|
|
// the size to compress out spaces if necessary. Or,
|
|
// we may opt to get rid of all of them.
|
|
|
|
if (uiFirstSpaceCharPos == FLM_MAX_UINT)
|
|
{
|
|
uiFirstSpaceCharPos = uiStrSize;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
|
|
// Once we hit a non-space character, we can turn off the
|
|
// ignore leading spaces flag.
|
|
|
|
uiCompareRules &= (~(FLM_COMP_IGNORE_LEADING_SPACE));
|
|
|
|
// See if we need to compress spaces.
|
|
|
|
if (uiFirstSpaceCharPos != FLM_MAX_UINT)
|
|
{
|
|
|
|
// Output exactly one ASCII_SPACE character if we are compressing
|
|
// spaces. If we are not compressing spaces, then the only other
|
|
// way uiFirstSpaceCharPos would have been set is if we were
|
|
// ignoring trailing spaces. In that case, since the spaces
|
|
// were not trailing spaces, we need to leave them as is.
|
|
|
|
if (uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE)
|
|
{
|
|
|
|
// A space will already have been encoded into the string.
|
|
// Since we know a space takes exactly one byte in the UTF8
|
|
// space, we can simply set our pointer one byte past where
|
|
// the last non-space character was found.
|
|
|
|
uiStrSize = uiFirstSpaceCharPos + 1;
|
|
pDynaBuf->truncateData( uiStrSize);
|
|
}
|
|
uiFirstSpaceCharPos = FLM_MAX_UINT;
|
|
}
|
|
|
|
// If we are allowing escaped characters, backslash is treated
|
|
// always as an escape character. Whatever follows the
|
|
// backslash is the character we need to process.
|
|
|
|
if (uChar == ASCII_BACKSLASH && (uiFlags & ICD_ESC_CHAR))
|
|
{
|
|
if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
|
|
{
|
|
if (rc == NE_SFLM_EOF_HIT)
|
|
{
|
|
rc = NE_SFLM_OK;
|
|
}
|
|
else
|
|
{
|
|
goto Exit;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Output the character - need at most three bytes
|
|
|
|
if (RC_BAD( rc = pDynaBuf->allocSpace( 3, (void **)&pucTmp)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
uiSize = 3;
|
|
if (RC_BAD( rc = f_uni2UTF8( uChar, pucTmp, &uiSize)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
uiStrSize += uiSize;
|
|
pDynaBuf->truncateData( uiStrSize);
|
|
}
|
|
|
|
// If uiFirstSpaceCharPos != FLM_MAX_UINT, it means that all of the
|
|
// characters at the end of the string were spaces. If we
|
|
// are ignoring trailing spaces, we need to truncate the string so
|
|
// they will be ignored. Otherwise, we need to compress them into
|
|
// a single space.
|
|
|
|
if (uiFirstSpaceCharPos != FLM_MAX_UINT)
|
|
{
|
|
if (uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE)
|
|
{
|
|
uiStrSize = uiFirstSpaceCharPos;
|
|
}
|
|
else
|
|
{
|
|
flmAssert( uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE);
|
|
|
|
// A space will already have been encoded into the string.
|
|
// Since we know a space takes exactly one byte in the UTF8
|
|
// space, we can simply set our pointer one byte past where
|
|
// the last non-space character was found.
|
|
|
|
uiStrSize = uiFirstSpaceCharPos + 1;
|
|
}
|
|
pDynaBuf->truncateData( uiStrSize);
|
|
}
|
|
|
|
// Terminate the UTF-8 string
|
|
|
|
if (RC_BAD( rc = pDynaBuf->appendByte( 0)))
|
|
{
|
|
goto Exit;
|
|
}
|
|
|
|
Exit:
|
|
|
|
return( rc);
|
|
}
|