Files
mars-flaim/flaim/src/fqtextc.cpp
dsandersoremutah c55dab446f Renamed version4 to flaim and version5 to xflaim
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2006-01-27 21:06:39 +00:00

1425 lines
39 KiB
C++

//-------------------------------------------------------------------------
// Desc: Query text comparison
// Tabs: 3
//
// Copyright (c) 1991-2006 Novell, Inc. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of version 2 of the GNU General Public
// License as published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com
//
// $Id: fqtextc.cpp 12329 2006-01-20 17:49:30 -0700 (Fri, 20 Jan 2006) ahodgkinson $
//-------------------------------------------------------------------------
#include "flaimsys.h"
// From COLTBL.cpp
// Lookup tables defined outside this file.  In this file they are only
// read by flmTextGetSubCol().
extern FLMBYTE fwp_dia60Tbl[]; /* Diacritic conversions */
// Indexed by (character - 165) when computing the sub-collation of an
// Arabic character whose collation value is the alef value.
extern FLMBYTE fwp_alefSubColTbl[];
// Bit table tested for Arabic 2 characters with a value of 64 or above.
extern FLMBYTE fwp_ar2BitTbl[];
// Values for the iCompareType argument of flmTextCompareSingleChar():
//   COMPARE_COLLATION      - only a collation difference makes the routine
//                            return non-zero; sub-collation and case
//                            differences are reported through out parameters.
//   COMPARE_COL_AND_SUBCOL - a sub-collation difference also returns non-zero.
//   COMPARE_VALUE          - the WP character values themselves must be equal.
#define COMPARE_COLLATION 1
#define COMPARE_COL_AND_SUBCOL 2
#define COMPARE_VALUE 3
// Readable names for passing NULL as the piSubColCompare, piCaseCompare and
// pbHitWildCard arguments of flmTextCompareSingleChar().
#define NULL_SUB_COL_CHECK NULL
#define NULL_CASE_CHECK NULL
#define NULL_WILD_CARD_CHECK NULL
// Compares the leading character of the left and right text; on an equal
// result both sides are advanced.  Defined later in this file.
FSTATIC FLMINT flmTextCompareSingleChar(
FLMBYTE ** ppLeftText,
FLMUINT * puiLeftLen,
FLMUINT * puiLeftWpChar2,
FLMBYTE ** ppRightText,
FLMUINT * puiRightLen,
FLMUINT * puiRightWpChar2,
FLMINT * piSubColCompare,
FLMINT * piCaseCompare,
FLMBOOL * pbHitWildCard,
FLMINT iCompareType,
FLMUINT16 * pui16ColVal,
FLMUINT uiFlags,
FLMUINT uiLangId);
// Returns the sub-collation value of a WP character.  Defined later in
// this file.
FSTATIC FLMUINT16 flmTextGetSubCol(
FLMUINT16 ui16WPValue,
FLMUINT16 ui16ColValue,
FLMUINT uiLangId);
/****************************************************************************
Desc:		Classify a 7-bit character for word parsing.
Return:	SDWD_CHR for ASCII letters and digits,
			WDJN_CHR for the apostrophe (0x27) and for , - . / : @ \ _
			DELI_CHR for everything else (including every other value that
			is 0x2B or below).
****************************************************************************/
FINLINE FLMUINT flmCharTypeAnsi7(
	FLMUINT16	ui16Char)
{
	// Anything not recognized below is a delimiter.
	FLMUINT	uiType = DELI_CHR;

	if( (ui16Char >= ASCII_UPPER_A && ui16Char <= ASCII_UPPER_Z) ||
		 (ui16Char >= ASCII_LOWER_A && ui16Char <= ASCII_LOWER_Z) ||
		 (ui16Char >= ASCII_ZERO && ui16Char <= ASCII_NINE))
	{
		uiType = SDWD_CHR;
	}
	else if( ui16Char == 0x27)
	{
		uiType = WDJN_CHR;
	}
	else if( ui16Char > 0x2B)
	{
		// Values of 0x2B and below were already classified as delimiters,
		// so the joiner set is only consulted above that boundary.
		switch( ui16Char)
		{
			case ASCII_COMMA:
			case ASCII_DASH:
			case ASCII_DOT:
			case ASCII_SLASH:
			case ASCII_COLON:
			case ASCII_AT:
			case ASCII_BACKSLASH:
			case ASCII_UNDERSCORE:
				uiType = WDJN_CHR;
				break;
			default:
				break;
		}
	}

	return uiType;
}
/*API~***********************************************************************
Name : FlmStrCmp
Desc : Compare two unicode strings. This comparison uses the collation
		 rules that are defined for the specified language.  Both strings
		 are first stored into temporary nodes with GedPutUNICODE and the
		 stored values are then compared with flmTextCompare.
Parms: uiCompFlags - comparison flags, passed through to flmTextCompare.
		 byLang      - language, passed through to flmTextCompare.
		 uzStr1      - left string.
		 uzStr2      - right string.
Return: Signed value of compare.
		 <0 if less than, 0 if equal, >0 if greater than
		 Special cases decided before flmTextCompare is called:
			both stored values empty      ->  0
			only string 1 empty           ->  1
			only string 2 empty           -> -1
			node allocation failure       ->  1
			storing string 1 fails        ->  1
			storing string 2 fails        -> -1
Notes: The temporary pool is released on every path.
*END************************************************************************/
FLMINT FlmStrCmp(
	FLMUINT					uiCompFlags,
	FLMUINT					byLang,
	const FLMUNICODE *	uzStr1,
	const FLMUNICODE *	uzStr2)
{
	FLMINT	iCmp;
	POOL		Pool;
	NODE *	pNd1;
	NODE *	pNd2;
	RCODE		rc;

	GedPoolInit( &Pool, 256);

	if( (pNd1 = GedNodeMake( &Pool, 1, &rc)) == NULL ||
		 (pNd2 = GedNodeMake( &Pool, 1, &rc)) == NULL)
	{
		flmAssert( 0);
		iCmp = 1;
		goto Exit;
	}

	if( RC_BAD( rc = GedPutUNICODE( &Pool, pNd1, uzStr1)))
	{
		flmAssert( RC_OK( rc));
		iCmp = 1;
		goto Exit;
	}

	if( RC_BAD( rc = GedPutUNICODE( &Pool, pNd2, uzStr2)))
	{
		flmAssert( RC_OK( rc));
		iCmp = -1;
		goto Exit;
	}

	// Handle null string cases.

	if( GedValLen( pNd1) == 0)
	{
		// Two empty strings are equal.  Previously this case returned 1,
		// which broke the "0 if equal" contract.  The result for a single
		// empty string is left exactly as it was.

		iCmp = (GedValLen( pNd2) == 0) ? 0 : 1;
		goto Exit;
	}
	else if( GedValLen( pNd2) == 0)
	{
		iCmp = -1;
		goto Exit;
	}

	// VISIT: need to add support for the IGNORE_DASH and IGNORE_SPACE options.

	iCmp = flmTextCompare( (FLMBYTE *)GedValPtr( pNd1), GedValLen( pNd1),
		(FLMBYTE *)GedValPtr( pNd2), GedValLen( pNd2), uiCompFlags, byLang);

Exit:

	GedPoolFree( &Pool);
	return iCmp;
}
/****************************************************************************
Desc:		Compare two complete text values, character by character, using
			flmTextCompareSingleChar in COMPARE_COLLATION mode.
			THIS DOES NOT ALLOW WILD CARDS.
Parms:	pLeftBuf/uiLeftLen   - left text and its length in bytes.
			pRightBuf/uiRightLen - right text and its length in bytes.
			uiFlags              - FLM_* flags.  FLM_MIN_SPACES makes leading and
										  trailing spaces insignificant; FLM_NOCASE turns
										  off the case tie-breaker.
			uiLang               - language id passed to the character compare.
Return:	Signed value of compare.
			<0 if less than, 0 if equal, >0 if greater than.
			A NULL left buffer is less than a non-NULL right buffer and equal
			to a NULL right buffer.
			When every collation value is equal, the first sub-collation
			difference seen decides; if there was none, the first case
			difference decides.
****************************************************************************/
FLMINT flmTextCompare(
	FLMBYTE *	pLeftBuf,
	FLMUINT		uiLeftLen,
	FLMBYTE *	pRightBuf,
	FLMUINT		uiRightLen,
	FLMUINT		uiFlags,
	FLMUINT		uiLang)
{
	FLMINT		iSubColDiff = 0;		// Filled in by the character compare
	FLMINT		iCaseDiff = 0;			// Filled in by the character compare
	FLMINT *		pCaseDiff = &iCaseDiff;
	FLMINT		iCharResult;
	FLMUINT		uiEdgeSpaceFlag = 0;
	FLMUINT		uiCharFlags;
	FLMUINT16	ui16AsianColVal = 0;	// Needed for asian collation
	FLMUINT16	ui16TailWP;
	FLMUINT16	ui16TailUni;
	FLMUINT		uiLeftPending = 0;
	FLMUINT		uiRightPending = 0;

	if (uiFlags & FLM_MIN_SPACES)
	{
		uiEdgeSpaceFlag = FLM_NO_SPACE;
	}

	if (uiFlags & FLM_NOCASE)
	{
		pCaseDiff = NULL;
	}

	// NULL buffers are settled before any text is examined.

	if (!pLeftBuf)
	{
		return pRightBuf ? -1 : 0;
	}

	// Only the very first character is fetched with the edge-space flag
	// added (this drops leading spaces); afterwards the caller's flags are
	// used unchanged.

	uiCharFlags = uiFlags | uiEdgeSpaceFlag;

	while ((uiLeftLen || uiLeftPending) &&
			 (uiRightLen || uiRightPending))
	{
		iCharResult = flmTextCompareSingleChar(
							&pLeftBuf, &uiLeftLen, &uiLeftPending,
							&pRightBuf, &uiRightLen, &uiRightPending,
							&iSubColDiff, pCaseDiff, NULL_WILD_CARD_CHECK,
							COMPARE_COLLATION, &ui16AsianColVal,
							uiCharFlags, uiLang);

		if (iCharResult != 0)
		{
			// Collation values differ - nothing else matters.

			return iCharResult;
		}

		uiCharFlags = uiFlags;
	}

	// Collation values were equal up to the end of at least one side.
	// The side with text remaining is greater, unless the remainder
	// produces no character once the edge-space flag is applied.

	if (uiLeftLen || uiLeftPending)
	{
		uiLeftLen -= flmTextGetValue( pLeftBuf, uiLeftLen, &uiLeftPending,
								uiFlags | uiEdgeSpaceFlag,
								&ui16TailWP, &ui16TailUni);

		if (uiLeftLen || ui16TailWP || ui16TailUni)
		{
			return 1;
		}
	}
	else if (uiRightLen || uiRightPending)
	{
		uiRightLen -= flmTextGetValue( pRightBuf, uiRightLen, &uiRightPending,
								uiFlags | uiEdgeSpaceFlag,
								&ui16TailWP, &ui16TailUni);

		if (uiRightLen || ui16TailWP || ui16TailUni)
		{
			return -1;
		}
	}

	// All collation values equal - fall back to sub-collation, then case.

	return (iSubColDiff != 0) ? iSubColDiff : iCaseDiff;
}
/****************************************************************************
Desc: Match two entire strings.
Return: FLM_TRUE or FLM_FALSE
Notes: This code calls the collation routine because in the future there
will be equal conditions with different unicode characters.
DOCUMENTATION DEALING WITH WILD CARDS AND SPACE RULES.
The space rules are not obvious when dealing with wild cards.
This will outline the rules that are being applied so that we can
do a regression test when this code changes.
Rule #1: Return same result if leading or trailing wild card is added.
The underscore is also the space character in these examples
and the MIN_SPACES rule is being applied.
Format: DataString Operator SearchString
Example: if A == A A_ == A A == A_ A_ == A_
then A == A* A_ == A* A == A_* A_ == A_*
and A == *A A_ == *A A == *A_ A_ == *A_
and A == *A* A_ == *A* A == *A_* A_ == *A_*
where 'A' represent a string of any characters.
Strictly put, the query Field == A_* can be broken down to
Field == A || Field == A_*
where the space after 'A' should not be treated as a trailing space.
In addition we can apply the space before the string with the same results,
but we are not going to handle the case of *_A correctly.
This is because the query *_A should be expanded to
Field == A || Field == *_A
where the space before 'A' should not be treated as a leading space.
When we need to find "_A" in a search string then we will expand the
query to handle this.
Rule #2: The spaces before a trailing truncation are NOT to be treated
as trailing spaces if there are remaining bytes in the data string.
Example: (A_B == A_*) but (AB != A_*)
Rule #3: Space value(s) without any other value are equal to no value.
Example: (" " == "")
Rule #4: Trim leading/trailing spaces before and after wild cards.
SMI does this when formatting.
_* and *_ same as * so A == _* and A = *_ but A != *_*
Additional wildcard cases to test for:
Wildcard cases to handle.
(ABBBBC == A*BC) Hits the goto Compare_Again case three times.
(ABBBBD != A*B) Stuff still remains in dataString
(ABBBBC != A*BCD) Stuff still remains in searchString
****************************************************************************/
// Tests whether the left (data) text matches the right (search) text.
// Returns FLM_TRUE when the final compare value is zero, else FLM_FALSE.
// The order of a mismatch is not reported; only match / no match.
FLMUINT flmTextMatch(
FLMBYTE * pLeftBuf, // [in] Left (data) text; may be NULL.
FLMUINT uiLeftLen, // [in] Bytes in the left text.
FLMBYTE * pRightBuf, // [in] Right (search) text; wild cards are only looked for here.
FLMUINT uiRightLen, // [in] Bytes in the right text.
FLMUINT uiFlags, // [in] FLM_* flags. This routine tests FLM_COMPARE_COLLATED_VALUES,
// FLM_NOCASE, FLM_MIN_SPACES and FLM_WILD; the rest are passed on.
FLMBOOL bLeadingWildCard, // [in] TRUE: start as if a '*' preceded the right text.
FLMBOOL bTrailingWildCard, // [in] TRUE: left text remaining after the right text
// is used up does not cause a mismatch.
FLMUINT uiLang) // [in] Language id passed to the character compare.
{
FLMINT iCompare = 0;
FLMUINT uiLeadingSpace;
FLMUINT uiTrailingSpace;
FLMBOOL bHitWildCard;
FLMBOOL bHasWildCardPos;
FLMBOOL * pbHitWildCard;
FLMUINT uiValueLen;
FLMUINT16 ui16WPChar;
FLMUINT16 ui16UniChar;
FLMUINT16 ui16Tmp1;
FLMUINT16 ui16Tmp2;
FLMINT iCompareType;
FLMUINT uiLeftWpChar2 = 0;
FLMUINT uiRightWpChar2 = 0;
// LWCP = Last Wild Card Position - used for wild card state
FLMBYTE * pLWCPLeftBuf = NULL;
FLMBYTE * pLWCPRightBuf = NULL;
FLMUINT uiLWCPLeftLen = 0;
FLMUINT uiLWCPRightLen = 0;
FLMUINT uiLWCPLeftWpChar2 = 0;
FLMUINT uiLWCPRightWpChar2 = 0;
// Pick how strict the per-character compare is: collation only when
// collated values were requested, otherwise collation + sub-collation
// for a case-insensitive match or exact WP values for a case-sensitive one.
if( uiFlags & FLM_COMPARE_COLLATED_VALUES)
{
iCompareType = COMPARE_COLLATION;
}
else
{
iCompareType = (uiFlags & FLM_NOCASE)
? COMPARE_COL_AND_SUBCOL : COMPARE_VALUE;
}
// Handle NULL buffers first - don't test for zero length values yet.
// A NULL left buffer only matches a NULL right buffer.
if (!pLeftBuf)
{
if (pRightBuf)
{
iCompare = -1;
}
goto Exit;
}
bHitWildCard = bHasWildCardPos = FALSE;
// With FLM_MIN_SPACES the first and last characters are fetched with
// FLM_NO_SPACE added, which drops leading / trailing spaces.
uiLeadingSpace = uiTrailingSpace =
(uiFlags & FLM_MIN_SPACES) ? FLM_NO_SPACE : 0;
// Wild cards in the right text are only recognized when FLM_WILD is set.
pbHitWildCard = (uiFlags & FLM_WILD) ? &bHitWildCard : NULL;
if (bLeadingWildCard)
{
// Jumps INTO the loop body below, straight to the wild card handling.
goto Leading_Wild_Card;
}
while (!iCompare &&
(uiLeftLen || uiLeftWpChar2) &&
(uiRightLen || uiRightWpChar2))
{
iCompare = flmTextCompareSingleChar(
&pLeftBuf, &uiLeftLen, &uiLeftWpChar2,
&pRightBuf, &uiRightLen, &uiRightWpChar2,
NULL_SUB_COL_CHECK, NULL_CASE_CHECK, pbHitWildCard,
iCompareType, NULL,
uiFlags | uiLeadingSpace, uiLang);
// Only the first character is fetched with the leading-space flag.
uiLeadingSpace = 0;
if (bHitWildCard)
{
Leading_Wild_Card:
bHitWildCard = FALSE;
bHasWildCardPos = FALSE; // Turn off last wildcard.
// If right side is done, we are done.
// (The wild card was last, so whatever remains on the left matches;
// the left side is marked as fully consumed.)
if (!uiRightLen && !uiRightWpChar2)
{
uiLeftLen = 0;
uiLeftWpChar2 = 0;
break;
}
// Save state on the RIGHT to handle the sick case of search key
// "b*aH" being able to match "baaaaaaaaaH" (Lambda Case)
// LWCP = LastWildCardPosition
pLWCPRightBuf = pRightBuf;
uiLWCPRightLen = uiRightLen;
uiLWCPRightWpChar2 = uiRightWpChar2;
// Find first matching character on the left side.
// This label is also the target of the retry at the bottom of the
// function, which jumps back into this loop body from outside it.
Compare_Again:
iCompare = -1;
while (iCompare && (uiLeftLen || uiLeftWpChar2))
{
// Wild card checking is off here: the character following the
// '*' is compared as an ordinary character.
iCompare = flmTextCompareSingleChar(
&pLeftBuf, &uiLeftLen, &uiLeftWpChar2,
&pRightBuf, &uiRightLen, &uiRightWpChar2,
NULL_SUB_COL_CHECK, NULL_CASE_CHECK, NULL_WILD_CARD_CHECK,
iCompareType, NULL,
uiFlags | uiLeadingSpace, uiLang);
uiLeadingSpace = 0;
// Done with the right side? Return iCompare value.
if (!uiRightLen && !uiRightWpChar2)
{
break;
}
// Values different and still have stuff on left?
if (iCompare && (uiLeftLen || uiLeftWpChar2))
{
// Advance the left if there is anything left
// (the character compare only advances on an equal result, so a
// mismatched left character has to be stepped over here).
uiValueLen = flmTextGetValue( pLeftBuf, uiLeftLen,
&uiLeftWpChar2,
uiFlags, &ui16Tmp1, &ui16Tmp2);
pLeftBuf += uiValueLen;
uiLeftLen -= uiValueLen;
}
}
// Save state on the LEFT
// (together with the saved right state this is the retry point used
// when the text after the wild card later turns out not to match).
if (uiLeftLen || uiLeftWpChar2)
{
pLWCPLeftBuf = pLeftBuf;
uiLWCPLeftLen = uiLeftLen;
uiLWCPLeftWpChar2 = uiLeftWpChar2;
bHasWildCardPos = TRUE;
}
// EQUAL - as far as the collation values are concerned.
}
}
if (iCompare == 0)
{
// In here because LEFT and/or RIGHT are out of bytes.
// Check for trailing spaces if MIN_SPACES.
if (uiLeftLen || uiLeftWpChar2)
{
// Left text remains.  It is a mismatch unless a trailing wild card
// was requested or the remainder yields no character.
if (!bTrailingWildCard)
{
uiLeftLen -= flmTextGetValue( pLeftBuf, uiLeftLen,
&uiLeftWpChar2,
uiFlags | uiTrailingSpace, &ui16WPChar,
&ui16UniChar);
if (uiLeftLen || ui16WPChar || ui16UniChar)
{
iCompare = 1;
}
}
}
else if (uiRightLen || uiRightWpChar2)
{
uiRightLen -= flmTextGetValue( pRightBuf, uiRightLen, &uiRightWpChar2,
uiFlags | uiTrailingSpace, &ui16WPChar, &ui16UniChar);
// Equals if right just had a trailing wild card. (else case)
// i.e. a single remaining '*' with wild cards enabled still matches.
if (uiRightLen || !pbHitWildCard || ui16WPChar != '*')
{
if (uiRightLen || ui16WPChar || ui16UniChar)
{
iCompare = -1;
}
}
}
}
// Handle the embedded wild card case.
// A mismatch after a wild card is retried from the saved position: the
// right side restarts just after the wild card and the left side resumes
// where the previous candidate match ended.
if (iCompare != 0 && bHasWildCardPos)
{
// Restore wild card state.
pLeftBuf = pLWCPLeftBuf;
uiLeftLen = uiLWCPLeftLen;
uiLeftWpChar2 = uiLWCPLeftWpChar2;
pRightBuf = pLWCPRightBuf;
uiRightLen = uiLWCPRightLen;
uiRightWpChar2 = uiLWCPRightWpChar2;
bHasWildCardPos = FALSE;
goto Compare_Again;
}
Exit:
return (!iCompare ? FLM_TRUE : FLM_FALSE);
}
/****************************************************************************
Desc:		Compare only the leading left and right characters according
			to the many flags that are passed in.  This routine operates
			to save and set state for the calling routine: when the result is
			equal (0) both text pointers, both remaining lengths and both
			pending double characters are advanced; when the result is
			non-zero none of them is changed.
TODO:		(original author's note)
			This routine does NOT support Asian, Hebrew, or Arabic language
			collations.  In addition, fwpCheckDoubleCollation() is not called
			for US_LANG.  There is still a lot of work to do!  This is our
			default US compare and it is not very good for JP.
Return:	Signed value of compare.
			<0 if less than, 0 if equal, >0 if greater than.
			In the COMPARE_VALUE mode, and for most COMPARE_COL_AND_SUBCOL
			mismatches, a mismatch is reported as -1 regardless of order;
			callers in those modes only test for zero / non-zero.
Asian Notes:
			The asian compare takes two characters and may use one or both.
			This makes the algorithm complex so we may have to build full
			tests to see what we broke.
NDS Notes:
			The right side (search string) is already formatted according
			to the space/dash rules of the syntax.
Safety:	Every look-ahead at the byte(s) following the current character
			is bounded by the remaining length of that side.
****************************************************************************/
FSTATIC FLMINT flmTextCompareSingleChar(
	FLMBYTE **	ppLeftText,			// [in] Points to current value.
											// [out] Points to next character if equals.
	FLMUINT *	puiLeftLen,			// [in] Bytes remaining in text string.
											// [out] Bytes remaining in text string.
	FLMUINT *	puiLeftWpChar2,	// Second left character - for double characters
	FLMBYTE **	ppRightText,		// [in] Points to current value.
											// [out] Points to next character if equals.
	FLMUINT *	puiRightLen,		// [in] Bytes remaining in text string.
											// [out] Bytes remaining in text string.
	FLMUINT *	puiRightWpChar2,	// Second right character - for double characters.
	FLMINT *		piSubColCompare,	// [in] If NULL disregard the subcollation
											// values if collation values are equal.
											// [out] If equals is returned, value is
											// set ONLY if the signed value of comparing
											// the sub-collation values is not equal.
											// See lengthy unicode compare below.
	FLMINT *		piCaseCompare,		// [in] If NULL disregard the case bits
											// if collation values are equal.  Japanese
											// values are an exception to this rule.
											// [out] If equals is returned, value is
											// set ONLY if the signed value of comparing
											// the case values is not equal.
	FLMBOOL *	pbHitWildCard,		// [in] If NULL then do not look for wild
											// cards in the right text string.
											// [out] If non-null, a wild card (*) will
											// be looked for on the RIGHT SIDE ONLY.
											// If '*' is found, this value will be set
											// to TRUE and the right side is advanced.
											// If no wild card is found the value will
											// not be set.
	FLMINT		iCompareType,		// COMPARE_COLLATION, COMPARE_COL_AND_SUBCOL, COMPARE_VALUE
	FLMUINT16 *	pui16ColVal,		// Needed for asian collation compare.
	FLMUINT		uiFlags,				// FLM_* flags
	FLMUINT		uiLangId)			// FLAIM/WordPerfect Language id.
{
	FLMBYTE *	pLeftText = *ppLeftText;
	FLMBYTE *	pRightText = *ppRightText;
	FLMINT		iCompare = 0;
	FLMUINT		uiRightFlags = uiFlags;
	FLMUINT16	ui16LeftWPChar;
	FLMUINT16	ui16LeftUniChar;
	FLMUINT16	ui16RightWPChar;
	FLMUINT16	ui16RightUniChar;
	FLMUINT		uiLeftValueLen;
	FLMUINT		uiRightValueLen;
	FLMUINT16	ui16LeftCol;
	FLMUINT16	ui16RightCol;
	FLMUINT		uiLeftWpChar2 = *puiLeftWpChar2;
	FLMUINT		uiRightWpChar2 = *puiRightWpChar2;

	// Only assigned by fwpCheckDoubleCollation() (non-US languages);
	// initialized so they are never read while indeterminate.

	FLMBOOL		bLeftTwoIntoOne = FALSE;
	FLMBOOL		bRightTwoIntoOne = FALSE;

	// Get the next character from the TEXT string.  NOTE: OEM characters
	// will be returned as a UNICODE character.  A unicode character here
	// is a value that cannot be converted to the WP set (no good collation value).

	uiLeftValueLen = flmTextGetValue( pLeftText, *puiLeftLen, &uiLeftWpChar2,
								uiFlags, &ui16LeftWPChar, &ui16LeftUniChar);
	uiRightValueLen = flmTextGetValue( pRightText, *puiRightLen, &uiRightWpChar2,
								uiRightFlags, &ui16RightWPChar, &ui16RightUniChar);

	// At this point, the double character, if any, should have been consumed.

	flmAssert( !uiLeftWpChar2 && !uiRightWpChar2);

	// Check for the following escape characters: "\\" "*" and "\\" "\\"

	if( ui16RightWPChar == ASCII_BACKSLASH)
	{
		// Only look at the escaped byte if one actually remains; a
		// backslash that ends the right text is an ordinary backslash.

		if( uiRightValueLen < *puiRightLen)
		{
			if( pRightText[ uiRightValueLen] == ASCII_BACKSLASH)
			{
				uiRightValueLen++;
			}
			else if( pRightText[ uiRightValueLen] == ASCII_WILDCARD)
			{
				ui16RightWPChar = ASCII_WILDCARD;
				uiRightValueLen++;
			}
		}
	}

	// Checking for wild cards in the right string? (Always a WP character)

	else if( pbHitWildCard)
	{
		// The '*' wildcard means to match zero or many characters.
		// The sick case of "A*B" compared to "A**B" should be considered.

		if( ui16RightWPChar == ASCII_WILDCARD)
		{
			// Eat all duplicate wild cards - without running past the end
			// of the right text.

			while( uiRightValueLen < *puiRightLen &&
					 pRightText[ uiRightValueLen] == ASCII_WILDCARD)
			{
				uiRightValueLen++;
			}

			// Advance the right value.  Keep left value alone.
			// Return equals (default).

			*pbHitWildCard = TRUE;

			// Don't advance the left value.

			uiLeftValueLen = 0;
			uiLeftWpChar2 = *puiLeftWpChar2;
			goto Exit;
		}
	}

	// First section is to compare just WP values.

	if( ui16LeftWPChar && ui16RightWPChar)
	{
		FLMUINT16	ui16LeftSubCol;
		FLMUINT16	ui16RightSubCol;

		if (iCompareType == COMPARE_VALUE)
		{
			// Check the obvious case of equal WP values.

			if( ui16LeftWPChar != ui16RightWPChar)
			{
				iCompare = -1;
			}
			goto Exit;
		}

		// JP compare code.

		if (uiLangId >= FIRST_DBCS_LANG && uiLangId <= LAST_DBCS_LANG)
		{
			FLMUINT		uiNextLeftLen;
			FLMUINT		uiNextRightLen;
			FLMUINT16	ui16NextLeftWPChar;
			FLMUINT16	ui16NextRightWPChar;
			FLMUINT16	ui16ColVal = pui16ColVal ? *pui16ColVal : 0;
			FLMBYTE		ucLeftCaseValue;
			FLMBYTE		ucRightCaseValue;

			// Should have already consumed double character, if any

			flmAssert( !uiLeftWpChar2 && !uiRightWpChar2);

			// Peek at the character after the current one on each side.
			// The length handed to flmTextGetValue must be what remains
			// AFTER the current character, otherwise the peek can read
			// past the end of the text.

			uiNextLeftLen = flmTextGetValue( pLeftText + uiLeftValueLen,
									(*puiLeftLen > uiLeftValueLen)
										? (*puiLeftLen - uiLeftValueLen)
										: 0,
									&uiLeftWpChar2, uiFlags,
									&ui16NextLeftWPChar, &ui16LeftUniChar);
			uiNextRightLen = flmTextGetValue( pRightText + uiRightValueLen,
									(*puiRightLen > uiRightValueLen)
										? (*puiRightLen - uiRightValueLen)
										: 0,
									&uiRightWpChar2, uiFlags,
									&ui16NextRightWPChar, &ui16RightUniChar);

			// nextL/R WPChar may be zero.
			// A return of 2 means the collation used both characters.

			if (fwpAsiaGetCollation( ui16LeftWPChar, ui16NextLeftWPChar, ui16ColVal,
					&ui16LeftCol, &ui16LeftSubCol, &ucLeftCaseValue, FALSE) == 2)
			{
				uiLeftValueLen += uiNextLeftLen;
			}
			if (fwpAsiaGetCollation( ui16RightWPChar, ui16NextRightWPChar, ui16ColVal,
					&ui16RightCol, &ui16RightSubCol, &ucRightCaseValue, FALSE) == 2)
			{
				uiRightValueLen += uiNextRightLen;
			}

			// Compare all of the stuff now.

			if (ui16LeftCol == ui16RightCol)
			{
				if( (iCompareType == COMPARE_COL_AND_SUBCOL) ||
					 (piSubColCompare && (*piSubColCompare == 0)))
				{
					if( ui16LeftSubCol != ui16RightSubCol)
					{
						if( iCompareType == COMPARE_COL_AND_SUBCOL)
						{
							iCompare = -1;
							goto Exit;
						}

						// At this point piSubColCompare cannot be NULL.

						*piSubColCompare = (ui16LeftSubCol < ui16RightSubCol) ? -1 : 1;

						// Write over the case compare value

						if( piCaseCompare )
						{
							*piCaseCompare = *piSubColCompare;
						}
					}
				}
				if (iCompareType != COMPARE_COL_AND_SUBCOL)
				{
					// Check case?

					if (piCaseCompare && (*piCaseCompare == 0))
					{
						if( ucLeftCaseValue != ucRightCaseValue)
						{
							*piCaseCompare = ucLeftCaseValue < ucRightCaseValue ? -1 : 1;
						}
					}
				}
			}
			else
			{
				iCompare = (ui16LeftCol < ui16RightCol) ? -1 : 1;
			}
			goto Exit;
		}

		flmAssert( !uiLeftWpChar2 && !uiRightWpChar2);

		if (uiLangId != US_LANG)
		{
			const FLMBYTE *	pucTmp;

			// fwpCheckDoubleCollation may change the WP character, move
			// pucTmp, and return a second character to be compared on the
			// next call; the consumed length is recomputed from pucTmp.

			pucTmp = pLeftText + uiLeftValueLen;
			uiLeftWpChar2 = fwpCheckDoubleCollation( &ui16LeftWPChar, &bLeftTwoIntoOne,
										&pucTmp, uiLangId);
			uiLeftValueLen = (FLMUINT)(pucTmp - pLeftText);

			pucTmp = pRightText + uiRightValueLen;
			uiRightWpChar2 = fwpCheckDoubleCollation( &ui16RightWPChar, &bRightTwoIntoOne,
										&pucTmp, uiLangId);
			uiRightValueLen = (FLMUINT)(pucTmp - pRightText);

			// See if we got the same double character

			if (uiLeftWpChar2 == uiRightWpChar2 &&
				 ui16LeftWPChar == ui16RightWPChar)
			{
				uiLeftWpChar2 = 0;
				uiRightWpChar2 = 0;
				goto Exit;
			}
		}
		else if (ui16LeftWPChar == ui16RightWPChar)
		{
			// Same WP character

			goto Exit;
		}

		ui16LeftCol = fwpGetCollation( ui16LeftWPChar, uiLangId);

		// Handle two characters collating as one.

		if (uiLeftWpChar2 && bLeftTwoIntoOne)
		{
			ui16LeftCol++;
		}

		ui16RightCol = fwpGetCollation( ui16RightWPChar, uiLangId);

		// Handle two characters collating as one.

		if (uiRightWpChar2 && bRightTwoIntoOne)
		{
			ui16RightCol++;
		}

		if( ui16LeftCol == ui16RightCol)
		{
			// Should we bother to check subcollation? - don't bother with 7-bit

			if( ( (iCompareType == COMPARE_COL_AND_SUBCOL)
				|| (piSubColCompare && (*piSubColCompare == 0)))
			 && ((ui16LeftWPChar | ui16RightWPChar) & 0xFF00))	// Non-ascii
			{
				ui16LeftSubCol = flmTextGetSubCol( ui16LeftWPChar,
											ui16LeftCol, uiLangId);
				ui16RightSubCol = flmTextGetSubCol( ui16RightWPChar,
											ui16RightCol, uiLangId);

				if (!piCaseCompare)
				{
					// If the sub-collation value is the original
					// character, it means that the collation could not
					// distinguish the characters and sub-collation is being
					// used to do it.  However, this creates a problem when the
					// characters are the same character except for case.  In that
					// scenario, we incorrectly return a not-equal when we are
					// doing a case-insensitive comparison.  So, at this point,
					// we need to use the sub-collation for the upper-case of the
					// character instead of the sub-collation for the character
					// itself.

					if (ui16LeftSubCol == ui16LeftWPChar)
					{
						ui16LeftSubCol = flmTextGetSubCol(
													fwpCh6Upper( ui16LeftWPChar),
													ui16LeftCol, uiLangId);
					}
					if (ui16RightSubCol == ui16RightWPChar)
					{
						ui16RightSubCol = flmTextGetSubCol(
													fwpCh6Upper( ui16RightWPChar),
													ui16RightCol, uiLangId);
					}
				}

				// YES - go for it...

				if( ui16LeftSubCol != ui16RightSubCol)
				{
					if( iCompareType == COMPARE_COL_AND_SUBCOL)
					{
						iCompare = (ui16LeftSubCol < ui16RightSubCol) ? -1 : 1;
						goto Exit;
					}

					// At this point piSubColCompare cannot be NULL.

					*piSubColCompare = (ui16LeftSubCol < ui16RightSubCol) ? -1 : 1;

					/* Write over the case compare value */

					if( piCaseCompare )
					{
						*piCaseCompare = *piSubColCompare;
					}
				}
				// ? goto Exit???
			}

			if( iCompareType == COMPARE_COL_AND_SUBCOL)
			{
				goto Exit;
			}

			// Check case?

			if( piCaseCompare && (*piCaseCompare == 0))
			{
				// fwpIsUpper() only returns FALSE (lower) or TRUE (not-lower)

				FLMBOOL	bLeftUpper = fwpIsUpper( ui16LeftWPChar);
				FLMBOOL	bRightUpper = fwpIsUpper( ui16RightWPChar);

				if (bLeftUpper != bRightUpper)
				{
					*piCaseCompare = !bLeftUpper ? -1 : 1;
				}
				// ? else - don't know why they would be the same.
			}
		}
		else
		{
			iCompare = (ui16LeftCol < ui16RightCol) ? -1 : 1;
		}
		goto Exit;
	}	// end of working with BOTH WP characters

	/*else*/
	if( ui16LeftUniChar && ui16RightUniChar)
	{
		// Compare two (non-convertable) UNICODE values.
		// Check the obvious case of equal UNICODE values.

		if( ui16LeftUniChar == ui16RightUniChar)
		{
			goto Exit;
		}

		// Compare subcollation or compare value?

		if( iCompareType != COMPARE_COLLATION)
		{
			iCompare = -1;
			goto Exit;
		}

		/*
		For non-asian - we store these values in the sub-collation area.
		We should return the difference in sub-collation values - but this
		may not work for all compares.

		For asian compares, most values we have a collation value.
		This is a BIG difference in comparing asian values.
		If we want sub-collation compare then set it, otherwise set main
		iCompare value.
		*/

		if( piSubColCompare )
		{
			if( *piSubColCompare == 0)
			{
				*piSubColCompare = ui16LeftUniChar < ui16RightUniChar ? -1 : 1;
			}
		}
		else
		{
			// Treat as the collation value - this is different than the index.

			iCompare = ui16LeftUniChar < ui16RightUniChar ? -1 : 1;
		}
		goto Exit;
	}

	/*else*/
	// Compare subcollation or compare value?

	if( iCompareType != COMPARE_COLLATION)
	{
		iCompare = -1;
		goto Exit;
	}

	// Check for no left character.

	if( !ui16LeftWPChar && !ui16LeftUniChar)
	{
		// No left character.  check if no right character.

		if( ui16RightWPChar || ui16RightUniChar)
		{
			iCompare = -1;
		}
		/* else returns equals. */
	}

	// Check for no right character.

	else if( !ui16RightWPChar && !ui16RightUniChar)
	{
		iCompare = 1;
	}

	/*
	What remains is one WP char and one Unicode char.
	Remember the sub-collation comment above.  Some WP char may not
	have a collation value (COLS0) so in US sort these values may be
	equal and have different sub-collation values.  YECH!!!!

	The unicode value will always have collation value of COLS0 (0xFF)
	and subcollation value of 11110 [unicodeValue]
	The WP value could be anything & if collation value is COLS0 will
	have a subcollation value of 1110 [WPValue]

	So, we have to check to see if the WP collation value is COLS0.
	If not iCompare is used.  If both represent high collation then
	the WP value will always have a lower sub-collation value.

	The (not so obvious) code would be to code up...
		iCompare = ui16LeftWPChar ? -1 : 1;
	if we didn't care about sub-collation (and we may not care).
	This is easier to over code than have ?: operators for the two cases.
	*/

	else if( ui16LeftWPChar)
	{
		// Remember - unicode subcol is always COLS0.

		if( fwpGetCollation( ui16LeftWPChar, uiLangId) == COLS0)
		{
			if( piSubColCompare && (*piSubColCompare == 0))
			{
				*piSubColCompare = -1;
			}
		}
		else
		{
			iCompare = -1;
		}
	}
	else
	{
		// left=unicode, right=WP
		// Remember - unicode subcol is always COLS0 for non-asian.

		if( fwpGetCollation( ui16RightWPChar, uiLangId) == COLS0)
		{
			if( piSubColCompare && (*piSubColCompare == 0))
			{
				*piSubColCompare = 1;
			}
		}
		else
		{
			iCompare = 1;
		}
	}

Exit:

	if( !iCompare )
	{
		// Position to the next values if equal

		*puiLeftLen -= uiLeftValueLen;
		*ppLeftText = pLeftText + uiLeftValueLen;
		*puiLeftWpChar2 = uiLeftWpChar2;
		*puiRightLen -= uiRightValueLen;
		*ppRightText = pRightText + uiRightValueLen;
		*puiRightWpChar2 = uiRightWpChar2;
	}
	return iCompare;
}
/****************************************************************************
Desc:		Fetch the next character with flmTextGetValue (using
			FLM_MIN_SPACES) and classify it for word parsing.
Parms:	pText/uiLen  - text to read from and bytes remaining.
			pui16WPValue - [out] WP character value, or 0.
			puzUniValue  - [out] unicode value when no WP value was produced.
			pType        - [out] SDWD_CHR, WDJN_CHR or DELI_CHR.
Return:	Number of bytes flmTextGetValue consumed.
Notes:	WP values below 0x80 are classified by flmCharTypeAnsi7.  Other
			WP values are SDWD_CHR when their character set (high byte) is
			1, 2 or 8 through 11, otherwise DELI_CHR.  When there is no WP
			value the type is DELI_CHR.
****************************************************************************/
FLMUINT flmTextGetCharType(
	const FLMBYTE *	pText,
	FLMUINT				uiLen,
	FLMUINT16 *			pui16WPValue,
	FLMUNICODE *		puzUniValue,
	FLMUINT *			pType)
{
	FLMUINT		uiBytesUsed;
	FLMUINT		uiCharSet;
	FLMUINT16	wpValue;

	uiBytesUsed = flmTextGetValue( pText, uiLen, NULL,
							FLM_MIN_SPACES, pui16WPValue, puzUniValue);
	wpValue = *pui16WPValue;

	if( !wpValue)
	{
		// For now all unicode is a delimiter

		*pType = DELI_CHR;
	}
	else if( wpValue < 0x080)
	{
		*pType = flmCharTypeAnsi7( wpValue);
	}
	else
	{
		uiCharSet = (FLMUINT) (wpValue >> 8);

		if( uiCharSet == 1 ||
			 uiCharSet == 2 ||
			 (uiCharSet >= 8 && uiCharSet <= 11))
		{
			*pType = SDWD_CHR;
		}
		else
		{
			*pType = DELI_CHR;
		}
	}

	return uiBytesUsed;
}
/****************************************************************************
Desc:		Return the next WP or unicode character value.
Return:	Number of bytes consumed from pText to produce the character
			value.  This includes spaces and dashes dropped because of the
			flags, and unknown objects that were skipped.
Note:		This code must be fast so some compromises have been made
			in respect to maintenance.
			DON'T CHEAT.  This routine returns the number of spaces
			skipped over if FLM_MIN_SPACES or FLM_NO_SPACE is turned on.
			The FLM_* space/dash/underscore flags are only applied to
			characters stored with the ASCII code (and to a pending double
			character); a character produced from a WHITE_SPACE_CODE object
			is returned as is.
			If *puiWpChar2 is non-zero on entry, that pending character is
			returned first (after the same flag handling) and *puiWpChar2
			is cleared.
			Both outputs are zero when the text is used up without producing
			a character.
****************************************************************************/
FLMUINT flmTextGetValue(
	const FLMBYTE *	pText,			// [in] Points to current value.
	FLMUINT				uiLen,			// [in] Bytes remaining in text.
	FLMUINT *			puiWpChar2,		// Was there a double character?
	FLMUINT				uiFlags,			// [in] FLM_* flags
	FLMUINT16 *			pui16WPValue,	// [out] WP Character value or 0 if unicode.
	FLMUNICODE *		puzUniValue)	// [out] Unicode or OEM value if
												// *pui16WPValue is zero.
{
	FLMUINT		uiReturnLength = 0;
	FLMUINT		uiObjectLength;
	FLMUINT16	ui16CurValue = 0;		// Current working (WPish) value.
	FLMUNICODE	uzUniValue = 0;

	if (puiWpChar2 && *puiWpChar2)
	{
		// A second character is pending from a double-character collation.
		// It occupies no bytes of pText; jump into the ASCII handling so
		// the space/dash flags are applied to it too.

		ui16CurValue = (FLMUINT16)(*puiWpChar2);
		*puiWpChar2 = 0;
		uiObjectLength = 0;
		goto Check_White_Space;
	}

	// Keep consuming objects until one of them produces a character.

	while (uiLen && !ui16CurValue && !uzUniValue)
	{
		ui16CurValue = (FLMUINT16) *pText;

		switch( GedTextObjType( ui16CurValue ))
		{
			case ASCII_CHAR_CODE:		/* 0nnnnnnn */
				uiObjectLength = 1;

Check_White_Space:

				// Do all of the bIgnore* stuff here.
				// WHITE SPACE CODE doesn't apply.

				if( ui16CurValue == (FLMUINT16) ASCII_UNDERSCORE && (uiFlags & FLM_NO_UNDERSCORE))
				{
					ui16CurValue = (FLMUINT16) ASCII_SPACE;
				}

				if( ui16CurValue == (FLMUINT16) ASCII_SPACE)
				{
					if( uiFlags & FLM_NO_SPACE)
					{
						ui16CurValue = 0;
					}
					else if( uiFlags & FLM_MIN_SPACES)
					{
						// Eat up the remaining spaces and underscores (if NO_UNDERSCORES).
						// The bounds test comes FIRST so the byte just past
						// the end of the text is never read.

						while( uiObjectLength < uiLen
							&& (pText[ uiObjectLength] == ASCII_SPACE
								 || ( pText[ uiObjectLength] == ASCII_UNDERSCORE
									&& (uiFlags & FLM_NO_UNDERSCORE))))
						{
							uiObjectLength++;
						}
					}
				}
				else if( ui16CurValue == ASCII_DASH && (uiFlags & FLM_NO_DASH))
				{
					ui16CurValue = 0;
				}
				break;

			case CHAR_SET_CODE:			/* 10nnnnnn - Character Set | Char */
				uiObjectLength = 2;
				ui16CurValue = (FLMUINT16)
					(((FLMUINT16)(ui16CurValue & (~CHAR_SET_MASK)) << 8)
					+ (FLMUINT16)*(pText + 1));		/* Character */
				break;

			case WHITE_SPACE_CODE:		/* 110nnnnn */
			{
				FLMBYTE	ucTmpByte;

				uiObjectLength = 1;
				ucTmpByte = *pText & (~WHITE_SPACE_MASK);

				// Hard hyphens become a minus sign, everything else a space.

				ui16CurValue = ((ucTmpByte == HARD_HYPHEN) ||
									 (ucTmpByte == HARD_HYPHEN_EOL) ||
									 (ucTmpByte == HARD_HYPHEN_EOP))
									? (FLMUINT16) 0x2D		/* Minus sign */
									: (FLMUINT16) 0x20;		/* Space */
				break;
			}

			case EXT_CHAR_CODE:			/* Full extended character */
				uiObjectLength = 3;
				ui16CurValue = (FLMUINT16)(((FLMUINT16)*(pText + 1) << 8)	/* Char set */
									+ (FLMUINT16) *(pText + 2));					/* Character */
				break;

			case UNICODE_CODE:			/* Unconvertable UNICODE code */
				uiObjectLength = 3;
				ui16CurValue = 0;
				uzUniValue = (FLMUINT16)(((FLMUINT16)*(pText + 1) << 8)	/* Char set */
									+ (FLMUINT16)*(pText + 2));					/* Character */
				break;

			case OEM_CODE:
				uiObjectLength = 2;		/* OEM characters are always >= 128.*/
				/* Make this a unicode character */
				ui16CurValue = 0;
				uzUniValue = (FLMUINT16) *(pText + 1);
				break;

			/* Skip all of the unknown stuff */
			// ui16CurValue still holds the object's code byte at this point;
			// it must be cleared, otherwise the loop would stop and hand the
			// code byte back to the caller as if it were a WP character.

			case UNK_GT_255_CODE:
				ui16CurValue = 0;
				uiObjectLength = (FLMUINT16)(1 + sizeof( FLMUINT16) + FB2UW( pText + 1));
				break;

			case UNK_LE_255_CODE:
				ui16CurValue = 0;
				uiObjectLength = 2 + (FLMUINT16)*(pText + 1);
				break;

			case UNK_EQ_1_CODE:
				ui16CurValue = 0;
				uiObjectLength = 2;
				break;

			default:							/* should NEVER happen: bug if does */
				/* Coded to skip remaining data. */
				ui16CurValue = 0;
				uiObjectLength = uiLen;
				break;						/* just give up. */
		}	/* End of switch */

		uiReturnLength += uiObjectLength;
		pText += uiObjectLength;
		uiLen -= uiObjectLength;
	}

	*pui16WPValue = ui16CurValue;
	*puzUniValue = uzUniValue;
	return uiReturnLength;
}
/****************************************************************************
Desc:		Return the sub-collation value of a WPText character.
			Unconverted Unicode values always have a sub-collation
			value of 11110 + Unicode Value.
Return:	0 for a character whose high (character set) byte is zero.
			Otherwise the value starts out as the WP character that was
			passed in and is replaced by the per-character-set rules below.
			A character set with no rule, or a character no rule applies to,
			keeps that starting value.
****************************************************************************/
FSTATIC FLMUINT16 flmTextGetSubCol(
	FLMUINT16	ui16WPValue,		// [in] WP Character value.
	FLMUINT16	ui16ColValue,		// [in] Collation Value (for arabic)
	FLMUINT		uiLangId)			// [in] WP Language ID.
{
	FLMUINT16	ui16Result;
	FLMUINT16	ui16BaseChar;
	FLMBYTE		ucChar = (FLMBYTE) ui16WPValue;
	FLMBYTE		ucSet = (FLMBYTE) (ui16WPValue >> 8);

	// Easy case first.

	if( !(ui16WPValue & 0xFF00))
	{
		return 0;
	}

	// From here down the default result is the WP value as passed in.

	ui16Result = ui16WPValue;

	// Convert the character to uppercase because case information is
	// stored separately.  This helps insure that "ETA" doesn't sort before
	// "eta".  Clearing the low bit just happens to work with all WP
	// character values.

	if (!fwpIsUpper( ui16WPValue))
	{
		ui16WPValue &= ~1;
	}

	switch( ucSet)
	{
		case CHSMUL1:

			// If a character cannot be broken down into base and diacritic
			// then it cannot be recombined later when converting back the
			// key, so the whole WP character stays in the sub-collation
			// area.  Only MULTINATIONAL 1 is supported for brkcar().

			if( fwpCh6Brkcar( ui16WPValue, &ui16BaseChar, &ui16Result))
			{
				// WordPerfect character cannot be broken down.
				// If we had a collation value other than 0xFF (COLS0), don't
				// return a sub-collation value.  This will allow things like
				// upper and lower AE digraphs to compare properly.

				if (ui16ColValue != COLS0)
				{
					ui16Result = 0;
				}
				return ui16Result;
			}

			// Write the FLAIM diacritic sub-collation value.
			// Prefix is 2 bits "10".  Remember to leave "111" alone for the
			// future.  (Bug 11/16/92 = was only writing a "1" and not "10")

			if( (ui16Result & 0xFF) == umlaut &&
				 (uiLangId == SU_LANG ||
				  uiLangId == SV_LANG ||
				  uiLangId == CZ_LANG ||
				  uiLangId == SL_LANG))
			{
				// In these languages the umlaut must come after the ring above.

				ui16Result = (FLMUINT16)(fwp_dia60Tbl[ ring] + 1);
			}
			else
			{
				ui16Result = (FLMUINT16)(fwp_dia60Tbl[ ui16Result & 0xFF]);
			}
			break;

		case CHSGREK:

			// Greek: keep the (uppercased) character for values 52 and up,
			// for [8,4] BETA medial/terminal and for [8,38] SIGMA terminal.

			if( (ui16WPValue == 0x804) ||
				 (ui16WPValue == 0x826) ||
				 (ucChar >= 52))
			{
				ui16Result = ui16WPValue;
			}
			break;

		case CHSCYR:

			// VISIT: Georgian covers 208-249 - no collation defined yet

			if( ucChar >= 144)
			{
				ui16Result = ui16WPValue;
			}
			break;

		case CHSHEB:

			// Three sections in Hebrew:
			//		0..26   - main characters
			//		27..83  - accents that appear over previous character
			//		84..118 - dagesh (ancient) hebrew with accents
			// Because the ancient is only used for sayings & scriptures
			// we support a collation value and store the actual character
			// in the sub-collation, because sub-collation is in character
			// order.

			if( ucChar >= 84)
			{
				ui16Result = ui16WPValue;
			}
			break;

		case CHSARB1:

			// Sections in Arabic 1:
			//		00..37   - accents that display OVER a previous character
			//		38..46   - symbols
			//		47..57   - numbers
			//		58..163  - characters
			//		164      - hamzah accent
			//		165..180 - common characters with accents
			//		181..193 - ligatures - common character combinations
			//		194..195 - extensions - throw away when sorting

			if( ucChar <= 46)
			{
				ui16Result = ui16WPValue;
			}
			else if( ui16ColValue == COLS10a+1)		// Alef?
			{
				if( ucChar >= 165)
				{
					ui16Result = (FLMUINT16)(fwp_alefSubColTbl[ ucChar - 165 ]);
				}
				else
				{
					ui16Result = (FLMUINT16)7;			// Alef subcol value
				}
			}
			else if( ucChar >= 181)						// Ligatures - char combination
			{
				ui16Result = ui16WPValue;
			}
			else if( ucChar == 64)						// taa exception
			{
				ui16Result = 8;
			}
			break;

		case CHSARB2:

			// There are some characters that share the same slot.
			// Check the bit table if above character 64.

			if ((ucChar >= 64) &&
				 (fwp_ar2BitTbl[(ucChar-64)>> 3] & (0x80 >> (ucChar&0x07))))
			{
				ui16Result = ui16WPValue;
			}
			break;

		default:
			break;
	}

	return ui16Result;
}