Files
mars-flaim/flaim/src/f_tocoll.cpp
dsandersoremutah c55dab446f Renamed version4 to flaim and version5 to xflaim
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2006-01-27 21:06:39 +00:00

773 lines
20 KiB
C++

//-------------------------------------------------------------------------
// Desc: Collation routines for indexing.
// Tabs: 3
//
// Copyright (c) 1991-2001,2003,2005-2006 Novell, Inc. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of version 2 of the GNU General Public
// License as published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com
//
// $Id: f_tocoll.cpp 12245 2006-01-19 14:29:51 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------
#include "flaimsys.h"
// Classifies a WP character by case: TRUE means upper case, FALSE means
// lower case.  Characters below 0x7F are treated as upper case unless they
// lie in the ASCII_LOWER_A..ASCII_LOWER_Z range; every other character is
// classified by fwpIsUpper().
FINLINE FLMBOOL charIsUpper(
	FLMUINT16	ui16Char)
{
	// 0x7F and above: defer to the WP character tables.
	if (ui16Char >= 0x7F)
	{
		return( (FLMBOOL)fwpIsUpper( ui16Char));
	}

	// Low ASCII: only the lower-case letters count as lower case.
	if (ui16Char >= ASCII_LOWER_A && ui16Char <= ASCII_LOWER_Z)
	{
		return( (FLMBOOL)FALSE);
	}

	return( (FLMBOOL)TRUE);
}
// Lookup tables declared extern here (their definitions are not in this
// file) and consulted by FTextToColStr() below.
// Indexed by a diacritic value (e.g. ring, or the low byte of the diacritic
// returned by fwpCh6Brkcar) to obtain the sub-collation value to store.
extern FLMBYTE fwp_dia60Tbl[]; // Diacritic conversions
// Indexed by (character - 165) for Arabic 1 characters that collate as alef;
// yields the sub-collation value for that character.
extern FLMBYTE fwp_alefSubColTbl[]; // Arabic sub collation table.
// Bit table covering Arabic 2 characters 64 and above (byte index is
// (character - 64) >> 3).  A set bit makes FTextToColStr() store the
// original character in the sub-collation area.
extern FLMBYTE fwp_ar2BitTbl[]; // Arabic 2 bit table.
/****************************************************************************
Desc: Convert an internal-format TEXT string into a collated key piece.
The bytes written to pucCollatedStr are, in order:
  1. the collation bytes (length returned in *puiCollationLen),
  2. COLL_FIRST_SUBSTRING, if bFirstSubstring is set,
  3. COLL_TRUNCATED, if the input was cut short,
  4. (COLL_MARKER | SC_SUB_COL) followed by the packed sub-collation
     bits - only when some sub-collation information was recorded,
  5. either (COLL_MARKER | SC_MIXED) followed by the packed case bits
     (when at least one lower case character was seen), or a single
     (COLL_MARKER | SC_UPPER) / (COLL_MARKER | SC_LOWER) byte.
If nothing at all was produced (no collation bytes, no marker bytes and
no sub-collation bits) the routine returns a zero length.

Parameters that are pointers to optional outputs (puiCollationLen,
puiCaseLen, pbOriginalCharsLost, pbDataTruncated) may be NULL; each is
tested before it is written.  *pbOriginalCharsLost is only ever set to
TRUE here (when a double character or digraph was collapsed); it is
never cleared by this routine.

Return: The original documentation states "RCODE = SUCCESS or
FERR_CONV_DEST_OVERFLOW".
NOTE(review): in the code below rc is initialised to FERR_OK and never
assigned again, so as written this routine always returns FERR_OK.
Truncation is reported through *pbDataTruncated and the COLL_TRUNCATED
marker instead.  Confirm against callers before relying on the error code.
VISIT: If the string is EXACTLY the length of the truncation
length then it should, but doesn't, set the truncation flag.
The code didn't match the design intent. Fix next major
version.
****************************************************************************/
RCODE FTextToColStr(
const FLMBYTE * pucStr, // Points to the internal TEXT string
FLMUINT uiStrLen, // Length of the internal TEXT string
FLMBYTE * pucCollatedStr, // Returns collated string
FLMUINT * puiCollatedStrLen, // Returns total collated string length
// Input is maximum bytes in buffer
FLMUINT uiUppercaseFlag, // Set if to convert to uppercase
FLMUINT * puiCollationLen, // Returns the collation bytes length
FLMUINT * puiCaseLen, // Returns length of case bytes
FLMUINT uiLanguage, // Language
FLMUINT uiCharLimit, // Max number of characters in this key piece
FLMBOOL bFirstSubstring, // TRUE if this is the first substring key
FLMBOOL * pbOriginalCharsLost,
FLMBOOL * pbDataTruncated)
{
RCODE rc = FERR_OK;
const FLMBYTE * pucStrEnd; // Points to the end of the string
FLMUINT16 ui16Base; // Value of the base character
FLMUINT16 ui16SubColVal; // Sub-collated value (diacritic)
FLMUINT uiObjLength = 0;
FLMUINT uiLength; // Temporary variable for length
// NOTE(review): FLMUINT is presumably unsigned; if the caller passes a
// buffer size below 8 this subtraction would wrap to a huge value and then
// be clamped to 256 below - confirm callers always pass at least 8.
FLMUINT uiTargetColLen = *puiCollatedStrLen - 8; // 4=ovhd,4=worse char
FLMUINT uiObjType;
FLMBOOL bDataTruncated = FALSE;
// Need to increase the buffer sizes to not overflow.
// Characters without COLL values will take up 3 bytes in
// the ucSubColBuf[] and easily overflow the buffer.
// Hard coded the values so as to minimize changes.
FLMBYTE ucSubColBuf[ MAX_SUBCOL_BUF + 301]; // Holds sub-collated values(diac)
FLMBYTE ucCaseBits[ MAX_LOWUP_BUF + 81]; // Holds case bits
FLMUINT16 ui16WpChr; // Current WP character
FLMUNICODE unichr = 0; // Current unconverted Unicode character
FLMUINT16 ui16WpChr2; // 2nd character if any; default 0 for US lang
FLMUINT uiColLen; // Return value of collated length
FLMUINT uiSubColBitPos; // Sub-collation bit position
FLMUINT uiCaseBitPos; // Case bit position
FLMUINT uiFlags; // Clear all bit flags
FLMBOOL bHebrewArabic = FALSE; // Set if language is hebrew, arabic, farsi
// Only assigned by fwpCheckDoubleCollation() and only read when ui16WpChr2
// is non-zero, which can only happen after that call.
FLMBOOL bTwoIntoOne;
uiColLen = 0;
uiSubColBitPos = 0;
uiCaseBitPos = 0;
uiFlags = 0;
ui16WpChr2 = 0;
// Don't allow any key component to exceed 256 bytes regardless of the
// user-specified character or byte limit. The goal is to prevent
// any single key piece from consuming too much of the key (which is
// limited to 640 bytes) and thus "starving" other pieces, resulting
// in a key overflow error.
if( uiTargetColLen > 256)
{
uiTargetColLen = 256;
}
// Code below sets ucSubColBuf[] and ucCaseBits[] values to zero.
// Languages listed here get special sub-collation handling further down and
// one extra zero sub-collation bit after the main loop.
if (uiLanguage != US_LANG)
{
if (uiLanguage == AR_LANG || // Arabic
uiLanguage == FA_LANG || // Farsi - persian
uiLanguage == HE_LANG || // Hebrew
uiLanguage == UR_LANG) // Urdu
{
bHebrewArabic = TRUE;
}
}
pucStrEnd = &pucStr [uiStrLen];
// Main loop: one iteration per collating character.  Each iteration
// appends to three parallel outputs: collation bytes (pucCollatedStr),
// sub-collation bits (ucSubColBuf) and case bits (ucCaseBits).
while (pucStr < pucStrEnd)
{
// Set the case bits and sub-collation bits to zero when
// on the first bit of the byte.
if (!(uiCaseBitPos & 0x07))
{
ucCaseBits [uiCaseBitPos >> 3] = 0;
}
if (!(uiSubColBitPos & 0x07))
{
ucSubColBuf [uiSubColBitPos >> 3] = 0;
}
// Get the next character from the TEXT string.
// The loop skips objects that produce no character (the UNK_* codes) and
// stops as soon as ui16WpChr is non-zero or the input is exhausted.
// Every "continue" runs the for-increment, advancing pucStr by uiObjLength.
// NOTE(review): the multi-byte cases read pucStr + 1 / pucStr + 2 without
// checking against pucStrEnd - assumes the TEXT string is well formed.
for (ui16WpChr = ui16SubColVal = 0; // Default sub-collation value
!ui16WpChr && pucStr < pucStrEnd;
pucStr += uiObjLength)
{
FLMBYTE ucChar = *pucStr;
uiObjType = GedTextObjType( ucChar);
switch (uiObjType)
{
case ASCII_CHAR_CODE: // 0nnnnnnn
uiObjLength = 1;
// Character set zero is assumed.
ui16WpChr = (FLMUINT16)ucChar;
continue;
case CHAR_SET_CODE: // 10nnnnnn
uiObjLength = 2;
// Character set followed by character
ui16WpChr = (((FLMUINT16)(ucChar & (~CHAR_SET_MASK)) << 8)
+ (FLMUINT16)*(pucStr + 1));
continue;
case WHITE_SPACE_CODE: // 110nnnnn
uiObjLength = 1;
ucChar &= (~WHITE_SPACE_MASK);
// Hard hyphens collate as '-'; every other white space code as a space.
ui16WpChr = (ucChar == HARD_HYPHEN ||
ucChar == HARD_HYPHEN_EOL ||
ucChar == HARD_HYPHEN_EOP)
? (FLMUINT16)0x2D // Minus sign -- character set 0
: (FLMUINT16)0x20;// Space -- character set zero
continue;
// Skip all of the unknown stuff
case UNK_GT_255_CODE:
uiObjLength = 3 + FB2UW( pucStr + 1);
continue;
case UNK_LE_255_CODE:
uiObjLength = 2 + (FLMUINT16)*(pucStr + 1);
continue;
case UNK_EQ_1_CODE:
uiObjLength = 2;
continue;
case EXT_CHAR_CODE:
uiObjLength = 3;
// Character set followed by character
ui16WpChr = (((FLMUINT16)*(pucStr + 1) << 8)
+ (FLMUINT16)*(pucStr + 2));
continue;
case OEM_CODE:
// OEM characters are always >= 128
// Use character set zero to process them.
uiObjLength = 2;
ui16WpChr = (FLMUINT16)*(pucStr + 1);
continue;
case UNICODE_CODE: // Unconvertable UNICODE code
uiObjLength = 3;
// Unicode character followed by unicode character set
// The raw Unicode value is remembered in unichr; ui16WpChr gets a
// placeholder so the character still flows through the collation code.
unichr = (FLMUINT16)(((FLMUINT16)*(pucStr + 1) << 8)
+ (FLMUINT16)*(pucStr + 2));
ui16WpChr = UNK_UNICODE_CODE;
continue;
default:
// Should not happen, but don't return an error
// NOTE(review): uiObjLength is not updated here, so pucStr advances by
// the previous object's length (0 if this is the very first object,
// in which case the scan would not advance) - confirm this case is
// truly unreachable.
flmAssert( 0);
continue;
}
}
// If we didn't get a character, break out of while loop.
if (!ui16WpChr)
{
break;
}
// fwpCheckDoubleCollation modifies ui16WpChr if a digraph or a double
// character sequence is found. If a double character is found, pucStr
// is incremented and ui16WpChr2 is set to 1. If a digraph is found,
// pucStr is not changed, but ui16WpChr contains the first character and
// ui16WpChr2 contains the second character of the digraph.
if (uiLanguage != US_LANG)
{
ui16WpChr2 = fwpCheckDoubleCollation( &ui16WpChr, &bTwoIntoOne,
&pucStr, uiLanguage);
}
// Save the case bit
// Case bits are only recorded when the caller did not ask for an
// upper-cased key; a set bit means "upper case".
if (!uiUppercaseFlag)
{
// charIsUpper returns TRUE if upper case, 0 if lower case.
if (!charIsUpper( ui16WpChr))
{
uiFlags |= HAD_LOWER_CASE;
}
else
{
// Set if upper case.
SET_BIT( ucCaseBits, uiCaseBitPos);
}
uiCaseBitPos++;
}
// Handle OEM characters, non-collating characters,
// characters with subcollating values, double collating
// values.
// Get the collated value from the WP character-if not collating value
// The collation byte is stored first; a value of COLS11 or above means
// the character has no real collation value and the original character
// must be preserved in the sub-collation area.
if ((pucCollatedStr[ uiColLen++] =
(FLMBYTE)(fwpGetCollation( ui16WpChr, uiLanguage))) >= COLS11)
{
FLMUINT uiTemp;
// Save OEM characters just like non-collating characters
// If lower case, convert to upper case.
// NOTE(review): clearing the low bit presumably relies on the WP
// character sets pairing each upper case code with the following lower
// case code - confirm against the character set tables.
if (!charIsUpper( ui16WpChr))
{
ui16WpChr &= ~1;
}
// No collating value given for this WP char.
// Save original WP char (2 bytes) in subcollating
// buffer.
// 1110 is a new code that will store an insert over
// the character OR a non-convertable unicode character.
// Store with the same alignment as "store_extended_char"
// below.
// 11110 is code for unmappable UNICODE value.
// A value 0xFE will be the collation value. The sub-collation
// value will be 0xFFFF followed by the UNICODE value.
// Be sure to eat an extra case bit.
// See specific Hebrew and Arabic comments in the
// switch statement below.
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
if (bHebrewArabic && (pucCollatedStr [uiColLen-1] == COLS0_ARABIC))
{
// Store first bit of 1110, fall through & store remaining 3 bits
SET_BIT( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
// Don't store collation value
uiColLen--;
}
else if (unichr)
{
// Unmappable Unicode: the raw Unicode value replaces the placeholder
// and is what gets written after the prefix bits below.
ui16WpChr = unichr;
unichr = 0;
// Store 11 out of 11110
SET_BIT( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
SET_BIT( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
if (!uiUppercaseFlag)
{
ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;
// Set upper case bit.
SET_BIT( ucCaseBits, uiCaseBitPos);
uiCaseBitPos++;
}
}
// This label is also the target of gotos from the "had a collation
// value" branch below, i.e. control can enter this block from the
// sibling else-branch.  On entry ui16WpChr holds the 16-bit value to
// be stored.
store_extended_char:
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
ucSubColBuf [(uiSubColBitPos + 16) >> 3] = 0;
uiFlags |= HAD_SUB_COLLATION;
// Set 110 bits in sub-collation - continued from above.
// No need to explicitly set the zero, but must increment
// for it.
SET_BIT( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
SET_BIT( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos += 2;
// store_aligned_word: This label is not referenced.
// Go to the next byte boundary to write the character.
uiSubColBitPos = (uiSubColBitPos + 7) & (~7);
uiTemp = BYTES_IN_BITS( uiSubColBitPos);
// Need to big-endian - so it will sort correctly.
ucSubColBuf [uiTemp] = (FLMBYTE)(ui16WpChr >> 8);
ucSubColBuf [uiTemp + 1] = (FLMBYTE)(ui16WpChr);
uiSubColBitPos += 16;
// Pre-clear the byte that the next sub-collation bit will land in.
ucSubColBuf [uiSubColBitPos >> 3] = 0;
}
else
{
// Had a collation value
// Add the lower/uppercase bit if a mixed case output.
// If not lower ASCII set - check diacritic value for sub-collation
if (!(ui16WpChr & 0xFF00))
{
// ASCII character set - set a single 0 bit - just need to
// increment to do this.
uiSubColBitPos++;
}
else
{
// Split the WP character into its character-set (high byte) and
// character-within-set (low byte) parts.  Note that both are taken
// before the upper-casing below modifies ui16WpChr.
FLMBYTE ucTmpChar = (FLMBYTE)ui16WpChr;
FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChr >> 8);
// Convert char to uppercase because case information
// is stored above. This will help
// ensure that the "ETA" doesn't sort before "eta"
if (!charIsUpper(ui16WpChr))
{
ui16WpChr &= ~1;
}
switch (ucCharSet)
{
case CHSMUL1: // Multinational 1
// If we cannot break down a char into base and
// diacritic we cannot combine the character
// later when converting back the key. In that case,
// write the entire WP char in the sub-collation area.
// (fwpCh6Brkcar apparently returns non-zero when the character
// cannot be broken down - inferred from this usage.)
if (fwpCh6Brkcar( ui16WpChr, &ui16Base, &ui16SubColVal))
{
goto store_extended_char;
}
// Write the FLAIM diacritic sub-collation value.
// Prefix is 2 bits "10". Remember to leave
// "111" alone for the future.
// NOTE: The "umlaut" character must sort after the "ring"
// character.
// For the four languages tested below an umlaut is given the value
// one above ring's table entry; otherwise the table entry for the
// diacritic itself is used.
ui16SubColVal = ((ui16SubColVal & 0xFF) == umlaut &&
(uiLanguage == SU_LANG ||
uiLanguage == SV_LANG ||
uiLanguage == CZ_LANG ||
uiLanguage == SL_LANG))
? (FLMUINT16)(fwp_dia60Tbl[ ring] + 1)
: (FLMUINT16)(fwp_dia60Tbl[ ui16SubColVal & 0xFF]);
// Also reached by goto from the Arabic 1 case with ui16SubColVal
// already set.
store_sub_col:
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf[ (uiSubColBitPos + 8) >> 3] = 0;
uiFlags |= HAD_SUB_COLLATION;
// Set the 10 bits - no need to explicitly set the zero, but
// must increment for it.
SET_BIT( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos += 2;
// Set sub-collation bits.
SETnBITS( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal);
uiSubColBitPos += 5;
break;
case CHSGREK: // Greek
if (ucTmpChar >= 52 || // Keep case bit for 52-69 else ignore
ui16WpChr == 0x804 || // [ 8,4] BETA Medial | Terminal
ui16WpChr == 0x826) // [ 8,38] SIGMA terminal
{
goto store_extended_char;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
case CHSCYR:
if (ucTmpChar >= 144)
{
goto store_extended_char;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
// VISIT: Georgian covers 208-249 - no collation defined yet
break;
case CHSHEB: // Hebrew
// Three sections in Hebrew:
// 0..26 - main characters
// 27..83 - accents that appear over previous character
// 84..118- dagesh (ancient) hebrew with accents
// Because the ancient is only used for sayings & scriptures
// we will support a collation value and in the sub-collation
// store the actual character because sub-collation is in
// character order.
if (ucTmpChar >= 84) // Save ancient - value 84 and above
{
goto store_extended_char;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
case CHSARB1: // Arabic 1
// Three sections in Arabic:
// 00..37 - accents that display OVER a previous character
// 38..46 - symbols
// 47..57 - numbers
// 58..163 - characters
// 164 - hamzah accent
// 165..180- common characters with accents
// 181..193- ligatures - common character combinations
// 194..195- extensions - throw away when sorting
if (ucTmpChar <= 46)
{
goto store_extended_char; // save original character
}
// The alef test comes before the ligature test, so a character of
// 181 or above that collates as alef takes the table lookup below
// rather than being stored as an extended character.
if (pucCollatedStr[ uiColLen-1] == COLS10a+1) // Alef?
{
ui16SubColVal = (ucTmpChar >= 165)
? (FLMUINT16)(fwp_alefSubColTbl[ ucTmpChar - 165 ])
: (FLMUINT16)7; // Alef subcol value
goto store_sub_col;
}
if (ucTmpChar >= 181) // Ligatures - char combination
{
goto store_extended_char; // save original character
}
if (ucTmpChar == 64) // taa exception
{
ui16SubColVal = 8;
goto store_sub_col;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
case CHSARB2: // Arabic 2
// There are some characters that share the same slot
// Check the bit table if above character 64
// Byte index is (char - 64) / 8; the bit within the byte is selected
// from the low three bits of the character, most significant first.
if (ucTmpChar >= 64 &&
fwp_ar2BitTbl[(ucTmpChar - 64) >> 3] &
(0x80 >> (ucTmpChar & 0x07)))
{
goto store_extended_char; // Will save original
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
default:
// Increment bit position to set a zero bit.
uiSubColBitPos++;
break;
}
}
// Now let's worry about double character sorting
// This section is only reached on the "had a collation value" path;
// the store_extended_char path above never gets here.
if (ui16WpChr2)
{
if (pbOriginalCharsLost)
{
*pbOriginalCharsLost = TRUE;
}
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf [(uiSubColBitPos + 7) >> 3] = 0;
if (bTwoIntoOne)
{
// Sorts after character in ui16WpChr after call to
// fwpCheckDoubleCollation
// Write the char 2 times so lower/upper bits are correct.
// Could write infinite times because of collation rules.
// The previous collation byte is incremented in place and the new
// value is then duplicated into the next slot.
pucCollatedStr[ uiColLen] = ++pucCollatedStr[ uiColLen-1];
uiColLen++;
// If original was upper case, set one more upper case bit
if (!uiUppercaseFlag)
{
ucCaseBits[ (uiCaseBitPos + 7) >> 3] = 0;
// Case of the second character is taken from the single byte just
// before pucStr.
// NOTE(review): this presumably relies on the second character being
// a one-byte object and on pucStr having been advanced past it -
// confirm against fwpCheckDoubleCollation.
if (!charIsUpper( (FLMUINT16) *(pucStr - 1)))
{
uiFlags |= HAD_LOWER_CASE;
}
else
{
SET_BIT( ucCaseBits, uiCaseBitPos);
}
uiCaseBitPos++;
}
// Take into account the diacritical space
uiSubColBitPos++;
}
else
{
// We have a digraph, get second collation value
pucCollatedStr[ uiColLen++] =
(FLMBYTE)(fwpGetCollation( ui16WpChr2, uiLanguage));
// Normal case, assume no diacritics set
uiSubColBitPos++;
// If first was upper, set one more upper bit.
if (!uiUppercaseFlag)
{
ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;
if (charIsUpper( ui16WpChr))
{
SET_BIT( ucCaseBits, uiCaseBitPos);
}
uiCaseBitPos++;
// no need to reset the uiFlags
}
}
}
}
// Check to see if uiColLen is at some overflow limit.
// This test runs after the current character has already been written.
// NOTE(review): presumably the 8 bytes subtracted when computing
// uiTargetColLen are the headroom for that last character plus the
// marker bytes - confirm.
if (uiColLen >= uiCharLimit ||
uiColLen + BYTES_IN_BITS( uiSubColBitPos) +
BYTES_IN_BITS( uiCaseBitPos) >= uiTargetColLen)
{
// We hit the maximum number of characters.
// Truncation is only flagged when unread input remains; a string that
// ends exactly at the limit is not flagged (see VISIT in the header).
if (pucStr < pucStrEnd)
{
bDataTruncated = TRUE;
}
break;
}
}
// END OF WHILE LOOP
// The collation length is reported before any marker bytes are appended.
if (puiCollationLen)
{
*puiCollationLen = uiColLen;
}
// Add the first substring marker - also serves as making the string non-null.
if (bFirstSubstring)
{
pucCollatedStr [uiColLen++] = COLL_FIRST_SUBSTRING;
}
if (bDataTruncated)
{
pucCollatedStr[ uiColLen++ ] = COLL_TRUNCATED;
}
// 10/20/98 - Add code to return NOTHING if no values found.
// Because the markers above have already been counted in uiColLen, this
// early exit cannot trigger when bFirstSubstring is set.
if (!uiColLen && !uiSubColBitPos)
{
if (puiCaseLen)
{
*puiCaseLen = 0;
}
goto Exit;
}
// Store extra zero bit in the sub-collation area for Hebrew/Arabic
if (bHebrewArabic)
{
uiSubColBitPos++;
}
// Done putting the string into 4 sections - build the COLLATED KEY
// Don't set uiUppercaseFlag earlier than here because SC_LOWER may be zero
// From here on the uiUppercaseFlag parameter is reused to hold the default
// case marker; its original meaning (caller wants upper-cased key) is gone.
uiUppercaseFlag = (uiLanguage == GR_LANG) ? SC_LOWER : SC_UPPER;
// The default terminating characters is (COLL_MARKER|SC_UPPER)
// Did we write anything to the subcollation area?
if (uiFlags & HAD_SUB_COLLATION)
{
// Writes out a 0x7
pucCollatedStr [uiColLen++] = COLL_MARKER | SC_SUB_COL;
// Move the sub-collation into the collating string
uiLength = BYTES_IN_BITS( uiSubColBitPos);
f_memcpy( &pucCollatedStr[uiColLen], ucSubColBuf, uiLength);
uiColLen += uiLength;
}
// Move the upper/lower case stuff - force bits for Greek ONLY
// This is such a small size that a memcpy is not worth it
// HAD_LOWER_CASE is only ever set above when the caller's original
// uiUppercaseFlag was zero, so this branch is never taken for
// upper-cased keys.
if (uiFlags & HAD_LOWER_CASE)
{
FLMUINT uiNumBytes = BYTES_IN_BITS( uiCaseBitPos);
FLMBYTE * pucCasePtr = ucCaseBits;
// Output the 0x5
pucCollatedStr [uiColLen++] = (FLMBYTE)(COLL_MARKER | SC_MIXED);
// Reported case length includes the marker byte.
if (puiCaseLen)
{
*puiCaseLen = uiNumBytes + 1;
}
if (uiUppercaseFlag == SC_LOWER)
{
// Negate case bits for languages (like GREEK) that sort
// upper case before lower case.
while (uiNumBytes--)
{
pucCollatedStr [uiColLen++] = ~(*pucCasePtr++);
}
}
else
{
while (uiNumBytes--)
{
pucCollatedStr [uiColLen++] = *pucCasePtr++;
}
}
}
else
{
// All characters are either upper or lower case, as determined
// by uiUppercaseFlag.
pucCollatedStr [uiColLen++] = (FLMBYTE)(COLL_MARKER | uiUppercaseFlag);
if( puiCaseLen)
{
*puiCaseLen = 1;
}
}
Exit:
// Set length return value.
if( pbDataTruncated)
{
*pbDataTruncated = bDataTruncated;
}
*puiCollatedStrLen = uiColLen;
return( rc);
}