mars-flaim/flaim/src/f_tocoll.cpp

//-------------------------------------------------------------------------
// Desc:	Collation routines for indexing.
// Tabs:	3
//
//		Copyright (c) 1991-2001,2003,2005-2006 Novell, Inc. All Rights Reserved.
//
//		This program is free software; you can redistribute it and/or
//		modify it under the terms of version 2 of the GNU General Public
//		License as published by the Free Software Foundation.
//
//		This program is distributed in the hope that it will be useful,
//		but WITHOUT ANY WARRANTY; without even the implied warranty of
//		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//		GNU General Public License for more details.
//
//		You should have received a copy of the GNU General Public License
//		along with this program; if not, contact Novell, Inc.
//
//		To contact Novell about this file by physical or electronic mail,
//		you may find current contact information at www.novell.com
//
// $Id: f_tocoll.cpp 12245 2006-01-19 14:29:51 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------

#include "flaimsys.h"

// Returns TRUE if upper case, FALSE if lower case.

FINLINE FLMBOOL charIsUpper(
	FLMUINT16	ui16Char)
{
	return( (FLMBOOL)((ui16Char < 0x7F)
							? (FLMBOOL)((ui16Char >= ASCII_LOWER_A &&
											 ui16Char <= ASCII_LOWER_Z)
											 ? (FLMBOOL)FALSE
											 : (FLMBOOL)TRUE)
							: fwpIsUpper( ui16Char)));
}

extern FLMBYTE		fwp_dia60Tbl[];				// Diacritic conversions
extern FLMBYTE		fwp_alefSubColTbl[];			// Arabic sub collation table.
extern FLMBYTE		fwp_ar2BitTbl[];				// Arabic 2 bit table.

/****************************************************************************
Desc:  	Convert a text string to a collated string.
			If FERR_CONV_DEST_OVERFLOW is returned the string is truncated as
			best as it can be.  The caller must decide to return the error up
			or deal with the truncation.
Return:	RCODE = SUCCESS or FERR_CONV_DEST_OVERFLOW
VISIT:	If the string is EXACTLY the length of the truncation
			length then it should, but doesn't, set the truncation flag.
			The code didn't match the design intent.  Fix next major
			version.
****************************************************************************/
RCODE FTextToColStr(
	const FLMBYTE *	pucStr,					// Points to the internal TEXT string
	FLMUINT 				uiStrLen,				// Length of the internal TEXT string
	FLMBYTE *			pucCollatedStr,		// Returns collated string
	FLMUINT *			puiCollatedStrLen,	// Returns total collated string length
														// Input is maximum bytes in buffer
	FLMUINT  			uiUppercaseFlag,		// Set if to convert to uppercase
	FLMUINT *			puiCollationLen,		// Returns the collation bytes length
	FLMUINT *			puiCaseLen,				// Returns length of case bytes
	FLMUINT				uiLanguage,				// Language
	FLMUINT				uiCharLimit,			// Max number of characters in this key piece
	FLMBOOL				bFirstSubstring,		// TRUE is this is the first substring key
	FLMBOOL *			pbOriginalCharsLost,
	FLMBOOL *			pbDataTruncated)
{
	RCODE					rc = FERR_OK;
	const FLMBYTE *	pucStrEnd;				// Points to the end of the string
	FLMUINT16			ui16Base;				// Value of the base character
	FLMUINT16			ui16SubColVal;			// Sub-collated value (diacritic)
	FLMUINT 				uiObjLength = 0;
	FLMUINT 				uiLength;				// Temporary variable for length
	FLMUINT 				uiTargetColLen = *puiCollatedStrLen - 8;	// 4=ovhd,4=worse char
	FLMUINT				uiObjType;
	FLMBOOL				bDataTruncated = FALSE;

	// Need to increase the buffer sizes to not overflow.
	// Characaters without COLL values will take up 3 bytes in
	// the ucSubColBuf[] and easily overflow the buffer.
	// Hard coded the values so as to minimize changes.

	FLMBYTE		ucSubColBuf[ MAX_SUBCOL_BUF + 301];	// Holds sub-collated values(diac)
	FLMBYTE		ucCaseBits[ MAX_LOWUP_BUF + 81];		// Holds case bits
	FLMUINT16	ui16WpChr;			// Current WP character
	FLMUNICODE	unichr = 0;			// Current unconverted Unicode character
	FLMUINT16	ui16WpChr2;			// 2nd character if any; default 0 for US lang
	FLMUINT		uiColLen;			// Return value of collated length
	FLMUINT		uiSubColBitPos;	// Sub-collation bit position
	FLMUINT	 	uiCaseBitPos;		// Case bit position
	FLMUINT		uiFlags;				// Clear all bit flags
	FLMBOOL		bHebrewArabic = FALSE;	// Set if language is hebrew, arabic, farsi
	FLMBOOL		bTwoIntoOne;

	uiColLen = 0;
	uiSubColBitPos = 0;
	uiCaseBitPos = 0;
	uiFlags = 0;
	ui16WpChr2 = 0;

	// Don't allow any key component to exceed 256 bytes regardless of the
	// user-specified character or byte limit.  The goal is to prevent
	// any single key piece from consuming too much of the key (which is
	// limited to 640 bytes) and thus "starving" other pieces, resulting
	// in a key overflow error.

	if( uiTargetColLen > 256)
	{
		uiTargetColLen = 256;
	}

	// Code below sets ucSubColBuf[] and ucCaseBits[] values to zero.

	if (uiLanguage != US_LANG)
	{
		if (uiLanguage == AR_LANG ||		// Arabic
			 uiLanguage == FA_LANG ||		// Farsi - persian
			 uiLanguage == HE_LANG ||		// Hebrew
			 uiLanguage == UR_LANG)			// Urdu
		{
			bHebrewArabic = TRUE;
		}
	}
	pucStrEnd = &pucStr [uiStrLen];

	while (pucStr < pucStrEnd)
	{

		// Set the case bits and sub-collation bits to zero when
		// on the first bit of the byte.

		if (!(uiCaseBitPos & 0x07))
		{
			ucCaseBits [uiCaseBitPos >> 3] = 0;
		}
		if (!(uiSubColBitPos & 0x07))
		{
			ucSubColBuf [uiSubColBitPos >> 3] = 0;
		}

		// Get the next character from the TEXT string.

		for (ui16WpChr = ui16SubColVal = 0;	// Default sub-collation value
			  !ui16WpChr && pucStr < pucStrEnd;
			  pucStr += uiObjLength)
		{
			FLMBYTE	ucChar = *pucStr;

			uiObjType = GedTextObjType( ucChar);
			switch (uiObjType)
			{
				case ASCII_CHAR_CODE:  			// 0nnnnnnn
					uiObjLength = 1;

					// Character set zero is assumed.

					ui16WpChr = (FLMUINT16)ucChar;
					continue;
				case CHAR_SET_CODE:	  			// 10nnnnnn
					uiObjLength = 2;

					// Character set followed by character

					ui16WpChr = (((FLMUINT16)(ucChar & (~CHAR_SET_MASK)) << 8)
							+ (FLMUINT16)*(pucStr + 1));
					continue;
				case WHITE_SPACE_CODE:			// 110nnnnn
					uiObjLength = 1;
					ucChar &= (~WHITE_SPACE_MASK);
					ui16WpChr = (ucChar == HARD_HYPHEN ||
									 ucChar == HARD_HYPHEN_EOL ||
									 ucChar == HARD_HYPHEN_EOP)
									? (FLMUINT16)0x2D	// Minus sign -- character set 0
									: (FLMUINT16)0x20;// Space -- character set zero
					continue;

				// Skip all of the unknown stuff

				case UNK_GT_255_CODE:
					uiObjLength = 3 + FB2UW( pucStr + 1);
					continue;
				case UNK_LE_255_CODE:
					uiObjLength = 2 + (FLMUINT16)*(pucStr + 1);
					continue;
				case UNK_EQ_1_CODE:
					uiObjLength = 2;
					continue;
				case EXT_CHAR_CODE:
					uiObjLength = 3;

					// Character set followed by character

					ui16WpChr = (((FLMUINT16)*(pucStr + 1) << 8)
							  + (FLMUINT16)*(pucStr + 2));
					continue;
				case OEM_CODE:

					// OEM characters are always >= 128
					// Use character set zero to process them.

					uiObjLength = 2;
					ui16WpChr = (FLMUINT16)*(pucStr + 1);
					continue;
				case UNICODE_CODE:			// Unconvertable UNICODE code
					uiObjLength = 3;

					// Unicode character followed by unicode character set

					unichr = (FLMUINT16)(((FLMUINT16)*(pucStr + 1) << 8)
								+ (FLMUINT16)*(pucStr + 2));
					ui16WpChr = UNK_UNICODE_CODE;
					continue;
				default:

					// Should not happen, but don't return an error

					flmAssert( 0);
					continue;
			}
		}

		// If we didn't get a character, break out of while loop.

		if (!ui16WpChr)
		{
			break;
		}

		// fwpCheckDoubleCollation modifies ui16WpChr if a digraph or a double
		// character sequence is found.  If a double character is found, pucStr
		// is incremented and ui16WpChr2 is set to 1.  If a digraph is found,
		// pucStr is not changed, but ui16WpChr contains the first character and
		// ui16WpChr2 contains the second character of the digraph.

		if (uiLanguage != US_LANG)
		{
			ui16WpChr2 = fwpCheckDoubleCollation( &ui16WpChr, &bTwoIntoOne,
											&pucStr, uiLanguage);
		}

		// Save the case bit

		if (!uiUppercaseFlag)
		{

			// charIsUpper returns TRUE if upper case, 0 if lower case.

			if (!charIsUpper( ui16WpChr))
			{
				uiFlags |= HAD_LOWER_CASE;
			}
			else
			{

				// Set if upper case.

				SET_BIT( ucCaseBits, uiCaseBitPos);
			}
			uiCaseBitPos++;
		}

		// Handle OEM characters, non-collating characters,
		// characters with subcollating values, double collating
		// values.

		// Get the collated value from the WP character-if not collating value

		if ((pucCollatedStr[ uiColLen++] =
				(FLMBYTE)(fwpGetCollation( ui16WpChr, uiLanguage))) >= COLS11)
		{
			FLMUINT	uiTemp;

			// Save OEM characters just like non-collating characters

			// If lower case, convert to upper case.

			if (!charIsUpper( ui16WpChr))
			{
				ui16WpChr &= ~1;
			}

			// No collating value given for this WP char.
			// Save original WP char (2 bytes) in subcollating
			// buffer.

			// 1110 is a new code that will store an insert over
			// the character OR a non-convertable unicode character.
			// Store with the same alignment as "store_extended_char"
			// below.

			// 11110 is code for unmappable UNICODE value.
			// A value 0xFE will be the collation value.  The sub-collation
			// value will be 0xFFFF followed by the UNICODE value.
			// Be sure to eat an extra case bit.

			// See specific Hebrew and Arabic comments in the
			//	switch statement below.

			// Set the next byte that follows in the sub collation buffer.

			ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;

			if (bHebrewArabic && (pucCollatedStr [uiColLen-1] == COLS0_ARABIC))
			{

				// Store first bit of 1110, fall through & store remaining 3 bits

				SET_BIT( ucSubColBuf, uiSubColBitPos);
				uiSubColBitPos++;

				// Don't store collation value

				uiColLen--;
			}
			else if (unichr)
			{
				ui16WpChr = unichr;
				unichr = 0;

				// Store 11 out of 11110

				SET_BIT( ucSubColBuf, uiSubColBitPos);
				uiSubColBitPos++;
				SET_BIT( ucSubColBuf, uiSubColBitPos);
				uiSubColBitPos++;
				if (!uiUppercaseFlag)
				{
					ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;

					// Set upper case bit.

					SET_BIT( ucCaseBits, uiCaseBitPos);
					uiCaseBitPos++;
				}
			}
store_extended_char:

			// Set the next byte that follows in the sub collation buffer.

			ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
			ucSubColBuf [(uiSubColBitPos + 16) >> 3] = 0;
			uiFlags |= HAD_SUB_COLLATION;

			// Set 110 bits in sub-collation - continued from above.
			// No need to explicitly set the zero, but must increment
			// for it.

			SET_BIT( ucSubColBuf, uiSubColBitPos);
			uiSubColBitPos++;
			SET_BIT( ucSubColBuf, uiSubColBitPos);
			uiSubColBitPos += 2;

			// store_aligned_word: This label is not referenced.
			// Go to the next byte boundary to write the character.

			uiSubColBitPos = (uiSubColBitPos + 7) & (~7);
			uiTemp = BYTES_IN_BITS( uiSubColBitPos);

			// Need to big-endian - so it will sort correctly.

			ucSubColBuf [uiTemp] = (FLMBYTE)(ui16WpChr >> 8);
			ucSubColBuf [uiTemp + 1] = (FLMBYTE)(ui16WpChr);
			uiSubColBitPos += 16;
			ucSubColBuf [uiSubColBitPos >> 3] = 0;
		}
		else
		{
			// Had a collation value

			// Add the lower/uppercase bit if a mixed case output.

			// If not lower ASCII set - check diacritic value for sub-collation

			if (!(ui16WpChr & 0xFF00))
			{

				// ASCII character set - set a single 0 bit - just need to
				// increment to do this.

				uiSubColBitPos++;
			}
			else
			{
				FLMBYTE	ucTmpChar = (FLMBYTE)ui16WpChr;
				FLMBYTE	ucCharSet = (FLMBYTE)(ui16WpChr >> 8);

				// Convert char to uppercase because case information
				// is stored above.  This will help
				// ensure that the "ETA" doesn't sort before "eta"

				if (!charIsUpper(ui16WpChr))
				{
					ui16WpChr &= ~1;
				}

				switch (ucCharSet)
				{
					case CHSMUL1:	// Multinational 1

						// If we cannot break down a char into base and
						// diacritic we cannot combine the charaacter
						// later when converting back the key.  In that case,
						// write the entire WP char in the sub-collation area.

						if (fwpCh6Brkcar( ui16WpChr, &ui16Base, &ui16SubColVal))
						{
							goto store_extended_char;
						}

						// Write the FLAIM diacritic sub-collation value.
						// Prefix is 2 bits "10".  Remember to leave
						// "111" alone for the future.
						// NOTE: The "unlaut" character must sort after the "ring"
						// character.

						ui16SubColVal = ((ui16SubColVal & 0xFF) == umlaut	&&
											  (uiLanguage == SU_LANG ||
												uiLanguage == SV_LANG ||
												uiLanguage == CZ_LANG ||
												uiLanguage == SL_LANG))
							?	(FLMUINT16)(fwp_dia60Tbl[ ring] + 1)
							:	(FLMUINT16)(fwp_dia60Tbl[ ui16SubColVal & 0xFF]);

store_sub_col:
						// Set the next byte that follows in the sub collation buffer.

						ucSubColBuf[ (uiSubColBitPos + 8) >> 3] = 0;
						uiFlags |= HAD_SUB_COLLATION;

						// Set the 10 bits - no need to explicitly set the zero, but
						// must increment for it.

						SET_BIT( ucSubColBuf, uiSubColBitPos);
						uiSubColBitPos += 2;

						// Set sub-collation bits.

						SETnBITS( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal);
						uiSubColBitPos += 5;
						break;

					case CHSGREK:		// Greek

						if (ucTmpChar >= 52  ||		// Keep case bit for 52-69 else ignore
          				 ui16WpChr == 0x804 ||	// [ 8,4] BETA Medial | Terminal
							 ui16WpChr == 0x826) 	// [ 8,38] SIGMA terminal
						{
							goto store_extended_char;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;

					case CHSCYR:
						if (ucTmpChar >= 144)
						{
							goto store_extended_char;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;

						// VISIT: Georgian covers 208-249 - no collation defined yet

						break;

					case CHSHEB:		// Hebrew

						// Three sections in Hebrew:
						//		0..26 - main characters
						//		27..83 - accents that apear over previous character
						//		84..118- dagesh (ancient) hebrew with accents

						// Because the ancient is only used for sayings & scriptures
						// we will support a collation value and in the sub-collation
						// store the actual character because sub-collation is in
						// character order.

            		if (ucTmpChar >= 84)		// Save ancient - value 84 and above
						{
							goto store_extended_char;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;

					case CHSARB1:		// Arabic 1

						// Three sections in Arabic:
						//		00..37  - accents that display OVER a previous character
						//		38..46  - symbols
						//		47..57  - numbers
						//		58..163 - characters
						//		164     - hamzah accent
						//		165..180- common characters with accents
						//		181..193- ligatures - common character combinations
						//		194..195- extensions - throw away when sorting

						if (ucTmpChar <= 46)
						{
							goto store_extended_char;	// save original character
						}

						if (pucCollatedStr[ uiColLen-1] == COLS10a+1)	// Alef?
						{
							ui16SubColVal = (ucTmpChar >= 165)
								? (FLMUINT16)(fwp_alefSubColTbl[ ucTmpChar - 165 ])
								: (FLMUINT16)7;			// Alef subcol value
							goto store_sub_col;
						}
						if (ucTmpChar >= 181)			// Ligatures - char combination
						{
							goto store_extended_char;	// save original character
						}

						if (ucTmpChar == 64)				// taa exception
						{
							ui16SubColVal = 8;
							goto store_sub_col;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;

					case CHSARB2:			// Arabic 2

						// There are some characters that share the same slot
						// Check the bit table if above character 64

						if (ucTmpChar >= 64 &&
							 fwp_ar2BitTbl[(ucTmpChar - 64) >> 3] &
								(0x80 >> (ucTmpChar & 0x07)))
						{
							goto store_extended_char;	// Will save original
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;

					default:

						// Increment bit position to set a zero bit.

						uiSubColBitPos++;
						break;
				}
			}


			// Now let's worry about double character sorting

			if (ui16WpChr2)
			{
				if (pbOriginalCharsLost)
				{
					*pbOriginalCharsLost = TRUE;
				}

				// Set the next byte that follows in the sub collation buffer.

				ucSubColBuf [(uiSubColBitPos + 7) >> 3] = 0;

				if (bTwoIntoOne)
				{

					// Sorts after character in ui16WpChr after call to
					// fwpCheckDoubleCollation
					// Write the char 2 times so lower/upper bits are correct.
					// Could write infinite times because of collation rules.

					pucCollatedStr[ uiColLen] = ++pucCollatedStr[ uiColLen-1];
					uiColLen++;

					// If original was upper case, set one more upper case bit

					if (!uiUppercaseFlag)
					{
						ucCaseBits[ (uiCaseBitPos + 7) >> 3] = 0;
						if (!charIsUpper( (FLMUINT16) *(pucStr - 1)))
						{
							uiFlags |= HAD_LOWER_CASE;
						}
						else
						{
							SET_BIT( ucCaseBits, uiCaseBitPos);
						}
						uiCaseBitPos++;
					}

					// Take into account the diacritical space

					uiSubColBitPos++;
				}
				else
				{

					// We have a digraph, get second collation value

					pucCollatedStr[ uiColLen++] =
						(FLMBYTE)(fwpGetCollation( ui16WpChr2, uiLanguage));

					// Normal case, assume no diacritics set

					uiSubColBitPos++;

					// If first was upper, set one more upper bit.

					if (!uiUppercaseFlag)
					{
						ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;
						if (charIsUpper( ui16WpChr))
						{
							SET_BIT( ucCaseBits, uiCaseBitPos);
						}
						uiCaseBitPos++;

						// no need to reset the uiFlags
					}
				}
			}
		}

		// Check to see if uiColLen is at some overflow limit.

		if (uiColLen >= uiCharLimit ||
			 uiColLen + BYTES_IN_BITS( uiSubColBitPos) +
						  BYTES_IN_BITS( uiCaseBitPos) >= uiTargetColLen)
		{

			// We hit the maximum number of characters.

			if (pucStr < pucStrEnd)
			{
				bDataTruncated = TRUE;
			}
			break;
		}
	}

	// END OF WHILE LOOP

	if (puiCollationLen)
	{
		*puiCollationLen = uiColLen;
	}

	// Add the first substring marker - also serves as making the string non-null.

	if (bFirstSubstring)
	{
		pucCollatedStr [uiColLen++] = COLL_FIRST_SUBSTRING;
	}

	if (bDataTruncated)
	{
		pucCollatedStr[ uiColLen++ ] = COLL_TRUNCATED;
	}

	// 10/20/98 - Add code to return NOTHING if no values found.

	if (!uiColLen && !uiSubColBitPos)
	{
		if (puiCaseLen)
		{
			*puiCaseLen = 0;
		}
		goto Exit;
	}

	// Store extra zero bit in the sub-collation area for Hebrew/Arabic

	if (bHebrewArabic)
	{
		uiSubColBitPos++;
	}

	// Done putting the string into 4 sections - build the COLLATED KEY
	// Don't set uiUppercaseFlag earlier than here because SC_LOWER may be zero

	uiUppercaseFlag = (uiLanguage == GR_LANG) ? SC_LOWER : SC_UPPER;

	// The default terminating characters is (COLL_MARKER|SC_UPPER)
	// Did we write anything to the subcollation area?

	if (uiFlags & HAD_SUB_COLLATION)
	{
		// Writes out a 0x7

		pucCollatedStr [uiColLen++] = COLL_MARKER | SC_SUB_COL;

		// Move the sub-collation into the collating string

		uiLength = BYTES_IN_BITS( uiSubColBitPos);
		f_memcpy( &pucCollatedStr[uiColLen], ucSubColBuf, uiLength);
		uiColLen += uiLength;
	}

	// Move the upper/lower case stuff - force bits for Greek ONLY
	// This is such a small size that a memcpy is not worth it

	if (uiFlags & HAD_LOWER_CASE)
	{
		FLMUINT		uiNumBytes = BYTES_IN_BITS( uiCaseBitPos);
		FLMBYTE *	pucCasePtr = ucCaseBits;

		// Output the 0x5

		pucCollatedStr [uiColLen++] = (FLMBYTE)(COLL_MARKER | SC_MIXED);
		if (puiCaseLen)
		{
			*puiCaseLen = uiNumBytes + 1;
		}

		if (uiUppercaseFlag == SC_LOWER)
		{

			// Negate case bits for languages (like GREEK) that sort
			// upper case before lower case.

			while (uiNumBytes--)
			{
				pucCollatedStr [uiColLen++] = ~(*pucCasePtr++);
			}
		}
		else
		{
			while (uiNumBytes--)
			{
				pucCollatedStr [uiColLen++] = *pucCasePtr++;
			}
		}
	}
	else
	{

		// All characters are either upper or lower case, as determined
		// by uiUppercaseFlag.

		pucCollatedStr [uiColLen++] = (FLMBYTE)(COLL_MARKER | uiUppercaseFlag);
		if( puiCaseLen)
		{
			*puiCaseLen = 1;
		}
	}
Exit:

	// Set length return value.

	if( pbDataTruncated)
	{
		*pbDataTruncated = bDataTruncated;
	}

	*puiCollatedStrLen = uiColLen;
	return( rc);
}