Files
mars-flaim/flaim/src/kyasia1.cpp
dsandersoremutah c55dab446f Renamed version4 to flaim and version5 to xflaim
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2006-01-27 21:06:39 +00:00

358 lines
10 KiB
C++

//-------------------------------------------------------------------------
// Desc: String collation for Asian languages.
// Tabs: 3
//
// Copyright (c) 1993-2006 Novell, Inc. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of version 2 of the GNU General Public
// License as published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com
//
// $Id: kyasia1.cpp 12312 2006-01-19 15:14:03 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------
#include "flaimsys.h"
/****************************************************************************
Desc: Convert a text string to a collated string.
****************************************************************************/
RCODE AsiaFlmTextToColStr(
const FLMBYTE * Str, /* Points to the internal TEXT string */
FLMUINT StrLen, /* Length of the internal TEXT string */
FLMBYTE * ColStr, /* Output collated string */
FLMUINT * ColStrLenRV, /* Collated string length return value */
/* Input value is MAX num of bytes in buffer*/
FLMUINT UppercaseFlag, /* Set if to convert to uppercase */
FLMUINT * puiCollationLen, /* Returns the collation bytes length */
FLMUINT * puiCaseLen, /* Returns length of case bytes */
FLMUINT uiCharLimit, /* Max number of characters in this key piece */
FLMBOOL bFirstSubstring, /* TRUE is this is the first substring key */
FLMBOOL * pbDataTruncated)
{
RCODE rc = FERR_OK;
const FLMBYTE * pszStrEnd; /* Points to the end of the String */
FLMUINT Length; /* Temporary variable for length */
FLMUINT uiTargetColLen = *ColStrLenRV - 12;/* 6=ovhd,6=worst char*/
FLMBYTE SubColBuf[ MAX_SUBCOL_BUF+1]; /* Holds Sub-col values (diac) */
FLMBYTE LowUpBuf[ MAX_LOWUP_BUF+MAX_LOWUP_BUF+2]; /* 2 case bits/wpchar */
FLMUINT ColLen; /* Return value of collated length */
FLMUINT SubColBitPos; /* Index into Sub-collation buffer */
FLMUINT LowUpBitPos; /* Lower upper bit position */
FLMUINT Flags; /* Clear all bit flags */
FLMUINT16 NextWpChar; /* next WP character */
FLMUINT16 UnicodeChar; /* Unicode character */
FLMUINT16 ColValue; /* Set to zero for the first time */
FLMBOOL bDataTruncated = FALSE;
ColLen = SubColBitPos = LowUpBitPos = Flags = 0;
UnicodeChar = ColValue = 0;
// Don't allow any key component to exceed 256 bytes regardless of the
// user-specified character or byte limit. The goal is to prevent
// any single key piece from consuming too much of the key (which is
// limited to 640 bytes) and thus "starving" other pieces, resulting
// in a key overflow error.
if( uiTargetColLen > 256)
{
uiTargetColLen = 256;
}
// Make sure SubColBuf and LowUpBuf are set to 0's
f_memset( SubColBuf, 0, sizeof( SubColBuf));
f_memset( LowUpBuf, 0, sizeof( LowUpBuf));
pszStrEnd = &Str[ StrLen];
NextWpChar = 0;
while( (Str < pszStrEnd) || NextWpChar || UnicodeChar)
{
FLMUINT16 WpChar; /* Current WP character */
FLMUINT ObjLength;
FLMUINT16 SubColVal; /* Sub-collated value (diacritic) */
FLMBYTE CaseFlags;
// Get the next character from the TEXT String. NOTE: OEM characters
// will be returned as character set ZERO, the character will be
// greater than 127.
WpChar = NextWpChar;
for( NextWpChar = 0;
(!WpChar || !NextWpChar) && !UnicodeChar && (Str < pszStrEnd);
Str += ObjLength ) /* Inc Str to skip what its pointing at*/
{
FLMBYTE ObjType;
FLMBYTE CurByte;
FLMUINT16 CurWpChar = 0;
CurByte = *Str;
ObjType = (FLMBYTE)(GedTextObjType( CurByte));
ObjLength = 1;
switch( ObjType)
{
case ASCII_CHAR_CODE: /* 0nnnnnnn */
CurWpChar = (FLMUINT16)CurByte; /* Character set zero is assumed */
break;
case CHAR_SET_CODE: /* 10nnnnnn */
ObjLength = 2;
CurWpChar =
(FLMUINT16)(((FLMUINT16)(CurByte & (~CHAR_SET_MASK)) << 8) /* Char set */
+ (FLMUINT16)*(Str + 1)); /* Character */
break;
case WHITE_SPACE_CODE: /* 110nnnnn */
CurByte &= (~WHITE_SPACE_MASK);
CurWpChar = ((CurByte == HARD_HYPHEN) ||
(CurByte == HARD_HYPHEN_EOL) ||
(CurByte == HARD_HYPHEN_EOP)
)
? 0x2D /* Minus sign - character set zero*/
: 0x20; /* Space -- character set zero */
break;
/* Skip all of the unknown stuff */
case UNK_GT_255_CODE:
ObjLength = 1 + sizeof( FLMUINT16) + FB2UW( Str + 1);
break;
case UNK_LE_255_CODE:
ObjLength = 2 + (FLMUINT16)*(Str + 1);
break;
case UNK_EQ_1_CODE:
ObjLength = 2;
break;
case EXT_CHAR_CODE:
ObjLength = 3;
CurWpChar =
(FLMUINT16)(((FLMUINT16)*(Str + 1) << 8) /* Character set */
+ (FLMUINT16)*(Str + 2)); /* Character */
break;
case OEM_CODE: /* Should never get these */
ObjLength = 2;
/* OEM characters are always >= 128. */
/* We use character set zero to process them. */
CurWpChar = (FLMUINT16)*(Str + 1);
break;
case UNICODE_CODE: /* Unconvertable UNICODE code */
ObjLength = 3;
UnicodeChar = (FLMUINT16)(((FLMUINT16)*(Str + 1) << 8) /* Char set */
+ (FLMUINT16)*(Str + 2)); /* Character */
CurWpChar = 0;
break;
default: /* should NEVER happen: bug if does */
continue;
} /* end switch */
if( !WpChar) /* Change which needs changing */
WpChar = CurWpChar;
else
NextWpChar = CurWpChar;
} /* end of FOR statement */
/**-----------------------------------------------------------
*** If we didn't get a character, break out of the outer
*** processing loop.
***----------------------------------------------------------*/
if( !WpChar && !UnicodeChar)
break; /* leave WHILE statement */
if( WpChar)
{
if( fwpAsiaGetCollation( WpChar, NextWpChar, ColValue, &ColValue,
&SubColVal, &CaseFlags, (FLMUINT16)UppercaseFlag ) == 2)
{
/* Took the NextWpChar value */
NextWpChar = 0; /* Force to skip this value */
}
}
else // use the UnicodeChar value for this pass
{
// This handles all of the UNICODE characters that could not
// be converted to WP characters - which will include most
// of the Asian characters.
CaseFlags = 0;
if (UnicodeChar < 0x20)
{
ColValue = 0xFFFF;
// Setting SubColVal to a high code will ensure
// that the code that the UnicodeChar will be stored
// in its full 16 bits in the sub-collation area.
SubColVal = 0xFFFF;
// NOTE: UnicodeChar SHOULD NOT be set to zero here.
// It will be set to zero below.
}
else
{
ColValue = UnicodeChar;
SubColVal = 0;
UnicodeChar = 0;
}
}
/* Store the values in 2 bytes */
ColStr[ ColLen++ ] = (FLMBYTE)(ColValue >> 8);
ColStr[ ColLen++ ] = (FLMBYTE)(ColValue & 0xFF);
if( SubColVal)
{
Flags |= HAD_SUB_COLLATION;
if( SubColVal <= 31) /* 5 bit - store bits 10 */
{
SET_BIT( SubColBuf, SubColBitPos);
SubColBitPos += 1 + 1; /* Stores a zero */
SETnBITS( 5, SubColBuf, SubColBitPos, SubColVal);
SubColBitPos += 5;
}
else /* 2 bytes - store bits 110 or 11110 */
{
FLMUINT Temp;
SET_BIT( SubColBuf, SubColBitPos);
SubColBitPos++;
SET_BIT( SubColBuf, SubColBitPos);
SubColBitPos++;
if( !WpChar && UnicodeChar) /* Store as "11110" */
{
SubColVal = UnicodeChar;
UnicodeChar = 0;
SET_BIT( SubColBuf, SubColBitPos );
SubColBitPos++;
SET_BIT( SubColBuf, SubColBitPos );
SubColBitPos++;
}
SubColBitPos++; /* Skip past the zero */
/* Go to the next byte boundary to write the WP char */
SubColBitPos = (SubColBitPos + 7) & (~7);
Temp = BYTES_IN_BITS( SubColBitPos);
/* Need to store HIGH-Low - PC format is Low-high! */
SubColBuf[ Temp ] = (FLMBYTE) (SubColVal >> 8);
SubColBuf[ Temp+1] = (FLMBYTE) (SubColVal);
SubColBitPos += 16;
}
}
else
SubColBitPos++;
/* Save case information - always 2 bits worth for asian */
if( CaseFlags & 0x02)
{
SET_BIT( LowUpBuf, LowUpBitPos);
}
LowUpBitPos++;
if( CaseFlags & 0x01)
{
SET_BIT( LowUpBuf, LowUpBitPos);
}
LowUpBitPos++;
// Check to see if ColLen is within 1 byte of max
if( (ColLen >= uiCharLimit) ||
(ColLen + BYTES_IN_BITS( SubColBitPos) +
BYTES_IN_BITS( LowUpBitPos) >= uiTargetColLen))
{
// Still something left?
if ((Str < pszStrEnd) || NextWpChar || UnicodeChar)
{
bDataTruncated = TRUE;
}
// Hit the max. number of characters
break;
}
}
if( puiCollationLen)
{
*puiCollationLen = ColLen;
}
// Add the first substring marker - also serves as making the string non-null.
if( bFirstSubstring)
{
ColStr[ ColLen++ ] = 0;
ColStr[ ColLen++ ] = COLL_FIRST_SUBSTRING;
}
if( bDataTruncated)
{
ColStr[ ColLen++ ] = 0;
ColStr[ ColLen++ ] = COLL_TRUNCATED;
}
if( !ColLen && !SubColBitPos)
{
if( puiCaseLen)
{
*puiCaseLen = 0;
}
goto Exit;
}
// Done putting the String into 3 sections - build the COLLATED KEY
if( Flags & HAD_SUB_COLLATION)
{
ColStr[ ColLen++ ] = 0;
ColStr[ ColLen++ ] = COLL_MARKER | SC_SUB_COL;
// Move the Sub-collation (diacritics) into the collating String
Length = (FLMUINT)(BYTES_IN_BITS(SubColBitPos));
f_memcpy( &ColStr[ColLen], SubColBuf, Length);
ColLen += Length;
}
// Always represent the marker as 2 bytes and case bits in asia
ColStr[ ColLen++ ] = 0;
ColStr[ ColLen++ ] = COLL_MARKER | SC_MIXED;
Length = (FLMUINT)(BYTES_IN_BITS( LowUpBitPos ));
f_memcpy( &ColStr[ ColLen ], LowUpBuf, Length);
if( puiCaseLen)
{
*puiCaseLen = (FLMUINT)(Length + 2);
}
ColLen += Length;
Exit:
if( pbDataTruncated)
{
*pbDataTruncated = bDataTruncated;
}
*ColStrLenRV = (FLMUINT)ColLen;
return( rc);
}