Files
mars-flaim/flaim/src/f_uncoll.cpp
dsandersoremutah c55dab446f Renamed version4 to flaim and version5 to xflaim
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2006-01-27 21:06:39 +00:00

575 lines
19 KiB
C++

//-------------------------------------------------------------------------
// Desc: Uncollation routines for converting from collated string to WP string.
// Tabs: 3
//
// Copyright (c) 1992-2001,2003-2006 Novell, Inc. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of version 2 of the GNU General Public
// License as published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com
//
// $Id: f_uncoll.cpp 12245 2006-01-19 14:29:51 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------
#include "flaimsys.h"
/**-----------------------------------------
*** External tables
*** Could be far in another data segment
***----------------------------------------*/
/* From COLLATE1.C */
/* Collation value -> WP character; indexed in this file by (collation value - COLLS). */
extern FLMUINT16 colToWPChr[]; /* Converts collated value to WP character */
/* Indexed in this file by the 5-bit diacritic sub-collation value; the result is passed to fwpCh6Cmbcar(). */
extern FLMBYTE ml1_COLtoD[]; /* Diacritic conversions */
/* Collation value -> WP character for values >= COLS10a; indexed by (collation value - COLS10a). */
extern FLMUINT16 HebArabColToWPChr[];
/* Indexed in this file by the 5-bit sub-collation value when the base character is in the 0x0Dxx range. */
extern FLMUINT16 ArabSubColToWPChr[];
/**-----------------------------------------
*** Local Static Routine Prototypes
***----------------------------------------*/
/**----------------------------------------------------------------
*** The version using the table uses 34 ticks and 29 bytes ONLY
*** because the turbo optimizer uses register variables better.
*** The other version below uses 39 ticks and 33 bytes.
*** Macro not moved to other calls in f_tocoll or kycompnd because
*** these are rarely called areas right now.
***---------------------------------------------------------------*/
/* Decode the collation bytes (plus any sub-collation / case info) into a WP word string. */
FSTATIC FLMUINT FWWSGetColStr( /* Returns byte length of word string*/
FLMBYTE * fColStr,
FLMUINT * fcStrLenRV,
FLMBYTE * wordStr,
FLMUINT fWPLang,
FLMBOOL * pbDataTruncated,
FLMBOOL * pbFirstSubstring);
/* Apply the sub-collation bit stream to a word string; returns the number of bits consumed. */
FSTATIC FLMUINT FWWSCmbSubColBuf(
FLMBYTE * wordStr,
FLMUINT * wdStrLenRV,
FLMBYTE * subColBuf,
FLMBOOL hebrewArabicFlag );
/* Lower-case the words flagged by a case bit string; returns the number of case bytes used. */
FSTATIC FLMUINT FWWSToMixed(
FLMBYTE * wordStr,
FLMUINT wdStrLen,
FLMBYTE * lowUpBitStr,
FLMUINT fWPLang );
/**************************************************************************
Desc:		Convert a FLAIM collated string back into a FLAIM TEXT string.
			The collated string is first decoded into an intermediate WP
			word string (2 bytes per character: character, character set)
			and that word string is then re-encoded into textStr.
Ret:		Number of bytes written to textStr.  Returns 0 if the temporary
			word buffer could not be allocated.
Notes:	A stack buffer is used for the word string unless the collated
			string is longer than LOCAL_CHARS bytes, in which case a buffer
			of MAX_KEY_SIZ * 2 bytes is allocated and freed before returning.
			textStr has no length parameter - the caller must supply a buffer
			large enough for the converted text.
			When postBuf is non-NULL the case information is taken from
			postBuf starting at *postBytesRV, and *postBytesRV is advanced
			past the bytes that were used.
***************************************************************************/
FLMUINT FColStrToText(
	FLMBYTE *	fColStr,				/* Points to the Flaim collated string */
	FLMUINT *	fcStrLenRV,			/* Length of the Flaim collated string */
	FLMBYTE *	textStr,				/* Output string to build - TEXT string */
	FLMUINT		fWPLang,				/* FLAIM WP language number */
	FLMBYTE *	postBuf,				/* Lower/upper POST buffer or NULL */
	FLMUINT *	postBytesRV,		/* Return next position to use in postBuf */
	FLMBOOL *	pbDataTruncated,	/* Sets to TRUE if data had been truncated */
	FLMBOOL *	pbFirstSubstring)	/* Sets to TRUE if first substring */
{
#define LOCAL_CHARS		150
	FLMBYTE		wordStr[ LOCAL_CHARS * 2 + LOCAL_CHARS / 5 ];	// Sample + 20%
	FLMBYTE *	wsPtr = NULL;
	FLMBYTE *	wsAllocatedWsPtr = NULL;
	FLMUINT		wsLen;
	FLMUINT		textLen;
	FLMBYTE *	textPtr;

	/* Pick the work buffer: stack for short keys, heap otherwise */

	if( *fcStrLenRV > LOCAL_CHARS)
	{
		if( RC_BAD( f_alloc( MAX_KEY_SIZ * 2, &wsPtr)))
		{
			return( 0);
		}
		wsAllocatedWsPtr = wsPtr;
	}
	else
	{
		wsPtr = wordStr;
	}

	if( (fWPLang >= FIRST_DBCS_LANG) &&
		 (fWPLang <= LAST_DBCS_LANG))
	{
		wsLen = AsiaConvertColStr( fColStr, fcStrLenRV, wsPtr,
						pbDataTruncated, pbFirstSubstring);

		if( postBuf)
		{
			FLMUINT	postBytes = *postBytesRV + 2;		/* Skip past marker */

			/* AsiaParseCase may change wsLen */
			postBytes += AsiaParseCase( wsPtr, &wsLen, &postBuf[ postBytes]);
			*postBytesRV = postBytes;
		}
	}
	else
	{
		wsLen = FWWSGetColStr( fColStr, fcStrLenRV, wsPtr, fWPLang,
						pbDataTruncated, pbFirstSubstring);

		/* If a post buffer is sent - turn unflagged chars to lower case */

		if( postBuf)
		{
			FLMUINT	postBytes = *postBytesRV;

			/* The marker byte is always consumed; case bits follow only
				when the marker says the string is mixed case. */

			if( postBuf[ postBytes++] == (COLL_MARKER | SC_MIXED))
			{
				postBytes += FWWSToMixed( wsPtr, wsLen,
										&postBuf[ postBytes], fWPLang);
			}
			*postBytesRV = postBytes;
		}
	}

	/**-------------------------------------------
	***  Copy word string to TEXT string area
	***------------------------------------------*/

	wsLen >>= 1;						/* Convert # of bytes to # of words */
	textPtr = textStr;

	while( wsLen--)
	{
		FLMBYTE	ch = *wsPtr++;		/* Character */
		FLMBYTE	cSet = *wsPtr++;	/* Character set */

		if( (!cSet) && (ch <= 127))
		{
			/* Character set zero, 7-bit character: stored as one byte */

			*textPtr++ = ch;
		}
		else if( cSet < 63)
		{
			/* Character sets below 63 are stored in two bytes; the
				character set is folded into the code byte. */

			*textPtr++ = (FLMBYTE)(CHAR_SET_CODE | cSet);
			*textPtr++ = ch;
		}
		else if( cSet == 0xFF && ch == 0xFF)
		{
			/* 0xFFFF marks an unconvertible Unicode character; the actual
				16-bit value is carried in the NEXT word.  If the marker is
				the last word there is nothing to read - stop here rather
				than decrementing wsLen below zero, which would wrap the
				unsigned counter and run the loop off the end of both
				buffers. */

			if( !wsLen)
			{
				break;
			}

			*textPtr++ = UNICODE_CODE;
			*textPtr++ = *(wsPtr + 1);		/* Character set */
			*textPtr++ = *wsPtr;				/* Character */
			wsPtr += 2;
			wsLen--;								/* The marker + value used 2 words */
		}
		else
		{
			/* Character sets >= 63 need the three byte extended form */

			*textPtr++ = EXT_CHAR_CODE;
			*textPtr++ = cSet;
			*textPtr++ = ch;
		}
	}

	textLen = (FLMUINT)(textPtr - textStr);		/* Compute total length */

	if( wsAllocatedWsPtr != NULL)
	{
		f_free( &wsAllocatedWsPtr);
	}

	return( textLen);
}
/*****************************************************************************
Desc:		Decode a Flaim collated string back into a WP word string
			(2 bytes per character).  The collation bytes are mapped back to
			upper-case WP characters first; the optional first-substring and
			truncation markers, the sub-collation (diacritic) bits and the
			case bits that may follow are then applied.
In:		*fcStrLenRV holds the number of bytes available in fColStr.
Out:		*fcStrLenRV is set to the number of bytes consumed from fColStr.
			*pbFirstSubstring / *pbDataTruncated are set to TRUE when the
			matching marker is present (never reset to FALSE here); either
			pointer may be NULL.
Ret:		Length of the new WP word string in bytes (terminator excluded).
Notes:	wordStr has no length parameter; a 2 byte NULL terminator is
			written after the last decoded character.
*****************************************************************************/
FSTATIC FLMUINT FWWSGetColStr(
	FLMBYTE *	fColStr,				/* Points to the Flaim collated string */
	FLMUINT *	fcStrLenRV,			/* Length of the Flaim collated string */
	FLMBYTE *	wordStr,				/* Output string to build - WP word string */
	FLMUINT		fWPLang,				/* FLAIM WP language number */
	FLMBOOL *	pbDataTruncated,	/* Set to TRUE if truncated */
	FLMBOOL *	pbFirstSubstring)	/* Sets to TRUE if first substring */
{
	FLMBYTE *	wsPtr = wordStr;			/* Next free slot in the word string */
	FLMUINT		length = *fcStrLenRV;	/* Bytes of fColStr not yet consumed */
	FLMUINT		pos = 0;						/* Position in fColStr[] */
	FLMUINT		bitPos;						/* Bits used by the sub-collation */
	FLMUINT		colChar;						/* Not portable if a FLMBYTE value */
	FLMUINT		wdStrLen;
	FLMBOOL		hebrewArabicFlag = 0;	/* Set if hebrew/arabic language */

	/**
	***  WARNING:
	***  The code is duplicated for performance reasons.
	***  The US code below is much more optimized so
	***  any changes must be done twice.
	**/

	if( fWPLang != US_LANG)					/* Code for NON-US languages */
	{
		if( (fWPLang == AR_LANG) ||		/* Arabic */
			 (fWPLang == FA_LANG) ||		/* Farsi - persian */
			 (fWPLang == HE_LANG) ||		/* Hebrew */
			 (fWPLang == UR_LANG))			/* Urdu */
		{
			hebrewArabicFlag = TRUE;		/* Add sindhi, pashto, kurdish, malay */
		}

		// MVSVISIT: will not work correctly on IBM390 - need to change toolkit tables.

		while( length && (fColStr[ pos] > MAX_COL_OPCODE))
		{
			length--;
			colChar = (FLMUINT) fColStr[ pos++];

			switch( colChar)
			{
				case COLS9+4:		/* ch in spanish */
				case COLS9+11:		/* ch in czech */

					/* Digraph: emit 'C' now, 'H' is emitted after the switch */

					UW2FBA( (FLMUINT16) 'C', wsPtr);
					wsPtr += 2;
					colChar = (FLMUINT) 'H';

					/* The collation value is stored twice.  Skip the duplicate
						and keep length in step with pos so that the bounds
						checks that follow stay accurate. */

					if( length)
					{
						length--;
						pos++;
					}
					break;

				case COLS9+17:		/* ll in spanish */

					/* Digraph: emit the first 'L' now, the second after the switch */

					UW2FBA( (FLMUINT16) 'L', wsPtr);
					wsPtr += 2;
					colChar = (FLMUINT) 'L';

					/* Skip the duplicate collation byte (see above) */

					if( length)
					{
						length--;
						pos++;
					}
					break;

				case COLS0:			/* Non collating character or OEM character */

					/* Actual character is in sub-collation area */

					colChar = (FLMUINT) 0xFFFF;
					break;

				default:

					/* Watch out COLS10h has () around it for subtraction */

					if( hebrewArabicFlag && (colChar >= COLS10h))
					{
						colChar = (colChar < COLS10a)		/* Hebrew only? */
							? (FLMUINT) (0x900 + (colChar - (COLS10h)))					/* Hebrew */
							: (FLMUINT) (HebArabColToWPChr[ colChar - (COLS10a)]);	/* Arabic */
					}
					else
					{
						colChar = (FLMUINT) colToWPChr[ colChar - COLLS];
					}
					break;
			}

			UW2FBA( (FLMUINT16) colChar, wsPtr);	/* Put the WP char in the word string */
			wsPtr += 2;
		}
	}
	else												/* US Sorting - optimized */
	{
		while( length && (fColStr[ pos] > MAX_COL_OPCODE))
		{
			length--;

			/* Move in the WP value given uppercase collated value */

			colChar = (FLMUINT) fColStr[ pos++];
			if( colChar == COLS0)
			{
				colChar = (FLMUINT) 0xFFFF;
			}
			else
			{
				colChar = (FLMUINT) colToWPChr[ colChar - COLLS];
			}

			UW2FBA( (FLMUINT16) colChar, wsPtr);	/* Put the WP char in the word string */
			wsPtr += 2;
		}
	}

	/* NULL Terminate the string */

	UW2FBA( (FLMUINT16) 0, wsPtr);

	/* Two bytes were written per collation byte consumed, so this equals
		pos * 2 on well-formed input; counting what was actually written
		stays correct even when a digraph is cut short at the end. */

	wdStrLen = (FLMUINT)(wsPtr - wordStr);

	/**--------------------------------------------------------------------
	***  Parse through the sub-collation and case information.
	***  Watch out for COMP POST indexes - don't have case info following.
	***  Here are values for some of the codes:
	***   [ 0x01] - end for fields - case info follows - for COMP POST ixs
	***   [ 0x02] - compound marker
	***   [ 0x03] - not really used at this time
	***   [ 0x04] - case information is all uppercase (IS,DK,GR)
	***   [ 0x05] - case bits follow
	***   [ 0x06] - case information is all uppercase
	***   [ 0x07] - beginning of sub-collation information
	***   [ 0x08] - first substring field that is made
	***   [ 0x09] - truncation marker for text and binary
	***
	***  Below are some cases to consider...
	***
	***  [ COLLATION][ 0x07 sub-collation][ 0x05 case info][ 0x02]
	***  [ COLLATION][ 0x07 sub-collation][ 0x05 case info]
	***  [ COLLATION][ 0x07 sub-collation][ 0x02]
	***  [ COLLATION][ 0x07 sub-collation][ 0x01]
	***  [ COLLATION][ 0x05 case info][ 0x02]
	***  [ COLLATION][ 0x05 case info]
	***  [ COLLATION][ 0x02]
	***  [ COLLATION][ 0x01]
	***
	***  In the future still want[ 0x06] to be compressed out for uppercase
	***  only indexes.
	***-------------------------------------------------------------------*/

	// Check first substring before truncated

	if( length && fColStr[ pos] == COLL_FIRST_SUBSTRING)
	{
		if( pbFirstSubstring)
		{
			*pbFirstSubstring = TRUE;		// Don't need to initialize to FALSE.
		}
		length--;
		pos++;
	}

	if( length && fColStr[ pos] == COLL_TRUNCATED)
	{
		if( pbDataTruncated)
		{
			*pbDataTruncated = TRUE;		// Don't need to initialize to FALSE.
		}
		length--;
		pos++;
	}

	/**------------------------------
	***  Does sub-collation follow?
	***-----------------------------*/

	/* Still more to process - first work on the sub-collation (diacritics) */
	/* Hebrew/Arabic may have empty collation area */

	if( length && (fColStr[ pos] == (COLL_MARKER | SC_SUB_COL)))
	{
		FLMUINT	tempLen;
		FLMUINT	consumed;

		/* Do another pass on the word string adding the diacritics.
			pos is pre-incremented to step over the 0x07 marker. */

		bitPos = FWWSCmbSubColBuf( wordStr, &wdStrLen,
								&fColStr[ ++pos],
								hebrewArabicFlag);

		/* Move pos to next byte value */

		tempLen = BYTES_IN_BITS( bitPos);
		pos += tempLen;

		/* The 1 includes the 0x07 byte.  Clamp instead of letting the
			unsigned subtraction wrap if the bit stream ran past the
			stated length. */

		consumed = tempLen + 1;
		length = (length > consumed) ? (length - consumed) : 0;
	}

	/**-------------------------------
	***  Does the case info follow?
	***------------------------------*/

	if( length && (fColStr[ pos] > COMPOUND_MARKER))
	{
		/**----------------------------------------------------
		***  Take care of the lower and upper case conversion
		***  If mixed case then convert using case bits
		***---------------------------------------------------*/

		if( fColStr[ pos++] & SC_MIXED)		/* Increment pos here! */
		{
			/* Don't pre-increment pos on line below! */

			pos += FWWSToMixed( wordStr, wdStrLen, &fColStr[ pos], fWPLang);
		}

		/* else 0x04 or 0x06 - all characters already in uppercase */
	}

	*fcStrLenRV = pos;			/* pos should be on the 0x01 or 0x02 flag */
	return( wdStrLen);			/* Return the length of the word string */
}
/**************************************************************************
Desc: Walk the word string and, for every word, consume one variable
length code from the sub-collation bit stream, modifying the word
string in place (combine a diacritic, replace the character, or
insert an extra character).
Out: *wdStrLenRV is increased by 2 for every word that is inserted.
Ret: Bit position reached in subColBuf, i.e. the number of bits used.
Notes: The Hebrew/Arabic tail handling jumps (goto) back INTO the body of
the while loop - statement order in this routine matters.
wordStr is not bounds checked; inserted words shift the remaining
words (and the terminator) up by 2 bytes via shiftN().
Todo: May want to check fwpCh6Cmbcar() for CY return value
***************************************************************************/
FSTATIC FLMUINT FWWSCmbSubColBuf(
FLMBYTE * wordStr, /* Existing word string to modify */
FLMUINT * wdStrLenRV, /* Wordstring length in bytes */
FLMBYTE * subColBuf, /* Diacritic values in 5 bit sets */
FLMBOOL hebrewArabicFlag) /* Set if language is Hebrew or Arabic */
{
FLMUINT subColBitPos = 0; /* Current bit offset into subColBuf */
FLMUINT numWords = *wdStrLenRV >> 1; /* Words still to visit */
FLMUINT16 diac; /* 5 bit diacritic sub-collation value */
FLMUINT16 wpchar; /* Current word read from wordStr */
FLMUINT temp; /* Byte offset of an embedded 16 bit character */
/* For each word in the word string ... */
while( numWords--)
{
/* A leading 0 bit means "no sub-collation for this word". */
/* This macro DOESN'T increment bitPos */
if( TEST1BIT( subColBuf, subColBitPos))
{
/**--------------------------------------------
*** If "11110" - unmappable unicode char - 0xFFFF is before it
*** If "1110" then INDEX extended char is inserted
*** If "110" then extended char follows that replaces collation
*** If "10" then take next 5 bits which
*** contain the diacritic subcollation value.
***-------------------------------------------*/
/* Label used for hebrew/arabic - additional subcollation can follow */
/* the last word; the code at the bottom jumps here with numWords == 0. */
after_last_character:
subColBitPos++; /* Eat the first 1 bit */
if( ! TEST1BIT( subColBuf, subColBitPos))
{
/* "10": a 5 bit diacritic value follows */
subColBitPos++; /* Eat the 0 bit */
diac = (FLMUINT16)(GETnBITS( 5, subColBuf, subColBitPos));
subColBitPos += 5;
if( (wpchar = FB2UW( wordStr )) < 0x100) /* If not extended base..*/
{
/* Convert to WP diacritic and combine characters */
fwpCh6Cmbcar( &wpchar, wpchar, (FLMUINT16) ml1_COLtoD[diac] );
/* Even if cmbcar fails, wpchar is still set to a valid value */
UW2FBA( wpchar, wordStr);
}
else if( (wpchar & 0xFF00) == 0x0D00) /* arabic? */
{
/* High byte 0x0D: the 5 bit value selects the replacement character */
wpchar = ArabSubColToWPChr[ diac ];
UW2FBA( wpchar, wordStr);
}
/* else diacritic is extra info */
/* cmbcar should not handle extended chars for this design */
}
else /* "110" or "1110" or "11110" */
{
subColBitPos++; /* Eat the 2nd '1' bit */
if( TEST1BIT( subColBuf, subColBitPos)) /* Test the 3rd bit */
{
/* 1110 - shift wpchars down 1 word and insert value below */
subColBitPos++; /* Eat the 3rd '1' bit */
*wdStrLenRV += 2; /* Return 2 more bytes */
if( TEST1BIT( subColBuf, subColBitPos )) /* Test 4th bit */
{
/* Unconvertable UNICODE character */
/* The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode */
shiftN( wordStr, numWords + numWords + 4, 2 );
subColBitPos++; /* Eat the 4th '1' bit */
wordStr += 2; /* Skip the 0xFFFF for now */
/* numWords is NOT incremented: the value is stored in the slot */
/* just opened up and stepped over by this same iteration. */
}
else
{
/* Move down 2 byte NULL and rest of the 2 byte characters */
/* The extended character does not have a 0xFF col value */
shiftN( wordStr, numWords + numWords + 2, 2 );
numWords++; /* Increment because inserted */
/* NOTE(review): when reached via the goto (numWords was 0) this */
/* makes the loop run once more on the word after the insert - */
/* confirm that is intended. */
/* fall through reading the actual character value */
}
}
subColBitPos++; /* Skip past the zero bit */
/* The 16 bit character is stored byte aligned in the bit stream */
subColBitPos = (subColBitPos + 7) & (~7); /*roundup to next byte*/
temp = BYTES_IN_BITS( subColBitPos ); /* compute position */
wordStr[1] = subColBuf[ temp ]; /* Character set */
wordStr[0] = subColBuf[ temp + 1 ]; /* Character */
subColBitPos += 16; /* Step over the two bytes just read */
}
}
else
subColBitPos++; /* Eat the 0 bit - word is unchanged */
wordStr += 2; /* Next WP character */
}
if( hebrewArabicFlag )
{
if( TEST1BIT( subColBuf, subColBitPos))
{
/**--------------------------------------------------
*** Hebrew/Arabic can have trailing accents that
*** don't have a matching collation value.
*** Keep looping in this case.
*** Note that subColBitPos isn't incremented above.
***-------------------------------------------------*/
/* With numWords == 0 the loop test fails right after the */
/* jumped-to body has run, which brings control back here. */
numWords = 0; /* set so won't loop forever! */
goto after_last_character; /* process trailing bit */
}
subColBitPos++; /* Eat the last '0' bit */
}
return( subColBitPos);
}
/**************************************************************************
Desc:		Restore the original case of a WP word string.  The word string
			arrives in upper case; the case bit string carries one bit per
			word, most significant bit first.  A word whose (possibly
			inverted) bit is clear is converted to lower case.
Out:		wordStr is modified in place.
Ret:		Number of bytes used in the lower/upper buffer.
Notes:	For GR_LANG every case byte is inverted (XOR 0xFF) before it is
			tested.  Only upper -> lower conversion is done here.
***************************************************************************/
FSTATIC FLMUINT FWWSToMixed(
	FLMBYTE *	wordStr,			/* Existing word string to modify */
	FLMUINT		wdStrLen,		/* Length of the wordstring in bytes */
	FLMBYTE *	lowUpBitStr,	/* Lower/upper case bit string */
	FLMUINT		fWPLang)			/* FLAIM WP language number */
{
	const FLMUINT	totalWords = wdStrLen >> 1;	/* Words in the word string */
	FLMUINT			wordIdx;
	FLMBYTE			caseBits = 0;		/* Current byte of case bits */
	FLMBYTE			bitMask = 0;		/* 0 forces a fetch on the first word */
	FLMBYTE			invertMask;			/* XORed into every case byte */

	/* Most common language is tested first */

	if( fWPLang == US_LANG)
	{
		invertMask = (FLMBYTE) 0;
	}
	else if( fWPLang == GR_LANG)
	{
		invertMask = (FLMBYTE) 0xFF;	/* Greek has uppercase first */
	}
	else
	{
		invertMask = (FLMBYTE) 0;
	}

	for( wordIdx = 0;
		  wordIdx < totalWords;
		  wordIdx++, wordStr += 2, bitMask >>= 1)
	{
		FLMUINT	wpChar;

		/* All eight bits used (or first pass) - fetch the next case byte */

		if( !bitMask)
		{
			caseBits = (FLMBYTE)(invertMask ^ *lowUpBitStr++);
			bitMask = 0x80;
		}

		/* Bit set: the word stays upper case */

		if( caseBits & bitMask)
		{
			continue;
		}

		/* Bit clear: convert to lower case */

		wpChar = (FLMUINT) FB2UW( wordStr);

		if( (wpChar >= ASCII_UPPER_A) && (wpChar <= ASCII_UPPER_Z))
		{
			wpChar |= 0x20;
		}
		else
		{
			FLMBYTE	charVal = (FLMBYTE)(wpChar & 0xFF);
			FLMBYTE	charSet = (FLMBYTE)(wpChar >> 8);

			/* Inside these character set ranges the low bit is set to
				produce the lower case form (set - don't increment). */

			if( charSet == CHSMUL1)				/* Multinational 1 */
			{
				if( (charVal >= 26) && (charVal <= 241))
				{
					wpChar |= 0x01;
				}
			}
			else if( charSet == CHSGREK)		/* Greek */
			{
				if( charVal <= 69)
				{
					wpChar |= 0x01;
				}
			}
			else if( charSet == CHSCYR)		/* Cyrillic */
			{
				if( charVal <= 199)
				{
					wpChar |= 0x01;
				}
			}
		}

		/* Written back even when no conversion applied */

		UW2FBA( (FLMUINT16) wpChar, wordStr);
	}

	/* One case bit is consumed per word */

	return( BYTES_IN_BITS( totalWords));
}