//------------------------------------------------------------------------- // Desc: Uncollation routines for converting from collated string to WP string. // Tabs: 3 // // Copyright (c) 1992-2001,2003-2006 Novell, Inc. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of version 2 of the GNU General Public // License as published by the Free Software Foundation. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, contact Novell, Inc. // // To contact Novell about this file by physical or electronic mail, // you may find current contact information at www.novell.com // // $Id: f_uncoll.cpp 12245 2006-01-19 14:29:51 -0700 (Thu, 19 Jan 2006) dsanders $ //------------------------------------------------------------------------- #include "flaimsys.h" /**----------------------------------------- *** External tables *** Could be far in another data segment ***----------------------------------------*/ /* From COLLATE1.C */ extern FLMUINT16 colToWPChr[]; /* Converts collated value to WP character */ extern FLMBYTE ml1_COLtoD[]; /* Diacritic conversions */ extern FLMUINT16 HebArabColToWPChr[]; extern FLMUINT16 ArabSubColToWPChr[]; /**----------------------------------------- *** Local Static Routine Prototypes ***----------------------------------------*/ /**---------------------------------------------------------------- *** The version using the table uses 34 ticks and 29 bytes ONLY *** because the turbo optimizer uses register variables better. *** The other version below uses 39 ticks and 33 bytes. *** Macro not moved to other calls in f_tocoll or kycompnd because *** these are rarely called areas right now. ***---------------------------------------------------------------*/ FSTATIC FLMUINT FWWSGetColStr( /* Returns byte length of word string*/ FLMBYTE * fColStr, FLMUINT * fcStrLenRV, FLMBYTE * wordStr, FLMUINT fWPLang, FLMBOOL * pbDataTruncated, FLMBOOL * pbFirstSubstring); FSTATIC FLMUINT FWWSCmbSubColBuf( FLMBYTE * wordStr, FLMUINT * wdStrLenRV, FLMBYTE * subColBuf, FLMBOOL hebrewArabicFlag ); FSTATIC FLMUINT FWWSToMixed( FLMBYTE * wordStr, FLMUINT wdStrLen, FLMBYTE * lowUpBitStr, FLMUINT fWPLang ); /************************************************************************** Desc: Get the Flaim collating string and convert back to a text string Ret: Length of new wpStr Notes: Allocates the area for the word string buffer if will be over 256. ***************************************************************************/ FLMUINT FColStrToText( FLMBYTE * fColStr, /* Points to the Flaim collated string */ FLMUINT * fcStrLenRV, /* Length of the Flaim collated string */ FLMBYTE * textStr, /* Output string to build - TEXT string */ FLMUINT fWPLang, /* FLAIM WP language number */ FLMBYTE * postBuf, /* Lower/upper POST buffer or NULL */ FLMUINT * postBytesRV, /* Return next position to use in postBuf */ FLMBOOL * pbDataTruncated, /* Sets to TRUE if data had been truncated */ FLMBOOL * pbFirstSubstring) /* Sets to TRUE if first substring */ { #define LOCAL_CHARS 150 FLMBYTE wordStr[ LOCAL_CHARS * 2 + LOCAL_CHARS / 5 ]; // Sample + 20% FLMBYTE * wsPtr = NULL; FLMBYTE * wsAllocatedWsPtr = NULL; FLMUINT wsLen; FLMUINT textLen; FLMBYTE * textPtr; if( *fcStrLenRV > LOCAL_CHARS ) /* If won't fit allocate 1280 */ { if( RC_BAD( f_alloc( MAX_KEY_SIZ * 2, &wsPtr))) { return( 0 ); } wsAllocatedWsPtr = wsPtr; } else wsPtr = wordStr; if( (fWPLang >= FIRST_DBCS_LANG) && (fWPLang <= LAST_DBCS_LANG)) { wsLen = AsiaConvertColStr( fColStr, fcStrLenRV, wsPtr, pbDataTruncated, pbFirstSubstring ); if( postBuf ) { FLMUINT postBytes = *postBytesRV + 2; /* Skip past marker */ /* may change wsLen */ postBytes += AsiaParseCase( wsPtr, &wsLen, &postBuf[ postBytes]); *postBytesRV = postBytes; } } else { wsLen = FWWSGetColStr( fColStr, fcStrLenRV, wsPtr, fWPLang, pbDataTruncated, pbFirstSubstring ); /* If a post buffer is sent - turn unflagged chars to lower case */ if( postBuf ) { FLMUINT postBytes = *postBytesRV; /* Check if mixed case chars follow and always increment postBytes */ if( postBuf[ postBytes++ ] == (COLL_MARKER | SC_MIXED)) { postBytes += FWWSToMixed( wsPtr, wsLen, &postBuf[ postBytes ], fWPLang); } *postBytesRV = postBytes; } } /**------------------------------------------- *** Copy word string to TEXT string area ***------------------------------------------*/ wsLen >>= 1; /* Convert # of bytes to # of words */ textPtr = textStr; while( wsLen--) { register FLMBYTE ch, cSet; /* Put the character in a local variable for speed */ ch = *wsPtr++; cSet = *wsPtr++; if( (!cSet) && (ch <= 127)) { /**----------------------------------------------------------- *** Character set zero only needs one byte if the character *** is <= 127. Otherwise, it is handled like all other *** extended characters below. ***----------------------------------------------------------*/ *textPtr++ = ch; } /**----------------------------------------------------- *** If the character set is > 63 it takes three bytes *** to store, otherwise only two bytes are needed. ***----------------------------------------------------*/ else if( cSet < 63) { *textPtr++ = (FLMBYTE)(CHAR_SET_CODE | cSet); *textPtr++ = ch; } else if( cSet == 0xFF && ch == 0xFF) { *textPtr++ = UNICODE_CODE; *textPtr++ = *(wsPtr+1); /* Character set */ *textPtr++ = *wsPtr; /* Character */ wsPtr += 2; wsLen--; /* Skip past 4 bytes for UNICODE */ } else { *textPtr++ = EXT_CHAR_CODE; *textPtr++ = cSet; *textPtr++ = ch; } } textLen = (textPtr - textStr); /* Compute total length */ if( wsAllocatedWsPtr != NULL) f_free( &wsAllocatedWsPtr); return( textLen); } /***************************************************************************** Desc: Get the Flaim collating string and convert back to a WP word string Ret: Length of new WP word string *****************************************************************************/ FSTATIC FLMUINT FWWSGetColStr( FLMBYTE * fColStr, /* Points to the Flaim collated string */ FLMUINT * fcStrLenRV, /* Length of the Flaim collated string */ FLMBYTE * wordStr, /* Output string to build - WP word string */ FLMUINT fWPLang, /* FLAIM WP language number */ FLMBOOL * pbDataTruncated, /* Set to TRUE if truncated */ FLMBOOL * pbFirstSubstring) /* Sets to TRUE if first substring */ { FLMBYTE * wsPtr = wordStr; /* Points to the word string data area */ FLMUINT length = *fcStrLenRV;/* May optimize as a register */ FLMUINT pos = 0; /* Position in fColStr[] */ FLMUINT bitPos; /* Computed bit position */ FLMUINT colChar; /* Not portable if a FLMBYTE value */ FLMUINT wdStrLen; FLMBOOL hebrewArabicFlag = 0;/* Set if hebrew/arabic language */ /** *** WARNING: *** The code is duplicated for performance reasons. *** The US code below is much more optimized so *** any changes must be done twice. **/ if( fWPLang != US_LANG) /* Code for NON-US languages */ { if( (fWPLang == AR_LANG ) || /* Arabic */ (fWPLang == FA_LANG ) || /* Farsi - persian */ (fWPLang == HE_LANG ) || /* Hebrew */ (fWPLang == UR_LANG )) /* Urdu */ hebrewArabicFlag++; /* Add sindhi, pashto, kurdish, malay*/ // MVSVISIT: will not work correctly on IBM390 - need to change toolkit tables. while( length && (fColStr[pos] > MAX_COL_OPCODE)) { length--; colChar = (FLMUINT) fColStr[ pos++ ]; switch( colChar) { case COLS9+4: /* ch in spanish */ case COLS9+11: /* ch in czech */ /* Put the WP char in the word string */ UW2FBA( (FLMUINT16) 'C', wsPtr ); wsPtr += 2; colChar = (FLMUINT) 'H'; pos++; /* move past second duplicate char */ break; case COLS9+17: /* ll in spanish */ /* Put the WP char in the word string */ UW2FBA( (FLMUINT16)'L', wsPtr ); wsPtr += 2; colChar = (FLMUINT)'L'; pos++; /* move past duplicate character */ break; case COLS0: /* Non collating character or OEM character */ /* Actual character is in sub-collation area*/ colChar = (FLMUINT) 0xFFFF; break; default: /* Watch out COLS10h has () around it for subtraction */ if( hebrewArabicFlag && (colChar >= COLS10h)) { colChar = (colChar < COLS10a) /* Hebrew only? */ ? (FLMUINT) (0x900 + (colChar - (COLS10h))) /* Hebrew */ : (FLMUINT) (HebArabColToWPChr[ colChar - (COLS10a)]); /* Arabic */ } else { colChar = (FLMUINT) colToWPChr[ colChar - COLLS ]; } break; } UW2FBA( (FLMUINT16) colChar, wsPtr ); /* Put the WP char in the word string*/ wsPtr += 2; } /* end while */ } /* end if */ else /* US Sorting - optimized */ { while( length && (fColStr[pos] > MAX_COL_OPCODE)) { length--; /* Move in the WP value given uppercase collated value */ colChar = (FLMUINT) fColStr[ pos++ ]; if( colChar == COLS0) { colChar = (FLMUINT) 0xFFFF; } else { colChar = (FLMUINT) colToWPChr[ colChar - COLLS ]; } UW2FBA( (FLMUINT16) colChar, wsPtr ); /* Put the WP char in the word string */ wsPtr += 2; } } /* NULL Terminate the string */ UW2FBA( (FLMUINT16)0, wsPtr); wdStrLen = pos + pos; /* Multiply fcStrLen by 2 */ /**-------------------------------------------------------------------- *** Parse through the sub-collation and case information. *** Watch out for COMP POST indexes - don't have case info following. *** Here are values for some of the codes: *** [ 0x01] - end for fields - case info follows - for COMP POST ixs *** [ 0x02] - compound marker *** [ 0x03] - not really used at this time *** [ 0x04] - case information is all uppercase (IS,DK,GR) *** [ 0x05] - case bits follow *** [ 0x06] - case information is all uppercase *** [ 0x07] - beginning of sub-collation information *** [ 0x08] - first substring field that is made *** [ 0x09] - truncation marker for text and binary *** *** Below are some cases to consider... *** *** [ COLLATION][ 0x07 sub-collation][ 0x05 case info][ 0x02] *** [ COLLATION][ 0x07 sub-collation][ 0x05 case info] *** [ COLLATION][ 0x07 sub-collation][ 0x02] *** [ COLLATION][ 0x07 sub-collation][ 0x01] *** [ COLLATION][ 0x05 case info][ 0x02] *** [ COLLATION][ 0x05 case info] *** [ COLLATION][ 0x02] *** [ COLLATION][ 0x01] *** *** In the future still want[ 0x06] to be compressed out for uppercase *** only indexes. ***-------------------------------------------------------------------*/ // Check first substring before truncated if( length && fColStr[pos] == COLL_FIRST_SUBSTRING) { if( pbFirstSubstring) *pbFirstSubstring = TRUE; // Don't need to initialize to FALSE. length--; pos++; } if( length && fColStr[pos] == COLL_TRUNCATED) { if( pbDataTruncated) *pbDataTruncated = TRUE; // Don't need to initialize to FALSE. length--; pos++; } /**------------------------------ *** Does sub-collation follow? ***-----------------------------*/ /* Still more to process - first work on the sub-collation (diacritics) */ /* Hebrew/Arabic may have empty collation area */ if( length && (fColStr[pos] == (COLL_MARKER | SC_SUB_COL))) { FLMUINT tempLen; /* Do another pass on the word string adding the diacritics */ bitPos = FWWSCmbSubColBuf( wordStr, &wdStrLen, &fColStr[++pos], hebrewArabicFlag ); /* Move pos to next byte value */ tempLen = BYTES_IN_BITS( bitPos ); pos += tempLen; length -= tempLen + 1; /* The 1 includes the 0x07 byte */ } /**------------------------------- *** Does the case info follow? ***------------------------------*/ if( length && (fColStr[pos] > COMPOUND_MARKER)) { /**---------------------------------------------------- *** Take care of the lower and upper case conversion *** If mixed case then convert using case bits ***---------------------------------------------------*/ if( fColStr[pos++] & SC_MIXED) /* Increment pos here! */ { /* Don't pre-increment pos on line below! */ pos += FWWSToMixed( wordStr, wdStrLen, &fColStr[pos], fWPLang ); } /* else 0x04 or 0x06 - all characters already in uppercase */ } *fcStrLenRV = pos; /* pos should be on the 0x01 or 0x02 flag */ return( wdStrLen); /* Return the length of the word string */ } /************************************************************************** Desc: Combine the diacritic 5 bit values to an existing word string Todo: May want to check fwpCh6Cmbcar() for CY return value ***************************************************************************/ FSTATIC FLMUINT FWWSCmbSubColBuf( FLMBYTE * wordStr, /* Existing word string to modify */ FLMUINT * wdStrLenRV, /* Wordstring length in bytes */ FLMBYTE * subColBuf, /* Diacritic values in 5 bit sets */ FLMBOOL hebrewArabicFlag) /* Set if language is Hebrew or Arabic */ { FLMUINT subColBitPos = 0; FLMUINT numWords = *wdStrLenRV >> 1; FLMUINT16 diac; FLMUINT16 wpchar; FLMUINT temp; /* For each word in the word string ... */ while( numWords--) { /* label used for hebrew/arabic - additional subcollation can follow */ /* This macro DOESN'T increment bitPos */ if( TEST1BIT( subColBuf, subColBitPos)) { /**-------------------------------------------- *** If "11110" - unmappable unicode char - 0xFFFF is before it *** If "1110" then INDEX extended char is inserted *** If "110" then extended char follows that replaces collation *** If "10" then take next 5 bits which *** contain the diacritic subcollation value. ***-------------------------------------------*/ after_last_character: subColBitPos++; /* Eat the first 1 bit */ if( ! TEST1BIT( subColBuf, subColBitPos)) { subColBitPos++; /* Eat the 0 bit */ diac = (FLMUINT16)(GETnBITS( 5, subColBuf, subColBitPos)); subColBitPos += 5; if( (wpchar = FB2UW( wordStr )) < 0x100) /* If not extended base..*/ { /* Convert to WP diacritic and combine characters */ fwpCh6Cmbcar( &wpchar, wpchar, (FLMUINT16) ml1_COLtoD[diac] ); /* Even if cmbcar fails, wpchar is still set to a valid value */ UW2FBA( wpchar, wordStr); } else if( (wpchar & 0xFF00) == 0x0D00) /* arabic? */ { wpchar = ArabSubColToWPChr[ diac ]; UW2FBA( wpchar, wordStr); } /* else diacritic is extra info */ /* cmbcar should not handle extended chars for this design */ } else /* "110" or "1110" or "11110" */ { subColBitPos++; /* Eat the 2nd '1' bit */ if( TEST1BIT( subColBuf, subColBitPos)) /* Test the 3rd bit */ { /* 1110 - shift wpchars down 1 word and insert value below */ subColBitPos++; /* Eat the 3rd '1' bit */ *wdStrLenRV += 2; /* Return 2 more bytes */ if( TEST1BIT( subColBuf, subColBitPos )) /* Test 4th bit */ { /* Unconvertable UNICODE character */ /* The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode */ shiftN( wordStr, numWords + numWords + 4, 2 ); subColBitPos++; /* Eat the 4th '1' bit */ wordStr += 2; /* Skip the 0xFFFF for now */ } else { /* Move down 2 byte NULL and rest of the 2 byte characters */ /* The extended character does not have a 0xFF col value */ shiftN( wordStr, numWords + numWords + 2, 2 ); numWords++; /* Increment because inserted */ /* fall through reading the actual charater value */ } } subColBitPos++; /* Skip past the zero bit */ subColBitPos = (subColBitPos + 7) & (~7); /*roundup to next byte*/ temp = BYTES_IN_BITS( subColBitPos ); /* compute position */ wordStr[1] = subColBuf[ temp ]; /* Character set */ wordStr[0] = subColBuf[ temp + 1 ]; /* Character */ subColBitPos += 16; } } else subColBitPos++; wordStr += 2; /* Next WP character */ } if( hebrewArabicFlag ) { if( TEST1BIT( subColBuf, subColBitPos)) { /**-------------------------------------------------- *** Hebrew/Arabic can have trailing accents that *** don't have a matching collation value. *** Keep looping in this case. *** Note that subColBitPos isn't incremented above. ***-------------------------------------------------*/ numWords = 0; /* set so won't loop forever! */ goto after_last_character; /* process trailing bit */ } subColBitPos++; /* Eat the last '0' bit */ } return( subColBitPos); } /************************************************************************** Desc: Convert the word string to lower case chars given low/upp bit string Out: WP characters have modified to their original case Ret: Number of bytes used in the lower/upper buffer Notes: Only WP to lower case conversion is done here for each bit NOT set. ***************************************************************************/ FSTATIC FLMUINT FWWSToMixed( FLMBYTE * wordStr, /* Existing word string to modify */ FLMUINT wdStrLen, /* Length of the wordstring in bytes */ FLMBYTE * lowUpBitStr, /* Lower/upper case bit string */ FLMUINT fWPLang) /*Visit: Scott */ { FLMUINT numWords; FLMUINT tempWord; FLMBYTE tempByte = 0; FLMBYTE maskByte; FLMBYTE xorByte; /* Used to reverse GR, bits */ xorByte = (fWPLang == US_LANG ) /* Do most common compare first */ ? (FLMBYTE)0 : (fWPLang == GR_LANG) /* Greek has uppercase first */ ? (FLMBYTE)0xFF : (FLMBYTE)0 ; /* For each word in the word string ... */ for( numWords = wdStrLen >> 1, /* Total number of words in word string */ maskByte = 0; /* Force first time to get a byte */ numWords--; /* Test */ wordStr += 2, /* Next WP character - word */ maskByte >>= 1 ) /* Next bit to mask and check */ { if( maskByte == 0) /* Time to get another byte */ { tempByte = xorByte ^ *lowUpBitStr++; maskByte = 0x80; } if( ( tempByte & maskByte) == 0) /* If lowercase conver - else is upper*/ { /* Convert to lower case - COLL -> WP is already in upper case */ tempWord = (FLMUINT) FB2UW( wordStr ); if( (tempWord >= ASCII_UPPER_A) && (tempWord <= ASCII_UPPER_Z)) /* yes */ tempWord |= 0x20; else { FLMBYTE charVal = (FLMBYTE)(tempWord & 0xFF); FLMBYTE charSet = (FLMBYTE) (tempWord >> 8); /* check if charact within region of character set */ if ( (( charSet == CHSMUL1) && /* Multinational 1 */ ((charVal >= 26) && (charVal <= 241))) ||(( charSet == CHSGREK) && /* Greek */ ( charVal <= 69)) ||(( charSet == CHSCYR) && /* Cyrillic */ ( charVal <= 199)) ) { tempWord |= 0x01; /* Set - don't increment */ } } UW2FBA( (FLMUINT16) tempWord, wordStr ); } } numWords = wdStrLen >> 1; return( BYTES_IN_BITS( numWords )); }