//------------------------------------------------------------------------- // Desc: WP character routines. // Tabs: 3 // // Copyright (c) 1991-1992,1994-2003,2005-2006 Novell, Inc. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of version 2 of the GNU General Public // License as published by the Free Software Foundation. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, contact Novell, Inc. // // To contact Novell about this file by physical or electronic mail, // you may find current contact information at www.novell.com // // $Id: fwpchar.cpp 12301 2006-01-19 15:02:55 -0700 (Thu, 19 Jan 2006) dsanders $ //------------------------------------------------------------------------- #include "flaimsys.h" static char fwp_langtbl[LAST_LANG+LAST_LANG] = { 'U', 'S', /* English, United States */ 'A', 'F', /* Afrikaans */ 'A', 'R', /* Arabic */ 'C', 'A', /* Catalan */ 'H', 'R', /* Croatian */ 'C', 'Z', /* Czech */ 'D', 'K', /* Danish */ 'N', 'L', /* Dutch */ 'O', 'Z', /* English, Australia */ 'C', 'E', /* English, Canada */ 'U', 'K', /* English, United Kingdom */ 'F', 'A', /* Farsi */ 'S', 'U', /* Finnish */ 'C', 'F', /* French, Canada */ 'F', 'R', /* French, France */ 'G', 'A', /* Galician */ 'D', 'E', /* German, Germany */ 'S', 'D', /* German, Switzerland */ 'G', 'R', /* Greek */ 'H', 'E', /* Hebrew */ 'M', 'A', /* Hungarian */ 'I', 'S', /* Icelandic */ 'I', 'T', /* Italian */ 'N', 'O', /* Norwegian */ 'P', 'L', /* Polish */ 'B', 'R', /* Portuguese, Brazil */ 'P', 'O', /* Portuguese, Portugal */ 'R', 'U', /* Russian */ 'S', 'L', /* Slovak */ 'E', 'S', /* Spanish */ 'S', 'V', /* Swedish */ 'Y', 'K', /* Ukrainian */ 'U', 'R', /* Urdu */ 'T', 'K', /* Turkey */ 'J', 'P', /* Japanese */ 'K', 'R', /* Korean */ 'C', 'T', /* Chinese-Traditional */ 'C', 'S', /* Chinese-Simplified */ 'L', 'A' /* Future asian language */ /* Removed in conjunction with change in wps6.h */ /* 'T', 'A' Taiwanese - really CS! */ }; /* fwp_caseConvertableRange[] defines the range of characters within the set which are case convertible. */ static FLMBYTE fwp_caseConvertableRange[] = { 26,241, /* Multinational 1 */ 0,0, /* Multinational 2 */ 0,0, /* Box Drawing */ 0,0, /* Symbol 1 */ 0,0, /* Symbol 2 */ 0,0, /* Math 1 */ 0,0, /* Math 2 */ 0,69, /* Greek 1 */ 0,0, /* Hebrew */ 0,199, /* Cyrillic */ 0,0, /* Japanese Kana */ 0,0, /* User-defined */ 0,0, /* Not defined */ 0,0, /* Not defined */ 0,0, /* Not defined */ }; /**************************************************************************** Desc: getNextCharState can be thought of as a 2 dimentional array with i and j as the row and column indicators respectively. If a value exists at the intersection of i and j, it is returned. Sparse array techniques are used to minimize memory usage. ****************************************************************************/ FINLINE FLMUINT16 getNextCharState( FLMUINT i, FLMUINT j) { FLMUINT k, x; for( k = fwp_indexi[ x = (i > START_COL) ? (START_ALL) : i ]; /* adjust so don't use full tables */ k <= (FLMUINT) (fwp_indexi[ x + 1] - 1); k++ ) { // FIXUP_AREA_SIZE should be 24. if( j == fwp_indexj[ k]) { return( fwp_valuea[ (i > START_COL) ? (k + (FIXUP_AREA_SIZE * (i - START_ALL))) : k]); } } return(0); } /**************************************************************************** Desc: Determine the language number from the 2 byte language code ****************************************************************************/ FLMEXP FLMUINT FLMAPI FlmLanguage( char * pszLanguageCode) { char cFirstChar = *pszLanguageCode; char cSecondChar = *(pszLanguageCode+1); FLMUINT uiTablePos; for (uiTablePos = 0; uiTablePos < (LAST_LANG+LAST_LANG); uiTablePos += 2 ) { if (fwp_langtbl [uiTablePos] == cFirstChar && fwp_langtbl [uiTablePos + 1] == cSecondChar) { // Return uiTablePos div 2 return( uiTablePos >> 1); } } // Language not found, return default US language return( US_LANG); } /**************************************************************************** Desc: Determine the language code from the language number ****************************************************************************/ FLMEXP void FLMAPI FlmGetLanguage( FLMUINT uiLangNum, char * pszLanguageCode) { // iLangNum could be negative if (uiLangNum >= LAST_LANG) { uiLangNum = US_LANG; } uiLangNum += uiLangNum; *pszLanguageCode++ = fwp_langtbl [uiLangNum]; *pszLanguageCode++ = fwp_langtbl [uiLangNum + 1]; *pszLanguageCode = 0; } /**************************************************************************** Desc: Converts a character to upper case (if possible) ****************************************************************************/ FLMUINT16 fwpCh6Upper( FLMUINT16 ui16WpChar) { if (ui16WpChar < 256) { if (ui16WpChar >= ASCII_LOWER_A && ui16WpChar <= ASCII_LOWER_Z) { // Return ASCII upper case return( ui16WpChar & 0xdf); } } else { FLMBYTE ucCharSet = ui16WpChar >> 8; if (ucCharSet == CHSMUL1) { FLMBYTE ucChar = ui16WpChar & 0xFF; if (ucChar >= fwp_caseConvertableRange[ (CHSMUL1-1) * 2] && ucChar <= fwp_caseConvertableRange[ ((CHSMUL1-1) * 2) + 1]) { return( ui16WpChar & 0xFFFE); } } else if (ucCharSet == CHSGREK) { if ((ui16WpChar & 0xFF) <= fwp_caseConvertableRange[ ((CHSGREK-1) * 2) + 1]) { return( ui16WpChar & 0xFFFE); } } else if (ucCharSet == CHSCYR) { if ((ui16WpChar & 0xFF) <= fwp_caseConvertableRange[ ((CHSCYR-1) * 2) + 1]) { return( ui16WpChar & 0xFFFE); } } else if (ui16WpChar >= Lower_JP_a) { // Possible double byte character set alphabetic character? if (ui16WpChar <= Lower_JP_z) { // Japanese? ui16WpChar = (ui16WpChar - Lower_JP_a) + Upper_JP_A; } else if (ui16WpChar >= Lower_KR_a && ui16WpChar <= Lower_KR_z) { // Korean? ui16WpChar = (ui16WpChar - Lower_KR_a) + Upper_KR_A; } else if (ui16WpChar >= Lower_CS_a && ui16WpChar <= Lower_CS_z) { // Chinese Simplified? ui16WpChar = (ui16WpChar - Lower_CS_a) + Upper_CS_A; } else if (ui16WpChar >= Lower_CT_a && ui16WpChar <= Lower_CT_z) { // Chinese Traditional? ui16WpChar = (ui16WpChar - Lower_CT_a) + Upper_CT_A; } } } // Return original character - original not in lower case. return( ui16WpChar); } /**************************************************************************** Desc: Checks to see if WP character is upper case ****************************************************************************/ FLMBOOL fwpIsUpper( FLMUINT16 ui16WpChar ) { FLMBYTE ucChar; FLMBYTE ucCharSet; // Get character ucChar = (FLMBYTE)(ui16WpChar & 0xFF); // Test if ASCII character set if (!(ui16WpChar & 0xFF00)) { return( (ucChar >= ASCII_LOWER_A && ucChar <= ASCII_LOWER_Z) ? FALSE : TRUE); } // Get the character set ucCharSet = (FLMBYTE) (ui16WpChar >> 8); // CHSMUL1 == Multinational 1 character set // CHSGREK == Greek character set // CHSCYR == Cyrillic character set if ((ucCharSet == CHSMUL1 && ucChar >= 26 && ucChar <= 241) || (ucCharSet == CHSGREK && ucChar <= 69) || (ucCharSet == CHSCYR && ucChar <= 199)) { return( (ucChar & 1) ? FALSE : TRUE); } // Don't care that double ss is lower return( TRUE); } /**************************************************************************** Desc: Converts a character to lower case (if possible) ****************************************************************************/ FLMUINT16 fwpCh6Lower( FLMUINT16 ui16WpChar) { if (ui16WpChar < 256) { if (ui16WpChar >= ASCII_UPPER_A && ui16WpChar <= ASCII_UPPER_Z) { return( ui16WpChar | 0x20); } } else { FLMBYTE ucCharSet = ui16WpChar >> 8; if (ucCharSet == CHSMUL1) { FLMBYTE ucChar = ui16WpChar & 0xFF; if (ucChar >= fwp_caseConvertableRange[ (CHSMUL1-1) * 2] && ucChar <= fwp_caseConvertableRange[ ((CHSMUL1-1) * 2) + 1] ) { return( ui16WpChar | 1); } } else if (ucCharSet == CHSGREK) { if ((ui16WpChar & 0xFF) <= fwp_caseConvertableRange[ ((CHSGREK-1) * 2) + 1]) { return( ui16WpChar | 1); } } else if (ucCharSet == CHSCYR) { if ((ui16WpChar & 0xFF) <= fwp_caseConvertableRange[ ((CHSCYR-1) * 2) + 1]) { return( ui16WpChar | 1); } } else if (ui16WpChar >= Upper_JP_A) { // Possible double byte character set alphabetic character? if (ui16WpChar <= Upper_JP_Z) { // Japanese? ui16WpChar = ui16WpChar - Upper_JP_A + Lower_JP_a; } else if (ui16WpChar >= Upper_KR_A && ui16WpChar <= Upper_KR_Z) { // Korean? ui16WpChar = ui16WpChar - Upper_KR_A + Lower_KR_a; } else if (ui16WpChar >= Upper_CS_A && ui16WpChar <= Upper_CS_Z) { // Chinese Simplified? ui16WpChar = ui16WpChar - Upper_CS_A + Lower_CS_a; } else if (ui16WpChar >= Upper_CT_A && ui16WpChar <= Upper_CT_Z) { // Chinese Traditional? ui16WpChar = ui16WpChar - Upper_CT_A + Lower_CT_a; } } } // Return original character, original not in upper case return(ui16WpChar); } /**************************************************************************** Desc: Break a WP character into a base and a diacritical char. Ret: TRUE - if not found FALSE - if found ****************************************************************************/ FLMBOOL fwpCh6Brkcar( FLMUINT16 ui16WpChar, FLMUINT16 * pui16BaseChar, FLMUINT16 * pui16DiacriticChar) { BASE_DIACRITP pBaseDiacritic; FLMINT iTableIndex; if ((pBaseDiacritic = fwp_car60_c[ HI(ui16WpChar)]) == 0) { return( TRUE); } iTableIndex = ((FLMBYTE)ui16WpChar) - pBaseDiacritic->start_char; if (iTableIndex < 0 || iTableIndex > pBaseDiacritic->char_count || pBaseDiacritic->table [iTableIndex].base == (FLMBYTE)0xFF) { return( TRUE); } if ((HI( ui16WpChar) != CHSMUL1) || ((fwp_ml1_cb60[ ((FLMBYTE) ui16WpChar) >> 3] >> (7 - (ui16WpChar & 0x07))) & 0x01)) { // normal case, same base as same as characters *pui16BaseChar = (ui16WpChar & 0xFF00) | pBaseDiacritic->table [iTableIndex].base; *pui16DiacriticChar = (ui16WpChar & 0xFF00) | pBaseDiacritic->table[iTableIndex].diacrit; } else { // Multi-national where base is ascii value. *pui16BaseChar = pBaseDiacritic->table [iTableIndex].base; *pui16DiacriticChar = (ui16WpChar & 0xFF00) | pBaseDiacritic->table[iTableIndex].diacrit; } return( FALSE); } /**************************************************************************** Desc: Take a base and a diacritic and compose a WP character. Note on base character: i's and j's must be dotless i's and j's (for those which use them) or they will not be found. Ret: TRUE - if not found FALSE - if found Notes: ascii characters with diacriticals are in multi-national if anywhere; all other base chars with diacritics are found in their own sets. ****************************************************************************/ FLMBOOL fwpCh6Cmbcar( FLMUINT16 * pui16WpChar, FLMUINT16 ui16BaseChar, FLMINT16 ui16DiacriticChar) { FLMUINT uiRemaining; FLMBYTE ucCharSet; FLMBYTE ucChar; BASE_DIACRITP pBaseDiacritic; BASE_DIACRIT_TABLEP pTable; ucCharSet = HI( ui16BaseChar); if (ucCharSet > fwp_max_car60_size) { return( TRUE); } // Is base ASCII? If so, look in multinational 1 if (!ucCharSet) { ucCharSet = CHSMUL1; } if ((pBaseDiacritic = fwp_car60_c[ucCharSet]) == 0) { return( TRUE); } ucChar = LO( ui16BaseChar); ui16DiacriticChar = LO( ui16DiacriticChar); pTable = pBaseDiacritic->table; for (uiRemaining = pBaseDiacritic->char_count; uiRemaining; uiRemaining--, pTable++ ) { // Same base? if (pTable->base == ucChar && (pTable->diacrit & 0x7F) == ui16DiacriticChar) { // Same diacritic? *pui16WpChar = (FLMUINT16) (((FLMUINT16) ucCharSet << 8) + (pBaseDiacritic->start_char + (FLMUINT16)(pTable - pBaseDiacritic->table))); return( FALSE); } } return( TRUE); } /************************************************************************** Desc: Find the collating value of a WP character ret: Collating value (COLS0 is high value - undefined WP char) ***********************************************************************/ FLMUINT16 fwpGetCollation( FLMUINT16 ui16WpChar, FLMUINT uiLanguage) { FLMUINT16 ui16State; FLMBYTE ucCharVal; FLMBYTE ucCharSet; FLMBOOL bHebrewArabicFlag = FALSE; TBL_B_TO_BP * pColTbl = fwp_col60Tbl; // State ONLY for non-US if (uiLanguage != US_LANG) { if (uiLanguage == AR_LANG || // Arabic uiLanguage == FA_LANG || // Farsi - persian uiLanguage == HE_LANG || // Hebrew uiLanguage == UR_LANG) // Urdu { pColTbl = fwp_HebArabicCol60Tbl; bHebrewArabicFlag = TRUE; } else { // check if uiLanguage candidate for alternate double collating ui16State = getNextCharState( START_COL, uiLanguage); if (0 != (ui16State = getNextCharState( (ui16State ? ui16State // look at special case languages : START_ALL), // look at US and European (FLMUINT) ui16WpChar))) { return( ui16State); } } } ucCharVal = (FLMBYTE)ui16WpChar; ucCharSet = (FLMBYTE)(ui16WpChar >> 8); // This is an optimized version of f_b_bp_citrp() inline for performance do { if (pColTbl->key == ucCharSet) { FLMBYTE * pucColVals; // table of collating values pucColVals = pColTbl->charPtr; // Check if the value is in the range of collated chars // Above lower range of table? if (ucCharVal >= *pucColVals) { // Make value zero based to index ucCharVal -= *pucColVals++; // Below maximum number of table entries? if (ucCharVal < *pucColVals++) { // Return collated value. return( pucColVals[ ucCharVal]); } } } // Go to next table entry pColTbl++; } while (pColTbl->key != 0xFF); if (bHebrewArabicFlag) { if (ucCharSet == CHSHEB || ucCharSet == CHSARB1 || ucCharSet == CHSARB2) { // Same as COLS0_HEBREW return( COLS0_ARABIC); } } // Defaults for characters that don't have a collation value. return( COLS0); } /**************************************************************************** Desc: Check for double characters that sort as 1 (like ch in Spanish) or 1 character that should sort as 2 (like � sorts as ae in French). Return: 0 = nothing changes. Otherwise, *pui16WpChar is the first character, and the return value contains the 2nd character. In addition, *pbTwoIntoOne will be TRUE if we should take two characters and treat as one (i.e, change the collation on the outside to one more than the collation of the first character). ****************************************************************************/ FLMUINT16 fwpCheckDoubleCollation( FLMUINT16 * pui16WpChar, FLMBOOL * pbTwoIntoOne, const FLMBYTE ** ppucInputStr, FLMUINT uiLanguage) { FLMUINT16 ui16CurState; FLMUINT16 ui16WpChar; FLMUINT16 ui16SecondChar; FLMUINT16 ui16LastChar = 0; FLMUINT uiInLen; FLMBOOL bUpperFlag; ui16WpChar = *pui16WpChar; bUpperFlag = fwpIsUpper( ui16WpChar); uiInLen = 0; ui16SecondChar = 0; // Primer read if ((ui16CurState = getNextCharState( 0, uiLanguage)) == 0) { goto Exit; } for (;;) { switch (ui16CurState) { case INSTSG: *pui16WpChar = ui16SecondChar = (FLMUINT16)f_toascii( 's'); *pbTwoIntoOne = FALSE; goto Exit; case INSTAE: if (bUpperFlag) { *pui16WpChar = (FLMUINT16)f_toascii( 'A'); ui16SecondChar = (FLMUINT16)f_toascii( 'E'); } else { *pui16WpChar = (FLMUINT16)f_toascii( 'a'); ui16SecondChar = (FLMUINT16)f_toascii( 'e'); } *pbTwoIntoOne = FALSE; goto Exit; case INSTIJ: if (bUpperFlag) { *pui16WpChar = (FLMUINT16)f_toascii( 'I'); ui16SecondChar = (FLMUINT16)f_toascii( 'J'); } else { *pui16WpChar = (FLMUINT16)f_toascii( 'i'); ui16SecondChar = (FLMUINT16)f_toascii( 'j'); } *pbTwoIntoOne = FALSE; goto Exit; case INSTOE: if (bUpperFlag) { *pui16WpChar = (FLMUINT16)f_toascii( 'O'); ui16SecondChar = (FLMUINT16)f_toascii( 'E'); } else { *pui16WpChar = (FLMUINT16)f_toascii( 'o'); ui16SecondChar = (FLMUINT16)f_toascii( 'e'); } *pbTwoIntoOne = FALSE; goto Exit; case WITHAA: *pui16WpChar = (FLMUINT16)(bUpperFlag ? (FLMUINT16)0x122 : (FLMUINT16)0x123); (*ppucInputStr)++; break; case AFTERC: *pui16WpChar = (FLMUINT16)(bUpperFlag ? (FLMUINT16)f_toascii( 'C') : (FLMUINT16)f_toascii( 'c')); ui16SecondChar = ui16LastChar; *pbTwoIntoOne = TRUE; (*ppucInputStr)++; goto Exit; case AFTERH: *pui16WpChar = (FLMUINT16)(bUpperFlag ? (FLMUINT16)f_toascii( 'H') : (FLMUINT16)f_toascii( 'h')); ui16SecondChar = ui16LastChar; *pbTwoIntoOne = TRUE; (*ppucInputStr)++; goto Exit; case AFTERL: *pui16WpChar = (FLMUINT16)(bUpperFlag ? (FLMUINT16)f_toascii( 'L') : (FLMUINT16)f_toascii( 'l')); ui16SecondChar = ui16LastChar; *pbTwoIntoOne = TRUE; (*ppucInputStr)++; goto Exit; default: // Handles STATE1 through STATE11 also break; } if ((ui16CurState = getNextCharState( ui16CurState, fwpCh6Lower( ui16WpChar))) == 0) { goto Exit; } ui16LastChar = ui16WpChar; ui16WpChar = (FLMUINT16) *((*ppucInputStr) + (uiInLen++)); } Exit: return( ui16SecondChar); }