//------------------------------------------------------------------------- // Desc: Convert collated string to WP string - for Asian languages. // Tabs: 3 // // Copyright (c) 1993-2006 Novell, Inc. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of version 2 of the GNU General Public // License as published by the Free Software Foundation. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, contact Novell, Inc. // // To contact Novell about this file by physical or electronic mail, // you may find current contact information at www.novell.com // // $Id: kyasia2.cpp 12312 2006-01-19 15:14:03 -0700 (Thu, 19 Jan 2006) dsanders $ //------------------------------------------------------------------------- #include "flaimsys.h" #define SET_CASE_BIT 0x01 #define SET_KATAKANA_BIT 0x01 #define SET_WIDTH_BIT 0x02 #define COLS_ASIAN_MARK_VAL 0x40 /* With out 0x100 */ extern FLMUINT16 colToWPChr[]; /* Converts collated value to WP character */ extern FLMBYTE ml1_COLtoD[]; /* Diacritic conversions */ extern FLMBYTE KanaSubColTbl[]; /* Position in the table+1 is subColValue */ extern BYTE_WORD_TBL fwp_Ch24ColTbl[]; FLMBYTE ColToKanaTbl[ 48 ] /* Only 48 values + 0x40, 0x41, 0x42 (169..171) */ = { 0, /* a=0, A=1 */ 2, /* i=2, I=3 */ 4, /* u=4, U=5, VU=83 */ 6, /* e=6, E=7 */ 8, /* o=8, O=9 */ 84, /* KA=10, GA=11, ka=84 - remember voicing table is optimized */ /* so that zero value is position and */ /* if voice=1 and no 0 is changed to 0 */ 12, /* KI=12, GI=13 */ 14, /* KU=14, GU=15 */ 85, /* KE=16, GE=17, ke=85 */ 18, /* KO=18, GO=19 */ /*10*/ 20, /* SA=20, ZA=21 */ 22, /* SHI=22, JI=23 */ 24, /* SU=24, ZU=25 */ 26, /* SE=26, ZE=27 */ 28, /* SO=28, ZO=29 */ 30, /* TA=30, DA=31 */ 32, /* CHI=32, JI=33 */ 34, /* tsu=34, TSU=35, ZU=36 */ 37, /* TE=37, DE=38 */ 39, /* TO=39, DO=40 */ /*20*/ 41, /* NA */ 42, /* NI */ 43, /* NU */ 44, /* NE */ 45, /* NO */ 46, /* HA, BA, PA */ 49, /* HI, BI, PI */ 52, /* FU, BU, PU */ 55, /* HE, BE, PE */ 58, /* HO, BO, PO */ /*30*/ 61, /* MA */ 62, /* MI */ 63, /* MU */ 64, /* ME */ 65, /* MO */ 66, /* ya, YA */ 68, /* yu, YU */ 70, /* yo, YO */ 72, /* RA */ 73, /* RI */ /*40*/ 74, /* RU */ 75, /* RE */ 76, /* RO */ 77, /* wa, WA */ 79, /* WI */ 80, /* WE */ 81, /* WO */ 82 /* N */ }; /*************************************************************************** Desc: Get the original string from an asian collation string Ret: Length of the word string in bytes ****************************************************************************/ FLMUINT AsiaConvertColStr( FLMBYTE * CollatedStr, /* Points to the Flaim collated string */ FLMUINT * CollatedStrLenRV, /* Length of the Flaim collated string */ FLMBYTE * WordStr, /* Output string to build - WP word string */ FLMBOOL * pbDataTruncated, /* Set to TRUE if data was truncated */ FLMBOOL * pbFirstSubstring) /* Set to TRUE if marker exists */ { FLMBYTE * pWordStr = WordStr; /* Points to the word string data area */ FLMUINT Length = *CollatedStrLenRV;/* May optimize as a register */ FLMUINT CollStrPos = 0; /* Position in CollatedStr[] */ FLMBOOL bHadExtended = FALSE; FLMUINT WordStrLen; FLMUINT16 ColChar; /* 2 byte value for asian */ while( Length) { FLMBYTE CharVal, CharSet; CharSet = CollatedStr[ CollStrPos ]; CharVal = CollatedStr[ CollStrPos + 1 ]; ColChar = (FLMUINT16)((CharSet << 8) + CharVal); if( ColChar <= MAX_COL_OPCODE) break; CollStrPos += 2; Length -= 2; if( CharSet == 0) /* Normal Latin/Greek/Cyrillic value */ { ColChar = colToWPChr[ CharVal - COLLS ]; } else if( CharSet == 1) /* katakana or hiragana character */ { if( CharVal > sizeof( ColToKanaTbl )) /* Special cases below */ { if( CharVal == COLS_ASIAN_MARK_VAL) /* dakuten */ ColChar = 0x240a; else if( CharVal == COLS_ASIAN_MARK_VAL + 1) /* handakuten */ ColChar = 0x240b; else if( CharVal == COLS_ASIAN_MARK_VAL + 2) /* chuuten */ ColChar = 0x2405; else ColChar = 0xFFFF; /* error */ } else { ColChar = (FLMUINT16)(0x2600 + ColToKanaTbl[ CharVal ]); } } else if( CharSet != 0xFF || CharVal != 0xFF) // Asian characters { // Insert zeroes that will be treated as a signal for // uncoverted unicode characters later on. NOTE: Cannot // use 0xFFFF, because we need to be able to detect this // case in the sub-collation stuff, and we don't want // to confuse it with the 0xFFFF that may have been inserted // in another case. // THIS IS A REALLY BAD HACK, BUT IT IS THE BEST WE CAN DO // FOR NOW! *pWordStr++ = 0; *pWordStr++ = 0; bHadExtended = TRUE; } /* else does not have a collation value - found in sub-collation part */ UW2FBA( ColChar, pWordStr ); /* Put the uncollation value back */ pWordStr += 2; } UW2FBA( 0, pWordStr); /* NULL Terminate the string */ WordStrLen = (FLMUINT) (pWordStr - WordStr); /**-------------------------------------------------------------------- *** Parse through the sub-collation and case information. *** Watch out for COMP CollStrPosT indexes-doesn't have case info after *** Here are values for some of the codes: *** [ 0x01] - end for fields case info follows - for COMP POST indexes *** [ 0x02] - compound marker *** [ 0x05] - case bits follow *** [ 0x06] - case information is all uppercase *** [ 0x07] - beginning of sub-collation information *** [ 0x08] - first substring field that is made *** [ 0x09] - truncation marker for text and binary *** *** Asian chars the case information should always be there and not *** compressed out. This is because the case information could change *** the actual width of the character from 0x26xx to charset 11. ***-------------------------------------------------------------------*/ /** *** Does truncation marker or sub-collation follow? **/ if( Length) { ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos+1]); // First substring is before truncated. if( ColChar == COLL_FIRST_SUBSTRING) { if( pbFirstSubstring) *pbFirstSubstring = TRUE; // Don't need to initialize to FALSE. Length -= 2; CollStrPos += 2; ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos+1]); } if( ColChar == COLL_TRUNCATED) { if( pbDataTruncated) *pbDataTruncated = TRUE; // Don't need to initialize to FALSE. Length -= 2; CollStrPos += 2; ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos+1]); } if( ColChar == (COLL_MARKER | SC_SUB_COL)) { FLMUINT TempLen; /* Do another pass on the word string adding diacritics/voicings */ CollStrPos += 2; Length -= 2; TempLen = AsiaParseSubCol( WordStr, &WordStrLen, &CollatedStr[ CollStrPos ]); CollStrPos += TempLen; Length -= TempLen; } else goto check_case; } /** *** Does the case info follow? - It may not because of post indexes **/ if( Length) { ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos+1]); check_case: if( ColChar == (COLL_MARKER | SC_MIXED)) { CollStrPos += 2; CollStrPos += AsiaParseCase( WordStr, &WordStrLen, &CollatedStr[CollStrPos]); // Set bHadExtended to FALSE, because they will have // been taken care of in this pass. bHadExtended = FALSE; } } // Change embedded zeroes to 0xFFFFs if (bHadExtended) { FLMUINT uiCnt; FLMBYTE * pucTmp; for (uiCnt = 0, pucTmp = WordStr; uiCnt < WordStrLen; uiCnt += 2, pucTmp += 2) { if (FB2UW( pucTmp) == 0) { UW2FBA( 0xFFFF, pucTmp); } } } /* Follow marker is 2 bytes if post otherwise will be 1 byte */ /* Should make a pass and count the extended characters */ *CollatedStrLenRV = CollStrPos; /* value should be on 0x01 or 0x02 flag */ return( WordStrLen); /* Return the length of the word string */ } /**************************************************************************** Desc: Combine the diacritic 5 and 16 bit values to an existing word string. Ret: FLMUINT - Number of bytes parsed Notes: For each bit in the sub-collation section: 0 - no subcollation information 10 - take next 5 bits - will tell about diacritics or japanese vowel 110 - align to next byte & take word value as extended character ****************************************************************************/ FLMUINT AsiaParseSubCol( FLMBYTE * WordStr, /* Existing word string to modify */ FLMUINT * puiWordStrLen, /* Wordstring length in bytes */ FLMBYTE * SubColBuf /* Diacritic values in 5 bit sets */ ) { FLMUINT SubColBitPos = 0; FLMUINT NumWords = *puiWordStrLen >> 1; FLMUINT16 Diac; FLMUINT16 WpChar; /* For each word in the word string ... */ while( NumWords--) { // Have to skip 0, because it is not accounted for // in the sub-collation bits. It was inserted when we // encountered unconverted unicode characters (Asian). // Will be converted to something else later on. // SEE NOTE ABOVE. if (FB2UW( WordStr) == 0) { WordStr += 2; continue; } /* This macro DOESN'T increment bitPos */ if( TEST1BIT( SubColBuf, SubColBitPos)) { /** *** Bits 10 - take next 5 bits *** Bits 110 align and take next word *** Bits 11110 align and take unicode value **/ SubColBitPos++; if( ! TEST1BIT( SubColBuf, SubColBitPos)) { SubColBitPos++; Diac = (FLMUINT16)(GETnBITS( 5, SubColBuf, SubColBitPos)); SubColBitPos += 5; if( (WpChar = FB2UW( WordStr )) < 0x100) { if( (WpChar >= 'A') && (WpChar <= 'Z')) { /* Convert to WP diacritic and combine characters */ fwpCh6Cmbcar( &WpChar, WpChar, (FLMUINT16) ml1_COLtoD[Diac] ); /* Even if cmbcar fails, WpChar is still set to a valid value */ } else /* Symbols from charset 0x24 */ { WpChar = (FLMUINT16)(0x2400 + fwp_Ch24ColTbl[ Diac - 1 ].ByteValue); } } else if( WpChar >= 0x2600) /* Katakana */ { /** *** Voicings - will allow to select original char *** 000 - some 001 are changed to 000 to save space *** 001 - set if large char (uppercase) *** 010 - set if voiced *** 100 - set if half voiced *** *** Should NOT match voicing or wouldn't be here! **/ FLMBYTE CharVal = (FLMBYTE)(WpChar & 0xFF); /* Try exceptions first so don't access out of bounds */ if( CharVal == 84) WpChar = (FLMUINT16)(0x2600 + ((Diac == 1) ? (FLMUINT16)10 : (FLMUINT16)11)); else if( CharVal == 85) WpChar = (FLMUINT16)(0x2600 + ((Diac == 1) ? (FLMUINT16)16 : (FLMUINT16)17)); /* Try the next 2 slots, if not then value is 83,84 or 85 */ else if( KanaSubColTbl[ CharVal + 1 ] == Diac ) WpChar++; else if( /* (Diac == 5) && ZU is an exception! */ (KanaSubColTbl[ CharVal + 2 ] == Diac )) WpChar += 2; /* last exception below */ else if( CharVal == 4) WpChar = 0x2600 + 83; /* else leave alone! - invalid storage */ } UW2FBA( WpChar, WordStr ); /* Set if changed or not */ } else /* "110" */ { FLMUINT Temp; SubColBitPos++; /* Skip second '1' */ if( TEST1BIT( SubColBuf, SubColBitPos)) /* 11?10 ? */ { /* Unconvertable UNICODE character */ /* The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode */ shiftN( WordStr, (FLMUINT16)(NumWords + NumWords + 4), 2 ); WordStr += 2; /* Skip the 0xFFFF for now */ SubColBitPos += 2; /* Skip next "11" */ (*puiWordStrLen) += 2; } SubColBitPos++; /* Skip the zero */ /* Round up to next byte */ SubColBitPos = (SubColBitPos + 7) & (~7); Temp = BYTES_IN_BITS( SubColBitPos ); WordStr[1] = SubColBuf[ Temp ]; /* Character set */ WordStr[0] = SubColBuf[ Temp + 1 ]; /* Character */ SubColBitPos += 16; } } else SubColBitPos++; /* Be sure to increment this! */ WordStr += 2; /* Next WP character */ } return( BYTES_IN_BITS( SubColBitPos )); } /**************************************************************************** Desc: The case bits for asia are: Latin/Greek/Cyrillic 01 - case bit set if character is uppercase 10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx Japanese 00 - double wide hiragana 0x255e..25b0 01 - double wide katakana 0x2600..2655 10 - single wide symbols from charset 11 that map to CS24?? 11 - single wide katakana from charset 11 Ret: Notes: This is tricky to really understand the inputs. This looks at the bits according to the current character value. ****************************************************************************/ FLMUINT AsiaParseCase( FLMBYTE * WordStr, /* Existing word string to modify */ FLMUINT * WordStrLenRV, /* Length of the WordString in bytes */ FLMBYTE * pCaseBits /* Lower/upper case bit string */ ) { FLMUINT WordStrLen = *WordStrLenRV; FLMUINT uiWordCnt; FLMUINT uiExtraBytes = 0; FLMUINT16 WpChar; FLMBYTE TempByte = 0; FLMBYTE MaskByte; /* For each character in the word string ... */ for( uiWordCnt = WordStrLen >> 1,/* Total number of words in word string */ MaskByte = 0; /* Force first time to get a byte */ uiWordCnt--;) /* Test */ { FLMBYTE CharSet, CharVal; WpChar = FB2UW( WordStr ); /* Get the next character */ // Must skip any 0xFFFFs or zeroes that were inserted. if (WpChar == 0xFFFF || WpChar == 0) { // Put back 0xFFFF in case it was a zero. UW2FBA( 0xFFFF, WordStr); WordStr += 2; uiExtraBytes += 2; continue; } if( MaskByte == 0) /* Time to get another byte */ { TempByte = *pCaseBits++; MaskByte = 0x80; } CharSet = (FLMBYTE)(WpChar >> 8); CharVal = (FLMBYTE)(WpChar & 0xFF); if( WpChar < 0x2400 ) /*** SINGLE WIDE - NORMAL CHARACTERS ***/ { if( TempByte & MaskByte) /* convert to double wide? */ { /** *** Latin/greek/cyrillic *** Convert to uppercase double wide char **/ if( CharSet == 0) /* Latin - uppercase */ { /* May convert to 0x250F (Latin) or CS24 */ if( WpChar >= 'A' && WpChar <= 'Z') WpChar = (FLMUINT16)(WpChar - 0x30 + 0x250F); /* Convert to double wide*/ else HanToZenkaku( WpChar, 0, &WpChar ); } else if( CharSet == 8) /* Greek */ { if( CharVal > 38) /* Adjust for spaces in greek */ CharVal -= 2; if( CharVal > 4) CharVal -= 2; WpChar = (FLMUINT16)((CharVal >> 1) + 0x265E); } else if( CharSet == 10) /* Cyrillic */ { WpChar = (FLMUINT16)((CharVal >> 1) + 0x2700); } else HanToZenkaku( WpChar, 0, &WpChar ); CharSet = (FLMBYTE)(WpChar >> 8); /* Less code this way */ CharVal = (FLMBYTE)(WpChar & 0xFF); } MaskByte >>= 1; /* Next bit */ if( ( TempByte & MaskByte) == 0) /* Change to lower case? */ { switch( CharSet) /* Convert WpChar to lower case */ { case 0: WpChar |= 0x20; /* Bit zero only if lower case */ break; case 1: if( CharVal >= 26) /* in upper/lower case region? */ WpChar++; break; case 8: if( CharVal <= 69) /* All lowercase after 69 */ WpChar++; break; case 10: if( CharVal <= 199) /* No cases after 199 */ WpChar++; break; case 0x25: case 0x26: /* should be double wide latin or greek */ WpChar += 0x20; /* Add offset to convert to lowercase */ break; case 0x27: /* double wide cyrillic only */ WpChar += 0x30; /* Add offset to convert to lowercase */ break; } } } else /*** JAPANESE CHARACTERS ***/ { if( TempByte & MaskByte) /* Original chars from CharSet 11 */ { if( CharSet == 0x26) /* Convert to ZenToHankaku */ { FLMUINT16 NextChar = 0; WpChar = ZenToHankaku( WpChar, &NextChar ); if( NextChar) /* Move everone down */ { uiWordCnt++; shiftN( WordStr, uiWordCnt + uiWordCnt + 2, 2 ); UW2FBA( WpChar, WordStr ); WordStr += 2; WpChar = NextChar; /* Store this below */ *WordStrLenRV = *WordStrLenRV + 2; /* Adjust length */ /* Don't change WordStrLen - returns # bits used */ } } else if( CharSet == 0x24) { WpChar = ZenToHankaku( WpChar, (FLMUINT16 *) 0 ); } MaskByte >>= 1; /* Eat next bit! */ } else { MaskByte >>= 1; /* Next bit */ if( (TempByte & MaskByte) == 0) /* Convert to hiragana? */ { /* kanji will also fall through here */ if( CharSet == 0x26) WpChar = (FLMUINT16)(0x255E + CharVal); /* Convert to hiragana */ } } } UW2FBA( WpChar, WordStr ); WordStr += 2; MaskByte >>= 1; } uiWordCnt = WordStrLen - uiExtraBytes; // Should be 2 bits for each character. return( BYTES_IN_BITS( uiWordCnt )); }