Files
mars-flaim/flaim/src/kyasia2.cpp
dsandersoremutah c55dab446f Renamed version4 to flaim and version5 to xflaim
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2006-01-27 21:06:39 +00:00

603 lines
17 KiB
C++

//-------------------------------------------------------------------------
// Desc: Convert collated string to WP string - for Asian languages.
// Tabs: 3
//
// Copyright (c) 1993-2006 Novell, Inc. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of version 2 of the GNU General Public
// License as published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com
//
// $Id: kyasia2.cpp 12312 2006-01-19 15:14:03 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------
#include "flaimsys.h"
/* Bit flags, presumably for the two-bit case/width field of Asian keys.
   NOTE(review): none of these three flags is referenced in the visible part
   of this file - confirm where they are used before changing them. */
#define SET_CASE_BIT 0x01
#define SET_KATAKANA_BIT 0x01
#define SET_WIDTH_BIT 0x02
/* Low byte of the collation value for the dakuten mark; +1 is handakuten
   and +2 is chuuten.  These values are only recognized when the high
   (character set) byte of the collation value is 1. */
#define COLS_ASIAN_MARK_VAL 0x40 /* Without the 0x100 */
/* Tables defined elsewhere in the project. */
extern FLMUINT16 colToWPChr[]; /* Converts collated value to WP character; indexed by (value - COLLS) */
extern FLMBYTE ml1_COLtoD[]; /* Diacritic conversions; indexed by the 5-bit sub-collation value */
/* Indexed by the low byte of a 0x26xx kana character and compared with the
   5-bit sub-collation value to select a voiced/sized variant. */
extern FLMBYTE KanaSubColTbl[];
/* Position in the table+1 is subColValue, so the table is indexed with
   (subColValue - 1); the ByteValue member is the character within
   character set 0x24. */
extern BYTE_WORD_TBL fwp_Ch24ColTbl[];
/*
 * Kana collation value -> base character within WP character set 0x26.
 *
 * There are exactly 48 collation values (indexes 0..47).  The three mark
 * values 0x40, 0x41 and 0x42 (169..171) are not in this table.
 *
 * Each entry is the first (base) character of a group; the comments list
 * every character of the group with its character value.  The voicing
 * table is optimized so that a zero sub-collation value means "this
 * position"; that is why KA and KE map to the small ka=84 / ke=85 forms
 * (if voice=1 and no 0 is changed to 0).
 */
FLMBYTE ColToKanaTbl[ 48 ] =
{
	/*  0.. 4  a=0/A=1, i=2/I=3, u=4/U=5/VU=83, e=6/E=7, o=8/O=9 */
	0, 2, 4, 6, 8,

	/*  5.. 9  KA=10/GA=11/ka=84, KI=12/GI=13, KU=14/GU=15,
	           KE=16/GE=17/ke=85, KO=18/GO=19 */
	84, 12, 14, 85, 18,

	/* 10..14  SA=20/ZA=21, SHI=22/JI=23, SU=24/ZU=25, SE=26/ZE=27,
	           SO=28/ZO=29 */
	20, 22, 24, 26, 28,

	/* 15..19  TA=30/DA=31, CHI=32/JI=33, tsu=34/TSU=35/ZU=36,
	           TE=37/DE=38, TO=39/DO=40 */
	30, 32, 34, 37, 39,

	/* 20..24  NA, NI, NU, NE, NO */
	41, 42, 43, 44, 45,

	/* 25..29  HA/BA/PA, HI/BI/PI, FU/BU/PU, HE/BE/PE, HO/BO/PO */
	46, 49, 52, 55, 58,

	/* 30..34  MA, MI, MU, ME, MO */
	61, 62, 63, 64, 65,

	/* 35..37  ya/YA, yu/YU, yo/YO */
	66, 68, 70,

	/* 38..42  RA, RI, RU, RE, RO */
	72, 73, 74, 75, 76,

	/* 43..47  wa/WA, WI, WE, WO, N */
	77, 79, 80, 81, 82
};
/***************************************************************************
Desc:	Get the original string from an asian collation string.

		The collated string is read as a sequence of 2-byte values, high
		(character set) byte first.  Values above MAX_COL_OPCODE are
		collation values and are turned back into one WP word each; the
		first value at or below MAX_COL_OPCODE ends the collation part.
		The optional first-substring and truncation markers, the
		sub-collation section and the case section that follow are then
		applied to the word string that was built.

In:	CollatedStr       - collated string to convert
		CollatedStrLenRV  - on entry, number of bytes available in
		                    CollatedStr; on return, number of bytes consumed
		WordStr           - output buffer for the WP word string.  No size
		                    is passed in and no bounds are checked, so the
		                    caller must supply a large enough buffer.
		pbDataTruncated   - may be NULL; set to TRUE when the truncation
		                    marker is found, otherwise left untouched
		pbFirstSubstring  - may be NULL; set to TRUE when the first
		                    substring marker is found, otherwise left
		                    untouched

Ret:	Length of the word string in bytes (the 2-byte terminator written
		after the collation pass is not counted)
****************************************************************************/
FLMUINT AsiaConvertColStr(
	FLMBYTE *	CollatedStr,			/* Points to the Flaim collated string */
	FLMUINT *	CollatedStrLenRV,		/* Length of the Flaim collated string */
	FLMBYTE *	WordStr,					/* Output string to build - WP word string */
	FLMBOOL *	pbDataTruncated,		/* Set to TRUE if data was truncated */
	FLMBOOL *	pbFirstSubstring)		/* Set to TRUE if marker exists */
{
	FLMBYTE *	pWordStr = WordStr;				/* Next output position */
	FLMUINT		Length = *CollatedStrLenRV;	/* Bytes left in CollatedStr */
	FLMUINT		CollStrPos = 0;					/* Position in CollatedStr[] */
	FLMBOOL		bHadExtended = FALSE;			/* Zero placeholder words were written */
	FLMUINT		WordStrLen;
	FLMUINT16	ColChar;								/* 2 byte value for asian */

	/* Pass 1: turn every collation value back into a WP word. */

	while( Length)
	{
		FLMBYTE	CharVal, CharSet;

		CharSet = CollatedStr[ CollStrPos ];
		CharVal = CollatedStr[ CollStrPos + 1 ];
		ColChar = (FLMUINT16)((CharSet << 8) + CharVal);

		/* An opcode (marker) ends the collation values. */

		if( ColChar <= MAX_COL_OPCODE)
			break;

		CollStrPos += 2;
		Length -= 2;

		if( CharSet == 0)					/* Normal Latin/Greek/Cyrillic value */
		{
			ColChar = colToWPChr[ CharVal - COLLS ];
		}
		else if( CharSet == 1)			/* katakana or hiragana character */
		{
			/* Valid table indexes are 0..sizeof(ColToKanaTbl)-1.  The test
			   must be ">=": with ">" a CharVal equal to the table size
			   would read one element past the end of ColToKanaTbl. */

			if( CharVal >= sizeof( ColToKanaTbl ))	/* Special cases below */
			{
				if( CharVal == COLS_ASIAN_MARK_VAL)				/* dakuten */
					ColChar = 0x240a;
				else if( CharVal == COLS_ASIAN_MARK_VAL + 1)	/* handakuten */
					ColChar = 0x240b;
				else if( CharVal == COLS_ASIAN_MARK_VAL + 2)	/* chuuten */
					ColChar = 0x2405;
				else
					ColChar = 0xFFFF;									/* error */
			}
			else
			{
				ColChar = (FLMUINT16)(0x2600 + ColToKanaTbl[ CharVal ]);
			}
		}
		else if( CharSet != 0xFF || CharVal != 0xFF)	// Asian characters
		{
			// Insert zeroes that will be treated as a signal for
			// unconverted unicode characters later on.  NOTE: Cannot
			// use 0xFFFF, because we need to be able to detect this
			// case in the sub-collation stuff, and we don't want
			// to confuse it with the 0xFFFF that may have been inserted
			// in another case.
			// THIS IS A REALLY BAD HACK, BUT IT IS THE BEST WE CAN DO
			// FOR NOW!
			// The collation value itself is stored, unchanged, in the
			// word that follows the zero word (see UW2FBA below).

			*pWordStr++ = 0;
			*pWordStr++ = 0;
			bHadExtended = TRUE;
		}
		/* else 0xFFFF: does not have a collation value - the character is
		   found in the sub-collation part; 0xFFFF is stored for now. */

		UW2FBA( ColChar, pWordStr );	/* Put the uncollation value back */
		pWordStr += 2;
	}

	UW2FBA( 0, pWordStr);				/* NULL Terminate the string */
	WordStrLen = (FLMUINT) (pWordStr - WordStr);

	/**--------------------------------------------------------------------
	***  Parse through the sub-collation and case information.
	***  Watch out for COMP POST indexes - doesn't have case info after
	***  Here are values for some of the codes:
	***   [ 0x01] - end for fields case info follows - for COMP POST indexes
	***   [ 0x02] - compound marker
	***   [ 0x05] - case bits follow
	***   [ 0x06] - case information is all uppercase
	***   [ 0x07] - beginning of sub-collation information
	***   [ 0x08] - first substring field that is made
	***   [ 0x09] - truncation marker for text and binary
	***
	***  Asian chars the case information should always be there and not
	***  compressed out.  This is because the case information could change
	***  the actual width of the character from 0x26xx to charset 11.
	***-------------------------------------------------------------------*/

	/**
	***  Does truncation marker or sub-collation follow?
	**/

	/* NOTE(review): after a marker is consumed below, the next 2-byte value
	   is read without re-checking Length, so when the marker was the last
	   thing in the string the read goes past the stated length.  Left as is
	   here; confirm what callers guarantee about the buffer. */

	if( Length)
	{
		ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) +
									  CollatedStr[CollStrPos+1]);

		// First substring is before truncated.

		if( ColChar == COLL_FIRST_SUBSTRING)
		{
			if( pbFirstSubstring)
				*pbFirstSubstring = TRUE;	// Don't need to initialize to FALSE.
			Length -= 2;
			CollStrPos += 2;
			ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) +
										  CollatedStr[CollStrPos+1]);
		}

		if( ColChar == COLL_TRUNCATED)
		{
			if( pbDataTruncated)
				*pbDataTruncated = TRUE;	// Don't need to initialize to FALSE.
			Length -= 2;
			CollStrPos += 2;
			ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) +
										  CollatedStr[CollStrPos+1]);
		}

		if( ColChar == (COLL_MARKER | SC_SUB_COL))
		{
			FLMUINT	TempLen;

			/* Do another pass on the word string adding diacritics/voicings */

			CollStrPos += 2;
			Length -= 2;
			TempLen = AsiaParseSubCol( WordStr, &WordStrLen,
												&CollatedStr[ CollStrPos ]);
			CollStrPos += TempLen;
			Length -= TempLen;
		}
		else
		{
			/* ColChar already holds the next value; test it for the case
			   marker without reading it again. */

			goto check_case;
		}
	}

	/**
	***  Does the case info follow? - It may not because of post indexes
	**/

	if( Length)
	{
		ColChar = (FLMUINT16)((CollatedStr[CollStrPos] << 8) +
									  CollatedStr[CollStrPos+1]);
check_case:
		if( ColChar == (COLL_MARKER | SC_MIXED))
		{
			CollStrPos += 2;
			CollStrPos += AsiaParseCase( WordStr, &WordStrLen,
												  &CollatedStr[CollStrPos]);

			// Set bHadExtended to FALSE, because they will have
			// been taken care of in this pass.

			bHadExtended = FALSE;
		}
	}

	// Change embedded zeroes to 0xFFFFs

	if (bHadExtended)
	{
		FLMUINT		uiCnt;
		FLMBYTE *	pucTmp;

		for (uiCnt = 0, pucTmp = WordStr;
			  uiCnt < WordStrLen;
			  uiCnt += 2, pucTmp += 2)
		{
			if (FB2UW( pucTmp) == 0)
			{
				UW2FBA( 0xFFFF, pucTmp);
			}
		}
	}

	/* Follow marker is 2 bytes if post otherwise will be 1 byte */
	/* Should make a pass and count the extended characters */

	*CollatedStrLenRV = CollStrPos;	/* value should be on 0x01 or 0x02 flag */
	return( WordStrLen);					/* Return the length of the word string */
}
/****************************************************************************
Desc:	Combine the diacritic 5 and 16 bit values to an existing word string.
		The word string is modified in place and may grow by one word for
		every unconvertible Unicode character that is restored.

In:	WordStr        - word string to modify
		puiWordStrLen  - length of the word string in bytes; increased by 2
		                 each time a word is inserted
		SubColBuf      - sub-collation bit stream

Ret:	FLMUINT - Number of bytes parsed

Notes:	One entry is read from the bit stream for each non-zero word:
			0     - no subcollation information
			10    - take next 5 bits - will tell about diacritics or
			        japanese vowel
			110   - align to next byte & take word value as extended
			        character
			11110 - align to next byte & take word value as an unconvertible
			        Unicode character, stored after a 0xFFFF word
		Zero words (placeholders written by the caller in front of
		unconverted Asian characters) are skipped and consume no bits.
****************************************************************************/
FLMUINT AsiaParseSubCol(
	FLMBYTE *	WordStr,				/* Existing word string to modify */
	FLMUINT *	puiWordStrLen,		/* Wordstring length in bytes */
	FLMBYTE *	SubColBuf			/* Diacritic values in 5 bit sets */
	)
{
	FLMUINT		SubColBitPos = 0;
	FLMUINT		NumWords = *puiWordStrLen >> 1;
	FLMUINT16	Diac;
	FLMUINT16	WpChar;

	/* For each word in the word string ... */

	while( NumWords--)
	{
		// Have to skip 0, because it is not accounted for
		// in the sub-collation bits.  It was inserted when we
		// encountered unconverted unicode characters (Asian).
		// Will be converted to something else later on.

		if (FB2UW( WordStr) == 0)
		{
			WordStr += 2;
			continue;
		}

		/* This macro DOESN'T increment bitPos */

		if( TEST1BIT( SubColBuf, SubColBitPos))
		{
			/**
			***  Bits 10 - take next 5 bits
			***  Bits 110 align and take next word
			***  Bits 11110 align and take unicode value
			**/

			SubColBitPos++;
			if( ! TEST1BIT( SubColBuf, SubColBitPos))
			{
				/* "10" - a 5 bit value follows */

				SubColBitPos++;
				Diac = (FLMUINT16)(GETnBITS( 5, SubColBuf, SubColBitPos));
				SubColBitPos += 5;

				if( (WpChar = FB2UW( WordStr )) < 0x100)
				{
					if( (WpChar >= 'A') && (WpChar <= 'Z'))
					{
						/* Convert to WP diacritic and combine characters */

						fwpCh6Cmbcar( &WpChar, WpChar, (FLMUINT16) ml1_COLtoD[Diac] );

						/* Even if cmbcar fails, WpChar is still set to a valid value */
					}
					else if( Diac != 0)		/* Symbols from charset 0x24 */
					{
						/* The sub-collation value is the table position + 1,
						   so zero is not a valid value; without the test
						   above it would index the table with -1. */

						WpChar = (FLMUINT16)(0x2400 + fwp_Ch24ColTbl[ Diac - 1 ].ByteValue);
					}
					/* else leave alone! - invalid storage */
				}
				else if( WpChar >= 0x2600)		/* Katakana */
				{
					/**
					***  Voicings - will allow to select original char
					***    000 - some 001 are changed to 000 to save space
					***    001 - set if large char (uppercase)
					***    010 - set if voiced
					***    100 - set if half voiced
					***
					***  Should NOT match voicing or wouldn't be here!
					**/

					FLMBYTE	CharVal = (FLMBYTE)(WpChar & 0xFF);

					/* Try exceptions first so don't access out of bounds */

					if( CharVal == 84)
						WpChar = (FLMUINT16)(0x2600 +
											((Diac == 1)
											 ? (FLMUINT16)10
											 : (FLMUINT16)11));
					else if( CharVal == 85)
						WpChar = (FLMUINT16)(0x2600 +
											((Diac == 1)
											 ? (FLMUINT16)16
											 : (FLMUINT16)17));

					/* Try the next 2 slots, if not then value is 83,84 or 85 */

					else if( KanaSubColTbl[ CharVal + 1 ] == Diac )
						WpChar++;
					else if( /* (Diac == 5) && ZU is an exception! */
								(KanaSubColTbl[ CharVal + 2 ] == Diac ))
						WpChar += 2;

					/* last exception below */

					else if( CharVal == 4)
						WpChar = 0x2600 + 83;

					/* else leave alone! - invalid storage */
				}

				UW2FBA( WpChar, WordStr );		/* Set if changed or not */
			}
			else		/* "110" */
			{
				FLMUINT	Temp;

				SubColBitPos++;								/* Skip second '1' */
				if( TEST1BIT( SubColBuf, SubColBitPos))	/* 11?10 ? */
				{
					/* Unconvertable UNICODE character */
					/* The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode */
					/* The shifted size covers the current word, the words
					   still to be visited and 2 more bytes (presumably the
					   terminator - TODO confirm against shiftN). */

					shiftN( WordStr, (FLMUINT16)(NumWords + NumWords + 4), 2 );
					WordStr += 2;				/* Skip the 0xFFFF for now */
					SubColBitPos += 2;		/* Skip next "11" */
					(*puiWordStrLen) += 2;
				}
				SubColBitPos++;				/* Skip the zero */

				/* Round up to next byte */

				SubColBitPos = (SubColBitPos + 7) & (~7);
				Temp = BYTES_IN_BITS( SubColBitPos );
				WordStr[1] = SubColBuf[ Temp ];			/* Character set */
				WordStr[0] = SubColBuf[ Temp + 1 ];		/* Character */
				SubColBitPos += 16;
			}
		}
		else
			SubColBitPos++;		/* Be sure to increment this! */

		WordStr += 2;				/* Next WP character */
	}

	return( BYTES_IN_BITS( SubColBitPos ));
}
/****************************************************************************
Desc: Apply the case/width bits to an existing word string, in place.
Two bits are consumed, most significant bit first, for every word that is
not 0xFFFF or zero.  The case bits for asia are:
Latin/Greek/Cyrillic
01 - case bit set if character is uppercase
10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx
Japanese
00 - double wide hiragana 0x255e..25b0
01 - double wide katakana 0x2600..2655
10 - single wide symbols from charset 11 that map to CS24??
11 - single wide katakana from charset 11
Ret: BYTES_IN_BITS of (word string length on entry - bytes of skipped
0xFFFF/zero words); the caller adds this to its position in the collated
string.  *WordStrLenRV is increased by 2 for every character that
ZenToHankaku splits into two characters.
Notes: This is tricky to really understand the inputs.
This looks at the bits according to the current character value.
****************************************************************************/
FLMUINT AsiaParseCase(
FLMBYTE * WordStr, /* Existing word string to modify */
FLMUINT * WordStrLenRV, /* Length of the WordString in bytes */
FLMBYTE * pCaseBits /* Lower/upper case bit string */
)
{
FLMUINT WordStrLen = *WordStrLenRV;
FLMUINT uiWordCnt;
FLMUINT uiExtraBytes = 0;
FLMUINT16 WpChar;
FLMBYTE TempByte = 0;
FLMBYTE MaskByte;
/* For each character in the word string ... */
for( uiWordCnt = WordStrLen >> 1,/* Total number of words in word string */
MaskByte = 0; /* Force first time to get a byte */
uiWordCnt--;) /* Test */
{
FLMBYTE CharSet, CharVal;
WpChar = FB2UW( WordStr ); /* Get the next character */
// Must skip any 0xFFFFs or zeroes that were inserted.
// These words have no case bits; their bytes are counted in
// uiExtraBytes so they are left out of the returned size.
if (WpChar == 0xFFFF || WpChar == 0)
{
// Put back 0xFFFF in case it was a zero.
UW2FBA( 0xFFFF, WordStr);
WordStr += 2;
uiExtraBytes += 2;
continue;
}
if( MaskByte == 0) /* Time to get another byte */
{
TempByte = *pCaseBits++;
MaskByte = 0x80;
}
CharSet = (FLMBYTE)(WpChar >> 8);
CharVal = (FLMBYTE)(WpChar & 0xFF);
if( WpChar < 0x2400 ) /*** SINGLE WIDE - NORMAL CHARACTERS ***/
{
/* First bit of the pair: character was double wide */
if( TempByte & MaskByte) /* convert to double wide? */
{
/**
*** Latin/greek/cyrillic
*** Convert to uppercase double wide char
**/
if( CharSet == 0) /* Latin - uppercase */
{
/* May convert to 0x250F (Latin) or CS24 */
if( WpChar >= 'A' && WpChar <= 'Z')
WpChar = (FLMUINT16)(WpChar - 0x30 + 0x250F); /* Convert to double wide*/
else
HanToZenkaku( WpChar, 0, &WpChar );
}
else if( CharSet == 8) /* Greek */
{
if( CharVal > 38) /* Adjust for spaces in greek */
CharVal -= 2;
if( CharVal > 4)
CharVal -= 2;
WpChar = (FLMUINT16)((CharVal >> 1) + 0x265E);
}
else if( CharSet == 10) /* Cyrillic */
{
WpChar = (FLMUINT16)((CharVal >> 1) + 0x2700);
}
else
HanToZenkaku( WpChar, 0, &WpChar );
/* Re-split the converted character so the switch below sees it */
CharSet = (FLMBYTE)(WpChar >> 8); /* Less code this way */
CharVal = (FLMBYTE)(WpChar & 0xFF);
}
MaskByte >>= 1; /* Next bit */
/* Second bit of the pair: set means uppercase, clear means lowercase */
if( ( TempByte & MaskByte) == 0) /* Change to lower case? */
{
/* Character sets not listed below are left unchanged */
switch( CharSet) /* Convert WpChar to lower case */
{
case 0:
WpChar |= 0x20; /* Bit zero only if lower case */
break;
case 1:
if( CharVal >= 26) /* in upper/lower case region? */
WpChar++;
break;
case 8:
if( CharVal <= 69) /* All lowercase after 69 */
WpChar++;
break;
case 10:
if( CharVal <= 199) /* No cases after 199 */
WpChar++;
break;
case 0x25:
case 0x26:
/* should be double wide latin or greek */
WpChar += 0x20; /* Add offset to convert to lowercase */
break;
case 0x27: /* double wide cyrillic only */
WpChar += 0x30; /* Add offset to convert to lowercase */
break;
}
}
}
else /*** JAPANESE CHARACTERS ***/
{
/* First bit of the pair: character was single wide (charset 11) */
if( TempByte & MaskByte) /* Original chars from CharSet 11 */
{
if( CharSet == 0x26) /* Convert to ZenToHankaku */
{
FLMUINT16 NextChar = 0;
WpChar = ZenToHankaku( WpChar, &NextChar );
/* A non-zero NextChar means the character became two characters;
make room for the second one in the word string. */
if( NextChar) /* Move everyone down */
{
/* NOTE(review): uiWordCnt is incremented to size the shift and is
not decremented afterwards, although WordStr advances past both
characters; the loop therefore appears to run one iteration more
than there are remaining words - confirm this is intended. */
uiWordCnt++;
shiftN( WordStr, uiWordCnt + uiWordCnt + 2, 2 );
UW2FBA( WpChar, WordStr );
WordStr += 2;
WpChar = NextChar; /* Store this below */
*WordStrLenRV = *WordStrLenRV + 2; /* Adjust length */
/* Don't change WordStrLen - returns # bits used */
}
}
else if( CharSet == 0x24)
{
WpChar = ZenToHankaku( WpChar, (FLMUINT16 *) 0 );
}
/* The second bit of the pair is not examined on this path */
MaskByte >>= 1; /* Eat next bit! */
}
else
{
MaskByte >>= 1; /* Next bit */
/* Second bit clear: hiragana; second bit set: leave as katakana */
if( (TempByte & MaskByte) == 0) /* Convert to hiragana? */
{
/* kanji will also fall through here */
if( CharSet == 0x26)
WpChar = (FLMUINT16)(0x255E + CharVal); /* Convert to hiragana */
}
}
}
UW2FBA( WpChar, WordStr );
WordStr += 2;
/* Step past the second bit of the pair; a zero mask makes the next
character fetch a new case byte. */
MaskByte >>= 1;
}
/* Every counted character is 2 bytes long and uses 2 bits, so the number
of bytes is also the number of bits used. */
uiWordCnt = WordStrLen - uiExtraBytes; // Should be 2 bits for each character.
return( BYTES_IN_BITS( uiWordCnt ));
}