Files
mars-flaim/ftk/src/ftkcoll.cpp
ahodgkinson 4e712ffbe0 Changed license to LGPL.
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@1007 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2007-01-23 07:50:29 +00:00

10116 lines
268 KiB
C++
Raw Blame History

//------------------------------------------------------------------------------
// Desc: Routines for building collation keys
// Tabs: 3
//
// Copyright (c) 1993-2007 Novell, Inc. All Rights Reserved.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; version 2.1
// of the License.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Library Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com.
//
// $Id: fcollate.cpp 3111 2006-01-19 13:10:50 -0700 (Thu, 19 Jan 2006) dsanders $
//------------------------------------------------------------------------------
#include "ftksys.h"
// Collating Sequence Equates
// Each COLSn is the first collation value of a character class. Most are
// defined relative to the previous class, so the constant that is added is
// the width of that previous class. Resolved values are shown in [brackets].
#define COLLS 32 // first collating number (space/end of line)
#define COLS0 255 // graphics/misc - chars without a collate value
#define COLS1 (COLLS + 9) // quotes [41]
#define COLS2 (COLS1 + 5) // parens [46]
#define COLS3 (COLS2 + 6) // money [52]
#define COLS4 (COLS3 + 6) // math ops [58]
#define COLS5 (COLS4 + 8) // math others [66]
#define COLS6 (COLS5 + 14) // others: %#&@\_|~ [80]
#define COLS7 (COLS6 + 13) // greek [93]
#define COLS8 (COLS7 + 25) // numbers [118]
#define COLS9 (COLS8 + 10) // alphabet [128]
#define COLS10 (COLS9 + 60) // cyrillic [188]
#define COLS10h (COLS9 + 42) // hebrew [170] - writes over european & cyrillic
#define COLS10a (COLS10h + 28) // arabic [198] - inclusive from 198(C6)-252(FC)
#define COLS11 253 // End of list - arabic goes to the end
#define COLS0_ARABIC COLS11 // Set if arabic accent marking
#define COLS0_HEBREW COLS11 // Set if hebrew accent marking
// Asian mark collation values: COLS_ASIAN_MARKS is 0x100 + COLS_ASIAN_MARK_VAL.
#define COLS_ASIAN_MARKS 0x140
#define COLS_ASIAN_MARK_VAL 0x40 // Without 0x100
// Bit flags.
// NOTE(review): SET_CASE_BIT and SET_KATAKANA_BIT have the same value (0x01);
// presumably they are applied in different contexts - confirm at the use sites.
#define SET_CASE_BIT 0x01
#define SET_KATAKANA_BIT 0x01
#define SET_WIDTH_BIT 0x02
// Judging by the name, a placeholder code for an unknown Unicode character
// (its uses are outside this part of the file).
#define UNK_UNICODE_CODE 0xFFFE
// Buffer limits; the names suggest sub-collation and case-bit buffers
// (their uses are outside this part of the file).
#define MAX_SUBCOL_BUF (500)
#define MAX_CASE_BYTES (150)
// Table lengths. The tables they describe are not defined in this part of the
// file; the prefixes suggest ASCII, multinational, symbol, Greek, Cyrillic,
// Hebrew and Arabic tables.
#define ASCTBLLEN 95
#define MNTBLLEN 219
#define SYMTBLLEN 9
#define GRKTBLLEN 219
#define CYRLTBLLEN 200
#define HEBTBL1LEN 27
#define HEBTBL2LEN 35
#define AR1TBLLEN 158
#define AR2TBLLEN 179
// First and last codes of the upper-case and lower-case A-Z letter ranges
// for four Asian code pages. The JP/KR/CS/CT suffixes presumably stand for
// Japanese, Korean, Chinese Simplified and Chinese Traditional - TODO confirm.
// NOTE(review): every range spans 26 codes except Upper_CS_A..Upper_CS_Z,
// which spans 27 (0x82FC..0x8316) - confirm whether that is intended.
#define Upper_JP_A 0x2520
#define Upper_JP_Z 0x2539
#define Upper_KR_A 0x5420
#define Upper_KR_Z 0x5439
#define Upper_CS_A 0x82FC
#define Upper_CS_Z 0x8316
#define Upper_CT_A 0xA625
#define Upper_CT_Z 0xA63E
#define Lower_JP_a 0x2540
#define Lower_JP_z 0x2559
#define Lower_KR_a 0x5440
#define Lower_KR_z 0x5459
#define Lower_CS_a 0x82DC
#define Lower_CS_z 0x82F5
#define Lower_CT_a 0xA60B
#define Lower_CT_z 0xA624
// # of characters in each character set.
// CHANGING ANY OF THESE DEFINES WILL CAUSE BUGS!
// The sets are listed in the same order as the entries of fwp_car60_c
// (ASCII, multinational 1 and 2, line draw, typographic, icons, math,
// math extension, Greek, Hebrew, Cyrillic, kana, user, Arabic, Arabic script).
#define ASC_N 95
#define ML1_N 242
#define ML2_N 145
#define BOX_N 88
#define TYP_N 103
#define ICN_N 255
#define MTH_N 238
#define MTX_N 229
#define GRK_N 219
#define HEB_N 123
#define CYR_N 250
#define KAN_N 63
#define USR_N 255
#define ARB_N 196
#define ARS_N 220
// C_N is the total number of characters over all of the sets above.
// With the values defined here the sum is 2721 (2466 + 255 user characters).
// NOTE(review): this comment used to read "TOTAL: 1447 WP + 255 User
// Characters", which does not match the current defines.
// The replacement list is parenthesized so that C_N acts as a single value
// inside larger expressions (for example 2 * C_N or C_N - 1); without the
// parentheses only the first or last term would bind to the neighbouring
// operator.
#define C_N (ASC_N + ML1_N + ML2_N + BOX_N +\
MTH_N + MTX_N + TYP_N + ICN_N +\
GRK_N + HEB_N + CYR_N + KAN_N +\
USR_N + ARB_N + ARS_N)
// State table constants for double character sorting
// STATE1..STATE11 are plain numbered states; the named states that follow
// continue the same numbering (12..19). Going by the names they cover
// "after C/H/L", "inside AE/OE/SG/IJ" and "with AA" situations - the state
// table that uses them is not in this part of the file.
#define STATE1 1
#define STATE2 2
#define STATE3 3
#define STATE4 4
#define STATE5 5
#define STATE6 6
#define STATE7 7
#define STATE8 8
#define STATE9 9
#define STATE10 10
#define STATE11 11
#define AFTERC 12
#define AFTERH 13
#define AFTERL 14
#define INSTAE 15
#define INSTOE 16
#define INSTSG 17
#define INSTIJ 18
#define WITHAA 19
// Per-language starting values, all relative to START_COL (resolved values
// in [brackets]).
// NOTE(review): these overlap numerically with AFTERC..WITHAA, so presumably
// they index a different dimension of the state table - confirm at the use
// sites. Finnish/Swedish share one value, as do Czech/Slovak.
#define START_COL 12
#define START_ALL (START_COL + 1) // all US and european [13]
#define START_DK (START_COL + 2) // Danish [14]
#define START_IS (START_COL + 3) // Icelandic [15]
#define START_NO (START_COL + 4) // Norwegian [16]
#define START_SU (START_COL + 5) // Finnish [17]
#define START_SV (START_COL + 5) // Swedish [17]
#define START_YK (START_COL + 6) // Ukrainian [18]
#define START_TK (START_COL + 7) // Turkish [19]
#define START_CZ (START_COL + 8) // Czech [20]
#define START_SL (START_COL + 8) // Slovak [20]
#define FIXUP_AREA_SIZE 24 // Number of characters to fix up
// Forward declarations of helpers declared with FSTATIC. Their definitions
// are not in this part of the file, so the notes below describe only what
// the signatures show.

// Takes a WP character, the character after it and the previous collation
// value; has three pointer parameters (collation value, sub-collation value,
// case bits) that are presumably outputs - confirm against the definition.
FSTATIC FLMUINT16 flmWPAsiaGetCollation(
FLMUINT16 ui16WpChar,
FLMUINT16 ui16NextWpChar,
FLMUINT16 ui16PrevColValue,
FLMUINT16 * pui16ColValue,
FLMUINT16 * pui16SubColVal,
FLMBYTE * pucCaseBits,
FLMBOOL bUppercaseFlag);
// Returns a 16-bit value for a WP value / collation value pair in a given
// language; by its name, the sub-collation value.
FSTATIC FLMUINT16 flmWPGetSubCol(
FLMUINT16 ui16WPValue,
FLMUINT16 ui16ColValue,
FLMUINT uiLanguage);
// Works on a WP string buffer (pointer, in/out length, maximum size) together
// with a read-only sub-collation buffer and a bit position pointer.
// Returns an RCODE.
FSTATIC RCODE flmWPCmbSubColBuf(
FLMBYTE * pucWPStr,
FLMUINT * puiWPStrLen,
FLMUINT uiMaxWPBytes,
const FLMBYTE * pucSubColBuf,
FLMBOOL bHebrewArabic,
FLMUINT * puiSubColBitPos);
// Works on a WP string buffer (pointer, in/out length, maximum size) together
// with read-only case bits and a "collation bytes processed" counter pointer.
// Returns an RCODE.
FSTATIC RCODE flmAsiaParseCase(
FLMBYTE * pucWPStr,
FLMUINT * puiWPStrLen,
FLMUINT uiMaxWPBytes,
const FLMBYTE * pucCaseBits,
FLMUINT * puiColBytesProcessed);
// Global data
// Conversion table pointers and character range limits. All start out as
// NULL / 0; whatever fills them in is not in this part of the file.
// By their names, the two tables map Unicode <-> WP60 characters and the
// min/max values bound the Unicode and WP character ranges they cover.
static FLMUINT16 * gv_pUnicodeToWP60 = NULL;
static FLMUINT16 * gv_pWP60ToUnicode = NULL;
static FLMUINT gv_uiMinUniChar = 0;
static FLMUINT gv_uiMaxUniChar = 0;
static FLMUINT gv_uiMinWPChar = 0;
static FLMUINT gv_uiMaxWPChar = 0;
// Unlike the variables above this one is not static, so it has external
// linkage and may be referenced from other translation units.
FLMUINT16 * gv_pui16USCollationTable = NULL;
// Typedefs

// One decomposition of a composed character into a base character code and a
// diacritic code. The tables below use {0xff, 0xff} for characters that have
// no decomposition.
typedef struct
{
FLMBYTE base;
FLMBYTE diacrit;
} BASE_DIACRIT_TABLE;
// Descriptor for one character set's decomposition table: table[i] describes
// character (start_char + i), for i in 0 .. char_count - 1.
typedef struct
{
FLMUINT16 char_count; // # of characters in table
FLMUINT16 start_char; // start char.
BASE_DIACRIT_TABLE * table;
} BASE_DIACRIT;
// Lookup entry pairing a byte key with a byte pointer.
typedef struct
{
FLMBYTE key; // character key to search on
FLMBYTE * charPtr; // character pointer for matched key
} TBL_B_TO_BP;
// Lookup entry pairing a byte key with a 16-bit value.
typedef struct
{
FLMBYTE ByteValue;
FLMUINT16 WordValue;
} BYTE_WORD_TBL;
// Collation tables
/****************************************************************************
Desc:		Base character location bitmap for the multinational 1 set.
			One bit per character (242 characters, 31 bytes):
				1 - the base character of the composed character is in the
					 same set as the composed character
				0 - the base character is in the ASCII set
Notes:	Bits are numbered from left to right within each byte, so the
			most significant bit of byte n belongs to character 8 * n.
			EX.	00000000b	;0-7
			bit#	01234567
			Each row below holds eight bytes (64 characters).
****************************************************************************/
static FLMBYTE fwp_ml1_cb60[] =
{
	// chars 0-63   (0x55 -> chars 49, 51, 53, 55)
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x00,
	// chars 64-127
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	// chars 128-191 (0x14 -> chars 131, 133; 0x44 -> chars 137, 141)
	0x14, 0x44, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	// chars 192-241 (0x04 -> char 237; last byte covers 240-241 only)
	0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00
};
/****************************************************************************
Desc: Decomposition table for composed characters of the multinational 1
character set: one { base, diacritic } pair per character.
The count (216) and start character (26) that the old layout kept
in the two words before the table now live in the fwp_ml1c
descriptor; entry i describes character (26 + i).
Format of each entry:
db code for base char.
db code for diacritic
Notes: Diacritical char is always in same set as composed char
base is in same set if other table (fwp_ml1_cb60) indicates, else in
ASCII. In this table the same-set bases are the F_DOTLESI and
F_DOTLESJ entries.
{0xff, 0xff} marks a character without a base/diacritic pair.
****************************************************************************/
static BASE_DIACRIT_TABLE fwp_ml1c_table[] =
{
{'A', F_ACUTE},
{'a', F_ACUTE},
{'A', F_CIRCUM},
{'a', F_CIRCUM},
{'A', F_UMLAUT},
{'a', F_UMLAUT},
{'A', F_GRAVE},
{'a', F_GRAVE},
{'A', F_RING},
{'a', F_RING},
{0xff, 0xff}, // no AE digraph
{0xff, 0xff}, // no ae digraph
{'C', F_CEDILLA},
{'c', F_CEDILLA},
{'E', F_ACUTE},
{'e', F_ACUTE},
{'E', F_CIRCUM},
{'e', F_CIRCUM},
{'E', F_UMLAUT},
{'e', F_UMLAUT},
{'E', F_GRAVE},
{'e', F_GRAVE},
{'I', F_ACUTE},
{F_DOTLESI, F_ACUTE},
{'I', F_CIRCUM},
{F_DOTLESI, F_CIRCUM},
{'I', F_UMLAUT},
{F_DOTLESI, F_UMLAUT},
{'I', F_GRAVE},
{F_DOTLESI, F_GRAVE},
{'N', F_TILDE},
{'n', F_TILDE},
{'O', F_ACUTE},
{'o', F_ACUTE},
{'O', F_CIRCUM},
{'o', F_CIRCUM},
{'O', F_UMLAUT},
{'o', F_UMLAUT},
{'O', F_GRAVE},
{'o', F_GRAVE},
{'U', F_ACUTE},
{'u', F_ACUTE},
{'U', F_CIRCUM},
{'u', F_CIRCUM},
{'U', F_UMLAUT},
{'u', F_UMLAUT},
{'U', F_GRAVE},
{'u', F_GRAVE},
{'Y', F_UMLAUT},
{'y', F_UMLAUT},
{'A', F_TILDE},
{'a', F_TILDE},
{'D', F_CROSSB},
{'d', F_CROSSB},
{'O', F_SLASH},
{'o', F_SLASH},
{'O', F_TILDE},
{'o', F_TILDE},
{'Y', F_ACUTE},
{'y', F_ACUTE},
{0xff, 0xff}, // no eth
{0xff, 0xff}, // no eth
{0xff, 0xff}, // no Thorn
{0xff, 0xff}, // no Thorn
{'A', F_BREVE},
{'a', F_BREVE},
{'A', F_MACRON},
{'a', F_MACRON},
{'A', F_OGONEK},
{'a', F_OGONEK},
{'C', F_ACUTE},
{'c', F_ACUTE},
{'C', F_CARON},
{'c', F_CARON},
{'C', F_CIRCUM},
{'c', F_CIRCUM},
{'C', F_DOTA},
{'c', F_DOTA},
{'D', F_CARON},
{'d', F_CARON},
{'E', F_CARON},
{'e', F_CARON},
{'E', F_DOTA},
{'e', F_DOTA},
{'E', F_MACRON},
{'e', F_MACRON},
{'E', F_OGONEK},
{'e', F_OGONEK},
{'G', F_ACUTE},
{'g', F_ACUTE},
{'G', F_BREVE},
{'g', F_BREVE},
{'G', F_CARON},
{'g', F_CARON},
{'G', F_CEDILLA},
{'g', F_APOSAB}, // lower case uses F_APOSAB, not F_CEDILLA
{'G', F_CIRCUM},
{'g', F_CIRCUM},
{'G', F_DOTA},
{'g', F_DOTA},
{'H', F_CIRCUM},
{'h', F_CIRCUM},
{'H', F_CROSSB},
{'h', F_CROSSB},
{'I', F_DOTA},
{F_DOTLESI, F_DOTA},
{'I', F_MACRON},
{F_DOTLESI, F_MACRON},
{'I', F_OGONEK},
{'i', F_OGONEK}, // ASCII 'i' here, not F_DOTLESI
{'I', F_TILDE},
{F_DOTLESI, F_TILDE},
{0xff, 0xff}, // no IJ digraph
{0xff, 0xff}, // no ij digraph
{'J', F_CIRCUM},
{F_DOTLESJ, F_CIRCUM},
{'K', F_CEDILLA},
{'k', F_CEDILLA},
{'L', F_ACUTE},
{'l', F_ACUTE},
{'L', F_CARON},
{'l', F_CARON},
{'L', F_CEDILLA},
{'l', F_CEDILLA},
{'L', F_CENTERD},
{'l', F_CENTERD},
{'L', F_STROKE},
{'l', F_STROKE},
{'N', F_ACUTE},
{'n', F_ACUTE},
{'N', F_APOSBA},
{'n', F_APOSBA},
{'N', F_CARON},
{'n', F_CARON},
{'N', F_CEDILLA},
{'n', F_CEDILLA},
{'O', F_DACUTE},
{'o', F_DACUTE},
{'O', F_MACRON},
{'o', F_MACRON},
{0xff, 0xff}, // OE digraph
{0xff, 0xff}, // oe digraph
{'R', F_ACUTE},
{'r', F_ACUTE},
{'R', F_CARON},
{'r', F_CARON},
{'R', F_CEDILLA},
{'r', F_CEDILLA},
{'S', F_ACUTE},
{'s', F_ACUTE},
{'S', F_CARON},
{'s', F_CARON},
{'S', F_CEDILLA},
{'s', F_CEDILLA},
{'S', F_CIRCUM},
{'s', F_CIRCUM},
{'T', F_CARON},
{'t', F_CARON},
{'T', F_CEDILLA},
{'t', F_CEDILLA},
{'T', F_CROSSB},
{'t', F_CROSSB},
{'U', F_BREVE},
{'u', F_BREVE},
{'U', F_DACUTE},
{'u', F_DACUTE},
{'U', F_MACRON},
{'u', F_MACRON},
{'U', F_OGONEK},
{'u', F_OGONEK},
{'U', F_RING},
{'u', F_RING},
{'U', F_TILDE},
{'u', F_TILDE},
{'W', F_CIRCUM},
{'w', F_CIRCUM},
{'Y', F_CIRCUM},
{'y', F_CIRCUM},
{'Z', F_ACUTE},
{'z', F_ACUTE},
{'Z', F_CARON},
{'z', F_CARON},
{'Z', F_DOTA},
{'z', F_DOTA},
{0xff, 0xff}, // no Eng
{0xff, 0xff}, // no eng
{'D', F_MACRON},
{'d', F_MACRON},
{'L', F_MACRON},
{'l', F_MACRON},
{'N', F_MACRON},
{'n', F_MACRON},
{'R', F_GRAVE},
{'r', F_GRAVE},
{'S', F_MACRON},
{'s', F_MACRON},
{'T', F_MACRON},
{'t', F_MACRON},
{'Y', F_BREVE},
{'y', F_BREVE},
{'Y', F_GRAVE},
{'y', F_GRAVE},
{'D', F_APOSBES},
{'d', F_APOSBES},
{'O', F_APOSBES},
{'o', F_APOSBES},
{'U', F_APOSBES},
{'u', F_APOSBES},
{'E', F_BREVE},
{'e', F_BREVE},
{'I', F_BREVE},
{F_DOTLESI, F_BREVE},
{0xff, 0xff}, // no dotless I
{0xff, 0xff}, // no dotless i
{'O', F_BREVE},
{'o', F_BREVE}
};
/****************************************************************************
Desc: Descriptor for the multinational 1 decomposition table.
fwp_ml1c_table[ i] describes character (26 + i); the count of 216
must match the number of initializers in fwp_ml1c_table.
****************************************************************************/
static BASE_DIACRIT fwp_ml1c =
{
216, // # of characters in table
26, // start char
fwp_ml1c_table,
};
/****************************************************************************
Desc: Decomposition table for composed characters of the Greek character
set: one { base, diacritic } pair per character.
The count (163) and start character (52) that the old layout kept
in the two words before the table now live in the fwp_grk_c
descriptor; entry i describes character (52 + i).
Format of each entry:
db code for base char.
db code for diacritic
Notes: Diacritical char is always in same set as composed char
base is in same set
Numeric bases are Greek character numbers (1 = alpha, 11 = epsilon,
15 = eta, 19 = iota, 31 = omicron, 43 = upsilon, 51 = omega, with
the upper case letter one below each). Combined diacritics are
themselves decomposed, so their "base" is another diacritic code.
{ 0xFF, 0xFF } marks a character without a base/diacritic pair.
****************************************************************************/
static BASE_DIACRIT_TABLE fwp_grk_c_table[] =
{
{ 0, F_GHPRIME }, // ALPHA High Prime
{ 1, F_GACUTE }, // alpha acute
{ 10, F_GHPRIME }, // EPSILON High Prime
{ 11, F_GACUTE }, // epsilon Acute
{ 14, F_GHPRIME }, // ETA High Prime
{ 15, F_GACUTE }, // eta Acute
{ 18, F_GHPRIME }, // IOTA High Prime
{ 19, F_GACUTE }, // iota Acute
{ 0xFF, 0xFF }, // IOTA Diaeresis
{ 19, F_GDIA }, // iota Diaeresis
{ 30, F_GHPRIME }, // OMICRON High Prime
{ 31, F_GACUTE }, // omicron Acute
{ 42, F_GHPRIME }, // UPSILON High Prime
{ 43, F_GACUTE }, // upsilon Acute
{ 0xFF, 0xFF }, // UPSILON Diaeresis
{ 43, F_GDIA }, // upsilon Diaeresis
{ 50, F_GHPRIME }, // OMEGA High Prime
{ 51, F_GACUTE }, // omega Acute
{ 0xFF, 0xFF }, // epsilon (Variant)
{ 0xFF, 0xFF }, // theta (Variant)
{ 0xFF, 0xFF }, // kappa (Variant)
{ 0xFF, 0xFF }, // pi (Variant)
{ 0xFF, 0xFF }, // rho (Variant)
{ 0xFF, 0xFF }, // sigma (Variant)
{ 0xFF, 0xFF }, // UPSILON (Variant)
{ 0xFF, 0xFF }, // phi (Variant)
{ 0xFF, 0xFF }, // omega (Variant)
{ 0xFF, 0xFF }, // Greek Question Mark
{ 0xFF, 0xFF }, // Greek Semicolon
{ 0xFF, 0xFF }, // High Prime
{ 0xFF, 0xFF }, // Low Prime
{ 0xFF, 0xFF }, // Acute (Greek)
{ 0xFF, 0xFF }, // Diaeresis (Greek)
{ F_GACUTE, F_GDIA }, // Acute Diaeresis
{ F_GGRAVE, F_GDIA }, // Grave Diaeresis
{ 0xFF, 0xFF }, // Grave (Greek)
{ 0xFF, 0xFF }, // Circumflex (Greek)
{ 0xFF, 0xFF }, // Smooth Breathing
{ 0xFF, 0xFF }, // Rough Breathing
{ 0xFF, 0xFF }, // Iota Subscript
{ F_GSMOOTH, F_GACUTE }, // Smooth Breathing Acute
{ F_GROUGH, F_GACUTE }, // Rough Breathing Acute
{ F_GSMOOTH, F_GGRAVE }, // Smooth Breathing Grave
{ F_GROUGH, F_GGRAVE }, // Rough Breathing Grave
{ F_GSMOOTH, F_GCIRCM }, // Smooth Breathing Circumflex
{ F_GROUGH, F_GCIRCM }, // Rough Breathing Circumflex
{ F_GACUTE, F_GIOTA }, // Acute w/Iota Subscript
{ F_GGRAVE, F_GIOTA }, // Grave w/Iota Subscript
{ F_GCIRCM, F_GIOTA }, // Circumflex w/Iota Subscript
{ F_GSMOOTH, F_GIOTA }, // Smooth Breathing w/Iota Subscript
{ F_GROUGH, F_GIOTA }, // Rough Breathing w/Iota Subscript
{ F_GSMACT, F_GIOTA }, // Smooth Breathing Acute w/Iota Subscript
{ F_GRGACT, F_GIOTA }, // Rough Breathing Acute w/Iota Subscript
{ F_GSMGRV, F_GIOTA }, // Smooth Breathing Grave w/Iota Subscript
{ F_GRGGRV, F_GIOTA }, // Rough Breathing Grave w/Iota Subscript
{ F_GSMCIR, F_GIOTA }, // Smooth Breathing Circumflex w/Iota Sub
{ F_GRGCIR, F_GIOTA }, // Rough Breathing Circumflex w/Iota Sub
{ 1, F_GGRAVE }, // alpha Grave
{ 1, F_GCIRCM }, // alpha Circumflex
{ 1, F_GIOTA }, // alpha w/Iota
{ 1, F_GACTIO }, // alpha Acute w/Iota
{ 1, F_GGRVIO }, // alpha Grave w/Iota
{ 1, F_GCIRIO }, // alpha Circumflex w/Iota
{ 1, F_GSMOOTH }, // alpha Smooth
{ 1, F_GSMACT }, // alpha Smooth Acute
{ 1, F_GSMGRV }, // alpha Smooth Grave
{ 1, F_GSMCIR }, // alpha Smooth Circumflex
{ 1, F_GSMIO }, // alpha Smooth w/Iota
{ 1, F_GSMAIO }, // alpha Smooth Acute w/Iota
{ 1, F_GSMGVIO }, // alpha Smooth Grave w/Iota
{ 1, F_GSMCIO }, // alpha Smooth Circumflex w/Iota
{ 1, F_GROUGH }, // alpha Rough
{ 1, F_GRGACT }, // alpha Rough Acute
{ 1, F_GRGGRV }, // alpha Rough Grave
{ 1, F_GRGCIR }, // alpha Rough Circumflex
{ 1, F_GRGIO }, // alpha Rough w/Iota
{ 1, F_GRGAIO }, // alpha Rough Acute w/Iota
{ 1, F_GRGGVIO }, // alpha Rough Grave w/Iota
{ 1, F_GRGCIO }, // alpha Rough Circumflex w/Iota
{ 11, F_GGRAVE }, // epsilon Grave
{ 11, F_GSMOOTH }, // epsilon Smooth
{ 11, F_GSMACT }, // epsilon Smooth Acute
{ 11, F_GSMGRV }, // epsilon Smooth Grave
{ 11, F_GROUGH }, // epsilon Rough
{ 11, F_GRGACT }, // epsilon Rough Acute
{ 11, F_GRGGRV }, // epsilon Rough Grave
{ 15, F_GGRAVE }, // eta Grave
{ 15, F_GCIRCM }, // eta Circumflex
{ 15, F_GIOTA }, // eta w/Iota
{ 15, F_GACTIO }, // eta Acute w/Iota
{ 15, F_GGRVIO }, // eta Grave w/Iota
{ 15, F_GCIRIO }, // eta Circumflex w/Iota
{ 15, F_GSMOOTH }, // eta Smooth
{ 15, F_GSMACT }, // eta Smooth Acute
{ 15, F_GSMGRV }, // eta Smooth Grave
{ 15, F_GSMCIR }, // eta Smooth Circumflex
{ 15, F_GSMIO }, // eta Smooth w/Iota
{ 15, F_GSMAIO }, // eta Smooth Acute w/Iota
{ 15, F_GSMGVIO }, // eta Smooth Grave w/Iota
{ 15, F_GSMCIO }, // eta Smooth Circumflex w/Iota
{ 15, F_GROUGH }, // eta Rough
{ 15, F_GRGACT }, // eta Rough Acute
{ 15, F_GRGGRV }, // eta Rough Grave
{ 15, F_GRGCIR }, // eta Rough Circumflex
{ 15, F_GRGIO }, // eta Rough w/Iota
{ 15, F_GRGAIO }, // eta Rough Acute w/Iota
{ 15, F_GRGGVIO }, // eta Rough Grave w/Iota
{ 15, F_GRGCIO }, // eta Rough Circumflex w/Iota
{ 19, F_GGRAVE }, // iota Grave
{ 19, F_GCIRCM }, // iota Circumflex
{ 19, F_GACTDIA }, // iota Acute Diaeresis
{ 19, F_GGRVDIA }, // iota Grave Diaeresis
{ 19, F_GSMOOTH }, // iota Smooth
{ 19, F_GSMACT }, // iota Smooth Acute
{ 19, F_GSMGRV }, // iota Smooth Grave
{ 19, F_GSMCIR }, // iota Smooth Circumflex
{ 19, F_GROUGH }, // iota Rough
{ 19, F_GRGACT }, // iota Rough Acute
{ 19, F_GRGGRV }, // iota Rough Grave
{ 19, F_GRGCIR }, // iota Rough Circumflex
{ 31, F_GGRAVE }, // omicron Grave
{ 31, F_GSMOOTH }, // omicron Smooth
{ 31, F_GSMACT }, // omicron Smooth Acute
{ 31, F_GSMGRV }, // omicron Smooth Grave
{ 31, F_GROUGH }, // omicron Rough
{ 31, F_GRGACT }, // omicron Rough Acute
{ 31, F_GRGGRV }, // omicron Rough Grave
{ 0xFF, 0xFF }, // rho rough
{ 0xFF, 0xFF }, // rho smooth
{ 43, F_GGRAVE }, // upsilon Grave
{ 43, F_GCIRCM }, // upsilon Circumflex
{ 43, F_GACTDIA }, // upsilon Acute Diaeresis
{ 43, F_GGRVDIA }, // upsilon Grave Diaeresis
{ 43, F_GSMOOTH }, // upsilon Smooth
{ 43, F_GSMACT }, // upsilon Smooth Acute
{ 43, F_GSMGRV }, // upsilon Smooth Grave
{ 43, F_GSMCIR }, // upsilon Smooth Circumflex
{ 43, F_GROUGH }, // upsilon Rough
{ 43, F_GRGACT }, // upsilon Rough Acute
{ 43, F_GRGGRV }, // upsilon Rough Grave
{ 43, F_GRGCIR }, // upsilon Rough Circumflex
{ 51, F_GGRAVE }, // omega Grave
{ 51, F_GCIRCM }, // omega Circumflex
{ 51, F_GIOTA }, // omega w/Iota
{ 51, F_GACTIO }, // omega Acute w/Iota
{ 51, F_GGRVIO }, // omega Grave w/Iota
{ 51, F_GCIRIO }, // omega Circumflex w/Iota
{ 51, F_GSMOOTH }, // omega Smooth
{ 51, F_GSMACT }, // omega Smooth Acute
{ 51, F_GSMGRV }, // omega Smooth Grave
{ 51, F_GSMCIR }, // omega Smooth Circumflex
{ 51, F_GSMIO }, // omega Smooth w/Iota
{ 51, F_GSMAIO }, // omega Smooth Acute w/Iota
{ 51, F_GSMGVIO }, // omega Smooth Grave w/Iota
{ 51, F_GSMCIO }, // omega Smooth Circumflex w/Iota
{ 51, F_GROUGH }, // omega Rough
{ 51, F_GRGACT }, // omega Rough Acute
{ 51, F_GRGGRV }, // omega Rough Grave
{ 51, F_GRGCIR }, // omega Rough Circumflex
{ 51, F_GRGIO }, // omega Rough w/Iota
{ 51, F_GRGAIO }, // omega Rough Acute w/Iota
{ 51, F_GRGGVIO }, // omega Rough Grave w/Iota
{ 51, F_GRGCIO} // omega Rough Circumflex w/Iota
};
/****************************************************************************
Desc: Descriptor for the Greek decomposition table.
fwp_grk_c_table[ i] describes character (52 + i); the count of 163
must match the number of initializers in fwp_grk_c_table.
****************************************************************************/
static BASE_DIACRIT fwp_grk_c =
{
163, // # of characters in table.
52, // start char.
fwp_grk_c_table
};
/****************************************************************************
Desc: Decomposition table for composed characters of the Cyrillic
character set: one { base, diacritic } pair per character.
The count (120) and start character (156) that the old layout kept
in the two words before the table now live in the fwp_rus_c
descriptor; entry i describes character (156 + i).
Format of each entry:
db code for base char.
db code for diacritic
Notes: Diacritical char is always in same set as composed char
base is in same set
Diacritic codes 204, 205 and 206 are written as plain numbers;
going by the entry comments they are the right descender, ogonek
and macron marks.
Some bases are written with a leading zero (00, 01); these are
octal literals with the values 0 and 1.
{ 0xFF, 0xFF } marks a character without a base/diacritic pair.
****************************************************************************/
static BASE_DIACRIT_TABLE fwp_rus_c_table[] =
{
{ 14, 204 }, // ZHE with right descender
{ 15, 204 }, // zhe with right descender
{ 0xFF, 0xFF }, // DZE
{ 0xFF, 0xFF }, // dze
{ 0xFF, 0xFF }, // Z
{ 0xFF, 0xFF }, // z
{ 18, 206 }, // II with macron
{ 19, 206 }, // ii with macron
{ 0xFF, 0xFF }, // I
{ 0xFF, 0xFF }, // i
{ 0xFF, 0xFF }, // YI
{ 0xFF, 0xFF }, // yi
{ 0xFF, 0xFF }, // I ligature
{ 0xFF, 0xFF }, // i ligature
{ 0xFF, 0xFF }, // JE
{ 0xFF, 0xFF }, // je
{ 0xFF, 0xFF }, // KJE
{ 0xFF, 0xFF }, // kje
{ 22, 204 }, // KA with right descender
{ 23, 204 }, // ka with right descender
{ 22, 205 }, // KA ogonek
{ 23, 205 }, // ka ogonek
{ 0xFF, 0xFF }, // KA vertical bar
{ 0xFF, 0xFF }, // ka vertical bar
{ 0xFF, 0xFF }, // LJE
{ 0xFF, 0xFF }, // lje
{ 28, 204 }, // EN with right descender
{ 29, 204 }, // en with right descender
{ 0xFF, 0xFF }, // NJE
{ 0xFF, 0xFF }, // nje
{ 0xFF, 0xFF }, // ROUND OMEGA
{ 0xFF, 0xFF }, // round omega
{ 0xFF, 0xFF }, // OMEGA
{ 0xFF, 0xFF }, // omega
{ 0xFF, 0xFF }, // TSHE
{ 0xFF, 0xFF }, // tshe
{ 0xFF, 0xFF }, // SHORT U
{ 0xFF, 0xFF }, // short u
{ 40, 206 }, // U with macron
{ 41, 206 }, // u with macron
{ 0xFF, 0xFF }, // STRAIGHT U
{ 0xFF, 0xFF }, // straight u
{ 0xFF, 0xFF }, // STRAIGHT U BAR
{ 0xFF, 0xFF }, // straight u bar
{ 0xFF, 0xFF }, // OU ligature
{ 0xFF, 0xFF }, // ou ligature
{ 44, 204 }, // KHA with right descender
{ 45, 204 }, // kha with right descender
{ 44, 205 }, // KHA ogonek
{ 45, 205 }, // kha ogonek
{ 0xFF, 0xFF }, // H
{ 0xFF, 0xFF }, // h
{ 0xFF, 0xFF }, // OMEGA titlo
{ 0xFF, 0xFF }, // omega titlo
{ 0xFF, 0xFF }, // DZHE
{ 0xFF, 0xFF }, // dzhe
{ 48, 204 }, // CHE with right descender
{ 49, 204 }, // che with right descender
{ 0xFF, 0xFF }, // CHE vertical bar
{ 0xFF, 0xFF }, // che vertical bar
{ 0xFF, 0xFF }, // SHCHA (variant)
{ 0xFF, 0xFF }, // shcha (variant)
{ 0xFF, 0xFF }, // YAT
{ 0xFF, 0xFF }, // yat
{ 0xFF, 0xFF }, // YUS BOLSHOI
{ 0xFF, 0xFF }, // yus bolshoi
{ 0xFF, 0xFF }, // BIG MALYI
{ 0xFF, 0xFF }, // big malyi
{ 0xFF, 0xFF }, // KSI
{ 0xFF, 0xFF }, // ksi
{ 0xFF, 0xFF }, // PSI
{ 0xFF, 0xFF }, // psi
{ 0xFF, 0xFF }, // FITA
{ 0xFF, 0xFF }, // fita
{ 0xFF, 0xFF }, // IZHITSA
{ 0xFF, 0xFF }, // izhitsa
{ 00, F_RACUTE }, // Russian A acute
{ 01, F_RACUTE }, // Russian a acute
{ 10, F_RACUTE }, // Russian IE acute
{ 11, F_RACUTE }, // Russian ie acute
{ 78, F_RACUTE }, // Russian E acute
{ 79, F_RACUTE }, // Russian e acute
{ 18, F_RACUTE }, // Russian II acute
{ 19, F_RACUTE }, // Russian ii acute
{ 88, F_RACUTE }, // Russian I acute
{ 89, F_RACUTE }, // Russian i acute
{ 90, F_RACUTE }, // Russian YI acute
{ 91, F_RACUTE }, // Russian yi acute
{ 30, F_RACUTE }, // Russian O acute
{ 31, F_RACUTE }, // Russian o acute
{ 40, F_RACUTE }, // Russian U acute
{ 41, F_RACUTE }, // Russian u acute
{ 56, F_RACUTE }, // Russian YERI acute
{ 57, F_RACUTE }, // Russian yeri acute
{ 60, F_RACUTE }, // Russian REVERSED E acute
{ 61, F_RACUTE }, // Russian reversed e acute
{ 62, F_RACUTE }, // Russian IU acute
{ 63, F_RACUTE }, // Russian iu acute
{ 64, F_RACUTE }, // Russian IA acute
{ 65, F_RACUTE }, // Russian ia acute
{ 00, F_RGRAVE }, // Russian A grave
{ 01, F_RGRAVE }, // Russian a grave
{ 10, F_RGRAVE }, // Russian IE grave
{ 11, F_RGRAVE }, // Russian ie grave
{ 12, F_RGRAVE }, // Russian YO grave
{ 13, F_RGRAVE }, // Russian yo grave
{ 18, F_RGRAVE }, // Russian II grave (comment used to say I; base 18 is II above)
{ 19, F_RGRAVE }, // Russian ii grave (comment used to say i; base 19 is ii above)
{ 30, F_RGRAVE }, // Russian O grave
{ 31, F_RGRAVE }, // Russian o grave
{ 40, F_RGRAVE }, // Russian U grave
{ 41, F_RGRAVE }, // Russian u grave
{ 56, F_RGRAVE }, // Russian YERI grave
{ 57, F_RGRAVE }, // Russian yeri grave
{ 60, F_RGRAVE }, // Russian REVERSED E grave
{ 61, F_RGRAVE }, // Russian reversed e grave
{ 62, F_RGRAVE }, // Russian IU grave
{ 63, F_RGRAVE }, // Russian iu grave
{ 64, F_RGRAVE }, // Russian IA grave
{ 65, F_RGRAVE } // Russian ia grave
};
/****************************************************************************
Desc: Descriptor for the Cyrillic decomposition table.
fwp_rus_c_table[ i] describes character (156 + i); the count of 120
must match the number of initializers in fwp_rus_c_table.
****************************************************************************/
static BASE_DIACRIT fwp_rus_c =
{
120, // # of characters in table.
156, // start char.
fwp_rus_c_table,
};
/****************************************************************************
Desc:		Per character set lookup of the character component (base and
			diacritic) descriptors. The array is indexed by character set
			number; a NULL slot means the set has no composed characters.
****************************************************************************/
static BASE_DIACRIT * fwp_car60_c[ F_NCHSETS] =
{
	NULL,				//  0 - ascii: no composed characters
	&fwp_ml1c,		//  1 - multinational 1
	NULL,				//  2 - multinational 2: no composed characters
	NULL,				//  3 - line draw: no composed characters
	NULL,				//  4 - typographic: no composed characters
	NULL,				//  5 - icons: no composed characters
	NULL,				//  6 - math: no composed characters
	NULL,				//  7 - math extension: no composed characters
	&fwp_grk_c,		//  8 - Greek
	NULL,				//  9 - Hebrew
	&fwp_rus_c,		// 10 - Cyrillic - Russian
	NULL,				// 11 - Hiragana or Katakana (Japanese)
	NULL,				// 12 - user: no composed characters
	NULL,				// 13 - Arabic: no composed characters
	NULL				// 14 - Arabic Script: no composed characters
};
/****************************************************************************
Desc: Map special chars in CharSet (x24) to collation values
ByteValue is the character number within the set, WordValue the
collation value. Entries are in ascending ByteValue order.
Both members of a bracket pair get adjacent values (COLS2+2 and
COLS2+3); both members of a quote pair get the same value (COLS1).
****************************************************************************/
static BYTE_WORD_TBL fwp_Ch24ColTbl[] =
{
{1, COLLS+2}, // comma
{2, COLLS+1}, // maru
{5, COLS_ASIAN_MARKS+2}, // chuuten
{10, COLS_ASIAN_MARKS}, // dakuten
{11, COLS_ASIAN_MARKS+1}, // handakuten
{43, COLS2+2}, // angled brackets
{44, COLS2+3}, // (second of the pair)
{49, COLS2+2}, // pointy brackets
{50, COLS2+3},
{51, COLS2+2}, // double pointy brackets
{52, COLS2+3},
{53, COLS1}, // Japanese quotes
{54, COLS1},
{55, COLS1}, // hollow Japanese quotes
{56, COLS1},
{57, COLS2+2}, // filled rounded brackets
{58, COLS2+3}
};
/****************************************************************************
Desc: Kana subcollation values, one byte per kana character (86 entries,
in the same order as KanaColTbl and MapCS26ToCharSet11).
BIT 0: set if large char
BIT 1: set if voiced
BIT 2: set if half voiced
So 1 = large, 3 = large + voiced, 5 = large + half voiced.
Notes:
To save space should be nibbles
IMPORTANT:
The '1' entries that do not have
a matching '0' entry have been
changed to zero to save space in
the subcollation area.
(That is why most plain large kana are 0 here, while KA and KE,
which have small ka / ke counterparts at the end, stay 1.)
NOTE(review): the old comment said "The original table is listed
below", but no such table follows this one.
****************************************************************************/
static FLMBYTE KanaSubColTbl[] =
{
0,1,0,1,0,1,0,1,0,1, // a A i I u U e E o O
1,3,0,3,0,3,1,3,0,3, // KA GA KI GI KU GU KE GE KO GO
0,3,0,3,0,3,0,3,0,3, // SA ZA SHI JI SU ZU SE ZE SO ZO
0,3,0,3,0,1,3,0,3,0,3, // TA DA CHI JI tsu TSU ZU TE DE TO DO
0,0,0,0,0, // NA NI NU NE NO
0,3,5,0,3,5,0,3,5, // HA BA PA HI BI PI FU BU PU
0,3,5,0,3,5, // HE BE PE HO BO PO
0,0,0,0,0, // MA MI MU ME MO
0,1,0,1,0,1, // ya YA yu YU yo YO
0,0,0,0,0, // RA RI RU RE RO
0,1,0,0,0, // wa WA WI WE WO
0,3,0,0 // N VU ka ke
};
/****************************************************************************
Desc: Map katakana (CharSet x26) to collation values
kana collating values are two byte values
where the high byte is 0x01. Only the low byte is stored here.
Small/large and unvoiced/voiced/half-voiced forms of a kana share
one value (86 entries, values 0..47). The last three entries reuse
earlier values: VU -> 2 (u), small ka -> 5 (KA), small ke -> 8 (KE).
****************************************************************************/
static FLMBYTE KanaColTbl[] =
{
0, 0, 1, 1, 2, 2, 3, 3, 4, 4, // a A i I u U e E o O
5, 5, 6, 6, 7, 7, 8, 8, 9, 9, // KA GA KI GI KU GU KE GE KO GO
10,10,11,11,12,12,13,13,14,14, // SA ZA SHI JI SU ZU SE ZE SO ZO
15,15,16,16,17,17,17,18,18,19,19, // TA DA CHI JI tsu TSU ZU TE DE TO DO
20,21,22,23,24, // NA NI NU NE NO
25,25,25,26,26,26,27,27,27, // HA BA PA HI BI PI FU BU PU
28,28,28,29,29,29, // HE BE PE HO BO PO
30,31,32,33,34, // MA MI MU ME MO
35,35,36,36,37,37, // ya YA yu YU yo YO
38,39,40,41,42, // RA RI RU RE RO
43,43,44,45,46, // wa WA WI WE WO
47, 2, 5, 8 // N VU ka ke
};
/****************************************************************************
Desc: Map KataKana collated value to vowel value for
use for the previous char.
Indexed by the values stored in KanaColTbl; the result is
0..4 for a, i, u, e, o.
NOTE(review): this table has 47 entries (indices 0..46), but
KanaColTbl stores 47 for N, which has no entry here - confirm that
callers never index this table with the value for N.
****************************************************************************/
static FLMBYTE KanaColToVowel[] =
{
0,1,2,3,4, // a i u e o
0,1,2,3,4, // ka ki ku ke ko
0,1,2,3,4, // sa shi su se so
0,1,2,3,4, // ta chi tsu te to
0,1,2,3,4, // na ni nu ne no
0,1,2,3,4, // ha hi hu he ho
0,1,2,3,4, // ma mi mu me mo
0,2,4, // ya yu yo
0,1,2,3,4, // ra ri ru re ro
0,1,3,4, // wa wi we wo
};
/****************************************************************************
Desc: Convert Zenkaku (double wide) to Hankaku (single wide)
Character set 0x24 maps to single wide chars in other char sets.
This enables collation values to be found on some symbols.
This is also used to convert symbols from hankaku to Zen24.
ByteValue is the character number within set 0x24. Entries are in
ascending ByteValue order; characters that are skipped have no
single wide equivalent in this table.
WordValue appears to be (character set << 8) | character, with
plain values below 0x100 being ASCII codes - TODO confirm.
****************************************************************************/
static BYTE_WORD_TBL Zen24ToHankaku[] =
{
{ 0 ,0x0020 }, // space
{ 1 ,0x0b03 }, // japanese comma
{ 2 ,0x0b00 }, // circle period
{ 3 , 44 }, // comma
{ 4 , 46 }, // period
{ 5 ,0x0b04 }, // center dot
{ 6 , 58 }, // colon
{ 7 , 59 }, // semicolon
{ 8 , 63 }, // question mark
{ 9 , 33 }, // exclamation mark
{ 10 ,0x0b3d }, // dakuten
{ 11 ,0x0b3e }, // handakuten
{ 12 ,0x0106 }, // accent mark
{ 13 , 96 }, // accent mark
{ 14 ,0x0107 }, // umlaut
{ 15 , 94 }, // caret
{ 16 ,0x0108 }, // macron
{ 17 , 95 }, // underscore
{ 27 ,0x0b0f }, // extend vowel
{ 28 ,0x0422 }, // mdash
{ 29 , 45 }, // hyphen
{ 30 , 47 }, // slash
{ 31 ,0x0607 }, // backslash
{ 32 , 126 }, // tilde
{ 33 ,0x0611 }, // doubleline
{ 34 ,0x0609 }, // line
{ 37 ,0x041d }, // left apostrophe
{ 38 ,0x041c }, // right apostrophe
{ 39 ,0x0420 }, // left quote
{ 40 ,0x041f }, // right quote
{ 41 , 40 }, // left paren
{ 42 , 41 }, // right paren
{ 45 , 91 }, // left bracket
{ 46 , 93 }, // right bracket
{ 47 , 123 }, // left curly bracket
{ 48 , 125 }, // right curly bracket
{ 53 ,0x0b01 }, // left j quote
{ 54 ,0x0b02 }, // right j quote
{ 59 , 43 }, // plus
{ 60 ,0x0600 }, // minus
{ 61 ,0x0601 }, // plus/minus
{ 62 ,0x0627 }, // times
{ 63 ,0x0608 }, // divide
{ 64 , 61 }, // equal
{ 65 ,0x0663 }, // unequal
{ 66 , 60 }, // less
{ 67 , 62 }, // greater
{ 68 ,0x0602 }, // less/equal
{ 69 ,0x0603 }, // greater/equal
{ 70 ,0x0613 }, // infinity
{ 71 ,0x0666 }, // triangle dots
{ 72 ,0x0504 }, // man
{ 73 ,0x0505 }, // woman
{ 75 ,0x062d }, // prime
{ 76 ,0x062e }, // double prime
{ 78 ,0x040c }, // yen
{ 79 , 36 }, // $
{ 80 ,0x0413 }, // cent
{ 81 ,0x040b }, // pound
{ 82 , 37 }, // %
{ 83 , 35 }, // #
{ 84 , 38 }, // &
{ 85 , 42 }, // *
{ 86 , 64 }, // @
{ 87 ,0x0406 }, // squiggle
{ 89 ,0x06b8 }, // filled star
{ 90 ,0x0425 }, // hollow circle
{ 91 ,0x042c }, // filled circle
{ 93 ,0x065f }, // hollow diamond
{ 94 ,0x0660 }, // filled diamond
{ 95 ,0x0426 }, // hollow box
{ 96 ,0x042e }, // filled box
{ 97 ,0x0688 }, // hollow triangle
{ 99 ,0x0689 }, // hollow upside down triangle
{ 103,0x0615 }, // right arrow
{ 104,0x0616 }, // left arrow
{ 105,0x0617 }, // up arrow
{ 106,0x0622 }, // down arrow
{ 119,0x060f },
{ 121,0x0645 },
{ 122,0x0646 },
{ 123,0x0643 },
{ 124,0x0644 },
{ 125,0x0642 }, // union
{ 126,0x0610 }, // intersection
{ 135,0x0655 },
{ 136,0x0656 },
{ 138,0x0638 }, // right arrow
{ 139,0x063c }, // left/right arrow
{ 140,0x067a },
{ 141,0x0679 },
{ 153,0x064f }, // angle
{ 154,0x0659 },
{ 155,0x065a },
{ 156,0x062c },
{ 157,0x062b },
{ 158,0x060e },
{ 159,0x06b0 },
{ 160,0x064d },
{ 161,0x064e },
{ 162,0x050e }, // square root
{ 164,0x0604 },
{ 175,0x0623 }, // angstrom
{ 176,0x044b }, // percent
{ 177,0x051b }, // sharp
{ 178,0x051c }, // flat
{ 179,0x0509 }, // musical note
{ 180,0x0427 }, // dagger
{ 181,0x0428 }, // double dagger
{ 182,0x0405 }, // paragraph
{ 187,0x068f } // big hollow circle
};
/****************************************************************************
Desc: Maps CS26 to CharSet 11
Used to uncollate characters for FLAIM - placed here for consistency
Indexed by the CS26 character number (0..0x55). The value holds the
CharSet 11 character, with the top bits used as flags:
0x80 - add dakuten
0xC0 - add handakuten
0xFF - no mapping exists
For example GA is 0x95 = 0x15 (KA) | 0x80 and PA is
0xe9 = 0x29 (HA) | 0xC0.
****************************************************************************/
static FLMBYTE MapCS26ToCharSet11[ 86] =
{
0x06, // 0 a
0x10, // 1 A
0x07, // 2 i
0x11, // 3 I
0x08, // 4 u
0x12, // 5 U
0x09, // 6 e
0x13, // 7 E
0x0a, // 8 o
0x14, // 9 O
0x15, // 0x0a KA
0x95, // GA - 21 followed by 0x3D dakuten
0x16, // 0x0c KI
0x96, // GI
0x17, // 0x0e KU
0x97, // GU
0x18, // 0x10 KE
0x98, // GE
0x19, // 0x12 KO
0x99, // GO
0x1a, // 0x14 SA
0x9a, // ZA
0x1b, // 0x16 SHI
0x9b, // JI
0x1c, // 0x18 SU
0x9c, // ZU
0x1d, // 0x1a SE
0x9d, // ZE
0x1e, // 0x1c SO
0x9e, // ZO
0x1f, // 0x1e TA
0x9f, // DA
0x20, // 0x20 CHI
0xa0, // JI
0x0e, // 0x22 small tsu
0x21, // 0x23 TSU
0xa1, // ZU
0x22, // 0x25 TE
0xa2, // DE
0x23, // 0x27 TO
0xa3, // DO
0x24, // 0x29 NA
0x25, // 0x2a NI
0x26, // 0x2b NU
0x27, // 0x2c NE
0x28, // 0x2d NO
0x29, // 0x2e HA
0xa9, // 0x2f BA
0xe9, // 0x30 PA
0x2a, // 0x31 HI
0xaa, // 0x32 BI
0xea, // 0x33 PI
0x2b, // 0x34 FU
0xab, // 0x35 BU
0xeb, // 0x36 PU
0x2c, // 0x37 HE
0xac, // 0x38 BE
0xec, // 0x39 PE
0x2d, // 0x3a HO
0xad, // 0x3b BO
0xed, // 0x3c PO
0x2e, // 0x3d MA
0x2f, // 0x3e MI
0x30, // 0x3f MU
0x31, // 0x40 ME
0x32, // 0x41 MO
0x0b, // 0x42 small ya
0x33, // 0x43 YA
0x0c, // 0x44 small yu
0x34, // 0x45 YU
0x0d, // 0x46 small yo
0x35, // 0x47 YO
0x36, // 0x48 RA
0x37, // 0x49 RI
0x38, // 0x4a RU
0x39, // 0x4b RE
0x3a, // 0x4c RO
0xff, // 0x4d small wa
0x3b, // 0x4e WA
0xff, // 0x4f WI
0xff, // 0x50 WE
0x05, // 0x51 WO
0x3c, // 0x52 N
0xff, // 0x53 VU
0xff, // 0x54 ka
0xff // 0x55 ke
};
/****************************************************************************
Desc:		Single wide (Hankaku) to double wide (Zenkaku) conversion for the
			sixteen characters space through '/'. Each value is the character
			number in CS24 (punctuation) of the double wide form.
			Used in f_wpHanToZenkaku()
Notes:	The earlier header also read "Maps from charset 11 to CS24
			(punctuation) (starting from 11,0)".
			Was 187 for ! and 186 for '
****************************************************************************/
static FLMBYTE From0AToZen[] =
{
	0x00,		// sp
	0x09,		// !
	0x28,		// "
	0x53,		// #
	0x4f,		// $
	0x52,		// %
	0x54,		// &
	0x26,		// '
	0x29,		// (
	0x2a,		// )
	0x55,		// *
	0x3b,		// +
	0x03,		// ,
	0x1d,		// -
	0x04,		// .
	0x1e		// /
};
/****************************************************************************
Desc:	Hankaku to Zenkaku conversion values for the seven characters
		annotated ':' through '@'.
		NOTE(review): presumably a continuation of From0AToZen (same target
		set) - confirm in f_wpHanToZenkaku().
****************************************************************************/
static FLMBYTE From0BToZen[] =
{
	0x06,		// :
	0x07,		// ;
	0x42,		// <
	0x40,		// =
	0x43,		// >
	0x08,		// ?
	0x56		// @
};
/****************************************************************************
Desc:	Hankaku to Zenkaku conversion values for the six characters
		annotated '[' through '`'.
		NOTE(review): presumably a continuation of From0AToZen (same target
		set) - confirm in f_wpHanToZenkaku().
****************************************************************************/
static FLMBYTE From0CToZen[] =
{
	0x2d,		// [
	0x1f,		// BACKSLASH
	0x2e,		// ]
	0x0f,		// ^
	0x11,		// _
	0x0d		// `
};
/****************************************************************************
Desc:	Hankaku to Zenkaku conversion values for the four characters
		annotated '{' through '~'.
		NOTE(review): presumably a continuation of From0AToZen (same target
		set) - confirm in f_wpHanToZenkaku().
****************************************************************************/
static FLMBYTE From0DToZen[] =
{
	0x2f,		// {
	0x22,		// |
	0x30,		// }
	0x20		// ~
};
/****************************************************************************
Desc:	52-entry conversion table, laid out here as 26 pairs.  In most pairs
		the second value is the first plus 0x20; two pairs carry 0xFF in the
		second slot.
		NOTE(review): the pair layout parallels the first 52 entries of
		fwp_grk60Tbl (uppercase/lowercase Alpha .. Omega, including the
		"Beta Medial" and "Sigma Terminal" pairs), so this is presumably the
		charset 8 (Greek) to Zenkaku mapping - confirm against the caller.
****************************************************************************/
static FLMBYTE From8ToZen[] =
{
	0x5e, 0x7e,
	0x5f, 0x7f,
	0x5f, 0xFF,
	0x60, 0x80,
	0x61, 0x81,
	0x62, 0x82,
	0x63, 0x83,
	0x64, 0x84,
	0x65, 0x85,
	0x66, 0x86,
	0x67, 0x87,
	0x68, 0x88,
	0x69, 0x89,
	0x6a, 0x8a,
	0x6b, 0x8b,
	0x6c, 0x8c,
	0x6d, 0x8d,
	0x6e, 0x8e,
	0x6f, 0x8f,
	0x6f, 0xFF,
	0x70, 0x90,
	0x71, 0x91,
	0x72, 0x92,
	0x73, 0x93,
	0x74, 0x94,
	0x75, 0x95
};
/****************************************************************************
Desc:	Conversion values for five Japanese punctuation characters.
		NOTE(review): presumably the first five characters of charset 11,
		mapped to CS24 like From0AToZen - confirm in f_wpHanToZenkaku().
****************************************************************************/
static FLMBYTE From11AToZen[] =
{
	0x02,		// japanese period
	0x35,		// left bracket
	0x36,		// right bracket
	0x01,		// comma
	0x05		// chuuten
};
/****************************************************************************
Desc:	Conversion values for kana characters.  The values agree with the
		index annotations of MapCS26ToCharSet11 (WO = 0x51, small tsu = 0x22,
		KA = 0x0a, N = 0x52, ...).
		NOTE(review): MapCS26ToCharSet11 maps 0x51 (WO) back to 0x05, which
		suggests entry 0 corresponds to character 11,5 - confirm in
		f_wpHanToZenkaku().
		Does not have wa WI WE VU ka ke.
****************************************************************************/
static FLMBYTE From11BToZen[] =
{
	0x51,										// wo
	0x00, 0x02, 0x04, 0x06, 0x08,		// small a i u e o
	0x42, 0x44, 0x46,						// small ya yu yo
	0x22,										// small tsu
	0xFF,										// dash (x241b)
	0x01, 0x03, 0x05, 0x07, 0x09,		// a i u e o
	0x0a, 0x0c, 0x0e, 0x10, 0x12,		// ka ki ku ke ko
	0x14, 0x16, 0x18, 0x1a, 0x1c,		// sa shi su se so
	0x1e, 0x20, 0x23, 0x25, 0x27,		// ta chi tsu te to
	0x29, 0x2a, 0x2b, 0x2c, 0x2d,		// na ni nu ne no
	0x2e, 0x31, 0x34, 0x37, 0x3a,		// ha hi fu he ho
	0x3d, 0x3e, 0x3f, 0x40, 0x41,		// ma mi mu me mo
	0x43, 0x45, 0x47,						// ya yu yo
	0x48, 0x49, 0x4a, 0x4b, 0x4c,		// ra ri ru re ro
	0x4e,										// WA
	0x52										// N
};
/****************************************************************************
Desc:	Fifteen ascending offsets.
		NOTE(review): the values line up with positions in fwp_indexj - 0,
		11, 26 and 35 are where its commented sections begin and 59 is its
		total entry count - so these are presumably range boundaries used
		when searching that table; confirm against the lookup code.
****************************************************************************/
static FLMUINT16 fwp_indexi[] =
{
	0,
	11,
	14,
	15,
	17,
	18,
	19,
	21,
	22,
	23,
	24,
	25,
	26,
	35,
	59
};
/****************************************************************************
Desc:	Key table made of four consecutive sections:
			entries  0-10  languages listed for double character handling
			entries 11-25  double character state entries
			entries 26-34  languages listed for single character handling
			entries 35-58  the single characters with alternate collation
		Character keys hold the character set in the high byte and the
		character in the low byte (0x0162 is annotated as 1,98).
		NOTE(review): fwp_valuea appears to hold the value for each key at
		the same position - confirm against the lookup code.
****************************************************************************/
static FLMUINT16 fwp_indexj[] =
{
FLM_CA_LANG, // Catalan (0)
FLM_CF_LANG, // Canadian French
FLM_CZ_LANG, // Czech
FLM_SL_LANG, // Slovak
FLM_DE_LANG, // German
FLM_SD_LANG, // Swiss German
FLM_ES_LANG, // Spanish (Spain)
FLM_FR_LANG, // French
FLM_NL_LANG, // Netherlands
0xFFFF, // DK_LANG, Danish - support for 'aa' -> a-ring out
0xFFFF, // NO_LANG, Norwegian - support for 'aa' -> a-ring out
0x0063, // c - DOUBLE CHARACTERS - STATE ENTRIES (11)
0x006c, // l
0x0197, // l with center dot
0x0063, // c
0x0125, // ae digraph
0x01a7, // oe digraph
0x0068, // h
0x0068, // h
0x006c, // l
0x0101, // center dot alone
0x006c, // l
0x0117, // German double s (1,23)
0x018b, // ij digraph
0x0000, // was 'a' - will no longer map 'aa' to a-ring
0x0000, // was 'a'
FLM_CZ_LANG, // SINGLE CHARS - LANGUAGES (26)
FLM_DK_LANG,
FLM_NO_LANG,
FLM_SL_LANG,
FLM_TK_LANG,
FLM_SU_LANG,
FLM_IS_LANG,
FLM_SV_LANG,
FLM_YK_LANG,
// SINGLE CHARS (35) - trailing numbers count from the start of this section
0x011e, // A Diaeresis - alternate collating sequences
0x011f, // a Diaeresis
0x0122, // A Ring - 2
0x0123, // a Ring
0x0124, // AE Digraph - 4
0x0125, // ae digraph
0x013e, // O Diaeresis - 6
0x013f, // o Diaeresis
0x0146, // U Diaeresis - 8
0x0147, // u Diaeresis
0x0150, // O Slash - 10
0x0151, // o Slash
0x0A3a, // CYRILLIC SOFT SIGN - 12
0x0A3b, // CYRILLIC soft sign
// NOTE(review): fwp_mn60Tbl labels 1,238 "dotless I" and 1,239 "dotless i",
// i.e. the opposite case of the two comments below - confirm which is right.
0x01ee, // dotless i - turkish - 14
0x01ef, // dotless I - turkish
0x0162, // C Hacek/caron - 1,98 - 16
0x0163, // c Hacek/caron - 1,99
0x01aa, // R Hacek/caron - 1,170 - 18
0x01ab, // r Hacek/caron - 1,171
0x01b0, // S Hacek/caron - 1,176 - 20
0x01b1, // s Hacek/caron - 1,177
0x01ce, // Z Hacek/caron - 1,206 - 22
0x01cf, // z Hacek/caron - 1,207
};
/****************************************************************************
Desc:	Value table with three sections: 26 double character state values,
		9 START_xx values (one per single character language), and eight
		fixup areas of 24 collation values each.
		NOTE(review): the first 35 entries appear to parallel the first 35
		keys of fwp_indexj, and each 24-entry fixup area appears to parallel
		its 24 SINGLE CHARS keys (A Diaeresis pair, A Ring pair, AE pair,
		O Diaeresis pair, U Diaeresis pair, O Slash pair, Cyrillic soft sign
		pair, dotless i pair, then the C, R, S and Z caron pairs), two keys
		per character - confirm against the lookup code.
****************************************************************************/
static FLMUINT16 fwp_valuea[] =
{
// DOUBLE CHAR STATE VALUES
STATE1, // 00
STATE3,
STATE2,
STATE2,
STATE8,
STATE8,
STATE1,
STATE3,
STATE9,
STATE10, // No longer in use
STATE10, // No longer in use
STATE4,
STATE6,
STATE6,
STATE5,
INSTAE,
INSTOE,
AFTERC,
AFTERH,
AFTERL,
STATE7,
STATE6,
INSTSG, // ss for German
INSTIJ,
STATE11, // aa - no longer in use
WITHAA, // aa - no longer in use
// SINGLE CHARS - LANGUAGES
START_CZ, // Czech
START_DK, // Danish
START_NO, // Norwegian
START_SL, // Slovak
START_TK, // Turkish
START_SU, // Finnish
START_IS, // Icelandic
START_SV, // Swedish
START_YK, // Ukrainian
// SINGLE CHARS FIXUP AREAS - 24 values (6 rows of 4) per area
COLS9, COLS9, COLS9, COLS9, // US & OTHERS
COLS9+1, COLS9+1, COLS9+21, COLS9+21,
COLS9+30, COLS9+30, COLS9+21, COLS9+21,
COLS10+43, COLS10+43, COLS9+12, COLS9+12,
COLS9+3, COLS9+3, COLS9+25, COLS9+25,
COLS9+27, COLS9+27, COLS9+35, COLS9+35,
COLS9+45, COLS9+45, COLS9+55, COLS9+55, // DANISH
COLS9+42, COLS9+42, COLS9+53, COLS9+53,
COLS9+30, COLS9+30, COLS9+49, COLS9+49, // Oct98 U Diaer no longer to y Diaer
COLS10+43, COLS10+43, COLS9+12, COLS9+12,
COLS9+3, COLS9+3, COLS9+25, COLS9+25,
COLS9+27, COLS9+27, COLS9+35, COLS9+35,
COLS9, COLS9, COLS9, COLS9, // Icelandic
COLS9+46, COLS9+46, COLS9+50, COLS9+50,
COLS9+30, COLS9+30, COLS9+54, COLS9+54,
COLS10+43, COLS10+43, COLS9+12, COLS9+12,
COLS9+3, COLS9+3, COLS9+25, COLS9+25,
COLS9+27, COLS9+27, COLS9+35, COLS9+35,
COLS9, COLS9, COLS9+51, COLS9+51, // Norwegian
COLS9+43, COLS9+43, COLS9+21, COLS9+21,
COLS9+30, COLS9+30, COLS9+47, COLS9+47,
COLS10+43, COLS10+43, COLS9+12, COLS9+12,
COLS9+3, COLS9+3, COLS9+25, COLS9+25,
COLS9+27, COLS9+27, COLS9+35, COLS9+35,
COLS9+48, COLS9+48, COLS9+44, COLS9+44, // Finnish/Swedish
COLS9+1, COLS9+1, COLS9+52, COLS9+52,
COLS9+30, COLS9+30, COLS9+21, COLS9+21, // Oct98 U Diaer no longer to y Diaer
COLS10+43, COLS10+43, COLS9+12, COLS9+12,
COLS9+3, COLS9+3, COLS9+25, COLS9+25,
COLS9+27, COLS9+27, COLS9+35, COLS9+35,
COLS9, COLS9, COLS9, COLS9, // Ukrainian
COLS9+1, COLS9+1, COLS9+21, COLS9+21,
COLS9+30, COLS9+30, COLS9+21, COLS9+21,
COLS10+48, COLS10+48, COLS9+12, COLS9+12,
COLS9+3, COLS9+3, COLS9+25, COLS9+25,
COLS9+27, COLS9+27, COLS9+35, COLS9+35,
COLS9, COLS9, COLS9, COLS9, // Turkish
COLS9+1, COLS9+1, COLS9+21, COLS9+21,
COLS9+30, COLS9+30, COLS9+21, COLS9+21,
COLS9+43, COLS9+43, COLS9+11, COLS9+11, // dotless i same as
COLS9+3, COLS9+3, COLS9+25, COLS9+25, // the "CH" in Czech
COLS9+27, COLS9+27, COLS9+35, COLS9+35, // works because char
// fails brkcar()
COLS9, COLS9, COLS9, COLS9, // Czech / Slovak
COLS9+1, COLS9+1, COLS9+21, COLS9+21,
COLS9+30, COLS9+30, COLS9+21, COLS9+21,
COLS10+43, COLS10+43, COLS9+12, COLS9+12,
COLS9+5, COLS9+5, COLS9+26, COLS9+26, // carons
COLS9+28, COLS9+28, COLS9+36, COLS9+36
};
/****************************************************************************
Desc:	Collation values for the ASCII characters 0x20 (space) through
		0x7E (~).  Byte 0 is the first character covered, byte 1 is the
		number of entries (ASCTBLLEN), followed by one collation value per
		character.  Upper and lower case letters share a collation value.
****************************************************************************/
static FLMBYTE fwp_asc60Tbl[ ASCTBLLEN + 2] =
{
0x20, // initial character offset!!
ASCTBLLEN, // len of this table
COLLS, // <Spc>
COLLS+5, // !
COLS1, // "
COLS6+1, // #
COLS3, // $
COLS6, // %
COLS6+2, // &
COLS1+1, // '
COLS2, // (
COLS2+1, // )
COLS4+2, // *
COLS4, // +
COLLS+2, // ,
COLS4+1, // -
COLLS+1, // .
COLS4+3, // /
COLS8, // 0
COLS8+1, // 1
COLS8+2, // 2
COLS8+3, // 3
COLS8+4, // 4
COLS8+5, // 5
COLS8+6, // 6
COLS8+7, // 7
COLS8+8, // 8
COLS8+9, // 9
COLLS+3, // :
COLLS+4, // ;
COLS5, // <
COLS5+2, // =
COLS5+4, // >
COLLS+7, // ?
COLS6+3, // @
COLS9, // A
COLS9+2, // B
COLS9+3, // C
COLS9+6, // D
COLS9+7, // E
COLS9+8, // F
COLS9+9, // G
COLS9+10, // H
COLS9+12, // I
COLS9+14, // J
COLS9+15, // K
COLS9+16, // L
COLS9+18, // M
COLS9+19, // N
COLS9+21, // O
COLS9+23, // P
COLS9+24, // Q
COLS9+25, // R
COLS9+27, // S
COLS9+29, // T
COLS9+30, // U
COLS9+31, // V
COLS9+32, // W
COLS9+33, // X
COLS9+34, // Y
COLS9+35, // Z
COLS9+40, // [ (note: alphabetic - end of list)
COLS6+4, // Backslash
COLS9+41, // ] (note: alphabetic - end of list)
COLS4+4, // ^
COLS6+5, // _
COLS1+2, // `
COLS9, // a
COLS9+2, // b
COLS9+3, // c
COLS9+6, // d
COLS9+7, // e
COLS9+8, // f
COLS9+9, // g
COLS9+10, // h
COLS9+12, // i
COLS9+14, // j
COLS9+15, // k
COLS9+16, // l
COLS9+18, // m
COLS9+19, // n
COLS9+21, // o
COLS9+23, // p
COLS9+24, // q
COLS9+25, // r
COLS9+27, // s
COLS9+29, // t
COLS9+30, // u
COLS9+31, // v
COLS9+32, // w
COLS9+33, // x
COLS9+34, // y
COLS9+35, // z
COLS2+4, // {
COLS6+6, // |
COLS2+5, // }
COLS6+7 // ~
};
/****************************************************************************
Desc:	Collation values for accented and other multinational letters.
		Byte 0 is the first character covered (23), byte 1 is the number of
		entries (MNTBLLEN), followed by one collation value per character.
		Most accented letters collate with their base letter; exceptions
		visible below include N Tilde (COLS9+20) and the AE, IJ and OE
		digraphs.
		The [1,n] markers give the character position of the next entry and
		agree with the 1,98 style annotations on the keys in fwp_indexj.
****************************************************************************/
static FLMBYTE fwp_mn60Tbl[ MNTBLLEN + 2] =
{
23, // initial character offset!!
MNTBLLEN, // len of this table
COLS9+27, // German Double s
COLS9+15, // Icelandic k
COLS9+14, // Dotless j
// IBM Charset [1,26]
COLS9, // A Acute
COLS9, // a Acute
COLS9, // A Circumflex
COLS9, // a Circumflex
COLS9, // A Diaeresis or Umlaut
COLS9, // a Diaeresis or Umlaut
COLS9, // A Grave
COLS9, // a Grave
COLS9, // A Ring
COLS9, // a Ring
COLS9+1, // AE digraph
COLS9+1, // ae digraph
COLS9+3, // C Cedilla
COLS9+3, // c Cedilla
COLS9+7, // E Acute
COLS9+7, // e Acute
COLS9+7, // E Circumflex
COLS9+7, // e Circumflex
COLS9+7, // E Diaeresis or Umlaut
COLS9+7, // e Diaeresis or Umlaut
COLS9+7, // E Grave
COLS9+7, // e Grave
COLS9+12, // I Acute
COLS9+12, // i Acute
COLS9+12, // I Circumflex
COLS9+12, // i Circumflex
COLS9+12, // I Diaeresis or Umlaut
COLS9+12, // i Diaeresis or Umlaut
COLS9+12, // I Grave
COLS9+12, // i Grave
COLS9+20, // N Tilde
COLS9+20, // n Tilde
COLS9+21, // O Acute
COLS9+21, // o Acute
COLS9+21, // O Circumflex
COLS9+21, // o Circumflex
COLS9+21, // O Diaeresis or Umlaut
COLS9+21, // o Diaeresis or Umlaut
COLS9+21, // O Grave
COLS9+21, // o Grave
COLS9+30, // U Acute
COLS9+30, // u Acute
COLS9+30, // U Circumflex
COLS9+30, // u Circumflex
COLS9+30, // U Diaeresis or Umlaut
COLS9+30, // u Diaeresis or Umlaut
COLS9+30, // U Grave
COLS9+30, // u Grave
COLS9+34, // Y Diaeresis or Umlaut
COLS9+34, // y Diaeresis or Umlaut
// IBM foreign [1,76]
COLS9, // A Tilde
COLS9, // a Tilde
COLS9+6, // D Cross Bar
COLS9+6, // d Cross Bar
COLS9+21, // O Slash
COLS9+21, // o Slash
COLS9+21, // O Tilde
COLS9+21, // o Tilde
COLS9+34, // Y Acute
COLS9+34, // y Acute
COLS9+6, // Uppercase Eth
COLS9+6, // Lowercase Eth
COLS9+37, // Uppercase Thorn
COLS9+37, // Lowercase Thorn
// Teletex chars [1,90]
COLS9, // A Breve
COLS9, // a Breve
COLS9, // A Macron
COLS9, // a Macron
COLS9, // A Ogonek
COLS9, // a Ogonek
COLS9+3, // C Acute
COLS9+3, // c Acute
COLS9+3, // C Caron or Hachek
COLS9+3, // c Caron or Hachek
COLS9+3, // C Circumflex
COLS9+3, // c Circumflex
COLS9+3, // C Dot Above
COLS9+3, // c Dot Above
COLS9+6, // D Caron or Hachek (Apostrophe Beside)
COLS9+6, // d Caron or Hachek (Apostrophe Beside)
COLS9+7, // E Caron or Hachek
COLS9+7, // e Caron or Hachek
COLS9+7, // E Dot Above
COLS9+7, // e Dot Above
COLS9+7, // E Macron
COLS9+7, // e Macron
COLS9+7, // E Ogonek
COLS9+7, // e Ogonek
COLS9+9, // G Acute
COLS9+9, // g Acute
COLS9+9, // G Breve
COLS9+9, // g Breve
COLS9+9, // G Caron or Hachek
COLS9+9, // g Caron or Hachek
COLS9+9, // G Cedilla (Apostrophe Under)
COLS9+9, // g Cedilla (Apostrophe Over)
COLS9+9, // G Circumflex
COLS9+9, // g Circumflex
COLS9+9, // G Dot Above
COLS9+9, // g Dot Above
COLS9+10, // H Circumflex
COLS9+10, // h Circumflex
COLS9+10, // H Cross Bar
COLS9+10, // h Cross Bar
COLS9+12, // I Dot Above (Sharp Accent)
COLS9+12, // i Dot Above (Sharp Accent)
COLS9+12, // I Macron
COLS9+12, // i Macron
COLS9+12, // I Ogonek
COLS9+12, // i Ogonek
COLS9+12, // I Tilde
COLS9+12, // i Tilde
COLS9+13, // IJ Digraph
COLS9+13, // ij Digraph
COLS9+14, // J Circumflex
COLS9+14, // j Circumflex
COLS9+15, // K Cedilla (Apostrophe Under)
COLS9+15, // k Cedilla (Apostrophe Under)
COLS9+16, // L Acute
COLS9+16, // l Acute
COLS9+16, // L Caron or Hachek (Apostrophe Beside)
COLS9+16, // l Caron or Hachek (Apostrophe Beside)
COLS9+16, // L Cedilla (Apostrophe Under)
COLS9+16, // l Cedilla (Apostrophe Under)
COLS9+16, // L Center Dot
COLS9+16, // l Center Dot
COLS9+16, // L Stroke
COLS9+16, // l Stroke
COLS9+19, // N Acute
COLS9+19, // n Acute
COLS9+19, // N Apostrophe
COLS9+19, // n Apostrophe
COLS9+19, // N Caron or Hachek
COLS9+19, // n Caron or Hachek
COLS9+19, // N Cedilla (Apostrophe Under)
COLS9+19, // n Cedilla (Apostrophe Under)
COLS9+21, // O Double Acute
COLS9+21, // o Double Acute
COLS9+21, // O Macron
COLS9+21, // o Macron
COLS9+22, // OE digraph
COLS9+22, // oe digraph
COLS9+25, // R Acute
COLS9+25, // r Acute
COLS9+25, // R Caron or Hachek
COLS9+25, // r Caron or Hachek
COLS9+25, // R Cedilla (Apostrophe Under)
COLS9+25, // r Cedilla (Apostrophe Under)
COLS9+27, // S Acute
COLS9+27, // s Acute
COLS9+27, // S Caron or Hachek
COLS9+27, // s Caron or Hachek
COLS9+27, // S Cedilla
COLS9+27, // s Cedilla
COLS9+27, // S Circumflex
COLS9+27, // s Circumflex
COLS9+29, // T Caron or Hachek (Apostrophe Beside)
COLS9+29, // t Caron or Hachek (Apostrophe Beside)
COLS9+29, // T Cedilla (Apostrophe Under)
COLS9+29, // t Cedilla (Apostrophe Under)
COLS9+29, // T Cross Bar
COLS9+29, // t Cross Bar
COLS9+30, // U Breve
COLS9+30, // u Breve
COLS9+30, // U Double Acute
COLS9+30, // u Double Acute
COLS9+30, // U Macron
COLS9+30, // u Macron
COLS9+30, // U Ogonek
COLS9+30, // u Ogonek
COLS9+30, // U Ring
COLS9+30, // u Ring
COLS9+30, // U Tilde
COLS9+30, // u Tilde
COLS9+32, // W Circumflex
COLS9+32, // w Circumflex
COLS9+34, // Y Circumflex
COLS9+34, // y Circumflex
COLS9+35, // Z Acute
COLS9+35, // z Acute
COLS9+35, // Z Caron or Hachek
COLS9+35, // z Caron or Hachek
COLS9+35, // Z Dot Above
COLS9+35, // z Dot Above
COLS9+19, // Uppercase Eng
COLS9+19, // Lowercase Eng
// Other [1,212]
COLS9+6, // D Macron
COLS9+6, // d Macron
COLS9+16, // L Macron
COLS9+16, // l Macron
COLS9+19, // N Macron
COLS9+19, // n Macron
COLS9+25, // R Grave
COLS9+25, // r Grave
COLS9+27, // S Macron
COLS9+27, // s Macron
COLS9+29, // T Macron
COLS9+29, // t Macron
COLS9+34, // Y Breve
COLS9+34, // y Breve
COLS9+34, // Y Grave
COLS9+34, // y Grave
COLS9+6, // D Apostrophe Beside
COLS9+6, // d Apostrophe Beside
COLS9+21, // O Apostrophe Beside
COLS9+21, // o Apostrophe Beside
COLS9+30, // U Apostrophe Beside
COLS9+30, // u Apostrophe Beside
COLS9+7, // E breve
COLS9+7, // e breve
COLS9+12, // I breve
COLS9+12, // i breve
COLS9+12, // dotless I [1,238]
COLS9+12, // dotless i
COLS9+21, // O breve
COLS9+21 // o breve
};
/****************************************************************************
Desc:	Collation values for currency symbols.  Byte 0 is the first
		character covered (11), byte 1 is the number of entries (SYMTBLLEN),
		followed by one collation value per character.  COLS0 marks
		characters without a collate value.
****************************************************************************/
static FLMBYTE fwp_sym60Tbl[ SYMTBLLEN + 2] =
{
11, // initial character offset!!
SYMTBLLEN, // len of this table
COLS3+2, // pound
COLS3+3, // yen
COLS3+4, // pesetas
COLS3+5, // florin
COLS0,
COLS0,
COLS0,
COLS0,
COLS3+1, // cent
};
/****************************************************************************
Desc:	Collation values for the Greek characters (character set 8, per the
		[8,n] position markers below).  Byte 0 is the first character
		covered (0), byte 1 is the number of entries (GRKTBLLEN), followed
		by one collation value per character.  Accented and variant forms
		collate with their base letter; the diacritic marks themselves have
		no collate value (COLS0).
****************************************************************************/
static FLMBYTE fwp_grk60Tbl[ GRKTBLLEN + 2] =
{
0, // starting offset
GRKTBLLEN, // length
COLS7, // Uppercase Alpha
COLS7, // Lowercase Alpha
COLS7+1, // Uppercase Beta
COLS7+1, // Lowercase Beta
COLS7+1, // Uppercase Beta Medial
COLS7+1, // Lowercase Beta Medial
COLS7+2, // Uppercase Gamma
COLS7+2, // Lowercase Gamma
COLS7+3, // Uppercase Delta
COLS7+3, // Lowercase Delta
COLS7+4, // Uppercase Epsilon
COLS7+4, // Lowercase Epsilon
COLS7+5, // Uppercase Zeta
COLS7+5, // Lowercase Zeta
COLS7+6, // Uppercase Eta
COLS7+6, // Lowercase Eta
COLS7+7, // Uppercase Theta
COLS7+7, // Lowercase Theta
COLS7+8, // Uppercase Iota
COLS7+8, // Lowercase Iota
COLS7+9, // Uppercase Kappa
COLS7+9, // Lowercase Kappa
COLS7+10, // Uppercase Lambda
COLS7+10, // Lowercase Lambda
COLS7+11, // Uppercase Mu
COLS7+11, // Lowercase Mu
COLS7+12, // Uppercase Nu
COLS7+12, // Lowercase Nu
COLS7+13, // Uppercase Xi
COLS7+13, // Lowercase Xi
COLS7+14, // Uppercase Omicron
COLS7+14, // Lowercase Omicron
COLS7+15, // Uppercase Pi
COLS7+15, // Lowercase Pi
COLS7+16, // Uppercase Rho
COLS7+16, // Lowercase Rho
COLS7+17, // Uppercase Sigma
COLS7+17, // Lowercase Sigma
COLS7+17, // Uppercase Sigma Terminal
COLS7+17, // Lowercase Sigma Terminal
COLS7+18, // Uppercase Tau
COLS7+18, // Lowercase Tau
COLS7+19, // Uppercase Upsilon
COLS7+19, // Lowercase Upsilon
COLS7+20, // Uppercase Phi
COLS7+20, // Lowercase Phi
COLS7+21, // Uppercase Chi
COLS7+21, // Lowercase Chi
COLS7+22, // Uppercase Psi
COLS7+22, // Lowercase Psi
COLS7+23, // Uppercase Omega
COLS7+23, // Lowercase Omega
// Other Modern Greek Characters [8,52]
COLS7, // Uppercase ALPHA Tonos high prime
COLS7, // Lowercase Alpha Tonos - acute
COLS7+4, // Uppercase EPSILON Tonos - high prime
COLS7+4, // Lowercase Epsilon Tonos - acute
COLS7+6, // Uppercase ETA Tonos - high prime
COLS7+6, // Lowercase Eta Tonos - acute
COLS7+8, // Uppercase IOTA Tonos - high prime
COLS7+8, // Lowercase iota Tonos - acute
COLS7+8, // Uppercase IOTA Diaeresis
COLS7+8, // Lowercase iota diaeresis
COLS7+14, // Uppercase OMICRON Tonos - high prime
COLS7+14, // Lowercase Omicron Tonos - acute
COLS7+19, // Uppercase UPSILON Tonos - high prime
COLS7+19, // Lowercase Upsilon Tonos - acute
COLS7+19, // Uppercase UPSILON Diaeresis
COLS7+19, // Lowercase Upsilon diaeresis
COLS7+23, // Uppercase OMEGA Tonos - high prime
COLS7+23, // Lowercase Omega Tonos - acute
// Variants [8,70]
COLS7+4, // epsilon (variant)
COLS7+7, // theta (variant)
COLS7+9, // kappa (variant)
COLS7+15, // pi (variant)
COLS7+16, // rho (variant)
COLS7+17, // sigma (variant)
COLS7+19, // upsilon (variant)
COLS7+20, // phi (variant)
COLS7+23, // omega (variant)
// Greek Diacritic marks [8,79]
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0,
COLS0, // 8,108 end of diacritic marks
// Ancient Greek [8,109]
COLS7, // alpha grave
COLS7, // alpha circumflex
COLS7, // alpha w/iota
COLS7, // alpha acute w/iota
COLS7, // alpha grave w/iota
COLS7, // alpha circumflex w/Iota
COLS7, // alpha smooth
COLS7, // alpha smooth acute
COLS7, // alpha smooth grave
COLS7, // alpha smooth circumflex
COLS7, // alpha smooth w/Iota
COLS7, // alpha smooth acute w/Iota
COLS7, // alpha smooth grave w/Iota
COLS7, // alpha smooth circumflex w/Iota
// [8,123]
COLS7, // alpha rough
COLS7, // alpha rough acute
COLS7, // alpha rough grave
COLS7, // alpha rough circumflex
COLS7, // alpha rough w/Iota
COLS7, // alpha rough acute w/Iota
COLS7, // alpha rough grave w/Iota
COLS7, // alpha rough circumflex w/Iota
// [8,131]
COLS7+4, // epsilon grave
COLS7+4, // epsilon smooth
COLS7+4, // epsilon smooth acute
COLS7+4, // epsilon smooth grave
COLS7+4, // epsilon rough
COLS7+4, // epsilon rough acute
COLS7+4, // epsilon rough grave
// [8,138]
COLS7+6, // eta grave
COLS7+6, // eta circumflex
COLS7+6, // eta w/iota
COLS7+6, // eta acute w/iota
COLS7+6, // eta grave w/Iota
COLS7+6, // eta circumflex w/Iota
COLS7+6, // eta smooth
COLS7+6, // eta smooth acute
COLS7+6, // eta smooth grave
COLS7+6, // eta smooth circumflex
COLS7+6, // eta smooth w/Iota
COLS7+6, // eta smooth acute w/Iota
COLS7+6, // eta smooth grave w/Iota
COLS7+6, // eta smooth circumflex w/Iota
COLS7+6, // eta rough
COLS7+6, // eta rough acute
COLS7+6, // eta rough grave
COLS7+6, // eta rough circumflex
COLS7+6, // eta rough w/Iota
COLS7+6, // eta rough acute w/Iota
COLS7+6, // eta rough grave w/Iota
COLS7+6, // eta rough circumflex w/Iota
// [8,160]
COLS7+8, // iota grave
COLS7+8, // iota circumflex
COLS7+8, // iota acute diaeresis
COLS7+8, // iota grave diaeresis
COLS7+8, // iota smooth
COLS7+8, // iota smooth acute
COLS7+8, // iota smooth grave
COLS7+8, // iota smooth circumflex
COLS7+8, // iota rough
COLS7+8, // iota rough acute
COLS7+8, // iota rough grave
COLS7+8, // iota rough circumflex
// [8,172]
COLS7+14, // omicron grave
COLS7+14, // omicron smooth
COLS7+14, // omicron smooth acute
COLS7+14, // omicron smooth grave
COLS7+14, // omicron rough
COLS7+14, // omicron rough acute
COLS7+14, // omicron rough grave
// [8,179]
COLS7+16, // rho smooth
COLS7+16, // rho rough
// [8,181]
COLS7+19, // upsilon grave
COLS7+19, // upsilon circumflex
COLS7+19, // upsilon acute diaeresis
COLS7+19, // upsilon grave diaeresis
COLS7+19, // upsilon smooth
COLS7+19, // upsilon smooth acute
COLS7+19, // upsilon smooth grave
COLS7+19, // upsilon smooth circumflex
COLS7+19, // upsilon rough
COLS7+19, // upsilon rough acute
COLS7+19, // upsilon rough grave
COLS7+19, // upsilon rough circumflex
// [8,193]
COLS7+23, // omega grave
COLS7+23, // omega circumflex
COLS7+23, // omega w/Iota
COLS7+23, // omega acute w/Iota
COLS7+23, // omega grave w/Iota
COLS7+23, // omega circumflex w/Iota
COLS7+23, // omega smooth
COLS7+23, // omega smooth acute
COLS7+23, // omega smooth grave
COLS7+23, // omega smooth circumflex
COLS7+23, // omega smooth w/Iota
COLS7+23, // omega smooth acute w/Iota
COLS7+23, // omega smooth grave w/Iota
COLS7+23, // omega smooth circumflex w/Iota
COLS7+23, // omega rough
COLS7+23, // omega rough acute
COLS7+23, // omega rough grave
COLS7+23, // omega rough circumflex
COLS7+23, // omega rough w/Iota
COLS7+23, // omega rough acute w/Iota
COLS7+23, // omega rough grave w/Iota
COLS7+23, // omega rough circumflex w/Iota
// [8,215] - all four obsolete letters share one collation value
COLS7+24, // Uppercase Stigma--the number 6
COLS7+24, // Uppercase Digamma--Obsolete letter used as 6
COLS7+24, // Uppercase Koppa--Obsolete letter used as 90
COLS7+24 // Uppercase Sampi--Obsolete letter used as 900
};
/****************************************************************************
Desc:	Collation values for the Cyrillic characters (character set 10, per
		the [10,199] marker on the last entry).  Byte 0 is the first
		character covered (0), byte 1 is the number of entries (CYRLTBLLEN),
		followed by one collation value per character, stored as
		uppercase/lowercase pairs.  COLS0 marks characters without a collate
		value.
****************************************************************************/
static FLMBYTE fwp_cyrl60Tbl[ CYRLTBLLEN + 2] =
{
0, // starting offset
CYRLTBLLEN, // len of table
COLS10, // Russian uppercase A
COLS10, // Russian lowercase A
COLS10+1, // Russian uppercase BE
COLS10+1, // Russian lowercase BE
COLS10+2, // Russian uppercase VE
COLS10+2, // Russian lowercase VE
COLS10+3, // Russian uppercase GHE
COLS10+3, // Russian lowercase GHE
COLS10+5, // Russian uppercase DE
COLS10+5, // Russian lowercase DE
COLS10+8, // Russian uppercase E
COLS10+8, // Russian lowercase E
COLS10+9, // Russian uppercase YO
COLS10+9, // Russian lowercase YO
COLS10+11, // Russian uppercase ZHE
COLS10+11, // Russian lowercase ZHE
COLS10+12, // Russian uppercase ZE
COLS10+12, // Russian lowercase ZE
COLS10+14, // Russian uppercase I
COLS10+14, // Russian lowercase I
COLS10+17, // Russian uppercase SHORT I
COLS10+17, // Russian lowercase SHORT I
COLS10+19, // Russian uppercase KA
COLS10+19, // Russian lowercase KA
COLS10+20, // Russian uppercase EL
COLS10+20, // Russian lowercase EL
COLS10+22, // Russian uppercase EM
COLS10+22, // Russian lowercase EM
COLS10+23, // Russian uppercase EN
COLS10+23, // Russian lowercase EN
COLS10+25, // Russian uppercase O
COLS10+25, // Russian lowercase O
COLS10+26, // Russian uppercase PE
COLS10+26, // Russian lowercase PE
COLS10+27, // Russian uppercase ER
COLS10+27, // Russian lowercase ER
COLS10+28, // Russian uppercase ES
COLS10+28, // Russian lowercase ES
COLS10+29, // Russian uppercase TE
COLS10+29, // Russian lowercase TE
COLS10+32, // Russian uppercase U
COLS10+32, // Russian lowercase U
COLS10+34, // Russian uppercase EF
COLS10+34, // Russian lowercase EF
COLS10+35, // Russian uppercase HA
COLS10+35, // Russian lowercase HA
COLS10+36, // Russian uppercase TSE
COLS10+36, // Russian lowercase TSE
COLS10+37, // Russian uppercase CHE
COLS10+37, // Russian lowercase CHE
COLS10+39, // Russian uppercase SHA
COLS10+39, // Russian lowercase SHA
COLS10+40, // Russian uppercase SHCHA
COLS10+40, // Russian lowercase SHCHA
COLS10+41, // Russian uppercase ER (also hard sign)
COLS10+41, // Russian lowercase ER (also hard sign)
COLS10+42, // Russian uppercase ERY
COLS10+42, // Russian lowercase ERY
COLS10+43, // Russian uppercase SOFT SIGN
COLS10+43, // Russian lowercase SOFT SIGN
COLS10+45, // Russian uppercase REVERSE E
COLS10+45, // Russian lowercase REVERSE E
COLS10+46, // Russian uppercase YU
COLS10+46, // Russian lowercase yu
COLS10+47, // Russian uppercase YA
COLS10+47, // Russian lowercase ya
COLS0, // Russian uppercase EH
COLS0, // Russian lowercase eh
COLS10+7, // Macedonian uppercase SOFT DJ
COLS10+7, // Macedonian lowercase soft dj
COLS10+4, // Ukrainian uppercase HARD G
COLS10+4, // Ukrainian lowercase hard g
COLS0, // GE bar
COLS0, // ge bar
COLS10+6, // Serbian uppercase SOFT DJ
COLS10+6, // Serbian lowercase SOFT DJ
COLS0, // IE (variant)
COLS0, // ie (variant)
COLS10+10, // Ukrainian uppercase YE
COLS10+10, // Ukrainian lowercase YE
COLS0, // ZHE with right descender
COLS0, // zhe with right descender
COLS10+13, // Macedonian uppercase ZELO
COLS10+13, // Macedonian lowercase ZELO
COLS0, // Old Slavonic uppercase Z
COLS0, // Old Slavonic lowercase z
COLS0, // II with macron
COLS0, // ii with macron
COLS10+15, // Ukrainian uppercase I
COLS10+15, // Ukrainian lowercase I
COLS10+16, // Ukrainian uppercase I with Two Dots
COLS10+16, // Ukrainian lowercase I with Two Dots
COLS0, // Old Slavonic uppercase I ligature
COLS0, // Old Slavonic lowercase I ligature
COLS10+18, // Serbian--Macedonian uppercase JE
COLS10+18, // Serbian--Macedonian lowercase JE
COLS10+31, // Macedonian uppercase SOFT K
COLS10+31, // Macedonian lowercase SOFT K
COLS0, // KA with right descender
COLS0, // ka with right descender
COLS0, // KA ogonek
COLS0, // ka ogonek
COLS0, // KA vertical bar
COLS0, // ka vertical bar
COLS10+21, // Serbian--Macedonian uppercase SOFT L
COLS10+21, // Serbian--Macedonian lowercase SOFT L
COLS0, // EN with right descender
COLS0, // en with right descender
COLS10+24, // Serbian--Macedonian uppercase SOFT N
COLS10+24, // Serbian--Macedonian lowercase SOFT N
COLS0, // ROUND OMEGA
COLS0, // round omega
COLS0, // OMEGA
COLS0, // omega
COLS10+30, // Serbian uppercase SOFT T
COLS10+30, // Serbian lowercase SOFT T
COLS10+33, // Byelorussian uppercase SHORT U
COLS10+33, // Byelorussian lowercase SHORT U
COLS0, // U with macron
COLS0, // u with macron
COLS0, // STRAIGHT U
COLS0, // straight u
COLS0, // STRAIGHT U bar
COLS0, // straight u bar
COLS0, // OU ligature
COLS0, // ou ligature
COLS0, // KHA with right descender
COLS0, // kha with right descender
COLS0, // KHA ogonek
COLS0, // kha ogonek
COLS0, // H
COLS0, // h
COLS0, // OMEGA titlo
COLS0, // omega titlo
COLS10+38, // Serbian uppercase HARD DJ
COLS10+38, // Serbian lowercase HARD DJ
COLS0, // CHE with right descender
COLS0, // che with right descender
COLS0, // CHE vertical bar
COLS0, // che vertical bar
COLS0, // Old Slavonic SHCHA (variant)
COLS0, // old SLAVONIC shcha (variant)
COLS10+44, // Old Russian uppercase YAT
COLS10+44, // Old Russian lowercase YAT
// END OF UNIQUE COLLATED BYTES
// CHARACTERS BELOW MUST HAVE THEIR OWN
// SUB-COLLATION VALUE TO COMPARE CORRECTLY.
COLS0, // Old Bulgarian uppercase YUS
COLS0, // Old Bulgarian lowercase YUS
COLS0, // Old Slavonic uppercase YUS MALYI
COLS0, // Old Slavonic lowercase YUS MALYI
COLS0, // KSI
COLS0, // ksi
COLS0, // PSI
COLS0, // psi
COLS0, // Old Russian uppercase FITA
COLS0, // Old Russian lowercase FITA
COLS0, // Old Russian uppercase IZHITSA
COLS0, // Old Russian lowercase IZHITSA
COLS0, // Russian uppercase A acute
COLS0, // Russian lowercase A acute
COLS10+8, // Russian uppercase E acute
COLS10+8, // Russian lowercase E acute
// 160-below all characters are russian to 199
COLS0, // E acute
COLS0, // e acute
COLS10+14, // II acute
COLS10+14, // ii acute
COLS0, // I acute
COLS0, // i acute
COLS0, // YI acute
COLS0, // yi acute
COLS10+25, // O acute
COLS10+25, // o acute
COLS10+32, // U acute
COLS10+32, // u acute
COLS10+42, // YERI acute
COLS10+42, // yeri acute
COLS10+45, // REVERSED E acute
COLS10+45, // reversed e acute
COLS10+46, // YU acute
COLS10+46, // yu acute
COLS10+47, // YA acute
COLS10+47, // ya acute
COLS10, // A grave
COLS10, // a grave
COLS10+8, // E grave
COLS10+8, // e grave
COLS10+9, // YO grave
COLS10+9, // yo grave
COLS10+14, // I grave
COLS10+14, // i grave
COLS10+25, // O grave
COLS10+25, // o grave
COLS10+32, // U grave
COLS10+32, // u grave
COLS10+42, // YERI grave
COLS10+42, // yeri grave
COLS10+45, // REVERSED E grave
COLS10+45, // reversed e grave
COLS10+46, // IU (YU) grave
COLS10+46, // iu (yu) grave
COLS10+47, // IA (YA) grave
COLS10+47, // ia (ya) grave ******* [10,199]
};
/****************************************************************************
Desc:	The Hebrew characters are collated over the Russian characters
		Therefore sorting both Hebrew and Russian is impossible to do.
		Byte 0 is the first character covered (0), byte 1 is the number of
		entries (HEBTBL1LEN), followed by one collation value per character
		for Alef [9,0] through Tav [9,26].  Final forms have their own
		collation values here.
****************************************************************************/
static FLMBYTE fwp_heb60TblA[ HEBTBL1LEN + 2] =
{
0, // starting offset
HEBTBL1LEN, // len of table
COLS10h+0, // Alef
COLS10h+1, // Bet
COLS10h+2, // Gimel
COLS10h+3, // Dalet
COLS10h+4, // He
COLS10h+5, // Vav
COLS10h+6, // Zayin
COLS10h+7, // Het
COLS10h+8, // Tet
COLS10h+9, // Yod
COLS10h+10, // Kaf (final) [9,10]
COLS10h+11, // Kaf
COLS10h+12, // Lamed
COLS10h+13, // Mem (final)
COLS10h+14, // Mem
COLS10h+15, // Nun (final)
COLS10h+16, // Nun
COLS10h+17, // Samekh
COLS10h+18, // Ayin
COLS10h+19, // Pe (final)
COLS10h+20, // Pe [9,20]
COLS10h+21, // Tsadi (final)
COLS10h+22, // Tsadi
COLS10h+23, // Qof
COLS10h+24, // Resh
COLS10h+25, // Shin
COLS10h+26 // Tav [9,26]
};
/****************************************************************************
Desc:	This is the ANCIENT HEBREW SCRIPT piece.
		The actual value will be stored in the subcollation.
		This way we don't play diacritic/subcollation games.
		Byte 0 is the first character covered (84), byte 1 is the number of
		entries (HEBTBL2LEN), followed by one collation value per character
		for [9,84] through [9,118].  Each pointed form uses the collation
		value that fwp_heb60TblA assigns to its base letter.
****************************************************************************/
static FLMBYTE fwp_heb60TblB[ HEBTBL2LEN + 2] =
{
84, // starting offset
HEBTBL2LEN, // len of table
// [9,84]
COLS10h+0, // Alef Dagesh [9,84]
COLS10h+1, // Bet Dagesh
COLS10h+1, // Vez - looks like a bet
COLS10h+2, // Gimel Dagesh
COLS10h+3, // Dalet Dagesh
COLS10h+4, // He Dagesh
COLS10h+5, // Vav Dagesh [9,90]
COLS10h+5, // Vav Holem
COLS10h+6, // Zayin Dagesh
COLS10h+7, // Het Dagesh
COLS10h+8, // Tet Dagesh
COLS10h+9, // Yod Dagesh
COLS10h+9, // Yod Hiriq [9,96] - not on my list
COLS10h+11, // Kaf Dagesh
COLS10h+10, // Kaf Dagesh (final)
COLS10h+10, // Kaf Sheva (final)
COLS10h+10, // Kaf Tsere (final) [9,100]
COLS10h+10, // Kaf Segol (final)
COLS10h+10, // Kaf Patah (final)
COLS10h+10, // Kaf Qamats (final)
COLS10h+10, // Kaf Dagesh Qamats (final)
COLS10h+12, // Lamed Dagesh
COLS10h+14, // Mem Dagesh
COLS10h+16, // Nun Dagesh
COLS10h+15, // Nun Qamats (final)
COLS10h+17, // Samekh Dagesh
COLS10h+20, // Pe Dagesh [9,110]
COLS10h+20, // Fe - just guessing this is like Pe - was +21
COLS10h+22, // Tsadi Dagesh
COLS10h+23, // Qof Dagesh
COLS10h+25, // Sin (with sin dot)
COLS10h+25, // Sin Dagesh (with sin dot)
COLS10h+25, // Shin
COLS10h+25, // Shin Dagesh
COLS10h+26 // Tav Dagesh [9,118]
};
/****************************************************************************
Desc: The Arabic characters are collated OVER the Russian characters
Therefore sorting both Arabic and Russian in the same database
is not supported.
Arabic starts with a bunch of accents/diacritic marks that are
actually placed OVER a preceding character. These accents are
ignored while sorting the first pass - when collation == COLS0.
There are 4 possible states for all/most arabic characters:
?? - occurs as the only character in a word
?? - appears at the first of the word
?? - appears at the middle of a word
?? - appears at the end of the word
Usually only the simple version of the letter is stored.
Therefore we should not have to worry about sub-collation
of these characters.
The arabic characters with diacritics differ however. The alef has
sub-collation values to sort correctly. There is not any more room
to add more collation values. Some chars in CS14 are combined when
urdu, pashto and sindhi characters overlap.
****************************************************************************/
static FLMBYTE fwp_ar160Tbl[ AR1TBLLEN + 2] =
{
38, // starting offset
AR1TBLLEN, // len of table
// [13,38]
COLLS+2, // , comma
COLLS+3, // : colon
// [13,40]
COLLS+7, // ? question mark
COLS4+2, // * asterick
COLS6, // % percent
COLS9+41, // >> alphabetic - end of list)
COLS9+40, // << alphabetic - end of list)
COLS2, // (
COLS2+1, // )
// [13,47]
COLS8+1, // ?? One
COLS8+2, // ?? Two
COLS8+3, // ?? Three
// [13,50]
COLS8+4, // ?? Four
COLS8+5, // ?? Five
COLS8+6, // ?? Six
COLS8+7, // ?? Seven
COLS8+8, // ?? Eight
COLS8+9, // ?? Nine
COLS8+0, // ?? Zero
COLS8+2, // ?? Two (Handwritten)
COLS10a+1, // ?? alif
COLS10a+1, // ?? alif
// [13,60]
COLS10a+2, // ?? ba
COLS10a+2, // ?? ba
COLS10a+2, // ?? ba
COLS10a+2, // ?? ba
COLS10a+6, // ?? ta
COLS10a+6, // ?? ta
COLS10a+6, // ?? ta
COLS10a+6, // ?? ta
COLS10a+8, // ?? tha
COLS10a+8, // ?? tha
// [13,70]
COLS10a+8, // ?? tha
COLS10a+8, // ?? tha
COLS10a+12, // ?? jiim
COLS10a+12, // ?? jiim
COLS10a+12, // ?? jiim
COLS10a+12, // ?? jiim
COLS10a+16, // ?? Ha
COLS10a+16, // ?? Ha
COLS10a+16, // ?? Ha
COLS10a+16, // ?? Ha
// [13,80]
COLS10a+17, // ?? kha
COLS10a+17, // ?? kha
COLS10a+17, // ?? kha
COLS10a+17, // ?? kha
COLS10a+20, // ?? dal
COLS10a+20, // ?? dal
COLS10a+22, // ?? dhal
COLS10a+22, // ?? dhal
COLS10a+27, // ?? ra
COLS10a+27, // ?? ra
// [13,90]
COLS10a+29, // ?? ziin
COLS10a+29, // ?? ziin
COLS10a+31, // ?? siin
COLS10a+31, // ?? siin
COLS10a+31, // ?? siin
COLS10a+31, // ?? siin
COLS10a+32, // ?? shiin
COLS10a+32, // ?? shiin
COLS10a+32, // ?? shiin
COLS10a+32, // ?? shiin
// [13,100]
COLS10a+34, // ?? Sad
COLS10a+34, // ?? Sad
COLS10a+34, // ?? Sad
COLS10a+34, // ?? Sad
COLS10a+35, // ?? Dad
COLS10a+35, // ?? Dad
COLS10a+35, // ?? Dad
COLS10a+35, // ?? Dad
COLS10a+36, // ?? Ta
COLS10a+36, // ?? Ta
// [13,110]
COLS10a+36, // ?? Ta
COLS10a+36, // ?? Ta
COLS10a+37, // ?? Za
COLS10a+37, // ?? Za
COLS10a+37, // ?? Za
COLS10a+37, // ?? Za
COLS10a+38, // ?? 'ain
COLS10a+38, // ?? 'ain
COLS10a+38, // ?? 'ain
COLS10a+38, // ?? 'ain
// [13,120]
COLS10a+39, // ?? ghain
COLS10a+39, // ?? ghain
COLS10a+39, // ?? ghain
COLS10a+39, // ?? ghain
COLS10a+40, // ?? fa
COLS10a+40, // ?? fa
COLS10a+40, // ?? fa
COLS10a+40, // ?? fa
COLS10a+42, // ?? Qaf
COLS10a+42, // ?? Qaf
// [13,130]
COLS10a+42, // ?? Qaf
COLS10a+42, // ?? Qaf
COLS10a+43, // ?? kaf
COLS10a+43, // ?? kaf
COLS10a+43, // ?? kaf
COLS10a+43, // ?? kaf
COLS10a+46, // ?? lam
COLS10a+46, // ?? lam
COLS10a+46, // ?? lam
COLS10a+46, // ?? lam
// [13,140]
COLS10a+47, // ?? miim
COLS10a+47, // ?? miim
COLS10a+47, // ?? miim
COLS10a+47, // ?? miim
COLS10a+48, // ?? nuun
COLS10a+48, // ?? nuun
COLS10a+48, // ?? nuun
COLS10a+48, // ?? nuun
COLS10a+49, // ?? ha
COLS10a+49, // ?? ha
// [13,150]
COLS10a+49, // ?? ha
COLS10a+49, // ?? ha
// ha is also 51 for non-arabic
COLS10a+6, // ?? ta marbuuTah
COLS10a+6, // ?? ta marbuuTah
COLS10a+50, // ?? waw
COLS10a+50, // ?? waw
COLS10a+53, // ?? ya
COLS10a+53, // ?? ya
COLS10a+53, // ?? ya
COLS10a+53, // ?? ya
// [13,160]
COLS10a+52, // ?? alif maqSuurah
COLS10a+52, // ?? ya maqSuurah?
COLS10a+52, // ?? ya maqSuurah?
COLS10a+52, // ?? alif maqSuurah
COLS10a+0, // ?? hamzah accent - never appears alone
// [13,165]
// Store the sub-collation as the actual
// character value from this point on
COLS10a+1, // ?? alif hamzah
COLS10a+1, // ?? alif hamzah
COLS10a+1, // ?? hamzah-under-alif
COLS10a+1, // ?? hamzah-under-alif
COLS10a+1, // ?? waw hamzah
// [13,170]
COLS10a+1, // ?? waw hamzah
COLS10a+1, // ?? ya hamzah
COLS10a+1, // ?? ya hamzah
COLS10a+1, // ?? ya hamzah
COLS10a+1, // ?? ya hamzah
COLS10a+1, // ?? alif fatHataan
COLS10a+1, // ?? alif fatHataan
COLS10a+1, // ?? alif maddah
COLS10a+1, // ?? alif maddah
COLS10a+1, // ?? alif waSlah
// [13,180]
COLS10a+1, // ?? alif waSlah (final)
// LIGATURES
// Should NEVER be stored so will not worry
// about breaking up into pieces for collation.
// NOTE:
// Let's store the "Lam" collation value (+42)
// below and in the sub-collation store the
// actual character. This will sort real close.
// The best implementation is to
// break up ligatures into its base pieces.
COLS10a+46, // ?? lamalif
COLS10a+46, // ?? lamalif
COLS10a+46, // ?? lamalif hamzah
COLS10a+46, // ?? lamalif hamzah
COLS10a+46, // ?? hamzah-under-lamalif
COLS10a+46, // ?? hamzah-under-lamalif
COLS10a+46, // ?? lamalif fatHataan
COLS10a+46, // ?? lamalif fatHataan
COLS10a+46, // ?? lamalif maddah
// [13,190]
COLS10a+46, // ?? lamalif maddah
COLS10a+46, // ?? lamalif waSlah
COLS10a+46, // ?? lamalif waSlah
COLS10a+46, // ?? Allah - khaDalAlif
COLS0_ARABIC, // ?? taTwiil - character extension - throw out
COLS0_ARABIC // ?? taTwiil 1/6 - character extension - throw out
};
/****************************************************************************
Desc:	Sub-collation values for the alef family, one entry per character
		from [13,165] through [13,180].

		When the collation value is COLS10a+1 and the character is >= 165,
		index through this table.  Otherwise the character is the plain alef
		[13,58] and the sub-collation value should be 7.  Alef maddah is the
		default and gets 0.

		COLS10a+6 has to be checked by hand.  It should sort as:
			[13,152]..[13,153] - taa marbuuTah - no sub-collation
			[13,64] ..[13,67]  - taa           - sub-collation of 1
****************************************************************************/
static FLMBYTE fwp_alefSubColTbl[] =
{
	1, 1,				// [13,165]..[13,166]  alif hamzah
	3, 3,				// [13,167]..[13,168]  hamzah-under-alif
	2, 2,				// [13,169]..[13,170]  waw hamzah
	4, 4, 4, 4,		// [13,171]..[13,174]  ya hamzah
	5, 5,				// [13,175]..[13,176]  alif fatHataan
	0, 0,				// [13,177]..[13,178]  alif maddah
	6, 6				// [13,179]..[13,180]  alif waSlah (last one is the final form)
};
/****************************************************************************
Desc:	Collation table for the Arabic character set F_CHSARB2 (entries are
		labeled [14,n] below).  The first two bytes hold the starting
		character offset (41) and the length of the table (AR2TBLLEN); the
		remaining bytes are the collation values for [14,41]..[14,219].

		Several distinct letters share one collation value here; those runs
		are marked "forced to combine" in the entry comments.
****************************************************************************/
static FLMBYTE fwp_ar260Tbl[ AR2TBLLEN + 2] =
{
41, // starting offset
AR2TBLLEN, // len of table
// [14,41]
COLS8+4, // Farsi and Urdu Four
COLS8+4, // Urdu Four
COLS8+5, // Farsi and Urdu Five
COLS8+6, // Farsi Six
COLS8+6, // Farsi and Urdu Six
COLS8+7, // Urdu Seven
COLS8+8, // Urdu Eight
COLS10a+3, // Sindhi bb - baa /w 2 dots below (67b)
COLS10a+3,
COLS10a+3,
COLS10a+3,
COLS10a+4, // Sindhi bh - baa /w 4 dots below (680)
COLS10a+4,
COLS10a+4,
COLS10a+4,
// [14,56]
COLS10a+5, // Malay, Kurdish, Pashto, Farsi, Sindhi, and Urdu p
COLS10a+5, // =peh - taa /w 3 dots below (67e)
COLS10a+5,
COLS10a+5,
COLS10a+7, // Urdu T - taa /w small tah
COLS10a+7,
COLS10a+7,
COLS10a+7,
COLS10a+7, // Pashto T - taa /w ring (forced to combine)
COLS10a+7,
COLS10a+7,
COLS10a+7,
COLS10a+9, // Sindhi th - taa /w 4 dots above (67f)
COLS10a+9,
// [14,70]
COLS10a+9,
COLS10a+9,
COLS10a+10, // Sindhi Tr - taa /w 3 dots above (67d)
COLS10a+10,
COLS10a+10,
COLS10a+10,
COLS10a+11, // Sindhi Th - taa /w 2 dots above (67a)
COLS10a+11,
COLS10a+11,
COLS10a+11,
COLS10a+13, // Sindhi jj - haa /w 2 middle dots vertical (684)
COLS10a+13,
COLS10a+13,
COLS10a+13,
COLS10a+14, // Sindhi ny - haa /w 2 middle dots (683)
COLS10a+14,
COLS10a+14,
COLS10a+14,
// [14,88]
COLS10a+15, // Malay, Kurdish, Pashto, Farsi, Sindhi, and Urdu ch
COLS10a+15, // =tcheh (686)
COLS10a+15,
COLS10a+15,
COLS10a+15, // Sindhi chh - haa /w middle 4 dots (687)
COLS10a+15, // forced to combine
COLS10a+15,
COLS10a+15,
COLS10a+18, // Pashto ts - haa /w 3 dots above (685)
COLS10a+18,
COLS10a+18,
COLS10a+18,
COLS10a+19, // Pashto dz - hamzah on haa (681)
COLS10a+19,
COLS10a+19,
COLS10a+19,
// [14,104]
COLS10a+21, // Urdu D - dal /w small tah (688)
COLS10a+21,
COLS10a+21, // Pashto D - dal /w ring (689) forced to combine
COLS10a+21,
COLS10a+23, // Sindhi dh - dal /w 2 dots above (68c)
COLS10a+23,
COLS10a+24, // Sindhi D - dal /w 3 dots above (68e)
COLS10a+24,
COLS10a+25, // Sindhi Dr - dal /w dot below (68a)
COLS10a+25,
COLS10a+26, // Sindhi Dh - dal /w 2 dots below (68d)
COLS10a+26,
COLS10a+28, // Pashto r - ra /w ring (693)
COLS10a+28,
// [14,118]
COLS10a+28, // Urdu R - ra /w small tah (691) forced to combine
COLS10a+28,
COLS10a+28, // Sindhi r - ra /w 4 dots above (699) forced to combine
COLS10a+28,
COLS10a+27, // Kurdish rolled r - ra /w 'v' below (695)
COLS10a+27,
COLS10a+27,
COLS10a+27,
// [14,126]
COLS10a+30, // Kurdish, Pashto, Farsi, Sindhi, and Urdu Z
COLS10a+30, // = jeh - ra /w 3 dots above (698)
COLS10a+30, // Pashto zz - ra /w dot below & dot above (696)
COLS10a+30, // forced to combine
COLS10a+30, // Pashto g - not in unicode! - forced to combine
COLS10a+30,
COLS10a+33, // Pashto x - seen dot below & above (69a)
COLS10a+33,
COLS10a+33,
COLS10a+33,
COLS10a+39, // Malay ng - old Malay ain /w 3 dots above (6a0)
COLS10a+39, // forced to combine
COLS10a+39,
COLS10a+39,
// [14,140]
COLS10a+41, // Malay p, Kurdish v - Farsi ? - fa /w 3 dots above
COLS10a+41, // = veh - means foreign words (6a4)
COLS10a+41,
COLS10a+41,
COLS10a+41, // Sindhi ph - fa /w 4 dots above (6a6) forced to combine
COLS10a+41,
COLS10a+41,
COLS10a+41,
// [14,148]
COLS10a+43, // Misc k - open caf (6a9)
COLS10a+43,
COLS10a+43,
COLS10a+43,
COLS10a+43, // misc k - no unicode - forced to combine
COLS10a+43,
COLS10a+43,
COLS10a+43,
COLS10a+43, // Sindhi k - swash caf (various) (6aa) - forced to combine
COLS10a+43,
COLS10a+43,
COLS10a+43,
// [14,160]
COLS10a+44, // Persian/Urdu g - gaf (6af)
COLS10a+44,
COLS10a+44,
COLS10a+44,
COLS10a+44, // Persian/Urdu g - no unicode
COLS10a+44,
COLS10a+44,
COLS10a+44,
COLS10a+44, // Malay g - gaf /w ring (6b0)
COLS10a+44,
COLS10a+44,
COLS10a+44,
COLS10a+44, // Sindhi ng - gaf /w 2 dots above (6ba)
COLS10a+44, // forced to combine ng only
// NOTE(review): (6ba) is also cited for the Urdu dotless noon at
// [14,186] below - one of the two code points is probably a typo; confirm.
COLS10a+44,
COLS10a+44,
COLS10a+45, // Sindhi gg - gaf /w 2 dots vertical below (6b3)
COLS10a+45,
COLS10a+45,
COLS10a+45,
// [14,180]
COLS10a+46, // Kurdish velar l - lam /w small v (6b5)
COLS10a+46,
COLS10a+46,
COLS10a+46,
COLS10a+46, // Kurdish Lamalif with diacritic - no unicode
COLS10a+46,
// [14,186]
COLS10a+48, // Urdu n - dotless noon (6ba)
COLS10a+48,
COLS10a+48,
COLS10a+48,
COLS10a+48, // Pashto N - noon /w ring (6bc) - forced to combine
COLS10a+48,
COLS10a+48,
COLS10a+48,
COLS10a+48, // Sindhi N - dotless noon /w small tah (6bb)
COLS10a+48, // forced to combine
COLS10a+48,
COLS10a+48,
COLS10a+50, // Kurdish o - waw /w small v (6c6)
COLS10a+50,
// [14,200]
COLS10a+50, // Kurdish o - waw /w bar above (6c5)
COLS10a+50,
COLS10a+50, // Kurdish o - waw /w 2 dots above (6ca)
COLS10a+50,
// [14,204]
COLS10a+51, // Urdu h - no unicode
COLS10a+51,
COLS10a+51,
COLS10a+51,
COLS10a+52, // Kurdish ? - ya /w small v (6ce)
COLS10a+52,
COLS10a+52,
COLS10a+52,
// [14,212]
COLS10a+54, // Urdu y - ya barree (6d2)
COLS10a+54,
COLS10a+54, // Malay ny - ya /w 3 dots below (6d1) forced to combine
COLS10a+54,
COLS10a+54,
COLS10a+54,
// [14,218]
COLS10a+51, // Farsi hamzah - hamzah on ha (6c0) forced to combine
COLS10a+51
};
/****************************************************************************
Desc:	Bit table, eight characters per byte, starting at character 64.
		If the bit for a character is set, the character itself is saved in
		the sub-collation area.  The bit values were determined by looking at
		the FLAIM COLTBL1 to see which characters are combined with other
		Arabic characters.

		Characters whose bit is clear "uncollate" to the correct place on
		their own.  Those clean areas are:
			48..63 (below the start of this table), 68..91, 96..117,
			126..127, 140..143, 160..163, 176..179, 212..213

		Judging by the per-byte notes, the high-order bit of each byte
		stands for the first character of its group of eight.
****************************************************************************/
static FLMBYTE fwp_ar2BitTbl[] =
{
	/*  64.. 71 */	0xF0,		// flagged:  64..67
	/*  72.. 79 */	0x00,
	/*  80.. 87 */	0x00,
	/*  88.. 95 */	0x0F,		// flagged:  92..95
	/*  96..103 */	0x00,
	/* 104..111 */	0x00,
	/* 112..119 */	0x03,		// flagged: 118..119
	/* 120..127 */	0xFC,		// flagged: 120..125
	/* 128..135 */	0xFF,		// flagged: 128..135
	/* 136..143 */	0xF0,		// flagged: 136..139
	/* 144..151 */	0xFF,		// flagged: 144..147, start of 148..159
	/* 152..159 */	0xFF,		// flagged: rest of 148..159
	/* 160..167 */	0x0F,		// flagged: start of 164..175
	/* 168..175 */	0xFF,		// flagged: rest of 164..175
	/* 176..183 */	0x0F,		// flagged: start of 180..185
	/* 184..191 */	0xFF,		// flagged: rest of 180..185, start of 186..197
	/* 192..199 */	0xFF,		// flagged: rest of 186..197, start of 198..203
	/* 200..207 */	0xFF,		// flagged: rest of 198..203, 204..207
	/* 208..215 */	0xF3,		// flagged: 208..211, start of 214..217
	/* 216..219 */	0xF0		// flagged: rest of 214..217, 218..219
};
/****************************************************************************
Desc:	Pairs each character set with the address of its collating table
		(described in the original notes as collating the 5.0 character
		sets).  There is one row per character set; the closing
		{0xFF, 0} row presumably marks the end of the list.
****************************************************************************/
static TBL_B_TO_BP fwp_col60Tbl[] =
{
	{ F_CHSASCI,	fwp_asc60Tbl  },
	{ F_CHSMUL1,	fwp_mn60Tbl   },
	{ F_CHSSYM1,	fwp_sym60Tbl  },
	{ F_CHSGREK,	fwp_grk60Tbl  },
	{ F_CHSCYR,		fwp_cyrl60Tbl },
	{ 0xFF,			0             }
};
/****************************************************************************
Desc:	Character set to collating table list used when sorting the
		Hebrew/Arabic languages.  The Hebrew and Arabic collation values
		overlap the end of the ASCII/European and Cyrillic tables, which is
		why the Cyrillic table is absent here.  F_CHSHEB is listed twice,
		once for each of its two tables.  The closing {0xff, 0} row
		presumably marks the end of the list.
****************************************************************************/
static TBL_B_TO_BP fwp_HebArabicCol60Tbl[] =
{
	{ F_CHSASCI,	fwp_asc60Tbl  },
	{ F_CHSMUL1,	fwp_mn60Tbl   },
	{ F_CHSSYM1,	fwp_sym60Tbl  },
	{ F_CHSGREK,	fwp_grk60Tbl  },
	{ F_CHSHEB,		fwp_heb60TblA },
	{ F_CHSHEB,		fwp_heb60TblB },
	{ F_CHSARB1,	fwp_ar160Tbl  },
	{ F_CHSARB2,	fwp_ar260Tbl  },
	{ 0xff,			0             }
};
/****************************************************************************
Desc:	Diacritic-to-collation table.  Indexed by the diacritic value (the
		first 26 characters of WP character set #1), it yields a 5 bit
		"diacritic collated value" (DCV) that gives the "correct" sorting
		sequence for that diacritic.

		The intent is that the collated character value together with the
		DCV can be turned back into the original WP character.

		The diacritics are ordered to fit the most languages.  Czech,
		Swedish, and Finnish have to reposition the umlaut manually so that
		it sorts after the ring above (for SU, SV and CZ the umlaut
		becomes 9).
****************************************************************************/
static FLMBYTE fwp_dia60Tbl[] =
{
	 2,		// [ 0] grave
	16,		// [ 1] centerd
	 7,		// [ 2] tilde
	 4,		// [ 3] circum
	12,		// [ 4] crossb
	10,		// [ 5] slash
	 1,		// [ 6] acute
	 6,		// [ 7] umlaut  (9 in SU, SV and CZ)
	17,		// [ 8] macron
	18,		// [ 9] aposab
	19,		// [10] aposbes
	20,		// [11] aposba
	21,		// [12] aposbc
	22,		// [13] abosbl
	 8,		// [14] ring
	13,		// [15] dota
	23,		// [16] dacute
	11,		// [17] cedilla
	14,		// [18] ogonek
	 5,		// [19] caron
	15,		// [20] stroke
	24,		// [21] bara
	 3,		// [22] breve
	 0,		// [23] dbls    (sorts as 'ss')
	25,		// [24] dotlesi
	26			// [25] dotlesj
};
/****************************************************************************
Desc:	For each character set, the range of characters within that set
		which are case convertible.  Each set contributes two bytes, the
		low and high end of its range; 0,0 is used for sets that have no
		case convertible characters.
****************************************************************************/
static FLMBYTE fwp_caseConvertableRange[] =
{
	26, 241,		// Multinational 1
	 0,   0,		// Multinational 2
	 0,   0,		// Box Drawing
	 0,   0,		// Symbol 1
	 0,   0,		// Symbol 2
	 0,   0,		// Math 1
	 0,   0,		// Math 2
	 0,  69,		// Greek 1
	 0,   0,		// Hebrew
	 0, 199,		// Cyrillic
	 0,   0,		// Japanese Kana
	 0,   0,		// User-defined
	 0,   0,		// Not defined
	 0,   0,		// Not defined
	 0,   0		// Not defined
};
/****************************************************************************
Desc:	Maps a collation value back to a WP character.  The table has
		COLS11 - COLLS entries and, per the entry comments, entry [n] belongs
		to collation value COLLS + n.

		A value of 0 means that the collation value has no character.
		The 0xffff entries sit in the slots reserved for multi-letter
		sequences (CH, LL) that have no single WP character.

		From cols9+42 on, the European entries share their slots with the
		start of the Hebrew collation values.
****************************************************************************/
static FLMUINT16 colToWPChr[ COLS11 - COLLS] =
{
0x20, // colls - <Spc>
0x2e, // colls+1 - .
0x2c, // colls+2 - ,
0x3a, // colls+3 - :
0x3b, // colls+4 - ;
0x21, // colls+5 - !
0, // colls+6 - NO VALUE
0x3f, // colls+7 - ?
0, // colls+8 - NO VALUE
0x22, // cols1 - "
0x27, // cols1+1 - '
0x60, // cols1+2 - `
0, // cols1+3 - NO VALUE
0, // cols1+4 - NO VALUE
0x28, // cols2 - (
0x29, // cols2+1 - )
0x5b, // cols2+2 - japanese angle brackets
0x5d, // cols2+3 - japanese angle brackets
0x7b, // cols2+4 - {
0x7d, // cols2+5 - }
0x24, // cols3 - $
0x413, // cols3+1 - cent
0x40b, // cols3+2 - pound
0x40c, // cols3+3 - yen
0x40d, // cols3+4 - pesetas
0x40e, // cols3+5 - florin
0x2b, // cols4 - +
0x2d, // cols4+1 - -
0x2a, // cols4+2 - *
0x2f, // cols4+3 - /
0x5e, // cols4+4 - ^
0, // cols4+5 - NO VALUE
0, // cols4+6 - NO VALUE
0, // cols4+7 - NO VALUE
0x3c, // cols5 - <
0, // cols5+1 - NO VALUE
0x3d, // cols5+2 - =
0, // cols5+3 - NO VALUE
0x3e, // cols5+4 - >
0, // cols5+5 - NO VALUE
0, // cols5+6 - NO VALUE
0, // cols5+7 - NO VALUE
0, // cols5+8 - NO VALUE
0, // cols5+9 - NO VALUE
0, // cols5+10 - NO VALUE
0, // cols5+11 - NO VALUE
0, // cols5+12 - NO VALUE
0, // cols5+13 - NO VALUE
0x25, // cols6 - %
0x23, // cols6+1 - #
0x26, // cols6+2 - &
0x40, // cols6+3 - @
0x5c, // cols6+4 - Backslash
0x5f, // cols6+5 - _
0x7c, // cols6+6 - |
0x7e, // cols6+7 - ~
0, // cols6+8 - NO VALUE
0, // cols6+9 - NO VALUE
0, // cols6+10 - NO VALUE
0, // cols6+11 - NO VALUE
0, // cols6+12 - NO VALUE
0x800, // cols7 - Uppercase Alpha
0x802, // cols7+1 - Uppercase Beta
0x806, // cols7+2 - Uppercase Gamma
0x808, // cols7+3 - Uppercase Delta
0x80a, // cols7+4 - Uppercase Epsilon
0x80c, // cols7+5 - Uppercase Zeta
0x80e, // cols7+6 - Uppercase Eta
0x810, // cols7+7 - Uppercase Theta
0x812, // cols7+8 - Uppercase Iota
0x814, // cols7+9 - Uppercase Kappa
0x816, // cols7+10 - Uppercase Lambda
0x818, // cols7+11 - Uppercase Mu
0x81a, // cols7+12 - Uppercase Nu
0x81c, // cols7+13 - Uppercase Xi
0x81e, // cols7+14 - Uppercase Omicron
0x820, // cols7+15 - Uppercase Pi
0x822, // cols7+16 - Uppercase Rho
0x824, // cols7+17 - Uppercase Sigma
0x828, // cols7+18 - Uppercase Tau
0x82a, // cols7+19 - Uppercase Upsilon
0x82c, // cols7+20 - Uppercase Phi
0x82e, // cols7+21 - Uppercase Chi
0x830, // cols7+22 - Uppercase Psi
0x832, // cols7+23 - Uppercase Omega
0, // cols7+24 - NO VALUE
0x30, // cols8 - 0
0x31, // cols8+1 - 1
0x32, // cols8+2 - 2
0x33, // cols8+3 - 3
0x34, // cols8+4 - 4
0x35, // cols8+5 - 5
0x36, // cols8+6 - 6
0x37, // cols8+7 - 7
0x38, // cols8+8 - 8
0x39, // cols8+9 - 9
0x41, // cols9 - A
0x124, // cols9+1 - AE digraph
0x42, // cols9+2 - B
0x43, // cols9+3 - C
0xffff, // cols9+4 - CH in spanish
0x162, // cols9+5 - Holder for C caron in Czech
0x44, // cols9+6 - D
0x45, // cols9+7 - E
0x46, // cols9+8 - F
0x47, // cols9+9 - G
0x48, // cols9+10 - H
0xffff, // cols9+11 - CH in czech or dotless i in turkish
0x49, // cols9+12 - I
0x18a, // cols9+13 - IJ Digraph
0x4a, // cols9+14 - J
0x4b, // cols9+15 - K
0x4c, // cols9+16 - L
0xffff, // cols9+17 - LL in spanish
0x4d, // cols9+18 - M
0x4e, // cols9+19 - N
0x138, // cols9+20 - N Tilde
0x4f, // cols9+21 - O
0x1a6, // cols9+22 - OE digraph
0x50, // cols9+23 - P
0x51, // cols9+24 - Q
0x52, // cols9+25 - R
0x1aa, // cols9+26 - Holder for R caron in Czech
0x53, // cols9+27 - S
0x1b0, // cols9+28 - Holder for S caron in Czech
0x54, // cols9+29 - T
0x55, // cols9+30 - U
0x56, // cols9+31 - V
0x57, // cols9+32 - W
0x58, // cols9+33 - X
0x59, // cols9+34 - Y
0x5a, // cols9+35 - Z
0x1ce, // cols9+36 - Holder for Z caron in Czech
0x158, // cols9+37 - Uppercase Thorn
0, // cols9+38 - ???
0, // cols9+39 - ???
0x5b, // cols9+40 - [ (note: alphabetic - end of list)
0x5d, // cols9+41 - ] (note: alphabetic - end of list)
// 0xAA - also start of Hebrew
0x124, // cols9+42 - AE digraph - DK
0x124, // cols9+43 - AE digraph - NO
0x122, // cols9+44 - A ring - SW
0x11E, // cols9+45 - A diaeresis - DK
0x124, // cols9+46 - AE digraph - IC
0x150, // cols9+47 - O slash - NO
0x11e, // cols9+48 - A diaeresis - SW
0x150, // cols9+49 - O slash - DK
0x13E, // cols9+50 - O Diaeresis - IC
0x122, // cols9+51 - A ring - NO
0x13E, // cols9+52 - O Diaeresis - SW
0x13E, // cols9+53 - O Diaeresis - DK
0x150, // cols9+54 - O slash - IC
0x122, // cols9+55 - A ring - DK
0x124, // cols9+56 - AE digraph future
0x13E, // cols9+57 - O Diaeresis future
0x150, // cols9+58 - O slash future
0, // cols9+59 - NOT USED future
0xA00, // cols10 - Russian A
0xA02, // cols10+1 - Russian BE
0xA04, // cols10+2 - Russian VE
0xA06, // cols10+3 - Russian GHE
0xA46, // cols10+4 - Ukrainian HARD G
0xA08, // cols10+5 - Russian DE
0xA4a, // cols10+6 - Serbian SOFT DJ
0xA44, // cols10+7 - Macedonian SOFT DJ
0xA0a, // cols10+8 - Russian E
0xA0c, // cols10+9 - Russian YO
0xA4e, // cols10+10 - Ukrainian YE
0xA0e, // cols10+11 - Russian ZHE
0xA10, // cols10+12 - Russian ZE
0xA52, // cols10+13 - Macedonian ZELO
0xA12, // cols10+14 - Russian I
0xA58, // cols10+15 - Ukrainian I
0xA5a, // cols10+16 - Ukrainian I with Two dots
0xA14, // cols10+17 - Russian SHORT I
0xA5e, // cols10+18 - Serbian--Macedonian JE
0xA16, // cols10+19 - Russian KA
0xA18, // cols10+20 - Russian EL
0xA68, // cols10+21 - Serbian--Macedonian SOFT L
0xA1a, // cols10+22 - Russian EM
0xA1c, // cols10+23 - Russian EN
0xA6c, // cols10+24 - Serbian--Macedonian SOFT N
0xA1e, // cols10+25 - Russian O
0xA20, // cols10+26 - Russian PE
0xA22, // cols10+27 - Russian ER
0xA24, // cols10+28 - Russian ES
0xA26, // cols10+29 - Russian TE
0xA72, // cols10+30 - Serbian SOFT T
0xA60, // cols10+31 - Macedonian SOFT K
0xA28, // cols10+32 - Russian U
0xA74, // cols10+33 - Byelorussian SHORT U
0xA2a, // cols10+34 - Russian EF
0xA2c, // cols10+35 - Russian HA
0xA2e, // cols10+36 - Russian TSE
0xA30, // cols10+37 - Russian CHE
0xA86, // cols10+38 - Serbian HARD DJ
0xA32, // cols10+39 - Russian SHA
0xA34, // cols10+40 - Russian SHCHA
0xA36, // cols10+41 - Russian ER (also hard sign)
0xA38, // cols10+42 - Russian ERY
0xA3a, // cols10+43 - Russian SOFT SIGN
0xA8e, // cols10+44 - Old Russian YAT
0xA3c, // cols10+45 - Russian uppercase REVERSE E
0xA3e, // cols10+46 - Russian YU
0xA40, // cols10+47 - Russian YA
0xA3a, // cols10+48 - Russian SOFT SIGN - UKRAIN ONLY
0 // cols10+49 - future
};
/****************************************************************************
Desc:	Maps an Arabic collation value back to a WP character.  Entry [n]
		belongs to collation value COLS10a + n.  The high byte of each value
		is the character set (0x0D or 0x0E) and the number added to it is
		the character within that set.

		Where several characters share one collation value, the entry holds
		the first of them and the others are listed in the comments that
		follow the entry.
****************************************************************************/
static FLMUINT16 HebArabColToWPChr[] =
{
// Start at COLS10a+0
// [0]
0x0D00 +164, // hamzah
0x0D00 + 58, // [13,177] alef maddah
// NOTE(review): the value stored here is [13,58], which
// ArabSubColToWPChr labels as the plain alef; the
// [13,177] label looks like it names the default
// sub-collation instead - confirm.
// Read subcollation to get other alef values
0x0D00 + 60, // baa
0x0E00 + 48, // Sindhi bb
0x0E00 + 52, // Sindhi bh
0x0E00 + 56, // Misc p = peh
0x0D00 +152, // taa marbuuTah
// subcollation of 1 is taa [13,64]
0x0E00 + 60, // Urdu T [14,60]
// Pashto T [14,64]
// [8]
0x0D00 + 68, // thaa
0x0E00 + 68, // Sindhi th
0x0E00 + 72, // Sindhi tr
0x0E00 + 76, // Sindhi Th
0x0D00 + 72, // jiim - jeem
0x0E00 + 80, // Sindhi jj
0x0E00 + 84, // Sindhi ny
0x0E00 + 88, // Misc ch
// Sindhi chh [14,92]
// [16]
0x0D00 + 76, // Haa
0x0D00 + 80, // khaa
0x0E00 + 96, // Pashto ts
0x0E00 +100, // Pashto dz
0x0D00 + 84, // dal
0x0E00 +104, // Urdu D
// Pashto D
0x0D00 + 86, // thal
0x0E00 +108, // Sindhi dh
// [24]
0x0E00 +110, // Sindhi D
0x0E00 +112, // Sindhi Dr
0x0E00 +114, // Sindhi Dh
0x0D00 + 88, // ra
// Kurdish rolled r [14,122]
0x0E00 +116, // Pashto r [14,116] - must pick this!
// Urdu R [14,118]
// Sindhi r [14,120]
0x0D00 + 90, // zain
0x0E00 +126, // Misc Z=jeh [14,126]
// Pashto zz [14,128]
// Pashto g [14,130]
0x0D00 + 92, // seen
// [32]
0x0D00 + 96, // sheen
0x0E00 +132, // Pashto x
0x0D00 +100, // Sad
0x0D00 +104, // Dad
0x0D00 +108, // Tah
0x0D00 +112, // Za (dhah)
0x0D00 +116, // 'ain
0x0D00 +120, // ghain
// Malay ng [14,136]
// [40]
0x0D00 +124, // fa
0x0E00 +140, // Malay p, Kurdish v = veh
// Sindhi ph [14,144]
0x0D00 +128, // Qaf
0x0D00 +132, // kaf (caf)
// Misc k [14,148]
// misc k - no unicode [14,152]
// Sindhi k [14,156]
0x0E00 +160, // Persian/Urdu gaf
// gaf - no unicode [14,164]
// Malay g [14,168]
// Sindhi ng [14,172]
0x0E00 +176, // Sindhi gg
0x0D00 +136, // lam - all ligature variants
// Kurdish velar lam [14,180]
// Kurdish lamalef - no unicode [14,184]
0x0D00 +140, // meem
// [48]
0x0D00 +144, // noon
// Urdu n [14,186]
// Pashto N [14,190]
// Sindhi N [14,194]
0x0D00 +148, // ha - arabic language only!
0x0D00 +154, // waw
// Kurdish o [14,198]
// Kurdish o with bar [14,200]
// Kurdish o with 2 dots [14,202]
0x0D00 +148, // ha - non-arabic language
// Urdu h [14,204]
// Farsi hamzah on ha [14,218]
0x0D00 +160, // alef maqsurah
// Kurdish e - ya /w small v
0x0D00 +156, // ya
0x0E00 +212 // Urdu ya barree
// Malay ny [14,214]
};
/****************************************************************************
Desc:	Maps an Arabic sub-collation value back to a WP character in
		character set 0x0D.  Entry [n] is the character for sub-collation
		value n.  Entry [0], alef maddah, is the default value and is only
		here for documentation.
****************************************************************************/
static FLMUINT16 ArabSubColToWPChr[] =
{
	0x0D00 + 177,		// [0] alef maddah (default value)
	0x0D00 + 165,		// [1] alef hamzah
	0x0D00 + 169,		// [2] waw hamzah
	0x0D00 + 167,		// [3] hamzah under alef
	0x0D00 + 171,		// [4] ya hamzah
	0x0D00 + 175,		// [5] alef fathattan
	0x0D00 + 179,		// [6] alef waslah
	0x0D00 +  58,		// [7] alef
	0x0D00 +  64		// [8] taa - sorts after taa marbuuTah
};
/****************************************************************************
Desc:	Turns a collated diacritic value (the sort value) back into the
		original diacritic value.  Entry [n] is the diacritic whose sort
		value is n.  The umlaut appears twice: at 6, and at 9 where it sorts
		after the ring in SU, SV and CZ.
****************************************************************************/
static FLMBYTE ml1_COLtoD[27] =
{
	23,		// [ 0] dbls    (sorts as 'ss')
	 6,		// [ 1] acute
	 0,		// [ 2] grave
	22,		// [ 3] breve
	 3,		// [ 4] circum
	19,		// [ 5] caron
	 7,		// [ 6] umlaut
	 2,		// [ 7] tilde
	14,		// [ 8] ring
	 7,		// [ 9] umlaut  (SU, SV & CZ - after ring)
	 5,		// [10] slash
	17,		// [11] cedilla
	 4,		// [12] crossb
	15,		// [13] dota
	18,		// [14] ogonek
	20,		// [15] stroke
	 1,		// [16] centerd
	 8,		// [17] macron
	 9,		// [18] aposab
	10,		// [19] aposbes
	11,		// [20] aposba
	12,		// [21] aposbc
	13,		// [22] abosbl
	16,		// [23] dacute
	21,		// [24] bara
	24,		// [25] dotlesi
	25			// [26] dotlesj
};
/****************************************************************************
Desc:	Kana lookup by collation position.  Each entry holds the kana value
		for one syllable; the comment on each entry lists the kana values
		that belong to that syllable (small/large and voiced variants).
Notes:	Only 48 values + 0x40, 0x41, 0x42 (169..171)
		Original note on the KA entry: "remember voicing table is optimized
		so that zero value is position and if voice=1 and no 0 is changed
		to 0".
****************************************************************************/
static FLMBYTE ColToKanaTbl[ 48] =
{
	// vowels
	 0,		// a=0, A=1
	 2,		// i=2, I=3
	 4,		// u=4, U=5, VU=83
	 6,		// e=6, E=7
	 8,		// o=8, O=9

	// K row
	84,		// KA=10, GA=11, ka=84
	12,		// KI=12, GI=13
	14,		// KU=14, GU=15
	85,		// KE=16, GE=17, ke=85
	18,		// KO=18, GO=19

	// S row
	20,		// SA=20, ZA=21
	22,		// SHI=22, JI=23
	24,		// SU=24, ZU=25
	26,		// SE=26, ZE=27
	28,		// SO=28, ZO=29

	// T row
	30,		// TA=30, DA=31
	32,		// CHI=32, JI=33
	34,		// tsu=34, TSU=35, ZU=36
	37,		// TE=37, DE=38
	39,		// TO=39, DO=40

	// N row
	41,		// NA
	42,		// NI
	43,		// NU
	44,		// NE
	45,		// NO

	// H row
	46,		// HA, BA, PA
	49,		// HI, BI, PI
	52,		// FU, BU, PU
	55,		// HE, BE, PE
	58,		// HO, BO, PO

	// M row
	61,		// MA
	62,		// MI
	63,		// MU
	64,		// ME
	65,		// MO

	// Y row
	66,		// ya, YA
	68,		// yu, YU
	70,		// yo, YO

	// R row
	72,		// RA
	73,		// RI
	74,		// RU
	75,		// RE
	76,		// RO

	// W row and N
	77,		// wa, WA
	79,		// WI
	80,		// WE
	81,		// WO
	82			// N
};
/****************************************************************************
Desc:	Two-letter language codes, stored as two consecutive bytes per
		language with no terminator.  The table is declared with
		FLM_LAST_LANG + FLM_LAST_LANG bytes.
****************************************************************************/
static FLMBYTE f_langtbl[ FLM_LAST_LANG + FLM_LAST_LANG] =
{
	'U', 'S',		// [ 0] English, United States
	'A', 'F',		// [ 1] Afrikaans
	'A', 'R',		// [ 2] Arabic
	'C', 'A',		// [ 3] Catalan
	'H', 'R',		// [ 4] Croatian
	'C', 'Z',		// [ 5] Czech
	'D', 'K',		// [ 6] Danish
	'N', 'L',		// [ 7] Dutch
	'O', 'Z',		// [ 8] English, Australia
	'C', 'E',		// [ 9] English, Canada
	'U', 'K',		// [10] English, United Kingdom
	'F', 'A',		// [11] Farsi
	'S', 'U',		// [12] Finnish
	'C', 'F',		// [13] French, Canada
	'F', 'R',		// [14] French, France
	'G', 'A',		// [15] Galician
	'D', 'E',		// [16] German, Germany
	'S', 'D',		// [17] German, Switzerland
	'G', 'R',		// [18] Greek
	'H', 'E',		// [19] Hebrew
	'M', 'A',		// [20] Hungarian
	'I', 'S',		// [21] Icelandic
	'I', 'T',		// [22] Italian
	'N', 'O',		// [23] Norwegian
	'P', 'L',		// [24] Polish
	'B', 'R',		// [25] Portuguese, Brazil
	'P', 'O',		// [26] Portuguese, Portugal
	'R', 'U',		// [27] Russian
	'S', 'L',		// [28] Slovak
	'E', 'S',		// [29] Spanish
	'S', 'V',		// [30] Swedish
	'Y', 'K',		// [31] Ukrainian
	'U', 'R',		// [32] Urdu
	'T', 'K',		// [33] Turkey
	'J', 'P',		// [34] Japanese
	'K', 'R',		// [35] Korean
	'C', 'T',		// [36] Chinese-Traditional
	'C', 'S',		// [37] Chinese-Simplified
	'L', 'A'			// [38] Future asian language
};
/****************************************************************************
Desc: UNICODE to WP6 character mapping table
Notes: This table is used to convert a subset of Unicode characters to
their WordPerfect equivalents so that the WP collation routines
can be used for indexing. This contains characters that can be
mapped 1:1 from Unicode->WP and from WP->Unicode. There is
no ambiguity and there are no character expansions or
contractions.
****************************************************************************/
#define UTOWP60_ENTRIES 1502
static FLMUINT16 WP_UTOWP60[ UTOWP60_ENTRIES][2] =
{
{ 0x00A1, 0x0407 }, // 7 , 4
{ 0x00A2, 0x0413 }, // 19 , 4
{ 0x00A3, 0x040b }, // 11 , 4
{ 0x00A4, 0x0418 }, // 24 , 4
{ 0x00A5, 0x040c }, // 12 , 4
{ 0x00A7, 0x0406 }, // 6 , 4
{ 0x00A9, 0x0417 }, // 23 , 4
{ 0x00AA, 0x040f }, // 15 , 4
{ 0x00AB, 0x0409 }, // 9 , 4
{ 0x00AC, 0x0614 }, // 20 , 6
{ 0x00AE, 0x0416 }, // 22 , 4
{ 0x00B0, 0x0624 }, // 36 , 6
{ 0x00B1, 0x0601 }, // 1 , 6
{ 0x00B2, 0x0414 }, // 20 , 4
{ 0x00B3, 0x041a }, // 26 , 4
{ 0x00B5, 0x0625 }, // 37 , 6
{ 0x00B6, 0x0405 }, // 5 , 4
{ 0x00B7, 0x0101 }, // 101, 1
{ 0x00B9, 0x044e }, // 78 , 4
{ 0x00BA, 0x0410 }, // 16 , 4
{ 0x00BB, 0x040a }, // 10 , 4
{ 0x00BC, 0x0412 }, // 18 , 4
{ 0x00BD, 0x0411 }, // 17 , 4
{ 0x00BE, 0x0419 }, // 25 , 4
{ 0x00BF, 0x0408 }, // 8 , 4
{ 0x00C0, 0x0120 }, // 32 , 1
{ 0x00C1, 0x011a }, // 26 , 1
{ 0x00C2, 0x011c }, // 28 , 1
{ 0x00C3, 0x014c }, // 76 , 1
{ 0x00C4, 0x011e }, // 30 , 1
{ 0x00C5, 0x0122 }, // 34 , 1
{ 0x00C6, 0x0124 }, // 36 , 1
{ 0x00C7, 0x0126 }, // 38 , 1
{ 0x00C8, 0x012e }, // 46 , 1
{ 0x00C9, 0x0128 }, // 40 , 1
{ 0x00CA, 0x012a }, // 42 , 1
{ 0x00CB, 0x012c }, // 44 , 1
{ 0x00CC, 0x0136 }, // 54 , 1
{ 0x00CD, 0x0130 }, // 48 , 1
{ 0x00CE, 0x0132 }, // 50 , 1
{ 0x00CF, 0x0134 }, // 52 , 1
{ 0x00D0, 0x0156 }, // 86 , 1
{ 0x00D1, 0x0138 }, // 56 , 1
{ 0x00D2, 0x0140 }, // 64 , 1
{ 0x00D3, 0x013a }, // 58 , 1
{ 0x00D4, 0x013c }, // 60 , 1
{ 0x00D5, 0x0152 }, // 82 , 1
{ 0x00D6, 0x013e }, // 62 , 1
{ 0x00D7, 0x0627 }, // 39 , 6
{ 0x00D8, 0x0150 }, // 80 , 1
{ 0x00D9, 0x0148 }, // 72 , 1
{ 0x00DA, 0x0142 }, // 66 , 1
{ 0x00DB, 0x0144 }, // 68 , 1
{ 0x00DC, 0x0146 }, // 70 , 1
{ 0x00DD, 0x0154 }, // 84 , 1
{ 0x00DE, 0x0158 }, // 88 , 1
{ 0x00DF, 0x0117 }, // 23 , 1
{ 0x00E0, 0x0121 }, // 33 , 1
{ 0x00E1, 0x011b }, // 27 , 1
{ 0x00E2, 0x011d }, // 29 , 1
{ 0x00E3, 0x014d }, // 77 , 1
{ 0x00E4, 0x011f }, // 31 , 1
{ 0x00E5, 0x0123 }, // 35 , 1
{ 0x00E6, 0x0125 }, // 37 , 1
{ 0x00E7, 0x0127 }, // 39 , 1
{ 0x00E8, 0x012f }, // 47 , 1
{ 0x00E9, 0x0129 }, // 41 , 1
{ 0x00EA, 0x012b }, // 43 , 1
{ 0x00EB, 0x012d }, // 45 , 1
{ 0x00EC, 0x0137 }, // 55 , 1
{ 0x00ED, 0x0131 }, // 49 , 1
{ 0x00EE, 0x0133 }, // 51 , 1
{ 0x00EF, 0x0135 }, // 53 , 1
{ 0x00F0, 0x0157 }, // 87 , 1
{ 0x00F1, 0x0139 }, // 57 , 1
{ 0x00F2, 0x0141 }, // 65 , 1
{ 0x00F3, 0x013b }, // 59 , 1
{ 0x00F4, 0x013d }, // 61 , 1
{ 0x00F5, 0x0153 }, // 83 , 1
{ 0x00F6, 0x013f }, // 63 , 1
{ 0x00F7, 0x0608 }, // 8 , 6
{ 0x00F8, 0x0151 }, // 81 , 1
{ 0x00F9, 0x0149 }, // 73 , 1
{ 0x00FA, 0x0143 }, // 67 , 1
{ 0x00FB, 0x0145 }, // 69 , 1
{ 0x00FC, 0x0147 }, // 71 , 1
{ 0x00FD, 0x0155 }, // 85 , 1
{ 0x00FE, 0x0159 }, // 89 , 1
{ 0x00FF, 0x014b }, // 75 , 1
{ 0x0100, 0x015c }, // 92 , 1
{ 0x0101, 0x015d }, // 93 , 1
{ 0x0102, 0x015a }, // 90 , 1
{ 0x0103, 0x015b }, // 91 , 1
{ 0x0104, 0x015e }, // 94 , 1
{ 0x0105, 0x015f }, // 95 , 1
{ 0x0106, 0x0160 }, // 96 , 1
{ 0x0107, 0x0161 }, // 97 , 1
{ 0x0108, 0x0164 }, // 100, 1
{ 0x0109, 0x0165 }, // 101, 1
{ 0x010A, 0x0166 }, // 102, 1
{ 0x010B, 0x0167 }, // 103, 1
{ 0x010C, 0x0162 }, // 98 , 1
{ 0x010D, 0x0163 }, // 99 , 1
{ 0x010E, 0x0168 }, // 104, 1
{ 0x010F, 0x0169 }, // 105, 1
{ 0x0110, 0x014e }, // 78 , 1
{ 0x0111, 0x014f }, // 79 , 1
{ 0x0112, 0x016e }, // 110, 1
{ 0x0113, 0x016f }, // 111, 1
{ 0x0114, 0x01ea }, // 234, 1
{ 0x0115, 0x01eb }, // 235, 1
{ 0x0116, 0x016c }, // 108, 1
{ 0x0117, 0x016d }, // 109, 1
{ 0x0118, 0x0170 }, // 112, 1
{ 0x0119, 0x0171 }, // 113, 1
{ 0x011A, 0x016a }, // 106, 1
{ 0x011B, 0x016b }, // 107, 1
{ 0x011C, 0x017a }, // 122, 1
{ 0x011D, 0x017b }, // 123, 1
{ 0x011E, 0x0174 }, // 116, 1
{ 0x011F, 0x0175 }, // 117, 1
{ 0x0120, 0x017c }, // 124, 1
{ 0x0121, 0x017d }, // 125, 1
{ 0x0122, 0x0178 }, // 120, 1
{ 0x0123, 0x0179 }, // 121, 1
{ 0x0124, 0x017e }, // 126, 1
{ 0x0125, 0x017f }, // 127, 1
{ 0x0126, 0x0180 }, // 128, 1
{ 0x0127, 0x0181 }, // 129, 1
{ 0x0128, 0x0188 }, // 136, 1
{ 0x0129, 0x0189 }, // 137, 1
{ 0x012A, 0x0184 }, // 132, 1
{ 0x012B, 0x0185 }, // 133, 1
{ 0x012C, 0x01ec }, // 236, 1
{ 0x012D, 0x01ed }, // 237, 1
{ 0x012E, 0x0186 }, // 134, 1
{ 0x012F, 0x0187 }, // 135, 1
{ 0x0130, 0x0182 }, // 130, 1
{ 0x0131, 0x01ef }, // 239, 1
{ 0x0132, 0x018a }, // 138, 1
{ 0x0133, 0x018b }, // 139, 1
{ 0x0134, 0x018c }, // 140, 1
{ 0x0135, 0x018d }, // 141, 1
{ 0x0136, 0x018e }, // 142, 1
{ 0x0137, 0x018f }, // 143, 1
{ 0x0138, 0x0118 }, // 24 , 1
{ 0x0139, 0x0190 }, // 144, 1
{ 0x013A, 0x0191 }, // 145, 1
{ 0x013B, 0x0194 }, // 148, 1
{ 0x013C, 0x0195 }, // 149, 1
{ 0x013D, 0x0192 }, // 146, 1
{ 0x013E, 0x0193 }, // 147, 1
{ 0x013F, 0x0196 }, // 150, 1
{ 0x0140, 0x0197 }, // 151, 1
{ 0x0141, 0x0198 }, // 152, 1
{ 0x0142, 0x0199 }, // 153, 1
{ 0x0143, 0x019a }, // 154, 1
{ 0x0144, 0x019b }, // 155, 1
{ 0x0145, 0x01a0 }, // 160, 1
{ 0x0146, 0x01a1 }, // 161, 1
{ 0x0147, 0x019e }, // 158, 1
{ 0x0148, 0x019f }, // 159, 1
{ 0x0149, 0x019d }, // 157, 1
{ 0x014A, 0x01d2 }, // 210, 1
{ 0x014B, 0x01d3 }, // 211, 1
{ 0x014C, 0x01a4 }, // 164, 1
{ 0x014D, 0x01a5 }, // 165, 1
{ 0x014E, 0x01f0 }, // 240, 1
{ 0x014F, 0x01f1 }, // 241, 1
{ 0x0150, 0x01a2 }, // 162, 1
{ 0x0151, 0x01a3 }, // 163, 1
{ 0x0152, 0x01a6 }, // 166, 1
{ 0x0153, 0x01a7 }, // 167, 1
{ 0x0154, 0x01a8 }, // 168, 1
{ 0x0155, 0x01a9 }, // 169, 1
{ 0x0156, 0x01ac }, // 172, 1
{ 0x0157, 0x01ad }, // 173, 1
{ 0x0158, 0x01aa }, // 170, 1
{ 0x0159, 0x01ab }, // 171, 1
{ 0x015A, 0x01ae }, // 174, 1
{ 0x015B, 0x01af }, // 175, 1
{ 0x015C, 0x01b4 }, // 180, 1
{ 0x015D, 0x01b5 }, // 181, 1
{ 0x015E, 0x01b2 }, // 178, 1
{ 0x015F, 0x01b3 }, // 179, 1
{ 0x0160, 0x01b0 }, // 176, 1
{ 0x0161, 0x01b1 }, // 177, 1
{ 0x0162, 0x01b8 }, // 184, 1
{ 0x0163, 0x01b9 }, // 185, 1
{ 0x0164, 0x01b6 }, // 182, 1
{ 0x0165, 0x01b7 }, // 183, 1
{ 0x0166, 0x01ba }, // 186, 1
{ 0x0167, 0x01bb }, // 187, 1
{ 0x0168, 0x01c6 }, // 198, 1
{ 0x0169, 0x01c7 }, // 199, 1
{ 0x016A, 0x01c0 }, // 192, 1
{ 0x016B, 0x01c1 }, // 193, 1
{ 0x016C, 0x01bc }, // 188, 1
{ 0x016D, 0x01bd }, // 189, 1
{ 0x016E, 0x01c4 }, // 196, 1
{ 0x016F, 0x01c5 }, // 197, 1
{ 0x0170, 0x01be }, // 190, 1
{ 0x0171, 0x01bf }, // 191, 1
{ 0x0172, 0x01c2 }, // 194, 1
{ 0x0173, 0x01c3 }, // 195, 1
{ 0x0174, 0x01c8 }, // 200, 1
{ 0x0175, 0x01c9 }, // 201, 1
{ 0x0176, 0x01ca }, // 202, 1
{ 0x0177, 0x01cb }, // 203, 1
{ 0x0178, 0x014a }, // 74 , 1
{ 0x0179, 0x01cc }, // 204, 1
{ 0x017A, 0x01cd }, // 205, 1
{ 0x017B, 0x01d0 }, // 208, 1
{ 0x017C, 0x01d1 }, // 209, 1
{ 0x017D, 0x01ce }, // 206, 1
{ 0x017E, 0x01cf }, // 207, 1
{ 0x0192, 0x040e }, // 14 , 4
{ 0x0194, 0x0a7c }, // 124, 10
{ 0x01A0, 0x01e6 }, // 230, 1
{ 0x01A1, 0x01e7 }, // 231, 1
{ 0x01AF, 0x01e8 }, // 232, 1
{ 0x01B0, 0x01e9 }, // 233, 1
{ 0x01C0, 0x0605 }, // 5 , 6
{ 0x0250, 0x0237 }, // 55 , 2
{ 0x0251, 0x0238 }, // 56 , 2
{ 0x0252, 0x0239 }, // 57 , 2
{ 0x0253, 0x023a }, // 58 , 2
{ 0x0254, 0x023c }, // 60 , 2
{ 0x0255, 0x023d }, // 61 , 2
{ 0x0256, 0x023f }, // 63 , 2
{ 0x0257, 0x0240 }, // 64 , 2
{ 0x0258, 0x0241 }, // 65 , 2
{ 0x0259, 0x0242 }, // 66 , 2
{ 0x025A, 0x0243 }, // 67 , 2
{ 0x025B, 0x0244 }, // 68 , 2
{ 0x025C, 0x0245 }, // 69 , 2
{ 0x025D, 0x0246 }, // 70 , 2
{ 0x025E, 0x0248 }, // 72 , 2
{ 0x025F, 0x0249 }, // 73 , 2
{ 0x0260, 0x024c }, // 76 , 2
{ 0x0261, 0x024b }, // 75 , 2
{ 0x0262, 0x024d }, // 77 , 2
{ 0x0263, 0x024f }, // 79 , 2
{ 0x0264, 0x0250 }, // 80 , 2
{ 0x0265, 0x0251 }, // 81 , 2
{ 0x0266, 0x0252 }, // 82 , 2
{ 0x0267, 0x0253 }, // 83 , 2
{ 0x0268, 0x0255 }, // 85 , 2
{ 0x0269, 0x0257 }, // 87 , 2
{ 0x026A, 0x0256 }, // 86 , 2
{ 0x026B, 0x025a }, // 90 , 2
{ 0x026C, 0x025b }, // 91 , 2
{ 0x026D, 0x025c }, // 92 , 2
{ 0x026E, 0x025e }, // 94 , 2
{ 0x026F, 0x0260 }, // 96 , 2
{ 0x0270, 0x0261 }, // 97 , 2
{ 0x0271, 0x0262 }, // 98 , 2
{ 0x0272, 0x0263 }, // 99 , 2
{ 0x0273, 0x0264 }, // 100, 2
{ 0x0274, 0x0265 }, // 101, 2
{ 0x0275, 0x0279 }, // 121, 2
{ 0x0276, 0x0266 }, // 102, 2
{ 0x0277, 0x0267 }, // 103, 2
{ 0x0278, 0x024a }, // 74 , 2
{ 0x0279, 0x0269 }, // 105, 2
{ 0x027A, 0x026a }, // 106, 2
{ 0x027B, 0x026b }, // 107, 2
{ 0x027C, 0x026c }, // 108, 2
{ 0x027D, 0x026d }, // 109, 2
{ 0x027E, 0x026e }, // 110, 2
{ 0x027F, 0x026f }, // 111, 2
{ 0x0280, 0x0270 }, // 112, 2
{ 0x0281, 0x0271 }, // 113, 2
{ 0x0282, 0x0272 }, // 114, 2
{ 0x0283, 0x0273 }, // 115, 2
{ 0x0284, 0x0274 }, // 116, 2
{ 0x0285, 0x0275 }, // 117, 2
{ 0x0286, 0x0276 }, // 118, 2
{ 0x0287, 0x0277 }, // 119, 2
{ 0x0288, 0x0278 }, // 120, 2
{ 0x0289, 0x027a }, // 122, 2
{ 0x028A, 0x027b }, // 123, 2
{ 0x028B, 0x027d }, // 125, 2
{ 0x028C, 0x027c }, // 124, 2
{ 0x028D, 0x027e }, // 126, 2
{ 0x028E, 0x025f }, // 95 , 2
{ 0x028F, 0x0280 }, // 128, 2
{ 0x0290, 0x0281 }, // 129, 2
{ 0x0291, 0x0282 }, // 130, 2
{ 0x0292, 0x0283 }, // 131, 2
{ 0x0293, 0x0284 }, // 132, 2
{ 0x0294, 0x0285 }, // 133, 2
{ 0x0295, 0x0286 }, // 134, 2
{ 0x0296, 0x0287 }, // 135, 2
{ 0x0297, 0x023e }, // 62 , 2
{ 0x0298, 0x028a }, // 138, 2
{ 0x0299, 0x023b }, // 59 , 2
{ 0x029A, 0x0247 }, // 71 , 2
{ 0x029B, 0x024e }, // 78 , 2
{ 0x029C, 0x0254 }, // 84 , 2
{ 0x029D, 0x0258 }, // 88 , 2
{ 0x029E, 0x0259 }, // 89 , 2
{ 0x029F, 0x025d }, // 93 , 2
{ 0x02A0, 0x0268 }, // 104, 2
{ 0x02A1, 0x0288 }, // 136, 2
{ 0x02A2, 0x0289 }, // 137, 2
{ 0x02A3, 0x028b }, // 139, 2
{ 0x02A4, 0x028c }, // 140, 2
{ 0x02A5, 0x028d }, // 141, 2
{ 0x02A6, 0x028e }, // 142, 2
{ 0x02A7, 0x028f }, // 143, 2
{ 0x02A8, 0x0290 }, // 144, 2
{ 0x02B0, 0x0235 }, // 53 , 2
{ 0x02B6, 0x0236 }, // 54 , 2
{ 0x02B9, 0x0200 }, // 0 , 2
{ 0x02BA, 0x0201 }, // 1 , 2
{ 0x02BB, 0x0202 }, // 2 , 2
{ 0x02BC, 0x0205 }, // 5 , 2
{ 0x02BD, 0x0204 }, // 4 , 2
{ 0x02BE, 0x0207 }, // 7 , 2
{ 0x02BF, 0x0208 }, // 8 , 2
{ 0x02C6, 0x0217 }, // 23 , 2
{ 0x02C7, 0x0218 }, // 24 , 2
{ 0x02C8, 0x020f }, // 15 , 2
{ 0x02C9, 0x0211 }, // 17 , 2
{ 0x02CA, 0x0212 }, // 18 , 2
{ 0x02CB, 0x0213 }, // 19 , 2
{ 0x02CC, 0x0210 }, // 16 , 2
{ 0x02CD, 0x0214 }, // 20 , 2
{ 0x02CE, 0x0215 }, // 21 , 2
{ 0x02CF, 0x0216 }, // 22 , 2
{ 0x02D0, 0x020a }, // 10 , 2
{ 0x02D1, 0x020b }, // 11 , 2
{ 0x02D2, 0x022a }, // 42 , 2
{ 0x02D3, 0x022b }, // 43 , 2
{ 0x02DA, 0x021b }, // 27 , 2
{ 0x02DB, 0x0231 }, // 49 , 2
{ 0x02DC, 0x0219 }, // 25 , 2
{ 0x02DE, 0x0233 }, // 51 , 2
{ 0x0300, 0x0100 }, // 0 , 1
{ 0x0301, 0x0106 }, // 6 , 1
{ 0x0302, 0x0103 }, // 3 , 1
{ 0x0303, 0x0102 }, // 2 , 1
{ 0x0304, 0x0108 }, // 8 , 1
{ 0x0305, 0x0115 }, // 21 , 1
{ 0x0306, 0x0116 }, // 22 , 1
{ 0x0307, 0x010f }, // 15 , 1
{ 0x0308, 0x0107 }, // 7 , 1
{ 0x030A, 0x010e }, // 14 , 1
{ 0x030B, 0x0110 }, // 16 , 1
{ 0x030C, 0x0113 }, // 19 , 1
{ 0x0310, 0x0209 }, // 9 , 2
{ 0x0311, 0x0858 }, // 88 , 8
{ 0x0313, 0x0109 }, // 9 , 1
{ 0x0314, 0x085a }, // 90 , 8
{ 0x0315, 0x010a }, // 10 , 1
{ 0x031C, 0x0221 }, // 33 , 2
{ 0x031D, 0x0222 }, // 34 , 2
{ 0x031E, 0x0223 }, // 35 , 2
{ 0x031F, 0x0224 }, // 36 , 2
{ 0x0320, 0x0225 }, // 37 , 2
{ 0x0321, 0x0226 }, // 38 , 2
{ 0x0322, 0x0227 }, // 39 , 2
{ 0x0323, 0x021e }, // 30 , 2
{ 0x0324, 0x0220 }, // 32 , 2
{ 0x0325, 0x021a }, // 26 , 2
{ 0x0326, 0x010c }, // 12 , 1
{ 0x0327, 0x0111 }, // 17 , 1
{ 0x0328, 0x0112 }, // 18 , 1
{ 0x0329, 0x020e }, // 14 , 2
{ 0x032A, 0x0228 }, // 40 , 2
{ 0x032B, 0x0229 }, // 41 , 2
{ 0x032C, 0x021d }, // 29 , 2
{ 0x032D, 0x021c }, // 28 , 2
{ 0x032E, 0x020d }, // 13 , 2
{ 0x0335, 0x0104 }, // 4 , 1
{ 0x0337, 0x0114 }, // 20 , 1
{ 0x0338, 0x0105 }, // 5 , 1
{ 0x033E, 0x0230 }, // 48 , 2
{ 0x0345, 0x085b }, // 91 , 8
{ 0x0374, 0x0851 }, // 81 , 8
{ 0x0375, 0x0852 }, // 82 , 8
{ 0x0391, 0x0800 }, // 0 , 8
{ 0x0392, 0x0802 }, // 2 , 8
{ 0x0393, 0x0806 }, // 6 , 8
{ 0x0394, 0x0808 }, // 8 , 8
{ 0x0395, 0x080a }, // 10 , 8
{ 0x0396, 0x080c }, // 12 , 8
{ 0x0397, 0x080e }, // 14 , 8
{ 0x0398, 0x0810 }, // 16 , 8
{ 0x0399, 0x0812 }, // 18 , 8
{ 0x039A, 0x0814 }, // 20 , 8
{ 0x039B, 0x0816 }, // 22 , 8
{ 0x039C, 0x0818 }, // 24 , 8
{ 0x039D, 0x081a }, // 26 , 8
{ 0x039E, 0x081c }, // 28 , 8
{ 0x039F, 0x081e }, // 30 , 8
{ 0x03A0, 0x0820 }, // 32 , 8
{ 0x03A1, 0x0822 }, // 34 , 8
{ 0x03A3, 0x0824 }, // 36 , 8
{ 0x03A4, 0x0828 }, // 40 , 8
{ 0x03A5, 0x082a }, // 42 , 8
{ 0x03A6, 0x082c }, // 44 , 8
{ 0x03A7, 0x082e }, // 46 , 8
{ 0x03A8, 0x0830 }, // 48 , 8
{ 0x03A9, 0x0832 }, // 50 , 8
{ 0x03AA, 0x083c }, // 60 , 8
{ 0x03AB, 0x0842 }, // 66 , 8
{ 0x03AC, 0x0835 }, // 53 , 8
{ 0x03AD, 0x0837 }, // 55 , 8
{ 0x03AE, 0x0839 }, // 57 , 8
{ 0x03AF, 0x083b }, // 59 , 8
{ 0x03B1, 0x0801 }, // 1 , 8
{ 0x03B2, 0x0803 }, // 3 , 8
{ 0x03B3, 0x0807 }, // 7 , 8
{ 0x03B4, 0x0809 }, // 9 , 8
{ 0x03B5, 0x080b }, // 11 , 8
{ 0x03B6, 0x080d }, // 13 , 8
{ 0x03B7, 0x080f }, // 15 , 8
{ 0x03B8, 0x0811 }, // 17 , 8
{ 0x03B9, 0x0813 }, // 19 , 8
{ 0x03BA, 0x0815 }, // 21 , 8
{ 0x03BB, 0x0817 }, // 23 , 8
{ 0x03BC, 0x0819 }, // 25 , 8
{ 0x03BD, 0x081b }, // 27 , 8
{ 0x03BE, 0x081d }, // 29 , 8
{ 0x03BF, 0x081f }, // 31 , 8
{ 0x03C0, 0x0821 }, // 33 , 8
{ 0x03C1, 0x0823 }, // 35 , 8
{ 0x03C2, 0x0827 }, // 39 , 8
{ 0x03C3, 0x0825 }, // 37 , 8
{ 0x03C4, 0x0829 }, // 41 , 8
{ 0x03C5, 0x082b }, // 43 , 8
{ 0x03C6, 0x082d }, // 45 , 8
{ 0x03C7, 0x082f }, // 47 , 8
{ 0x03C8, 0x0831 }, // 49 , 8
{ 0x03C9, 0x0833 }, // 51 , 8
{ 0x03CA, 0x083d }, // 61 , 8
{ 0x03CB, 0x0843 }, // 67 , 8
{ 0x03CC, 0x083f }, // 63 , 8
{ 0x03CD, 0x0841 }, // 65 , 8
{ 0x03CE, 0x0845 }, // 69 , 8
{ 0x03D0, 0x0805 }, // 5 , 8
{ 0x03D1, 0x0847 }, // 71 , 8
{ 0x03D2, 0x084c }, // 76 , 8
{ 0x03D5, 0x084d }, // 77 , 8
{ 0x03D6, 0x0849 }, // 73 , 8
{ 0x03D7, 0x084f }, // 79 , 8
{ 0x03DA, 0x08d7 }, // 215, 8
{ 0x03DB, 0x084B }, // 75 , 8
{ 0x03DC, 0x08d8 }, // 216, 8
{ 0x03DE, 0x08d9 }, // 217, 8
{ 0x03E0, 0x08da }, // 218, 8
{ 0x03F0, 0x0848 }, // 72 , 8
{ 0x03F1, 0x084a }, // 74 , 8
{ 0x0401, 0x0a0c }, // 12 , 10
{ 0x0402, 0x0a4a }, // 74 , 10
{ 0x0403, 0x0a44 }, // 68 , 10
{ 0x0404, 0x0a4e }, // 78 , 10
{ 0x0405, 0x0a52 }, // 82 , 10
{ 0x0406, 0x0a58 }, // 88 , 10
{ 0x0407, 0x0a5a }, // 90 , 10
{ 0x0408, 0x0a5e }, // 94 , 10
{ 0x0409, 0x0a68 }, // 104, 10
{ 0x040A, 0x0a6c }, // 108, 10
{ 0x040B, 0x0a72 }, // 114, 10
{ 0x040C, 0x0a60 }, // 96 , 10
{ 0x040E, 0x0a74 }, // 116, 10
{ 0x040F, 0x0a86 }, // 134, 10
{ 0x0410, 0x0a00 }, // 0 , 10
{ 0x0411, 0x0a02 }, // 2 , 10
{ 0x0412, 0x0a04 }, // 4 , 10
{ 0x0413, 0x0a06 }, // 6 , 10
{ 0x0414, 0x0a08 }, // 8 , 10
{ 0x0415, 0x0a0a }, // 10 , 10
{ 0x0416, 0x0a0e }, // 14 , 10
{ 0x0417, 0x0a10 }, // 16 , 10
{ 0x0418, 0x0a12 }, // 18 , 10
{ 0x0419, 0x0a14 }, // 20 , 10
{ 0x041A, 0x0a16 }, // 22 , 10
{ 0x041B, 0x0a18 }, // 24 , 10
{ 0x041C, 0x0a1a }, // 26 , 10
{ 0x041D, 0x0a1c }, // 28 , 10
{ 0x041E, 0x0a1e }, // 30 , 10
{ 0x041F, 0x0a20 }, // 32 , 10
{ 0x0420, 0x0a22 }, // 34 , 10
{ 0x0421, 0x0a24 }, // 36 , 10
{ 0x0422, 0x0a26 }, // 38 , 10
{ 0x0423, 0x0a28 }, // 40 , 10
{ 0x0424, 0x0a2a }, // 42 , 10
{ 0x0425, 0x0a2c }, // 44 , 10
{ 0x0426, 0x0a2e }, // 46 , 10
{ 0x0427, 0x0a30 }, // 48 , 10
{ 0x0428, 0x0a32 }, // 50 , 10
{ 0x0429, 0x0a34 }, // 52 , 10
{ 0x042A, 0x0a36 }, // 54 , 10
{ 0x042B, 0x0a38 }, // 56 , 10
{ 0x042C, 0x0a3a }, // 58 , 10
{ 0x042D, 0x0a3c }, // 60 , 10
{ 0x042E, 0x0a3e }, // 62 , 10
{ 0x042F, 0x0a40 }, // 64 , 10
{ 0x0430, 0x0a01 }, // 1 , 10
{ 0x0431, 0x0a03 }, // 3 , 10
{ 0x0432, 0x0a05 }, // 5 , 10
{ 0x0433, 0x0a07 }, // 7 , 10
{ 0x0434, 0x0a09 }, // 9 , 10
{ 0x0435, 0x0a0b }, // 11 , 10
{ 0x0436, 0x0a0f }, // 15 , 10
{ 0x0437, 0x0a11 }, // 17 , 10
{ 0x0438, 0x0a13 }, // 19 , 10
{ 0x0439, 0x0a15 }, // 21 , 10
{ 0x043A, 0x0a17 }, // 23 , 10
{ 0x043B, 0x0a19 }, // 25 , 10
{ 0x043C, 0x0a1b }, // 27 , 10
{ 0x043D, 0x0a1d }, // 29 , 10
{ 0x043E, 0x0a1f }, // 31 , 10
{ 0x043F, 0x0a21 }, // 33 , 10
{ 0x0440, 0x0a23 }, // 35 , 10
{ 0x0441, 0x0a25 }, // 37 , 10
{ 0x0442, 0x0a27 }, // 39 , 10
{ 0x0443, 0x0a29 }, // 41 , 10
{ 0x0444, 0x0a2b }, // 43 , 10
{ 0x0445, 0x0a2d }, // 45 , 10
{ 0x0446, 0x0a2f }, // 47 , 10
{ 0x0447, 0x0a31 }, // 49 , 10
{ 0x0448, 0x0a33 }, // 51 , 10
{ 0x0449, 0x0a35 }, // 53 , 10
{ 0x044A, 0x0a37 }, // 55 , 10
{ 0x044B, 0x0a39 }, // 57 , 10
{ 0x044C, 0x0a3b }, // 59 , 10
{ 0x044D, 0x0a3d }, // 61 , 10
{ 0x044E, 0x0a3f }, // 63 , 10
{ 0x044F, 0x0a41 }, // 65 , 10
{ 0x0451, 0x0a0d }, // 13 , 10
{ 0x0452, 0x0a4b }, // 75 , 10
{ 0x0453, 0x0a45 }, // 69 , 10
{ 0x0454, 0x0a4f }, // 79 , 10
{ 0x0455, 0x0a53 }, // 83 , 10
{ 0x0456, 0x0a59 }, // 89 , 10
{ 0x0457, 0x0a5b }, // 91 , 10
{ 0x0458, 0x0a5f }, // 95 , 10
{ 0x0459, 0x0a69 }, // 105, 10
{ 0x045A, 0x0a6d }, // 109, 10
{ 0x045B, 0x0a73 }, // 115, 10
{ 0x045C, 0x0a61 }, // 97 , 10
{ 0x045E, 0x0a75 }, // 117, 10
{ 0x045F, 0x0a87 }, // 135, 10
{ 0x0460, 0x0a70 }, // 112, 10
{ 0x0461, 0x0a71 }, // 113, 10
{ 0x0462, 0x0a8e }, // 142, 10
{ 0x0463, 0x0a8f }, // 143, 10
{ 0x0466, 0x0a90 }, // 144, 10
{ 0x0467, 0x0a91 }, // 145, 10
{ 0x046A, 0x0a92 }, // 146, 10
{ 0x046B, 0x0a93 }, // 147, 10
{ 0x046E, 0x0a94 }, // 148, 10
{ 0x046F, 0x0a95 }, // 149, 10
{ 0x0470, 0x0a96 }, // 150, 10
{ 0x0471, 0x0a97 }, // 151, 10
{ 0x0472, 0x0a98 }, // 152, 10
{ 0x0473, 0x0a99 }, // 153, 10
{ 0x0474, 0x0a9a }, // 154, 10
{ 0x0475, 0x0a9b }, // 155, 10
{ 0x047A, 0x0a6e }, // 110, 10
{ 0x047B, 0x0a6f }, // 111, 10
{ 0x047E, 0x0a84 }, // 132, 10
{ 0x047F, 0x0a85 }, // 133, 10
{ 0x0490, 0x0a46 }, // 70 , 10
{ 0x0491, 0x0a47 }, // 71 , 10
{ 0x0492, 0x0a48 }, // 72 , 10
{ 0x0493, 0x0a49 }, // 73 , 10
{ 0x0496, 0x0a50 }, // 80 , 10
{ 0x0497, 0x0a51 }, // 81 , 10
{ 0x049A, 0x0a62 }, // 98 , 10
{ 0x049B, 0x0a63 }, // 99 , 10
{ 0x049C, 0x0a66 }, // 102, 10
{ 0x049D, 0x0a67 }, // 103, 10
{ 0x04A2, 0x0a6a }, // 106, 10
{ 0x04A3, 0x0a6b }, // 107, 10
{ 0x04AE, 0x0a78 }, // 120, 10
{ 0x04AF, 0x0a79 }, // 121, 10
{ 0x04B0, 0x0a7a }, // 122, 10
{ 0x04B1, 0x0a7b }, // 123, 10
{ 0x04B2, 0x0a7e }, // 126, 10
{ 0x04B3, 0x0a7f }, // 127, 10
{ 0x04B6, 0x0a88 }, // 136, 10
{ 0x04B7, 0x0a89 }, // 137, 10
{ 0x04B8, 0x0a8a }, // 138, 10
{ 0x04B9, 0x0a8b }, // 139, 10
{ 0x04BA, 0x0a82 }, // 130, 10
{ 0x04BB, 0x0a83 }, // 131, 10
{ 0x04D8, 0x0a42 }, // 66 , 10
{ 0x04D9, 0x0a43 }, // 67 , 10
{ 0x04EE, 0x0a76 }, // 118, 10
{ 0x04EF, 0x0a77 }, // 119, 10
{ 0x05B0, 0x0920 }, // 32 , 9
{ 0x05B1, 0x0921 }, // 33 , 9
{ 0x05B2, 0x0922 }, // 34 , 9
{ 0x05B3, 0x0923 }, // 35 , 9
{ 0x05B4, 0x0924 }, // 36 , 9
{ 0x05B5, 0x0925 }, // 37 , 9
{ 0x05B6, 0x0926 }, // 38 , 9
{ 0x05B7, 0x0927 }, // 39 , 9
{ 0x05B8, 0x0928 }, // 40 , 9
{ 0x05B9, 0x0929 }, // 41 , 9
{ 0x05BB, 0x092b }, // 43 , 9
{ 0x05BC, 0x092c }, // 44 , 9
{ 0x05BD, 0x092d }, // 45 , 9
{ 0x05BF, 0x092e }, // 46 , 9
{ 0x05C0, 0x091c }, // 28 , 9
{ 0x05C3, 0x091d }, // 29 , 9
{ 0x05D0, 0x0900 }, // 0 , 9
{ 0x05D1, 0x0901 }, // 1 , 9
{ 0x05D2, 0x0902 }, // 2 , 9
{ 0x05D3, 0x0903 }, // 3 , 9
{ 0x05D4, 0x0904 }, // 4 , 9
{ 0x05D5, 0x0905 }, // 5 , 9
{ 0x05D6, 0x0906 }, // 6 , 9
{ 0x05D7, 0x0907 }, // 7 , 9
{ 0x05D8, 0x0908 }, // 8 , 9
{ 0x05D9, 0x0909 }, // 9 , 9
{ 0x05DA, 0x090a }, // 10 , 9
{ 0x05DB, 0x090b }, // 11 , 9
{ 0x05DC, 0x090c }, // 12 , 9
{ 0x05DD, 0x090d }, // 13 , 9
{ 0x05DE, 0x090e }, // 14 , 9
{ 0x05DF, 0x090f }, // 15 , 9
{ 0x05E0, 0x0910 }, // 16 , 9
{ 0x05E1, 0x0911 }, // 17 , 9
{ 0x05E2, 0x0912 }, // 18 , 9
{ 0x05E3, 0x0913 }, // 19 , 9
{ 0x05E4, 0x0914 }, // 20 , 9
{ 0x05E5, 0x0915 }, // 21 , 9
{ 0x05E6, 0x0916 }, // 22 , 9
{ 0x05E7, 0x0917 }, // 23 , 9
{ 0x05E8, 0x0918 }, // 24 , 9
{ 0x05E9, 0x0919 }, // 25 , 9
{ 0x05EA, 0x091a }, // 26 , 9
{ 0x05F0, 0x0931 }, // 49 , 9
{ 0x05F1, 0x0932 }, // 50 , 9
{ 0x05F2, 0x0933 }, // 51 , 9
{ 0x05F3, 0x091e }, // 30 , 9
{ 0x05F4, 0x091f }, // 31 , 9
{ 0x060C, 0x0d26 }, // 38 , 13
{ 0x061B, 0x0d27 }, // 39 , 13
{ 0x061F, 0x0d28 }, // 40 , 13
{ 0x0621, 0x0da4 }, // 164, 13
{ 0x0622, 0x0db1 }, // 177, 13
{ 0x0623, 0x0da5 }, // 165, 13
{ 0x0624, 0x0da9 }, // 169, 13
{ 0x0625, 0x0da7 }, // 167, 13
{ 0x0626, 0x0dab }, // 171, 13
{ 0x0627, 0x0d3a }, // 58 , 13
{ 0x0628, 0x0d3c }, // 60 , 13
{ 0x0629, 0x0d98 }, // 152, 13
{ 0x062A, 0x0d40 }, // 64 , 13
{ 0x062B, 0x0d44 }, // 68 , 13
{ 0x062C, 0x0d48 }, // 72 , 13
{ 0x062D, 0x0d4c }, // 76 , 13
{ 0x062E, 0x0d50 }, // 80 , 13
{ 0x062F, 0x0d54 }, // 84 , 13
{ 0x0630, 0x0d56 }, // 86 , 13
{ 0x0631, 0x0d58 }, // 88 , 13
{ 0x0632, 0x0d5a }, // 90 , 13
{ 0x0633, 0x0d5c }, // 92 , 13
{ 0x0634, 0x0d60 }, // 96 , 13
{ 0x0635, 0x0d64 }, // 100, 13
{ 0x0636, 0x0d68 }, // 104, 13
{ 0x0637, 0x0d6c }, // 108, 13
{ 0x0638, 0x0d70 }, // 112, 13
{ 0x0639, 0x0d74 }, // 116, 13
{ 0x063A, 0x0d78 }, // 120, 13
{ 0x0640, 0x0dc2 }, // 194, 13
{ 0x0641, 0x0d7c }, // 124, 13
{ 0x0642, 0x0d80 }, // 128, 13
{ 0x0643, 0x0d84 }, // 132, 13
{ 0x0644, 0x0d88 }, // 136, 13
{ 0x0645, 0x0d8c }, // 140, 13
{ 0x0646, 0x0d90 }, // 144, 13
{ 0x0647, 0x0d94 }, // 148, 13
{ 0x0648, 0x0d9a }, // 154, 13
{ 0x0649, 0x0da0 }, // 160, 13
{ 0x064A, 0x0d9c }, // 156, 13
{ 0x064B, 0x0d10 }, // 16 , 13
{ 0x064C, 0x0d11 }, // 17 , 13
{ 0x064E, 0x0d0a }, // 10 , 13
{ 0x064F, 0x0d0c }, // 12 , 13
{ 0x0650, 0x0d0e }, // 14 , 13
{ 0x0651, 0x0d16 }, // 22 , 13
{ 0x0652, 0x0d14 }, // 20 , 13
{ 0x0660, 0x0d38 }, // 56 , 13
{ 0x0661, 0x0d2f }, // 47 , 13
{ 0x0662, 0x0d30 }, // 48 , 13
{ 0x0663, 0x0d31 }, // 49 , 13
{ 0x0664, 0x0d32 }, // 50 , 13
{ 0x0665, 0x0d33 }, // 51 , 13
{ 0x0666, 0x0d34 }, // 52 , 13
{ 0x0667, 0x0d35 }, // 53 , 13
{ 0x0668, 0x0d36 }, // 54 , 13
{ 0x0669, 0x0d37 }, // 55 , 13
{ 0x066A, 0x0d2a }, // 42 , 13
{ 0x0671, 0x0db3 }, // 179, 13
{ 0x0674, 0x0d24 }, // 36 , 13
{ 0x0679, 0x0e3c }, // 60 , 14
{ 0x067A, 0x0e4c }, // 76 , 14
{ 0x067B, 0x0e30 }, // 48 , 14
{ 0x067C, 0x0e40 }, // 64 , 14
{ 0x067D, 0x0e48 }, // 72 , 14
{ 0x067E, 0x0e38 }, // 56 , 14
{ 0x067F, 0x0e44 }, // 68 , 14
{ 0x0680, 0x0e34 }, // 52 , 14
{ 0x0681, 0x0e64 }, // 100, 14
{ 0x0683, 0x0e54 }, // 84 , 14
{ 0x0684, 0x0e50 }, // 80 , 14
{ 0x0685, 0x0e60 }, // 96 , 14
{ 0x0686, 0x0e58 }, // 88 , 14
{ 0x0687, 0x0e5c }, // 92 , 14
{ 0x0688, 0x0e68 }, // 104, 14
{ 0x0689, 0x0e6a }, // 106, 14
{ 0x068A, 0x0e70 }, // 112, 14
{ 0x068C, 0x0e6c }, // 108, 14
{ 0x068D, 0x0e72 }, // 114, 14
{ 0x068E, 0x0e6e }, // 110, 14
{ 0x0691, 0x0e76 }, // 118, 14
{ 0x0692, 0x0e7C }, // 124, 14
{ 0x0693, 0x0e74 }, // 116, 14
{ 0x0695, 0x0e7a }, // 122, 14
{ 0x0696, 0x0e80 }, // 128, 14
{ 0x0698, 0x0e7e }, // 126, 14
{ 0x0699, 0x0e78 }, // 120, 14
{ 0x069A, 0x0e84 }, // 132, 14
{ 0x06A0, 0x0e88 }, // 136, 14
{ 0x06A4, 0x0e8c }, // 140, 14
{ 0x06A6, 0x0e90 }, // 144, 14
{ 0x06A9, 0x0e94 }, // 148, 14
{ 0x06AA, 0x0e9c }, // 156, 14
{ 0x06AB, 0x0ea8 }, // 168, 14
{ 0x06AF, 0x0ea0 }, // 160, 14
{ 0x06B1, 0x0eac }, // 172, 14
{ 0x06B3, 0x0eb0 }, // 176, 14
{ 0x06B5, 0x0eb4 }, // 180, 14
{ 0x06BA, 0x0eba }, // 186, 14
{ 0x06BB, 0x0ec2 }, // 194, 14
{ 0x06BC, 0x0ebe }, // 190, 14
{ 0x06C0, 0x0eda }, // 218, 14
{ 0x06C6, 0x0ec6 }, // 198, 14
{ 0x06CA, 0x0ec8 }, // 200, 14
{ 0x06CE, 0x0ed0 }, // 208, 14
{ 0x06D1, 0x0ed6 }, // 214, 14
{ 0x06D2, 0x0ed4 }, // 212, 14
{ 0x06D6, 0x0d25 }, // 37 , 13
{ 0x06E4, 0x0d22 }, // 34 , 13
{ 0x06F4, 0x0e29 }, // 41 , 14
{ 0x06F5, 0x0e2b }, // 43 , 14
{ 0x06F6, 0x0e2c }, // 44 , 14
{ 0x06F7, 0x0e2e }, // 46 , 14
{ 0x06F8, 0x0e2f }, // 47 , 14
{ 0x10D0, 0x0ad2 }, // 210, 10
{ 0x10D1, 0x0ad3 }, // 211, 10
{ 0x10D2, 0x0ad4 }, // 212, 10
{ 0x10D3, 0x0ad5 }, // 213, 10
{ 0x10D4, 0x0ad6 }, // 214, 10
{ 0x10D5, 0x0ad7 }, // 215, 10
{ 0x10D6, 0x0ad8 }, // 216, 10
{ 0x10D7, 0x0ada }, // 218, 10
{ 0x10D8, 0x0adb }, // 219, 10
{ 0x10D9, 0x0adc }, // 220, 10
{ 0x10DA, 0x0add }, // 221, 10
{ 0x10DB, 0x0ade }, // 222, 10
{ 0x10DC, 0x0adf }, // 223, 10
{ 0x10DD, 0x0ae1 }, // 225, 10
{ 0x10DE, 0x0ae2 }, // 226, 10
{ 0x10DF, 0x0ae3 }, // 227, 10
{ 0x10E0, 0x0ae4 }, // 228, 10
{ 0x10E1, 0x0ae5 }, // 229, 10
{ 0x10E2, 0x0ae6 }, // 230, 10
{ 0x10E3, 0x0ae7 }, // 231, 10
{ 0x10E4, 0x0ae9 }, // 233, 10
{ 0x10E5, 0x0aea }, // 234, 10
{ 0x10E6, 0x0aeb }, // 235, 10
{ 0x10E7, 0x0aec }, // 236, 10
{ 0x10E8, 0x0aed }, // 237, 10
{ 0x10E9, 0x0aee }, // 238, 10
{ 0x10EA, 0x0aef }, // 239, 10
{ 0x10EB, 0x0af0 }, // 240, 10
{ 0x10EC, 0x0af1 }, // 241, 10
{ 0x10ED, 0x0af2 }, // 242, 10
{ 0x10EE, 0x0af3 }, // 243, 10
{ 0x10EF, 0x0af5 }, // 245, 10
{ 0x10F0, 0x0af6 }, // 246, 10
{ 0x10F1, 0x0ad9 }, // 217, 10
{ 0x10F2, 0x0ae0 }, // 224, 10
{ 0x10F3, 0x0ae8 }, // 232, 10
{ 0x10F4, 0x0af4 }, // 244, 10
{ 0x10F5, 0x0af7 }, // 247, 10
{ 0x10F6, 0x0af8 }, // 248, 10
{ 0x1F00, 0x0873 }, // 115, 8
{ 0x1F01, 0x087b }, // 123, 8
{ 0x1F02, 0x0875 }, // 117, 8
{ 0x1F03, 0x087d }, // 125, 8
{ 0x1F04, 0x0874 }, // 116, 8
{ 0x1F05, 0x087c }, // 124, 8
{ 0x1F10, 0x0884 }, // 132, 8
{ 0x1F11, 0x0887 }, // 135, 8
{ 0x1F12, 0x0886 }, // 134, 8
{ 0x1F13, 0x0889 }, // 137, 8
{ 0x1F14, 0x0885 }, // 133, 8
{ 0x1F15, 0x0888 }, // 136, 8
{ 0x1F20, 0x0890 }, // 144, 8
{ 0x1F21, 0x0898 }, // 152, 8
{ 0x1F22, 0x0892 }, // 146, 8
{ 0x1F23, 0x089a }, // 154, 8
{ 0x1F24, 0x0891 }, // 145, 8
{ 0x1F25, 0x0899 }, // 153, 8
{ 0x1F30, 0x08a4 }, // 164, 8
{ 0x1F31, 0x08a8 }, // 168, 8
{ 0x1F32, 0x08a6 }, // 166, 8
{ 0x1F33, 0x08aa }, // 170, 8
{ 0x1F34, 0x08a5 }, // 165, 8
{ 0x1F35, 0x08a9 }, // 169, 8
{ 0x1F40, 0x08ad }, // 173, 8
{ 0x1F41, 0x08b0 }, // 176, 8
{ 0x1F42, 0x08af }, // 175, 8
{ 0x1F43, 0x08b2 }, // 178, 8
{ 0x1F44, 0x08ae }, // 174, 8
{ 0x1F45, 0x08b1 }, // 177, 8
{ 0x1F50, 0x08b9 }, // 185, 8
{ 0x1F51, 0x08bd }, // 189, 8
{ 0x1F52, 0x08bb }, // 187, 8
{ 0x1F53, 0x08bf }, // 191, 8
{ 0x1F54, 0x08ba }, // 186, 8
{ 0x1F55, 0x08be }, // 190, 8
{ 0x1F60, 0x08c7 }, // 199, 8
{ 0x1F61, 0x08cf }, // 207, 8
{ 0x1F62, 0x08c9 }, // 201, 8
{ 0x1F63, 0x08d1 }, // 209, 8
{ 0x1F64, 0x08c8 }, // 200, 8
{ 0x1F65, 0x08d0 }, // 208, 8
{ 0x1F70, 0x086d }, // 109, 8
{ 0x1F72, 0x0883 }, // 131, 8
{ 0x1F74, 0x088a }, // 138, 8
{ 0x1F76, 0x08a0 }, // 160, 8
{ 0x1F78, 0x08ac }, // 172, 8
{ 0x1F7A, 0x08b5 }, // 181, 8
{ 0x1F7C, 0x08c1 }, // 193, 8
{ 0x1F80, 0x0877 }, // 119, 8
{ 0x1F81, 0x087f }, // 127, 8
{ 0x1F82, 0x0879 }, // 121, 8
{ 0x1F83, 0x0881 }, // 129, 8
{ 0x1F84, 0x0878 }, // 120, 8
{ 0x1F85, 0x0880 }, // 128, 8
{ 0x1F90, 0x0894 }, // 148, 8
{ 0x1F91, 0x089c }, // 156, 8
{ 0x1F92, 0x0896 }, // 150, 8
{ 0x1F93, 0x089e }, // 158, 8
{ 0x1F94, 0x0895 }, // 149, 8
{ 0x1F95, 0x089d }, // 157, 8
{ 0x1FA0, 0x08cb }, // 203, 8
{ 0x1FA1, 0x08d3 }, // 211, 8
{ 0x1FA2, 0x08cd }, // 205, 8
{ 0x1FA3, 0x08d5 }, // 213, 8
{ 0x1FA4, 0x08cc }, // 204, 8
{ 0x1FA5, 0x08d4 }, // 212, 8
{ 0x1FB2, 0x0871 }, // 113, 8
{ 0x1FB3, 0x086f }, // 111, 8
{ 0x1FB4, 0x0870 }, // 112, 8
{ 0x1FC2, 0x088e }, // 142, 8
{ 0x1FC3, 0x088c }, // 140, 8
{ 0x1FC4, 0x088d }, // 141, 8
{ 0x1FCD, 0x085e }, // 94 , 8
{ 0x1FCE, 0x085c }, // 92 , 8
{ 0x1FDD, 0x085f }, // 95 , 8
{ 0x1FDE, 0x085d }, // 93 , 8
{ 0x1FE4, 0x08B4 }, // 180, 8
{ 0x1FE5, 0x08B3 }, // 179, 8
{ 0x1FF2, 0x08c5 }, // 197, 8
{ 0x1FF3, 0x08c3 }, // 195, 8
{ 0x1FF4, 0x08c4 }, // 196, 8
{ 0x2007, 0x0517 }, // 23 , 5
{ 0x2012, 0x0432 }, // 50 , 4
{ 0x2013, 0x0421 }, // 33 , 4
{ 0x2014, 0x0422 }, // 34 , 4
{ 0x2017, 0x022f }, // 47 , 2
{ 0x2018, 0x041d }, // 29 , 4
{ 0x2019, 0x041c }, // 28 , 4
{ 0x201A, 0x043e }, // 62 , 4
{ 0x201B, 0x041b }, // 27 , 4
{ 0x201C, 0x0420 }, // 32 , 4
{ 0x201D, 0x041f }, // 31 , 4
{ 0x201E, 0x043f }, // 63 , 4
{ 0x201F, 0x041e }, // 30 , 4
{ 0x2020, 0x0427 }, // 39 , 4
{ 0x2021, 0x0428 }, // 40 , 4
{ 0x2022, 0x0403 }, // 3 , 4
{ 0x2026, 0x0438 }, // 56 , 4
{ 0x2030, 0x044b }, // 75 , 4
{ 0x2033, 0x0580 }, // 128, 5
{ 0x2034, 0x0671 }, // 113, 6
{ 0x2036, 0x057f }, // 127, 5
{ 0x2039, 0x0423 }, // 35 , 4
{ 0x203A, 0x0424 }, // 36 , 4
{ 0x203C, 0x050d }, // 13 , 5
{ 0x203E, 0x0626 }, // 38 , 6
{ 0x207F, 0x0415 }, // 21 , 4
{ 0x20A0, 0x043c }, // 60 , 4
{ 0x20A2, 0x043b }, // 59 , 4
{ 0x20A3, 0x043a }, // 58 , 4
{ 0x20A4, 0x043d }, // 61 , 4
{ 0x20A6, 0x0457 }, // 87 , 4
{ 0x20A7, 0x040d }, // 13 , 4
{ 0x20A8, 0x0458 }, // 88 , 4
{ 0x20A9, 0x0456 }, // 86 , 4
{ 0x20AA, 0x097A }, // 122, 9
{ 0x20AC, 0x0466 }, // 102, 4, Euro Sign - GW assigned x448 [4,72]
{ 0x20DD, 0x066d }, // 109, 6
{ 0x20E1, 0x06e1 }, // 225, 6
{ 0x2102, 0x06d5 }, // 213, 6
{ 0x2104, 0x0515 }, // 21 , 5
{ 0x2105, 0x0449 }, // 73 , 4
{ 0x2106, 0x044a }, // 74 , 4
{ 0x210C, 0x06e9 }, // 233, 6
{ 0x210F, 0x0632 }, // 50 , 6
{ 0x2111, 0x0633 }, // 51 , 6
{ 0x2112, 0x0669 }, // 105, 6
{ 0x2113, 0x0631 }, // 49 , 6
{ 0x2115, 0x06d7 }, // 215, 6
{ 0x2116, 0x044c }, // 76 , 4
{ 0x2118, 0x0635 }, // 53 , 6
{ 0x211C, 0x0634 }, // 52 , 6
{ 0x211D, 0x06d8 }, // 216, 6
{ 0x211E, 0x042b }, // 43 , 4
{ 0x2120, 0x042a }, // 42 , 4
{ 0x2122, 0x0429 }, // 41 , 4
{ 0x2127, 0x06a7 }, // 167, 6
{ 0x2128, 0x066b }, // 107, 6
{ 0x212B, 0x0623 }, // 35 , 6
{ 0x212D, 0x066a }, // 106, 6
{ 0x212F, 0x0630 }, // 48 , 6
{ 0x2130, 0x06d3 }, // 211, 6
{ 0x2131, 0x06d4 }, // 212, 6
{ 0x2153, 0x0440 }, // 64 , 4
{ 0x2154, 0x0441 }, // 65 , 4
{ 0x215B, 0x0442 }, // 66 , 4
{ 0x215C, 0x0443 }, // 67 , 4
{ 0x215D, 0x0444 }, // 68 , 4
{ 0x215E, 0x0445 }, // 69 , 4
{ 0x2190, 0x0590 }, // 144, 5
{ 0x2191, 0x0617 }, // 23 , 6
{ 0x2192, 0x05d5 }, // 213, 5
{ 0x2193, 0x0618 }, // 24 , 6
{ 0x2194, 0x05d6 }, // 214, 5
{ 0x2195, 0x05d7 }, // 215, 5
{ 0x2196, 0x0640 }, // 64 , 6
{ 0x2197, 0x063e }, // 62 , 6
{ 0x2198, 0x063f }, // 63 , 6
{ 0x2199, 0x0641 }, // 65 , 6
{ 0x219D, 0x0690 }, // 144, 6
{ 0x21A3, 0x0693 }, // 147, 6
{ 0x21A8, 0x050f }, // 15 , 5
{ 0x21A9, 0x0691 }, // 145, 6
{ 0x21AA, 0x0692 }, // 146, 6
{ 0x21B5, 0x0514 }, // 20 , 5
{ 0x21BC, 0x0694 }, // 148, 6
{ 0x21BD, 0x0695 }, // 149, 6
{ 0x21BE, 0x069b }, // 155, 6
{ 0x21BF, 0x069a }, // 154, 6
{ 0x21C0, 0x0696 }, // 150, 6
{ 0x21C1, 0x0697 }, // 151, 6
{ 0x21C2, 0x069d }, // 157, 6
{ 0x21C3, 0x069c }, // 156, 6
{ 0x21C4, 0x0636 }, // 54 , 6
{ 0x21C6, 0x0637 }, // 55 , 6
{ 0x21C7, 0x069f }, // 159, 6
{ 0x21C9, 0x069e }, // 158, 6
{ 0x21CB, 0x0699 }, // 153, 6
{ 0x21CC, 0x0698 }, // 152, 6
{ 0x21D0, 0x0639 }, // 57 , 6
{ 0x21D1, 0x063a }, // 58 , 6
{ 0x21D2, 0x0638 }, // 56 , 6
{ 0x21D3, 0x063b }, // 59 , 6
{ 0x21D4, 0x063c }, // 60 , 6
{ 0x21D5, 0x063d }, // 61 , 6
{ 0x21E6, 0x0597 }, // 151, 5
{ 0x21E8, 0x0596 }, // 150, 5
{ 0x2200, 0x067a }, // 122, 6
{ 0x2202, 0x062c }, // 44 , 6
{ 0x2203, 0x0679 }, // 121, 6
{ 0x2204, 0x06d0 }, // 208, 6
{ 0x2205, 0x0648 }, // 72 , 6
{ 0x2207, 0x062b }, // 43 , 6
{ 0x2208, 0x060f }, // 15 , 6
{ 0x2209, 0x06d1 }, // 209, 6
{ 0x220B, 0x06db }, // 219, 6
{ 0x220D, 0x0647 }, // 71 , 6
{ 0x220F, 0x0629 }, // 41 , 6
{ 0x2210, 0x0672 }, // 114, 6
{ 0x2211, 0x0612 }, // 18 , 6
{ 0x2212, 0x0600 }, // 0 , 6
{ 0x2213, 0x062a }, // 42 , 6
{ 0x2214, 0x06ae }, // 174, 6
{ 0x2215, 0x0606 }, // 6 , 6
{ 0x2216, 0x0607 }, // 7 , 6
{ 0x2218, 0x0621 }, // 33 , 6
{ 0x2219, 0x0622 }, // 34 , 6
{ 0x221A, 0x0704 }, // 4 , 7
{ 0x221D, 0x0604 }, // 4 , 6
{ 0x221E, 0x0613 }, // 19 , 6
{ 0x221F, 0x06da }, // 218, 6
{ 0x2220, 0x064f }, // 79 , 6
{ 0x2221, 0x06a8 }, // 168, 6
{ 0x2222, 0x06a9 }, // 169, 6
{ 0x2223, 0x0609 }, // 9 , 6
{ 0x2224, 0x06ce }, // 206, 6
{ 0x2225, 0x0611 }, // 17 , 6
{ 0x2226, 0x06cd }, // 205, 6
{ 0x2227, 0x0655 }, // 85 , 6
{ 0x2228, 0x0656 }, // 86 , 6
{ 0x2229, 0x0610 }, // 16 , 6
{ 0x222A, 0x0642 }, // 66 , 6
{ 0x222B, 0x0628 }, // 40 , 6
{ 0x222E, 0x0668 }, // 104, 6
{ 0x2234, 0x0666 }, // 102, 6
{ 0x2235, 0x0665 }, // 101, 6
{ 0x2237, 0x0667 }, // 103, 6
{ 0x223C, 0x060c }, // 12 , 6
{ 0x2241, 0x06bd }, // 189, 6
{ 0x2243, 0x0673 }, // 115, 6
{ 0x2244, 0x06be }, // 190, 6
{ 0x2245, 0x0674 }, // 116, 6
{ 0x2247, 0x06bf }, // 191, 6
{ 0x2248, 0x060d }, // 13 , 6
{ 0x2249, 0x06c0 }, // 192, 6
{ 0x224D, 0x06b3 }, // 179, 6
{ 0x224E, 0x06b2 }, // 178, 6
{ 0x2250, 0x06af }, // 175, 6
{ 0x2252, 0x06b0 }, // 176, 6
{ 0x2253, 0x06b1 }, // 177, 6
{ 0x225F, 0x06d9 }, // 217, 6
{ 0x2260, 0x0663 }, // 99 , 6
{ 0x2261, 0x060e }, // 14 , 6
{ 0x2262, 0x0664 }, // 100, 6
{ 0x2264, 0x0602 }, // 2 , 6
{ 0x2265, 0x0603 }, // 3 , 6
{ 0x226A, 0x064d }, // 77 , 6
{ 0x226B, 0x064e }, // 78 , 6
{ 0x226C, 0x06b6 }, // 182, 6
{ 0x226D, 0x06cf }, // 207, 6
{ 0x226E, 0x06b9 }, // 185, 6
{ 0x226F, 0x06bb }, // 187, 6
{ 0x2270, 0x06ba }, // 186, 6
{ 0x2271, 0x06bc }, // 188, 6
{ 0x2272, 0x06eb }, // 235, 6
{ 0x2273, 0x06ec }, // 236, 6
{ 0x227A, 0x0675 }, // 117, 6
{ 0x227B, 0x0677 }, // 119, 6
{ 0x227C, 0x0676 }, // 118, 6
{ 0x227D, 0x0678 }, // 120, 6
{ 0x2280, 0x06c1 }, // 193, 6
{ 0x2281, 0x06c3 }, // 195, 6
{ 0x2282, 0x0643 }, // 67 , 6
{ 0x2283, 0x0644 }, // 68 , 6
{ 0x2284, 0x06c5 }, // 197, 6
{ 0x2285, 0x06c6 }, // 198, 6
{ 0x2286, 0x0645 }, // 69 , 6
{ 0x2287, 0x0646 }, // 70 , 6
{ 0x2288, 0x06c7 }, // 199, 6
{ 0x2289, 0x06c8 }, // 200, 6
{ 0x228A, 0x067e }, // 126, 6
{ 0x228B, 0x067f }, // 127, 6
{ 0x228E, 0x067d }, // 125, 6
{ 0x228F, 0x0682 }, // 130, 6
{ 0x2290, 0x0685 }, // 133, 6
{ 0x2291, 0x0683 }, // 131, 6
{ 0x2292, 0x0686 }, // 134, 6
{ 0x2293, 0x0680 }, // 128, 6
{ 0x2294, 0x0681 }, // 129, 6
{ 0x2295, 0x0651 }, // 81 , 6
{ 0x2296, 0x0652 }, // 82 , 6
{ 0x2297, 0x0650 }, // 80 , 6
{ 0x2299, 0x0654 }, // 84 , 6
{ 0x229A, 0x06a4 }, // 164, 6
{ 0x229B, 0x06a5 }, // 165, 6
{ 0x229D, 0x06a6 }, // 166, 6
{ 0x22A2, 0x065b }, // 91 , 6
{ 0x22A3, 0x065c }, // 92 , 6
{ 0x22A4, 0x0658 }, // 88 , 6
{ 0x22A5, 0x0659 }, // 89 , 6
{ 0x22A8, 0x06b4 }, // 180, 6
{ 0x22BB, 0x0657 }, // 87 , 6
{ 0x22C5, 0x061f }, // 31 , 6
{ 0x22C6, 0x0670 }, // 112, 6
{ 0x22C8, 0x068c }, // 140, 6
{ 0x22D0, 0x06a2 }, // 162, 6
{ 0x22D1, 0x06a3 }, // 163, 6
{ 0x22D2, 0x06a1 }, // 161, 6
{ 0x22D3, 0x06a0 }, // 160, 6
{ 0x22D8, 0x067b }, // 123, 6
{ 0x22D9, 0x067c }, // 124, 6
{ 0x22E0, 0x06c2 }, // 194, 6
{ 0x22E1, 0x06c4 }, // 196, 6
{ 0x22E2, 0x06cb }, // 203, 6
{ 0x22E3, 0x06cc }, // 204, 6
{ 0x22E4, 0x0684 }, // 132, 6
{ 0x22E5, 0x0687 }, // 135, 6
{ 0x22EE, 0x06de }, // 222, 6
{ 0x22EF, 0x06dc }, // 220, 6
{ 0x22F1, 0x06df }, // 223, 6
{ 0x2302, 0x050c }, // 12 , 5
{ 0x2308, 0x0649 }, // 73 , 6
{ 0x2309, 0x064a }, // 74 , 6
{ 0x230A, 0x064b }, // 75 , 6
{ 0x230B, 0x064c }, // 76 , 6
{ 0x2310, 0x0510 }, // 16 , 5
{ 0x2312, 0x065a }, // 90 , 6
{ 0x2319, 0x0511 }, // 17 , 5
{ 0x231A, 0x051f }, // 31 , 5
{ 0x231B, 0x0520 }, // 32 , 5
{ 0x2320, 0x0700 }, // 0 , 7
{ 0x2321, 0x0701 }, // 1 , 7
{ 0x2322, 0x068e }, // 142, 6
{ 0x2323, 0x068d }, // 141, 6
{ 0x2329, 0x060a }, // 10 , 6
{ 0x232A, 0x060b }, // 11 , 6
{ 0x2409, 0x044f }, // 79 , 4
{ 0x240A, 0x0452 }, // 82 , 4
{ 0x240B, 0x0454 }, // 84 , 4
{ 0x240C, 0x0450 }, // 80 , 4
{ 0x240D, 0x0451 }, // 81 , 4
{ 0x2424, 0x0453 }, // 83 , 4
{ 0x24C2, 0x0446 }, // 70 , 4
{ 0x24C5, 0x0447 }, // 71 , 4
{ 0x24CA, 0x0448 }, // 72 , 4, - circled U
{ 0x2500, 0x0308 }, // 8 , 3
{ 0x2502, 0x0309 }, // 9 , 3
{ 0x250C, 0x030a }, // 10 , 3
{ 0x2510, 0x030b }, // 11 , 3
{ 0x2514, 0x030d }, // 13 , 3
{ 0x2518, 0x030c }, // 12 , 3
{ 0x251C, 0x030e }, // 14 , 3
{ 0x251E, 0x033e }, // 62 , 3
{ 0x251F, 0x033c }, // 60 , 3
{ 0x2521, 0x033f }, // 63 , 3
{ 0x2522, 0x033d }, // 61 , 3
{ 0x2524, 0x0310 }, // 16 , 3
{ 0x2526, 0x0345 }, // 69 , 3
{ 0x2527, 0x0344 }, // 68 , 3
{ 0x2529, 0x0347 }, // 71 , 3
{ 0x252A, 0x0346 }, // 70 , 3
{ 0x252C, 0x030f }, // 15 , 3
{ 0x252D, 0x0342 }, // 66 , 3
{ 0x252E, 0x0340 }, // 64 , 3
{ 0x2531, 0x0343 }, // 67 , 3
{ 0x2532, 0x0341 }, // 65 , 3
{ 0x2534, 0x0311 }, // 17 , 3
{ 0x2535, 0x034a }, // 74 , 3
{ 0x2536, 0x0348 }, // 72 , 3
{ 0x2539, 0x034b }, // 75 , 3
{ 0x253A, 0x0349 }, // 73 , 3
{ 0x253C, 0x0312 }, // 18 , 3
{ 0x253D, 0x0352 }, // 82 , 3
{ 0x253E, 0x034e }, // 78 , 3
{ 0x2540, 0x034f }, // 79 , 3
{ 0x2541, 0x034c }, // 76 , 3
{ 0x2543, 0x0355 }, // 85 , 3
{ 0x2544, 0x0350 }, // 80 , 3
{ 0x2545, 0x0353 }, // 83 , 3
{ 0x2546, 0x034d }, // 77 , 3
{ 0x2547, 0x0357 }, // 87 , 3
{ 0x2548, 0x0354 }, // 84 , 3
{ 0x2549, 0x0356 }, // 86 , 3
{ 0x254A, 0x0351 }, // 81 , 3
{ 0x2550, 0x0313 }, // 19 , 3
{ 0x2551, 0x0314 }, // 20 , 3
{ 0x2552, 0x031e }, // 30 , 3
{ 0x2553, 0x0322 }, // 34 , 3
{ 0x2554, 0x0315 }, // 21 , 3
{ 0x2555, 0x031f }, // 31 , 3
{ 0x2556, 0x0323 }, // 35 , 3
{ 0x2557, 0x0316 }, // 22 , 3
{ 0x2558, 0x0321 }, // 33 , 3
{ 0x2559, 0x0325 }, // 37 , 3
{ 0x255A, 0x0318 }, // 24 , 3
{ 0x255B, 0x0320 }, // 32 , 3
{ 0x255C, 0x0324 }, // 36 , 3
{ 0x255D, 0x0317 }, // 23 , 3
{ 0x255E, 0x0326 }, // 38 , 3
{ 0x255F, 0x032a }, // 42 , 3
{ 0x2560, 0x0319 }, // 25 , 3
{ 0x2561, 0x0328 }, // 40 , 3
{ 0x2562, 0x032c }, // 44 , 3
{ 0x2563, 0x031b }, // 27 , 3
{ 0x2564, 0x032b }, // 43 , 3
{ 0x2565, 0x0327 }, // 39 , 3
{ 0x2566, 0x031a }, // 26 , 3
{ 0x2567, 0x032d }, // 45 , 3
{ 0x2568, 0x0329 }, // 41 , 3
{ 0x2569, 0x031c }, // 28 , 3
{ 0x256A, 0x032f }, // 47 , 3
{ 0x256B, 0x032e }, // 46 , 3
{ 0x256C, 0x031d }, // 29 , 3
{ 0x2574, 0x0330 }, // 48 , 3
{ 0x2575, 0x0331 }, // 49 , 3
{ 0x2576, 0x0332 }, // 50 , 3
{ 0x2577, 0x0333 }, // 51 , 3
{ 0x2578, 0x0334 }, // 52 , 3
{ 0x2579, 0x0335 }, // 53 , 3
{ 0x257A, 0x0336 }, // 54 , 3
{ 0x257B, 0x0337 }, // 55 , 3
{ 0x257C, 0x0338 }, // 56 , 3
{ 0x257D, 0x033a }, // 58 , 3
{ 0x257E, 0x0339 }, // 57 , 3
{ 0x257F, 0x033b }, // 59 , 3
{ 0x2580, 0x0305 }, // 5 , 3
{ 0x2584, 0x0307 }, // 7 , 3
{ 0x2588, 0x0303 }, // 3 , 3
{ 0x258C, 0x0304 }, // 4 , 3
{ 0x2590, 0x0306 }, // 6 , 3
{ 0x2591, 0x0300 }, // 0 , 3
{ 0x2592, 0x0301 }, // 1 , 3
{ 0x2593, 0x0302 }, // 2 , 3
{ 0x25A0, 0x0402 }, // 2 , 4
{ 0x25A1, 0x0426 }, // 38 , 4
{ 0x25AA, 0x042f }, // 47 , 4
{ 0x25AB, 0x0431 }, // 49 , 4
{ 0x25AC, 0x050b }, // 11 , 5
{ 0x25B2, 0x0573 }, // 115, 5
{ 0x25B3, 0x0688 }, // 136, 6
{ 0x25B4, 0x061d }, // 29 , 6
{ 0x25B5, 0x06ac }, // 172, 6
{ 0x25B8, 0x061b }, // 27 , 6
{ 0x25B9, 0x068b }, // 139, 6
{ 0x25BC, 0x0574 }, // 116, 5
{ 0x25BD, 0x0689 }, // 137, 6
{ 0x25BE, 0x061e }, // 30 , 6
{ 0x25BF, 0x06ad }, // 173, 6
{ 0x25C2, 0x061c }, // 28 , 6
{ 0x25C3, 0x068a }, // 138, 6
{ 0x25C6, 0x0575 }, // 117, 5
{ 0x25C7, 0x066f }, // 111, 6
{ 0x25CA, 0x065f }, // 95 , 6
{ 0x25CB, 0x0401 }, // 1 , 4
{ 0x25CF, 0x0400 }, // 0 , 4
{ 0x25D6, 0x059e }, // 158, 5
{ 0x25D7, 0x0577 }, // 119, 5
{ 0x25D8, 0x0512 }, // 18 , 5
{ 0x25D9, 0x0513 }, // 19 , 5
{ 0x25E6, 0x042d }, // 45 , 4
{ 0x2605, 0x0548 }, // 72, 5
{ 0x260E, 0x051e }, // 30 , 5
{ 0x2610, 0x0518 }, // 24 , 5
{ 0x2612, 0x0519 }, // 25 , 5
{ 0x261B, 0x052a }, // 42 , 5
{ 0x261C, 0x0516 }, // 22 , 5
{ 0x261E, 0x052b }, // 43 , 5
{ 0x2639, 0x051a }, // 26 , 5
{ 0x263A, 0x0507 }, // 7 , 5
{ 0x263B, 0x0508 }, // 8 , 5
{ 0x263C, 0x0506 }, // 6 , 5
{ 0x2640, 0x0505 }, // 5 , 5
{ 0x2642, 0x0504 }, // 4 , 5
{ 0x2660, 0x05ab }, // 171, 5
{ 0x2661, 0x0500 }, // 0 , 5
{ 0x2662, 0x0501 }, // 1 , 5
{ 0x2663, 0x05a8 }, // 168, 5
{ 0x2664, 0x0503 }, // 3 , 5
{ 0x2665, 0x05aa }, // 170, 5
{ 0x2666, 0x05a9 }, // 169, 5
{ 0x2667, 0x0502 }, // 2 , 5
{ 0x266A, 0x0509 }, // 9 , 5
{ 0x266C, 0x050a }, // 10 , 5
{ 0x266D, 0x051c }, // 28 , 5
{ 0x266E, 0x051d }, // 29 , 5
{ 0x266F, 0x051b }, // 27 , 5
{ 0x2701, 0x0521 }, // 33 , 5
{ 0x2702, 0x0522 }, // 34 , 5
{ 0x2703, 0x0523 }, // 35 , 5
{ 0x2704, 0x0524 }, // 36 , 5
{ 0x2706, 0x0526 }, // 38 , 5
{ 0x2707, 0x0527 }, // 39 , 5
{ 0x2708, 0x0528 }, // 40 , 5
{ 0x2709, 0x0529 }, // 41 , 5
{ 0x270C, 0x052c }, // 44 , 5
{ 0x270D, 0x052d }, // 45 , 5
{ 0x270E, 0x052e }, // 46 , 5
{ 0x270F, 0x052f }, // 47 , 5
{ 0x2710, 0x0530 }, // 48 , 5
{ 0x2711, 0x0531 }, // 49 , 5
{ 0x2712, 0x0532 }, // 50 , 5
{ 0x2713, 0x0533 }, // 51 , 5
{ 0x2714, 0x0534 }, // 52 , 5
{ 0x2715, 0x0535 }, // 53 , 5
{ 0x2716, 0x0536 }, // 54 , 5
{ 0x2717, 0x0537 }, // 55 , 5
{ 0x2718, 0x0538 }, // 56 , 5
{ 0x2719, 0x0539 }, // 57 , 5
{ 0x271A, 0x053a }, // 58 , 5
{ 0x271B, 0x053b }, // 59 , 5
{ 0x271C, 0x053c }, // 60 , 5
{ 0x271D, 0x053d }, // 61 , 5
{ 0x271E, 0x053e }, // 62 , 5
{ 0x271F, 0x053f }, // 63 , 5
{ 0x2720, 0x0540 }, // 64 , 5
{ 0x2721, 0x0541 }, // 65 , 5
{ 0x2722, 0x0542 }, // 66 , 5
{ 0x2723, 0x0543 }, // 67 , 5
{ 0x2724, 0x0544 }, // 68 , 5
{ 0x2725, 0x0545 }, // 69 , 5
{ 0x2726, 0x0546 }, // 70 , 5
{ 0x2727, 0x0547 }, // 71 , 5
{ 0x2729, 0x0549 }, // 73 , 5
{ 0x272A, 0x054a }, // 74 , 5
{ 0x272B, 0x054b }, // 75 , 5
{ 0x272C, 0x054c }, // 76 , 5
{ 0x272D, 0x054d }, // 77 , 5
{ 0x272E, 0x054e }, // 78 , 5
{ 0x272F, 0x054f }, // 79 , 5
{ 0x2730, 0x0550 }, // 80 , 5
{ 0x2731, 0x0551 }, // 81 , 5
{ 0x2732, 0x0552 }, // 82 , 5
{ 0x2733, 0x0553 }, // 83 , 5
{ 0x2734, 0x0554 }, // 84 , 5
{ 0x2735, 0x0555 }, // 85 , 5
{ 0x2736, 0x0556 }, // 86 , 5
{ 0x2737, 0x0557 }, // 87 , 5
{ 0x2738, 0x0558 }, // 88 , 5
{ 0x2739, 0x0559 }, // 89 , 5
{ 0x273A, 0x055a }, // 90 , 5
{ 0x273B, 0x055b }, // 91 , 5
{ 0x273C, 0x055c }, // 92 , 5
{ 0x273D, 0x055d }, // 93 , 5
{ 0x273E, 0x055e }, // 94 , 5
{ 0x273F, 0x055f }, // 95 , 5
{ 0x2740, 0x0560 }, // 96 , 5
{ 0x2741, 0x0561 }, // 97 , 5
{ 0x2742, 0x0562 }, // 98 , 5
{ 0x2743, 0x0563 }, // 99 , 5
{ 0x2744, 0x0564 }, // 100, 5
{ 0x2745, 0x0565 }, // 101, 5
{ 0x2746, 0x0566 }, // 102, 5
{ 0x2747, 0x0567 }, // 103, 5
{ 0x2748, 0x0568 }, // 104, 5
{ 0x2749, 0x0569 }, // 105, 5
{ 0x274A, 0x056a }, // 106, 5
{ 0x274B, 0x056b }, // 107, 5
{ 0x274D, 0x056d }, // 109, 5
{ 0x274F, 0x056f }, // 111, 5
{ 0x2750, 0x0570 }, // 112, 5
{ 0x2751, 0x0571 }, // 113, 5
{ 0x2752, 0x0572 }, // 114, 5
{ 0x2756, 0x0576 }, // 118, 5
{ 0x2758, 0x0578 }, // 120, 5
{ 0x2759, 0x0579 }, // 121, 5
{ 0x275A, 0x057a }, // 122, 5
{ 0x275B, 0x057b }, // 123, 5
{ 0x275C, 0x057c }, // 124, 5
{ 0x275D, 0x057d }, // 125, 5
{ 0x275E, 0x057e }, // 126, 5
{ 0x2761, 0x05a1 }, // 161, 5
{ 0x2762, 0x05a2 }, // 162, 5
{ 0x2763, 0x05a3 }, // 163, 5
{ 0x2764, 0x05a4 }, // 164, 5
{ 0x2765, 0x05a5 }, // 165, 5
{ 0x2766, 0x05a6 }, // 166, 5
{ 0x2767, 0x05a7 }, // 167, 5
{ 0x2776, 0x05b6 }, // 182, 5
{ 0x2777, 0x05b7 }, // 183, 5
{ 0x2778, 0x05b8 }, // 184, 5
{ 0x2779, 0x05b9 }, // 185, 5
{ 0x277A, 0x05ba }, // 186, 5
{ 0x277B, 0x05bb }, // 187, 5
{ 0x277C, 0x05bc }, // 188, 5
{ 0x277D, 0x05bd }, // 189, 5
{ 0x277E, 0x05be }, // 190, 5
{ 0x277F, 0x05bf }, // 191, 5
{ 0x2780, 0x05c0 }, // 192, 5
{ 0x2781, 0x05c1 }, // 193, 5
{ 0x2782, 0x05c2 }, // 194, 5
{ 0x2783, 0x05c3 }, // 195, 5
{ 0x2784, 0x05c4 }, // 196, 5
{ 0x2785, 0x05c5 }, // 197, 5
{ 0x2786, 0x05c6 }, // 198, 5
{ 0x2787, 0x05c7 }, // 199, 5
{ 0x2788, 0x05c8 }, // 200, 5
{ 0x2789, 0x05c9 }, // 201, 5
{ 0x278A, 0x05ca }, // 202, 5
{ 0x278B, 0x05cb }, // 203, 5
{ 0x278C, 0x05cc }, // 204, 5
{ 0x278D, 0x05cd }, // 205, 5
{ 0x278E, 0x05ce }, // 206, 5
{ 0x278F, 0x05cf }, // 207, 5
{ 0x2790, 0x05d0 }, // 208, 5
{ 0x2791, 0x05d1 }, // 209, 5
{ 0x2792, 0x05d2 }, // 210, 5
{ 0x2793, 0x05d3 }, // 211, 5
{ 0x2794, 0x05d4 }, // 212, 5
{ 0x2798, 0x05d8 }, // 216, 5
{ 0x2799, 0x05d9 }, // 217, 5
{ 0x279A, 0x05da }, // 218, 5
{ 0x279B, 0x05db }, // 219, 5
{ 0x279C, 0x05dc }, // 220, 5
{ 0x279D, 0x05dd }, // 221, 5
{ 0x279E, 0x05de }, // 222, 5
{ 0x279F, 0x05df }, // 223, 5
{ 0x27A0, 0x05e0 }, // 224, 5
{ 0x27A1, 0x05e1 }, // 225, 5
{ 0x27A2, 0x05e2 }, // 226, 5
{ 0x27A3, 0x05e3 }, // 227, 5
{ 0x27A4, 0x05e4 }, // 228, 5
{ 0x27A5, 0x05e5 }, // 229, 5
{ 0x27A6, 0x05e6 }, // 230, 5
{ 0x27A7, 0x05e7 }, // 231, 5
{ 0x27A8, 0x05e8 }, // 232, 5
{ 0x27A9, 0x05e9 }, // 233, 5
{ 0x27AA, 0x05ea }, // 234, 5
{ 0x27AB, 0x05eb }, // 235, 5
{ 0x27AC, 0x05ec }, // 236, 5
{ 0x27AD, 0x05ed }, // 237, 5
{ 0x27AE, 0x05ee }, // 238, 5
{ 0x27AF, 0x05ef }, // 239, 5
{ 0x27B1, 0x05f1 }, // 241, 5
{ 0x27B2, 0x05f2 }, // 242, 5
{ 0x27B3, 0x05f3 }, // 243, 5
{ 0x27B4, 0x05f4 }, // 244, 5
{ 0x27B5, 0x05f5 }, // 245, 5
{ 0x27B6, 0x05f6 }, // 246, 5
{ 0x27B7, 0x05f7 }, // 247, 5
{ 0x27B8, 0x05f8 }, // 248, 5
{ 0x27B9, 0x05f9 }, // 249, 5
{ 0x27BA, 0x05fa }, // 250, 5
{ 0x27BB, 0x05fb }, // 251, 5
{ 0x27BC, 0x05fc }, // 252, 5
{ 0x27BD, 0x05fd }, // 253, 5
{ 0x27BE, 0x05fe }, // 254, 5
// Range 0xE000 through 0xF8FF is reserved for private use.
// We cannot try to interpret characters in this range nor
// assign any default collation or meaning.
{ 0xFB00, 0x0433 }, // 51 , 4
{ 0xFB01, 0x0436 }, // 54 , 4
{ 0xFB02, 0x0437 }, // 55 , 4
{ 0xFB03, 0x0434 }, // 52 , 4
{ 0xFB04, 0x0435 }, // 53 , 4
{ 0xFB1E, 0x0930 }, // 48 , 9
{ 0xFF61, 0x0b00 }, // 0 , 11
{ 0xFF62, 0x0b01 }, // 1 , 11
{ 0xFF63, 0x0b02 }, // 2 , 11
{ 0xFF64, 0x0b03 }, // 3 , 11
{ 0xFF65, 0x0b04 }, // 4 , 11
{ 0xFF66, 0x0b05 }, // 5 , 11
{ 0xFF67, 0x0b06 }, // 6 , 11
{ 0xFF68, 0x0b07 }, // 7 , 11
{ 0xFF69, 0x0b08 }, // 8 , 11
{ 0xFF6A, 0x0b09 }, // 9 , 11
{ 0xFF6B, 0x0b0a }, // 10 , 11
{ 0xFF6C, 0x0b0b }, // 11 , 11
{ 0xFF6D, 0x0b0c }, // 12 , 11
{ 0xFF6E, 0x0b0d }, // 13 , 11
{ 0xFF6F, 0x0b0e }, // 14 , 11
{ 0xFF70, 0x0b0f }, // 15 , 11
{ 0xFF71, 0x0b10 }, // 16 , 11
{ 0xFF72, 0x0b11 }, // 17 , 11
{ 0xFF73, 0x0b12 }, // 18 , 11
{ 0xFF74, 0x0b13 }, // 19 , 11
{ 0xFF75, 0x0b14 }, // 20 , 11
{ 0xFF76, 0x0b15 }, // 21 , 11
{ 0xFF77, 0x0b16 }, // 22 , 11
{ 0xFF78, 0x0b17 }, // 23 , 11
{ 0xFF79, 0x0b18 }, // 24 , 11
{ 0xFF7A, 0x0b19 }, // 25 , 11
{ 0xFF7B, 0x0b1a }, // 26 , 11
{ 0xFF7C, 0x0b1b }, // 27 , 11
{ 0xFF7D, 0x0b1c }, // 28 , 11
{ 0xFF7E, 0x0b1d }, // 29 , 11
{ 0xFF7F, 0x0b1e }, // 30 , 11
{ 0xFF80, 0x0b1f }, // 31 , 11
{ 0xFF81, 0x0b20 }, // 32 , 11
{ 0xFF82, 0x0b21 }, // 33 , 11
{ 0xFF83, 0x0b22 }, // 34 , 11
{ 0xFF84, 0x0b23 }, // 35 , 11
{ 0xFF85, 0x0b24 }, // 36 , 11
{ 0xFF86, 0x0b25 }, // 37 , 11
{ 0xFF87, 0x0b26 }, // 38 , 11
{ 0xFF88, 0x0b27 }, // 39 , 11
{ 0xFF89, 0x0b28 }, // 40 , 11
{ 0xFF8A, 0x0b29 }, // 41 , 11
{ 0xFF8B, 0x0b2a }, // 42 , 11
{ 0xFF8C, 0x0b2b }, // 43 , 11
{ 0xFF8D, 0x0b2c }, // 44 , 11
{ 0xFF8E, 0x0b2d }, // 45 , 11
{ 0xFF8F, 0x0b2e }, // 46 , 11
{ 0xFF90, 0x0b2f }, // 47 , 11
{ 0xFF91, 0x0b30 }, // 48 , 11
{ 0xFF92, 0x0b31 }, // 49 , 11
{ 0xFF93, 0x0b32 }, // 50 , 11
{ 0xFF94, 0x0b33 }, // 51 , 11
{ 0xFF95, 0x0b34 }, // 52 , 11
{ 0xFF96, 0x0b35 }, // 53 , 11
{ 0xFF97, 0x0b36 }, // 54 , 11
{ 0xFF98, 0x0b37 }, // 55 , 11
{ 0xFF99, 0x0b38 }, // 56 , 11
{ 0xFF9A, 0x0b39 }, // 57 , 11
{ 0xFF9B, 0x0b3a }, // 58 , 11
{ 0xFF9C, 0x0b3b }, // 59 , 11
{ 0xFF9D, 0x0b3c }, // 60 , 11
{ 0xFF9E, 0x0b3d }, // 61 , 11
{ 0xFF9F, 0x0b3e } // 62 , 11
};
/****************************************************************************
Desc:	Returns the number of bytes needed to hold uiBits bits, i.e. uiBits
		divided by eight and rounded up to the next whole byte.
****************************************************************************/
FINLINE FLMUINT bytesInBits(
	FLMUINT		uiBits)
{
	// Adding 7 before dropping the low three bits rounds up.
	FLMUINT	uiPadded = uiBits + 7;

	return( uiPadded >> 3);
}
/****************************************************************************
Desc:	Tests a single bit in a bit buffer.  Bits are numbered from the most
		significant bit of byte 0 (bit 0) toward the least significant bit,
		then on into the following bytes.
Ret:	TRUE if the bit is set, FALSE if it is clear.
****************************************************************************/
FINLINE FLMBOOL testOneBit(
	const FLMBYTE *	pucBuf,
	FLMUINT				uiBit)
{
	FLMBYTE	ucByte = pucBuf[ uiBit >> 3];
	FLMUINT	uiShift = 7 - (uiBit & 7);

	// Bring the requested bit down to bit position zero and look at it.
	if( (ucByte >> uiShift) & 1)
	{
		return( TRUE);
	}

	return( FALSE);
}
/****************************************************************************
Desc:	Extracts a uiNumBits-wide field that starts at bit position uiBit of
		a bit buffer (bit 0 is the most significant bit of byte 0).
Notes:	The byte holding the first bit and the byte after it are ALWAYS
		read and combined into a 16 bit window, even when the field lies
		entirely within the first byte.  The buffer must therefore have a
		readable byte following the one that contains uiBit, and the field
		has to fit in that window.
****************************************************************************/
FINLINE FLMUINT getNBits(
	FLMUINT				uiNumBits,
	const FLMBYTE *	pucBuf,
	FLMUINT				uiBit)
{
	FLMUINT	uiByteIdx = uiBit >> 3;

	// First byte supplies the high half of the window, the next byte
	// supplies the low half (the bits that overflow the first byte).
	FLMUINT	uiWindow = ((FLMUINT)pucBuf[ uiByteIdx] << 8) |
							  (FLMUINT)pucBuf[ uiByteIdx + 1];

	// Number of window bits that lie to the right of the field.
	FLMUINT	uiShift = 16 - uiNumBits - (uiBit & 7);

	// Slide the field down to the low end and mask off whatever was
	// to its left.
	return( (uiWindow >> uiShift) & ((1 << uiNumBits) - 1));
}
/****************************************************************************
Desc:	Turns on a single bit in a bit buffer.  Bit 0 is the most
		significant bit of byte 0.  All other bits are left as they were.
****************************************************************************/
FINLINE void setBit(
	FLMBYTE *	pucBuf,
	FLMUINT		uiBit)
{
	FLMBYTE	ucMask = (FLMBYTE)(1 << (7 - (uiBit & 7)));

	pucBuf[ uiBit >> 3] |= ucMask;
}
/****************************************************************************
Desc:	Stores the uiCount-bit value uiVal into a bit buffer starting at bit
		position uiBit (bit 0 is the most significant bit of byte 0).
Notes:	The byte containing uiBit is OR-ed with the leading part of the
		value, but the byte after it is ASSIGNED: whatever it held before
		is replaced by the spill-over bits (or by zero when nothing spills
		over).  The following byte is written on every call, so the buffer
		must have room for it.
		The shift by (8 - uiCount) presumes uiCount is at most 8 - confirm
		against callers.
****************************************************************************/
FINLINE void setBits(
	FLMUINT		uiCount,
	FLMBYTE *	pucBuf,
	FLMUINT		uiBit,
	FLMUINT		uiVal)
{
	FLMUINT	uiByteIdx = uiBit >> 3;
	FLMUINT	uiBitInByte = uiBit & 7;

	// 1st byte: left-justify the value within a byte, then push it right
	// to the actual starting bit position.
	pucBuf[ uiByteIdx] |=
		(FLMBYTE)((uiVal << (8 - uiCount)) >> uiBitInByte);

	// 2nd byte: whatever did not fit in the first byte, left-justified.
	pucBuf[ uiByteIdx + 1] =
		(FLMBYTE)(uiVal << (16 - uiCount - uiBitInByte));
}
/****************************************************************************
Desc:	Reports whether a WP character counts as upper case.  Values below
		0x7F are handled here: only the ASCII letters 'a' through 'z' are
		lower case, everything else is reported as upper case.  All other
		values are passed to f_wpIsUpper.
Ret:	TRUE if upper case (or not a lower case letter), FALSE if lower case.
****************************************************************************/
FINLINE FLMBOOL charIsUpper(
	FLMUINT16	ui16Char)
{
	if( ui16Char >= 0x7F)
	{
		return( f_wpIsUpper( ui16Char));
	}

	if( ui16Char < ASCII_LOWER_A || ui16Char > ASCII_LOWER_Z)
	{
		return( (FLMBOOL)TRUE);
	}

	return( (FLMBOOL)FALSE);
}
/****************************************************************************
Desc:	flmGetNextCharState can be thought of as a lookup in a two
		dimensional array, with i as the row and j as the column.  The array
		is stored sparsely: fwp_indexi gives, for each row, the first slot
		of that row in fwp_indexj/fwp_valuea (the next row's first slot
		marks the end), fwp_indexj holds the column of each slot and
		fwp_valuea holds the value.
		Rows above START_COL are searched using the column list of row
		START_ALL; their values are then fetched from fwp_valuea at the
		matching slot plus FIXUP_AREA_SIZE * (i - START_ALL).
Return:	0 = no valid next state
		non-zero = valid next state, offset for action, or collating value
****************************************************************************/
FINLINE FLMUINT16 flmGetNextCharState(
	FLMUINT		i,
	FLMUINT		j)
{
	FLMUINT	uiRow = (i > START_COL) ? (START_ALL) : i;
	FLMUINT	uiLast = (FLMUINT) (fwp_indexi[ uiRow + 1] - 1);
	FLMUINT	uiPos = fwp_indexi[ uiRow];

	while( uiPos <= uiLast)
	{
		if( fwp_indexj[ uiPos] == j)
		{
			// Found the column.  Rows above START_COL keep their values
			// in a separate area laid out like the START_ALL row.
			if( i > START_COL)
			{
				uiPos += FIXUP_AREA_SIZE * (i - START_ALL);
			}

			return( fwp_valuea[ uiPos]);
		}

		uiPos++;
	}

	return( 0);
}
/****************************************************************************
Desc:	Convert a Unicode character to its WP equivalent.  Characters up to
		and including 127 map to themselves; anything else is looked up in
		the gv_pUnicodeToWP60 table, which covers gv_uiMinUniChar through
		gv_uiMaxUniChar.
Ret:	TRUE if the character could be converted.  On FALSE, *pui16WPChar
		is set to zero.
****************************************************************************/
FLMBOOL FLMAPI f_unicodeToWP(
	FLMUNICODE		uUniChar,		// Unicode character to convert
	FLMUINT16 *		pui16WPChar)	// Returns 0 or WPChar converted.
{
	// ASCII range - no table needed

	if( uUniChar <= 127)
	{
		*pui16WPChar = uUniChar;
		return( TRUE);
	}

	if( uUniChar >= gv_uiMinUniChar && uUniChar <= gv_uiMaxUniChar)
	{
		// A zero table entry means "no WP equivalent"

		*pui16WPChar = gv_pUnicodeToWP60[ uUniChar - gv_uiMinUniChar];
		return( (*pui16WPChar != 0) ? TRUE : FALSE);
	}

	// Outside the range covered by the table

	*pui16WPChar = 0;
	return( FALSE);
}
/****************************************************************************
Desc:	Convert a Unicode character to its WP equivalent using the
		deprecated FLAIM conversion rules.  Compared with f_unicodeToWP:
		  - only characters strictly below 127 are passed through as ASCII;
		    127 itself goes through the table range check like any other
		    character
		  - characters above 0x222E are never converted
		NOTE(review): these differences are presumably intentional, to keep
		the old behavior reproducible - confirm before "fixing" them.
Ret:	TRUE if the character could be converted.  On FALSE, *pui16WPChar
		is set to zero.
****************************************************************************/
FLMBOOL FLMAPI f_depricatedUnicodeToWP(
	FLMUNICODE		uUniChar,		// Unicode character to convert
	FLMUINT16 *		pui16WPChar)	// Returns 0 or WPChar converted.
{
	if( uUniChar < 127)
	{
		*pui16WPChar = uUniChar;
		return( TRUE);
	}

	if( uUniChar >= gv_uiMinUniChar &&
		 uUniChar <= gv_uiMaxUniChar &&
		 uUniChar <= 0x222E)
	{
		// A zero table entry means "no WP equivalent"

		*pui16WPChar = gv_pUnicodeToWP60[ uUniChar - gv_uiMinUniChar];
		return( (*pui16WPChar != 0) ? TRUE : FALSE);
	}

	// Outside the table, or beyond the deprecated 0x222E ceiling

	*pui16WPChar = 0;
	return( FALSE);
}
/****************************************************************************
Desc:	Convert a WP character to its Unicode equivalent.  Characters up to
		and including 127 map to themselves; anything else is looked up in
		the gv_pWP60ToUnicode table, which covers gv_uiMinWPChar through
		gv_uiMaxWPChar.
Ret:	NE_FLM_OK on success.  NE_FLM_CONV_ILLEGAL if the character is
		outside the table or its table entry is zero; *puUniChar is zero
		in that case.
****************************************************************************/
RCODE FLMAPI f_wpToUnicode(
	FLMUINT16		ui16WPChar,
	FLMUNICODE *	puUniChar)
{
	// ASCII range - no table needed

	if( ui16WPChar <= 127)
	{
		*puUniChar = (FLMUNICODE)ui16WPChar;
		return( NE_FLM_OK);
	}

	if( ui16WPChar >= gv_uiMinWPChar && ui16WPChar <= gv_uiMaxWPChar)
	{
		*puUniChar = gv_pWP60ToUnicode[ ui16WPChar - gv_uiMinWPChar];
		if( *puUniChar != 0)
		{
			return( NE_FLM_OK);
		}

		// Table has no Unicode equivalent for this character

		return( RC_SET( NE_FLM_CONV_ILLEGAL));
	}

	// Outside the range covered by the table

	*puUniChar = 0;
	return( RC_SET( NE_FLM_CONV_ILLEGAL));
}
/****************************************************************************
Desc:	Decodes the next character from a UTF-8 buffer and advances
		*ppucBuf past it.  One, two and three byte sequences are accepted;
		any other lead byte yields NE_FLM_BAD_UTF8.
		pucEnd, when non-NULL, marks the end of the buffer.  When it is
		NULL a length of three is assumed and the routine relies on the
		continuation byte checks (which a zero byte fails) to stop.
Ret:	NE_FLM_OK with *puChar set to the decoded character.  *puChar is
		zero, and *ppucBuf is NOT advanced, when the buffer is exhausted or
		the next byte is zero.
		NE_FLM_BAD_UTF8 if the sequence is malformed or truncated; neither
		*puChar nor *ppucBuf is modified in that case.
****************************************************************************/
FINLINE RCODE flmGetCharFromUTF8Buf(
	const FLMBYTE **	ppucBuf,
	const FLMBYTE *	pucEnd,
	FLMUNICODE *		puChar)
{
	const FLMBYTE *	pucCur = *ppucBuf;
	FLMUINT				uiAvail = pucEnd ? (FLMUINT)(pucEnd - pucCur) : 3;
	FLMBYTE				ucLead;

	// Nothing left to read

	if( uiAvail == 0)
	{
		*puChar = 0;
		return( NE_FLM_OK);
	}

	// Single byte: 0xxxxxxx.  A zero byte is returned but not consumed.

	ucLead = pucCur[ 0];
	if( ucLead < 0x80)
	{
		*puChar = (FLMUNICODE)ucLead;
		if( ucLead)
		{
			*ppucBuf = pucCur + 1;
		}
		return( NE_FLM_OK);
	}

	// Every multi-byte sequence needs a 10xxxxxx byte in second place

	if( uiAvail < 2 || (pucCur[ 1] & 0xC0) != 0x80)
	{
		return( RC_SET( NE_FLM_BAD_UTF8));
	}

	// Two bytes: 110xxxxx 10xxxxxx

	if( (ucLead & 0xE0) == 0xC0)
	{
		*puChar =
			(FLMUNICODE)(((FLMUNICODE)(ucLead & 0x1F) << 6) |
							  (FLMUNICODE)(pucCur[ 1] & 0x3F));
		*ppucBuf = pucCur + 2;
		return( NE_FLM_OK);
	}

	// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.  The third byte is only
	// examined once the length and the lead byte have been accepted.

	if( uiAvail < 3 ||
		 (ucLead & 0xF0) != 0xE0 ||
		 (pucCur[ 2] & 0xC0) != 0x80)
	{
		return( RC_SET( NE_FLM_BAD_UTF8));
	}

	*puChar =
		(FLMUNICODE)(((FLMUNICODE)(ucLead & 0x0F) << 12) |
						  ((FLMUNICODE)(pucCur[ 1] & 0x3F) << 6) |
						  (FLMUNICODE)(pucCur[ 2] & 0x3F));
	*ppucBuf = pucCur + 3;
	return( NE_FLM_OK);
}
/****************************************************************************
Desc:	Convert a Unicode character to UTF-8.
		uChar      - character to encode
		pucBuf     - destination, or NULL to only find out how many bytes
		             the encoding takes
		puiBufSize - on input (when pucBuf is non-NULL) the space available
		             in pucBuf; on successful return the number of bytes in
		             the encoding (1, 2 or 3)
Ret:	NE_FLM_OK, or NE_FLM_CONV_DEST_OVERFLOW if pucBuf is too small, in
		which case nothing is written and *puiBufSize is left unchanged.
Notes:	At most three bytes are ever produced - assumes FLMUNICODE is a
		16 bit type (TODO confirm).
*****************************************************************************/
FINLINE RCODE flmUni2UTF8(
	FLMUNICODE		uChar,
	FLMBYTE *		pucBuf,
	FLMUINT *		puiBufSize)
{
	FLMUINT	uiNeeded;

	// How long is the encoding?

	if( uChar <= 0x007F)
	{
		uiNeeded = 1;
	}
	else if( uChar <= 0x07FF)
	{
		uiNeeded = 2;
	}
	else
	{
		uiNeeded = 3;
	}

	if( pucBuf)
	{
		if( *puiBufSize < uiNeeded)
		{
			return( RC_SET( NE_FLM_CONV_DEST_OVERFLOW));
		}

		switch( uiNeeded)
		{
			case 1:
			{
				// 0xxxxxxx
				pucBuf[ 0] = (FLMBYTE)uChar;
				break;
			}

			case 2:
			{
				// 110xxxxx 10xxxxxx
				pucBuf[ 0] = (FLMBYTE)(0xC0 | (FLMBYTE)(uChar >> 6));
				pucBuf[ 1] = (FLMBYTE)(0x80 | (FLMBYTE)(uChar & 0x003F));
				break;
			}

			default:
			{
				// 1110xxxx 10xxxxxx 10xxxxxx
				pucBuf[ 0] = (FLMBYTE)(0xE0 | (FLMBYTE)(uChar >> 12));
				pucBuf[ 1] = (FLMBYTE)(0x80 | (FLMBYTE)((uChar & 0x0FC0) >> 6));
				pucBuf[ 2] = (FLMBYTE)(0x80 | (FLMBYTE)(uChar & 0x003F));
				break;
			}
		}
	}

	*puiBufSize = uiNeeded;
	return( NE_FLM_OK);
}
/****************************************************************************
Desc:	Copies the next UTF-8 character (one to three bytes) from a UTF-8
		buffer into pucDestBuf without decoding it, and advances *ppucBuf
		past it.
		pucEnd, when non-NULL, marks the end of the source buffer.  When it
		is NULL a length of three is assumed.
Ret:	NE_FLM_OK with *puiLen set to the number of bytes copied.  *puiLen
		is zero, and nothing is copied or consumed, when the buffer is
		exhausted or the next byte is zero.
		NE_FLM_BAD_UTF8 if the sequence is malformed or truncated; nothing
		is copied or consumed and *puiLen is not set.
Notes:	This routine assumes that the destination buffer can hold at least
		three bytes
****************************************************************************/
FINLINE RCODE flmGetUTF8CharFromUTF8Buf(
	FLMBYTE **		ppucBuf,
	FLMBYTE *		pucEnd,
	FLMBYTE *		pucDestBuf,
	FLMUINT *		puiLen)
{
	FLMBYTE *	pucSrc = *ppucBuf;
	FLMUINT		uiAvail = pucEnd ? (FLMUINT)(pucEnd - pucSrc) : 3;
	FLMUINT		uiSeqLen;
	FLMUINT		uiLoop;

	// End of buffer or terminating zero

	if( !uiAvail || !pucSrc[ 0])
	{
		*puiLen = 0;
		return( NE_FLM_OK);
	}

	// Work out how long the sequence is, validating as we go

	if( pucSrc[ 0] < 0x80)
	{
		// 0xxxxxxx
		uiSeqLen = 1;
	}
	else
	{
		// Second byte must be 10xxxxxx for any multi-byte sequence

		if( uiAvail < 2 || (pucSrc[ 1] & 0xC0) != 0x80)
		{
			return( RC_SET( NE_FLM_BAD_UTF8));
		}

		if( (pucSrc[ 0] & 0xE0) == 0xC0)
		{
			// 110xxxxx 10xxxxxx
			uiSeqLen = 2;
		}
		else
		{
			// Only 1110xxxx 10xxxxxx 10xxxxxx remains acceptable

			if( uiAvail < 3 ||
				 (pucSrc[ 0] & 0xF0) != 0xE0 ||
				 (pucSrc[ 2] & 0xC0) != 0x80)
			{
				return( RC_SET( NE_FLM_BAD_UTF8));
			}
			uiSeqLen = 3;
		}
	}

	// Copy the raw bytes, then consume them

	for( uiLoop = 0; uiLoop < uiSeqLen; uiLoop++)
	{
		pucDestBuf[ uiLoop] = pucSrc[ uiLoop];
	}

	*ppucBuf = pucSrc + uiSeqLen;
	*puiLen = uiSeqLen;
	return( NE_FLM_OK);
}
/****************************************************************************
Desc:	Validates a UTF-8 string and measures it.
		pucBuf    - string to measure (may be NULL)
		uiBufLen  - number of bytes in pucBuf, or 0 if the string is only
		            bounded by a terminating zero byte
		puiBytes  - returns the number of bytes scanned.  If the scan
		            stopped on a zero byte that byte is included in the
		            count; if it stopped because uiBufLen bytes were
		            consumed, the count is exactly uiBufLen.
		puiChars  - returns the number of characters (the terminating zero,
		            if any, is not counted)
Ret:	NE_FLM_OK, or NE_FLM_BAD_UTF8 if a malformed or truncated sequence
		is found (the outputs are not set in that case).  Only one, two and
		three byte sequences are accepted.
Notes:	NOTE(review): for a NULL pucBuf this reports 0 characters but 1
		byte (the "hit a null byte" path is taken).  Behavior is preserved
		here; confirm that callers expect it.
****************************************************************************/
FINLINE RCODE flmGetUTF8Length(
	const FLMBYTE *	pucBuf,
	FLMUINT				uiBufLen,
	FLMUINT *			puiBytes,
	FLMUINT *			puiChars)
{
	const FLMBYTE *	pucStart = pucBuf;
	const FLMBYTE *	pucEnd = uiBufLen ? (pucStart + uiBufLen) : NULL;
	const FLMBYTE *	pucCur = pucBuf;
	FLMUINT				uiChars = 0;
	FLMUINT				uiBytes;
	FLMUINT				uiSeqLen;

	if( pucCur)
	{
		for( ;;)
		{
			// Stop at the end of the buffer (if a length was given) or at
			// a zero byte, whichever comes first.

			if( pucEnd && pucCur >= pucEnd)
			{
				break;
			}

			if( !*pucCur)
			{
				break;
			}

			if( *pucCur < 0x80)
			{
				// 0xxxxxxx
				uiSeqLen = 1;
			}
			else
			{
				// A second byte must exist and be 10xxxxxx

				if( (pucEnd && pucCur + 1 >= pucEnd) ||
					 (pucCur[ 1] & 0xC0) != 0x80)
				{
					return( RC_SET( NE_FLM_BAD_UTF8));
				}

				if( (*pucCur & 0xE0) == 0xC0)
				{
					// 110xxxxx 10xxxxxx
					uiSeqLen = 2;
				}
				else
				{
					// Must be 1110xxxx 10xxxxxx 10xxxxxx

					if( (pucEnd && pucCur + 2 >= pucEnd) ||
						 (pucCur[ 0] & 0xF0) != 0xE0 ||
						 (pucCur[ 2] & 0xC0) != 0x80)
					{
						return( RC_SET( NE_FLM_BAD_UTF8));
					}
					uiSeqLen = 3;
				}
			}

			pucCur += uiSeqLen;
			uiChars++;
		}
	}

	uiBytes = (FLMUINT)(pucCur - pucStart);
	if( !pucEnd || pucCur != pucEnd)
	{
		// Did not run to the end of a counted buffer, so the scan is
		// treated as having hit a null byte - count it.

		uiBytes++;
	}

	*puiChars = uiChars;
	*puiBytes = uiBytes;
	return( NE_FLM_OK);
}
/****************************************************************************
Desc:	Converts a WP character to upper case (if possible).
		  - Values below 256: only ASCII 'a'-'z' are changed.
		  - Multinational 1, Greek and Cyrillic sets: characters inside the
		    set's fwp_caseConvertableRange have bit 0 cleared (the upper
		    case form is the even code of each pair).
		  - Otherwise: the double byte Japanese, Korean, Chinese Simplified
		    and Chinese Traditional lower case alphabet ranges are shifted
		    to the corresponding upper case ranges.
Ret:	The upper case character, or the original character if it has no
		upper case form.
****************************************************************************/
FLMUINT16 FLMAPI f_wpUpper(
	FLMUINT16	ui16WpChar)
{
	FLMBYTE	ucCharSet;
	FLMBYTE	ucChar;

	if( ui16WpChar < 256)
	{
		if( ui16WpChar < ASCII_LOWER_A || ui16WpChar > ASCII_LOWER_Z)
		{
			// Not an ASCII lower case letter - nothing to do
			return( ui16WpChar);
		}

		// Return ASCII upper case
		return( ui16WpChar & 0xdf);
	}

	ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
	ucChar = (FLMBYTE)(ui16WpChar & 0xFF);

	if( ucCharSet == F_CHSMUL1)
	{
		if( ucChar < fwp_caseConvertableRange[ (F_CHSMUL1 - 1) * 2] ||
			 ucChar > fwp_caseConvertableRange[ ((F_CHSMUL1 - 1) * 2) + 1])
		{
			return( ui16WpChar);
		}
		return( ui16WpChar & 0xFFFE);
	}

	if( ucCharSet == F_CHSGREK)
	{
		// Only the upper bound of the range is checked for this set

		if( ucChar > fwp_caseConvertableRange[ ((F_CHSGREK - 1) * 2) + 1])
		{
			return( ui16WpChar);
		}
		return( ui16WpChar & 0xFFFE);
	}

	if( ucCharSet == F_CHSCYR)
	{
		// Only the upper bound of the range is checked for this set

		if( ucChar > fwp_caseConvertableRange[ ((F_CHSCYR - 1) * 2) + 1])
		{
			return( ui16WpChar);
		}
		return( ui16WpChar & 0xFFFE);
	}

	// Possible double byte character set alphabetic character?

	if( ui16WpChar < Lower_JP_a)
	{
		return( ui16WpChar);
	}

	if( ui16WpChar <= Lower_JP_z)
	{
		// Japanese
		return( (FLMUINT16)((ui16WpChar - Lower_JP_a) + Upper_JP_A));
	}

	if( ui16WpChar >= Lower_KR_a && ui16WpChar <= Lower_KR_z)
	{
		// Korean
		return( (FLMUINT16)((ui16WpChar - Lower_KR_a) + Upper_KR_A));
	}

	if( ui16WpChar >= Lower_CS_a && ui16WpChar <= Lower_CS_z)
	{
		// Chinese Simplified
		return( (FLMUINT16)((ui16WpChar - Lower_CS_a) + Upper_CS_A));
	}

	if( ui16WpChar >= Lower_CT_a && ui16WpChar <= Lower_CT_z)
	{
		// Chinese Traditional
		return( (FLMUINT16)((ui16WpChar - Lower_CT_a) + Upper_CT_A));
	}

	// Original character was not in lower case

	return( ui16WpChar);
}
/****************************************************************************
Desc:	Checks to see if a WP character is upper case.
		  - Character set 0: everything except ASCII 'a'-'z' is upper case.
		  - Multinational 1 (characters 26-241), Greek (0-69) and Cyrillic
		    (0-199): even character codes are upper case, odd are lower.
		  - Anything else is reported as upper case.
Ret:	TRUE if upper case (or caseless), FALSE if lower case.
****************************************************************************/
FLMBOOL FLMAPI f_wpIsUpper(
	FLMUINT16	ui16WpChar)
{
	FLMBYTE	ucCharSet = (FLMBYTE) (ui16WpChar >> 8);
	FLMBYTE	ucChar = (FLMBYTE)(ui16WpChar & 0xFF);
	FLMBOOL	bHasCase;

	// ASCII character set?

	if( !ucCharSet)
	{
		if( ucChar >= ASCII_LOWER_A && ucChar <= ASCII_LOWER_Z)
		{
			return( FALSE);
		}
		return( TRUE);
	}

	// Is the character inside the case-bearing part of its set?

	if( ucCharSet == F_CHSMUL1)
	{
		bHasCase = (ucChar >= 26 && ucChar <= 241) ? TRUE : FALSE;
	}
	else if( ucCharSet == F_CHSGREK)
	{
		bHasCase = (ucChar <= 69) ? TRUE : FALSE;
	}
	else if( ucCharSet == F_CHSCYR)
	{
		bHasCase = (ucChar <= 199) ? TRUE : FALSE;
	}
	else
	{
		bHasCase = FALSE;
	}

	if( !bHasCase)
	{
		// Don't care that double ss is lower
		return( TRUE);
	}

	// Within those ranges the odd member of each pair is the lower case one

	return( (ucChar & 1) ? FALSE : TRUE);
}
/****************************************************************************
Desc:	Converts a WP character to lower case (if possible).
		  - Values below 256: only ASCII 'A'-'Z' are changed.
		  - Multinational 1, Greek and Cyrillic sets: characters inside the
		    set's fwp_caseConvertableRange have bit 0 set (the lower case
		    form is the odd code of each pair).
		  - Otherwise: the double byte Japanese, Korean, Chinese Simplified
		    and Chinese Traditional upper case alphabet ranges are shifted
		    to the corresponding lower case ranges.
Ret:	The lower case character, or the original character if it has no
		lower case form.
****************************************************************************/
FLMUINT16 FLMAPI f_wpLower(
	FLMUINT16	ui16WpChar)
{
	FLMBYTE	ucCharSet;
	FLMBYTE	ucChar;

	if( ui16WpChar < 256)
	{
		if( ui16WpChar < ASCII_UPPER_A || ui16WpChar > ASCII_UPPER_Z)
		{
			// Not an ASCII upper case letter - nothing to do
			return( ui16WpChar);
		}

		// Return ASCII lower case
		return( ui16WpChar | 0x20);
	}

	ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
	ucChar = (FLMBYTE)(ui16WpChar & 0xFF);

	if( ucCharSet == F_CHSMUL1)
	{
		if( ucChar < fwp_caseConvertableRange[ (F_CHSMUL1 - 1) * 2] ||
			 ucChar > fwp_caseConvertableRange[ ((F_CHSMUL1 - 1) * 2) + 1])
		{
			return( ui16WpChar);
		}
		return( ui16WpChar | 1);
	}

	if( ucCharSet == F_CHSGREK)
	{
		// Only the upper bound of the range is checked for this set

		if( ucChar > fwp_caseConvertableRange[ ((F_CHSGREK - 1) * 2) + 1])
		{
			return( ui16WpChar);
		}
		return( ui16WpChar | 1);
	}

	if( ucCharSet == F_CHSCYR)
	{
		// Only the upper bound of the range is checked for this set

		if( ucChar > fwp_caseConvertableRange[ ((F_CHSCYR-1) * 2) + 1])
		{
			return( ui16WpChar);
		}
		return( ui16WpChar | 1);
	}

	// Possible double byte character set alphabetic character?

	if( ui16WpChar < Upper_JP_A)
	{
		return( ui16WpChar);
	}

	if( ui16WpChar <= Upper_JP_Z)
	{
		// Japanese
		return( (FLMUINT16)(ui16WpChar - Upper_JP_A + Lower_JP_a));
	}

	if( ui16WpChar >= Upper_KR_A && ui16WpChar <= Upper_KR_Z)
	{
		// Korean
		return( (FLMUINT16)(ui16WpChar - Upper_KR_A + Lower_KR_a));
	}

	if( ui16WpChar >= Upper_CS_A && ui16WpChar <= Upper_CS_Z)
	{
		// Chinese Simplified
		return( (FLMUINT16)(ui16WpChar - Upper_CS_A + Lower_CS_a));
	}

	if( ui16WpChar >= Upper_CT_A && ui16WpChar <= Upper_CT_Z)
	{
		// Chinese Traditional
		return( (FLMUINT16)(ui16WpChar - Upper_CT_A + Lower_CT_a));
	}

	// Original character was not in upper case

	return( ui16WpChar);
}
/****************************************************************************
Desc:	Break a WP character into a base character and a diacritical
		character, using the per-character-set tables in fwp_car60_c.
		The diacritic is always returned in the same character set as the
		input.  The base is returned in the same set as well, except for
		multinational 1 characters whose bit in fwp_ml1_cb60 is clear - for
		those the base is the plain (character set 0) value from the table.
Ret:	FALSE - character was broken up, both outputs are set
		TRUE  - character cannot be broken up (unknown character set, no
		        table for the set, character outside the table, or a table
		        entry whose base is 0xFF); the outputs are not touched
****************************************************************************/
FLMBOOL FLMAPI f_breakWPChar(
	FLMUINT16		ui16WpChar,
	FLMUINT16 *		pui16BaseChar,
	FLMUINT16 *		pui16DiacriticChar)
{
	BASE_DIACRIT *	pBaseDiacritic;
	FLMINT			iTableIndex;
	FLMBYTE			ucChar = (FLMBYTE)ui16WpChar;
	FLMUINT16		ui16SetBits = (FLMUINT16)(ui16WpChar & 0xFF00);
	FLMUINT16		ui16Base;
	FLMBOOL			bBaseInSameSet;

	// Is there a table for this character set?

	if( HI(ui16WpChar) >= F_NCHSETS)
	{
		return( TRUE);
	}

	pBaseDiacritic = fwp_car60_c[ HI(ui16WpChar)];
	if( pBaseDiacritic == 0)
	{
		return( TRUE);
	}

	// Is the character covered by the table, with a usable entry?

	iTableIndex = ucChar - pBaseDiacritic->start_char;
	if( iTableIndex < 0 || iTableIndex >= pBaseDiacritic->char_count)
	{
		return( TRUE);
	}

	if( pBaseDiacritic->table [iTableIndex].base == (FLMBYTE)0xFF)
	{
		return( TRUE);
	}

	// Outside multinational 1 the base always lives in the same set as
	// the character.  Inside multinational 1 a bit map (one bit per
	// character, most significant bit first) says whether it does.

	if( HI( ui16WpChar) != F_CHSMUL1)
	{
		bBaseInSameSet = TRUE;
	}
	else if( (fwp_ml1_cb60[ ucChar >> 3] >> (7 - (ucChar & 0x07))) & 0x01)
	{
		bBaseInSameSet = TRUE;
	}
	else
	{
		// Multi-national where base is ascii value.
		bBaseInSameSet = FALSE;
	}

	ui16Base = pBaseDiacritic->table [iTableIndex].base;
	if( bBaseInSameSet)
	{
		ui16Base |= ui16SetBits;
	}

	*pui16BaseChar = ui16Base;
	*pui16DiacriticChar = ui16SetBits |
								 pBaseDiacritic->table[iTableIndex].diacrit;
	return( FALSE);
}
/****************************************************************************
Desc:	Take a base character and a diacritic and compose a WP character,
		by searching the base/diacritic table of the base character's set
		for an entry with that base and that diacritic.
		Note on base character: i's and j's must be dotless i's and j's (for
		those which use them) or they will not be found.
Ret:	TRUE  - if not found (*pui16WpChar is not modified)
		FALSE - if found (*pui16WpChar holds the composed character)
Notes:	ASCII characters with diacriticals are in multi-national 1 if
		anywhere; all other base chars with diacritics are found in their
		own sets.
		Only the low byte of the diacritic argument is used, and bit 7 of
		the diacritic stored in the table is ignored in the comparison.
****************************************************************************/
FLMBOOL FLMAPI f_combineWPChar(
	FLMUINT16 *		pui16WpChar,
	FLMUINT16		ui16BaseChar,
	FLMINT16			ui16DiacriticChar)
{
	FLMUINT					uiIndex;
	FLMUINT					uiCount;
	FLMBYTE					ucCharSet;
	FLMBYTE					ucChar;
	BASE_DIACRIT *			pBaseDiacritic;
	BASE_DIACRIT_TABLE *	pTable;

	// Is base ASCII?  If so, look in multinational 1

	ucCharSet = HI( ui16BaseChar);
	if( !ucCharSet)
	{
		ucCharSet = F_CHSMUL1;
	}

	// Is there a table for this character set?

	if( ucCharSet >= F_NCHSETS)
	{
		return( TRUE);
	}

	pBaseDiacritic = fwp_car60_c[ ucCharSet];
	if( pBaseDiacritic == 0)
	{
		return( TRUE);
	}

	ucChar = LO( ui16BaseChar);
	ui16DiacriticChar = LO( ui16DiacriticChar);
	pTable = pBaseDiacritic->table;
	uiCount = pBaseDiacritic->char_count;

	for( uiIndex = 0; uiIndex < uiCount; uiIndex++)
	{
		// Same base and same diacritic?

		if( pTable[ uiIndex].base != ucChar)
		{
			continue;
		}

		if( (pTable[ uiIndex].diacrit & 0x7F) != ui16DiacriticChar)
		{
			continue;
		}

		// The character code is the entry's position in the table,
		// offset by the first character the table covers.

		*pui16WpChar = (FLMUINT16) (((FLMUINT16) ucCharSet << 8) +
								(pBaseDiacritic->start_char +
								 (FLMUINT16)uiIndex));
		return( FALSE);
	}

	return( TRUE);
}
/**************************************************************************
Desc:	Find the collating value of a WP character for a given language.
		  - FLM_US_LANG: direct lookup in gv_pui16USCollationTable.
		  - Arabic, Farsi, Hebrew, Urdu: the Hebrew/Arabic range tables
		    (fwp_HebArabicCol60Tbl) are searched.
		  - All other languages: the language/character state table is
		    consulted first (flmGetNextCharState); if it yields a non-zero
		    value that value is returned, otherwise the general range
		    tables (fwp_col60Tbl) are searched.
		Each range table entry names a character set and points at a byte
		array laid out as: first character covered, number of characters
		covered, then one collating value per character.
Ret:	Collating value.  COLS0 is the high value returned for characters
		without a collating value; COLS0_ARABIC is returned instead for
		Hebrew/Arabic set characters when a Hebrew/Arabic language is used.
Notes:	The scan keeps going after an entry whose set matches but whose
		range does not cover the character, so a set may be listed in more
		than one entry.  The character value is therefore never modified
		while scanning; each entry is tested against the original value.
		The table terminator (key 0xFF) is checked before an entry is used.
***********************************************************************/
FLMUINT16 FLMAPI f_wpGetCollationImp(
	FLMUINT16	ui16WpChar,
	FLMUINT		uiLanguage)
{
	FLMUINT16		ui16State;
	FLMBYTE			ucCharVal;
	FLMBYTE			ucCharSet;
	FLMBOOL			bHebrewArabicFlag;
	TBL_B_TO_BP *	pColTbl;

	if( uiLanguage == FLM_US_LANG)
	{
		return( gv_pui16USCollationTable[ ui16WpChar]);
	}
	else if( uiLanguage == FLM_AR_LANG || uiLanguage == FLM_FA_LANG ||
				uiLanguage == FLM_HE_LANG || uiLanguage == FLM_UR_LANG)
	{
		pColTbl = fwp_HebArabicCol60Tbl;
		bHebrewArabicFlag = TRUE;
	}
	else
	{
		// Check if uiLanguage is a candidate for alternate double
		// collating.  A non-zero state selects the special-case language
		// row; otherwise the row shared by US and European languages
		// (START_ALL) is used.

		ui16State = flmGetNextCharState( START_COL, uiLanguage);
		ui16State = flmGetNextCharState(
								(ui16State ? ui16State : START_ALL),
								(FLMUINT) ui16WpChar);
		if( ui16State != 0)
		{
			return( ui16State);
		}

		pColTbl = fwp_col60Tbl;
		bHebrewArabicFlag = FALSE;
	}

	ucCharVal = (FLMBYTE)ui16WpChar;
	ucCharSet = (FLMBYTE)(ui16WpChar >> 8);

	for( ; pColTbl->key != 0xFF; pColTbl++)
	{
		FLMBYTE *	pucColVals;
		FLMBYTE		ucFirstChar;
		FLMBYTE		ucOffset;

		if( pColTbl->key != ucCharSet)
		{
			continue;
		}

		pucColVals = pColTbl->charPtr;

		// Above lower range of table?

		ucFirstChar = pucColVals[ 0];
		if( ucCharVal < ucFirstChar)
		{
			continue;
		}

		// Make value zero based to index.  This is done in a local so
		// that ucCharVal stays intact for any later entry of this set.

		ucOffset = (FLMBYTE)(ucCharVal - ucFirstChar);

		// Below maximum number of table entries?

		if( ucOffset < pucColVals[ 1])
		{
			// Return collated value - values follow the two header bytes.

			return( pucColVals[ 2 + ucOffset]);
		}
	}

	if( bHebrewArabicFlag)
	{
		if( ucCharSet == F_CHSHEB || ucCharSet == F_CHSARB1 ||
			 ucCharSet == F_CHSARB2)
		{
			return( COLS0_ARABIC);
		}
	}

	// Defaults for characters that don't have a collation value.

	return( COLS0);
}
/****************************************************************************
Desc:	Stream-based check for double characters that sort as one (like "ch"
		in Spanish) or a single character that should sort as two (like the
		"ae" ligature, which sorts as "ae" in French).
In:	pIStream          - stream positioned just after *puzChar
		bUnicodeStream    - TRUE if the stream holds FLMUNICODE characters,
								  FALSE if it holds UTF-8
		bAllowTwoIntoOne  - if FALSE, two-character sequences are recognized
								  (and *puzChar is case-normalized to the first
								  letter) but *puzChar2 / *pbTwoIntoOne are left
								  cleared and nothing is consumed
		puzChar           - [in/out] current character; may be replaced by
								  the first character of the collating pair
		puzChar2          - [out] second character of the pair, or 0
		pbTwoIntoOne      - [out] TRUE if two input characters are to be
								  treated as one collating element
		uiLanguage        - language ID
Ret:	NE_FLM_OK, or the error returned by the stream.  Hitting the end of
		the stream while looking ahead is not an error.  On success the
		stream is left positioned after the last character consumed.
****************************************************************************/
RCODE FLMAPI f_wpCheckDoubleCollation(
	IF_PosIStream *	pIStream,
	FLMBOOL				bUnicodeStream,
	FLMBOOL				bAllowTwoIntoOne,
	FLMUNICODE *		puzChar,
	FLMUNICODE *		puzChar2,
	FLMBOOL *			pbTwoIntoOne,
	FLMUINT				uiLanguage)
{
	RCODE			rc = NE_FLM_OK;
	FLMUINT16	ui16CurState;
	FLMUINT16	ui16WpChar;
	FLMUNICODE	uzLastChar = 0;
	FLMUNICODE	uChar = *puzChar;
	FLMUNICODE	uDummy;
	FLMBOOL		bUpperFlag;
	FLMUINT64	ui64SavePosition = pIStream->getCurrPosition();

	if (!f_unicodeToWP( *puzChar, &ui16WpChar))
	{
		ui16WpChar = UNK_UNICODE_CODE;
	}

	bUpperFlag = f_wpIsUpper( ui16WpChar);
	*pbTwoIntoOne = FALSE;
	*puzChar2 = 0;

	// Primer read - a zero state means the language has no special
	// double-character handling.

	if ((ui16CurState = flmGetNextCharState( 0, uiLanguage)) == 0)
	{
		goto Exit;
	}

	for (;;)
	{
		switch (ui16CurState)
		{
			case INSTSG:
			{
				// One character that sorts as "ss"

				*puzChar = *puzChar2 = (FLMUNICODE)f_toascii( 's');
				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case INSTAE:
			{
				if (bUpperFlag)
				{
					*puzChar = (FLMUNICODE)f_toascii( 'A');
					*puzChar2 = (FLMUNICODE)f_toascii( 'E');
				}
				else
				{
					*puzChar = (FLMUNICODE)f_toascii( 'a');
					*puzChar2 = (FLMUNICODE)f_toascii( 'e');
				}

				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case INSTIJ:
			{
				if (bUpperFlag)
				{
					*puzChar = (FLMUNICODE)f_toascii( 'I');
					*puzChar2 = (FLMUNICODE)f_toascii( 'J');
				}
				else
				{
					*puzChar = (FLMUNICODE)f_toascii( 'i');
					*puzChar2 = (FLMUNICODE)f_toascii( 'j');
				}

				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case INSTOE:
			{
				if (bUpperFlag)
				{
					*puzChar = (FLMUNICODE)f_toascii( 'O');
					*puzChar2 = (FLMUNICODE)f_toascii( 'E');
				}
				else
				{
					*puzChar = (FLMUNICODE)f_toascii( 'o');
					*puzChar2 = (FLMUNICODE)f_toascii( 'e');
				}

				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case WITHAA:
			{
				// Two characters collapse into a single replacement
				// character (0xC5 / 0xE5).

				*puzChar = (FLMUNICODE)(bUpperFlag
												? (FLMUNICODE)0xC5
												: (FLMUNICODE)0xE5);

				// Re-read (and thereby consume) the second character of the
				// pair so the saved position ends up just past it.

				if (RC_BAD( rc = pIStream->positionTo( ui64SavePosition)))
				{
					goto Exit;
				}

				if( bUnicodeStream)
				{
					rc = pIStream->read( &uDummy, sizeof( FLMUNICODE), NULL);
				}
				else
				{
					rc = f_readUTF8CharAsUnicode( pIStream, &uDummy);
				}

				if( RC_BAD( rc))
				{
					if (rc == NE_FLM_EOF_HIT)
					{
						rc = NE_FLM_OK;
					}
					else
					{
						goto Exit;
					}
				}

				ui64SavePosition = pIStream->getCurrPosition();
				break;
			}

			case AFTERC:
			{
				*puzChar = (FLMUINT16)(bUpperFlag
												? (FLMUNICODE)f_toascii( 'C')
												: (FLMUNICODE)f_toascii( 'c'));
Position_After_2nd:

				if( bAllowTwoIntoOne)
				{
					*puzChar2 = uzLastChar;
					*pbTwoIntoOne = TRUE;

					// Consume the second character of the pair.

					if (RC_BAD( rc = pIStream->positionTo( ui64SavePosition)))
					{
						goto Exit;
					}

					if( bUnicodeStream)
					{
						rc = pIStream->read( &uChar, sizeof( FLMUNICODE), NULL);
					}
					else
					{
						rc = f_readUTF8CharAsUnicode( pIStream, &uChar);
					}

					if (RC_BAD( rc))
					{
						if (rc == NE_FLM_EOF_HIT)
						{
							rc = NE_FLM_OK;
						}
						else
						{
							goto Exit;
						}
					}

					ui64SavePosition = pIStream->getCurrPosition();
				}

				goto Exit;
			}

			case AFTERH:
			{
				*puzChar = (FLMUINT16)(bUpperFlag
												? (FLMUNICODE)f_toascii( 'H')
												: (FLMUNICODE)f_toascii( 'h'));
				goto Position_After_2nd;
			}

			case AFTERL:
			{
				*puzChar = (FLMUINT16)(bUpperFlag
												? (FLMUNICODE)f_toascii( 'L')
												: (FLMUNICODE)f_toascii( 'l'));
				goto Position_After_2nd;
			}

			default:
			{
				// Handles STATE1 through STATE11 also
				break;
			}
		}

		if ((ui16CurState = flmGetNextCharState( ui16CurState,
									f_wpLower( ui16WpChar))) == 0)
		{
			break;
		}

		// Look ahead one character.  The stream is put back to
		// ui64SavePosition at Exit, so this read consumes nothing by itself.

		uzLastChar = uChar;

		if( bUnicodeStream)
		{
			rc = pIStream->read( &uChar, sizeof( FLMUNICODE), NULL);
		}
		else
		{
			rc = f_readUTF8CharAsUnicode( pIStream, &uChar);
		}

		if (RC_BAD( rc))
		{
			if (rc != NE_FLM_EOF_HIT)
			{
				goto Exit;
			}

			// End of stream.  Present a NUL to the state machine instead of
			// leaving the previous character in place; otherwise a trailing
			// first half of a pair would be evaluated a second time and be
			// mistaken for the complete pair.

			rc = NE_FLM_OK;
			uChar = 0;
			ui16WpChar = 0;
		}
		else if (!f_unicodeToWP( uChar, &ui16WpChar))
		{
			ui16WpChar = UNK_UNICODE_CODE;
		}
	}

Exit:

	if (RC_OK( rc))
	{
		rc = pIStream->positionTo( ui64SavePosition);
	}

	return( rc);
}
/****************************************************************************
Desc:	String-based check for double characters that sort as one (like "ch"
		in Spanish) or a single character that should sort as two (like the
		"ae" ligature, which sorts as "ae" in French).
In:	pui16WpChar  - [in/out] current WP character; may be replaced by the
							first character of the collating pair
		pbTwoIntoOne - [out] TRUE if two input characters are to be treated
							as one (i.e., the caller should change the collation
							to one more than the collation of the first
							character).  Set to FALSE in every other case.
		ppucInputStr - [in/out] pointer to the bytes that follow the current
							character; advanced past any character consumed here
		uiLanguage   - language ID
Ret:	0 if there is no second character to process, otherwise the second
		character of the pair.  Note that in the WITHAA case the return value
		is 0 even though *pui16WpChar is replaced and *ppucInputStr is
		advanced by one.
****************************************************************************/
FLMUINT16 FLMAPI f_wpCheckDoubleCollation(
	FLMUINT16 *			pui16WpChar,
	FLMBOOL *			pbTwoIntoOne,
	const FLMBYTE **	ppucInputStr,
	FLMUINT				uiLanguage)
{
	FLMUINT16	ui16CurState;
	FLMUINT16	ui16WpChar;
	FLMUINT16	ui16SecondChar;
	FLMUINT16	ui16LastChar = 0;
	FLMUINT		uiInLen;
	FLMBOOL		bUpperFlag;

	ui16WpChar = *pui16WpChar;
	bUpperFlag = f_wpIsUpper( ui16WpChar);
	uiInLen = 0;
	ui16SecondChar = 0;

	// Give the out-parameter a defined value on every path, as the
	// stream-based overload does.  Tolerate NULL here because the common
	// "nothing changes" path never needed to touch it.

	if (pbTwoIntoOne)
	{
		*pbTwoIntoOne = FALSE;
	}

	// Primer read - a zero state means the language has no special
	// double-character handling.

	if ((ui16CurState = flmGetNextCharState( 0, uiLanguage)) == 0)
	{
		goto Exit;
	}

	for (;;)
	{
		switch (ui16CurState)
		{
			case INSTSG:
			{
				// One character that sorts as "ss"

				*pui16WpChar = ui16SecondChar = (FLMUINT16) f_toascii( 's');
				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case INSTAE:
			{
				if (bUpperFlag)
				{
					*pui16WpChar = (FLMUINT16) f_toascii( 'A');
					ui16SecondChar = (FLMUINT16) f_toascii( 'E');
				}
				else
				{
					*pui16WpChar = (FLMUINT16) f_toascii( 'a');
					ui16SecondChar = (FLMUINT16) f_toascii( 'e');
				}

				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case INSTIJ:
			{
				if (bUpperFlag)
				{
					*pui16WpChar = (FLMUINT16) f_toascii( 'I');
					ui16SecondChar = (FLMUINT16) f_toascii( 'J');
				}
				else
				{
					*pui16WpChar = (FLMUINT16) f_toascii( 'i');
					ui16SecondChar = (FLMUINT16) f_toascii( 'j');
				}

				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case INSTOE:
			{
				if (bUpperFlag)
				{
					*pui16WpChar = (FLMUINT16) f_toascii( 'O');
					ui16SecondChar = (FLMUINT16) f_toascii( 'E');
				}
				else
				{
					*pui16WpChar = (FLMUINT16) f_toascii( 'o');
					ui16SecondChar = (FLMUINT16) f_toascii( 'e');
				}

				*pbTwoIntoOne = FALSE;
				goto Exit;
			}

			case WITHAA:
			{
				// Two characters collapse into a single replacement WP
				// character; consume the second input character.

				*pui16WpChar = (FLMUINT16) (bUpperFlag
														? (FLMUINT16) 0x122
														: (FLMUINT16) 0x123);
				(*ppucInputStr)++;
				break;
			}

			case AFTERC:
			{
				*pui16WpChar = (FLMUINT16) (bUpperFlag
														? (FLMUINT16) f_toascii( 'C')
														: (FLMUINT16) f_toascii( 'c'));
				ui16SecondChar = ui16LastChar;
				*pbTwoIntoOne = TRUE;
				(*ppucInputStr)++;
				goto Exit;
			}

			case AFTERH:
			{
				*pui16WpChar = (FLMUINT16) (bUpperFlag
														? (FLMUINT16) f_toascii( 'H')
														: (FLMUINT16) f_toascii( 'h'));
				ui16SecondChar = ui16LastChar;
				*pbTwoIntoOne = TRUE;
				(*ppucInputStr)++;
				goto Exit;
			}

			case AFTERL:
			{
				*pui16WpChar = (FLMUINT16) (bUpperFlag
														? (FLMUINT16) f_toascii( 'L')
														: (FLMUINT16) f_toascii( 'l'));
				ui16SecondChar = ui16LastChar;
				*pbTwoIntoOne = TRUE;
				(*ppucInputStr)++;
				goto Exit;
			}

			default:
			{
				// Handles STATE1 through STATE11 also
				break;
			}
		}

		if ((ui16CurState = flmGetNextCharState( ui16CurState,
									f_wpLower( ui16WpChar))) == 0)
		{
			goto Exit;
		}

		// Look ahead at the next input byte without consuming it;
		// *ppucInputStr is only advanced by the cases above.

		ui16LastChar = ui16WpChar;
		ui16WpChar = (FLMUINT16) * ((*ppucInputStr) + (uiInLen++));
	}

Exit:

	return (ui16SecondChar);
}
/****************************************************************************
Desc: Returns the collation value of the input WP character.
If in charset 11 will convert the character to Zenkaku (double wide).
In: ui16WpChar - Char to collate off of - could be in CS0..14 or x24..up
ui16NextWpChar - next WP char for CS11 voicing marks
ui16PrevColValue - previous collating value - for repeat/vowel repeat
pui16ColValue - returns 2 byte collation value
pui16SubColVal - 0, 6 or 16 bit value for the latin sub collation
or the kana size & vowel voicing
001 - set if large (upper) character
010 - set if voiced
100 - set if half voiced
pucCaseBits - returns 2 bits
Latin/Greek/Cyrillic
01 - case bit set if character is uppercase
10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx
Japanese
00 - double wide hiragana 0x255e..25b0
01 - double wide katakana 0x2600..2655
10 - double wide symbols that map to charset 11
11 - single wide katakana from charset 11
bUppercaseFlag - if set, the case bit is set even when the character
itself is not uppercase
Ret: 0 - no valid collation value
high values set for pui16ColValue
Sub-collation gets original WP character value
1 - valid collation value
2 - valid collation value and used the ui16NextWpChar
Notes: Code taken from XCH2COL.ASM - routine xch2col_f
also from CMPWS.ASM - routine getcase
The three output parameters are always written (at the Exit label),
whatever the return value.
Terms:
HANKAKU - single wide characters in charsets 0..14
ZENKAKU - double wide characters in charsets 0x24..end of kanji
KANJI - collation values are 0x2900 less than WPChar value
****************************************************************************/
FLMUINT16 flmWPAsiaGetCollation(
FLMUINT16 ui16WpChar, // WP char to get collation values
FLMUINT16 ui16NextWpChar, // Next WP char - for CS11 voicing marks
FLMUINT16 ui16PrevColValue, // Previous collating value
FLMUINT16 * pui16ColValue, // Returns collation value
FLMUINT16 * pui16SubColVal, // Returns sub-collation value
FLMBYTE * pucCaseBits, // Returns case bits value
FLMBOOL bUppercaseFlag) // Set if to convert to uppercase
{
FLMUINT16 ui16ColValue;
FLMUINT16 ui16SubColVal;
FLMBYTE ucCaseBits = 0;
FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
FLMBYTE ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF);
FLMUINT16 ui16Hankaku;
FLMUINT uiLoop;
FLMUINT16 ui16ReturnValue = 1;
ui16ColValue = ui16SubColVal = 0;
// Kanji or above
if( ucCharSet >= 0x2B)
{
// Puts 2 or above into high byte.
ui16ColValue = ui16WpChar - 0x2900;
// No subcollation or case bits need to be set
goto Exit;
}
// Single wide character? (HANKAKU)
if( ucCharSet < 11)
{
// Get the values from a non-asian character
// LATIN, GREEK or CYRILLIC
// The width bit may have been set on a jump to
// label from below.
// NOTE: this label is also the target of backward jumps from code
// below this if-block, after ui16WpChar/ucCharSet/ucCharVal have been
// replaced by a converted character.
Latin_Greek_Cyrillic:
// YES: Pass FLM_US_LANG because this is what we want -
// Prevents double character sorting.
ui16ColValue = f_wpGetCollation( ui16WpChar, FLM_US_LANG);
if (bUppercaseFlag || f_wpIsUpper( ui16WpChar))
{
// Uppercase - set case bit
ucCaseBits |= SET_CASE_BIT;
}
// Character for which there is no collation value?
if( ui16ColValue == COLS0)
{
ui16ReturnValue = 0;
if( !f_wpIsUpper( ui16WpChar))
{
// Convert to uppercase
// (presumably relies on the uppercase form being the lowercase
// value minus one - TODO confirm against the WP character sets)
ui16WpChar--;
}
ui16ColValue = 0xFFFF;
ui16SubColVal = ui16WpChar;
}
else if( ucCharSet) // Don't bother with ascii
{
if( !f_wpIsUpper( ui16WpChar))
{
// Convert to uppercase
ui16WpChar--;
}
if( ucCharSet == F_CHSMUL1)
{
FLMUINT16 ui16Base;
FLMUINT16 ui16Diacritic;
// f_breakWPChar() returning FALSE means the character was split
// into base + diacritic; otherwise the whole character is used
// as the sub-collation value.
ui16SubColVal = !f_breakWPChar( ui16WpChar, &ui16Base,
&ui16Diacritic)
? fwp_dia60Tbl[ ui16Diacritic & 0xFF]
: ui16WpChar;
}
else if( ucCharSet == F_CHSGREK)
{
if( ui16WpChar >= 0x834 || // [8,52] or above
ui16WpChar == 0x804 || // [8,4] BETA Medial | Terminal
ui16WpChar == 0x826) // [8,38] SIGMA terminal
{
ui16SubColVal = ui16WpChar;
}
}
else if( ucCharSet == F_CHSCYR)
{
if( ui16WpChar >= 0xA90) // [10, 144] or above
{
ui16SubColVal = ui16WpChar; // Dup collation values
}
}
// else don't need a sub collation value
}
goto Exit;
}
// Single wide Japanese character?
if( ucCharSet == 11)
{
FLMUINT16 ui16KanaChar;
// Convert charset 11 to Zenkaku (double wide) CS24 or CS26 hex.
// All characters in charset 11 will convert to CS24 or CS26.
// when combining the collation and the sub-collation values.
if( f_wpHanToZenkaku( ui16WpChar,
ui16NextWpChar, &ui16KanaChar ) == 2)
{
// Return 2
ui16ReturnValue++;
}
ucCaseBits |= SET_WIDTH_BIT; // Set so will allow to go back
ui16WpChar = ui16KanaChar; // If in CS24 will fall through to ZenKaku
ucCharSet = (FLMBYTE)(ui16KanaChar >> 8);
ucCharVal = (FLMBYTE)(ui16KanaChar & 0xFF);
}
if( ui16WpChar < 0x2400)
{
// In some other character set
goto Latin_Greek_Cyrillic;
}
else if( ui16WpChar >= 0x255e && // Hiragana?
ui16WpChar <= 0x2655) // Katakana?
{
if( ui16WpChar >= 0x2600)
{
ucCaseBits |= SET_KATAKANA_BIT;
}
// HIRAGANA & KATAKANA
// Kana contains both hiragana and katakana.
// The tables contain the same characters in same order
if( ucCharSet == 0x25)
{
// Change value to be in character set 26
ucCharVal -= 0x5E;
}
ui16ColValue = 0x0100 + KanaColTbl[ ucCharVal ];
ui16SubColVal = KanaSubColTbl[ ucCharVal ];
goto Exit;
}
// ZenKaku - means any double wide character
// Hankaku - single wide character
// Inputs: 0x2400..2559 symbols..latin - Zenkaku
// 0x265B..2750 greek..cyrillic - Zenkaku
// SET_WIDTH_BIT may have been set if original char
// was in 11 and got converted to CS24. [1,2,5,27(extendedVowel),53,54]
// Original chars from CS11 will have some collation value that when
// combined with the sub-collation value will format a character in
// CS24. The width bit will then convert back to CS11.
if( (ui16Hankaku = f_wpZenToHankaku( ui16WpChar, NULL)) != 0)
{
if( (ui16Hankaku >> 8) != 11) // if CharSet11 was a CS24 symbol
{
ui16WpChar = ui16Hankaku; // May be CS24 symbol/latin/gk/cy
ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF);
ucCaseBits |= SET_WIDTH_BIT; // Latin symbols double wide
goto Latin_Greek_Cyrillic;
}
}
// 0x2400..0x24bc Japanese symbols that cannot be converted to Hankaku.
// All 6 original symbol chars from 11 will also be here.
// First try to find a collation value of the symbol.
// The sub-collation value will be the position in the CS24 table + 1.
// NOTE(review): the lookup below compares only ucCharVal; the character
// set is not re-checked here - confirm that only CS24 characters can
// reach this point.
for( uiLoop = 0;
uiLoop < (sizeof( fwp_Ch24ColTbl) / sizeof( BYTE_WORD_TBL));
uiLoop++ )
{
if( ucCharVal == fwp_Ch24ColTbl[ uiLoop].ByteValue)
{
if( (ui16ColValue = fwp_Ch24ColTbl[ uiLoop].WordValue) < 0x100)
{
// Don't save for chuuten, dakuten, handakuten
ui16SubColVal = (FLMUINT16)(uiLoop + 1);
}
break;
}
}
// Still zero: either no table entry matched or the entry's value is 0.
if( !ui16ColValue)
{
// Now see if it's a repeat or repeat-vowel character
if( (((ucCharVal >= 0x12) && (ucCharVal <= 0x15)) ||
(ucCharVal == 0x17) ||
(ucCharVal == 0x18)) &&
((ui16PrevColValue >> 8) == 1))
{
// Repeat mark after a kana: reuse the previous collation value
ui16ColValue = ui16PrevColValue;
// Store original WP character
ui16SubColVal = ui16WpChar;
}
else if( (ucCharVal == 0x1B) && // repeat vowel?
(ui16PrevColValue >= 0x100) &&
(ui16PrevColValue < COLS_ASIAN_MARKS)) // Previous kana char?
{
ui16ColValue = 0x0100 + KanaColToVowel[ ui16PrevColValue & 0xFF ];
// Store original WP character
ui16SubColVal = ui16WpChar;
}
else
{
ui16ReturnValue = 0;
ui16ColValue = 0xFFFF; // No collation value
ui16SubColVal = ui16WpChar; // Never have changed if gets here
}
}
Exit:
// Set return values
*pui16ColValue = ui16ColValue;
*pui16SubColVal = ui16SubColVal;
*pucCaseBits = ucCaseBits;
return( ui16ReturnValue);
}
/****************************************************************************
Desc:	Convert a zenkaku (double wide) char to a hankaku (single wide) char
In:	ui16WpChar                - zenkaku WP character (character sets
											 0x24..0x27 are handled)
		pui16DakutenOrHandakuten  - [out, may be NULL] receives the voicing
											 mark (0xB3D dakuten / 0xB3E handakuten)
											 that must follow the returned character
											 set 11 katakana.  Only written when such
											 a mark is needed; otherwise left untouched.
Ret:	Hankaku char or 0 if a conversion doesn't exist
Notes:	Taken from CHAR.ASM - zen2han_f routine
****************************************************************************/
FLMUINT16 FLMAPI f_wpZenToHankaku(
	FLMUINT16	ui16WpChar,
	FLMUINT16 * pui16DakutenOrHandakuten)
{
	FLMUINT16	ui16Hankaku = 0;
	FLMBYTE		ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
	FLMBYTE		ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF);
	FLMUINT		uiLoop;

	switch( ucCharSet)
	{
		// SYMBOLS

		case 0x24:
		{
			for( uiLoop = 0;
				  uiLoop < (sizeof( Zen24ToHankaku) / sizeof( BYTE_WORD_TBL));
				  uiLoop++)
			{
				// The list is sorted by ByteValue, so the search is over as
				// soon as a table entry is at or beyond the character value.

				if( Zen24ToHankaku[ uiLoop].ByteValue >= ucCharVal)
				{
					if( Zen24ToHankaku[ uiLoop].ByteValue == ucCharVal)
					{
						ui16Hankaku = Zen24ToHankaku[ uiLoop].WordValue;
					}
					break;
				}
			}
			break;
		}

		// ROMAN - 0x250F..2559
		// Hiragana - 0x255E..2580

		case 0x25:
		{
			if( ucCharVal >= 0x0F && ucCharVal < 0x5E)
			{
				// Maps onto character set 0 (0x0F -> 0x30, i.e. '0')

				ui16Hankaku = ucCharVal + 0x21;
			}
			break;
		}

		// Katakana - 0x2600..2655
		// Greek - uppercase from 0x265E, lowercase from 0x267E, up to 0x2695

		case 0x26:
		{
			if( ucCharVal <= 0x55)		// Katakana range
			{
				FLMBYTE		ucCS11CharVal;
				FLMUINT16	ui16NextWpChar = 0;

				if( (ucCS11CharVal = MapCS26ToCharSet11[ ucCharVal ]) != 0xFF)
				{
					// Bit 0x80 flags a voiced kana; bit 0x40 then selects
					// handakuten over dakuten.  The low six bits are the
					// character set 11 value.

					if( ucCS11CharVal & 0x80)
					{
						if( ucCS11CharVal & 0x40)
						{
							// Handakuten voicing

							ui16NextWpChar = 0xB3E;
						}
						else
						{
							// Dakuten voicing

							ui16NextWpChar = 0xB3D;
						}
						ucCS11CharVal &= 0x3F;
					}

					ui16Hankaku = 0x0b00 + ucCS11CharVal;
					if( ui16NextWpChar && pui16DakutenOrHandakuten)
					{
						*pui16DakutenOrHandakuten = ui16NextWpChar;
					}
				}
			}
			else if( ucCharVal >= 0x5E && ucCharVal <= 0x95)	// Greek
			{
				// The lower bound matters: values 0x56..0x5D would wrap
				// around in the unsigned subtraction below and produce a
				// bogus non-zero result instead of "no conversion".

				FLMBYTE	ucGreekChar = ucCharVal;

				// Make a zero based number.

				ucGreekChar -= 0x5E;

				// Check for lowercase

				if( ucGreekChar >= 0x20)
				{
					// Convert to upper case for now

					ucGreekChar -= 0x20;
				}

				// Step over the two character set 8 pairs that have no
				// zenkaku letter ([8,4] and [8,38]).

				if( ucGreekChar >= 2)
				{
					ucGreekChar++;
				}

				if (ucGreekChar >= 19)
				{
					ucGreekChar++;
				}

				// Convert to character set 8

				ui16Hankaku = (ucGreekChar << 1) + 0x800;
				if( ucCharVal >= (0x5E + 0x20))
				{
					// Adjust to lower case character

					ui16Hankaku++;
				}
			}
			break;
		}

		// Cyrillic

		case 0x27:
		{
			// Uppercase?

			if( ucCharVal <= 0x20)
			{
				ui16Hankaku = (ucCharVal << 1) + 0xa00;
			}
			else if( ucCharVal >= 0x30 && ucCharVal <= 0x50)
			{
				// Lower case

				ui16Hankaku = ((ucCharVal - 0x30) << 1) + 0xa01;
			}
			break;
		}
	}

	return( ui16Hankaku);
}
/****************************************************************************
Desc:	Convert a WPChar from hankaku (single wide) to zenkaku (double wide).
			1) Used to see if a char in CS11 can map to a double wide character
			2) Used to convert keys into original data.
In:	ui16WpChar      - hankaku WP character to convert
		ui16NextWpChar  - character that follows it; only examined for
								character set 11 katakana, where a dakuten (0xB3D)
								or handakuten (0xB3E) may be folded into the result
		pui16Zenkaku    - [out] zenkaku character, or 0 if no conversion
Ret:	0 = no conversion
		1 = converted character to zenkaku
		2 = ui16NextWpChar dakuten or handakuten voicing got combined
Notes:	Taken from char.asm - han2zen()
			From8ToZen could be taken out and placed in code.
****************************************************************************/
FLMUINT16 FLMAPI f_wpHanToZenkaku(
	FLMUINT16	ui16WpChar,
	FLMUINT16	ui16NextWpChar,
	FLMUINT16 *	pui16Zenkaku)
{
	FLMUINT16	ui16Zenkaku = 0;
	FLMBYTE		ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
	FLMBYTE		ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF);
	FLMUINT		uiLoop;
	FLMUINT16	ui16CharsUsed = 1;

	switch( ucCharSet)
	{
		// Character set 0 - symbols

		case 0:
		{
			// Invalid? - all others are used.

			if( ucCharVal < 0x20)
			{
				;
			}
			else if( ucCharVal <= 0x2F)
			{
				// Symbols A

				ui16Zenkaku = 0x2400 + From0AToZen[ ucCharVal - 0x20 ];
			}
			else if( ucCharVal <= 0x39)
			{
				// 0..9

				ui16Zenkaku = 0x2500 + (ucCharVal - 0x21);
			}
			else if( ucCharVal <= 0x40)
			{
				// Symbols B

				ui16Zenkaku = 0x2400 + From0BToZen[ ucCharVal - 0x3A ];
			}
			else if( ucCharVal <= 0x5A)
			{
				// A..Z

				ui16Zenkaku = 0x2500 + (ucCharVal - 0x21);
			}
			else if( ucCharVal <= 0x60)
			{
				// Symbols C

				ui16Zenkaku = 0x2400 + From0CToZen[ ucCharVal - 0x5B ];
			}
			else if( ucCharVal <= 0x7A)
			{
				// a..z

				ui16Zenkaku = 0x2500 + (ucCharVal - 0x21);
			}
			else if( ucCharVal <= 0x7E)
			{
				// Symbols D

				ui16Zenkaku = 0x2400 + From0DToZen[ ucCharVal - 0x7B ];
			}
			break;
		}

		// GREEK

		case 8:
		{
			// Out of table, or a table value of 0xFF, means no conversion.

			if( (ucCharVal >= sizeof( From8ToZen)) ||
				 ((ui16Zenkaku = 0x2600 + From8ToZen[ ucCharVal ]) == 0x26FF))
			{
				ui16Zenkaku = 0;
			}
			break;
		}

		// CYRILLIC

		case 10:
		{
			// Check range.  Only [10,0]..[10,0x41] have a zenkaku form:
			// f_wpZenToHankaku() produces uppercase from 0x2700..0x2720 and
			// lowercase from 0x2730..0x2750.  Without this check larger
			// values would alias onto those same zenkaku characters.

			if( ucCharVal <= 0x41)
			{
				ui16Zenkaku = 0x2700 + (ucCharVal >> 1);		// Uppercase value

				// Convert to lower case?

				if( ucCharVal & 0x01)
				{
					ui16Zenkaku += 0x30;
				}
			}
			break;
		}

		// JAPANESE

		case 11:
		{
			if( ucCharVal < 5)
			{
				ui16Zenkaku = 0x2400 + From11AToZen[ ucCharVal];
			}
			else if( ucCharVal < 0x3D)		// katakana?
			{
				if( (ui16Zenkaku = 0x2600 +
							From11BToZen[ ucCharVal - 5 ]) == 0x26FF)
				{
					// Dash - convert to this

					ui16Zenkaku = 0x241b;
				}
				else
				{
					if( ui16NextWpChar == 0xB3D)		// dakuten? - voicing
					{
						// First check exception(s) then
						// check if voicing exists! - will NOT access out of table

						if( (ui16Zenkaku != 0x2652) &&		// is not 'N'?
							 (KanaSubColTbl[ ui16Zenkaku - 0x2600 + 1 ] == 3))
						{
							ui16Zenkaku++;

							// Return 2

							ui16CharsUsed++;
						}
					}
					else if( ui16NextWpChar == 0xB3E)	// handakuten? - voicing
					{
						// Check if voicing exists! - will NOT access out of table

						if( KanaSubColTbl [ui16Zenkaku - 0x2600 + 2 ] == 5)
						{
							ui16Zenkaku += 2;

							// Return 2

							ui16CharsUsed++;
						}
					}
				}
			}
			else if( ucCharVal == 0x3D)		// dakuten?
			{
				// Convert to voicing symbol

				ui16Zenkaku = 0x240A;
			}
			else if( ucCharVal == 0x3E)		// handakuten?
			{
				// Convert to voicing symbol

				ui16Zenkaku = 0x240B;
			}

			// else cannot convert

			break;
		}

		// Other character sets
		// CS 1,4,5,6 - symbols

		default:
		{
			// Look in the Zen24ToHankaku table for a matching value

			for( uiLoop = 0;
				  uiLoop < (sizeof( Zen24ToHankaku) / sizeof( BYTE_WORD_TBL));
				  uiLoop++)
			{
				if( Zen24ToHankaku[ uiLoop].WordValue == ui16WpChar)
				{
					ui16Zenkaku = 0x2400 + Zen24ToHankaku[ uiLoop].ByteValue;
					break;
				}
			}
			break;
		}
	}

	if( !ui16Zenkaku)
	{
		// Change return value

		ui16CharsUsed = 0;
	}

	*pui16Zenkaku = ui16Zenkaku;
	return( ui16CharsUsed);
}
/****************************************************************************
Desc:	Converts a 2-byte language code into its corresponding language ID
In:	pszLanguage - two-character language code.  Only the first two
						  characters are examined; the comparison against
						  f_langtbl is exact (case-sensitive).
Ret:	Index of the matching pair in f_langtbl, or FLM_US_LANG if the code
		is NULL, empty, or not found.
****************************************************************************/
FLMUINT FLMAPI f_languageToNum(
	const char *	pszLanguage)
{
	FLMBYTE		ucFirstChar;
	FLMBYTE		ucSecondChar;
	FLMUINT		uiTablePos;

	// A NULL or empty string cannot name a language.  Checking first also
	// keeps us from reading the byte after the terminator of an empty
	// string.

	if( !pszLanguage || !pszLanguage [0])
	{
		return( FLM_US_LANG);
	}

	ucFirstChar = (FLMBYTE)pszLanguage [0];
	ucSecondChar = (FLMBYTE)pszLanguage [1];

	// f_langtbl holds FLM_LAST_LANG consecutive two-byte codes.

	for( uiTablePos = 0;
		  uiTablePos < (FLM_LAST_LANG + FLM_LAST_LANG); uiTablePos += 2)
	{
		if( f_langtbl [uiTablePos] == ucFirstChar &&
			 f_langtbl [uiTablePos+1] == ucSecondChar)
		{
			return( uiTablePos >> 1);
		}
	}

	// Language not found, return default US language

	return( FLM_US_LANG);
}
/****************************************************************************
Desc:	Converts a language ID to its corresponding 2-byte language code
In:	iLangNum    - language ID; anything outside 0..FLM_LAST_LANG-1
						  (including negative values) is treated as FLM_US_LANG
		pszLanguage - [out] receives the two code characters followed by a
						  terminating zero (three bytes are written)
****************************************************************************/
void FLMAPI f_languageToStr(
	FLMINT	iLangNum,
	char *	pszLanguage)
{
	FLMINT	iTablePos;

	// Fall back to US for any ID that is not a valid table index.

	if( !(iLangNum >= 0 && iLangNum < FLM_LAST_LANG))
	{
		iLangNum = FLM_US_LANG;
	}

	// Each language occupies two consecutive bytes in f_langtbl.

	iTablePos = iLangNum * 2;

	pszLanguage [0] = (char)f_langtbl [iTablePos];
	pszLanguage [1] = (char)f_langtbl [iTablePos + 1];
	pszLanguage [2] = 0;
}
/***************************************************************************
Desc:	Return the sub-collation value of a WP character.  Unconverted
		unicode values always have a sub-collation value of
		11110+UnicodeChar
In:	ui16WPValue  - WP character value
		ui16ColValue - collation value already determined for the character
							(consulted for multinational and Arabic characters)
		uiLanguage   - WP language ID
Ret:	0 for ASCII (values up to 127).  For everything else the result
		starts out as the WP value as passed in and is then refined per
		character set as described in the code below.
***************************************************************************/
FLMUINT16 flmWPGetSubCol(
	FLMUINT16		ui16WPValue,
	FLMUINT16		ui16ColValue,
	FLMUINT			uiLanguage)
{
	FLMUINT16	ui16Result;
	FLMUINT16	ui16Folded;
	FLMUINT16	ui16BaseChar;
	FLMBYTE		ucValue;
	FLMBYTE		ucSet;

	// ASCII never needs a sub-collation value.

	if (ui16WPValue <= 127)
	{
		return( 0);
	}

	// Unless a rule below says otherwise, the answer is the character
	// exactly as it was passed in.

	ui16Result = ui16WPValue;
	ucValue = (FLMBYTE) ui16WPValue;
	ucSet = (FLMBYTE) (ui16WPValue >> 8);

	// Case is recorded elsewhere, so work with the case-folded character
	// (clearing the low bit of anything that is not uppercase).  This keeps
	// e.g. "ETA" from sorting ahead of "eta" on sub-collation alone.

	ui16Folded = ui16WPValue;
	if (!f_wpIsUpper( ui16WPValue))
	{
		ui16Folded &= ~1;
	}

	if (ucSet == F_CHSMUL1)
	{
		// Multinational 1 is the only set for which characters can be
		// split into base + diacritic.  On success the diacritic is left
		// in ui16Result.

		if (f_breakWPChar( ui16Folded, &ui16BaseChar, &ui16Result))
		{
			// Could not be broken down, so it could not be recombined when
			// converting a key back either.  If the character does have a
			// real collation value (anything other than COLS0), report no
			// sub-collation so that things like upper and lower AE
			// digraphs compare properly; otherwise keep what we have.

			if (ui16ColValue != COLS0)
			{
				ui16Result = 0;
			}
			return( ui16Result);
		}

		// Diacritic sub-collation value.  For these four languages the
		// umlaut has to sort after ring-above.

		if ((ui16Result & 0xFF) == F_UMLAUT &&
			 (uiLanguage == FLM_SU_LANG ||
			  uiLanguage == FLM_SV_LANG ||
			  uiLanguage == FLM_CZ_LANG ||
			  uiLanguage == FLM_SL_LANG))
		{
			ui16Result = (FLMUINT16)(fwp_dia60Tbl[ F_RING] + 1);
		}
		else
		{
			ui16Result = (FLMUINT16)(fwp_dia60Tbl[ ui16Result & 0xFF]);
		}
	}
	else if (ucSet == F_CHSGREK)
	{
		// Values 52 and up, plus [8,4] BETA medial/terminal and
		// [8,38] SIGMA terminal, use the case-folded character.

		if (ucValue >= 52 ||
			 ui16Folded == 0x804 ||
			 ui16Folded == 0x826)
		{
			ui16Result = ui16Folded;
		}
	}
	else if (ucSet == F_CHSCYR)
	{
		if (ucValue >= 144)
		{
			ui16Result = ui16Folded;
		}
	}
	else if (ucSet == F_CHSHEB)
	{
		// Hebrew has three sections:
		//		0..26		main characters
		//		27..83	accents that appear over the previous character
		//		84..118	dagesh (ancient) Hebrew with accents
		// The ancient forms are only used for sayings and scriptures, so
		// they get a collation value and the actual character is kept
		// here, sub-collation being in character order.

		if (ucValue >= 84)
		{
			ui16Result = ui16Folded;
		}
	}
	else if (ucSet == F_CHSARB1)
	{
		// Arabic 1 sections:
		//		00..37		accents displayed OVER a previous character
		//		38..46		symbols
		//		47..57		numbers
		//		58..163		characters
		//		164			hamzah accent
		//		165..180		common characters with accents
		//		181..193		ligatures - common character combinations
		//		194..195		extensions - thrown away when sorting

		if (ucValue <= 46)
		{
			ui16Result = ui16Folded;
		}
		else if (ui16ColValue == COLS10a+1)		// Alef?
		{
			if (ucValue >= 165)
			{
				ui16Result = (FLMUINT16)(fwp_alefSubColTbl[ ucValue - 165 ]);
			}
			else
			{
				ui16Result = (FLMUINT16)7;			// Alef subcol value
			}
		}
		else if (ucValue >= 181)					// Ligatures
		{
			ui16Result = ui16Folded;
		}
		else if (ucValue == 64)						// taa exception
		{
			ui16Result = 8;
		}
	}
	else if (ucSet == F_CHSARB2)
	{
		// Some Arabic 2 characters share a collation slot; from value 64
		// upward a bit table says which ones need the character kept.

		if ((ucValue >= 64) &&
			 (fwp_ar2BitTbl[(ucValue-64)>> 3] & (0x80 >> (ucValue&0x07))))
		{
			ui16Result = ui16Folded;
		}
	}

	return( ui16Result);
}
/*****************************************************************************
Desc: Reads the next collatable character from the underlying stream and
returns its Unicode value, wildcard flag, collation value,
sub-collation value and case bits.
Characters for which f_convertChar() returns 0 are skipped. Runs of
spaces are handled according to m_uiCompareRules
(FLM_COMP_COMPRESS_WHITESPACE, FLM_COMP_IGNORE_TRAILING_SPACE,
FLM_COMP_IGNORE_LEADING_SPACE). When m_bMayHaveWildCards is set, a
backslash escapes the character that follows it.
In: bAllowTwoIntoOne - passed through to f_wpCheckDoubleCollation()
puChar - [out, may be NULL] character that was processed
pbCharIsWild - [out, may be NULL] TRUE if the character is an
unescaped wildcard and wildcards are enabled
pui16Col - [out, may be NULL] collation value
pui16SubCol - [out, may be NULL] sub-collation value
pucCase - [out, may be NULL] case bits; always 0 when
m_bCaseSensitive is FALSE
Ret: NE_FLM_OK, NE_FLM_EOF_HIT when there is nothing more to return, or
another error from the stream.
******************************************************************************/
RCODE F_CollIStream::read(
FLMBOOL bAllowTwoIntoOne,
FLMUNICODE * puChar,
FLMBOOL * pbCharIsWild,
FLMUINT16 * pui16Col,
FLMUINT16 * pui16SubCol,
FLMBYTE * pucCase)
{
RCODE rc = NE_FLM_OK;
FLMUNICODE uChar;
FLMUINT16 ui16WpChar;
FLMUINT16 ui16NextWpChar;
FLMUINT16 ui16Col;
FLMUINT16 ui16SubCol;
FLMBOOL bTwoIntoOne;
FLMBYTE ucCase;
FLMBOOL bAsian;
FLMBOOL bLastCharWasSpace = FALSE;
FLMUINT64 ui64AfterLastSpacePos = 0;
FLMUINT64 ui64CurrCharPos = 0;
if (pbCharIsWild)
{
*pbCharIsWild = FALSE;
}
// Is this a double-byte (Asian) character set?
bAsian = (m_uiLanguage >= FLM_FIRST_DBCS_LANG &&
m_uiLanguage <= FLM_LAST_DBCS_LANG)
? TRUE
: FALSE;
// Get the next character from the stream
GetNextChar:
// Reset the per-character working values each time around.
ui16WpChar = 0;
ui16NextWpChar = 0;
ui16Col = 0;
ui16SubCol = 0;
bTwoIntoOne = FALSE;
ucCase = 0;
// A character left over from the previous call takes precedence over
// reading from the stream.
if (m_uNextChar)
{
uChar = m_uNextChar;
m_uNextChar = 0;
}
else
{
ui64CurrCharPos = m_pIStream->getCurrPosition();
if( RC_BAD( rc = readCharFromStream( &uChar)))
{
if (rc != NE_FLM_EOF_HIT)
{
goto Exit;
}
// If we were skipping spaces, we need to
// process a single space character, unless we are
// ignoring trailing white space.
if (bLastCharWasSpace &&
!(m_uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE))
{
// bLastCharWasSpace flag can only be TRUE if either
// FLM_COMP_IGNORE_TRAILING_SPACE is set or
// FLM_COMP_COMPRESS_WHITESPACE is set.
flmAssert( m_uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE);
uChar = ASCII_SPACE;
rc = NE_FLM_OK;
goto Process_Char;
}
goto Exit;
}
}
// A zero result from f_convertChar() means the character is to be
// skipped under the current compare rules.
if ((uChar = f_convertChar( uChar, m_uiCompareRules)) == 0)
{
goto GetNextChar;
}
// Deal with spaces
if (uChar == ASCII_SPACE)
{
if (m_uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE)
{
// Remember the position after the most recent space of the run.
bLastCharWasSpace = TRUE;
ui64AfterLastSpacePos = m_pIStream->getCurrPosition();
goto GetNextChar;
}
else if (m_uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE)
{
if (!bLastCharWasSpace)
{
bLastCharWasSpace = TRUE;
// Save where we are at so that if this doesn't turn out
// to be trailing spaces, we can restore this position.
ui64AfterLastSpacePos = m_pIStream->getCurrPosition();
}
goto GetNextChar;
}
}
else
{
// First non-space character: record where the leading spaces ended and
// stop treating spaces as leading from here on.
if (m_uiCompareRules & FLM_COMP_IGNORE_LEADING_SPACE)
{
m_ui64EndOfLeadingSpacesPos = ui64CurrCharPos;
m_uiCompareRules &= (~(FLM_COMP_IGNORE_LEADING_SPACE));
}
// If the last character was a space, we need to process it.
if (bLastCharWasSpace)
{
// Position back to after the last space, and process a space
// character.
if (RC_BAD( rc = m_pIStream->positionTo( ui64AfterLastSpacePos)))
{
goto Exit;
}
uChar = ASCII_SPACE;
bLastCharWasSpace = FALSE;
}
else if (uChar == ASCII_BACKSLASH)
{
// If wildcards are allowed, the backslash should be treated
// as an escape character, and the next character is the one
// we want. Otherwise, it should be treated as
// the actual character we want returned.
if (m_bMayHaveWildCards)
{
// Got a backslash. Means the next character is to be taken
// no matter what because it is escaped.
if (RC_BAD( rc = readCharFromStream( &uChar)))
{
if (rc != NE_FLM_EOF_HIT)
{
goto Exit;
}
// A trailing backslash stands for itself.
rc = NE_FLM_OK;
uChar = ASCII_BACKSLASH;
}
}
}
else if (uChar == ASCII_WILDCARD)
{
if (m_bMayHaveWildCards && pbCharIsWild)
{
*pbCharIsWild = TRUE;
}
}
}
Process_Char:
if (!bAsian)
{
// Must check for double characters if non-US and non-Asian
// character set
if (m_uiLanguage != FLM_US_LANG)
{
if (RC_BAD( rc = f_wpCheckDoubleCollation(
m_pIStream, m_bUnicodeStream, bAllowTwoIntoOne,
&uChar, &m_uNextChar, &bTwoIntoOne, m_uiLanguage)))
{
goto Exit;
}
}
}
else
{
// Asian languages: read one character ahead into m_uNextChar so it can
// be handed to flmWPAsiaGetCollation() below.
if (RC_BAD( rc = readCharFromStream( &m_uNextChar)))
{
if (rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
m_uNextChar = 0;
}
else
{
RC_UNEXPECTED_ASSERT( rc);
goto Exit;
}
}
}
// Convert each character to its WP equivalent
if (!f_unicodeToWP( uChar, &ui16WpChar))
{
ui16WpChar = 0;
}
if (!f_unicodeToWP( m_uNextChar, &ui16NextWpChar))
{
ui16NextWpChar = 0;
}
// If we have an unconvertible UNICODE character, the collation
// value for it will be COLS0
if (!ui16WpChar)
{
if (!bAsian)
{
ui16Col = COLS0;
}
else
{
// Asian: values below 0x20 get the "no collation" value with the
// character as sub-collation; anything else collates as itself.
if (uChar < 0x20)
{
ui16Col = 0xFFFF;
ui16SubCol = uChar;
}
else
{
ui16Col = uChar;
ui16SubCol = 0;
}
}
}
else
{
if (!bAsian)
{
ui16Col = f_wpGetCollation( ui16WpChar, m_uiLanguage);
if (bTwoIntoOne)
{
// Since two characters were merged into one, increment
// the collation value by one. In the case of something
// like 'ch', there is a collation value between 'c' and
// 'd'. f_wpGetCollation would have returned the
// collation value for 'c' ... incrementing by one gives
// us the proper collation value for 'ch' (i.e., the
// collation value between 'c' and 'd').
ui16Col++;
}
}
else
{
// NOTE(review): ui16Col is passed as the "previous collation value",
// but it was reset to 0 at GetNextChar and has not been assigned on
// this path, so the callee always sees 0 here - confirm whether the
// previous character's collation was meant to be carried over.
if (flmWPAsiaGetCollation( ui16WpChar, ui16NextWpChar, ui16Col,
&ui16Col, &ui16SubCol, &ucCase, !m_bCaseSensitive) == 2)
{
// Next character was consumed by collation
m_uNextChar = 0;
}
}
}
if (pui16Col)
{
*pui16Col = ui16Col;
}
// Consume m_uNextChar if two characters merged into one
if (bTwoIntoOne)
{
m_uNextChar = 0;
}
// Subcollation
if( pui16SubCol)
{
if( uChar > 127 && !bAsian)
{
// Characters with no WP equivalent use the Unicode value itself.
ui16SubCol = ui16WpChar
? flmWPGetSubCol( ui16WpChar, ui16Col, m_uiLanguage)
: uChar;
if( !m_bCaseSensitive)
{
// If the sub-collation value is the original
// character, it means that the collation could not
// distinguish the characters and sub-collation is being
// used to do it. However, this creates a problem when the
// characters are the same character except for case. In that
// scenario, we incorrectly return a not-equal when we are
// doing a case-insensitive comparison. So, at this point,
// we need to use the sub-collation for the upper-case of the
// character instead of the sub-collation for the character
// itself.
if( ui16WpChar && ui16SubCol == ui16WpChar)
{
ui16SubCol = flmWPGetSubCol(
f_wpUpper( ui16WpChar),
ui16Col, m_uiLanguage);
}
}
}
*pui16SubCol = ui16SubCol;
}
// Case
if( pucCase)
{
if (!m_bCaseSensitive)
{
*pucCase = 0;
}
else
{
// For Asian languages ucCase keeps whatever flmWPAsiaGetCollation()
// returned (or 0 if it was not called).
if (!bAsian && ui16WpChar)
{
// f_wpIsUpper() returns FALSE if the character is lower or
// TRUE if the character is not lower case.
if( f_wpIsUpper( ui16WpChar))
{
if( bTwoIntoOne)
{
if( f_wpIsUpper( ui16NextWpChar))
{
// Both characters of the pair are uppercase
ucCase = 0x03;
}
else
{
// Only the first character of the pair is uppercase.
// NOTE(review): 0x10 sits oddly beside 0x03 and 0x01; it
// looks like binary "10" (0x02) may have been intended -
// confirm before changing, since this affects ordering.
ucCase = 0x10;
}
}
else
{
ucCase = 0x01;
}
}
}
*pucCase = ucCase;
}
}
if (puChar)
{
*puChar = uChar;
}
Exit:
return( rc);
}
/***************************************************************************
Desc: Compare two entire strings, each supplied as a collation stream.
Characters are read one at a time from each stream and compared by
collation value, then sub-collation value, then case value.
A wildcard read from either stream is resolved by recursively
comparing what follows the wildcard against every remaining
suffix of the other stream.
*piResult is set to -1 (left sorts first), 0 (equal / match) or
1 (left sorts last). If a read or reposition fails with anything
other than NE_FLM_EOF_HIT, that error code is returned.
uiLanguage is not examined by this routine itself; it is only
passed through to the recursive calls.
****************************************************************************/
RCODE FLMAPI f_compareCollStreams(
IF_CollIStream * pLStream, // Left-hand stream
IF_CollIStream * pRStream, // Right-hand stream
FLMBOOL bOpIsMatch, // TRUE disables two-into-one collation
FLMUINT uiLanguage, // Only forwarded to recursive calls
FLMINT * piResult) // Returns -1, 0 or 1
{
RCODE rc = NE_FLM_OK;
FLMUINT16 ui16RCol;
FLMUINT16 ui16LCol;
FLMUINT16 ui16RSubCol;
FLMUINT16 ui16LSubCol;
FLMBYTE ucRCase;
FLMBYTE ucLCase;
F_CollStreamPos savedRPos;
F_CollStreamPos savedLPos;
F_CollStreamPos startLPos;
FLMUNICODE uLChar = 0;
FLMBOOL bLCharIsWild = FALSE;
FLMUNICODE uRChar = 0;
FLMBOOL bRCharIsWild = FALSE;
FLMBOOL bPrevLWasWild = FALSE;
FLMBOOL bPrevRWasWild = FALSE;
FLMBOOL bAllowTwoIntoOne;
// If we are doing a "match" operation, we don't want two
// character sequences like Ch, ae, etc. turned into a single
// collation, because then matches that involve wildcards
// like "aetna == a*" would not match properly.
// When not doing a match operation, we WANT two character sequences
// turned into a single collation value so that we can know if
// something is > or <. When doing match operations, all we care
// about is if they are equal or not, so there is no need to look
// at double character collation properties.
bAllowTwoIntoOne = bOpIsMatch ? FALSE : TRUE;
for( ;;)
{
GetNextLChar:
// Remember whether the character read on the previous pass was a wildcard.
if( bLCharIsWild)
{
bPrevLWasWild = TRUE;
}
// Save where this left character starts; the right-wildcard handling
// below rewinds to this position so the character is included in the
// recursive comparison.
pLStream->getCurrPosition( &startLPos);
if( RC_BAD( rc = pLStream->read(
bAllowTwoIntoOne,
&uLChar, &bLCharIsWild, &ui16LCol, &ui16LSubCol, &ucLCase)))
{
if( rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
// If the last character was a wildcard, we have a match!
if( bPrevLWasWild)
{
*piResult = 0;
goto Exit;
}
// Left is exhausted. The strings are equal only if the right
// stream holds nothing but wildcards from here on.
for( ;;)
{
if( RC_BAD( rc = pRStream->read(
bAllowTwoIntoOne,
&uRChar, &bRCharIsWild, &ui16RCol, &ui16RSubCol, &ucRCase)))
{
if( rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
*piResult = 0;
}
goto Exit;
}
// Break out when we hit a non-wild character
if( !bRCharIsWild)
{
break;
}
}
// Right still has a real character, so left is the shorter string.
*piResult = -1;
}
// Any other read error is returned as is.
goto Exit;
}
if( bLCharIsWild)
{
// Consume multiple wildcards
if( bPrevLWasWild)
{
goto GetNextLChar;
}
// See if we match anywhere on the remaining right string
for( ;;)
{
// Both positions are saved so that each attempt compares the same
// left remainder against a right remainder that starts one
// character later than the previous attempt.
pRStream->getCurrPosition( &savedRPos);
pLStream->getCurrPosition( &savedLPos);
if( RC_BAD( rc = f_compareCollStreams( pLStream, pRStream,
bOpIsMatch, uiLanguage, piResult)))
{
goto Exit;
}
// A zero result means the remainder matched - we are done.
if( !(*piResult))
{
goto Exit;
}
if( RC_BAD( rc = pRStream->positionTo( &savedRPos)))
{
goto Exit;
}
// Skip one right-hand character; hitting EOF here means every
// starting position has been tried without a match.
if( RC_BAD( rc = pRStream->read(
bAllowTwoIntoOne,
NULL, NULL, NULL, NULL, NULL)))
{
if( rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
break;
}
goto Exit;
}
if( RC_BAD( rc = pLStream->positionTo( &savedLPos)))
{
goto Exit;
}
}
// No suffix of the right stream matched.
*piResult = 1;
goto Exit;
}
GetNextRChar:
// Remember whether the character read on the previous pass was a wildcard.
if( bRCharIsWild)
{
bPrevRWasWild = TRUE;
}
if( RC_BAD( rc = pRStream->read(
bAllowTwoIntoOne,
&uRChar, &bRCharIsWild, &ui16RCol, &ui16RSubCol, &ucRCase)))
{
if( rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
// If the last character was a wildcard, we have a match!
if( bPrevRWasWild)
{
*piResult = 0;
}
else
{
// Right ran out while left still had a character.
*piResult = 1;
}
}
goto Exit;
}
if( bRCharIsWild)
{
// Consume multiple wildcards
if( bPrevRWasWild)
{
goto GetNextRChar;
}
// See if we match anywhere on the remaining left string
// Rewind the left stream so the character read above takes part
// in the comparison.
if( RC_BAD( rc = pLStream->positionTo( &startLPos)))
{
goto Exit;
}
for( ;;)
{
pLStream->getCurrPosition( &savedLPos);
pRStream->getCurrPosition( &savedRPos);
if( RC_BAD( rc = f_compareCollStreams( pLStream, pRStream,
bOpIsMatch, uiLanguage, piResult)))
{
goto Exit;
}
// A zero result means the remainder matched - we are done.
if( !(*piResult))
{
goto Exit;
}
if( RC_BAD( rc = pRStream->positionTo( &savedRPos)))
{
goto Exit;
}
if( RC_BAD( rc = pLStream->positionTo( &savedLPos)))
{
goto Exit;
}
// Skip the character we just processed
if( RC_BAD( rc = pLStream->read(
bAllowTwoIntoOne,
NULL, NULL, NULL, NULL, NULL)))
{
if( rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
break;
}
goto Exit;
}
}
// No suffix of the left stream matched.
*piResult = -1;
goto Exit;
}
// Neither character is a wildcard: compare collation first, then
// sub-collation, then case. If all three agree, loop for the next pair.
if( ui16LCol != ui16RCol)
{
*piResult = ui16LCol < ui16RCol ? -1 : 1;
goto Exit;
}
else if( ui16LSubCol != ui16RSubCol)
{
*piResult = ui16LSubCol < ui16RSubCol ? -1 : 1;
goto Exit;
}
else if( ucLCase != ucRCase)
{
// NOTE: If we are doing a case insensitive comparison,
// ucLCase and ucRCase should be equal (both will have been
// set to zero)
*piResult = ucLCase < ucRCase ? -1 : 1;
goto Exit;
}
}
Exit:
return( rc);
}
/***************************************************************************
Desc:	Apply the comparison rules in uiCompareRules to a single character.
		Returns the character to use in its place: the original character,
		an ASCII space, or 0 when the character is to be dropped.
****************************************************************************/
FLMUNICODE FLMAPI f_convertChar(
	FLMUNICODE		uzChar,
	FLMUINT			uiCompareRules)
{
	FLMUNICODE		uzResult = uzChar;

	if (uzChar == ASCII_SPACE ||
		 (uzChar == ASCII_UNDERSCORE &&
		  (uiCompareRules & FLM_COMP_NO_UNDERSCORES)) ||
		 (f_isWhitespace( uzChar) &&
		  (uiCompareRules & FLM_COMP_WHITESPACE_AS_SPACE)))
	{
		// The character counts as a space.  It is removed entirely when
		// either of the space-stripping rules is on, otherwise it is
		// normalized to a plain ASCII space.

		if (uiCompareRules &
			 (FLM_COMP_NO_WHITESPACE | FLM_COMP_IGNORE_LEADING_SPACE))
		{
			uzResult = (FLMUNICODE)0;
		}
		else
		{
			uzResult = (FLMUNICODE)ASCII_SPACE;
		}
	}
	else if (uzChar == ASCII_DASH && (uiCompareRules & FLM_COMP_NO_DASHES))
	{
		// Dashes are removed when the no-dashes rule is on.

		uzResult = (FLMUNICODE)0;
	}

	return( uzResult);
}
/****************************************************************************
Desc:	Called by ftkStartup, this routine initializes the Unicode to
		WP and WP to Unicode mapping tables, and the US collation table
		(one FLMUINT16 entry for each of the 0x10000 possible WP characters).
Ret:	NE_FLM_OK on success.
		NE_FLM_FAILURE if any of the tables is already allocated; in that
		case the existing tables are left untouched.
		Any error returned by f_calloc; in that case every table allocated
		by this call is freed and the min/max bounds are reset to zero.
****************************************************************************/
RCODE f_initCharMappingTables( void)
{
	RCODE				rc = NE_FLM_OK;
	FLMUINT16 *		puStaticPtr;
	FLMUINT			uiLoop;
	FLMUINT			uiEntries;
	FLMUINT			uiOffset;

	if( gv_pUnicodeToWP60 || gv_pWP60ToUnicode || gv_pui16USCollationTable)
	{
		// The tables already exist.  Return directly instead of going
		// through the error cleanup at Exit: that cleanup frees the global
		// tables, which would pull live tables out from under whoever
		// initialized them.

		rc = RC_SET_AND_ASSERT( NE_FLM_FAILURE);
		return( rc);
	}

	gv_uiMinUniChar = 0;
	gv_uiMaxUniChar = 0;
	gv_uiMinWPChar = 0;
	gv_uiMaxWPChar = 0;

	// Make an initial pass over the table to determine
	// what our allocation sizes will need to be.
	// Each table entry is a pair: [0] = Unicode value, [1] = WP value.
	// A minimum of zero means "not set yet", which is why neither value
	// is allowed to be zero.

	for( uiLoop = 0, puStaticPtr = (FLMUINT16 *)WP_UTOWP60;
		uiLoop < UTOWP60_ENTRIES;
		uiLoop++, puStaticPtr += 2)
	{
		// Unicode

		if( (FLMUINT)puStaticPtr[ 0] < gv_uiMinUniChar ||
			 !gv_uiMinUniChar)
		{
			flmAssert( puStaticPtr[ 0] != 0);
			gv_uiMinUniChar = (FLMUINT)puStaticPtr[ 0];
		}

		if( (FLMUINT)puStaticPtr[ 0] > gv_uiMaxUniChar)
		{
			gv_uiMaxUniChar = (FLMUINT)puStaticPtr[ 0];
		}

		// WordPerfect

		if( (FLMUINT)puStaticPtr[ 1] < gv_uiMinWPChar ||
			 !gv_uiMinWPChar)
		{
			flmAssert( puStaticPtr[ 1] != 0);
			gv_uiMinWPChar = (FLMUINT)puStaticPtr[ 1];
		}

		if( (FLMUINT)puStaticPtr[ 1] > gv_uiMaxWPChar)
		{
			gv_uiMaxWPChar = (FLMUINT)puStaticPtr[ 1];
		}
	}

	// Allocate the Unicode table

	uiEntries = (gv_uiMaxUniChar - gv_uiMinUniChar) + 1;
	if (RC_BAD( rc = f_calloc( uiEntries * sizeof( FLMUINT16),
		&gv_pUnicodeToWP60)))
	{
		goto Exit;
	}

	// Populate the Unicode table (indexed by Unicode value - minimum)

	for( uiLoop = 0, puStaticPtr = (FLMUINT16 *)WP_UTOWP60;
		uiLoop < UTOWP60_ENTRIES; uiLoop++, puStaticPtr += 2)
	{
		uiOffset = (FLMUINT)puStaticPtr[ 0] - gv_uiMinUniChar;

		// Each slot is expected to be filled at most once

		flmAssert( gv_pUnicodeToWP60[ uiOffset] == 0);
		gv_pUnicodeToWP60[ uiOffset] = puStaticPtr[ 1];
	}

	// Allocate the WordPerfect table

	uiEntries = (gv_uiMaxWPChar - gv_uiMinWPChar) + 1;
	if (RC_BAD( rc = f_calloc( uiEntries * sizeof( FLMUINT16),
		&gv_pWP60ToUnicode)))
	{
		goto Exit;
	}

	// Populate the WordPerfect table (indexed by WP value - minimum)

	for( uiLoop = 0, puStaticPtr = (FLMUINT16 *)WP_UTOWP60;
		uiLoop < UTOWP60_ENTRIES; uiLoop++, puStaticPtr += 2)
	{
		uiOffset = (FLMUINT)puStaticPtr[ 1] - gv_uiMinWPChar;

		// Each slot is expected to be filled at most once

		flmAssert( gv_pWP60ToUnicode[ uiOffset] == 0);
		gv_pWP60ToUnicode[ uiOffset] = puStaticPtr[ 0];
	}

	// Allocate the US collation mapping table

	uiEntries = 0x10000;
	if (RC_BAD( rc = f_calloc( uiEntries * sizeof( FLMUINT16),
		&gv_pui16USCollationTable)))
	{
		goto Exit;
	}

	// Populate the US collation mapping table.  The loop index is the
	// WP character: high byte = character set, low byte = character.

	for( uiLoop = 0; uiLoop < uiEntries; uiLoop++)
	{
		FLMBYTE			ucCharVal = (FLMBYTE)uiLoop;
		FLMBYTE			ucCharSet = (FLMBYTE)(uiLoop >> 8);
		TBL_B_TO_BP *	pColTbl = fwp_col60Tbl;

		do
		{
			if( pColTbl->key == ucCharSet)
			{
				// The per-set data is laid out as:
				// [lowest collated char][number of entries][collation values...]

				FLMBYTE *	pucColVals = pColTbl->charPtr;

				// Check if the value is in the range of collated chars
				// Above lower range of table?

				if( ucCharVal >= *pucColVals)
				{
					// Make value zero based to index
					// NOTE(review): ucCharVal is changed in place, so a second
					// table entry with the same key would be searched with the
					// rebased value - presumably keys are unique; confirm
					// against fwp_col60Tbl.

					ucCharVal -= *pucColVals++;

					// Below maximum number of table entries?

					if( ucCharVal < *pucColVals++)
					{
						// Store the collated value.

						gv_pui16USCollationTable[ uiLoop] = pucColVals[ ucCharVal];
						break;
					}
				}
			}

			// Go to next table entry

			pColTbl++;

		} while( pColTbl->key != 0xFF);

		// Reaching the 0xFF terminator means no collation value was found

		if( pColTbl->key == 0xFF)
		{
			gv_pui16USCollationTable[ uiLoop] = COLS0;
		}
	}

Exit:

	// On failure, release whatever this call managed to allocate

	if( RC_BAD( rc))
	{
		if( gv_pUnicodeToWP60)
		{
			f_free( &gv_pUnicodeToWP60);
		}

		if( gv_pWP60ToUnicode)
		{
			f_free( &gv_pWP60ToUnicode);
		}

		if( gv_pui16USCollationTable)
		{
			f_free( &gv_pui16USCollationTable);
		}

		gv_uiMinUniChar = 0;
		gv_uiMaxUniChar = 0;
		gv_uiMinWPChar = 0;
		gv_uiMaxWPChar = 0;
	}

	return( rc);
}
/****************************************************************************
Desc:	Called by ftkShutdown, this routine frees the Unicode to WP, the
		WP to Unicode and the US collation mapping tables, and resets the
		recorded character ranges to zero.
****************************************************************************/
void f_freeCharMappingTables( void)
{
	// Only tables that are actually set are handed to f_free

	if( gv_pUnicodeToWP60)
	{
		f_free( &gv_pUnicodeToWP60);
	}

	if( gv_pWP60ToUnicode)
	{
		f_free( &gv_pWP60ToUnicode);
	}

	if( gv_pui16USCollationTable)
	{
		f_free( &gv_pui16USCollationTable);
	}

	// With the tables gone the recorded bounds no longer mean anything

	gv_uiMinUniChar = gv_uiMaxUniChar = 0;
	gv_uiMinWPChar = gv_uiMaxWPChar = 0;
}
/**************************************************************************
Desc:		Restore the original case of an upper-case WP string using a
			lower/upper bit string (one bit per character, most significant
			bit first).  A character whose bit is clear is converted to lower
			case; a character whose bit is set is left alone.
Out:		WP characters that have been modified to their original case
Ret:		Number of bytes used in the lower/upper buffer
Notes:	For Greek the stored bits are inverted before they are used.
***************************************************************************/
FLMUINT FLMAPI f_wpToMixed(
	FLMBYTE *			pucWPStr,			// Existing WP string to modify
	FLMUINT				uiWPStrLen,			// Length of the WP string in bytes
	const FLMBYTE *	pucLowUpBitStr,	// Lower/upper case bit string
	FLMUINT				uiLang)
{
	// Two bytes per WP character

	const FLMUINT	uiCharCount = uiWPStrLen >> 1;

	// Greek has uppercase first, so its case bits are stored reversed

	const FLMBYTE	ucInvert = (uiLang == FLM_GR_LANG)
										? (FLMBYTE)0xFF
										: (FLMBYTE)0;
	FLMBYTE			ucCaseBits = 0;
	FLMUINT			uiIdx;

	for( uiIdx = 0; uiIdx < uiCharCount; uiIdx++, pucWPStr += 2)
	{
		FLMUINT	uiWpChar;

		// Every eighth character starts a new byte of case bits

		if( (uiIdx & 0x07) == 0)
		{
			ucCaseBits = (FLMBYTE)(ucInvert ^ pucLowUpBitStr[ uiIdx >> 3]);
		}

		// Bit set: character is upper case and stays as it is

		if( ucCaseBits & (0x80 >> (uiIdx & 0x07)))
		{
			continue;
		}

		// Convert to lower case - COLL -> WP is already in upper case

		uiWpChar = (FLMUINT) FB2UW( pucWPStr);

		if( uiWpChar >= ASCII_UPPER_A && uiWpChar <= ASCII_UPPER_Z)
		{
			uiWpChar |= 0x20;
		}
		else
		{
			const FLMBYTE	ucCharVal = (FLMBYTE)( uiWpChar & 0xFF);
			const FLMBYTE	ucCharSet = (FLMBYTE)( uiWpChar >> 8);
			FLMBOOL			bInCasedRegion = FALSE;

			// Check if the character is within the cased region of its
			// character set

			switch( ucCharSet)
			{
				case F_CHSMUL1:
				{
					if( ucCharVal >= 26 && ucCharVal <= 241)
					{
						bInCasedRegion = TRUE;
					}
					break;
				}

				case F_CHSGREK:
				{
					if( ucCharVal <= 69)
					{
						bInCasedRegion = TRUE;
					}
					break;
				}

				case F_CHSCYR:
				{
					if( ucCharVal <= 199)
					{
						bInCasedRegion = TRUE;
					}
					break;
				}

				default:
				{
					break;
				}
			}

			if( bInCasedRegion)
			{
				// Set the bit ... don't increment!

				uiWpChar |= 0x01;
			}
		}

		UW2FBA( (FLMUINT16)uiWpChar, pucWPStr);
	}

	return( bytesInBits( uiCharCount));
}
/****************************************************************************
Desc: Convert a text string to a collated string.
The output is built in this order: collation bytes, an optional
F_COLL_FIRST_SUBSTRING byte, an optional F_COLL_TRUNCATED byte,
an optional sub-collation section (marker byte followed by the
packed sub-collation bits) and finally the case section (marker
byte, followed by the packed case bits when mixed case was seen).
If NE_FLM_CONV_DEST_OVERFLOW is returned the string is truncated as
best as it can be. The caller must decide to return the error up
or deal with the truncation.
NOTE(review): nothing in this routine sets
NE_FLM_CONV_DEST_OVERFLOW itself; truncation detected here is
reported through pbDataTruncated and the F_COLL_TRUNCATED byte.
VISIT: If the string is EXACTLY the length of the truncation
length then it should, but doesn't, set the truncation flag.
The code didn't match the design intent. Fix next major
version.
Notes: *puiCollatedStrLen is written at Exit on every path, including
error returns. *pbOriginalCharsLost is only ever set to TRUE
(when a double character / digraph is encoded); it is never
cleared here.
****************************************************************************/
RCODE flmUTF8ToColText(
IF_PosIStream * pIStream,
FLMBYTE * pucCollatedStr, // Returns collated string
FLMUINT * puiCollatedStrLen, // Returns total collated string length
// Input is maximum bytes in buffer
FLMBOOL bCaseInsensitive, // Set if to convert to uppercase
FLMUINT * puiCollationLen, // Returns the collation bytes length
FLMUINT * puiCaseLen, // Returns length of case bytes
FLMUINT uiLanguage, // Language
FLMUINT uiCharLimit, // Max number of characters in this key piece
FLMBOOL bFirstSubstring, // TRUE is this is the first substring key
FLMBOOL bDataTruncated, // TRUE if data is coming in truncated.
FLMBOOL * pbOriginalCharsLost,
FLMBOOL * pbDataTruncated)
{
RCODE rc = NE_FLM_OK;
FLMUINT16 ui16Base; // Value of the base character
FLMUINT16 ui16SubColVal; // Sub-collated value (diacritic)
FLMUINT uiLength; // Temporary variable for length
FLMUINT uiTargetColLen = *puiCollatedStrLen - 8; // 4=ovhd,4=worse char
// Need to increase the buffer sizes to not overflow.
// Characters without COLL values will take up 3 bytes in
// the ucSubColBuf[] and easily overflow the buffer.
// Hard coded the values so as to minimize changes.
FLMBYTE ucSubColBuf[ MAX_SUBCOL_BUF + 301]; // Holds sub-collated values(diac)
FLMBYTE ucCaseBits[ MAX_CASE_BYTES + 81]; // Holds case bits
FLMUINT16 ui16WpChr; // Current WP character
FLMUNICODE uChar = 0; // Current unconverted Unicode character
FLMUNICODE uChar2;
FLMUINT16 ui16WpChr2; // 2nd character if any; default 0 for US lang
FLMUINT uiColLen; // Return value of collated length
FLMUINT uiSubColBitPos; // Sub-collation bit position
FLMUINT uiCaseBitPos; // Case bit position
FLMUINT uiFlags; // Clear all bit flags
FLMBOOL bHebrewArabic = FALSE; // Set if language is hebrew, arabic, farsi
FLMBOOL bTwoIntoOne = FALSE;
FLMUINT uiUppercaseFlag;
uiColLen = 0;
uiSubColBitPos = 0;
uiCaseBitPos = 0;
uiFlags = 0;
ui16WpChr2 = 0;
// We don't want any single key piece to "pig out" more
// than 256 bytes of the key
// NOTE(review): uiTargetColLen is unsigned, so an input length below 8
// wraps to a huge value and is then clamped here as well - confirm that
// callers always pass at least 8.
if( uiTargetColLen > 256 - 8)
{
uiTargetColLen = 256 - 8;
}
// Code below sets ucSubColBuf[] and ucCaseBits[] values to zero.
if (uiLanguage != FLM_US_LANG)
{
if (uiLanguage == FLM_AR_LANG || // Arabic
uiLanguage == FLM_FA_LANG || // Farsi - persian
uiLanguage == FLM_HE_LANG || // Hebrew
uiLanguage == FLM_UR_LANG) // Urdu
{
bHebrewArabic = TRUE;
}
}
// One pass of this loop handles one input character (or one double
// character / digraph).
for (;;)
{
// Set the case bits and sub-collation bits to zero when
// on the first bit of the byte.
if (!(uiCaseBitPos & 0x07))
{
ucCaseBits [uiCaseBitPos >> 3] = 0;
}
if (!(uiSubColBitPos & 0x07))
{
ucSubColBuf [uiSubColBitPos >> 3] = 0;
}
ui16SubColVal = 0; // Default sub-collation value
// Get the next character from the string.
if( RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
{
if (rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
break;
}
goto Exit;
}
// f_wpCheckDoubleCollation is given uChar, uChar2 and bTwoIntoOne to
// update. The code below relies on uChar2 being zero when there is no
// second character, and on bTwoIntoOne telling a double character
// (two characters sharing one collation value) apart from a digraph
// (one character expanded into two).
// NOTE(review): exact semantics live in f_wpCheckDoubleCollation, which
// is not visible here - confirm there.
if (uiLanguage != FLM_US_LANG)
{
if( RC_BAD( rc = f_wpCheckDoubleCollation(
pIStream, FALSE, TRUE, &uChar, &uChar2, &bTwoIntoOne, uiLanguage)))
{
goto Exit;
}
if (!f_unicodeToWP( uChar, &ui16WpChr))
{
ui16WpChr = UNK_UNICODE_CODE;
}
if (uChar2)
{
if (!f_unicodeToWP( uChar2, &ui16WpChr2))
{
ui16WpChr2 = UNK_UNICODE_CODE;
}
}
else
{
ui16WpChr2 = 0;
}
}
else
{
// Convert the character to its WP equivalent
if( !f_unicodeToWP( uChar, &ui16WpChr))
{
ui16WpChr = UNK_UNICODE_CODE;
}
}
// Save the case bit if not case-insensitive
if (!bCaseInsensitive)
{
// charIsUpper returns TRUE if upper case, 0 if lower case.
if (!charIsUpper( ui16WpChr))
{
uiFlags |= F_HAD_LOWER_CASE;
}
else
{
// Set if upper case.
setBit( ucCaseBits, uiCaseBitPos);
}
uiCaseBitPos++;
}
// Handle non-collating characters with subcollating values,
// Get the collated value from the WP character-if not collating value
// The collation byte is stored unconditionally; the Hebrew/Arabic
// branch below takes it back out again.
if ((pucCollatedStr[ uiColLen++] =
(FLMBYTE)(f_wpGetCollation( ui16WpChr, uiLanguage))) >= COLS11)
{
FLMUINT uiTemp;
// If lower case, convert to upper case.
if (!charIsUpper( ui16WpChr))
{
ui16WpChr &= ~1;
}
// No collating value given for this WP char.
// Save original WP char (2 bytes) in subcollating
// buffer.
// 1110 is a new code that will store an insert over
// the character OR a non-convertable unicode character.
// Store with the same alignment as "store_extended_char"
// below.
// 11110 is code for unmappable UNICODE value.
// A value 0xFE will be the collation value. The sub-collation
// value will be 0xFFFF followed by the UNICODE value.
// Be sure to eat an extra case bit.
// See specific Hebrew and Arabic comments in the
// switch statement below.
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
if (bHebrewArabic && (pucCollatedStr[ uiColLen - 1] == COLS0_ARABIC))
{
// Store first bit of 1110, fall through & store remaining 3 bits
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
// Don't store collation value
uiColLen--;
}
else if( uChar)
{
// Store the Unicode value itself rather than the WP value.
// uChar is cleared once it has been captured.
ui16WpChr = uChar;
uChar = 0;
// Store 11 out of 11110
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
if (!bCaseInsensitive)
{
ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;
// Set upper case bit.
setBit( ucCaseBits, uiCaseBitPos);
uiCaseBitPos++;
}
}
// This label is also the target of the goto statements in the switch
// further down, for characters that have a collation value but must
// still be stored in full.
store_extended_char:
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
ucSubColBuf [(uiSubColBitPos + 16) >> 3] = 0;
uiFlags |= F_HAD_SUB_COLLATION;
// Set 110 bits in sub-collation - continued from above.
// No need to explicitly set the zero, but must increment
// for it.
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos += 2;
// store_aligned_word: This label is not referenced.
// Go to the next byte boundary to write the character.
uiSubColBitPos = (uiSubColBitPos + 7) & (~7);
uiTemp = bytesInBits( uiSubColBitPos);
// Need to big-endian - so it will sort correctly.
ucSubColBuf [uiTemp] = (FLMBYTE)(ui16WpChr >> 8);
ucSubColBuf [uiTemp + 1] = (FLMBYTE)(ui16WpChr);
uiSubColBitPos += 16;
ucSubColBuf [uiSubColBitPos >> 3] = 0;
}
else
{
// Had a collation value
// Add the lower/uppercase bit if a mixed case output.
// If not lower ASCII set - check diacritic value for sub-collation
if( !(ui16WpChr & 0xFF00))
{
// ASCII character set - set a single 0 bit - just need to
// increment to do this.
uiSubColBitPos++;
}
else
{
FLMBYTE ucChar = (FLMBYTE)ui16WpChr;
FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChr >> 8);
// Convert char to uppercase because case information
// is stored above. This will help
// ensure that the "ETA" doesn't sort before "eta"
// Note that ucChar was captured before this conversion.
if( !charIsUpper( ui16WpChr))
{
ui16WpChr &= ~1;
}
switch( ucCharSet)
{
case F_CHSMUL1: // Multinational 1
{
// If we cannot break down a char into base and
// diacritic we cannot combine the character
// later when converting back the key. In that case,
// write the entire WP char in the sub-collation area.
if( f_breakWPChar( ui16WpChr, &ui16Base, &ui16SubColVal))
{
goto store_extended_char;
}
// Write the FLAIM diacritic sub-collation value.
// Prefix is 2 bits "10". Remember to leave
// "111" alone for the future.
// NOTE: The "umlaut" character must sort after the "ring"
// character.
ui16SubColVal = ((ui16SubColVal & 0xFF) == F_UMLAUT &&
(uiLanguage == FLM_SU_LANG ||
uiLanguage == FLM_SV_LANG ||
uiLanguage == FLM_CZ_LANG ||
uiLanguage == FLM_SL_LANG))
? (FLMUINT16)(fwp_dia60Tbl[ F_RING] + 1)
: (FLMUINT16)(fwp_dia60Tbl[ ui16SubColVal & 0xFF]);
// Also reached by goto from the Arabic 1 case with ui16SubColVal
// already set.
store_sub_col:
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf[ (uiSubColBitPos + 8) >> 3] = 0;
uiFlags |= F_HAD_SUB_COLLATION;
// Set the 10 bits - no need to explicitly set the zero, but
// must increment for it.
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos += 2;
// Set sub-collation bits.
setBits( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal);
uiSubColBitPos += 5;
break;
}
case F_CHSGREK: // Greek
{
if (ucChar >= 52 || // Keep case bit for 52-69 else ignore
ui16WpChr == 0x804 || // [ 8,4] BETA Medial | Terminal
ui16WpChr == 0x826) // [ 8,38] SIGMA terminal
{
goto store_extended_char;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
}
case F_CHSCYR:
{
if (ucChar >= 144)
{
goto store_extended_char;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
// Georgian covers 208-249 - no collation defined yet
break;
}
case F_CHSHEB: // Hebrew
{
// Three sections in Hebrew:
// 0..26 - main characters
// 27..83 - accents that appear over previous character
// 84..118- dagesh (ancient) hebrew with accents
// Because the ancient is only used for sayings & scriptures
// we will support a collation value and in the sub-collation
// store the actual character because sub-collation is in
// character order.
if (ucChar >= 84) // Save ancient - value 84 and above
{
goto store_extended_char;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
}
case F_CHSARB1: // Arabic 1
{
// Three sections in Arabic:
// 00..37 - accents that display OVER a previous character
// 38..46 - symbols
// 47..57 - numbers
// 58..163 - characters
// 164 - hamzah accent
// 165..180- common characters with accents
// 181..193- ligatures - common character combinations
// 194..195- extensions - throw away when sorting
if( ucChar <= 46)
{
goto store_extended_char; // save original character
}
if( pucCollatedStr[ uiColLen - 1] == COLS10a + 1) // Alef?
{
ui16SubColVal = (ucChar >= 165)
? (FLMUINT16)(fwp_alefSubColTbl[ ucChar - 165 ])
: (FLMUINT16)7; // Alef subcol value
goto store_sub_col;
}
if (ucChar >= 181) // Ligatures - char combination
{
goto store_extended_char; // save original character
}
if (ucChar == 64) // taa exception
{
ui16SubColVal = 8;
goto store_sub_col;
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
}
case F_CHSARB2: // Arabic 2
{
// There are some characters that share the same slot
// Check the bit table if above character 64
if (ucChar >= 64 &&
fwp_ar2BitTbl[(ucChar-64)>> 3] & (0x80 >> (ucChar&0x07)))
{
goto store_extended_char; // Will save original
}
// No subcollation to worry about - set a zero bit by
// incrementing the bit position.
uiSubColBitPos++;
break;
}
default:
{
// Increment bit position to set a zero bit.
uiSubColBitPos++;
break;
}
}
}
// Now let's worry about double character sorting
if (ui16WpChr2)
{
if (pbOriginalCharsLost)
{
*pbOriginalCharsLost = TRUE;
}
// Set the next byte that follows in the sub collation buffer.
ucSubColBuf[ (uiSubColBitPos + 7) >> 3] = 0;
if (bTwoIntoOne)
{
// Sorts after character in ui16WpChr after call to
// f_wpCheckDoubleCollation
// Write the char 2 times so lower/upper bits are correct.
// Could write infinite times because of collation rules.
// The collation byte already stored is incremented in place and
// the incremented value is then stored a second time.
pucCollatedStr[ uiColLen] = ++pucCollatedStr[ uiColLen - 1];
uiColLen++;
// If original was upper case, set one more upper case bit
if( !bCaseInsensitive)
{
ucCaseBits[ (uiCaseBitPos + 7) >> 3] = 0;
if( !charIsUpper( ui16WpChr2))
{
uiFlags |= F_HAD_LOWER_CASE;
}
else
{
setBit( ucCaseBits, uiCaseBitPos);
}
uiCaseBitPos++;
}
// Take into account the diacritical space
uiSubColBitPos++;
}
else
{
// We have a digraph, get second collation value
pucCollatedStr[ uiColLen++] =
(FLMBYTE)(f_wpGetCollation( ui16WpChr2, uiLanguage));
// Normal case, assume no diacritics set
uiSubColBitPos++;
// If first was upper, set one more upper bit.
if( !bCaseInsensitive)
{
ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;
if (charIsUpper( ui16WpChr))
{
setBit( ucCaseBits, uiCaseBitPos);
}
uiCaseBitPos++;
// no need to reset the uiFlags
}
}
}
}
// Check to see if uiColLen is at some overflow limit.
if (uiColLen >= uiCharLimit ||
uiColLen + bytesInBits( uiSubColBitPos) +
bytesInBits( uiCaseBitPos) >= uiTargetColLen)
{
// We hit the maximum number of characters. See if we hit the
// end of the string.
// One more character is read purely to find out whether any input
// is left over; its value is not used.
if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
{
if (rc == NE_FLM_EOF_HIT)
{
rc = NE_FLM_OK;
}
else
{
goto Exit;
}
}
else
{
bDataTruncated = TRUE;
}
break;
}
}
// The collation length is reported before any marker bytes are added.
if (puiCollationLen)
{
*puiCollationLen = uiColLen;
}
// Add the first substring marker - also serves as making the string non-null.
if (bFirstSubstring)
{
pucCollatedStr[ uiColLen++] = F_COLL_FIRST_SUBSTRING;
}
if (bDataTruncated)
{
pucCollatedStr[ uiColLen++ ] = F_COLL_TRUNCATED;
}
// Return NOTHING if no values found
if (!uiColLen && !uiSubColBitPos)
{
if (puiCaseLen)
{
*puiCaseLen = 0;
}
goto Exit;
}
// Store extra zero bit in the sub-collation area for Hebrew/Arabic
if (bHebrewArabic)
{
uiSubColBitPos++;
}
// Done putting the string into 4 sections - build the COLLATED KEY
// Don't set uiUppercaseFlag earlier than here because F_SC_LOWER
// may be zero
uiUppercaseFlag = (uiLanguage == FLM_GR_LANG)
? F_SC_LOWER
: F_SC_UPPER;
// Did we write anything to the subcollation area?
// The default terminating characters is (F_COLL_MARKER | F_SC_UPPER)
if (uiFlags & F_HAD_SUB_COLLATION)
{
// Writes out a 0x7
pucCollatedStr[ uiColLen++] = F_COLL_MARKER | F_SC_SUB_COL;
// Move the sub-collation into the collating string
uiLength = bytesInBits( uiSubColBitPos);
f_memcpy( &pucCollatedStr[ uiColLen], ucSubColBuf, uiLength);
uiColLen += uiLength;
}
// Move the upper/lower case stuff - force bits for Greek ONLY
// This is such a small size that a memcpy is not worth it
if( uiFlags & F_HAD_LOWER_CASE)
{
FLMUINT uiNumBytes = bytesInBits( uiCaseBitPos);
FLMBYTE * pucCasePtr = ucCaseBits;
// Output the 0x5
pucCollatedStr[ uiColLen++] = (FLMBYTE)(F_COLL_MARKER | F_SC_MIXED);
// The reported case length includes the marker byte.
if( puiCaseLen)
{
*puiCaseLen = uiNumBytes + 1;
}
if( uiUppercaseFlag == F_SC_LOWER)
{
// Negate case bits for languages (like GREEK) that sort
// upper case before lower case.
while( uiNumBytes--)
{
pucCollatedStr[ uiColLen++] = ~(*pucCasePtr++);
}
}
else
{
while( uiNumBytes--)
{
pucCollatedStr[ uiColLen++] = *pucCasePtr++;
}
}
}
else
{
// All characters are either upper or lower case, as determined
// by uiUppercaseFlag.
pucCollatedStr[ uiColLen++] = (FLMBYTE)(F_COLL_MARKER | uiUppercaseFlag);
if( puiCaseLen)
{
*puiCaseLen = 1;
}
}
Exit:
// These two outputs are written on every path, including error returns.
if( pbDataTruncated)
{
*pbDataTruncated = bDataTruncated;
}
*puiCollatedStrLen = uiColLen;
return( rc);
}
/*****************************************************************************
Desc:		Convert a collated string to a WP word string
In:		*puiWPStrLen holds the size in bytes of the pucWPStr buffer.
Out:		pucWPStr receives the WP characters (two bytes each) followed by
			a two-byte zero terminator.  On success *puiWPStrLen is set to
			the length of the WP string in bytes, and *puiUnconvChars (when
			the pointer is non-NULL) to the number of collation values that
			had no WP equivalent and were emitted as 0xFFFF.
			*pbFirstSubstring / *pbDataTruncated are set to TRUE when the
			corresponding marker byte is found; they are never set to FALSE
			here, so the caller initializes them.
Ret:		NE_FLM_CONV_DEST_OVERFLOW if pucWPStr is too small.
			NE_FLM_DATA_ERROR if the collated string is malformed (a double
			character is missing its second byte, the sub-collation section
			runs past the end, or parsing does not end exactly at
			uiColStrLen).
			Any error returned by flmWPCmbSubColBuf.
*****************************************************************************/
RCODE FLMAPI f_colStr2WPStr(
	const FLMBYTE *	pucColStr,			// Points to the collated string
	FLMUINT				uiColStrLen,		// Length of the collated string
	FLMBYTE *			pucWPStr,			// Output string to build - WP word string
	FLMUINT *			puiWPStrLen,
	FLMUINT				uiLang,
	FLMUINT *			puiUnconvChars,
	FLMBOOL *			pbDataTruncated,	// Set to TRUE if truncated
	FLMBOOL *			pbFirstSubstring)	// Sets to TRUE if first substring
{
	FLMBYTE *	pucWPPtr = pucWPStr;					// Points to the word string data area
	FLMBYTE *	pucWPEnd = &pucWPPtr[ *puiWPStrLen];
	FLMUINT		uiMaxWPBytes = *puiWPStrLen;
	FLMUINT		uiLength = uiColStrLen;				// Bytes of pucColStr not yet consumed
	FLMUINT		uiPos = 0;								// Position in pucColStr
	FLMUINT		uiBitPos;								// Computed bit position
	FLMUINT		uiColChar;								// Not portable if a FLMBYTE value
	FLMUINT		uiWPStrLen;
	FLMUINT		uiUnconvChars = 0;
	FLMBOOL		bHebrewArabic = FALSE;
	RCODE			rc = NE_FLM_OK;

	// WARNING:
	// The code is duplicated for performance reasons.
	// The US code below is much more optimized so
	// any changes must be done twice.

	if( uiLang == FLM_US_LANG)
	{
		while( uiLength && (pucColStr[ uiPos] > F_MAX_COL_OPCODE))
		{
			uiLength--;

			// Move in the WP value given uppercase collated value

			uiColChar = (FLMUINT)pucColStr[ uiPos++];
			if( uiColChar == COLS0)
			{
				uiColChar = (FLMUINT)0xFFFF;
				uiUnconvChars++;
			}
			else
			{
				uiColChar = (FLMUINT)colToWPChr[ uiColChar - COLLS];
			}

			// Put the WP char in the word string

			if( pucWPPtr + 2 >= pucWPEnd)
			{
				rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
				goto Exit;
			}

			UW2FBA( (FLMUINT16)uiColChar, pucWPPtr);
			pucWPPtr += 2;
		}
	}
	else // Non-US collation
	{
		if( (uiLang == FLM_AR_LANG ) ||		// Arabic
			 (uiLang == FLM_FA_LANG ) ||		// Farsi - Persian
			 (uiLang == FLM_HE_LANG ) ||		// Hebrew
			 (uiLang == FLM_UR_LANG))			// Urdu
		{
			bHebrewArabic = TRUE;
		}

		while( uiLength && (pucColStr[ uiPos] > F_MAX_COL_OPCODE))
		{
			uiLength--;
			uiColChar = (FLMUINT)pucColStr[ uiPos++];

			switch( uiColChar)
			{
				case COLS9+4:		// ch in spanish
				case COLS9+11:		// ch in czech
				{
					// Put the WP char in the word string

					if( pucWPPtr + 2 >= pucWPEnd)
					{
						rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
						goto Exit;
					}

					UW2FBA( (FLMUINT16) 'C', pucWPPtr);
					pucWPPtr += 2;
					uiColChar = (FLMUINT)'H';

					// Move past second duplicate char.  uiLength has to move
					// together with uiPos, otherwise the "uiLength &&" guards
					// further down could look past the end of pucColStr.

					if( !uiLength)
					{
						rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
						goto Exit;
					}

					uiLength--;
					uiPos++;
					break;
				}

				case COLS9+17:		// ll in spanish
				{
					// Put the WP char in the word string

					if( pucWPPtr + 2 >= pucWPEnd)
					{
						rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
						goto Exit;
					}

					UW2FBA( (FLMUINT16)'L', pucWPPtr);
					pucWPPtr += 2;
					uiColChar = (FLMUINT)'L';

					// Move past duplicate character, keeping uiLength in step
					// with uiPos (see the "ch" case above).

					if( !uiLength)
					{
						rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
						goto Exit;
					}

					uiLength--;
					uiPos++;
					break;
				}

				case COLS0:			// Non-collating character or OEM character
				{
					// Actual character is in sub-collation area

					uiColChar = (FLMUINT)0xFFFF;
					uiUnconvChars++;
					break;
				}

				default:
				{
					// Watch out COLS10h has () around it for subtraction

					if( bHebrewArabic && (uiColChar >= COLS10h))
					{
						uiColChar = (uiColChar < COLS10a)	// Hebrew only?
							? (FLMUINT) (0x900 + (uiColChar - (COLS10h)))	// Hebrew
							: (FLMUINT) (HebArabColToWPChr[ uiColChar - (COLS10a)]);	// Arabic
					}
					else
					{
						uiColChar = (FLMUINT)colToWPChr[ uiColChar - COLLS];
					}
					break;
				}
			}

			// Put the WP char in the word string

			if( pucWPPtr + 2 >= pucWPEnd)
			{
				rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
				goto Exit;
			}

			UW2FBA( (FLMUINT16)uiColChar, pucWPPtr);
			pucWPPtr += 2;
		}
	}

	// Terminate the string

	if( pucWPPtr + 2 >= pucWPEnd)
	{
		rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
		goto Exit;
	}

	UW2FBA( (FLMUINT16)0, pucWPPtr);

	// Every collation byte consumed so far produced one two-byte WP char

	uiWPStrLen = uiPos + uiPos;	// Multiply by 2

	// Parse through the sub-collation and case information.
	// Here are values for some of the codes:
	//   [ 0x04] - case information is all uppercase (IS,DK,GR)
	//   [ 0x05] - case bits follow
	//   [ 0x06] - case information is all uppercase
	//   [ 0x07] - beginning of sub-collation information
	//   [ 0x08] - first substring field that is made
	//   [ 0x09] - truncation marker for text and binary
	//
	// Below are some cases to consider...
	//
	// [ COLLATION][ 0x07 sub-collation][ 0x05 case info]
	// [ COLLATION][ 0x07 sub-collation]
	// [ COLLATION][ 0x05 case info]
	// [ COLLATION]
	//
	// In the future still want[ 0x06] to be compressed out for uppercase
	// only indexes.

	// Check first substring before truncated

	if( uiLength && pucColStr[ uiPos] == F_COLL_FIRST_SUBSTRING)
	{
		if( pbFirstSubstring)
		{
			*pbFirstSubstring = TRUE;	// Don't need to initialize to FALSE.
		}

		uiLength--;
		uiPos++;
	}

	// Is the key truncated?

	if( uiLength && pucColStr[ uiPos] == F_COLL_TRUNCATED)
	{
		if( pbDataTruncated)
		{
			*pbDataTruncated = TRUE;	// Don't need to initialize to FALSE.
		}

		uiLength--;
		uiPos++;
	}

	// Does sub-collation follow?
	// Still more to process - first work on the sub-collation (diacritics)
	// Hebrew/Arabic may have empty collation area

	if( uiLength && (pucColStr[ uiPos] == (F_COLL_MARKER | F_SC_SUB_COL)))
	{
		FLMUINT	uiTempLen;

		// Do another pass on the word string adding the diacritics

		if( RC_BAD( rc = flmWPCmbSubColBuf( pucWPStr, &uiWPStrLen, uiMaxWPBytes,
			&pucColStr[ ++uiPos], bHebrewArabic, &uiBitPos)))
		{
			goto Exit;
		}

		// Move pos to next byte value

		uiTempLen = bytesInBits( uiBitPos);
		uiPos += uiTempLen;

		// The 1 includes the 0x07 byte.  Refuse to let the unsigned
		// subtraction wrap if the sub-collation claims more bytes than
		// the collated string has left.

		if( uiTempLen + 1 > uiLength)
		{
			rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
			goto Exit;
		}

		uiLength -= uiTempLen + 1;
	}

	// Does the case info follow?

	if( uiLength && (pucColStr[ uiPos] >= 0x04))
	{
		// Take care of the lower and upper case conversion
		// If mixed case then convert using case bits

		if( pucColStr[ uiPos++] & F_SC_MIXED)	// Increment pos here!
		{
			// Don't pre-increment pos on line below!

			uiPos += f_wpToMixed( pucWPStr, uiWPStrLen,
				&pucColStr[ uiPos], uiLang);
		}

		// else 0x04 or 0x06 - all characters already in uppercase
	}

	// Should end perfectly at the end of the collation buffer.

	if (uiPos != uiColStrLen)
	{
		rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
		goto Exit;
	}

	*puiWPStrLen = uiWPStrLen;

	// Optional output, treated like the other optional out-parameters

	if( puiUnconvChars)
	{
		*puiUnconvChars = uiUnconvChars;
	}

Exit:

	return( rc);
}
/****************************************************************************
Desc: Convert a UTF-8 text stream to an Asian collated string.

Characters are read from pIStream with f_readUTF8CharAsUnicode, mapped
to WP characters with f_unicodeToWP, and collated with
flmWPAsiaGetCollation. Characters that have no WP equivalent are
collated by their Unicode value (or, below 0x20, stored in the
sub-collation area).

Output layout written to pucColStr, in this order:
1. Two bytes (high, low) for every collation value.
2. 0, F_COLL_FIRST_SUBSTRING - only if bFirstSubstring is set.
3. 0, F_COLL_TRUNCATED - only if the data is/was truncated.
4. 0, F_COLL_MARKER | F_SC_SUB_COL, followed by the sub-collation
bytes - only if some character had a sub-collation value.
5. 0, F_COLL_MARKER | F_SC_MIXED, followed by the case bytes
(two bits per collation value).
If no collation value and no marker was produced, nothing beyond
step 1 is written and *puiCaseLen is set to 0.

Sub-collation bits, one group per collation value:
0 - no sub-collation value
10 - followed by a 5 bit value (values 1..31)
110 - pad to byte boundary, then a 2 byte value (high, low)
11110 - pad to byte boundary, then the 2 byte Unicode value of a
character that could not be converted to WP

Ret: NE_FLM_OK, or the error returned by f_readUTF8CharAsUnicode
(NE_FLM_EOF_HIT is treated as end of input, not as an error).
*puiColStrLen and *pbDataTruncated are set on every exit path.
****************************************************************************/
RCODE FLMAPI f_asiaUTF8ToColText(
IF_PosIStream * pIStream, // Source of the UTF-8 text
FLMBYTE * pucColStr, // Output collated string
FLMUINT * puiColStrLen, // Collated string length return value
// Input value is MAX num of bytes in buffer
FLMBOOL bCaseInsensitive, // Set if to convert to uppercase
// (passed through to flmWPAsiaGetCollation)
FLMUINT * puiCollationLen, // Returns the collation bytes length
// (length before any marker is appended) - may be NULL
FLMUINT * puiCaseLen, // Returns length of case bytes
// (case bytes plus the 2 marker bytes) - may be NULL
FLMUINT uiCharLimit, // Max number of characters in this key piece
// NOTE(review): compared against uiColLen below, which
// grows by 2 bytes per character - confirm the unit.
FLMBOOL bFirstSubstring, // TRUE if this is the first substring key
FLMBOOL bDataTruncated, // Was input data already truncated.
FLMBOOL * pbDataTruncated) // Returns final truncated state - may be NULL
{
RCODE rc = NE_FLM_OK;
FLMBOOL bEndOfStr = FALSE;
FLMUINT uiLength;
// NOTE(review): if *puiColStrLen is less than 12 this subtraction
// presumably wraps (FLMUINT looks unsigned) and the clamp below then
// yields 244 - confirm callers always supply at least 12 bytes.
FLMUINT uiTargetColLen = *puiColStrLen - 12; // 6=ovhd,6=worst char
FLMBYTE ucSubColBuf[ MAX_SUBCOL_BUF + 1]; // Holds Sub-col values (diac)
FLMBYTE ucLowUpBuf[ MAX_CASE_BYTES + MAX_CASE_BYTES + 2]; // 2 case bits/wpchar
FLMUINT uiColLen; // Bytes written to pucColStr so far
FLMUINT uiSubColBitPos; // Next free bit in ucSubColBuf
FLMUINT uiLowUpBitPos; // Next free bit in ucLowUpBuf
FLMUINT uiFlags;
FLMUNICODE uChar; // Non-zero while holding an unconverted Unicode char
FLMUINT16 ui16NextWpChar; // One character of look-ahead
FLMUINT16 ui16ColValue; // Last collation value (also fed back as "previous")
uiColLen = uiSubColBitPos = uiLowUpBitPos = uiFlags = 0;
uChar = ui16ColValue = 0;
// We don't want any single key piece to "pig out" more
// than 256 bytes of the key
if( uiTargetColLen > 256 - 12)
{
uiTargetColLen = 256 - 12;
}
// Make sure ucSubColBuf and ucLowUpBuf are set to 0
// (the encoding below only ever sets bits; zero bits are implied)
f_memset( ucSubColBuf, 0, sizeof( ucSubColBuf));
f_memset( ucLowUpBuf, 0, sizeof( ucLowUpBuf));
ui16NextWpChar = 0;
// Keep going while there is unread input, a pending look-ahead
// character, or a pending unconverted Unicode character.
while( !bEndOfStr || ui16NextWpChar || uChar)
{
FLMUINT16 ui16WpChar; // Current WP character
FLMUINT16 ui16SubColVal; // Sub-collated value (diacritic)
FLMBYTE ucCaseFlags;
FLMUINT16 ui16CurWpChar;
// Get the next character from the string.
// The previous look-ahead becomes the current character; then read
// until both the current and the look-ahead slot are filled, an
// unconverted character is pending in uChar, or the stream ends.
ui16WpChar = ui16NextWpChar;
for( ui16NextWpChar = 0;
(!ui16WpChar || !ui16NextWpChar) &&
!uChar && !bEndOfStr;)
{
if (!bEndOfStr)
{
if( RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
{
if (rc == NE_FLM_EOF_HIT)
{
// End of input is not an error
rc = NE_FLM_OK;
bEndOfStr = TRUE;
}
else
{
goto Exit;
}
}
}
else
{
// Not reachable: the loop condition above already
// requires !bEndOfStr when the body is entered.
uChar = 0;
}
// A TRUE return clears uChar; when uChar is still non-zero
// afterwards the character is handled as "unconverted" below.
if( f_unicodeToWP( uChar, &ui16CurWpChar))
{
uChar = 0;
}
if( !ui16WpChar)
{
ui16WpChar = ui16CurWpChar;
}
else
{
ui16NextWpChar = ui16CurWpChar;
}
}
// If we didn't get a character, break out of the outer
// processing loop.
if( !ui16WpChar && !uChar)
{
break;
}
if( ui16WpChar)
{
// The previous collation value is passed in and the new one is
// returned through the same variable.
if( flmWPAsiaGetCollation( ui16WpChar, ui16NextWpChar, ui16ColValue,
&ui16ColValue, &ui16SubColVal, &ucCaseFlags, bCaseInsensitive) == 2)
{
// Took the ui16NextWpChar value
// Force to skip this value
ui16NextWpChar = 0;
}
}
else // Use the uChar value for this pass
{
// This handles all of the UNICODE characters that could not
// be converted to WP characters - which will include most
// of the Asian characters.
ucCaseFlags = 0;
if( uChar < 0x20)
{
ui16ColValue = 0xFFFF;
// Setting ui16SubColVal to a value above 31 ensures that
// the 2-byte branch below is taken, so that the uChar
// value itself is stored in the sub-collation area.
ui16SubColVal = 0xFFFF;
// NOTE: uChar SHOULD NOT be set to zero here.
// It will be set to zero below.
}
else
{
// The Unicode value itself is used as the collation value
ui16ColValue = uChar;
ui16SubColVal = 0;
uChar = 0;
}
}
// Store the values in 2 bytes
pucColStr[ uiColLen++] = (FLMBYTE)(ui16ColValue >> 8);
pucColStr[ uiColLen++] = (FLMBYTE)(ui16ColValue & 0xFF);
if( ui16SubColVal)
{
uiFlags |= F_HAD_SUB_COLLATION;
if( ui16SubColVal <= 31) // 5 bit - store bits 10
{
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos += 1 + 1; // Stores a zero
setBits( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal);
uiSubColBitPos += 5;
}
else // 2 bytes - store bits 110 or 11110
{
FLMUINT uiTemp;
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
if( !ui16WpChar && uChar) // Store as "11110"
{
// Unconverted character below 0x20: the value stored
// is the Unicode character, which is consumed here.
ui16SubColVal = uChar;
uChar = 0;
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
setBit( ucSubColBuf, uiSubColBitPos);
uiSubColBitPos++;
}
uiSubColBitPos++; // Skip past the zero
// Go to the next byte boundary to write the WP char
uiSubColBitPos = (uiSubColBitPos + 7) & (~7);
uiTemp = bytesInBits( uiSubColBitPos);
// Need to store HIGH-Low - PC format is Low-high!
ucSubColBuf[ uiTemp ] = (FLMBYTE)(ui16SubColVal >> 8);
ucSubColBuf[ uiTemp + 1] = (FLMBYTE)(ui16SubColVal);
uiSubColBitPos += 16;
}
}
else
{
// No sub-collation: a single zero bit (buffer is pre-zeroed)
uiSubColBitPos++;
}
// Save case information - always 2 bits worth for Asian
// First bit mirrors ucCaseFlags & 0x02, second bit ucCaseFlags & 0x01.
if( ucCaseFlags & 0x02)
{
setBit( ucLowUpBuf, uiLowUpBitPos);
}
uiLowUpBitPos++;
if( ucCaseFlags & 0x01)
{
setBit( ucLowUpBuf, uiLowUpBitPos);
}
uiLowUpBitPos++;
// Check to see if uiColLen is within 1 byte of max
// (either the caller's limit or the combined size of all
// three sections reaching the target length)
if( (uiColLen >= uiCharLimit) ||
(uiColLen + bytesInBits( uiSubColBitPos) +
bytesInBits( uiLowUpBitPos) >= uiTargetColLen))
{
// Still something left?
if (ui16NextWpChar || uChar)
{
bDataTruncated = TRUE;
}
else if (!bEndOfStr)
{
// Nothing buffered - probe the stream for one more
// character to decide whether data is being cut off.
if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
{
if (rc == NE_FLM_EOF_HIT)
{
bEndOfStr = TRUE;
rc = NE_FLM_OK;
}
else
{
goto Exit;
}
}
else
{
bDataTruncated = TRUE;
}
}
break; // Hit the max. number of characters
}
}
// Report the length of the pure collation section, before markers
if( puiCollationLen)
{
*puiCollationLen = uiColLen;
}
// Add the first substring marker - also serves
// as making the string non-null.
if( bFirstSubstring)
{
pucColStr[ uiColLen++] = 0;
pucColStr[ uiColLen++] = F_COLL_FIRST_SUBSTRING;
}
if( bDataTruncated)
{
pucColStr[ uiColLen++] = 0;
pucColStr[ uiColLen++] = F_COLL_TRUNCATED;
}
// Return NOTHING if no values found
if( !uiColLen && !uiSubColBitPos)
{
if( puiCaseLen)
{
*puiCaseLen = 0;
}
goto Exit;
}
// Done putting the String into 3 sections - build the COLLATED KEY
if( uiFlags & F_HAD_SUB_COLLATION)
{
pucColStr[ uiColLen++] = 0;
pucColStr[ uiColLen++] = F_COLL_MARKER | F_SC_SUB_COL;
// Move the Sub-collation (diacritics) into the collating string
uiLength = (FLMUINT)(bytesInBits( uiSubColBitPos));
f_memcpy( &pucColStr[ uiColLen], ucSubColBuf, uiLength);
uiColLen += uiLength;
}
// Always represent the marker as 2 bytes and case bits in Asia
pucColStr[ uiColLen++] = 0;
pucColStr[ uiColLen++] = F_COLL_MARKER | F_SC_MIXED;
uiLength = (FLMUINT)(bytesInBits( uiLowUpBitPos));
f_memcpy( &pucColStr[ uiColLen ], ucLowUpBuf, uiLength);
if( puiCaseLen)
{
// Case length includes the two marker bytes
*puiCaseLen = (FLMUINT)(uiLength + 2);
}
uiColLen += uiLength;
Exit:
// These outputs are set on error exits as well.
if( pbDataTruncated)
{
*pbDataTruncated = bDataTruncated;
}
*puiColStrLen = uiColLen;
return( rc);
}
/****************************************************************************
Desc:	Apply the sub-collation section of an Asian collation key to an
		existing WP word string (2 bytes per character), restoring
		diacritics, kana voicings, extended characters and unconverted
		Unicode characters.
Ret:	NE_FLM_OK on success.
		NE_FLM_CONV_DEST_OVERFLOW if a character has to be inserted and
		*puiWPStrLen + 2 would exceed uiMaxWPBytes.
Parms:	pucWPStr        - WP string to modify in place.
		puiWPStrLen     - In: length of the WP string in bytes.
		                  Out: grown by 2 for every "11110" entry.
		uiMaxWPBytes    - Capacity limit used for the overflow check.
		pucSubColBuf    - Start of the sub-collation bits.
		puiSubColBitPos - Despite its name, receives the number of BYTES
		                  of sub-collation data consumed (bytesInBits of
		                  the final bit position).  Only set on success.
Notes:	For each non-zero character in the WP string, the sub-collation
		section holds one of:
			0     - no subcollation information
			10    - take next 5 bits - will tell about diacritics or
			        japanese vowel
			110   - align to next byte & take word value as extended
			        character
			11110 - align to next byte & take word value as an unconverted
			        Unicode character; a 2-byte slot is opened in front
			        of it
		Zero characters in the WP string are skipped without consuming
		any bits.
****************************************************************************/
RCODE FLMAPI f_asiaParseSubCol(
	FLMBYTE *			pucWPStr,
	FLMUINT *			puiWPStrLen,
	FLMUINT				uiMaxWPBytes,
	const FLMBYTE *	pucSubColBuf,
	FLMUINT *			puiSubColBitPos)
{
	RCODE			rc = NE_FLM_OK;
	FLMUINT		uiSubColBitPos = 0;
	FLMUINT		uiNumChars = *puiWPStrLen >> 1;
	FLMUINT16	ui16Diac;
	FLMUINT16	ui16WpChar;

	// For each character (16 bits) in the WP string ...

	while( uiNumChars--)
	{
		// Have to skip 0, because it is not accounted for
		// in the sub-collation bits.  It was inserted when we
		// encountered unconverted unicode characters (Asian).
		// Will be converted to something else later on.

		if( FB2UW( pucWPStr) == 0)
		{
			pucWPStr += 2;
			continue;
		}

		// This macro DOESN'T increment uiBitPos

		if( testOneBit( pucSubColBuf, uiSubColBitPos))
		{
			// Bits 10 - take next 5 bits
			// Bits 110 align and take next word
			// Bits 11110 align and take unicode value

			uiSubColBitPos++;
			if( !testOneBit( pucSubColBuf, uiSubColBitPos))
			{
				uiSubColBitPos++;
				ui16Diac = (FLMUINT16)(getNBits( 5, pucSubColBuf, uiSubColBitPos));
				uiSubColBitPos += 5;

				if( (ui16WpChar = FB2UW( pucWPStr)) < 0x100)
				{
					if( (ui16WpChar >= 'A') && (ui16WpChar <= 'Z'))
					{
						// Convert to WP diacritic and combine characters

						f_combineWPChar( &ui16WpChar, ui16WpChar,
							(FLMUINT16)ml1_COLtoD[ ui16Diac]);

						// Even if cmbcar fails, WpChar is still set to a valid value
					}
					else if( ui16Diac)
					{
						// Symbols from charset 0x24.  The table is indexed by
						// (value - 1), so a stored value of zero must not be
						// used - it would index the element before the table.

						ui16WpChar = (FLMUINT16)(0x2400 +
							fwp_Ch24ColTbl[ ui16Diac - 1 ].ByteValue);
					}
					// else, leave alone! - invalid storage
				}
				else if( ui16WpChar >= 0x2600)	// Katakana
				{
					// Voicings - will allow to select original char
					//		000 - some 001 are changed to 000 to save space
					//		001 - set if large char (uppercase)
					//		010 - set if voiced
					//		100 - set if half voiced
					//
					// Should NOT match voicing or wouldn't be here!

					FLMBYTE	ucChar = (FLMBYTE)(ui16WpChar & 0xFF);

					// Try exceptions first so don't access out of bounds

					if( ucChar == 84)
					{
						ui16WpChar = (FLMUINT16)(0x2600 +
												((ui16Diac == 1)
												 ? (FLMUINT16)10
												 : (FLMUINT16)11));
					}
					else if( ucChar == 85)
					{
						ui16WpChar = (FLMUINT16)(0x2600 +
												((ui16Diac == 1)
												 ? (FLMUINT16)16
												 : (FLMUINT16)17));
					}

					// Try the next 2 slots, if not then
					// value is 83, 84 or 85

					else if( KanaSubColTbl[ ucChar + 1 ] == ui16Diac)
					{
						ui16WpChar++;
					}
					else if( KanaSubColTbl[ ucChar + 2 ] == ui16Diac)
					{
						ui16WpChar += 2;
					}
					else if( ucChar == 4)	// Last exception
					{
						ui16WpChar = 0x2600 + 83;
					}

					// else, leave alone! - invalid storage
				}

				UW2FBA( ui16WpChar, pucWPStr);	// Set if changed or not
			}
			else		// "110"
			{
				FLMUINT	uiTemp;

				uiSubColBitPos++;	// Skip second '1'
				if( testOneBit( pucSubColBuf, uiSubColBitPos))	// 11?10 ?
				{
					if( (*puiWPStrLen) + 2 > uiMaxWPBytes)
					{
						rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
						goto Exit;
					}

					// Unconvertable UNICODE character
					// The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode

					shiftN( pucWPStr,
						(FLMUINT16)(uiNumChars + uiNumChars + 4), 2);

					pucWPStr += 2;				// Skip the 0xFFFF for now
					uiSubColBitPos += 2;		// Skip next "11"
					(*puiWPStrLen) += 2;
				}

				uiSubColBitPos++;	// Skip the zero

				// Round up to next byte

				uiSubColBitPos = (uiSubColBitPos + 7) & (~7);
				uiTemp = bytesInBits( uiSubColBitPos);

				// The key stores the value high byte first; the WP string
				// stores it low byte first.

				pucWPStr[ 1] = pucSubColBuf[ uiTemp];			// Character set
				pucWPStr[ 0] = pucSubColBuf[ uiTemp + 1];	// Character
				uiSubColBitPos += 16;
			}
		}
		else
		{
			uiSubColBitPos++;	// Be sure to increment this!
		}

		pucWPStr += 2;	// Next WP character
	}

	// Reported in bytes so the caller can advance its byte position.

	*puiSubColBitPos = bytesInBits( uiSubColBitPos);

Exit:

	return( rc);
}
/****************************************************************************
Desc: Apply the case bits of an Asian collation key to a WP word string,
restoring width (single/double wide), case and kana type.
The case bits for asia are:
Latin/Greek/Cyrillic
01 - case bit set if character is uppercase
10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx
Japanese
00 - double wide hiragana 0x255e..25b0
01 - double wide katakana 0x2600..2655
10 - single wide symbols from charset 11 that map to CS24??
11 - single wide katakana from charset 11
Ret: NE_FLM_OK, or NE_FLM_CONV_DEST_OVERFLOW when a character expands
into two and *puiWPStrLen + 2 would exceed uiMaxWPBytes.
Parms: pucWPStr - WP string (2 bytes per char), modified in place.
puiWPStrLen - In: string length in bytes. Out: grown by 2 for
each character that expanded into two.
uiMaxWPBytes - Capacity limit used for the overflow check.
pucCaseBits - Case bits, consumed most significant bit first,
two bits per character.
puiColBytesProcessed - Receives the number of case bytes consumed.
Only set on success.
Notes: This is tricky to really understand the inputs.
This looks at the bits according to the current character value.
Characters equal to 0 or 0xFFFF consume no case bits; both are
written back as 0xFFFF.
****************************************************************************/
FSTATIC RCODE flmAsiaParseCase(
FLMBYTE * pucWPStr,
FLMUINT * puiWPStrLen,
FLMUINT uiMaxWPBytes,
const FLMBYTE * pucCaseBits,
FLMUINT * puiColBytesProcessed)
{
RCODE rc = NE_FLM_OK;
FLMUINT uiWPStrLen = *puiWPStrLen; // Byte length on entry
FLMUINT uiCharCnt;
FLMUINT uiExtraBytes = 0; // Bytes of chars that carry no case bits
FLMUINT16 ui16WpChar;
FLMBYTE ucTempByte = 0; // Current byte of case bits
FLMBYTE ucMaskByte; // Bit being examined; 0 = fetch next byte
// For each character (two bytes) in the string ...
for( uiCharCnt = uiWPStrLen >> 1, ucMaskByte = 0; uiCharCnt--;)
{
FLMBYTE ucChar;
FLMBYTE ucCharSet;
ui16WpChar = FB2UW( pucWPStr); // Get the next character
// Must skip any 0xFFFFs or zeroes that were inserted.
if( ui16WpChar == 0xFFFF || ui16WpChar == 0)
{
// Put back 0xFFFF in case it was a zero.
UW2FBA( 0xFFFF, pucWPStr);
pucWPStr += 2;
uiExtraBytes += 2;
continue;
}
// Time to get another byte?
// (the mask shifts right twice per character, so a new byte is
// needed after every four characters)
if( ucMaskByte == 0)
{
ucTempByte = *pucCaseBits++;
ucMaskByte = 0x80;
}
ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
ucChar = (FLMBYTE)(ui16WpChar & 0xFF);
// SINGLE WIDE - NORMAL CHARACTERS
if( ui16WpChar < 0x2400)
{
// Convert to double wide?
// (first of the two case bits)
if( ucTempByte & ucMaskByte)
{
// Latin/greek/cyrillic
// Convert to uppercase double wide char
if( ucCharSet == 0) // Latin - uppercase
{
// May convert to 0x250F (Latin) or CS24
if( ui16WpChar >= ASCII_UPPER_A && ui16WpChar <= ASCII_UPPER_Z)
{
// Convert to double wide
ui16WpChar = (FLMUINT16)(ui16WpChar - 0x30 + 0x250F);
}
else
{
f_wpHanToZenkaku( ui16WpChar, 0, &ui16WpChar);
}
}
else if( ucCharSet == 8) // Greek
{
if( ucChar > 38) // Adjust for spaces in Greek
{
ucChar -= 2;
}
if( ucChar > 4)
{
ucChar -= 2;
}
ui16WpChar = (FLMUINT16)((ucChar >> 1) + 0x265E);
}
else if( ucCharSet == 10) // Cyrillic
{
ui16WpChar = (FLMUINT16)((ucChar >> 1) + 0x2700);
}
else
{
f_wpHanToZenkaku( ui16WpChar, 0, &ui16WpChar);
}
// Re-split, because the lower-case step below switches on
// the character set of the (possibly widened) character.
ucCharSet = (FLMBYTE)(ui16WpChar >> 8);
ucChar = (FLMBYTE)(ui16WpChar & 0xFF);
}
ucMaskByte >>= 1; // Next bit
// Change to lower case?
// (second case bit clear means the character is lower case)
if( (ucTempByte & ucMaskByte) == 0)
{
// Convert ui16WpChar to lower case
switch( ucCharSet)
{
case 0:
{
// Bit zero only if lower case
ui16WpChar |= 0x20;
break;
}
case 1:
{
// In upper/lower case region?
if( ucChar >= 26)
{
ui16WpChar++;
}
break;
}
case 8:
{
// All lowercase after 69
if( ucChar <= 69)
{
ui16WpChar++;
}
break;
}
case 10:
{
// No cases after 199
if( ucChar <= 199)
{
ui16WpChar++;
}
break;
}
case 0x25:
case 0x26:
{
// Should be double wide latin or Greek
// Add offset to convert to lowercase
ui16WpChar += 0x20;
break;
}
case 0x27:
{
// Double wide cyrillic only
// Add offset to convert to lowercase
ui16WpChar += 0x30;
break;
}
}
}
}
else // JAPANESE CHARACTERS
{
if( ucTempByte & ucMaskByte) // Original chars from CharSet 11
{
if( ucCharSet == 0x26) // Convert to Zen to Hankaku
{
FLMUINT16 ui16NextChar = 0;
ui16WpChar = f_wpZenToHankaku( ui16WpChar, &ui16NextChar);
if( ui16NextChar) // Move everyone down
{
// One double wide character became two single wide
// characters - make room for the second one.
if( (*puiWPStrLen) + 2 > uiMaxWPBytes)
{
rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
goto Exit;
}
// NOTE(review): uiCharCnt is incremented so that the
// shiftN length covers the current character, the
// remaining characters and 2 more bytes (presumably the
// terminator). Both halves are stored during this
// iteration, so the loop appears to run one pass more
// than there are characters left - confirm intended.
uiCharCnt++;
shiftN( pucWPStr, uiCharCnt + uiCharCnt + 2, 2);
UW2FBA( ui16WpChar, pucWPStr);
pucWPStr += 2;
ui16WpChar = ui16NextChar; // This will be stored below
// Adjust the length
*puiWPStrLen = *puiWPStrLen + 2;
}
}
else if( ucCharSet == 0x24)
{
ui16WpChar = f_wpZenToHankaku( ui16WpChar, NULL);
}
ucMaskByte >>= 1; // Eat the next bit
}
else
{
ucMaskByte >>= 1; // Next bit
if( (ucTempByte & ucMaskByte) == 0) // Convert to Hiragana?
{
// Kanji will also fall through here
if( ucCharSet == 0x26)
{
// Convert to Hiragana
ui16WpChar = (FLMUINT16)(0x255E + ucChar);
}
}
}
}
UW2FBA( ui16WpChar, pucWPStr);
pucWPStr += 2;
// Step past the second bit of this character's pair; the first
// step was taken in whichever branch ran above.
ucMaskByte >>= 1;
}
// Two bits per two-byte character, so the number of bits used equals
// the entry byte length minus the bytes of the skipped characters.
uiCharCnt = uiWPStrLen - uiExtraBytes; // Should be 2 bits for each character.
*puiColBytesProcessed = bytesInBits( uiCharCnt);
Exit:
return( rc);
}
/***************************************************************************
Desc:	Get the original string from an asian collation string.

		The key is a sequence of 2-byte (high, low) collation values,
		optionally followed by 2-byte markers (first substring, truncated),
		a sub-collation section and a case section.  The collation values
		are first mapped back to a WP word string (2 bytes per character),
		then the sub-collation and case sections are applied to it.
Ret:	NE_FLM_OK on success; outputs are only set on success.
		NE_FLM_CONV_DEST_OVERFLOW if the WP string does not fit.
		NE_FLM_DATA_ERROR if the key is malformed (it ends in the middle
		of a 2-byte value, a section claims more bytes than remain, or
		the sections do not end exactly at uiColStrLen).
Notes:	*pbDataTruncated and *pbFirstSubstring are only ever set to TRUE;
		the caller initializes them.
		NOTE(review): for character set 0 the value is looked up as
		colToWPChr[ ucChar - COLLS] without a range check - confirm that
		F_MAX_COL_OPCODE guarantees ucChar >= COLLS.
****************************************************************************/
RCODE FLMAPI f_asiaColStr2WPStr(
	const FLMBYTE *	pucColStr,			// Points to the collated string
	FLMUINT				uiColStrLen,		// Length of the collated string
	FLMBYTE *			pucWPStr,			// Output string to build - WP word string
	FLMUINT *			puiWPStrLen,		// In: size of pucWPStr in bytes
													// Out: length of the WP string in bytes
	FLMUINT *			puiUnconvChars,	// Returns number of unconverted characters
	FLMBOOL *			pbDataTruncated,	// Set to TRUE if truncated
	FLMBOOL *			pbFirstSubstring)	// Sets to TRUE if first substring
{
	FLMBYTE *	pucWPStrPtr = pucWPStr;
	FLMBYTE *	pucWPEnd = &pucWPStr[ *puiWPStrLen];
	FLMUINT		uiLength = uiColStrLen;
	FLMUINT		uiMaxWPBytes = *puiWPStrLen;
	FLMUINT		uiColStrPos = 0;
	FLMBOOL		bHadExtended = FALSE;
	FLMUINT		uiWPStrLen;
	FLMUINT16	ui16ColChar;
	FLMUINT		uiUnconvChars = 0;
	FLMUINT		uiColBytesProcessed;
	RCODE			rc = NE_FLM_OK;

	// Pass 1 - turn every collation value back into a WP character.
	// Stops at the first marker (value <= F_MAX_COL_OPCODE).

	while( uiLength)
	{
		FLMBYTE	ucChar;
		FLMBYTE	ucCharSet;

		// Collation values are always two bytes wide.  A single
		// left-over byte means the key is corrupt; reading a pair here
		// would run past the end of the key.

		if( uiLength < 2)
		{
			rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
			goto Exit;
		}

		ucChar = pucColStr[ uiColStrPos + 1];
		ucCharSet = pucColStr[ uiColStrPos];
		ui16ColChar = (FLMUINT16)((ucCharSet << 8) + ucChar);

		if( ui16ColChar <= F_MAX_COL_OPCODE)
		{
			break;
		}

		uiColStrPos += 2;
		uiLength -= 2;

		if( ucCharSet == 0)			// Normal Latin/Greek/Cyrillic value
		{
			ui16ColChar = colToWPChr[ ucChar - COLLS];
		}
		else if( ucCharSet == 1)	// Katakana or Hiragana character
		{
			// Valid table indexes are 0 .. sizeof - 1, so an index equal
			// to the table size must take the special-case path as well.

			if( ucChar >= sizeof( ColToKanaTbl))	// Special cases below
			{
				if( ucChar == COLS_ASIAN_MARK_VAL)				// Dakuten
				{
					ui16ColChar = 0x240a;
				}
				else if( ucChar == COLS_ASIAN_MARK_VAL + 1)	// Handakuten
				{
					ui16ColChar = 0x240b;
				}
				else if( ucChar == COLS_ASIAN_MARK_VAL + 2)	// Chuuten
				{
					ui16ColChar = 0x2405;
				}
				else
				{
					ui16ColChar = 0xFFFF;	// Error
				}
			}
			else
			{
				ui16ColChar = (FLMUINT16)(0x2600 + ColToKanaTbl[ ucChar]);
			}
		}
		else if( ucCharSet != 0xFF || ucChar != 0xFF)	// Asian characters
		{
			// Insert zeroes that will be treated as a signal for
			// uncoverted unicode characters later on.  NOTE: Cannot
			// use 0xFFFF, because we need to be able to detect this
			// case in the sub-collation stuff, and we don't want
			// to confuse it with the 0xFFFF that may have been inserted
			// in another case.
			// THIS IS A REALLY BAD HACK, BUT IT IS THE BEST WE CAN DO
			// FOR NOW!

			if( pucWPStrPtr + 2 >= pucWPEnd)
			{
				rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
				goto Exit;
			}

			*pucWPStrPtr++ = 0;
			*pucWPStrPtr++ = 0;
			uiUnconvChars++;
			bHadExtended = TRUE;
		}

		// else, there is no collation value - found in sub-collation part

		if( pucWPStrPtr + 2 >= pucWPEnd)
		{
			rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
			goto Exit;
		}

		UW2FBA( ui16ColChar, pucWPStrPtr);	// Put the uncollation value back
		pucWPStrPtr += 2;
	}

	if( pucWPStrPtr + 2 >= pucWPEnd)
	{
		rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
		goto Exit;
	}

	UW2FBA( 0, pucWPStrPtr);	// Terminate the string
	uiWPStrLen = (FLMUINT)(pucWPStrPtr - pucWPStr);

	// Parse through the sub-collation and case information.
	// Here are values for some of the codes:
	//		[ 0x05] - case bits follow
	//		[ 0x06] - case information is all uppercase
	//		[ 0x07] - beginning of sub-collation information
	//		[ 0x08] - first substring field that is made
	//		[ 0x09] - truncation marker for text and binary
	//
	// Asian chars the case information should always be there and not
	// compressed out.  This is because the case information could change
	// the actual width of the character from 0x26xx to charset 11.
	//
	// Whenever fewer than two bytes remain, ui16ColChar is set to zero
	// instead of reading past the end of the key.  Zero matches none of
	// the marker codes listed above.

	// Does truncation marker or sub-collation follow?
	// (the loop above only leaves a non-zero uiLength when it stopped on
	// a complete 2-byte marker, so this read is within the key)

	if( uiLength)
	{
		ui16ColChar = (FLMUINT16)((pucColStr[ uiColStrPos] << 8) +
								pucColStr[ uiColStrPos + 1]);

		// First substring is before truncated.

		if( ui16ColChar == F_COLL_FIRST_SUBSTRING)
		{
			if( pbFirstSubstring)
			{
				*pbFirstSubstring = TRUE;	// Don't need to initialize to FALSE.
			}

			uiLength -= 2;
			uiColStrPos += 2;
			ui16ColChar = (uiLength >= 2)
								? (FLMUINT16)((pucColStr[ uiColStrPos] << 8) +
										pucColStr[ uiColStrPos + 1])
								: (FLMUINT16)0;
		}

		if( ui16ColChar == F_COLL_TRUNCATED)
		{
			if( pbDataTruncated)
			{
				*pbDataTruncated = TRUE;	// Don't need to initialize to FALSE.
			}

			uiLength -= 2;
			uiColStrPos += 2;
			ui16ColChar = (uiLength >= 2)
								? (FLMUINT16)((pucColStr[ uiColStrPos] << 8) +
										pucColStr[ uiColStrPos + 1])
								: (FLMUINT16)0;
		}

		if( ui16ColChar == (F_COLL_MARKER | F_SC_SUB_COL))
		{
			FLMUINT	uiTempLen;

			// Do another pass on the word string adding diacritics/voicings

			uiColStrPos += 2;
			uiLength -= 2;
			if( RC_BAD( rc = f_asiaParseSubCol( pucWPStr, &uiWPStrLen,
				uiMaxWPBytes, &pucColStr[ uiColStrPos], &uiTempLen)))
			{
				goto Exit;
			}

			// The sub-collation section cannot be longer than what is
			// left of the key; otherwise uiLength would wrap around.

			if( uiTempLen > uiLength)
			{
				rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
				goto Exit;
			}

			uiColStrPos += uiTempLen;
			uiLength -= uiTempLen;
		}
		else
		{
			// ui16ColChar already holds the next marker (or zero).

			goto check_case;
		}
	}

	// Does the case info follow?

	if( uiLength >= 2)
	{
		ui16ColChar = (FLMUINT16)((pucColStr[ uiColStrPos] << 8) +
								pucColStr[ uiColStrPos + 1]);

check_case:

		if( ui16ColChar == (F_COLL_MARKER | F_SC_MIXED))
		{
			uiColStrPos += 2;
			if( RC_BAD( rc = flmAsiaParseCase( pucWPStr, &uiWPStrLen,
				uiMaxWPBytes, &pucColStr[ uiColStrPos], &uiColBytesProcessed)))
			{
				goto Exit;
			}

			uiColStrPos += uiColBytesProcessed;

			// Set bHadExtended to FALSE, because they will have
			// been taken care of in this pass.

			bHadExtended = FALSE;
		}
	}

	// Change embedded zeroes to 0xFFFFs

	if (bHadExtended)
	{
		FLMUINT		uiCnt;
		FLMBYTE *	pucTmp;

		for( uiCnt = 0, pucTmp = pucWPStr;
			  uiCnt < uiWPStrLen;
			  uiCnt += 2, pucTmp += 2)
		{
			if( FB2UW( pucTmp) == 0)
			{
				UW2FBA( 0xFFFF, pucTmp);
			}
		}
	}

	// Every byte of the key must have been accounted for.

	if (uiColStrLen != uiColStrPos)
	{
		rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR);
		goto Exit;
	}

	*puiUnconvChars = uiUnconvChars;
	*puiWPStrLen = uiWPStrLen;

Exit:

	return( rc);
}
/**************************************************************************
Desc: Combine the diacritic 5-bit values to an existing WP string
(non-Asian languages). Walks the WP string one character at a
time and consumes the matching entry from the sub-collation bits:
0 - nothing stored for this character
10 - next 5 bits hold the diacritic sub-collation value
110 - pad to byte boundary; the following 2 bytes replace
the character
1110 - as 110, but the 2-byte value is inserted as an extra
character
11110 - as 110, but a 2-byte slot is opened in front of the
value (unconvertible Unicode character)
Ret: NE_FLM_OK, or NE_FLM_CONV_DEST_OVERFLOW when an insertion would
make *puiWPStrLen + 2 exceed uiMaxWPBytes.
Notes: *puiSubColBitPos receives a BIT position (not a byte count) and
is only set on success.
For Hebrew/Arabic, entries may follow the last character; they
are processed by jumping back into the loop body.
***************************************************************************/
FSTATIC RCODE flmWPCmbSubColBuf(
FLMBYTE * pucWPStr, // Existing WP string to modify
FLMUINT * puiWPStrLen, // WP string length in bytes
// (grown by 2 for every inserted character)
FLMUINT uiMaxWPBytes, // Capacity limit used for the overflow check
const FLMBYTE * pucSubColBuf, // Diacritic values in 5 bit sets
FLMBOOL bHebrewArabic, // Set if language is Hebrew or Arabic
FLMUINT * puiSubColBitPos) // Returns number of bits consumed
{
RCODE rc = NE_FLM_OK;
FLMUINT uiSubColBitPos = 0;
FLMUINT uiNumChars = *puiWPStrLen >> 1;
FLMUINT16 ui16Diac;
FLMUINT16 ui16WPChar;
FLMUINT uiTemp;
// For each character (two bytes) in the WP string ...
while( uiNumChars--)
{
// Label used for hebrew/arabic - additional subcollation can follow
// This macro DOESN'T increment bitPos
if( testOneBit( pucSubColBuf, uiSubColBitPos))
{
// If "11110" - unmappable unicode char - 0xFFFF is before it
// If "1110" then INDEX extended char is inserted
// If "110" then extended char follows that replaces collation
// If "10" then take next 5 bits which
// contain the diacritic subcollation value.
// The Hebrew/Arabic code after the loop jumps to this label,
// i.e. into the loop body, with uiNumChars set to zero.
after_last_character:
uiSubColBitPos++; // Eat the first 1 bit
if( !testOneBit( pucSubColBuf, uiSubColBitPos))
{
uiSubColBitPos++; // Eat the 0 bit
ui16Diac = (FLMUINT16)(getNBits( 5, pucSubColBuf, uiSubColBitPos));
uiSubColBitPos += 5;
// If not extended base
if( (ui16WPChar = FB2UW( pucWPStr)) < 0x100)
{
// Convert to WP diacritic and combine characters
f_combineWPChar( &ui16WPChar, ui16WPChar,
(FLMUINT16)ml1_COLtoD[ ui16Diac]);
// Even if cmbcar fails, wpchar is still set to a valid value
UW2FBA( ui16WPChar, pucWPStr);
}
else if( (ui16WPChar & 0xFF00) == 0x0D00) // Arabic?
{
ui16WPChar = ArabSubColToWPChr[ ui16Diac];
UW2FBA( ui16WPChar, pucWPStr);
}
// else diacritic is extra info
// cmbcar should not handle extended chars for this design
}
else // "110" or "1110" or "11110"
{
uiSubColBitPos++; // Eat the 2nd '1' bit
if( testOneBit( pucSubColBuf, uiSubColBitPos)) // Test the 3rd bit
{
// Both "1110" and "11110" add one 2-byte character
if( (*puiWPStrLen) + 2 > uiMaxWPBytes)
{
rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
goto Exit;
}
// 1110 - shift wpchars down 1 word and insert value below
uiSubColBitPos++; // Eat the 3rd '1' bit
*puiWPStrLen += 2; // Return 2 more bytes
if( testOneBit( pucSubColBuf, uiSubColBitPos)) // Test 4th bit
{
// Unconvertable UNICODE character
// The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode
shiftN( pucWPStr, uiNumChars + uiNumChars + 4, 2);
uiSubColBitPos++; // Eat the 4th '1' bit
pucWPStr += 2; // Skip the 0xFFFF for now
}
else
{
// Move down 2 byte NULL and rest of the 2 byte characters
// The extended character does not have a 0xFF col value
shiftN( pucWPStr, uiNumChars + uiNumChars + 2, 2);
// The character that was at this position has moved to the
// next slot and still needs its own entry processed.
uiNumChars++; // Increment because inserted
// Fall through reading the actual charater value
}
}
uiSubColBitPos++; // Skip past the zero bit
uiSubColBitPos = (uiSubColBitPos + 7) & (~7); // roundup to next byte
uiTemp = bytesInBits( uiSubColBitPos); // compute position
// The two bytes are swapped when copied into the WP string.
pucWPStr[ 1] = pucSubColBuf[ uiTemp]; // Character set
pucWPStr[ 0] = pucSubColBuf[ uiTemp + 1]; // Character
uiSubColBitPos += 16;
}
}
else
{
// "0" - nothing stored for this character
uiSubColBitPos++;
}
pucWPStr += 2; // Next WP character
}
if( bHebrewArabic)
{
if( testOneBit( pucSubColBuf, uiSubColBitPos))
{
// Hebrew/Arabic can have trailing accents that
// don't have a matching collation value.
// Keep looping in this case.
// Note that subColBitPos isn't incremented above.
// With uiNumChars at zero the while condition fails right after
// the entry has been processed, and control returns to this test.
uiNumChars = 0; // Set so we won't loop forever!
goto after_last_character; // process trailing bit
}
uiSubColBitPos++; // Eat the last '0' bit
}
*puiSubColBitPos = uiSubColBitPos;
Exit:
return( rc);
}
/**************************************************************************
Desc:	Capture the current state of the collation stream in *pPos: the
		position reported by the underlying input stream together with
		the character currently held in m_uNextChar.
***************************************************************************/
void FLMAPI F_CollIStream::getCurrPosition(
	F_CollStreamPos *		pPos)
{
	// Position of the wrapped stream

	pPos->ui64Position = m_pIStream->getCurrPosition();

	// Character this object is holding in addition to that position

	pPos->uNextChar = m_uNextChar;
}