diff --git a/ftk/src/ftkcoll.cpp b/ftk/src/ftkcoll.cpp new file mode 100644 index 0000000..47cd437 --- /dev/null +++ b/ftk/src/ftkcoll.cpp @@ -0,0 +1,9968 @@ +//------------------------------------------------------------------------------ +// Desc: Routines for building collation keys +// +// Tabs: 3 +// +// Copyright (c) 1993-2006 Novell, Inc. All Rights Reserved. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of version 2 of the GNU General Public +// License as published by the Free Software Foundation. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, contact Novell, Inc. +// +// To contact Novell about this file by physical or electronic mail, +// you may find current contact information at www.novell.com +// +// $Id: fcollate.cpp 3111 2006-01-19 13:10:50 -0700 (Thu, 19 Jan 2006) dsanders $ +//------------------------------------------------------------------------------ + +#include "ftksys.h" + +// Character set #'s are same as high byte values +// except for algorithmic set. 
+
+#define CHSASCI 0 // ASCII
+#define CHSMUL1 1 // Multinational 1
+#define CHSMUL2 2 // Multinational 2
+#define CHSBOXD 3 // Box drawing
+#define CHSSYM1 4 // Typographic Symbols
+#define CHSSYM2 5 // Iconic Symbols
+#define CHSMATH 6 // Math
+#define CHMATHX 7 // Math Extension
+#define CHSGREK 8 // Greek
+#define CHSHEB 9 // Hebrew
+#define CHSCYR 10 // Cyrillic
+#define CHSKANA 11 // Japanese Kana
+#define CHSUSER 12 // User-defined
+#define CHSARB1 13 // Arabic
+#define CHSARB2 14 // Arabic script
+
+#define NCHSETS 15 // # of character sets (excluding Asian); one more than CHSARB2
+#define WP_MAX_CAR60_SIZE NCHSETS
+#define ACHSETS 0x0E0 // Maximum character set value - Asian
+#define ACHSMIN 0x024 // Minimum character set value - Asian
+#define ACHCMAX 0x0FE // Maximum character value in Asian sets
+
+// Collating Sequence Equates (each range start is the previous start plus the size of the previous range)
+
+#define COLLS 32 // first collating number (space/end of line)
+#define COLS0 255 // graphics/misc - chars without a collate value
+#define COLS1 (COLLS + 9) // quotes (= 41)
+#define COLS2 (COLS1 + 5) // parens (= 46)
+#define COLS3 (COLS2 + 6) // money (= 52)
+#define COLS4 (COLS3 + 6) // math ops (= 58)
+#define COLS5 (COLS4 + 8) // math others (= 66)
+#define COLS6 (COLS5 + 14) // others: %#&@\_|~ (= 80)
+#define COLS7 (COLS6 + 13) // greek (= 93)
+#define COLS8 (COLS7 + 25) // numbers (= 118)
+#define COLS9 (COLS8 + 10) // alphabet (= 128)
+#define COLS10 (COLS9 + 60) // cyrillic (= 188)
+#define COLS10h (COLS9 + 42) // hebrew - writes over european & cyrillic (= 170)
+#define COLS10a (COLS10h + 28) // arabic - inclusive from 198(C6)-252(FC) (= 198)
+#define COLS11 253 // End of list - arabic goes to the end
+#define COLS0_ARABIC COLS11 // Set if arabic accent marking
+#define COLS0_HEBREW COLS11 // Set if hebrew accent marking
+#define COLS_ASIAN_MARKS 0x140 // = 0x100 + COLS_ASIAN_MARK_VAL
+#define COLS_ASIAN_MARK_VAL 0x40 // Without 0x100
+
+#define SET_CASE_BIT 0x01
+#define SET_KATAKANA_BIT 0x01 // same bit value as SET_CASE_BIT
+#define SET_WIDTH_BIT 0x02
+
+#define UNK_UNICODE_CODE 0xFFFE
+
+#define MAX_SUBCOL_BUF (500)
+#define MAX_CASE_BYTES (150)
+
+// Definitions for diacritics.
+
+#define grave 0
+#define centerd 1
+#define tilde 2
+#define circum 3
+#define crossb 4
+#define slash 5
+#define acute 6
+#define umlaut 7
+#define macron 8
+
+#define aposab 9
+#define aposbes 10
+#define aposba 11
+
+#define ring 14
+#define dota 15
+#define dacute 16
+#define cedilla 17
+#define ogonek 18
+#define caron 19
+#define stroke 20
+
+#define breve 22
+#define dotlesi 239 // dotless i: slot 213 of fwp_ml1c_table + start char 26
+#define dotlesj 25 // dotless j
+
+#define gacute 83 // greek acute
+#define gdia 84 // greek diaeresis
+#define gactdia 85 // acute diaeresis
+#define ggrvdia 86 // grave diaeresis
+#define ggrave 87 // greek grave
+#define gcircm 88 // greek circumflex
+#define gsmooth 89 // smooth breathing
+#define grough 90 // rough breathing
+#define giota 91 // iota subscript
+#define gsmact 92 // smooth breathing acute
+#define grgact 93 // rough breathing acute
+#define gsmgrv 94 // smooth breathing grave
+#define grggrv 95 // rough breathing grave
+#define gsmcir 96 // smooth breathing circumflex
+#define grgcir 97 // rough breathing circumflex
+#define gactio 98 // acute iota
+#define ggrvio 99 // grave iota
+#define gcirio 100 // circumflex iota
+#define gsmio 101 // smooth iota
+#define grgio 102 // rough iota
+#define gsmaio 103 // smooth acute iota
+#define grgaio 104 // rough acute iota
+#define gsmgvio 105 // smooth grave iota
+#define grggvio 106 // rough grave iota
+#define gsmcio 107 // smooth circumflex iota
+#define grgcio 108 // rough circumflex iota
+#define ghprime 81 // high prime
+#define glprime 82 // low prime
+
+#define racute 200 // russian acute
+#define rgrave 201 // russian grave
+#define rrtdesc 204 // russian right descender
+#define rogonek 205 // russian ogonek
+#define rmacron 206 // russian macron
+
+#define ASCTBLLEN 95
+#define MNTBLLEN 219
+#define SYMTBLLEN 9
+#define GRKTBLLEN 219
+#define CYRLTBLLEN 200
+#define HEBTBL1LEN 27
+#define HEBTBL2LEN 35
+#define AR1TBLLEN 158
+#define AR2TBLLEN 179
+
+#define Upper_JP_A 0x2520
+#define Upper_JP_Z 0x2539
+#define Upper_KR_A 0x5420
+#define Upper_KR_Z 0x5439
+#define Upper_CS_A 0x82FC
+#define Upper_CS_Z 0x8316
+#define Upper_CT_A 0xA625
+#define Upper_CT_Z 0xA63E
+
+#define Lower_JP_a 0x2540
+#define Lower_JP_z 0x2559
+#define Lower_KR_a 0x5440
+#define Lower_KR_z 0x5459
+#define Lower_CS_a 0x82DC
+#define Lower_CS_z 0x82F5
+#define Lower_CT_a 0xA60B
+#define Lower_CT_z 0xA624
+
+// # of characters in each character set.
+// CHANGING ANY OF THESE DEFINES WILL CAUSE BUGS!
+
+#define ASC_N 95
+#define ML1_N 242
+#define ML2_N 145
+#define BOX_N 88
+#define TYP_N 103
+#define ICN_N 255
+#define MTH_N 238
+#define MTX_N 229
+#define GRK_N 219
+#define HEB_N 123
+#define CYR_N 250
+#define KAN_N 63
+#define USR_N 255
+#define ARB_N 196
+#define ARS_N 220
+
+// TOTAL: 2466 WP + 255 User Characters = 2721 (sum of the *_N values above; parenthesized so C_N is safe inside larger expressions)
+
+#define C_N (ASC_N + ML1_N + ML2_N + BOX_N +\
+	MTH_N + MTX_N + TYP_N + ICN_N +\
+	GRK_N + HEB_N + CYR_N + KAN_N +\
+	USR_N + ARB_N + ARS_N)
+
+// State table constants for double character sorting
+
+#define STATE1 1
+#define STATE2 2
+#define STATE3 3
+#define STATE4 4
+#define STATE5 5
+#define STATE6 6
+#define STATE7 7
+#define STATE8 8
+#define STATE9 9
+#define STATE10 10
+#define STATE11 11
+#define AFTERC 12
+#define AFTERH 13
+#define AFTERL 14
+#define INSTAE 15
+#define INSTOE 16
+#define INSTSG 17
+#define INSTIJ 18
+#define WITHAA 19
+
+#define START_COL 12
+#define START_ALL (START_COL + 1) // all US and european
+#define START_DK (START_COL + 2) // Danish
+#define START_IS (START_COL + 3) // Icelandic
+#define START_NO (START_COL + 4) // Norwegian
+#define START_SU (START_COL + 5) // Finnish
+#define START_SV (START_COL + 5) // Swedish (same value as Finnish)
+#define START_YK (START_COL + 6) // Ukrainian
+#define START_TK (START_COL + 7) // Turkish
+#define START_CZ (START_COL + 8) // Czech
+#define START_SL (START_COL + 8) // Slovak (same value as Czech)
+
+#define FIXUP_AREA_SIZE 24 // Number of characters to fix up
+
+FLMUINT16 flmWPAsiaGetCollation(
+	FLMUINT16 ui16WpChar,
+	FLMUINT16 ui16NextWpChar,
+	FLMUINT16 ui16PrevColValue,
+	FLMUINT16 * pui16ColValue,
+	FLMUINT16 * pui16SubColVal,
+	FLMBYTE * pucCaseBits,
+	FLMBOOL bUppercaseFlag);
+
+FLMUINT16 flmWPGetCollation(
+	FLMUINT16 ui16WpChar,
+	FLMUINT uiLanguage);
+
+FLMUINT16 flmWPUpper(
+	FLMUINT16 ui16WpChar);
+
+FLMUINT16 flmWPLower(
+	FLMUINT16 ui16WpChar);
+
+FLMBOOL flmWPIsUpper(
+	FLMUINT16 ui16WpChar);
+
+FLMBOOL flmWPBrkcar(
+	FLMUINT16 ui16WpChar,
+	FLMUINT16 * pui16BaseChar,
+	FLMUINT16 * pui16DiacriticChar);
+
+FLMUINT16 flmWPGetSubCol(
+	FLMUINT16 ui16WPValue,
+	FLMUINT16 ui16ColValue,
+	FLMUINT uiLanguage);
+
+typedef struct
+{
+	FLMBYTE base; // code for base char (0xFF = no decomposition)
+	FLMBYTE diacrit; // code for diacritic (0xFF = no decomposition)
+} BASE_DIACRIT_TABLE;
+
+typedef struct
+{
+	FLMUINT16 char_count; // # of characters in table
+	FLMUINT16 start_char; // start char.
+	BASE_DIACRIT_TABLE * table;
+
+} BASE_DIACRIT;
+
+typedef struct
+{
+	FLMBYTE key; // character key to search on
+	FLMBYTE * charPtr; // character pointer for matched key
+} TBL_B_TO_BP;
+
+typedef struct
+{
+	FLMBYTE ByteValue;
+	FLMUINT16 WordValue;
+} BYTE_WORD_TBL;
+
+// Static functions
+
+FSTATIC RCODE flmWPCmbSubColBuf(
+	FLMBYTE * pucWPStr,
+	FLMUINT * puiWPStrLen,
+	FLMUINT uiMaxWPBytes,
+	const FLMBYTE * pucSubColBuf,
+	FLMBOOL bHebrewArabic,
+	FLMUINT * puiSubColBitPos);
+
+FSTATIC FLMUINT flmWPToMixed(
+	FLMBYTE * pucWPStr,
+	FLMUINT uiWPStrLen,
+	const FLMBYTE * pucLowUpBitStr,
+	FLMUINT uiLang);
+
+FSTATIC FLMUINT16 flmWPZenToHankaku(
+	FLMUINT16 ui16WpChar,
+	FLMUINT16 * pui16DakutenOrHandakuten);
+
+FSTATIC FLMUINT16 flmWPHanToZenkaku(
+	FLMUINT16 ui16WpChar,
+	FLMUINT16 ui16NextWpChar,
+	FLMUINT16 * pui16Zenkaku);
+
+FSTATIC RCODE flmAsiaParseSubCol(
+	FLMBYTE * pucWPStr,
+	FLMUINT * puiWPStrLen,
+	FLMUINT uiMaxWPBytes,
+	const FLMBYTE * pucSubColBuf,
+	FLMUINT * puiSubColBitPos);
+
+FSTATIC RCODE flmAsiaParseCase(
+	FLMBYTE * pucWPStr,
+	FLMUINT * puiWPStrLen,
+	FLMUINT uiMaxWPBytes,
+	const FLMBYTE * pucCaseBits,
+	FLMUINT * puiColBytesProcessed);
+
+// Global data
+
+static FLMUINT16 * gv_pUnicodeToWP60 = NULL;
+static FLMUINT16 * gv_pWP60ToUnicode = NULL;
+static FLMUINT gv_uiMinUniChar = 0;
+static FLMUINT gv_uiMaxUniChar = 0;
+static FLMUINT gv_uiMinWPChar = 0;
+static FLMUINT gv_uiMaxWPChar = 0;
+
+// Collation tables
+
+/****************************************************************************
+Desc: Table of # of characters in each character set (indexed by CHSxxx set number)
+****************************************************************************/
+FLMBYTE fwp_c60_max[] =
+{
+	ASC_N, // ascii
+	ML1_N, // multinational 1
+	ML2_N, // multinational 2
+	BOX_N, // line draw
+	TYP_N, // typographic
+	ICN_N, // icons
+	MTH_N, // math
+	MTX_N, // math extension
+	GRK_N, // Greek
+	HEB_N, // Hebrew
+	CYR_N, // Cyrillic - Russian
+	KAN_N, // Kana
+	USR_N, // user
+	ARB_N, // Arabic
+	ARS_N, // Arabic Script
+};
+
+/****************************************************************************
+Desc: Base character location table
+	Bit mapped table. (1) - corresponding base char is in same
+	set as combined
+	(0) - corresponding base char is in ascii set
+
+Notes: In the following table, the bits are numbered from left
+	to right relative to each individual byte.
+	EX. 00000000b ;0-7
+	bit# 01234567
+****************************************************************************/
+FLMBYTE fwp_ml1_cb60[] =
+{
+	0x00, // 0-7
+	0x00, // 8-15
+	0x00, // 16-23
+	0x00, // 24-31
+	0x00, // 32-39
+	0x00, // 40-47
+	0x55, // 48-55
+	0x00, // 56-63
+	0x00, // 64-71
+	0x00, // 72-79
+	0x00, // 80-87
+	0x00, // 88-95
+	0x00, // 96-103
+	0x00, // 104-111
+	0x00, // 112-119
+	0x00, // 120-127
+	0x14, // 128-135
+	0x44, // 136-143
+	0x00, // 144-151
+	0x00, // 152-159
+	0x00, // 160-167
+	0x00, // 168-175
+	0x00, // 176-183
+	0x00, // 184-191
+	0x00, // 192-199
+	0x00, // 200-207
+	0x00, // 208-215
+	0x00, // 216-223
+	0x00, // 224-231
+	0x04, // 232-239
+	0x00, // 240-241
+};
+
+/****************************************************************************
+Desc: Format of index:
+	2 words before = count.
+	word before = start character.
+	db code for base char.
+	db code for diacritic
+Notes: Diacritical char is always in same set as composed char
+	base is in same set if other table indicates, else in ASCII
+****************************************************************************/
+BASE_DIACRIT_TABLE fwp_ml1c_table[] =
+{
+	{'A',acute},
+	{'a',acute},
+	{'A',circum},
+	{'a',circum},
+	{'A',umlaut},
+	{'a',umlaut},
+	{'A',grave},
+	{'a',grave},
+	{'A',ring},
+	{'a',ring},
+	{0xff,0xff}, // no AE digraph
+	{0xff,0xff}, // no ae digraph
+	{'C',cedilla},
+	{'c',cedilla},
+	{'E',acute},
+	{'e',acute},
+	{'E',circum},
+	{'e',circum},
+	{'E',umlaut},
+	{'e',umlaut},
+	{'E',grave},
+	{'e',grave},
+	{'I',acute},
+	{dotlesi,acute},
+	{'I',circum},
+	{dotlesi,circum},
+	{'I',umlaut},
+	{dotlesi,umlaut},
+	{'I',grave},
+	{dotlesi,grave},
+	{'N',tilde},
+	{'n',tilde},
+	{'O',acute},
+	{'o',acute},
+	{'O',circum},
+	{'o',circum},
+	{'O',umlaut},
+	{'o',umlaut},
+	{'O',grave},
+	{'o',grave},
+	{'U',acute},
+	{'u',acute},
+	{'U',circum},
+	{'u',circum},
+	{'U',umlaut},
+	{'u',umlaut},
+	{'U',grave},
+	{'u',grave},
+	{'Y',umlaut},
+	{'y',umlaut},
+	{'A',tilde},
+	{'a',tilde},
+	{'D',crossb},
+	{'d',crossb},
+	{'O',slash},
+	{'o',slash},
+	{'O',tilde},
+	{'o',tilde},
+	{'Y',acute},
+	{'y',acute},
+	{0xff,0xff}, // no eth
+	{0xff,0xff}, // no eth
+	{0xff,0xff}, // no Thorn
+	{0xff,0xff}, // no Thorn
+	{'A',breve},
+	{'a',breve},
+	{'A',macron},
+	{'a',macron},
+	{'A',ogonek},
+	{'a',ogonek},
+	{'C',acute},
+	{'c',acute},
+	{'C',caron},
+	{'c',caron},
+	{'C',circum},
+	{'c',circum},
+	{'C',dota},
+	{'c',dota},
+	{'D',caron},
+	{'d',caron},
+	{'E',caron},
+	{'e',caron},
+	{'E',dota},
+	{'e',dota},
+	{'E',macron},
+	{'e',macron},
+	{'E',ogonek},
+	{'e',ogonek},
+	{'G',acute},
+	{'g',acute},
+	{'G',breve},
+	{'g',breve},
+	{'G',caron},
+	{'g',caron},
+	{'G',cedilla},
+	{'g',aposab},
+	{'G',circum},
+	{'g',circum},
+	{'G',dota},
+	{'g',dota},
+	{'H',circum},
+	{'h',circum},
+	{'H',crossb},
+	{'h',crossb},
+	{'I',dota},
+	{dotlesi,dota},
+	{'I',macron},
+	{dotlesi,macron},
+	{'I',ogonek},
+	{'i',ogonek},
+	{'I',tilde},
+	{dotlesi,tilde},
+	{0xff,0xff}, // no IJ digraph
+	{0xff,0xff}, // no ij digraph
+	{'J',circum},
+	{dotlesj,circum},
+	{'K',cedilla},
+	{'k',cedilla},
+	{'L',acute},
+	{'l',acute},
+	{'L',caron},
+	{'l',caron},
+	{'L',cedilla},
+	{'l',cedilla},
+	{'L',centerd},
+	{'l',centerd},
+	{'L',stroke},
+	{'l',stroke},
+	{'N',acute},
+	{'n',acute},
+	{'N',aposba},
+	{'n',aposba},
+	{'N',caron},
+	{'n',caron},
+	{'N',cedilla},
+	{'n',cedilla},
+	{'O',dacute},
+	{'o',dacute},
+	{'O',macron},
+	{'o',macron},
+	{0xff,0xff}, // OE digraph
+	{0xff,0xff}, // oe digraph
+	{'R',acute},
+	{'r',acute},
+	{'R',caron},
+	{'r',caron},
+	{'R',cedilla},
+	{'r',cedilla},
+	{'S',acute},
+	{'s',acute},
+	{'S',caron},
+	{'s',caron},
+	{'S',cedilla},
+	{'s',cedilla},
+	{'S',circum},
+	{'s',circum},
+	{'T',caron},
+	{'t',caron},
+	{'T',cedilla},
+	{'t',cedilla},
+	{'T',crossb},
+	{'t',crossb},
+	{'U',breve},
+	{'u',breve},
+	{'U',dacute},
+	{'u',dacute},
+	{'U',macron},
+	{'u',macron},
+	{'U',ogonek},
+	{'u',ogonek},
+	{'U',ring},
+	{'u',ring},
+	{'U',tilde},
+	{'u',tilde},
+	{'W',circum},
+	{'w',circum},
+	{'Y',circum},
+	{'y',circum},
+	{'Z',acute},
+	{'z',acute},
+	{'Z',caron},
+	{'z',caron},
+	{'Z',dota},
+	{'z',dota},
+	{0xff,0xff}, // no Eng
+	{0xff,0xff}, // no eng
+	{'D',macron},
+	{'d',macron},
+	{'L',macron},
+	{'l',macron},
+	{'N',macron},
+	{'n',macron},
+	{'R',grave},
+	{'r',grave},
+	{'S',macron},
+	{'s',macron},
+	{'T',macron},
+	{'t',macron},
+	{'Y',breve},
+	{'y',breve},
+	{'Y',grave},
+	{'y',grave},
+	{'D',aposbes},
+	{'d',aposbes},
+	{'O',aposbes},
+	{'o',aposbes},
+	{'U',aposbes},
+	{'u',aposbes},
+	{'E',breve},
+	{'e',breve},
+	{'I',breve},
+	{dotlesi,breve},
+	{0xff,0xff}, // no dotless I
+	{0xff,0xff}, // no dotless i
+	{'O',breve},
+	{'o',breve}
+};
+
+/****************************************************************************
+Desc: Multinational 1 composed-character descriptor: entry count, first character covered, and the base/diacritic table
+****************************************************************************/
+BASE_DIACRIT fwp_ml1c =
+{
+	216, // # of characters in table
+	26, // start char
+	fwp_ml1c_table,
+};
+
+/****************************************************************************
+Desc: Format of index:
+	2 words before = count.
+	word before = start character.
+	db code for base char.
+	db code for diacritic
+Notes: Diacritical char is always in same set as composed char
+	base is in same set
+****************************************************************************/
+static BASE_DIACRIT_TABLE fwp_grk_c_table[] =
+{
+	{ 0, ghprime }, // ALPHA High Prime
+	{ 1, gacute }, // alpha acute
+	{ 10, ghprime }, // EPSILON High Prime
+	{ 11, gacute }, // epsilon Acute
+	{ 14, ghprime }, // ETA High Prime
+	{ 15, gacute }, // eta Acute
+	{ 18, ghprime }, // IOTA High Prime
+	{ 19, gacute }, // iota Acute
+	{ 0xFF, 0xFF }, // IOTA Diaeresis
+	{ 19, gdia }, // iota Diaeresis
+	{ 30, ghprime }, // OMICRON High Prime
+	{ 31, gacute }, // omicron Acute
+	{ 42, ghprime }, // UPSILON High Prime
+	{ 43, gacute }, // upsilon Acute
+	{ 0xFF, 0xFF }, // UPSILON Diaeresis
+	{ 43,gdia }, // upsilon Diaeresis
+	{ 50,ghprime }, // OMEGA High Prime
+	{ 51,gacute }, // omega Acute
+	{ 0xFF, 0xFF }, // epsilon (Variant)
+	{ 0xFF, 0xFF }, // theta (Variant)
+	{ 0xFF, 0xFF }, // kappa (Variant)
+	{ 0xFF, 0xFF }, // pi (Variant)
+	{ 0xFF, 0xFF }, // rho (Variant)
+	{ 0xFF, 0xFF }, // sigma (Variant)
+	{ 0xFF, 0xFF }, // UPSILON (Variant)
+	{ 0xFF, 0xFF }, // phi (Variant)
+	{ 0xFF, 0xFF }, // omega (Variant)
+	{ 0xFF, 0xFF }, // Greek Question Mark
+	{ 0xFF, 0xFF }, // Greek Semicolon
+	{ 0xFF, 0xFF }, // High Prime
+	{ 0xFF, 0xFF }, // Low Prime
+	{ 0xFF, 0xFF }, // Acute (Greek)
+	{ 0xFF, 0xFF }, // Diaeresis (Greek)
+	{ gacute,gdia }, // Acute Diaeresis
+	{ ggrave, gdia }, // Grave Diaeresis
+	{ 0xFF, 0xFF }, // Grave (Greek)
+	{ 0xFF, 0xFF }, // Circumflex (Greek)
+	{ 0xFF, 0xFF }, // Smooth Breathing
+	{ 0xFF, 0xFF }, // Rough Breathing
+	{ 0xFF, 0xFF }, // Iota Subscript
+	{ gsmooth, gacute }, // Smooth Breathing Acute
+	{ grough, gacute }, // Rough Breathing Acute
+	{ gsmooth, ggrave }, // Smooth Breathing Grave
+	{ grough, ggrave }, // Rough Breathing Grave
+	{ gsmooth, gcircm }, // Smooth Breathing Circumflex
+	{ grough, gcircm }, // Rough Breathing Circumflex
+	{ gacute, giota }, // Acute w/Iota Subscript
+	{ ggrave, giota }, // Grave w/Iota Subscript
+	{ gcircm, giota }, // Circumflex w/Iota Subscript
+	{ gsmooth, giota }, // Smooth Breathing w/Iota Subscript
+	{ grough, giota }, // Rough Breathing w/Iota Subscript
+	{ gsmact, giota }, // Smooth Breathing Acute w/Iota Subscript
+	{ grgact, giota }, // Rough Breathing Acute w/Iota Subscript
+	{ gsmgrv, giota }, // Smooth Breathing Grave w/Iota Subscript
+	{ grggrv, giota }, // Rough Breathing Grave w/Iota Subscript
+	{ gsmcir, giota }, // Smooth Breathing Circumflex w/Iota Sub
+	{ grgcir, giota }, // Rough Breathing Circumflex w/Iota Sub
+	{ 1, ggrave }, // alpha Grave
+	{ 1, gcircm }, // alpha Circumflex
+	{ 1, giota }, // alpha w/Iota
+	{ 1, gactio }, // alpha Acute w/Iota
+	{ 1, ggrvio }, // alpha Grave w/Iota
+	{ 1, gcirio }, // alpha Circumflex w/Iota
+	{ 1, gsmooth }, // alpha Smooth
+	{ 1, gsmact }, // alpha Smooth Acute
+	{ 1, gsmgrv }, // alpha Smooth Grave
+	{ 1, gsmcir }, // alpha Smooth Circumflex
+	{ 1, gsmio }, // alpha Smooth w/Iota
+	{ 1, gsmaio }, // alpha Smooth Acute w/Iota
+	{ 1, gsmgvio }, // alpha Smooth Grave w/Iota
+	{ 1, gsmcio }, // alpha Smooth Circumflex w/Iota
+	{ 1, grough }, // alpha Rough
+	{ 1, grgact }, // alpha Rough Acute
+	{ 1, grggrv }, // alpha Rough Grave
+	{ 1, grgcir }, // alpha Rough Circumflex
+	{ 1, grgio }, // alpha Rough w/Iota
+	{ 1, grgaio }, // alpha Rough Acute w/Iota
+	{ 1, grggvio }, // alpha Rough Grave w/Iota
+	{ 1, grgcio }, // alpha Rough Circumflex w/Iota
+	{ 11, ggrave }, // epsilon Grave
+	{ 11, gsmooth }, // epsilon Smooth
+	{ 11, gsmact }, // epsilon Smooth Acute
+	{ 11, gsmgrv }, // epsilon Smooth Grave
+	{ 11, grough }, // epsilon Rough
+	{ 11, grgact }, // epsilon Rough Acute
+	{ 11, grggrv }, // epsilon Rough Grave
+	{ 15, ggrave }, // eta Grave
+	{ 15, gcircm }, // eta Circumflex
+	{ 15, giota }, // eta w/Iota
+	{ 15, gactio }, // eta Acute w/Iota
+	{ 15, ggrvio }, // eta Grave w/Iota
+	{ 15, gcirio }, // eta Circumflex w/Iota
+	{ 15, gsmooth }, // eta Smooth
+	{ 15, gsmact }, // eta Smooth Acute
+	{ 15, gsmgrv }, // eta Smooth Grave
+	{ 15, gsmcir }, // eta Smooth Circumflex
+	{ 15, gsmio }, // eta Smooth w/Iota
+	{ 15, gsmaio }, // eta Smooth Acute w/Iota
+	{ 15, gsmgvio }, // eta Smooth Grave w/Iota
+	{ 15, gsmcio }, // eta Smooth Circumflex w/Iota
+	{ 15, grough }, // eta Rough
+	{ 15, grgact }, // eta Rough Acute
+	{ 15, grggrv }, // eta Rough Grave
+	{ 15, grgcir }, // eta Rough Circumflex
+	{ 15, grgio }, // eta Rough w/Iota
+	{ 15, grgaio }, // eta Rough Acute w/Iota
+	{ 15, grggvio }, // eta Rough Grave w/Iota
+	{ 15, grgcio }, // eta Rough Circumflex w/Iota
+	{ 19, ggrave }, // iota Grave
+	{ 19, gcircm }, // iota Circumflex
+	{ 19, gactdia }, // iota Acute Diaeresis
+	{ 19, ggrvdia }, // iota Grave Diaeresis
+	{ 19, gsmooth }, // iota Smooth
+	{ 19, gsmact }, // iota Smooth Acute
+	{ 19, gsmgrv }, // iota Smooth Grave
+	{ 19, gsmcir }, // iota Smooth Circumflex
+	{ 19, grough }, // iota Rough
+	{ 19, grgact }, // iota Rough Acute
+	{ 19, grggrv }, // iota Rough Grave
+	{ 19, grgcir }, // iota Rough Circumflex
+	{ 31, ggrave }, // omicron Grave
+	{ 31, gsmooth }, // omicron Smooth
+	{ 31, gsmact }, // omicron Smooth Acute
+	{ 31, gsmgrv }, // omicron Smooth Grave
+	{ 31, grough }, // omicron Rough
+	{ 31, grgact }, // omicron Rough Acute
+	{ 31, grggrv }, // omicron Rough Grave
+	{ 0xFF, 0xFF }, // rho rough
+	{ 0xFF, 0xFF }, // rho smooth
+	{ 43, ggrave }, // upsilon Grave
+	{ 43, gcircm }, // upsilon Circumflex
+	{ 43, gactdia }, // upsilon Acute Diaeresis
+	{ 43, ggrvdia }, // upsilon Grave Diaeresis
+	{ 43, gsmooth }, // upsilon Smooth
+	{ 43, gsmact }, // upsilon Smooth Acute
+	{ 43, gsmgrv }, // upsilon Smooth Grave
+	{ 43, gsmcir }, // upsilon Smooth Circumflex
+	{ 43, grough }, // upsilon Rough
+	{ 43, grgact }, // upsilon Rough Acute
+	{ 43, grggrv }, // upsilon Rough Grave
+	{ 43, grgcir }, // upsilon Rough Circumflex
+	{ 51, ggrave }, // omega Grave
+	{ 51, gcircm }, // omega Circumflex
+	{ 51, giota }, // omega w/Iota
+	{ 51, gactio }, // omega Acute w/Iota
+	{ 51, ggrvio }, // omega Grave w/Iota
+	{ 51, gcirio }, // omega Circumflex w/Iota
+	{ 51, gsmooth }, // omega Smooth
+	{ 51, gsmact }, // omega Smooth Acute
+	{ 51, gsmgrv }, // omega Smooth Grave
+	{ 51, gsmcir }, // omega Smooth Circumflex
+	{ 51, gsmio }, // omega Smooth w/Iota
+	{ 51, gsmaio }, // omega Smooth Acute w/Iota
+	{ 51, gsmgvio }, // omega Smooth Grave w/Iota
+	{ 51, gsmcio }, // omega Smooth Circumflex w/Iota
+	{ 51, grough }, // omega Rough
+	{ 51, grgact }, // omega Rough Acute
+	{ 51, grggrv }, // omega Rough Grave
+	{ 51, grgcir }, // omega Rough Circumflex
+	{ 51, grgio }, // omega Rough w/Iota
+	{ 51, grgaio }, // omega Rough Acute w/Iota
+	{ 51, grggvio }, // omega Rough Grave w/Iota
+	{ 51, grgcio} // omega Rough Circumflex w/Iota
+};
+
+/****************************************************************************
+Desc: Greek composed-character descriptor: entry count, first character covered, and the base/diacritic table
+****************************************************************************/
+static BASE_DIACRIT fwp_grk_c =
+{
+	163, // # of characters in table.
+	52, // start char.
+	fwp_grk_c_table
+};
+
+/****************************************************************************
+Desc: Format of index:
+	2 words before = count.
+	word before = start character.
+	db code for base char.
+	db code for diacritic
+Notes: Diacritical char is always in same set as composed char
+	base is in same set
+****************************************************************************/
+static BASE_DIACRIT_TABLE fwp_rus_c_table[] =
+{
+	{ 14, rrtdesc }, // ZHE with right descender
+	{ 15, rrtdesc }, // zhe with right descender
+	{ 0xFF, 0xFF}, // DZE
+	{ 0xFF, 0xFF}, // dze
+	{ 0xFF, 0xFF}, // Z
+	{ 0xFF, 0xFF}, // z
+	{ 18, rmacron }, // II with macron
+	{ 19, rmacron }, // ii with macron
+	{ 0xFF, 0xFF}, // I
+	{ 0xFF, 0xFF}, // i
+	{ 0xFF, 0xFF}, // YI
+	{ 0xFF, 0xFF}, // yi
+	{ 0xFF, 0xFF}, // I ligature
+	{ 0xFF, 0xFF}, // i ligature
+	{ 0xFF, 0xFF}, // JE
+	{ 0xFF, 0xFF}, // je
+	{ 0xFF, 0xFF}, // KJE
+	{ 0xFF, 0xFF}, // kje
+	{ 22, rrtdesc }, // KA with right descender
+	{ 23, rrtdesc }, // ka with right descender
+	{ 22, rogonek }, // KA ogonek
+	{ 23, rogonek }, // ka ogonek
+	{ 0xFF, 0xFF}, // KA vertical bar
+	{ 0xFF, 0xFF}, // ka vertical bar
+	{ 0xFF, 0xFF}, // LJE
+	{ 0xFF, 0xFF}, // lje
+	{ 28, rrtdesc }, // EN with right descender
+	{ 29, rrtdesc }, // en with right descender
+	{ 0xFF, 0xFF}, // NJE
+	{ 0xFF, 0xFF}, // nje
+	{ 0xFF, 0xFF}, // ROUND OMEGA
+	{ 0xFF, 0xFF}, // round omega
+	{ 0xFF, 0xFF}, // OMEGA
+	{ 0xFF, 0xFF}, // omega
+	{ 0xFF, 0xFF}, // TSHE
+	{ 0xFF, 0xFF}, // tshe
+	{ 0xFF, 0xFF}, // SHORT U
+	{ 0xFF, 0xFF}, // short u
+	{ 40, rmacron }, // U with macron
+	{ 41, rmacron }, // u with macron
+	{ 0xFF, 0xFF}, // STRAIGHT U
+	{ 0xFF, 0xFF}, // straight u
+	{ 0xFF, 0xFF}, // STRAIGHT U BAR
+	{ 0xFF, 0xFF}, // straight u bar
+	{ 0xFF, 0xFF}, // OU ligature
+	{ 0xFF, 0xFF}, // ou ligature
+	{ 44, rrtdesc }, // KHA with right descender
+	{ 45, rrtdesc }, // kha with right descender
+	{ 44, rogonek }, // KHA ogonek
+	{ 45, rogonek }, // kha ogonek
+	{ 0xFF, 0xFF}, // H
+	{ 0xFF, 0xFF}, // h
+	{ 0xFF, 0xFF}, // OMEGA titlo
+	{ 0xFF, 0xFF}, // omega titlo
+	{ 0xFF, 0xFF}, // DZHE
+	{ 0xFF, 0xFF}, // dzhe
+	{ 48, rrtdesc }, // CHE with right descender
+	{ 49, rrtdesc }, // che with right descender
+	{ 0xFF, 0xFF}, // CHE vertical bar
+	{ 0xFF, 0xFF}, // che vertical bar
+	{ 0xFF, 0xFF}, // SHCHA (variant)
+	{ 0xFF, 0xFF}, // shcha (variant)
+	{ 0xFF, 0xFF}, // YAT
+	{ 0xFF, 0xFF}, // yat
+	{ 0xFF, 0xFF}, // YUS BOLSHOI
+	{ 0xFF, 0xFF}, // yus bolshoi
+	{ 0xFF, 0xFF}, // BIG MALYI
+	{ 0xFF, 0xFF}, // big malyi
+	{ 0xFF, 0xFF}, // KSI
+	{ 0xFF, 0xFF}, // ksi
+	{ 0xFF, 0xFF}, // PSI
+	{ 0xFF, 0xFF}, // psi
+	{ 0xFF, 0xFF}, // FITA
+	{ 0xFF, 0xFF}, // fita
+	{ 0xFF, 0xFF}, // IZHITSA
+	{ 0xFF, 0xFF}, // izhitsa
+	{ 0, racute }, // Russian A acute
+	{ 1, racute }, // Russian a acute
+	{ 10, racute }, // Russian IE acute
+	{ 11, racute }, // Russian ie acute
+	{ 78, racute }, // Russian E acute
+	{ 79, racute }, // Russian e acute
+	{ 18, racute }, // Russian II acute
+	{ 19, racute }, // Russian ii acute
+	{ 88, racute }, // Russian I acute
+	{ 89, racute }, // Russian i acute
+	{ 90, racute }, // Russian YI acute
+	{ 91, racute }, // Russian yi acute
+	{ 30, racute }, // Russian O acute
+	{ 31, racute }, // Russian o acute
+	{ 40, racute }, // Russian U acute
+	{ 41, racute }, // Russian u acute
+	{ 56, racute }, // Russian YERI acute
+	{ 57, racute }, // Russian yeri acute
+	{ 60, racute }, // Russian REVERSED E acute
+	{ 61, racute }, // Russian reversed e acute
+	{ 62, racute }, // Russian IU acute
+	{ 63, racute }, // Russian iu acute
+	{ 64, racute }, // Russian IA acute
+	{ 65, racute }, // Russian ia acute
+	{ 0, rgrave }, // Russian A grave
+	{ 1, rgrave }, // Russian a grave
+	{ 10, rgrave }, // Russian IE grave
+	{ 11, rgrave }, // Russian ie grave
+	{ 12, rgrave }, // Russian YO grave
+	{ 13, rgrave }, // Russian yo grave
+	{ 18, rgrave }, // Russian II grave (base 18, same base as II acute above)
+	{ 19, rgrave }, // Russian ii grave (base 19, same base as ii acute above)
+	{ 30, rgrave }, // Russian O grave
+	{ 31, rgrave }, // Russian o grave
+	{ 40, rgrave }, // Russian U grave
+	{ 41, rgrave }, // Russian u grave
+	{ 56, rgrave }, // Russian YERI grave
+	{ 57, rgrave }, // Russian yeri grave
+	{ 60, rgrave }, // Russian REVERSED E grave
+	{ 61, rgrave }, // Russian reversed e grave
+	{ 62, rgrave }, // Russian IU grave
+	{ 63, rgrave }, // Russian iu grave
+	{ 64, rgrave }, // Russian IA grave
+	{ 65, rgrave} // Russian ia grave
+};
+
+/****************************************************************************
+Desc: Cyrillic composed-character descriptor: entry count, first character covered, and the base/diacritic table
+****************************************************************************/
+static BASE_DIACRIT fwp_rus_c =
+{
+	120, // # of characters in table.
+	156, // start char.
+	fwp_rus_c_table,
+};
+
+/****************************************************************************
+Desc: Table of pointers to character component tables (one slot per CHSxxx set; 0 = set has no composed characters).
+****************************************************************************/
+BASE_DIACRIT * fwp_car60_c[ NCHSETS] =
+{
+	(BASE_DIACRIT*)0, // no composed characters for ascii.
+	&fwp_ml1c,
+	(BASE_DIACRIT*)0, // no composed characters for multinational 2
+	(BASE_DIACRIT*)0, // no composed characters for line draw.
+	(BASE_DIACRIT*)0, // no composed characters for typographic.
+	(BASE_DIACRIT*)0, // no composed characters for icons.
+	(BASE_DIACRIT*)0, // no composed characters for math.
+	(BASE_DIACRIT*)0, // no composed characters for math extension.
+	&fwp_grk_c, // Greek
+	(BASE_DIACRIT*)0, // Hebrew
+	&fwp_rus_c, // Cyrillic - Russian
+	(BASE_DIACRIT*)0, // Hiragana or Katakana (Japanese)
+	(BASE_DIACRIT*)0, // no composed characters for user.
+	(BASE_DIACRIT*)0, // no composed characters for Arabic.
+	(BASE_DIACRIT*)0, // no composed characters for Arabic Script .
+};
+
+/****************************************************************************
+Desc: Map special chars in CharSet (x24) to collation values
+****************************************************************************/
+BYTE_WORD_TBL fwp_Ch24ColTbl[] = // Position in the table+1 is subColValue
+{
+	{1, COLLS+2}, // comma
+	{2, COLLS+1}, // maru
+	{5, COLS_ASIAN_MARKS+2}, // chuuten
+	{10, COLS_ASIAN_MARKS}, // dakuten
+	{11, COLS_ASIAN_MARKS+1}, // handakuten
+	{43, COLS2+2}, // angled brackets
+	{44, COLS2+3}, //
+	{49, COLS2+2}, // pointy brackets
+	{50, COLS2+3},
+	{51, COLS2+2}, // double pointy brackets
+	{52, COLS2+3},
+	{53, COLS1}, // Japanese quotes
+	{54, COLS1},
+	{55, COLS1}, // hollow Japanese quotes
+	{56, COLS1},
+	{57, COLS2+2}, // filled rounded brackets
+	{58, COLS2+3}
+};
+
+/****************************************************************************
+Desc: Kana subcollation values
+	BIT 0: set if large char
+	BIT 1: set if voiced
+	BIT 2: set if half voiced
+Notes:
+	To save space should be nibbles
+	IMPORTANT:
+	The '1' entries that do not have
+	a matching '0' entry have been
+	changed to zero to save space in
+	the subcollation area.
+	NOTE(review): an "original table" was said to be listed below, but none follows in this file.
+****************************************************************************/
+FLMBYTE KanaSubColTbl[] =
+{
+	0,1,0,1,0,1,0,1,0,1, // a A i I u U e E o O
+	1,3,0,3,0,3,1,3,0,3, // KA GA KI GI KU GU KE GE KO GO
+	0,3,0,3,0,3,0,3,0,3, // SA ZA SHI JI SU ZU SE ZE SO ZO
+	0,3,0,3,0,1,3,0,3,0,3, // TA DA CHI JI tsu TSU ZU TE DE TO DO
+	0,0,0,0,0, // NA NI NU NE NO
+	0,3,5,0,3,5,0,3,5, // HA BA PA HI BI PI FU BU PU
+	0,3,5,0,3,5, // HE BE PE HO BO PO
+	0,0,0,0,0, // MA MI MU ME MO
+	0,1,0,1,0,1, // ya YA yu YU yo YO
+	0,0,0,0,0, // RA RI RU RE RO
+	0,1,0,0,0, // wa WA WI WE WO
+	0,3,0,0 // N VU ka ke
+};
+
+/****************************************************************************
+Desc: Map katakana (CharSet x26) to collation values
+	kana collating values are two byte values
+	where the high byte is 0x01.
+****************************************************************************/
+FLMBYTE KanaColTbl[] =
+{
+	0, 0, 1, 1, 2, 2, 3, 3, 4, 4, // a A i I u U e E o O
+	5, 5, 6, 6, 7, 7, 8, 8, 9, 9, // KA GA KI GI KU GU KE GE KO GO
+	10,10,11,11,12,12,13,13,14,14, // SA ZA SHI JI SU ZU SE ZE SO ZO
+	15,15,16,16,17,17,17,18,18,19,19, // TA DA CHI JI tsu TSU ZU TE DE TO DO
+	20,21,22,23,24, // NA NI NU NE NO
+	25,25,25,26,26,26,27,27,27, // HA BA PA HI BI PI FU BU PU
+	28,28,28,29,29,29, // HE BE PE HO BO PO
+	30,31,32,33,34, // MA MI MU ME MO
+	35,35,36,36,37,37, // ya YA yu YU yo YO
+	38,39,40,41,42, // RA RI RU RE RO
+	43,43,44,45,46, // wa WA WI WE WO
+	47, 2, 5, 8 // N VU ka ke
+};
+
+/****************************************************************************
+Desc: Map KataKana collated value to vowel value for
+	use for the previous char. NOTE(review): 47 entries (indices 0-46); KanaColTbl yields 47 for N - confirm callers never index this table with it.
+****************************************************************************/
+FLMBYTE KanaColToVowel[] =
+{
+	0,1,2,3,4, // a i u e o
+	0,1,2,3,4, // ka ki ku ke ko
+	0,1,2,3,4, // sa shi su se so
+	0,1,2,3,4, // ta chi tsu te to
+	0,1,2,3,4, // na ni nu ne no
+	0,1,2,3,4, // ha hi hu he ho
+	0,1,2,3,4, // ma mi mu me mo
+	0,2,4, // ya yu yo
+	0,1,2,3,4, // ra ri ru re ro
+	0,1,3,4, // wa wi we wo
+};
+
+/****************************************************************************
+Desc: Convert Zenkaku (double wide) to Hankaku (single wide)
+	Character set 0x24 maps to single wide chars in other char sets.
+	This enables collation values to be found on some symbols.
+	This is also used to convert symbols from hankaku to Zen24.
+****************************************************************************/
+BYTE_WORD_TBL Zen24ToHankaku[] =
+{
+	{ 0 ,0x0020 }, // space
+	{ 1 ,0x0b03 }, // japanese comma
+	{ 2 ,0x0b00 }, // circle period
+	{ 3 , 44 }, // comma
+	{ 4 , 46 }, // period
+	{ 5 ,0x0b04 }, // center dot
+	{ 6 , 58 }, // colon
+	{ 7 , 59 }, // semicolon
+	{ 8 , 63 }, // question mark
+	{ 9 , 33 }, // exclamation mark
+	{ 10 ,0x0b3d }, // dakuten
+	{ 11 ,0x0b3e }, // handakuten
+	{ 12 ,0x0106 }, // accent mark
+	{ 13 , 96 }, // accent mark
+	{ 14 ,0x0107 }, // umlaut
+	{ 15 , 94 }, // caret
+	{ 16 ,0x0108 }, // macron
+	{ 17 , 95 }, // underscore
+	{ 27 ,0x0b0f }, // extend vowel
+	{ 28 ,0x0422 }, // mdash
+	{ 29 , 45 }, // hyphen
+	{ 30 , 47 }, // slash
+	{ 31 ,0x0607 }, // backslash
+	{ 32 , 126 }, // tilde
+	{ 33 ,0x0611 }, // doubleline
+	{ 34 ,0x0609 }, // line
+	{ 37 ,0x041d }, // left apostrophe
+	{ 38 ,0x041c }, // right apostrophe
+	{ 39 ,0x0420 }, // left quote
+	{ 40 ,0x041f }, // right quote
+	{ 41 , 40 }, // left paren
+	{ 42 , 41 }, // right paren
+	{ 45 , 91 }, // left bracket
+	{ 46 , 93 }, // right bracket
+	{ 47 , 123 }, // left curly bracket
+	{ 48 , 125 }, // right curly bracket
+	{ 53 ,0x0b01 }, // left j quote
+	{ 54 ,0x0b02 }, // right j quote
+	{ 59 , 43 }, // plus
+	{ 60 ,0x0600 }, // minus
+	{ 61 ,0x0601 }, // plus/minus
+	{ 62 ,0x0627 }, // times
+	{ 63 ,0x0608 }, // divide
+	{ 64 , 61 }, // equal
+	{ 65 ,0x0663 }, // unequal
+	{ 66 , 60 }, // less
+	{ 67 , 62 }, // greater
+	{ 68 ,0x0602 }, // less/equal
+	{ 69 ,0x0603 }, // greater/equal
+	{ 70 ,0x0613 }, // infinity
+	{ 71 ,0x0666 }, // triangle dots
+	{ 72 ,0x0504 }, // man
+	{ 73 ,0x0505 }, // woman
+	{ 75 ,0x062d }, // prime
+	{ 76 ,0x062e }, // double prime
+	{ 78 ,0x040c }, // yen
+	{ 79 , 36 }, // $
+	{ 80 ,0x0413 }, // cent
+	{ 81 ,0x040b }, // pound
+	{ 82 , 37 }, // %
+	{ 83 , 35 }, // #
+	{ 84 , 38 }, // &
+	{ 85 , 42 }, // *
+	{ 86 , 64 }, // @
+	{ 87 ,0x0406 }, // squiggle
+	{ 89 ,0x06b8 }, // filled star
+	{ 90 ,0x0425 }, // hollow circle
+	{ 91 ,0x042c }, // filled circle
+	{ 93 ,0x065f }, // hollow diamond
+	{ 94 ,0x0660 }, // filled diamond
+	{ 95 ,0x0426 }, // hollow box
+	{ 96 ,0x042e }, // filled box
+	{ 97 ,0x0688 }, // hollow triangle
+	{ 99 ,0x0689 }, // hollow upside down triangle
+	{ 103,0x0615 }, // right arrow
+	{ 104,0x0616 }, // left arrow
+	{ 105,0x0617 }, // up arrow
+	{ 106,0x0622 }, // down arrow
+	{ 119,0x060f },
+	{ 121,0x0645 },
+	{ 122,0x0646 },
+	{ 123,0x0643 },
+	{ 124,0x0644 },
+	{ 125,0x0642 }, // union
+	{ 126,0x0610 }, // intersection
+	{ 135,0x0655 },
+	{ 136,0x0656 },
+	{ 138,0x0638 }, // right arrow
+	{ 139,0x063c }, // left/right arrow
+	{ 140,0x067a },
+	{ 141,0x0679 },
+	{ 153,0x064f }, // angle
+	{ 154,0x0659 },
+	{ 155,0x065a },
+	{ 156,0x062c },
+	{ 157,0x062b },
+	{ 158,0x060e },
+	{ 159,0x06b0 },
+	{ 160,0x064d },
+	{ 161,0x064e },
+	{ 162,0x050e }, // square root
+	{ 164,0x0604 },
+	{ 175,0x0623 }, // angstrom
+	{ 176,0x044b }, // percent
+	{ 177,0x051b }, // sharp
+	{ 178,0x051c }, // flat
+	{ 179,0x0509 }, // musical note
+	{ 180,0x0427 }, // dagger
+	{ 181,0x0428 }, // double dagger
+	{ 182,0x0405 }, // paragraph
+	{ 187,0x068f } // big hollow circle
+};
+
+/**************************************************************************** +Desc: Maps CS26 to CharSet 11 + Used to uncollate characters for FLAIM - placed here for consistency + 0x80 - add dakuten + 0xC0 - add handakuten + 0xFF - no mapping exists +****************************************************************************/ +FLMBYTE MapCS26ToCharSet11[ 86] = +{ + 0x06, // 0 a + 0x10, // 1 A + 0x07, // 2 i + 0x11, // 3 I + 0x08, // 4 u + 0x12, // 5 U + 0x09, // 6 e + 0x13, // 7 E + 0x0a, // 8 o + 0x14, // 9 O + + 0x15, // 0x0a KA + 0x95, // GA - 21 followed by 0x3D dakuten + + 0x16, // 0x0c KI + 0x96, // GI + 0x17, // 0x0e KU + 0x97, // GU + 0x18, // 0x10 KE + 0x98, // GE + 0x19, // 0x12 KO + 0x99, // GO + + 0x1a, // 0x14 SA + 0x9a, // ZA + 0x1b, // 0x16 SHI + 0x9b, // JI + 0x1c, // 0x18 SU + 0x9c, // ZU + 0x1d, // 0x1a SE + 0x9d, // ZE + 0x1e, // 0x1c SO + 0x9e, // ZO + + 0x1f, // 0x1e TA + 0x9f, // DA + 0x20, // 0x20 CHI + 0xa0, // JI + 0x0e, // 0x22 small tsu + 0x21, // 0x23 TSU + 0xa1, // ZU + 0x22, // 0x25 TE + 0xa2, // DE + 0x23, // 0x27 TO + 0xa3, // DO + + 0x24, // 0x29 NA + 0x25, // 0x2a NI + 0x26, // 0x2b NU + 0x27, // 0x2c NE + 0x28, // 0x2d NO + + 0x29, // 0x2e HA + 0xa9, // 0x2f BA + 0xe9, // 0x30 PA + 0x2a, // 0x31 HI + 0xaa, // 0x32 BI + 0xea, // 0x33 PI + 0x2b, // 0x34 FU + 0xab, // 0x35 BU + 0xeb, // 0x36 PU + 0x2c, // 0x37 HE + 0xac, // 0x38 BE + 0xec, // 0x39 PE + 0x2d, // 0x3a HO + 0xad, // 0x3b BO + 0xed, // 0x3c PO + + 0x2e, // 0x3d MA + 0x2f, // 0x3e MI + 0x30, // 0x3f MU + 0x31, // 0x40 ME + 0x32, // 0x41 MO + + 0x0b, // 0x42 small ya + 0x33, // 0x43 YA + 0x0c, // 0x44 small yu + 0x34, // 0x45 YU + 0x0d, // 0x46 small yo + 0x35, // 0x47 YO + + 0x36, // 0x48 RA + 0x37, // 0x49 RI + 0x38, // 0x4a RU + 0x39, // 0x4b RE + 0x3a, // 0x4c RO + + 0xff, // 0x4d small wa + 0x3b, // 0x4e WA + 0xff, // 0x4f WI + 0xff, // 0x50 WE + 0x05, // 0x51 WO + + 0x3c, // 0x52 N + 0xff, // 0x53 VU + 0xff, // 0x54 ka + 0xff // 0x55 ke +}; + 
+/**************************************************************************** +Desc: Conversion from single (Hankaku) to double (Zenkaku) wide characters + Used in flmWPHanToZenkaku() + Maps from charset 11 to CS24 (punctuation) (starting from 11,0) +****************************************************************************/ +FLMBYTE From0AToZen[] = // ' changed because of windows +{ + 0, 9, 40, 0x53, // sp ! " # + 0x4f, 0x52, 0x54, 38, // $ % & ' + // Was 187 for ! and 186 for ' + 0x29, 0x2a, 0x55, 0x3b, // ( ) * + + 3, 0x1d, 4, 0x1e // , - . / +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE From0BToZen[] = +{ + 6, 7, 0x42, 0x40, // : ; < = + 0x43, 8, 0x56 // > ? @ +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE From0CToZen[] = +{ + 0x2d, 0x1f, 0x2e, 0x0f, 0x11, 0x0d // [ BACKSLASH ] ^ _ ` +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE From0DToZen[] = +{ + 0x2f, 0x22, 0x30, 0x20 // { | } ~ +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE From8ToZen[] = +{ + 0x5e, 0x7e, 0x5f, 0x7f, 0x5f, 0xFF, 0x60, 0x80, + 0x61, 0x81, 0x62, 0x82, 0x63, 0x83, 0x64, 0x84, + 0x65, 0x85, 0x66, 0x86, 0x67, 0x87, 0x68, 0x88, + 0x69, 0x89, 0x6a, 0x8a, 0x6b, 0x8b, 0x6c, 0x8c, + 0x6d, 0x8d, 0x6e, 0x8e, 0x6f, 0x8f, 0x6f, 0xFF, + 0x70, 0x90, 0x71, 0x91, 0x72, 0x92, 0x73, 0x93, + 0x74, 0x94, 0x75, 0x95 +}; + +/**************************************************************************** +Desc: +****************************************************************************/ 
+FLMBYTE From11AToZen[] = // 11 to 24 punctuation except dash +{ + 2, // japanese period + 0x35, // left bracket + 0x36, // right bracket + 0x01, // comma + 0x05 // chuuten +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE From11BToZen[] = // 11 to 26 (katakana) from 11,5 +{ + 0x51, // wo + 0,2,4,6,8,0x42,0x44,0x46,0x22, // small a i u e o ya yu yo tsu + 0xFF, 1, 3, 5, 7, 9, // dash (x241b) a i u e o + 0x0a, 0x0c, 0x0e, 0x10, 0x12, // ka ki ku ke ko + 0x14, 0x16, 0x18, 0x1a, 0x1c, // sa shi su se so + 0x1e, 0x20, 0x23, 0x25, 0x27, // ta chi tsu te to + 0x29, 0x2a, 0x2b, 0x2c, 0x2d, // na ni nu ne no + 0x2e, 0x31, 0x34, 0x37, 0x3a, // ha hi fu he ho + 0x3d, 0x3e, 0x3f, 0x40, 0x41, // ma mi mu me mo + 0x43, 0x45, 0x47, // ya yu yo + 0x48, 0x49, 0x4a, 0x4b, 0x4c, // ra ri ru re ro + 0x4e, 0x52 // WA N +}; // does not have wa WI WE VU ka ke + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMUINT16 fwp_indexi[] = +{ + 0,11,14,15,17,18,19,21,22,23,24,25,26,35,59 +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMUINT16 fwp_indexj[] = // DOUBLE CHAR AREA - LANGUAGES +{ + FLM_CA_LANG, // Catalan (0) + FLM_CF_LANG, // Canadian French + FLM_CZ_LANG, // Czech + FLM_SL_LANG, // Slovak + FLM_DE_LANG, // German + FLM_SD_LANG, // Swiss German + FLM_ES_LANG, // Spanish (Spain) + FLM_FR_LANG, // French + FLM_NL_LANG, // Netherlands + 0xFFFF, // DK_LANG, Danish - support for 'aa' -> a-ring out + 0xFFFF, // NO_LANG, Norwegian - support for 'aa' -> a-ring out + 0x0063, // c - DOUBLE CHARACTERS - STATE ENTRIES + 0x006c, // l + 0x0197, // l with center dot + 0x0063, // c + 0x0125, // ae digraph + 0x01a7, // oe digraph + 
0x0068, // h + 0x0068, // h + 0x006c, // l + 0x0101, // center dot alone + 0x006c, // l + 0x0117, // ? (for German) + 0x018b, // ij digraph + 0x0000, // was 'a' - will no longer map 'aa' to a-ring + 0x0000, // was 'a' + + FLM_CZ_LANG, // SINGLE CHARS - LANGUAGES + FLM_DK_LANG, + FLM_NO_LANG, + FLM_SL_LANG, + FLM_TK_LANG, + FLM_SU_LANG, + FLM_IS_LANG, + FLM_SV_LANG, + FLM_YK_LANG, + // SINGLE CHARS + 0x011e, // A Diaeresis - alternate collating sequences + 0x011f, // a Diaeresis + 0x0122, // A Ring - 2 + 0x0123, // a Ring + 0x0124, // AE Diagraph - 4 + 0x0125, // ae diagraph + 0x013e, // O Diaeresis - 6 + 0x013f, // o Diaeresis + 0x0146, // U Diaeresis - 8 + 0x0147, // u Diaeresis + 0x0150, // O Slash - 10 + 0x0151, // o Slash + + 0x0A3a, // CYRILLIC SOFT SIGN - 12 + 0x0A3b, // CYRILLIC soft sign + 0x01ee, // dotless i - turkish - 14 + 0x01ef, // dotless I - turkish + 0x0162, // C Hacek/caron - 1,98 - 16 + 0x0163, // c Hacek/caron - 1,99 + 0x01aa, // R Hacek/caron - 1,170 - 18 + 0x01ab, // r Hacek/caron - 1,171 + 0x01b0, // S Hacek/caron - 1,176 - 20 + 0x01b1, // s Hacek/caron - 1,177 + 0x01ce, // Z Hacek/caron - 1,206 - 22 + 0x01cf, // z Hacek/caron - 1,207 +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMUINT16 fwp_valuea[] = +{ +// DOUBLE CHAR STATE VALUES + STATE1, // 00 + STATE3, + STATE2, + STATE2, + STATE8, + STATE8, + STATE1, + STATE3, + STATE9, + STATE10, // No longer in use + STATE10, // No longer in use + STATE4, + STATE6, + STATE6, + STATE5, + INSTAE, + INSTOE, + AFTERC, + AFTERH, + AFTERL, + STATE7, + STATE6, + INSTSG, // ss for German + INSTIJ, + STATE11, // aa - no longer in use + WITHAA, // aa - no longer in use + +// SINGLE CHARS - LANGUAGES + START_CZ, // Czech + START_DK, // Danish + START_NO, // Norwegian + START_SL, // Slovak + START_TK, // Turkish + START_SU, // Finnish + START_IS, // Icelandic + START_SV, // Swedish + 
START_YK, // Ukrainian + +// SINGLE CHARS FIXUP AREAS + COLS9, COLS9, COLS9, COLS9, // US & OTHERS + COLS9+1, COLS9+1, COLS9+21, COLS9+21, + COLS9+30, COLS9+30, COLS9+21, COLS9+21, + COLS10+43, COLS10+43, COLS9+12, COLS9+12, + COLS9+3, COLS9+3, COLS9+25, COLS9+25, + COLS9+27, COLS9+27, COLS9+35, COLS9+35, + + COLS9+45, COLS9+45, COLS9+55, COLS9+55, // DANISH + COLS9+42, COLS9+42, COLS9+53, COLS9+53, + COLS9+30, COLS9+30, COLS9+49, COLS9+49, // Oct98 U Diaer no longer to y Diaer + COLS10+43, COLS10+43, COLS9+12, COLS9+12, + COLS9+3, COLS9+3, COLS9+25, COLS9+25, + COLS9+27, COLS9+27, COLS9+35, COLS9+35, + + COLS9, COLS9, COLS9, COLS9, // Icelandic + COLS9+46, COLS9+46, COLS9+50, COLS9+50, + COLS9+30, COLS9+30, COLS9+54, COLS9+54, + COLS10+43, COLS10+43, COLS9+12, COLS9+12, + COLS9+3, COLS9+3, COLS9+25, COLS9+25, + COLS9+27, COLS9+27, COLS9+35, COLS9+35, + + COLS9, COLS9, COLS9+51, COLS9+51, // Norwegian + COLS9+43, COLS9+43, COLS9+21, COLS9+21, + COLS9+30, COLS9+30, COLS9+47, COLS9+47, + COLS10+43, COLS10+43, COLS9+12, COLS9+12, + COLS9+3, COLS9+3, COLS9+25, COLS9+25, + COLS9+27, COLS9+27, COLS9+35, COLS9+35, + + COLS9+48, COLS9+48, COLS9+44, COLS9+44, // Finnish/Swedish + COLS9+1, COLS9+1, COLS9+52, COLS9+52, + COLS9+30, COLS9+30, COLS9+21, COLS9+21, // Oct98 U Diaer no longer to y Diaer + COLS10+43, COLS10+43, COLS9+12, COLS9+12, + COLS9+3, COLS9+3, COLS9+25, COLS9+25, + COLS9+27, COLS9+27, COLS9+35, COLS9+35, + + COLS9, COLS9, COLS9, COLS9, // Ukrain + COLS9+1, COLS9+1, COLS9+21, COLS9+21, + COLS9+30, COLS9+30, COLS9+21, COLS9+21, + COLS10+48, COLS10+48, COLS9+12, COLS9+12, + COLS9+3, COLS9+3, COLS9+25, COLS9+25, + COLS9+27, COLS9+27, COLS9+35, COLS9+35, + + COLS9, COLS9, COLS9, COLS9, // Turkish + COLS9+1, COLS9+1, COLS9+21, COLS9+21, + COLS9+30, COLS9+30, COLS9+21, COLS9+21, + COLS9+43, COLS9+43, COLS9+11, COLS9+11, // dotless i same as + COLS9+3, COLS9+3, COLS9+25, COLS9+25, // the "CH" in Czech + COLS9+27, COLS9+27, COLS9+35, COLS9+35, // works because char + 
// fails brkcar() + + COLS9, COLS9, COLS9, COLS9, // Czech / Slovak + COLS9+1, COLS9+1, COLS9+21, COLS9+21, + COLS9+30, COLS9+30, COLS9+21, COLS9+21, + COLS10+43, COLS10+43, COLS9+12, COLS9+12, + COLS9+5, COLS9+5, COLS9+26, COLS9+26, // carons + COLS9+28, COLS9+28, COLS9+36, COLS9+36 +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE fwp_asc60Tbl[ ASCTBLLEN + 2] = +{ + 0x20, // initial character offset!! + ASCTBLLEN, // len of this table + COLLS, // + COLLS+5, // ! + COLS1, // " + COLS6+1, // # + COLS3, // $ + COLS6, // % + COLS6+2, // & + COLS1+1, // ' + COLS2, // ( + COLS2+1, // ) + COLS4+2, // * + COLS4, // + + COLLS+2, // , + COLS4+1, // - + COLLS+1, // . + COLS4+3, // / + COLS8, // 0 + COLS8+1, // 1 + COLS8+2, // 2 + COLS8+3, // 3 + COLS8+4, // 4 + COLS8+5, // 5 + COLS8+6, // 6 + COLS8+7, // 7 + COLS8+8, // 8 + COLS8+9, // 9 + COLLS+3, // : + COLLS+4, // ; + COLS5, // < + COLS5+2, // = + COLS5+4, // > + COLLS+7, // ? 
+ COLS6+3, // @ + COLS9, // A + COLS9+2, // B + COLS9+3, // C + COLS9+6, // D + COLS9+7, // E + COLS9+8, // F + COLS9+9, // G + COLS9+10, // H + COLS9+12, // I + COLS9+14, // J + COLS9+15, // K + COLS9+16, // L + COLS9+18, // M + COLS9+19, // N + COLS9+21, // O + COLS9+23, // P + COLS9+24, // Q + COLS9+25, // R + COLS9+27, // S + COLS9+29, // T + COLS9+30, // U + COLS9+31, // V + COLS9+32, // W + COLS9+33, // X + COLS9+34, // Y + COLS9+35, // Z + COLS9+40, // [ (note: alphabetic - end of list) + COLS6+4, // Backslash + COLS9+41, // ] (note: alphabetic - end of list) + COLS4+4, // ^ + COLS6+5, // _ + COLS1+2, // ` + COLS9, // a + COLS9+2, // b + COLS9+3, // c + COLS9+6, // d + COLS9+7, // e + COLS9+8, // f + COLS9+9, // g + COLS9+10, // h + COLS9+12, // i + COLS9+14, // j + COLS9+15, // k + COLS9+16, // l + COLS9+18, // m + COLS9+19, // n + COLS9+21, // o + COLS9+23, // p + COLS9+24, // q + COLS9+25, // r + COLS9+27, // s + COLS9+29, // t + COLS9+30, // u + COLS9+31, // v + COLS9+32, // w + COLS9+33, // x + COLS9+34, // y + COLS9+35, // z + COLS2+4, // { + COLS6+6, // | + COLS2+5, // } + COLS6+7 // ~ +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE fwp_mn60Tbl[ MNTBLLEN + 2] = // multinational table +{ + 23, // initial character offset!! 
+ MNTBLLEN, // len of this table + COLS9+27, // German Double s + COLS9+15, // Icelandic k + COLS9+14, // Dotless j + +// IBM Charset + + COLS9, // A Acute + COLS9, // a Acute + COLS9, // A Circumflex + COLS9, // a Circumflex + COLS9, // A Diaeresis or Umlaut + COLS9, // a Diaeresis or Umlaut + COLS9, // A Grave + COLS9, // a Grave + COLS9, // A Ring + COLS9, // a Ring + COLS9+1, // AE digraph + COLS9+1, // ae digraph + COLS9+3, // C Cedilla + COLS9+3, // c Cedilla + COLS9+7, // E Acute + COLS9+7, // e Acute + COLS9+7, // E Circumflex + COLS9+7, // e Circumflex + COLS9+7, // E Diaeresis or Umlaut + COLS9+7, // e Diaeresis or Umlaut + COLS9+7, // E Grave + COLS9+7, // e Grave + COLS9+12, // I Acute + COLS9+12, // i Acute + COLS9+12, // I Circumflex + COLS9+12, // i Circumflex + COLS9+12, // I Diaeresis or Umlaut + COLS9+12, // i Diaeresis or Umlaut + COLS9+12, // I Grave + COLS9+12, // i Grave + COLS9+20, // N Tilde + COLS9+20, // n Tilde + COLS9+21, // O Acute + COLS9+21, // o Acute + COLS9+21, // O Circumflex + COLS9+21, // o Circumflex + COLS9+21, // O Diaeresis or Umlaut + COLS9+21, // o Diaeresis or Umlaut + COLS9+21, // O Grave + COLS9+21, // o Grave + COLS9+30, // U Acute + COLS9+30, // u Acute + COLS9+30, // U Circumflex + COLS9+30, // u Circumflex + COLS9+30, // U Diaeresis or Umlaut + COLS9+30, // u Diaeresis or Umlaut + COLS9+30, // U Grave + COLS9+30, // u Grave + COLS9+34, // Y Diaeresis or Umlaut + COLS9+34, // y Diaeresis or Umlaut + +// IBM foreign + + COLS9, // A Tilde + COLS9, // a Tilde + COLS9+6, // D Cross Bar + COLS9+6, // d Cross Bar + COLS9+21, // O Slash + COLS9+21, // o Slash + COLS9+21, // O Tilde + COLS9+21, // o Tilde + COLS9+34, // Y Acute + COLS9+34, // y Acute + COLS9+6, // Uppercase Eth + COLS9+6, // Lowercase Eth + COLS9+37, // Uppercase Thorn + COLS9+37, // Lowercase Thorn + +// Teletex chars + + COLS9, // A Breve + COLS9, // a Breve + COLS9, // A Macron + COLS9, // a Macron + COLS9, // A Ogonek + COLS9, // a Ogonek + COLS9+3, // C 
Acute + COLS9+3, // c Acute + COLS9+3, // C Caron or Hachek + COLS9+3, // c Caron or Hachek + COLS9+3, // C Circumflex + COLS9+3, // c Circumflex + COLS9+3, // C Dot Above + COLS9+3, // c Dot Above + COLS9+6, // D Caron or Hachek (Apostrophe Beside) + COLS9+6, // d Caron or Hachek (Apostrophe Beside) + COLS9+7, // E Caron or Hachek + COLS9+7, // e Caron or Hachek + COLS9+7, // E Dot Above + COLS9+7, // e Dot Above + COLS9+7, // E Macron + COLS9+7, // e Macron + COLS9+7, // E Ogonek + COLS9+7, // e Ogonek + COLS9+9, // G Acute + COLS9+9, // g Acute + COLS9+9, // G Breve + COLS9+9, // g Breve + COLS9+9, // G Caron or Hachek + COLS9+9, // g Caron or Hachek + COLS9+9, // G Cedilla (Apostrophe Under) + COLS9+9, // g Cedilla (Apostrophe Over) + COLS9+9, // G Circumflex + COLS9+9, // g Circumflex + COLS9+9, // G Dot Above + COLS9+9, // g Dot Above + COLS9+10, // H Circumflex + COLS9+10, // h Circumflex + COLS9+10, // H Cross Bar + COLS9+10, // h Cross Bar + COLS9+12, // I Dot Above (Sharp Accent) + COLS9+12, // i Dot Above (Sharp Accent) + COLS9+12, // I Macron + COLS9+12, // i Macron + COLS9+12, // I Ogonek + COLS9+12, // i Ogonek + COLS9+12, // I Tilde + COLS9+12, // i Tilde + COLS9+13, // IJ Digraph + COLS9+13, // ij Digraph + COLS9+14, // J Circumflex + COLS9+14, // j Circumflex + COLS9+15, // K Cedilla (Apostrophe Under) + COLS9+15, // k Cedilla (Apostrophe Under) + COLS9+16, // L Acute + COLS9+16, // l Acute + COLS9+16, // L Caron or Hachek (Apostrophe Beside) + COLS9+16, // l Caron or Hachek (Apostrophe Beside) + COLS9+16, // L Cedilla (Apostrophe Under) + COLS9+16, // l Cedilla (Apostrophe Under) + COLS9+16, // L Center Dot + COLS9+16, // l Center Dot + COLS9+16, // L Stroke + COLS9+16, // l Stroke + COLS9+19, // N Acute + COLS9+19, // n Acute + COLS9+19, // N Apostrophe + COLS9+19, // n Apostrophe + COLS9+19, // N Caron or Hachek + COLS9+19, // n Caron or Hachek + COLS9+19, // N Cedilla (Apostrophe Under) + COLS9+19, // n Cedilla (Apostrophe Under) + COLS9+21, // 
O Double Acute + COLS9+21, // o Double Acute + COLS9+21, // O Macron + COLS9+21, // o Macron + COLS9+22, // OE digraph + COLS9+22, // oe digraph + COLS9+25, // R Acute + COLS9+25, // r Acute + COLS9+25, // R Caron or Hachek + COLS9+25, // r Caron or Hachek + COLS9+25, // R Cedilla (Apostrophe Under) + COLS9+25, // r Cedilla (Apostrophe Under) + COLS9+27, // S Acute + COLS9+27, // s Acute + COLS9+27, // S Caron or Hachek + COLS9+27, // s Caron or Hachek + COLS9+27, // S Cedilla + COLS9+27, // s Cedilla + COLS9+27, // S Circumflex + COLS9+27, // s Circumflex + COLS9+29, // T Caron or Hachek (Apostrophe Beside) + COLS9+29, // t Caron or Hachek (Apostrophe Beside) + COLS9+29, // T Cedilla (Apostrophe Under) + COLS9+29, // t Cedilla (Apostrophe Under) + COLS9+29, // T Cross Bar + COLS9+29, // t Cross Bar + COLS9+30, // U Breve + COLS9+30, // u Breve + COLS9+30, // U Double Acute + COLS9+30, // u Double Acute + COLS9+30, // U Macron + COLS9+30, // u Macron + COLS9+30, // U Ogonek + COLS9+30, // u Ogonek + COLS9+30, // U Ring + COLS9+30, // u Ring + COLS9+30, // U Tilde + COLS9+30, // u Tilde + COLS9+32, // W Circumflex + COLS9+32, // w Circumflex + COLS9+34, // Y Circumflex + COLS9+34, // y Circumflex + COLS9+35, // Z Acute + COLS9+35, // z Acute + COLS9+35, // Z Caron or Hachek + COLS9+35, // z Caron or Hachek + COLS9+35, // Z Dot Above + COLS9+35, // z Dot Above + COLS9+19, // Uppercase Eng + COLS9+19, // Lowercase Eng + +// Other + + COLS9+6, // D Macron + COLS9+6, // d Macron + COLS9+16, // L Macron + COLS9+16, // l Macron + COLS9+19, // N Macron + COLS9+19, // n Macron + COLS9+25, // R Grave + COLS9+25, // r Grave + COLS9+27, // S Macron + COLS9+27, // s Macron + COLS9+29, // T Macron + COLS9+29, // t Macron + COLS9+34, // Y Breve + COLS9+34, // y Breve + COLS9+34, // Y Grave + COLS9+34, // y Grave + COLS9+6, // D Apostrophe Beside + COLS9+6, // d Apostrophe Beside + COLS9+21, // O Apostrophe Beside + COLS9+21, // o Apostrophe Beside + COLS9+30, // U Apostrophe 
Beside + COLS9+30, // u Apostrophe Beside + COLS9+7, // E breve + COLS9+7, // e breve + COLS9+12, // I breve + COLS9+12, // i breve + COLS9+12, // dotless I + COLS9+12, // dotless i + COLS9+21, // O breve + COLS9+21 // o breve +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE fwp_sym60Tbl[ SYMTBLLEN + 2] = +{ + 11, // initial character offset!! + SYMTBLLEN, // len of this table + COLS3+2, // pound + COLS3+3, // yen + COLS3+4, // pacetes + COLS3+5, // floren + COLS0, + COLS0, + COLS0, + COLS0, + COLS3+1, // cent +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +FLMBYTE fwp_grk60Tbl[ GRKTBLLEN + 2] = +{ + 0, // starting offset + GRKTBLLEN, // length + COLS7, // Uppercase Alpha + COLS7, // Lowercase Alpha + COLS7+1, // Uppercase Beta + COLS7+1, // Lowercase Beta + COLS7+1, // Uppercase Beta Medial + COLS7+1, // Lowercase Beta Medial + COLS7+2, // Uppercase Gamma + COLS7+2, // Lowercase Gamma + COLS7+3, // Uppercase Delta + COLS7+3, // Lowercase Delta + COLS7+4, // Uppercase Epsilon + COLS7+4, // Lowercase Epsilon + COLS7+5, // Uppercase Zeta + COLS7+5, // Lowercase Zeta + COLS7+6, // Uppercase Eta + COLS7+6, // Lowercase Eta + COLS7+7, // Uppercase Theta + COLS7+7, // Lowercase Theta + COLS7+8, // Uppercase Iota + COLS7+8, // Lowercase Iota + COLS7+9, // Uppercase Kappa + COLS7+9, // Lowercase Kappa + COLS7+10, // Uppercase Lambda + COLS7+10, // Lowercase Lambda + COLS7+11, // Uppercase Mu + COLS7+11, // Lowercase Mu + COLS7+12, // Uppercase Nu + COLS7+12, // Lowercase Nu + COLS7+13, // Uppercase Xi + COLS7+13, // Lowercase Xi + COLS7+14, // Uppercase Omicron + COLS7+14, // Lowercase Omicron + COLS7+15, // Uppercase Pi + COLS7+15, // Lowercase Pi + COLS7+16, // Uppercase Rho + COLS7+16, // Lowercase Rho + COLS7+17, // 
Uppercase Sigma + COLS7+17, // Lowercase Sigma + COLS7+17, // Uppercase Sigma Terminal + COLS7+17, // Lowercase Sigma Terminal + COLS7+18, // Uppercase Tau + COLS7+18, // Lowercase Tau + COLS7+19, // Uppercase Upsilon + COLS7+19, // Lowercase Upsilon + COLS7+20, // Uppercase Phi + COLS7+20, // Lowercase Phi + COLS7+21, // Uppercase Chi + COLS7+21, // Lowercase Chi + COLS7+22, // Uppercase Psi + COLS7+22, // Lowercase Psi + COLS7+23, // Uppercase Omega + COLS7+23, // Lowercase Omega + +// Other Modern Greek Characters [8,52] + + COLS7, // Uppercase ALPHA Tonos high prime + COLS7, // Lowercase Alpha Tonos - acute + COLS7+4, // Uppercase EPSILON Tonos - high prime + COLS7+4, // Lowercase Epslion Tonos - acute + COLS7+6, // Uppercase ETA Tonos - high prime + COLS7+6, // Lowercase Eta Tonos - acute + COLS7+8, // Uppercase IOTA Tonos - high prime + COLS7+8, // Lowercase iota Tonos - acute + COLS7+8, // Uppercase IOTA Diaeresis + COLS7+8, // Lowercase iota diaeresis + COLS7+14, // Uppercase OMICRON Tonos - high prime + COLS7+14, // Lowercase Omicron Tonos - acute + COLS7+19, // Uppercase UPSILON Tonos - high prime + COLS7+19, // Lowercase Upsilon Tonos - acute + COLS7+19, // Uppercase UPSILON Diaeresis + COLS7+19, // Lowercase Upsilon diaeresis + COLS7+23, // Uppercase OMEGA Tonos - high prime + COLS7+23, // Lowercase Omega Tonso - acute + +// Variants [8,70] + + COLS7+4, // epsilon (variant) + COLS7+7, // theta (variant) + COLS7+9, // kappa (variant) + COLS7+15, // pi (variant) + COLS7+16, // rho (variant) + COLS7+17, // sigma (variant) + COLS7+19, // upsilon (variant) + COLS7+20, // phi (variant) + COLS7+23, // omega (variant) + +// Greek Diacritic marks [8,79] + + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, + COLS0, // 8,108 end of diacritic marks + +// 
Ancient Greek [8,109] + + COLS7, // alpha grave + COLS7, // alpha circumflex + COLS7, // alpha w/iota + COLS7, // alpha acute w/iota + COLS7, // alpha grave w/iota + COLS7, // alpha circumflex w/Iota + COLS7, // alpha smooth + COLS7, // alpha smooth acute + COLS7, // alpha smooth grave + COLS7, // alpha smooth circumflex + COLS7, // alpha smooth w/Iota + COLS7, // alpha smooth acute w/Iota + COLS7, // alpha smooth grave w/Iota + COLS7, // alpha smooth circumflex w/Iota +// [8,123] + COLS7, // alpha rough + COLS7, // alpha rough acute + COLS7, // alpha rough grave + COLS7, // alpha rough circumflex + COLS7, // alpha rough w/Iota + COLS7, // alpha rough acute w/Iota + COLS7, // alpha rough grave w/Iota + COLS7, // alpha rough circumflex w/Iota +// [8,131] + COLS7+4, // epsilon grave + COLS7+4, // epsilon smooth + COLS7+4, // epsilon smooth acute + COLS7+4, // epsilon smooth grave + COLS7+4, // epsilon rough + COLS7+4, // epsilon rough acute + COLS7+4, // epsilon rough grave +// [8,138] + COLS7+6, // eta grave + COLS7+6, // eta circumflex + COLS7+6, // eta w/iota + COLS7+6, // eta acute w/iota + COLS7+6, // eta grave w/Iota + COLS7+6, // eta circumflex w/Iota + COLS7+6, // eta smooth + COLS7+6, // eta smooth acute + COLS7+6, // eta smooth grave + COLS7+6, // eta smooth circumflex + COLS7+6, // eta smooth w/Iota + COLS7+6, // eta smooth acute w/Iota + COLS7+6, // eta smooth grave w/Iota + COLS7+6, // eta smooth circumflex w/Iota + COLS7+6, // eta rough + COLS7+6, // eta rough acute + COLS7+6, // eta rough grave + COLS7+6, // eta rough circumflex + COLS7+6, // eta rough w/Iota + COLS7+6, // eta rough acute w/Iota + COLS7+6, // eta rough grave w/Iota + COLS7+6, // eta rough circumflex w/Iota +// [8,160] + COLS7+8, // iota grave + COLS7+8, // iota circumflex + COLS7+8, // iota acute diaeresis + COLS7+8, // iota grave diaeresis + COLS7+8, // iota smooth + COLS7+8, // iota smooth acute + COLS7+8, // iota smooth grave + COLS7+8, // iota smooth circumflex + COLS7+8, // iota 
rough + COLS7+8, // iota rough acute + COLS7+8, // iota rough grave + COLS7+8, // iota rough circumflex +// [8,172] + COLS7+14, // omicron grave + COLS7+14, // omicron smooth + COLS7+14, // omicron smooth acute + COLS7+14, // omicron smooth grave + COLS7+14, // omicron rough + COLS7+14, // omicron rough acute + COLS7+14, // omicron rough grave +// [8,179] + COLS7+16, // rho smooth + COLS7+16, // rho rough +// [8,181] + COLS7+19, // upsilon grave + COLS7+19, // upsilon circumflex + COLS7+19, // upsilon acute diaeresis + COLS7+19, // upsilon grave diaeresis + COLS7+19, // upsilon smooth + COLS7+19, // upsilon smooth acute + COLS7+19, // upsilon smooth grave + COLS7+19, // upsilon smooth circumflex + COLS7+19, // upsilon rough + COLS7+19, // upsilon rough acute + COLS7+19, // upsilon rough grave + COLS7+19, // upsilon rough circumflex +// [8,193] + COLS7+23, // omega grave + COLS7+23, // omega circumflex + COLS7+23, // omega w/Iota + COLS7+23, // omega acute w/Iota + COLS7+23, // omega grave w/Iota + COLS7+23, // omega circumflex w/Iota + COLS7+23, // omega smooth + COLS7+23, // omega smooth acute + COLS7+23, // omega smooth grave + COLS7+23, // omega smooth circumflex + COLS7+23, // omega smooth w/Iota + COLS7+23, // omega smooth acute w/Iota + COLS7+23, // omega smooth grave w/Iota + COLS7+23, // omega smooth circumflex w/Iota + COLS7+23, // omega rough + COLS7+23, // omega rough acute + COLS7+23, // omega rough grave + COLS7+23, // omega rough circumflex + COLS7+23, // omega rough w/Iota + COLS7+23, // omega rough acute w/Iota + COLS7+23, // omega rough grave w/Iota + COLS7+23, // omega rough circumflex w/Iota +// [8,215] + COLS7+24, // Uppercase Stigma--the number 6 + COLS7+24, // Uppercase Digamma--Obsolete letter used as 6 + COLS7+24, // Uppercase Koppa--Obsolete letter used as 90 + COLS7+24 // Uppercase Sampi--Obsolete letter used as 900 +}; + +/**************************************************************************** +Desc: 
+****************************************************************************/ +FLMBYTE fwp_cyrl60Tbl[ CYRLTBLLEN + 2] = +{ + 0, // starting offset + CYRLTBLLEN, // len of table + + COLS10, // Russian uppercase A + COLS10, // Russian lowercase A + COLS10+1, // Russian uppercase BE + COLS10+1, // Russian lowercase BE + COLS10+2, // Russian uppercase VE + COLS10+2, // Russian lowercase VE + COLS10+3, // Russian uppercase GHE + COLS10+3, // Russian lowercase GHE + COLS10+5, // Russian uppercase DE + COLS10+5, // Russian lowercase DE + + COLS10+8, // Russian uppercase E + COLS10+8, // Russian lowercase E + COLS10+9, // Russian uppercase YO + COLS10+9, // Russian lowercase YO + COLS10+11, // Russian uppercase ZHE + COLS10+11, // Russian lowercase ZHE + COLS10+12, // Russian uppercase ZE + COLS10+12, // Russian lowercase ZE + COLS10+14, // Russian uppercase I + COLS10+14, // Russian lowercase I + + COLS10+17, // Russian uppercase SHORT I + COLS10+17, // Russian lowercase SHORT I + COLS10+19, // Russian uppercase KA + COLS10+19, // Russian lowercase KA + COLS10+20, // Russian uppercase EL + COLS10+20, // Russian lowercase EL + COLS10+22, // Russian uppercase EM + COLS10+22, // Russian lowercase EM + COLS10+23, // Russian uppercase EN + COLS10+23, // Russian lowercase EN + + COLS10+25, // Russian uppercase O + COLS10+25, // Russian lowercase O + COLS10+26, // Russian uppercase PE + COLS10+26, // Russian lowercase PE + COLS10+27, // Russian uppercase ER + COLS10+27, // Russian lowercase ER + COLS10+28, // Russian uppercase ES + COLS10+28, // Russian lowercase ES + COLS10+29, // Russian uppercase TE + COLS10+29, // Russian lowercase TE + + COLS10+32, // Russian uppercase U + COLS10+32, // Russian lowercase U + COLS10+34, // Russian uppercase EF + COLS10+34, // Russian lowercase EF + COLS10+35, // Russian uppercase HA + COLS10+35, // Russian lowercase HA + COLS10+36, // Russian uppercase TSE + COLS10+36, // Russian lowercase TSE + COLS10+37, // Russian uppercase CHE +
COLS10+37, // Russian lowercase CHE + + COLS10+39, // Russian uppercase SHA + COLS10+39, // Russian lowercase SHA + COLS10+40, // Russian uppercase SHCHA + COLS10+40, // Russian lowercase SHCHA + COLS10+41, // Russian uppercase ER (also hard sign) + COLS10+41, // Russian lowercase ER (also hard sign) + COLS10+42, // Russian uppercase ERY + COLS10+42, // Russian lowercase ERY + COLS10+43, // Russian uppercase SOFT SIGN + COLS10+43, // Russian lowercase SOFT SIGN + + COLS10+45, // Russian uppercase REVERSE E + COLS10+45, // Russian lowercase REVERSE E + COLS10+46, // Russian uppercase YU + COLS10+46, // Russian lowercase yu + COLS10+47, // Russian uppercase YA + COLS10+47, // Russian lowercase ya + + COLS0, // Russian uppercase EH + COLS0, // Russian lowercase eh + COLS10+7, // Macedonian uppercase SOFT DJ + COLS10+7, // Macedonian lowercase soft dj + + COLS10+4, // Ukrainian uppercase HARD G + COLS10+4, // Ukrainian lowercase hard g + COLS0, // GE bar + COLS0, // ge bar + COLS10+6, // Serbian uppercase SOFT DJ + COLS10+6, // Serbian lowercase SOFT DJ + COLS0, // IE (variant) + COLS0, // ie (variant) + COLS10+10, // Ukrainian uppercase YE + COLS10+10, // Ukrainian lowercase YE + + COLS0, // ZHE with right descender + COLS0, // zhe with right descender + COLS10+13, // Macedonian uppercase ZELO + COLS10+13, // Macedonian lowercase ZELO + COLS0, // Old Slavonic uppercase Z + COLS0, // Old Slavonic lowercase z + COLS0, // II with macron + COLS0, // ii with macron + COLS10+15, // Ukrainian uppercase I + COLS10+15, // Ukrainian lowercase I + + COLS10+16, // Ukrainian uppercase I with Two Dots + COLS10+16, // Ukrainian lowercase I with Two Dots + COLS0, // Old Slavonic uppercase I ligature + COLS0, // Old Slavonic lowercase I ligature + COLS10+18, // Serbian--Macedonian uppercase JE + COLS10+18, // Serbian--Macedonian lowercase JE + COLS10+31, // Macedonian uppercase SOFT K + COLS10+31, // Macedonian lowercase SOFT K + COLS0, // KA with right descender + COLS0, // ka with
right descender + + COLS0, // KA ogonek + COLS0, // ka ogonek + COLS0, // KA vertical bar + COLS0, // ka vertical bar + COLS10+21, // Serbian--Macedonian uppercase SOFT L + COLS10+21, // Serbian--Macedonian lowercase SOFT L + COLS0, // EN with right descender + COLS0, // en with right descender + COLS10+24, // Serbian--Macedonian uppercase SOFT N + COLS10+24, // Serbian--Macedonian lowercase SOFT N + + COLS0, // ROUND OMEGA + COLS0, // round omega + COLS0, // OMEGA + COLS0, // omega + COLS10+30, // Serbian uppercase SOFT T + COLS10+30, // Serbian lowercase SOFT T + COLS10+33, // Byelorussian uppercase SHORT U + COLS10+33, // Byelorussian lowercase SHORT U + COLS0, // U with macron + COLS0, // u with macron + + COLS0, // STRAIGHT U + COLS0, // straight u + COLS0, // STRAIGHT U bar + COLS0, // straight u bar + COLS0, // OU ligature + COLS0, // ou ligature + COLS0, // KHA with right descender + COLS0, // kha with right descender + COLS0, // KHA ogonek + COLS0, // kha ogonek + + COLS0, // H + COLS0, // h + COLS0, // OMEGA titlo + COLS0, // omega titlo + COLS10+38, // Serbian uppercase HARD DJ + COLS10+38, // Serbian lowercase HARD DJ + COLS0, // CHE with right descender + COLS0, // che with right descender + COLS0, // CHE vertical bar + COLS0, // che vertical bar + + COLS0, // Old Slavonic SHCHA (variant) + COLS0, // old SLAVONIC shcha (variant) + COLS10+44, // Old Russian uppercase YAT + COLS10+44, // Old Russian lowercase YAT + +// END OF UNIQUE COLLATED BYTES +// CHARACTERS BELOW MUST HAVE THEIR OWN +// SUB-COLLATION VALUE TO COMPARE CORRECTLY.
+ + COLS0, // Old Bulgarian uppercase YUS + COLS0, // Old Bulgarian lowercase YUS + COLS0, // Old Slavonic uppercase YUS MALYI + COLS0, // Old Slavonic lowercase YUS MALYI + COLS0, // KSI + COLS0, // ksi + + COLS0, // PSI + COLS0, // psi + COLS0, // Old Russian uppercase FITA + COLS0, // Old Russian lowercase FITA + COLS0, // Old Russian uppercase IZHITSA + COLS0, // Old Russian lowercase IZHITSA + COLS0, // Russian uppercase A acute + COLS0, // Russian lowercase A acute + COLS10+8, // Russian uppercase E acute + COLS10+8, // Russian lowercase E acute + +// 160-below all characters are russian to 199 + + COLS0, // E acute + COLS0, // e acute + COLS10+14, // II acute + COLS10+14, // ii acute + COLS0, // I acute + COLS0, // i acute + COLS0, // YI acute + COLS0, // yi acute + COLS10+25, // O acute + COLS10+25, // o acute + + COLS10+32, // U acute + COLS10+32, // u acute + COLS10+42, // YERI acute + COLS10+42, // yeri acute + COLS10+45, // REVERSED E acute + COLS10+45, // reversed e acute + COLS10+46, // YU acute + COLS10+46, // yu acute + COLS10+47, // YA acute + COLS10+47, // ya acute + + COLS10, // A grave + COLS10, // a grave + COLS10+8, // E grave + COLS10+8, // e grave + COLS10+9, // YO grave + COLS10+9, // yo grave + COLS10+14, // I grave + COLS10+14, // i grave + COLS10+25, // O grave + COLS10+25, // o grave + + COLS10+32, // U grave + COLS10+32, // u grave + COLS10+42, // YERI grave + COLS10+42, // yeri grave + COLS10+45, // REVERSED E grave + COLS10+45, // reversed e grave + COLS10+46, // IU (YU) grave + COLS10+46, // iu (yu) grave + COLS10+47, // IA (YA) grave + COLS10+47, // ia (ya) grave ******* [10,199] +}; + +/**************************************************************************** +Desc: The Hebrew characters are collated over the Russian characters + Therefore sorting both Hebrew and Russian is impossible to do.
+****************************************************************************/ +FLMBYTE fwp_heb60TblA[ HEBTBL1LEN + 2] = +{ + 0, // starting offset + HEBTBL1LEN, // len of table + COLS10h+0, // Alef + COLS10h+1, // Bet + COLS10h+2, // Gimel + COLS10h+3, // Dalet + COLS10h+4, // He + COLS10h+5, // Vav + COLS10h+6, // Zayin + COLS10h+7, // Het + COLS10h+8, // Tet + COLS10h+9, // Yod + COLS10h+10, // Kaf (final) [9,10] + COLS10h+11, // Kaf + COLS10h+12, // Lamed + COLS10h+13, // Mem (final) + COLS10h+14, // Mem + COLS10h+15, // Nun (final) + COLS10h+16, // Nun + COLS10h+17, // Samekh + COLS10h+18, // Ayin + COLS10h+19, // Pe (final) + COLS10h+20, // Pe [9,20] + COLS10h+21, // Tsadi (final) + COLS10h+22, // Tsadi + COLS10h+23, // Qof + COLS10h+24, // Resh + COLS10h+25, // Shin + COLS10h+26 // Tav [9,26] +}; + +/**************************************************************************** +Desc: This is the ANCIENT HEBREW SCRIPT piece. + The actual value will be stored in the subcollation. + This way we don't play diacritic/subcollation games.
+****************************************************************************/ +FLMBYTE fwp_heb60TblB[ HEBTBL2LEN + 2] = +{ + 84, + HEBTBL2LEN, + +// [9,84] + COLS10h+0, // Alef Dagesh [9,84] + COLS10h+1, // Bet Dagesh + COLS10h+1, // Vez - looks like a bet + COLS10h+2, // Gimel Dagesh + COLS10h+3, // Dalet Dagesh + COLS10h+4, // He Dagesh + COLS10h+5, // Vav Dagesh [9,90] + COLS10h+5, // Vav Holem + COLS10h+6, // Zayin Dagesh + COLS10h+7, // Het Dagesh + COLS10h+8, // Tet Dagesh + COLS10h+9, // Yod Dagesh + COLS10h+9, // Yod Hiriq [9,96] - not on my list + + COLS10h+11, // Kaf Dagesh + COLS10h+10, // Kaf Dagesh (final) + COLS10h+10, // Kaf Sheva (final) + COLS10h+10, // Kaf Tsere (final) [9,100] + COLS10h+10, // Kaf Segol (final) + COLS10h+10, // Kaf Patah (final) + COLS10h+10, // Kaf Qamats (final) + COLS10h+10, // Kaf Dagesh Qamats (final) + COLS10h+12, // Lamed Dagesh + COLS10h+14, // Mem Dagesh + COLS10h+16, // Nun Dagesh + COLS10h+15, // Nun Qamats (final) + COLS10h+17, // Samekh Dagesh + COLS10h+20, // Pe Dagesh [9,110] + COLS10h+20, // Fe - just guessing this is like Pe - was +21 + COLS10h+22, // Tsadi Dagesh + COLS10h+23, // Qof Dagesh + COLS10h+25, // Sin (with sin dot) + COLS10h+25, // Sin Dagesh (with sin dot) + COLS10h+25, // Shin + COLS10h+25, // Shin Dagesh + COLS10h+26 // Tav Dagesh [9,118] +}; + +/**************************************************************************** +Desc: The Arabic characters are collated OVER the Russian characters + Therefore sorting both Arabic and Russian in the same database + is not supported. + + Arabic starts with a bunch of accents/diacritic marks that are + Actually placed OVER a preceding character. These accents are + ignored while sorting the first pass - when collation == COLS0. + + There are 4 possible states for all/most arabic characters: + ?? - occurs as the only character in a word + ?? - appears at the first of the word + ?? - appears at the middle of a word + ??
- appears at the end of the word + + Usually only the simple version of the letter is stored. + Therefore we should not have to worry about sub-collation + of these characters. + + The arabic characters with diacritics differ however. The alef has + sub-collation values to sort correctly. There is not any more room + to add more collation values. Some chars in CS14 are combined when + urdu, pashto and sindhi characters overlap. +****************************************************************************/ +FLMBYTE fwp_ar160Tbl[ AR1TBLLEN + 2] = +{ + 38, // starting offset + AR1TBLLEN, // len of table +// [13,38] + COLLS+2, // , comma + COLLS+3, // : colon +// [13,40] + COLLS+7, // ? question mark + COLS4+2, // * asterisk + COLS6, // % percent + COLS9+41, // >> alphabetic - end of list) + COLS9+40, // << alphabetic - end of list) + COLS2, // ( + COLS2+1, // ) +// [13,47] + COLS8+1, // ?? One + COLS8+2, // ?? Two + COLS8+3, // ?? Three +// [13,50] + COLS8+4, // ?? Four + COLS8+5, // ?? Five + COLS8+6, // ?? Six + COLS8+7, // ?? Seven + COLS8+8, // ?? Eight + COLS8+9, // ?? Nine + COLS8+0, // ?? Zero + COLS8+2, // ?? Two (Handwritten) + + COLS10a+1, // ?? alif + COLS10a+1, // ?? alif +// [13,60] + COLS10a+2, // ?? ba + COLS10a+2, // ?? ba + COLS10a+2, // ?? ba + COLS10a+2, // ?? ba + COLS10a+6, // ?? ta + COLS10a+6, // ?? ta + COLS10a+6, // ?? ta + COLS10a+6, // ?? ta + COLS10a+8, // ?? tha + COLS10a+8, // ?? tha +// [13,70] + COLS10a+8, // ?? tha + COLS10a+8, // ?? tha + COLS10a+12, // ?? jiim + COLS10a+12, // ?? jiim + COLS10a+12, // ?? jiim + COLS10a+12, // ?? jiim + COLS10a+16, // ?? Ha + COLS10a+16, // ?? Ha + COLS10a+16, // ?? Ha + COLS10a+16, // ?? Ha +// [13,80] + COLS10a+17, // ?? kha + COLS10a+17, // ?? kha + COLS10a+17, // ?? kha + COLS10a+17, // ?? kha + COLS10a+20, // ?? dal + COLS10a+20, // ?? dal + COLS10a+22, // ?? dhal + COLS10a+22, // ?? dhal + COLS10a+27, // ?? ra + COLS10a+27, // ?? ra +// [13,90] + COLS10a+29, // ?? ziin + COLS10a+29, // ??
ziin + COLS10a+31, // ?? siin + COLS10a+31, // ?? siin + COLS10a+31, // ?? siin + COLS10a+31, // ?? siin + COLS10a+32, // ?? shiin + COLS10a+32, // ?? shiin + COLS10a+32, // ?? shiin + COLS10a+32, // ?? shiin +// [13,100] + COLS10a+34, // ?? Sad + COLS10a+34, // ?? Sad + COLS10a+34, // ?? Sad + COLS10a+34, // ?? Sad + COLS10a+35, // ?? Dad + COLS10a+35, // ?? Dad + COLS10a+35, // ?? Dad + COLS10a+35, // ?? Dad + COLS10a+36, // ?? Ta + COLS10a+36, // ?? Ta +// [13,110] + COLS10a+36, // ?? Ta + COLS10a+36, // ?? Ta + COLS10a+37, // ?? Za + COLS10a+37, // ?? Za + COLS10a+37, // ?? Za + COLS10a+37, // ?? Za + COLS10a+38, // ?? 'ain + COLS10a+38, // ?? 'ain + COLS10a+38, // ?? 'ain + COLS10a+38, // ?? 'ain +// [13,120] + COLS10a+39, // ?? ghain + COLS10a+39, // ?? ghain + COLS10a+39, // ?? ghain + COLS10a+39, // ?? ghain + COLS10a+40, // ?? fa + COLS10a+40, // ?? fa + COLS10a+40, // ?? fa + COLS10a+40, // ?? fa + COLS10a+42, // ?? Qaf + COLS10a+42, // ?? Qaf +// [13,130] + COLS10a+42, // ?? Qaf + COLS10a+42, // ?? Qaf + COLS10a+43, // ?? kaf + COLS10a+43, // ?? kaf + COLS10a+43, // ?? kaf + COLS10a+43, // ?? kaf + COLS10a+46, // ?? lam + COLS10a+46, // ?? lam + COLS10a+46, // ?? lam + COLS10a+46, // ?? lam +// [13,140] + COLS10a+47, // ?? miim + COLS10a+47, // ?? miim + COLS10a+47, // ?? miim + COLS10a+47, // ?? miim + COLS10a+48, // ?? nuun + COLS10a+48, // ?? nuun + COLS10a+48, // ?? nuun + COLS10a+48, // ?? nuun + COLS10a+49, // ?? ha + COLS10a+49, // ?? ha +// [13,150] + COLS10a+49, // ?? ha + COLS10a+49, // ?? ha + // ha is also 51 for non-arabic + COLS10a+6, // ?? ta marbuuTah + COLS10a+6, // ?? ta marbuuTah + COLS10a+50, // ?? waw + COLS10a+50, // ?? waw + COLS10a+53, // ?? ya + COLS10a+53, // ?? ya + COLS10a+53, // ?? ya + COLS10a+53, // ?? ya +// [13,160] + COLS10a+52, // ?? alif maqSuurah + COLS10a+52, // ?? ya maqSuurah? + COLS10a+52, // ?? ya maqSuurah? + COLS10a+52, // ?? alif maqSuurah + + COLS10a+0, // ??
hamzah accent - never appears alone +// [13,165] + +// Store the sub-collation as the actual +// character value from this point on + + COLS10a+1, // ?? alif hamzah + COLS10a+1, // ?? alif hamzah + COLS10a+1, // ?? hamzah-under-alif + COLS10a+1, // ?? hamzah-under-alif + COLS10a+1, // ?? waw hamzah +// [13,170] + COLS10a+1, // ?? waw hamzah + COLS10a+1, // ?? ya hamzah + COLS10a+1, // ?? ya hamzah + COLS10a+1, // ?? ya hamzah + COLS10a+1, // ?? ya hamzah + COLS10a+1, // ?? alif fatHataan + COLS10a+1, // ?? alif fatHataan + COLS10a+1, // ?? alif maddah + COLS10a+1, // ?? alif maddah + COLS10a+1, // ?? alif waSlah +// [13,180] + COLS10a+1, // ?? alif waSlah (final) + +// LIGATURES +// Should NEVER be stored so will not worry +// about breaking up into pieces for collation. +// NOTE: +// Let's store the "Lam" collation value (+46) +// below and in the sub-collation store the +// actual character. This will sort real close. +// The best implementation is to +// break up ligatures into its base pieces. + + COLS10a+46, // ?? lamalif + COLS10a+46, // ?? lamalif + COLS10a+46, // ?? lamalif hamzah + COLS10a+46, // ?? lamalif hamzah + COLS10a+46, // ?? hamzah-under-lamalif + COLS10a+46, // ?? hamzah-under-lamalif + COLS10a+46, // ?? lamalif fatHataan + COLS10a+46, // ?? lamalif fatHataan + COLS10a+46, // ?? lamalif maddah +// [13,190] + COLS10a+46, // ?? lamalif maddah + COLS10a+46, // ?? lamalif waSlah + COLS10a+46, // ?? lamalif waSlah + COLS10a+46, // ?? Allah - khaDalAlif + COLS0_ARABIC, // ?? taTwiil - character extension - throw out + COLS0_ARABIC // ?? taTwiil 1/6 - character extension - throw out +}; + +/**************************************************************************** +Desc: Alef needs a subcollation table. + If colval==COLS10a+1 & char>=165 + index through this table. Otherwise + the alef value is [13,58] and subcol + value should be 7.
Alef maddah is default (0) + + Handcheck if colval==COLS10a+6 + Should sort: + [13,152]..[13,153] - taa marbuuTah - nosubcoll + [13,64] ..[13,67] - taa - subcoll of 1 +****************************************************************************/ +FLMBYTE fwp_alefSubColTbl[] = +{ +// [13,165] + 1, // ?? alif hamzah + 1, // ?? alif hamzah + 3, // ?? hamzah-under-alif + 3, // ?? hamzah-under-alif + 2, // ?? waw hamzah +// [13,170] + 2, // ?? waw hamzah + 4, // ?? ya hamzah + 4, // ?? ya hamzah + 4, // ?? ya hamzah + 4, // ?? ya hamzah + 5, // ?? alif fatHataan + 5, // ?? alif fatHataan + 0, // ?? alif maddah + 0, // ?? alif maddah + 6, // ?? alif waSlah +// [13,180] + 6 // ?? alif waSlah (final) +}; + +/**************************************************************************** +Desc: Collation values for character set 14 (Arabic script), from [14,41] on. +****************************************************************************/ +FLMBYTE fwp_ar260Tbl[ AR2TBLLEN + 2] = +{ + 41, // starting offset + AR2TBLLEN, // len of table +// [14,41] + COLS8+4, // Farsi and Urdu Four + COLS8+4, // Urdu Four + COLS8+5, // Farsi and Urdu Five + COLS8+6, // Farsi Six + COLS8+6, // Farsi and Urdu Six + COLS8+7, // Urdu Seven + COLS8+8, // Urdu Eight + + COLS10a+3, // Sindhi bb - baa /w 2 dots below (67b) + COLS10a+3, + COLS10a+3, + COLS10a+3, + COLS10a+4, // Sindhi bh - baa /w 4 dots below (680) + COLS10a+4, + COLS10a+4, + COLS10a+4, +// [14,56] + COLS10a+5, // Malay, Kurdish, Pashto, Farsi, Sindhi, and Urdu p + COLS10a+5, // =peh - taa /w 3 dots below (67e) + COLS10a+5, + COLS10a+5, + COLS10a+7, // Urdu T - taa /w small tah + COLS10a+7, + COLS10a+7, + COLS10a+7, + COLS10a+7, // Pashto T - taa /w ring (forced to combine) + COLS10a+7, + COLS10a+7, + COLS10a+7, + COLS10a+9, // Sindhi th - taa /w 4 dots above (67f) + COLS10a+9, +// [14,70] + COLS10a+9, + COLS10a+9, + COLS10a+10, // Sindhi Tr - taa /w 3 dots above (67d) + COLS10a+10, + COLS10a+10, + COLS10a+10, + COLS10a+11, // Sindhi Th - taa /w 2 dots above (67a) + COLS10a+11, + COLS10a+11, + COLS10a+11,
+ COLS10a+13, // Sindhi jj - haa /w 2 middle dots vertical (684) + COLS10a+13, + COLS10a+13, + COLS10a+13, + COLS10a+14, // Sindhi ny - haa /w 2 middle dots (683) + COLS10a+14, + COLS10a+14, + COLS10a+14, +// [14,88] + COLS10a+15, // Malay, Kurdish, Pashto, Farsi, Sindhi, and Urdu ch + COLS10a+15, // =tcheh (686) + COLS10a+15, + COLS10a+15, + COLS10a+15, // Sindhi chh - haa /w middle 4 dots (687) + COLS10a+15, // forced to combine + COLS10a+15, + COLS10a+15, + COLS10a+18, // Pashto ts - haa /w 3 dots above (685) + COLS10a+18, + COLS10a+18, + COLS10a+18, + COLS10a+19, // Pashto dz - hamzah on haa (681) + COLS10a+19, + COLS10a+19, + COLS10a+19, +// [14,104] + COLS10a+21, // Urdu D - dal /w small tah (688) + COLS10a+21, + COLS10a+21, // Pashto D - dal /w ring (689) forced to combine + COLS10a+21, + COLS10a+23, // Sindhi dh - dal /w 2 dots above (68c) + COLS10a+23, + COLS10a+24, // Sindhi D - dal /w 3 dots above (68e) + COLS10a+24, + COLS10a+25, // Sindhi Dr - dal /w dot below (68a) + COLS10a+25, + COLS10a+26, // Sindhi Dh - dal /w 2 dots below (68d) + COLS10a+26, + COLS10a+28, // Pashto r - ra /w ring (693) + COLS10a+28, +// [14,118] + COLS10a+28, // Urdu R - ra /w small tah (691) forced to combine + COLS10a+28, + COLS10a+28, // Sindhi r - ra /w 4 dots above (699) forced to combine + COLS10a+28, + COLS10a+27, // Kurdish rolled r - ra /w 'v' below (695) + COLS10a+27, + COLS10a+27, + COLS10a+27, +// [14,126] + COLS10a+30, // Kurdish, Pashto, Farsi, Sindhi, and Urdu Z + COLS10a+30, // = jeh - ra /w 3 dots above (698) + COLS10a+30, // Pashto zz - ra /w dot below & dot above (696) + COLS10a+30, // forced to combine + COLS10a+30, // Pashto g - not in unicode! - forced to combine + COLS10a+30, + COLS10a+33, // Pashto x - seen dot below & above (69a) + COLS10a+33, + COLS10a+33, + COLS10a+33, + COLS10a+39, // Malay ng - old Malay ain /w 3 dots above (6a0) + COLS10a+39, // forced to combine + COLS10a+39, + COLS10a+39, +// [14,140] + COLS10a+41, // Malay p, Kurdish v - Farsi ?
- fa /w 3 dots above + COLS10a+41, // = veh - means foreign words (6a4) + COLS10a+41, + COLS10a+41, + COLS10a+41, // Sindhi ph - fa /w 4 dots above (6a6) forced to combine + COLS10a+41, + COLS10a+41, + COLS10a+41, +// [14,148] + COLS10a+43, // Misc k - open caf (6a9) + COLS10a+43, + COLS10a+43, + COLS10a+43, + COLS10a+43, // misc k - no unicode - forced to combine + COLS10a+43, + COLS10a+43, + COLS10a+43, + COLS10a+43, // Sindhi k - swash caf (various) (6aa) -forced to combine + COLS10a+43, + COLS10a+43, + COLS10a+43, +// [14,160] + COLS10a+44, // Persian/Urdu g - gaf (6af) + COLS10a+44, + COLS10a+44, + COLS10a+44, + COLS10a+44, // Persian/Urdu g - no unicode + COLS10a+44, + COLS10a+44, + COLS10a+44, + COLS10a+44, // malay g - gaf /w ring (6b0) + COLS10a+44, + COLS10a+44, + COLS10a+44, + COLS10a+44, // Sindhi ng - gaf /w 2 dots above (6b1) + COLS10a+44, // forced to combine ng only + COLS10a+44, + COLS10a+44, + COLS10a+45, // Sindhi gg - gaf /w 2 dots vertical below (6b3) + COLS10a+45, + COLS10a+45, + COLS10a+45, +// [14,180] + COLS10a+46, // Kurdish velar l - lam /w small v (6b5) + COLS10a+46, + COLS10a+46, + COLS10a+46, + COLS10a+46, // Kurdish Lamalif with diacritic - no unicode + COLS10a+46, +// [14,186] + COLS10a+48, // Urdu n - dotless noon (6ba) + COLS10a+48, + COLS10a+48, + COLS10a+48, + COLS10a+48, // Pashto N - noon /w ring (6bc) - forced to combine + COLS10a+48, + COLS10a+48, + COLS10a+48, + COLS10a+48, // Sindhi N - dotless noon/w small tah (6bb) + COLS10a+48, // forced to combine + COLS10a+48, + COLS10a+48, + COLS10a+50, // Kurdish o - waw /w small v (6c6) + COLS10a+50, +// [14,200] + COLS10a+50, // Kurdish o - waw /w bar above (6c5) + COLS10a+50, + COLS10a+50, // Kurdish o - waw /w 2 dots above (6ca) + COLS10a+50, +// [14,204] + COLS10a+51, // Urdu h - no unicode + COLS10a+51, + COLS10a+51, + COLS10a+51, + COLS10a+52, // Kurdish ?
- ya /w small v (6ce) + COLS10a+52, + COLS10a+52, + COLS10a+52, +// [14,212] + COLS10a+54, // Urdu y - ya barree (6d2) + COLS10a+54, + COLS10a+54, // Malay ny - ya /w 3 dots below (6d1) forced to combine + COLS10a+54, + COLS10a+54, + COLS10a+54, +// [14,218] + COLS10a+51, // Farsi hamzah - hamzah on ha (6c0) forced to combine + COLS10a+51 +}; + +/**************************************************************************** +Desc: If the bit position is set then save the character in the sub-col + area. The bit values are determined by looking at the + FLAIM COLTBL1 to see which characters are combined with other + Arabic characters. +****************************************************************************/ +FLMBYTE fwp_ar2BitTbl[] = +{ + // Start at character 64 + // The only 'clean' areas uncollate to the correct place, they are... + // 48..63 + // 68..91 + // 96..117 + // 126..127 + // 140..143 + // 160..163 + // 176..179 + // 212..213 + + 0xF0, // 64..71 + 0x00, // 72..79 + 0x00, // 80..87 + 0x0F, // 88..95 - 92..95 + 0x00, // 96..103 + 0x00, // 104..111 + 0x03, // 112..119 + 0xFC, // 120..127 + 0xFF, // 128..135 + 0xF0, // 136..143 - 136..139 + 0xFF, // 144..151 - 144..147, 148..159 + 0xFF, // 152..159 + 0x0F, // 160..167 - 164..175 + 0xFF, // 168..175 + 0x0F, // 176..183 - 180..185 + 0xFF, // 184..191 - 186..197 + 0xFF, // 192..199 - 198..203 + 0xFF, // 200..207 - 204..207 + 0xF3, // 208..215 - 208..211 , 214..217 + 0xF0 // 216..219 - 218..219 +}; + +/**************************************************************************** +Desc: This table describes and gives addresses for collating 5.0 + character sets. Each line corresponds with a character set.
+***************************************************************************/ +TBL_B_TO_BP fwp_col60Tbl[] = +{ + {CHSASCI, fwp_asc60Tbl}, // ascii - " " - "~" + {CHSMUL1, fwp_mn60Tbl}, // multinational + {CHSSYM1, fwp_sym60Tbl}, // symbols + {CHSGREK, fwp_grk60Tbl}, // greek + {CHSCYR, fwp_cyrl60Tbl}, // Cyrillic - Russian + {0xFF, 0} // table terminator +}; + +/**************************************************************************** +Desc: This table is for sorting the hebrew/arabic languages. + These values overlap the end of ASC/european and cyrillic tables. +****************************************************************************/ +TBL_B_TO_BP fwp_HebArabicCol60Tbl[] = +{ + {CHSASCI, fwp_asc60Tbl}, // ascii - " " - "~" + {CHSMUL1, fwp_mn60Tbl}, // multinational + {CHSSYM1, fwp_sym60Tbl}, // symbols + {CHSGREK, fwp_grk60Tbl}, // greek + {CHSHEB, fwp_heb60TblA}, // Hebrew + {CHSHEB, fwp_heb60TblB}, // Hebrew + {CHSARB1, fwp_ar160Tbl}, // Arabic Set 1 + {CHSARB2, fwp_ar260Tbl}, // Arabic Set 2 + {0xff, 0} // table terminator +}; + +/**************************************************************************** +Desc: The diacritical to collated table translates the first 26 + characters of WP character set #1 into a 5 bit value for "correct" + sorting sequence for that diacritical (DCV) - diacritic collated + value. + + The attempt here is to convert the collated character value + along with the DCV to form the original WP character. + + The diacriticals are in an order to fit the most languages. + Czech, Swedish, and Finnish will have to manually reposition the + ring above (assign it a value greater than the umlaut) + + This table is indexed by the diacritical value.
+****************************************************************************/ +FLMBYTE fwp_dia60Tbl[] = +{ + 2, // grave offset = 0 + 16, // centerd offset = 1 + 7, // tilde offset = 2 + 4, // circum offset = 3 + 12, // crossb offset = 4 + 10, // slash offset = 5 + 1, // acute offset = 6 + 6, // umlaut offset = 7 + // In SU, SV and CZ will = 9 + 17, // macron offset = 8 + 18, // aposab offset = 9 + 19, // aposbes offset = 10 + 20, // aposba offset = 11 + 21, // aposbc offset = 12 + 22, // abosbl offset = 13 + 8, // ring offset = 14 + 13, // dota offset = 15 + 23, // dacute offset = 16 + 11, // cedilla offset = 17 + 14, // ogonek offset = 18 + 5, // caron offset = 19 + 15, // stroke offset = 20 + 24, // bara offset = 21 + 3, // breve offset = 22 + 0, // dbls offset = 23 sorts as 'ss' + 25, // dotlesi offset = 24 + 26 // dotlesj offset = 25 +}; + +/**************************************************************************** +Desc: This table defines the range of characters within the set + which are case convertible. +****************************************************************************/ +static FLMBYTE fwp_caseConvertableRange[] = +{ + 26,241, // Multinational 1 + 0,0, // Multinational 2 + 0,0, // Box Drawing + 0,0, // Symbol 1 + 0,0, // Symbol 2 + 0,0, // Math 1 + 0,0, // Math 2 + 0,69, // Greek 1 + 0,0, // Hebrew + 0,199, // Cyrillic + 0,0, // Japanese Kana + 0,0, // User-defined + 0,0, // Not defined + 0,0, // Not defined + 0,0, // Not defined +}; + +/**************************************************************************** +Desc: WP character for each collation value, indexed by (collation value - COLLS); 0 marks NO VALUE. +****************************************************************************/ +FLMUINT16 colToWPChr[ COLS11 - COLLS] = +{ + 0x20, // colls - + 0x2e, // colls+1 - . + 0x2c, // colls+2 - , + 0x3a, // colls+3 - : + 0x3b, // colls+4 - ; + 0x21, // colls+5 - ! + 0, // colls+6 - NO VALUE + 0x3f, // colls+7 - ?
+ 0, // colls+8 - NO VALUE + + 0x22, // cols1 - " + 0x27, // cols1+1 - ' + 0x60, // cols1+2 - ` + 0, // cols1+3 - NO VALUE + 0, // cols1+4 - NO VALUE + + 0x28, // cols2 - ( + 0x29, // cols2+1 - ) + 0x5b, // cols2+2 - japanese angle brackets + 0x5d, // cols2+3 - japanese angle brackets + 0x7b, // cols2+4 - { + 0x7d, // cols2+5 - } + + 0x24, // cols3 - $ + 0x413, // cols3+1 - cent + 0x40b, // cols3+2 - pound + 0x40c, // cols3+3 - yen + 0x40d, // cols3+4 - pesetas + 0x40e, // cols3+5 - florin + + 0x2b, // cols4 - + + 0x2d, // cols4+1 - - + 0x2a, // cols4+2 - * + 0x2f, // cols4+3 - / + 0x5e, // cols4+4 - ^ + 0, // cols4+5 - NO VALUE + 0, // cols4+6 - NO VALUE + 0, // cols4+7 - NO VALUE + + 0x3c, // cols5 - < + 0, // cols5+1 - NO VALUE + 0x3d, // cols5+2 - = + 0, // cols5+3 - NO VALUE + 0x3e, // cols5+4 - > + 0, // cols5+5 - NO VALUE + 0, // cols5+6 - NO VALUE + 0, // cols5+7 - NO VALUE + 0, // cols5+8 - NO VALUE + 0, // cols5+9 - NO VALUE + 0, // cols5+10 - NO VALUE + 0, // cols5+11 - NO VALUE + 0, // cols5+12 - NO VALUE + 0, // cols5+13 - NO VALUE + + 0x25, // cols6 - % + 0x23, // cols6+1 - # + 0x26, // cols6+2 - & + 0x40, // cols6+3 - @ + 0x5c, // cols6+4 - Backslash + 0x5f, // cols6+5 - _ + 0x7c, // cols6+6 - | + 0x7e, // cols6+7 - ~ + 0, // cols6+8 - NO VALUE + 0, // cols6+9 - NO VALUE + 0, // cols6+10 - NO VALUE + 0, // cols6+11 - NO VALUE + 0, // cols6+12 - NO VALUE + + 0x800, // cols7 - Uppercase Alpha + 0x802, // cols7+1 - Uppercase Beta + 0x806, // cols7+2 - Uppercase Gamma + 0x808, // cols7+3 - Uppercase Delta + 0x80a, // cols7+4 - Uppercase Epsilon + 0x80c, // cols7+5 - Uppercase Zeta + 0x80e, // cols7+6 - Uppercase Eta + 0x810, // cols7+7 - Uppercase Theta + 0x812, // cols7+8 - Uppercase Iota + 0x814, // cols7+9 - Uppercase Kappa + 0x816, // cols7+10 - Uppercase Lambda + 0x818, // cols7+11 - Uppercase Mu + 0x81a, // cols7+12 - Uppercase Nu + 0x81c, // cols7+13 - Uppercase Xi + 0x81e, // cols7+14 - Uppercase Omicron + 0x820, // cols7+15 - Uppercase Pi +
0x822, // cols7+16 - Uppercase Rho + 0x824, // cols7+17 - Uppercase Sigma + 0x828, // cols7+18 - Uppercase Tau + 0x82a, // cols7+19 - Uppercase Upsilon + 0x82c, // cols7+20 - Uppercase Phi + 0x82e, // cols7+21 - Uppercase Chi + 0x830, // cols7+22 - Uppercase Psi + 0x832, // cols7+23 - Uppercase Omega + 0, // cols7+24 - NO VALUE + + 0x30, // cols8 - 0 + 0x31, // cols8+1 - 1 + 0x32, // cols8+2 - 2 + 0x33, // cols8+3 - 3 + 0x34, // cols8+4 - 4 + 0x35, // cols8+5 - 5 + 0x36, // cols8+6 - 6 + 0x37, // cols8+7 - 7 + 0x38, // cols8+8 - 8 + 0x39, // cols8+9 - 9 + + 0x41, // cols9 - A + 0x124, // cols9+1 - AE digraph + 0x42, // cols9+2 - B + 0x43, // cols9+3 - C + 0xffff, // cols9+4 - CH in spanish + 0x162, // cols9+5 - Holder for C caron in Czech + 0x44, // cols9+6 - D + 0x45, // cols9+7 - E + 0x46, // cols9+8 - F + 0x47, // cols9+9 - G + 0x48, // cols9+10 - H + 0xffff, // cols9+11 - CH in czech or dotless i in turkish + 0x49, // cols9+12 - I + 0x18a, // cols9+13 - IJ Digraph + 0x4a, // cols9+14 - J + 0x4b, // cols9+15 - K + 0x4c, // cols9+16 - L + 0xffff, // cols9+17 - LL in spanish + 0x4d, // cols9+18 - M + 0x4e, // cols9+19 - N + 0x138, // cols9+20 - N Tilde + 0x4f, // cols9+21 - O + 0x1a6, // cols9+22 - OE digraph + 0x50, // cols9+23 - P + 0x51, // cols9+24 - Q + 0x52, // cols9+25 - R + 0x1aa, // cols9+26 - Holder for R caron in Czech + 0x53, // cols9+27 - S + 0x1b0, // cols9+28 - Holder for S caron in Czech + 0x54, // cols9+29 - T + 0x55, // cols9+30 - U + 0x56, // cols9+31 - V + + 0x57, // cols9+32 - W + 0x58, // cols9+33 - X + 0x59, // cols9+34 - Y + 0x5a, // cols9+35 - Z + 0x1ce, // cols9+36 - Holder for Z caron in Czech + 0x158, // cols9+37 - Uppercase Thorn + 0, // cols9+38 - ??? + 0, // cols9+39 - ???
+ 0x5b, // cols9+40 - [ (note: alphabetic - end of list) + 0x5d, // cols9+41 - ] (note: alphabetic - end of list) +// 0xAA - also start of Hebrew + 0x124, // cols9+42 - AE digraph - DK + 0x124, // cols9+43 - AE digraph - NO + 0x122, // cols9+44 - A ring - SW + 0x11E, // cols9+45 - A diaeresis - DK + 0x124, // cols9+46 - AE digraph - IC + 0x150, // cols9+47 - O slash - NO + 0x11e, // cols9+48 - A diaeresis - SW + 0x150, // cols9+49 - O slash - DK + 0x13E, // cols9+50 - O Diaeresis - IC + 0x122, // cols9+51 - A ring - NO + 0x13E, // cols9+52 - O Diaeresis - SW + 0x13E, // cols9+53 - O Diaeresis - DK + 0x150, // cols9+54 - O slash - IC + 0x122, // cols9+55 - A ring - DK + 0x124, // cols9+56 - AE digraph future + 0x13E, // cols9+57 - O Diaeresis future + 0x150, // cols9+58 - O slash future + 0, // cols9+59 - NOT USED future + + 0xA00, // cols10 - Russian A + 0xA02, // cols10+1 - Russian BE + 0xA04, // cols10+2 - Russian VE + 0xA06, // cols10+3 - Russian GHE + 0xA46, // cols10+4 - Ukrainian HARD G + 0xA08, // cols10+5 - Russian DE + 0xA4a, // cols10+6 - Serbian SOFT DJ + 0xA44, // cols10+7 - Macedonian SOFT DJ + 0xA0a, // cols10+8 - Russian E + 0xA0c, // cols10+9 - Russian YO + 0xA4e, // cols10+10 - Ukrainian YE + 0xA0e, // cols10+11 - Russian ZHE + 0xA10, // cols10+12 - Russian ZE + 0xA52, // cols10+13 - Macedonian ZELO + 0xA12, // cols10+14 - Russian I + 0xA58, // cols10+15 - Ukrainian I + 0xA5a, // cols10+16 - Ukrainian I with Two dots + 0xA14, // cols10+17 - Russian SHORT I + 0xA5e, // cols10+18 - Serbian--Macedonian JE + 0xA16, // cols10+19 - Russian KA + 0xA18, // cols10+20 - Russian EL + 0xA68, // cols10+21 - Serbian--Macedonian SOFT L + 0xA1a, // cols10+22 - Russian EM + 0xA1c, // cols10+23 - Russian EN + 0xA6c, // cols10+24 - Serbian--Macedonian SOFT N + 0xA1e, // cols10+25 - Russian O + 0xA20, // cols10+26 - Russian PE + 0xA22, // cols10+27 - Russian ER + 0xA24, // cols10+28 - Russian ES + 0xA26, // cols10+29 - Russian TE + 0xA72, // cols10+30 - Serbian
SOFT T + 0xA60, // cols10+31 - Macedonian SOFT K + 0xA28, // cols10+32 - Russian U + 0xA74, // cols10+33 - Byelorussian SHORT U + 0xA2a, // cols10+34 - Russian EF + 0xA2c, // cols10+35 - Russian HA + 0xA2e, // cols10+36 - Russian TSE + 0xA30, // cols10+37 - Russian CHE + 0xA86, // cols10+38 - Serbian HARD DJ + 0xA32, // cols10+39 - Russian SHA + 0xA34, // cols10+40 - Russian SHCHA + 0xA36, // cols10+41 - Russian ER (also hard sign) + 0xA38, // cols10+42 - Russian ERY + 0xA3a, // cols10+43 - Russian SOFT SIGN + 0xA8e, // cols10+44 - Old Russian YAT + 0xA3c, // cols10+45 - Russian uppercase REVERSE E + 0xA3e, // cols10+46 - Russian YU + 0xA40, // cols10+47 - Russian YA + 0xA3a, // cols10+48 - Russian SOFT SIGN - UKRAINE ONLY + 0 // cols10+49 - future +}; + +/**************************************************************************** +Desc: WP character for each Arabic collation value, indexed from COLS10a+0. +****************************************************************************/ +FLMUINT16 HebArabColToWPChr[] = +{ + // Start at COLS10a+0 +// [0] + 0x0D00 +164, // hamzah + 0x0D00 + 58, // [13,177] alef maddah - NOTE(review): value stored is [13,58] (plain alef); confirm intent + // Read subcollation to get other alef values + 0x0D00 + 60, // baa + 0x0E00 + 48, // Sindhi bb + 0x0E00 + 52, // Sindhi bh + 0x0E00 + 56, // Misc p = peh + 0x0D00 +152, // taa marbuuTah + // subcollation of 1 is taa [13,64] + 0x0E00 + 60, // Urdu T [14,60] + // Pashto T [14,64] +// [8] + 0x0D00 + 68, // thaa + 0x0E00 + 68, // Sindhi th + 0x0E00 + 72, // Sindhi tr + 0x0E00 + 76, // Sindhi Th + 0x0D00 + 72, // jiim - jeem + 0x0E00 + 80, // Sindhi jj + 0x0E00 + 84, // Sindhi ny + 0x0E00 + 88, // Misc ch + // Sindhi chh [14,92] +// [16] + 0x0D00 + 76, // Haa + 0x0D00 + 80, // khaa + 0x0E00 + 96, // Pashto ts + 0x0E00 +100, // Pashto dz + + 0x0D00 + 84, // dal + 0x0E00 +104, // Urdu D + // Pashto D + 0x0D00 + 86, // thal + 0x0E00 +108, // Sindhi dh + +// [24] + 0x0E00 +110, // Sindhi D + 0x0E00 +112, // Sindhi Dr + 0x0E00 +114, // Sindhi Dh + + 0x0D00 + 88, // ra + // Kurdish rolled r [14,122] + 0x0E00 +116, // Pashto r [14,116] -
must pick this! + // Urdu R [14,118] + // Sindhi r [14,120] + + 0x0D00 + 90, // zain + 0x0E00 +126, // Mizc Z=jeh [14,126] + // Pashto zz [14,128] + // Pashto g [14,130] + + 0x0D00 + 92, // seen + +// [32] + 0x0D00 + 96, // sheen + 0x0E00 +132, // Pashto x + 0x0D00 +100, // Sad + 0x0D00 +104, // Dad + 0x0D00 +108, // Tah + 0x0D00 +112, // Za (dhah) + 0x0D00 +116, // 'ain + 0x0D00 +120, // ghain + // malay ng [14,136] +// [40] + 0x0D00 +124, // fa + 0x0E00 +140, // Malay p, kurdish v = veh + // Sindhi ph [14,144] + 0x0D00 +128, // Qaf + 0x0D00 +132, // kaf (caf) + // Misc k [14,148] + // misc k - no unicode [14,152] + // Sindhi k [14,156] + + 0x0E00 +160, // Persian/Urdu gaf + // gaf - no unicode [14,164] + // malay g [14,168] + // Sindhi ng [14,172] + 0x0E00 +176, // Singhi gg + + 0x0D00 +136, // lam - all ligature variants + // Kurdish valar lam [14,180] + // Kurdish lamalef - no unicode [14,184] + + 0x0D00 +140, // meem + +// [48] + 0x0D00 +144, // noon + // Urdu n [14,186] + // Pashto N [14,190] + // Sindhi N [14,194] + 0x0D00 +148, // ha - arabic language only! 
+ 0x0D00 +154, // waw + // Kurdish o [14,198] + // Kurdish o with bar [14,200] + // Kurdish o with 2 dots [14,202] + 0x0D00 +148, // ha - non-arabic language + // Urdu h [14,204] + // Farsi hamzah on ha [14,218] + 0x0D00 +160, // alef maqsurah + // Kurdish e - ya /w small v + + 0x0D00 +156, // ya + 0x0E00 +212 // Urdu ya barree + // Malay ny [14,214] +}; + +/**************************************************************************** +Desc: Maps an Arabic sub-collation value to its WP character (alef/hamzah variants and taa); entry 0 is the default alef maddah +****************************************************************************/ +FLMUINT16 ArabSubColToWPChr[] = +{ + 0x0D00 +177, // Alef maddah - default value - here for documentation + 0x0D00 +165, // Alef Hamzah + 0x0D00 +169, // Waw hamzah + 0x0D00 +167, // Hamzah under alef + 0x0D00 +171, // ya hamzah + 0x0D00 +175, // alef fathattan + 0x0D00 +179, // alef waslah + 0x0D00 + 58, // alef + 0x0D00 + 64 // taa - after taa marbuuTah +}; + +/**************************************************************************** +Desc: Turns a collated diacritic value into the original diacritic value +****************************************************************************/ +FLMBYTE ml1_COLtoD[27] = +{ + 23, // dbls sort value = 0 sorts as 'ss' + 6, // acute sort value = 1 + 0, // grave sort value = 2 + 22, // breve sort value = 3 + 3, // circum sort value = 4 + 19, // caron sort value = 5 + 7, // umlaut sort value = 6 + 2, // tilde sort value = 7 + 14, // ring sort value = 8 + 7, // umlaut in SU,SV & CZ after ring = 9 + 5, // slash sort value = 10 + 17, // cedilla sort value = 11 + 4, // crossb sort value = 12 + 15, // dota sort value = 13 + 18, // ogonek sort value = 14 + 20, // stroke sort value = 15 + 1, // centerd sort value = 16 + 8, // macron sort value = 17 + 9, // aposab sort value = 18 + 10, // aposbes sort value = 19 + 11, // aposba sort value = 20 + 12, // aposbc sort value = 21 + 13, // aposbl sort value = 22 + 16, // dacute sort value = 23 + 21, // bara sort value = 24 + 24, // dotlesi sort value = 25 + 25 // dotlesj sort
value = 26 +}; + +/**************************************************************************** +Desc: Maps a kana collation value to a kana character value (one entry per kana group; see the per-entry comments) +Notes: Only 48 values + 0x40, 0x41, 0x42 (169..171) +****************************************************************************/ +FLMBYTE ColToKanaTbl[ 48] = +{ + 0, // a=0, A=1 + 2, // i=2, I=3 + 4, // u=4, U=5, VU=83 + 6, // e=6, E=7 + 8, // o=8, O=9 + 84, // KA=10, GA=11, ka=84 - remember voicing table is optimized + // so that zero value is position and + // if voice=1 and no 0 is changed to 0 + 12, // KI=12, GI=13 + 14, // KU=14, GU=15 + 85, // KE=16, GE=17, ke=85 + 18, // KO=18, GO=19 + 20, // SA=20, ZA=21 + 22, // SHI=22, JI=23 + 24, // SU=24, ZU=25 + 26, // SE=26, ZE=27 + 28, // SO=28, ZO=29 + 30, // TA=30, DA=31 + 32, // CHI=32, JI=33 + 34, // tsu=34, TSU=35, ZU=36 + 37, // TE=37, DE=38 + 39, // TO=39, DO=40 + 41, // NA + 42, // NI + 43, // NU + 44, // NE + 45, // NO + 46, // HA, BA, PA + 49, // HI, BI, PI + 52, // FU, BU, PU + 55, // HE, BE, PE + 58, // HO, BO, PO + 61, // MA + 62, // MI + 63, // MU + 64, // ME + 65, // MO + 66, // ya, YA + 68, // yu, YU + 70, // yo, YO + 72, // RA + 73, // RI + 74, // RU + 75, // RE + 76, // RO + 77, // wa, WA + 79, // WI + 80, // WE + 81, // WO + 82 // N +}; + +/**************************************************************************** +Desc: Two-letter language codes, stored as consecutive byte pairs (one pair per language) +****************************************************************************/ +static FLMBYTE f_langtbl[ FLM_LAST_LANG + FLM_LAST_LANG] = +{ + 'U', 'S', // English, United States + 'A', 'F', // Afrikaans + 'A', 'R', // Arabic + 'C', 'A', // Catalan + 'H', 'R', // Croatian + 'C', 'Z', // Czech + 'D', 'K', // Danish + 'N', 'L', // Dutch + 'O', 'Z', // English, Australia + 'C', 'E', // English, Canada + 'U', 'K', // English, United Kingdom + 'F', 'A', // Farsi + 'S', 'U', // Finnish + 'C', 'F', // French, Canada + 'F', 'R', // French, France + 'G', 'A', // Galician + 'D', 'E', // German, Germany + 'S', 'D', // German, Switzerland + 'G', 'R', // Greek + 'H', 'E', // Hebrew +
'M', 'A', // Hungarian + 'I', 'S', // Icelandic + 'I', 'T', // Italian + 'N', 'O', // Norwegian + 'P', 'L', // Polish + 'B', 'R', // Portuguese, Brazil + 'P', 'O', // Portuguese, Portugal + 'R', 'U', // Russian + 'S', 'L', // Slovak + 'E', 'S', // Spanish + 'S', 'V', // Swedish + 'Y', 'K', // Ukrainian + 'U', 'R', // Urdu + 'T', 'K', // Turkish + 'J', 'P', // Japanese + 'K', 'R', // Korean + 'C', 'T', // Chinese-Traditional + 'C', 'S', // Chinese-Simplified + 'L', 'A' // Future asian language +}; + +/**************************************************************************** +Desc: UNICODE to WP6 character mapping table +Notes: This table is used to convert a subset of Unicode characters to + their WordPerfect equivalents so that the WP collation routines + can be used for indexing. This contains characters that can be + mapped 1:1 from Unicode->WP and from WP->Unicode. There is + no ambiguity and there are no character expansions or + contractions. +****************************************************************************/ +#define UTOWP60_ENTRIES 1502 +FLMUINT16 WP_UTOWP60[ UTOWP60_ENTRIES][2] = +{ + { 0x00A1, 0x0407 }, // 7 , 4 + { 0x00A2, 0x0413 }, // 19 , 4 + { 0x00A3, 0x040b }, // 11 , 4 + { 0x00A4, 0x0418 }, // 24 , 4 + { 0x00A5, 0x040c }, // 12 , 4 + { 0x00A7, 0x0406 }, // 6 , 4 + { 0x00A9, 0x0417 }, // 23 , 4 + { 0x00AA, 0x040f }, // 15 , 4 + { 0x00AB, 0x0409 }, // 9 , 4 + { 0x00AC, 0x0614 }, // 20 , 6 + { 0x00AE, 0x0416 }, // 22 , 4 + { 0x00B0, 0x0624 }, // 36 , 6 + { 0x00B1, 0x0601 }, // 1 , 6 + { 0x00B2, 0x0414 }, // 20 , 4 + { 0x00B3, 0x041a }, // 26 , 4 + { 0x00B5, 0x0625 }, // 37 , 6 + { 0x00B6, 0x0405 }, // 5 , 4 + { 0x00B7, 0x0101 }, // 1 , 1 + { 0x00B9, 0x044e }, // 78 , 4 + { 0x00BA, 0x0410 }, // 16 , 4 + { 0x00BB, 0x040a }, // 10 , 4 + { 0x00BC, 0x0412 }, // 18 , 4 + { 0x00BD, 0x0411 }, // 17 , 4 + { 0x00BE, 0x0419 }, // 25 , 4 + { 0x00BF, 0x0408 }, // 8 , 4 + { 0x00C0, 0x0120 }, // 32 , 1 + { 0x00C1, 0x011a }, // 26 , 1 + { 0x00C2, 0x011c
}, // 28 , 1 + { 0x00C3, 0x014c }, // 76 , 1 + { 0x00C4, 0x011e }, // 30 , 1 + { 0x00C5, 0x0122 }, // 34 , 1 + { 0x00C6, 0x0124 }, // 36 , 1 + { 0x00C7, 0x0126 }, // 38 , 1 + { 0x00C8, 0x012e }, // 46 , 1 + { 0x00C9, 0x0128 }, // 40 , 1 + { 0x00CA, 0x012a }, // 42 , 1 + { 0x00CB, 0x012c }, // 44 , 1 + { 0x00CC, 0x0136 }, // 54 , 1 + { 0x00CD, 0x0130 }, // 48 , 1 + { 0x00CE, 0x0132 }, // 50 , 1 + { 0x00CF, 0x0134 }, // 52 , 1 + { 0x00D0, 0x0156 }, // 86 , 1 + { 0x00D1, 0x0138 }, // 56 , 1 + { 0x00D2, 0x0140 }, // 64 , 1 + { 0x00D3, 0x013a }, // 58 , 1 + { 0x00D4, 0x013c }, // 60 , 1 + { 0x00D5, 0x0152 }, // 82 , 1 + { 0x00D6, 0x013e }, // 62 , 1 + { 0x00D7, 0x0627 }, // 39 , 6 + { 0x00D8, 0x0150 }, // 80 , 1 + { 0x00D9, 0x0148 }, // 72 , 1 + { 0x00DA, 0x0142 }, // 66 , 1 + { 0x00DB, 0x0144 }, // 68 , 1 + { 0x00DC, 0x0146 }, // 70 , 1 + { 0x00DD, 0x0154 }, // 84 , 1 + { 0x00DE, 0x0158 }, // 88 , 1 + { 0x00DF, 0x0117 }, // 23 , 1 + { 0x00E0, 0x0121 }, // 33 , 1 + { 0x00E1, 0x011b }, // 27 , 1 + { 0x00E2, 0x011d }, // 29 , 1 + { 0x00E3, 0x014d }, // 77 , 1 + { 0x00E4, 0x011f }, // 31 , 1 + { 0x00E5, 0x0123 }, // 35 , 1 + { 0x00E6, 0x0125 }, // 37 , 1 + { 0x00E7, 0x0127 }, // 39 , 1 + { 0x00E8, 0x012f }, // 47 , 1 + { 0x00E9, 0x0129 }, // 41 , 1 + { 0x00EA, 0x012b }, // 43 , 1 + { 0x00EB, 0x012d }, // 45 , 1 + { 0x00EC, 0x0137 }, // 55 , 1 + { 0x00ED, 0x0131 }, // 49 , 1 + { 0x00EE, 0x0133 }, // 51 , 1 + { 0x00EF, 0x0135 }, // 53 , 1 + { 0x00F0, 0x0157 }, // 87 , 1 + { 0x00F1, 0x0139 }, // 57 , 1 + { 0x00F2, 0x0141 }, // 65 , 1 + { 0x00F3, 0x013b }, // 59 , 1 + { 0x00F4, 0x013d }, // 61 , 1 + { 0x00F5, 0x0153 }, // 83 , 1 + { 0x00F6, 0x013f }, // 63 , 1 + { 0x00F7, 0x0608 }, // 8 , 6 + { 0x00F8, 0x0151 }, // 81 , 1 + { 0x00F9, 0x0149 }, // 73 , 1 + { 0x00FA, 0x0143 }, // 67 , 1 + { 0x00FB, 0x0145 }, // 69 , 1 + { 0x00FC, 0x0147 }, // 71 , 1 + { 0x00FD, 0x0155 }, // 85 , 1 + { 0x00FE, 0x0159 }, // 89 , 1 + { 0x00FF, 0x014b }, // 75 , 1 + { 0x0100, 0x015c }, // 92 , 1 + { 
0x0101, 0x015d }, // 93 , 1 + { 0x0102, 0x015a }, // 90 , 1 + { 0x0103, 0x015b }, // 91 , 1 + { 0x0104, 0x015e }, // 94 , 1 + { 0x0105, 0x015f }, // 95 , 1 + { 0x0106, 0x0160 }, // 96 , 1 + { 0x0107, 0x0161 }, // 97 , 1 + { 0x0108, 0x0164 }, // 100, 1 + { 0x0109, 0x0165 }, // 101, 1 + { 0x010A, 0x0166 }, // 102, 1 + { 0x010B, 0x0167 }, // 103, 1 + { 0x010C, 0x0162 }, // 98 , 1 + { 0x010D, 0x0163 }, // 99 , 1 + { 0x010E, 0x0168 }, // 104, 1 + { 0x010F, 0x0169 }, // 105, 1 + { 0x0110, 0x014e }, // 78 , 1 + { 0x0111, 0x014f }, // 79 , 1 + { 0x0112, 0x016e }, // 110, 1 + { 0x0113, 0x016f }, // 111, 1 + { 0x0114, 0x01ea }, // 234, 1 + { 0x0115, 0x01eb }, // 235, 1 + { 0x0116, 0x016c }, // 108, 1 + { 0x0117, 0x016d }, // 109, 1 + { 0x0118, 0x0170 }, // 112, 1 + { 0x0119, 0x0171 }, // 113, 1 + { 0x011A, 0x016a }, // 106, 1 + { 0x011B, 0x016b }, // 107, 1 + { 0x011C, 0x017a }, // 122, 1 + { 0x011D, 0x017b }, // 123, 1 + { 0x011E, 0x0174 }, // 116, 1 + { 0x011F, 0x0175 }, // 117, 1 + { 0x0120, 0x017c }, // 124, 1 + { 0x0121, 0x017d }, // 125, 1 + { 0x0122, 0x0178 }, // 120, 1 + { 0x0123, 0x0179 }, // 121, 1 + { 0x0124, 0x017e }, // 126, 1 + { 0x0125, 0x017f }, // 127, 1 + { 0x0126, 0x0180 }, // 128, 1 + { 0x0127, 0x0181 }, // 129, 1 + { 0x0128, 0x0188 }, // 136, 1 + { 0x0129, 0x0189 }, // 137, 1 + { 0x012A, 0x0184 }, // 132, 1 + { 0x012B, 0x0185 }, // 133, 1 + { 0x012C, 0x01ec }, // 236, 1 + { 0x012D, 0x01ed }, // 237, 1 + { 0x012E, 0x0186 }, // 134, 1 + { 0x012F, 0x0187 }, // 135, 1 + { 0x0130, 0x0182 }, // 130, 1 + { 0x0131, 0x01ef }, // 239, 1 + { 0x0132, 0x018a }, // 138, 1 + { 0x0133, 0x018b }, // 139, 1 + { 0x0134, 0x018c }, // 140, 1 + { 0x0135, 0x018d }, // 141, 1 + { 0x0136, 0x018e }, // 142, 1 + { 0x0137, 0x018f }, // 143, 1 + { 0x0138, 0x0118 }, // 24 , 1 + { 0x0139, 0x0190 }, // 144, 1 + { 0x013A, 0x0191 }, // 145, 1 + { 0x013B, 0x0194 }, // 148, 1 + { 0x013C, 0x0195 }, // 149, 1 + { 0x013D, 0x0192 }, // 146, 1 + { 0x013E, 0x0193 }, // 147, 1 + { 0x013F, 0x0196 
}, // 150, 1 + { 0x0140, 0x0197 }, // 151, 1 + { 0x0141, 0x0198 }, // 152, 1 + { 0x0142, 0x0199 }, // 153, 1 + { 0x0143, 0x019a }, // 154, 1 + { 0x0144, 0x019b }, // 155, 1 + { 0x0145, 0x01a0 }, // 160, 1 + { 0x0146, 0x01a1 }, // 161, 1 + { 0x0147, 0x019e }, // 158, 1 + { 0x0148, 0x019f }, // 159, 1 + { 0x0149, 0x019d }, // 157, 1 + { 0x014A, 0x01d2 }, // 210, 1 + { 0x014B, 0x01d3 }, // 211, 1 + { 0x014C, 0x01a4 }, // 164, 1 + { 0x014D, 0x01a5 }, // 165, 1 + { 0x014E, 0x01f0 }, // 240, 1 + { 0x014F, 0x01f1 }, // 241, 1 + { 0x0150, 0x01a2 }, // 162, 1 + { 0x0151, 0x01a3 }, // 163, 1 + { 0x0152, 0x01a6 }, // 166, 1 + { 0x0153, 0x01a7 }, // 167, 1 + { 0x0154, 0x01a8 }, // 168, 1 + { 0x0155, 0x01a9 }, // 169, 1 + { 0x0156, 0x01ac }, // 172, 1 + { 0x0157, 0x01ad }, // 173, 1 + { 0x0158, 0x01aa }, // 170, 1 + { 0x0159, 0x01ab }, // 171, 1 + { 0x015A, 0x01ae }, // 174, 1 + { 0x015B, 0x01af }, // 175, 1 + { 0x015C, 0x01b4 }, // 180, 1 + { 0x015D, 0x01b5 }, // 181, 1 + { 0x015E, 0x01b2 }, // 178, 1 + { 0x015F, 0x01b3 }, // 179, 1 + { 0x0160, 0x01b0 }, // 176, 1 + { 0x0161, 0x01b1 }, // 177, 1 + { 0x0162, 0x01b8 }, // 184, 1 + { 0x0163, 0x01b9 }, // 185, 1 + { 0x0164, 0x01b6 }, // 182, 1 + { 0x0165, 0x01b7 }, // 183, 1 + { 0x0166, 0x01ba }, // 186, 1 + { 0x0167, 0x01bb }, // 187, 1 + { 0x0168, 0x01c6 }, // 198, 1 + { 0x0169, 0x01c7 }, // 199, 1 + { 0x016A, 0x01c0 }, // 192, 1 + { 0x016B, 0x01c1 }, // 193, 1 + { 0x016C, 0x01bc }, // 188, 1 + { 0x016D, 0x01bd }, // 189, 1 + { 0x016E, 0x01c4 }, // 196, 1 + { 0x016F, 0x01c5 }, // 197, 1 + { 0x0170, 0x01be }, // 190, 1 + { 0x0171, 0x01bf }, // 191, 1 + { 0x0172, 0x01c2 }, // 194, 1 + { 0x0173, 0x01c3 }, // 195, 1 + { 0x0174, 0x01c8 }, // 200, 1 + { 0x0175, 0x01c9 }, // 201, 1 + { 0x0176, 0x01ca }, // 202, 1 + { 0x0177, 0x01cb }, // 203, 1 + { 0x0178, 0x014a }, // 74 , 1 + { 0x0179, 0x01cc }, // 204, 1 + { 0x017A, 0x01cd }, // 205, 1 + { 0x017B, 0x01d0 }, // 208, 1 + { 0x017C, 0x01d1 }, // 209, 1 + { 0x017D, 0x01ce }, // 206, 1 + 
{ 0x017E, 0x01cf }, // 207, 1 + { 0x0192, 0x040e }, // 14 , 4 + { 0x0194, 0x0a7c }, // 124, 10 + { 0x01A0, 0x01e6 }, // 230, 1 + { 0x01A1, 0x01e7 }, // 231, 1 + { 0x01AF, 0x01e8 }, // 232, 1 + { 0x01B0, 0x01e9 }, // 233, 1 + { 0x01C0, 0x0605 }, // 5 , 6 + { 0x0250, 0x0237 }, // 55 , 2 + { 0x0251, 0x0238 }, // 56 , 2 + { 0x0252, 0x0239 }, // 57 , 2 + { 0x0253, 0x023a }, // 58 , 2 + { 0x0254, 0x023c }, // 60 , 2 + { 0x0255, 0x023d }, // 61 , 2 + { 0x0256, 0x023f }, // 63 , 2 + { 0x0257, 0x0240 }, // 64 , 2 + { 0x0258, 0x0241 }, // 65 , 2 + { 0x0259, 0x0242 }, // 66 , 2 + { 0x025A, 0x0243 }, // 67 , 2 + { 0x025B, 0x0244 }, // 68 , 2 + { 0x025C, 0x0245 }, // 69 , 2 + { 0x025D, 0x0246 }, // 70 , 2 + { 0x025E, 0x0248 }, // 72 , 2 + { 0x025F, 0x0249 }, // 73 , 2 + { 0x0260, 0x024c }, // 76 , 2 + { 0x0261, 0x024b }, // 75 , 2 + { 0x0262, 0x024d }, // 77 , 2 + { 0x0263, 0x024f }, // 79 , 2 + { 0x0264, 0x0250 }, // 80 , 2 + { 0x0265, 0x0251 }, // 81 , 2 + { 0x0266, 0x0252 }, // 82 , 2 + { 0x0267, 0x0253 }, // 83 , 2 + { 0x0268, 0x0255 }, // 85 , 2 + { 0x0269, 0x0257 }, // 87 , 2 + { 0x026A, 0x0256 }, // 86 , 2 + { 0x026B, 0x025a }, // 90 , 2 + { 0x026C, 0x025b }, // 91 , 2 + { 0x026D, 0x025c }, // 92 , 2 + { 0x026E, 0x025e }, // 94 , 2 + { 0x026F, 0x0260 }, // 96 , 2 + { 0x0270, 0x0261 }, // 97 , 2 + { 0x0271, 0x0262 }, // 98 , 2 + { 0x0272, 0x0263 }, // 99 , 2 + { 0x0273, 0x0264 }, // 100, 2 + { 0x0274, 0x0265 }, // 101, 2 + { 0x0275, 0x0279 }, // 121, 2 + { 0x0276, 0x0266 }, // 102, 2 + { 0x0277, 0x0267 }, // 103, 2 + { 0x0278, 0x024a }, // 74 , 2 + { 0x0279, 0x0269 }, // 105, 2 + { 0x027A, 0x026a }, // 106, 2 + { 0x027B, 0x026b }, // 107, 2 + { 0x027C, 0x026c }, // 108, 2 + { 0x027D, 0x026d }, // 109, 2 + { 0x027E, 0x026e }, // 110, 2 + { 0x027F, 0x026f }, // 111, 2 + { 0x0280, 0x0270 }, // 112, 2 + { 0x0281, 0x0271 }, // 113, 2 + { 0x0282, 0x0272 }, // 114, 2 + { 0x0283, 0x0273 }, // 115, 2 + { 0x0284, 0x0274 }, // 116, 2 + { 0x0285, 0x0275 }, // 117, 2 + { 0x0286, 
0x0276 }, // 118, 2 + { 0x0287, 0x0277 }, // 119, 2 + { 0x0288, 0x0278 }, // 120, 2 + { 0x0289, 0x027a }, // 122, 2 + { 0x028A, 0x027b }, // 123, 2 + { 0x028B, 0x027d }, // 125, 2 + { 0x028C, 0x027c }, // 124, 2 + { 0x028D, 0x027e }, // 126, 2 + { 0x028E, 0x025f }, // 95 , 2 + { 0x028F, 0x0280 }, // 128, 2 + { 0x0290, 0x0281 }, // 129, 2 + { 0x0291, 0x0282 }, // 130, 2 + { 0x0292, 0x0283 }, // 131, 2 + { 0x0293, 0x0284 }, // 132, 2 + { 0x0294, 0x0285 }, // 133, 2 + { 0x0295, 0x0286 }, // 134, 2 + { 0x0296, 0x0287 }, // 135, 2 + { 0x0297, 0x023e }, // 62 , 2 + { 0x0298, 0x028a }, // 138, 2 + { 0x0299, 0x023b }, // 59 , 2 + { 0x029A, 0x0247 }, // 71 , 2 + { 0x029B, 0x024e }, // 78 , 2 + { 0x029C, 0x0254 }, // 84 , 2 + { 0x029D, 0x0258 }, // 88 , 2 + { 0x029E, 0x0259 }, // 89 , 2 + { 0x029F, 0x025d }, // 93 , 2 + { 0x02A0, 0x0268 }, // 104, 2 + { 0x02A1, 0x0288 }, // 136, 2 + { 0x02A2, 0x0289 }, // 137, 2 + { 0x02A3, 0x028b }, // 139, 2 + { 0x02A4, 0x028c }, // 140, 2 + { 0x02A5, 0x028d }, // 141, 2 + { 0x02A6, 0x028e }, // 142, 2 + { 0x02A7, 0x028f }, // 143, 2 + { 0x02A8, 0x0290 }, // 144, 2 + { 0x02B0, 0x0235 }, // 53 , 2 + { 0x02B6, 0x0236 }, // 54 , 2 + { 0x02B9, 0x0200 }, // 0 , 2 + { 0x02BA, 0x0201 }, // 1 , 2 + { 0x02BB, 0x0202 }, // 2 , 2 + { 0x02BC, 0x0205 }, // 5 , 2 + { 0x02BD, 0x0204 }, // 4 , 2 + { 0x02BE, 0x0207 }, // 7 , 2 + { 0x02BF, 0x0208 }, // 8 , 2 + { 0x02C6, 0x0217 }, // 23 , 2 + { 0x02C7, 0x0218 }, // 24 , 2 + { 0x02C8, 0x020f }, // 15 , 2 + { 0x02C9, 0x0211 }, // 17 , 2 + { 0x02CA, 0x0212 }, // 18 , 2 + { 0x02CB, 0x0213 }, // 19 , 2 + { 0x02CC, 0x0210 }, // 16 , 2 + { 0x02CD, 0x0214 }, // 20 , 2 + { 0x02CE, 0x0215 }, // 21 , 2 + { 0x02CF, 0x0216 }, // 22 , 2 + { 0x02D0, 0x020a }, // 10 , 2 + { 0x02D1, 0x020b }, // 11 , 2 + { 0x02D2, 0x022a }, // 42 , 2 + { 0x02D3, 0x022b }, // 43 , 2 + { 0x02DA, 0x021b }, // 27 , 2 + { 0x02DB, 0x0231 }, // 49 , 2 + { 0x02DC, 0x0219 }, // 25 , 2 + { 0x02DE, 0x0233 }, // 51 , 2 + { 0x0300, 0x0100 }, // 0 , 1 + { 
0x0301, 0x0106 }, // 6 , 1 + { 0x0302, 0x0103 }, // 3 , 1 + { 0x0303, 0x0102 }, // 2 , 1 + { 0x0304, 0x0108 }, // 8 , 1 + { 0x0305, 0x0115 }, // 21 , 1 + { 0x0306, 0x0116 }, // 22 , 1 + { 0x0307, 0x010f }, // 15 , 1 + { 0x0308, 0x0107 }, // 7 , 1 + { 0x030A, 0x010e }, // 14 , 1 + { 0x030B, 0x0110 }, // 16 , 1 + { 0x030C, 0x0113 }, // 19 , 1 + { 0x0310, 0x0209 }, // 9 , 2 + { 0x0311, 0x0858 }, // 88 , 8 + { 0x0313, 0x0109 }, // 9 , 1 + { 0x0314, 0x085a }, // 90 , 8 + { 0x0315, 0x010a }, // 10 , 1 + { 0x031C, 0x0221 }, // 33 , 2 + { 0x031D, 0x0222 }, // 34 , 2 + { 0x031E, 0x0223 }, // 35 , 2 + { 0x031F, 0x0224 }, // 36 , 2 + { 0x0320, 0x0225 }, // 37 , 2 + { 0x0321, 0x0226 }, // 38 , 2 + { 0x0322, 0x0227 }, // 39 , 2 + { 0x0323, 0x021e }, // 30 , 2 + { 0x0324, 0x0220 }, // 32 , 2 + { 0x0325, 0x021a }, // 26 , 2 + { 0x0326, 0x010c }, // 12 , 1 + { 0x0327, 0x0111 }, // 17 , 1 + { 0x0328, 0x0112 }, // 18 , 1 + { 0x0329, 0x020e }, // 14 , 2 + { 0x032A, 0x0228 }, // 40 , 2 + { 0x032B, 0x0229 }, // 41 , 2 + { 0x032C, 0x021d }, // 29 , 2 + { 0x032D, 0x021c }, // 28 , 2 + { 0x032E, 0x020d }, // 13 , 2 + { 0x0335, 0x0104 }, // 4 , 1 + { 0x0337, 0x0114 }, // 20 , 1 + { 0x0338, 0x0105 }, // 5 , 1 + { 0x033E, 0x0230 }, // 48 , 2 + { 0x0345, 0x085b }, // 91 , 8 + { 0x0374, 0x0851 }, // 81 , 8 + { 0x0375, 0x0852 }, // 82 , 8 + { 0x0391, 0x0800 }, // 0 , 8 + { 0x0392, 0x0802 }, // 2 , 8 + { 0x0393, 0x0806 }, // 6 , 8 + { 0x0394, 0x0808 }, // 8 , 8 + { 0x0395, 0x080a }, // 10 , 8 + { 0x0396, 0x080c }, // 12 , 8 + { 0x0397, 0x080e }, // 14 , 8 + { 0x0398, 0x0810 }, // 16 , 8 + { 0x0399, 0x0812 }, // 18 , 8 + { 0x039A, 0x0814 }, // 20 , 8 + { 0x039B, 0x0816 }, // 22 , 8 + { 0x039C, 0x0818 }, // 24 , 8 + { 0x039D, 0x081a }, // 26 , 8 + { 0x039E, 0x081c }, // 28 , 8 + { 0x039F, 0x081e }, // 30 , 8 + { 0x03A0, 0x0820 }, // 32 , 8 + { 0x03A1, 0x0822 }, // 34 , 8 + { 0x03A3, 0x0824 }, // 36 , 8 + { 0x03A4, 0x0828 }, // 40 , 8 + { 0x03A5, 0x082a }, // 42 , 8 + { 0x03A6, 0x082c }, // 44 , 8 
+ { 0x03A7, 0x082e }, // 46 , 8 + { 0x03A8, 0x0830 }, // 48 , 8 + { 0x03A9, 0x0832 }, // 50 , 8 + { 0x03AA, 0x083c }, // 60 , 8 + { 0x03AB, 0x0842 }, // 66 , 8 + { 0x03AC, 0x0835 }, // 53 , 8 + { 0x03AD, 0x0837 }, // 55 , 8 + { 0x03AE, 0x0839 }, // 57 , 8 + { 0x03AF, 0x083b }, // 59 , 8 + { 0x03B1, 0x0801 }, // 1 , 8 + { 0x03B2, 0x0803 }, // 3 , 8 + { 0x03B3, 0x0807 }, // 7 , 8 + { 0x03B4, 0x0809 }, // 9 , 8 + { 0x03B5, 0x080b }, // 11 , 8 + { 0x03B6, 0x080d }, // 13 , 8 + { 0x03B7, 0x080f }, // 15 , 8 + { 0x03B8, 0x0811 }, // 17 , 8 + { 0x03B9, 0x0813 }, // 19 , 8 + { 0x03BA, 0x0815 }, // 21 , 8 + { 0x03BB, 0x0817 }, // 23 , 8 + { 0x03BC, 0x0819 }, // 25 , 8 + { 0x03BD, 0x081b }, // 27 , 8 + { 0x03BE, 0x081d }, // 29 , 8 + { 0x03BF, 0x081f }, // 31 , 8 + { 0x03C0, 0x0821 }, // 33 , 8 + { 0x03C1, 0x0823 }, // 35 , 8 + { 0x03C2, 0x0827 }, // 39 , 8 + { 0x03C3, 0x0825 }, // 37 , 8 + { 0x03C4, 0x0829 }, // 41 , 8 + { 0x03C5, 0x082b }, // 43 , 8 + { 0x03C6, 0x082d }, // 45 , 8 + { 0x03C7, 0x082f }, // 47 , 8 + { 0x03C8, 0x0831 }, // 49 , 8 + { 0x03C9, 0x0833 }, // 51 , 8 + { 0x03CA, 0x083d }, // 61 , 8 + { 0x03CB, 0x0843 }, // 67 , 8 + { 0x03CC, 0x083f }, // 63 , 8 + { 0x03CD, 0x0841 }, // 65 , 8 + { 0x03CE, 0x0845 }, // 69 , 8 + { 0x03D0, 0x0805 }, // 5 , 8 + { 0x03D1, 0x0847 }, // 71 , 8 + { 0x03D2, 0x084c }, // 76 , 8 + { 0x03D5, 0x084d }, // 77 , 8 + { 0x03D6, 0x0849 }, // 73 , 8 + { 0x03D7, 0x084f }, // 79 , 8 + { 0x03DA, 0x08d7 }, // 215, 8 + { 0x03DB, 0x084B }, // 75 , 8 + { 0x03DC, 0x08d8 }, // 216, 8 + { 0x03DE, 0x08d9 }, // 217, 8 + { 0x03E0, 0x08da }, // 218, 8 + { 0x03F0, 0x0848 }, // 72 , 8 + { 0x03F1, 0x084a }, // 74 , 8 + { 0x0401, 0x0a0c }, // 12 , 10 + { 0x0402, 0x0a4a }, // 74 , 10 + { 0x0403, 0x0a44 }, // 68 , 10 + { 0x0404, 0x0a4e }, // 78 , 10 + { 0x0405, 0x0a52 }, // 82 , 10 + { 0x0406, 0x0a58 }, // 88 , 10 + { 0x0407, 0x0a5a }, // 90 , 10 + { 0x0408, 0x0a5e }, // 94 , 10 + { 0x0409, 0x0a68 }, // 104, 10 + { 0x040A, 0x0a6c }, // 108, 10 + { 
0x040B, 0x0a72 }, // 114, 10 + { 0x040C, 0x0a60 }, // 96 , 10 + { 0x040E, 0x0a74 }, // 116, 10 + { 0x040F, 0x0a86 }, // 134, 10 + { 0x0410, 0x0a00 }, // 0 , 10 + { 0x0411, 0x0a02 }, // 2 , 10 + { 0x0412, 0x0a04 }, // 4 , 10 + { 0x0413, 0x0a06 }, // 6 , 10 + { 0x0414, 0x0a08 }, // 8 , 10 + { 0x0415, 0x0a0a }, // 10 , 10 + { 0x0416, 0x0a0e }, // 14 , 10 + { 0x0417, 0x0a10 }, // 16 , 10 + { 0x0418, 0x0a12 }, // 18 , 10 + { 0x0419, 0x0a14 }, // 20 , 10 + { 0x041A, 0x0a16 }, // 22 , 10 + { 0x041B, 0x0a18 }, // 24 , 10 + { 0x041C, 0x0a1a }, // 26 , 10 + { 0x041D, 0x0a1c }, // 28 , 10 + { 0x041E, 0x0a1e }, // 30 , 10 + { 0x041F, 0x0a20 }, // 32 , 10 + { 0x0420, 0x0a22 }, // 34 , 10 + { 0x0421, 0x0a24 }, // 36 , 10 + { 0x0422, 0x0a26 }, // 38 , 10 + { 0x0423, 0x0a28 }, // 40 , 10 + { 0x0424, 0x0a2a }, // 42 , 10 + { 0x0425, 0x0a2c }, // 44 , 10 + { 0x0426, 0x0a2e }, // 46 , 10 + { 0x0427, 0x0a30 }, // 48 , 10 + { 0x0428, 0x0a32 }, // 50 , 10 + { 0x0429, 0x0a34 }, // 52 , 10 + { 0x042A, 0x0a36 }, // 54 , 10 + { 0x042B, 0x0a38 }, // 56 , 10 + { 0x042C, 0x0a3a }, // 58 , 10 + { 0x042D, 0x0a3c }, // 60 , 10 + { 0x042E, 0x0a3e }, // 62 , 10 + { 0x042F, 0x0a40 }, // 64 , 10 + { 0x0430, 0x0a01 }, // 1 , 10 + { 0x0431, 0x0a03 }, // 3 , 10 + { 0x0432, 0x0a05 }, // 5 , 10 + { 0x0433, 0x0a07 }, // 7 , 10 + { 0x0434, 0x0a09 }, // 9 , 10 + { 0x0435, 0x0a0b }, // 11 , 10 + { 0x0436, 0x0a0f }, // 15 , 10 + { 0x0437, 0x0a11 }, // 17 , 10 + { 0x0438, 0x0a13 }, // 19 , 10 + { 0x0439, 0x0a15 }, // 21 , 10 + { 0x043A, 0x0a17 }, // 23 , 10 + { 0x043B, 0x0a19 }, // 25 , 10 + { 0x043C, 0x0a1b }, // 27 , 10 + { 0x043D, 0x0a1d }, // 29 , 10 + { 0x043E, 0x0a1f }, // 31 , 10 + { 0x043F, 0x0a21 }, // 33 , 10 + { 0x0440, 0x0a23 }, // 35 , 10 + { 0x0441, 0x0a25 }, // 37 , 10 + { 0x0442, 0x0a27 }, // 39 , 10 + { 0x0443, 0x0a29 }, // 41 , 10 + { 0x0444, 0x0a2b }, // 43 , 10 + { 0x0445, 0x0a2d }, // 45 , 10 + { 0x0446, 0x0a2f }, // 47 , 10 + { 0x0447, 0x0a31 }, // 49 , 10 + { 0x0448, 0x0a33 }, // 51 , 10 
+ { 0x0449, 0x0a35 }, // 53 , 10 + { 0x044A, 0x0a37 }, // 55 , 10 + { 0x044B, 0x0a39 }, // 57 , 10 + { 0x044C, 0x0a3b }, // 59 , 10 + { 0x044D, 0x0a3d }, // 61 , 10 + { 0x044E, 0x0a3f }, // 63 , 10 + { 0x044F, 0x0a41 }, // 65 , 10 + { 0x0451, 0x0a0d }, // 13 , 10 + { 0x0452, 0x0a4b }, // 75 , 10 + { 0x0453, 0x0a45 }, // 69 , 10 + { 0x0454, 0x0a4f }, // 79 , 10 + { 0x0455, 0x0a53 }, // 83 , 10 + { 0x0456, 0x0a59 }, // 89 , 10 + { 0x0457, 0x0a5b }, // 91 , 10 + { 0x0458, 0x0a5f }, // 95 , 10 + { 0x0459, 0x0a69 }, // 105, 10 + { 0x045A, 0x0a6d }, // 109, 10 + { 0x045B, 0x0a73 }, // 115, 10 + { 0x045C, 0x0a61 }, // 97 , 10 + { 0x045E, 0x0a75 }, // 117, 10 + { 0x045F, 0x0a87 }, // 135, 10 + { 0x0460, 0x0a70 }, // 112, 10 + { 0x0461, 0x0a71 }, // 113, 10 + { 0x0462, 0x0a8e }, // 142, 10 + { 0x0463, 0x0a8f }, // 143, 10 + { 0x0466, 0x0a90 }, // 144, 10 + { 0x0467, 0x0a91 }, // 145, 10 + { 0x046A, 0x0a92 }, // 146, 10 + { 0x046B, 0x0a93 }, // 147, 10 + { 0x046E, 0x0a94 }, // 148, 10 + { 0x046F, 0x0a95 }, // 149, 10 + { 0x0470, 0x0a96 }, // 150, 10 + { 0x0471, 0x0a97 }, // 151, 10 + { 0x0472, 0x0a98 }, // 152, 10 + { 0x0473, 0x0a99 }, // 153, 10 + { 0x0474, 0x0a9a }, // 154, 10 + { 0x0475, 0x0a9b }, // 155, 10 + { 0x047A, 0x0a6e }, // 110, 10 + { 0x047B, 0x0a6f }, // 111, 10 + { 0x047E, 0x0a84 }, // 132, 10 + { 0x047F, 0x0a85 }, // 133, 10 + { 0x0490, 0x0a46 }, // 70 , 10 + { 0x0491, 0x0a47 }, // 71 , 10 + { 0x0492, 0x0a48 }, // 72 , 10 + { 0x0493, 0x0a49 }, // 73 , 10 + { 0x0496, 0x0a50 }, // 80 , 10 + { 0x0497, 0x0a51 }, // 81 , 10 + { 0x049A, 0x0a62 }, // 98 , 10 + { 0x049B, 0x0a63 }, // 99 , 10 + { 0x049C, 0x0a66 }, // 102, 10 + { 0x049D, 0x0a67 }, // 103, 10 + { 0x04A2, 0x0a6a }, // 106, 10 + { 0x04A3, 0x0a6b }, // 107, 10 + { 0x04AE, 0x0a78 }, // 120, 10 + { 0x04AF, 0x0a79 }, // 121, 10 + { 0x04B0, 0x0a7a }, // 122, 10 + { 0x04B1, 0x0a7b }, // 123, 10 + { 0x04B2, 0x0a7e }, // 126, 10 + { 0x04B3, 0x0a7f }, // 127, 10 + { 0x04B6, 0x0a88 }, // 136, 10 + { 0x04B7, 0x0a89 
}, // 137, 10 + { 0x04B8, 0x0a8a }, // 138, 10 + { 0x04B9, 0x0a8b }, // 139, 10 + { 0x04BA, 0x0a82 }, // 130, 10 + { 0x04BB, 0x0a83 }, // 131, 10 + { 0x04D8, 0x0a42 }, // 66 , 10 + { 0x04D9, 0x0a43 }, // 67 , 10 + { 0x04EE, 0x0a76 }, // 118, 10 + { 0x04EF, 0x0a77 }, // 119, 10 + { 0x05B0, 0x0920 }, // 32 , 9 + { 0x05B1, 0x0921 }, // 33 , 9 + { 0x05B2, 0x0922 }, // 34 , 9 + { 0x05B3, 0x0923 }, // 35 , 9 + { 0x05B4, 0x0924 }, // 36 , 9 + { 0x05B5, 0x0925 }, // 37 , 9 + { 0x05B6, 0x0926 }, // 38 , 9 + { 0x05B7, 0x0927 }, // 39 , 9 + { 0x05B8, 0x0928 }, // 40 , 9 + { 0x05B9, 0x0929 }, // 41 , 9 + { 0x05BB, 0x092b }, // 43 , 9 + { 0x05BC, 0x092c }, // 44 , 9 + { 0x05BD, 0x092d }, // 45 , 9 + { 0x05BF, 0x092e }, // 46 , 9 + { 0x05C0, 0x091c }, // 28 , 9 + { 0x05C3, 0x091d }, // 29 , 9 + { 0x05D0, 0x0900 }, // 0 , 9 + { 0x05D1, 0x0901 }, // 1 , 9 + { 0x05D2, 0x0902 }, // 2 , 9 + { 0x05D3, 0x0903 }, // 3 , 9 + { 0x05D4, 0x0904 }, // 4 , 9 + { 0x05D5, 0x0905 }, // 5 , 9 + { 0x05D6, 0x0906 }, // 6 , 9 + { 0x05D7, 0x0907 }, // 7 , 9 + { 0x05D8, 0x0908 }, // 8 , 9 + { 0x05D9, 0x0909 }, // 9 , 9 + { 0x05DA, 0x090a }, // 10 , 9 + { 0x05DB, 0x090b }, // 11 , 9 + { 0x05DC, 0x090c }, // 12 , 9 + { 0x05DD, 0x090d }, // 13 , 9 + { 0x05DE, 0x090e }, // 14 , 9 + { 0x05DF, 0x090f }, // 15 , 9 + { 0x05E0, 0x0910 }, // 16 , 9 + { 0x05E1, 0x0911 }, // 17 , 9 + { 0x05E2, 0x0912 }, // 18 , 9 + { 0x05E3, 0x0913 }, // 19 , 9 + { 0x05E4, 0x0914 }, // 20 , 9 + { 0x05E5, 0x0915 }, // 21 , 9 + { 0x05E6, 0x0916 }, // 22 , 9 + { 0x05E7, 0x0917 }, // 23 , 9 + { 0x05E8, 0x0918 }, // 24 , 9 + { 0x05E9, 0x0919 }, // 25 , 9 + { 0x05EA, 0x091a }, // 26 , 9 + { 0x05F0, 0x0931 }, // 49 , 9 + { 0x05F1, 0x0932 }, // 50 , 9 + { 0x05F2, 0x0933 }, // 51 , 9 + { 0x05F3, 0x091e }, // 30 , 9 + { 0x05F4, 0x091f }, // 31 , 9 + { 0x060C, 0x0d26 }, // 38 , 13 + { 0x061B, 0x0d27 }, // 39 , 13 + { 0x061F, 0x0d28 }, // 40 , 13 + { 0x0621, 0x0da4 }, // 164, 13 + { 0x0622, 0x0db1 }, // 177, 13 + { 0x0623, 0x0da5 }, // 165, 
13 + { 0x0624, 0x0da9 }, // 169, 13 + { 0x0625, 0x0da7 }, // 167, 13 + { 0x0626, 0x0dab }, // 171, 13 + { 0x0627, 0x0d3a }, // 58 , 13 + { 0x0628, 0x0d3c }, // 60 , 13 + { 0x0629, 0x0d98 }, // 152, 13 + { 0x062A, 0x0d40 }, // 64 , 13 + { 0x062B, 0x0d44 }, // 68 , 13 + { 0x062C, 0x0d48 }, // 72 , 13 + { 0x062D, 0x0d4c }, // 76 , 13 + { 0x062E, 0x0d50 }, // 80 , 13 + { 0x062F, 0x0d54 }, // 84 , 13 + { 0x0630, 0x0d56 }, // 86 , 13 + { 0x0631, 0x0d58 }, // 88 , 13 + { 0x0632, 0x0d5a }, // 90 , 13 + { 0x0633, 0x0d5c }, // 92 , 13 + { 0x0634, 0x0d60 }, // 96 , 13 + { 0x0635, 0x0d64 }, // 100, 13 + { 0x0636, 0x0d68 }, // 104, 13 + { 0x0637, 0x0d6c }, // 108, 13 + { 0x0638, 0x0d70 }, // 112, 13 + { 0x0639, 0x0d74 }, // 116, 13 + { 0x063A, 0x0d78 }, // 120, 13 + { 0x0640, 0x0dc2 }, // 194, 13 + { 0x0641, 0x0d7c }, // 124, 13 + { 0x0642, 0x0d80 }, // 128, 13 + { 0x0643, 0x0d84 }, // 132, 13 + { 0x0644, 0x0d88 }, // 136, 13 + { 0x0645, 0x0d8c }, // 140, 13 + { 0x0646, 0x0d90 }, // 144, 13 + { 0x0647, 0x0d94 }, // 148, 13 + { 0x0648, 0x0d9a }, // 154, 13 + { 0x0649, 0x0da0 }, // 160, 13 + { 0x064A, 0x0d9c }, // 156, 13 + { 0x064B, 0x0d10 }, // 16 , 13 + { 0x064C, 0x0d11 }, // 17 , 13 + { 0x064E, 0x0d0a }, // 10 , 13 + { 0x064F, 0x0d0c }, // 12 , 13 + { 0x0650, 0x0d0e }, // 14 , 13 + { 0x0651, 0x0d16 }, // 22 , 13 + { 0x0652, 0x0d14 }, // 20 , 13 + { 0x0660, 0x0d38 }, // 56 , 13 + { 0x0661, 0x0d2f }, // 47 , 13 + { 0x0662, 0x0d30 }, // 48 , 13 + { 0x0663, 0x0d31 }, // 49 , 13 + { 0x0664, 0x0d32 }, // 50 , 13 + { 0x0665, 0x0d33 }, // 51 , 13 + { 0x0666, 0x0d34 }, // 52 , 13 + { 0x0667, 0x0d35 }, // 53 , 13 + { 0x0668, 0x0d36 }, // 54 , 13 + { 0x0669, 0x0d37 }, // 55 , 13 + { 0x066A, 0x0d2a }, // 42 , 13 + { 0x0671, 0x0db3 }, // 179, 13 + { 0x0674, 0x0d24 }, // 36 , 13 + { 0x0679, 0x0e3c }, // 60 , 14 + { 0x067A, 0x0e4c }, // 76 , 14 + { 0x067B, 0x0e30 }, // 48 , 14 + { 0x067C, 0x0e40 }, // 64 , 14 + { 0x067D, 0x0e48 }, // 72 , 14 + { 0x067E, 0x0e38 }, // 56 , 14 + { 0x067F, 
0x0e44 }, // 68 , 14 + { 0x0680, 0x0e34 }, // 52 , 14 + { 0x0681, 0x0e64 }, // 100, 14 + { 0x0683, 0x0e54 }, // 84 , 14 + { 0x0684, 0x0e50 }, // 80 , 14 + { 0x0685, 0x0e60 }, // 96 , 14 + { 0x0686, 0x0e58 }, // 88 , 14 + { 0x0687, 0x0e5c }, // 92 , 14 + { 0x0688, 0x0e68 }, // 104, 14 + { 0x0689, 0x0e6a }, // 106, 14 + { 0x068A, 0x0e70 }, // 112, 14 + { 0x068C, 0x0e6c }, // 108, 14 + { 0x068D, 0x0e72 }, // 114, 14 + { 0x068E, 0x0e6e }, // 110, 14 + { 0x0691, 0x0e76 }, // 118, 14 + { 0x0692, 0x0e7C }, // 124, 14 + { 0x0693, 0x0e74 }, // 116, 14 + { 0x0695, 0x0e7a }, // 122, 14 + { 0x0696, 0x0e80 }, // 128, 14 + { 0x0698, 0x0e7e }, // 126, 14 + { 0x0699, 0x0e78 }, // 120, 14 + { 0x069A, 0x0e84 }, // 132, 14 + { 0x06A0, 0x0e88 }, // 136, 14 + { 0x06A4, 0x0e8c }, // 140, 14 + { 0x06A6, 0x0e90 }, // 144, 14 + { 0x06A9, 0x0e94 }, // 148, 14 + { 0x06AA, 0x0e9c }, // 156, 14 + { 0x06AB, 0x0ea8 }, // 168, 14 + { 0x06AF, 0x0ea0 }, // 160, 14 + { 0x06B1, 0x0eac }, // 172, 14 + { 0x06B3, 0x0eb0 }, // 176, 14 + { 0x06B5, 0x0eb4 }, // 180, 14 + { 0x06BA, 0x0eba }, // 186, 14 + { 0x06BB, 0x0ec2 }, // 194, 14 + { 0x06BC, 0x0ebe }, // 190, 14 + { 0x06C0, 0x0eda }, // 218, 14 + { 0x06C6, 0x0ec6 }, // 198, 14 + { 0x06CA, 0x0ec8 }, // 200, 14 + { 0x06CE, 0x0ed0 }, // 208, 14 + { 0x06D1, 0x0ed6 }, // 214, 14 + { 0x06D2, 0x0ed4 }, // 212, 14 + { 0x06D6, 0x0d25 }, // 37 , 13 + { 0x06E4, 0x0d22 }, // 34 , 13 + { 0x06F4, 0x0e29 }, // 41 , 14 + { 0x06F5, 0x0e2b }, // 43 , 14 + { 0x06F6, 0x0e2c }, // 44 , 14 + { 0x06F7, 0x0e2e }, // 46 , 14 + { 0x06F8, 0x0e2f }, // 47 , 14 + { 0x10D0, 0x0ad2 }, // 210, 10 + { 0x10D1, 0x0ad3 }, // 211, 10 + { 0x10D2, 0x0ad4 }, // 212, 10 + { 0x10D3, 0x0ad5 }, // 213, 10 + { 0x10D4, 0x0ad6 }, // 214, 10 + { 0x10D5, 0x0ad7 }, // 215, 10 + { 0x10D6, 0x0ad8 }, // 216, 10 + { 0x10D7, 0x0ada }, // 218, 10 + { 0x10D8, 0x0adb }, // 219, 10 + { 0x10D9, 0x0adc }, // 220, 10 + { 0x10DA, 0x0add }, // 221, 10 + { 0x10DB, 0x0ade }, // 222, 10 + { 0x10DC, 0x0adf }, // 223, 
10 + { 0x10DD, 0x0ae1 }, // 225, 10 + { 0x10DE, 0x0ae2 }, // 226, 10 + { 0x10DF, 0x0ae3 }, // 227, 10 + { 0x10E0, 0x0ae4 }, // 228, 10 + { 0x10E1, 0x0ae5 }, // 229, 10 + { 0x10E2, 0x0ae6 }, // 230, 10 + { 0x10E3, 0x0ae7 }, // 231, 10 + { 0x10E4, 0x0ae9 }, // 233, 10 + { 0x10E5, 0x0aea }, // 234, 10 + { 0x10E6, 0x0aeb }, // 235, 10 + { 0x10E7, 0x0aec }, // 236, 10 + { 0x10E8, 0x0aed }, // 237, 10 + { 0x10E9, 0x0aee }, // 238, 10 + { 0x10EA, 0x0aef }, // 239, 10 + { 0x10EB, 0x0af0 }, // 240, 10 + { 0x10EC, 0x0af1 }, // 241, 10 + { 0x10ED, 0x0af2 }, // 242, 10 + { 0x10EE, 0x0af3 }, // 243, 10 + { 0x10EF, 0x0af5 }, // 245, 10 + { 0x10F0, 0x0af6 }, // 246, 10 + { 0x10F1, 0x0ad9 }, // 217, 10 + { 0x10F2, 0x0ae0 }, // 224, 10 + { 0x10F3, 0x0ae8 }, // 232, 10 + { 0x10F4, 0x0af4 }, // 244, 10 + { 0x10F5, 0x0af7 }, // 247, 10 + { 0x10F6, 0x0af8 }, // 248, 10 + { 0x1F00, 0x0873 }, // 115, 8 + { 0x1F01, 0x087b }, // 123, 8 + { 0x1F02, 0x0875 }, // 117, 8 + { 0x1F03, 0x087d }, // 125, 8 + { 0x1F04, 0x0874 }, // 116, 8 + { 0x1F05, 0x087c }, // 124, 8 + { 0x1F10, 0x0884 }, // 132, 8 + { 0x1F11, 0x0887 }, // 135, 8 + { 0x1F12, 0x0886 }, // 134, 8 + { 0x1F13, 0x0889 }, // 137, 8 + { 0x1F14, 0x0885 }, // 133, 8 + { 0x1F15, 0x0888 }, // 136, 8 + { 0x1F20, 0x0890 }, // 144, 8 + { 0x1F21, 0x0898 }, // 152, 8 + { 0x1F22, 0x0892 }, // 146, 8 + { 0x1F23, 0x089a }, // 154, 8 + { 0x1F24, 0x0891 }, // 145, 8 + { 0x1F25, 0x0899 }, // 153, 8 + { 0x1F30, 0x08a4 }, // 164, 8 + { 0x1F31, 0x08a8 }, // 168, 8 + { 0x1F32, 0x08a6 }, // 166, 8 + { 0x1F33, 0x08aa }, // 170, 8 + { 0x1F34, 0x08a5 }, // 165, 8 + { 0x1F35, 0x08a9 }, // 169, 8 + { 0x1F40, 0x08ad }, // 173, 8 + { 0x1F41, 0x08b0 }, // 176, 8 + { 0x1F42, 0x08af }, // 175, 8 + { 0x1F43, 0x08b2 }, // 178, 8 + { 0x1F44, 0x08ae }, // 174, 8 + { 0x1F45, 0x08b1 }, // 177, 8 + { 0x1F50, 0x08b9 }, // 185, 8 + { 0x1F51, 0x08bd }, // 189, 8 + { 0x1F52, 0x08bb }, // 187, 8 + { 0x1F53, 0x08bf }, // 191, 8 + { 0x1F54, 0x08ba }, // 186, 8 + { 0x1F55, 0x08be 
}, // 190, 8 + { 0x1F60, 0x08c7 }, // 199, 8 + { 0x1F61, 0x08cf }, // 207, 8 + { 0x1F62, 0x08c9 }, // 201, 8 + { 0x1F63, 0x08d1 }, // 209, 8 + { 0x1F64, 0x08c8 }, // 200, 8 + { 0x1F65, 0x08d0 }, // 208, 8 + { 0x1F70, 0x086d }, // 109, 8 + { 0x1F72, 0x0883 }, // 131, 8 + { 0x1F74, 0x088a }, // 138, 8 + { 0x1F76, 0x08a0 }, // 160, 8 + { 0x1F78, 0x08ac }, // 172, 8 + { 0x1F7A, 0x08b5 }, // 181, 8 + { 0x1F7C, 0x08c1 }, // 193, 8 + { 0x1F80, 0x0877 }, // 119, 8 + { 0x1F81, 0x087f }, // 127, 8 + { 0x1F82, 0x0879 }, // 121, 8 + { 0x1F83, 0x0881 }, // 129, 8 + { 0x1F84, 0x0878 }, // 120, 8 + { 0x1F85, 0x0880 }, // 128, 8 + { 0x1F90, 0x0894 }, // 148, 8 + { 0x1F91, 0x089c }, // 156, 8 + { 0x1F92, 0x0896 }, // 150, 8 + { 0x1F93, 0x089e }, // 158, 8 + { 0x1F94, 0x0895 }, // 149, 8 + { 0x1F95, 0x089d }, // 157, 8 + { 0x1FA0, 0x08cb }, // 203, 8 + { 0x1FA1, 0x08d3 }, // 211, 8 + { 0x1FA2, 0x08cd }, // 205, 8 + { 0x1FA3, 0x08d5 }, // 213, 8 + { 0x1FA4, 0x08cc }, // 204, 8 + { 0x1FA5, 0x08d4 }, // 212, 8 + { 0x1FB2, 0x0871 }, // 113, 8 + { 0x1FB3, 0x086f }, // 111, 8 + { 0x1FB4, 0x0870 }, // 112, 8 + { 0x1FC2, 0x088e }, // 142, 8 + { 0x1FC3, 0x088c }, // 140, 8 + { 0x1FC4, 0x088d }, // 141, 8 + { 0x1FCD, 0x085e }, // 94 , 8 + { 0x1FCE, 0x085c }, // 92 , 8 + { 0x1FDD, 0x085f }, // 95 , 8 + { 0x1FDE, 0x085d }, // 93 , 8 + { 0x1FE4, 0x08B4 }, // 180, 8 + { 0x1FE5, 0x08B3 }, // 179, 8 + { 0x1FF2, 0x08c5 }, // 197, 8 + { 0x1FF3, 0x08c3 }, // 195, 8 + { 0x1FF4, 0x08c4 }, // 196, 8 + { 0x2007, 0x0517 }, // 23 , 5 + { 0x2012, 0x0432 }, // 50 , 4 + { 0x2013, 0x0421 }, // 33 , 4 + { 0x2014, 0x0422 }, // 34 , 4 + { 0x2017, 0x022f }, // 47 , 2 + { 0x2018, 0x041d }, // 29 , 4 + { 0x2019, 0x041c }, // 28 , 4 + { 0x201A, 0x043e }, // 62 , 4 + { 0x201B, 0x041b }, // 27 , 4 + { 0x201C, 0x0420 }, // 32 , 4 + { 0x201D, 0x041f }, // 31 , 4 + { 0x201E, 0x043f }, // 63 , 4 + { 0x201F, 0x041e }, // 30 , 4 + { 0x2020, 0x0427 }, // 39 , 4 + { 0x2021, 0x0428 }, // 40 , 4 + { 0x2022, 0x0403 }, // 3 , 4 + { 
0x2026, 0x0438 }, // 56 , 4 + { 0x2030, 0x044b }, // 75 , 4 + { 0x2033, 0x0580 }, // 128, 5 + { 0x2034, 0x0671 }, // 113, 6 + { 0x2036, 0x057f }, // 127, 5 + { 0x2039, 0x0423 }, // 35 , 4 + { 0x203A, 0x0424 }, // 36 , 4 + { 0x203C, 0x050d }, // 13 , 5 + { 0x203E, 0x0626 }, // 38 , 6 + { 0x207F, 0x0415 }, // 21 , 4 + { 0x20A0, 0x043c }, // 60 , 4 + { 0x20A2, 0x043b }, // 59 , 4 + { 0x20A3, 0x043a }, // 58 , 4 + { 0x20A4, 0x043d }, // 61 , 4 + { 0x20A6, 0x0457 }, // 87 , 4 + { 0x20A7, 0x040d }, // 13 , 4 + { 0x20A8, 0x0458 }, // 88 , 4 + { 0x20A9, 0x0456 }, // 86 , 4 + { 0x20AA, 0x097A }, // 122, 9 + { 0x20AC, 0x0466 }, // 102, 4, Euro Sign - GW assigned x448 [4,72] + { 0x20DD, 0x066d }, // 109, 6 + { 0x20E1, 0x06e1 }, // 225, 6 + { 0x2102, 0x06d5 }, // 213, 6 + { 0x2104, 0x0515 }, // 21 , 5 + { 0x2105, 0x0449 }, // 73 , 4 + { 0x2106, 0x044a }, // 74 , 4 + { 0x210C, 0x06e9 }, // 233, 6 + { 0x210F, 0x0632 }, // 50 , 6 + { 0x2111, 0x0633 }, // 51 , 6 + { 0x2112, 0x0669 }, // 105, 6 + { 0x2113, 0x0631 }, // 49 , 6 + { 0x2115, 0x06d7 }, // 215, 6 + { 0x2116, 0x044c }, // 76 , 4 + { 0x2118, 0x0635 }, // 53 , 6 + { 0x211C, 0x0634 }, // 52 , 6 + { 0x211D, 0x06d8 }, // 216, 6 + { 0x211E, 0x042b }, // 43 , 4 + { 0x2120, 0x042a }, // 42 , 4 + { 0x2122, 0x0429 }, // 41 , 4 + { 0x2127, 0x06a7 }, // 167, 6 + { 0x2128, 0x066b }, // 107, 6 + { 0x212B, 0x0623 }, // 35 , 6 + { 0x212D, 0x066a }, // 106, 6 + { 0x212F, 0x0630 }, // 48 , 6 + { 0x2130, 0x06d3 }, // 211, 6 + { 0x2131, 0x06d4 }, // 212, 6 + { 0x2153, 0x0440 }, // 64 , 4 + { 0x2154, 0x0441 }, // 65 , 4 + { 0x215B, 0x0442 }, // 66 , 4 + { 0x215C, 0x0443 }, // 67 , 4 + { 0x215D, 0x0444 }, // 68 , 4 + { 0x215E, 0x0445 }, // 69 , 4 + { 0x2190, 0x0590 }, // 144, 5 + { 0x2191, 0x0617 }, // 23 , 6 + { 0x2192, 0x05d5 }, // 213, 5 + { 0x2193, 0x0618 }, // 24 , 6 + { 0x2194, 0x05d6 }, // 214, 5 + { 0x2195, 0x05d7 }, // 215, 5 + { 0x2196, 0x0640 }, // 64 , 6 + { 0x2197, 0x063e }, // 62 , 6 + { 0x2198, 0x063f }, // 63 , 6 + { 0x2199, 
0x0641 }, // 65 , 6 + { 0x219D, 0x0690 }, // 144, 6 + { 0x21A3, 0x0693 }, // 147, 6 + { 0x21A8, 0x050f }, // 15 , 5 + { 0x21A9, 0x0691 }, // 145, 6 + { 0x21AA, 0x0692 }, // 146, 6 + { 0x21B5, 0x0514 }, // 20 , 5 + { 0x21BC, 0x0694 }, // 148, 6 + { 0x21BD, 0x0695 }, // 149, 6 + { 0x21BE, 0x069b }, // 155, 6 + { 0x21BF, 0x069a }, // 154, 6 + { 0x21C0, 0x0696 }, // 150, 6 + { 0x21C1, 0x0697 }, // 151, 6 + { 0x21C2, 0x069d }, // 157, 6 + { 0x21C3, 0x069c }, // 156, 6 + { 0x21C4, 0x0636 }, // 54 , 6 + { 0x21C6, 0x0637 }, // 55 , 6 + { 0x21C7, 0x069f }, // 159, 6 + { 0x21C9, 0x069e }, // 158, 6 + { 0x21CB, 0x0699 }, // 153, 6 + { 0x21CC, 0x0698 }, // 152, 6 + { 0x21D0, 0x0639 }, // 57 , 6 + { 0x21D1, 0x063a }, // 58 , 6 + { 0x21D2, 0x0638 }, // 56 , 6 + { 0x21D3, 0x063b }, // 59 , 6 + { 0x21D4, 0x063c }, // 60 , 6 + { 0x21D5, 0x063d }, // 61 , 6 + { 0x21E6, 0x0597 }, // 151, 5 + { 0x21E8, 0x0596 }, // 150, 5 + { 0x2200, 0x067a }, // 122, 6 + { 0x2202, 0x062c }, // 44 , 6 + { 0x2203, 0x0679 }, // 121, 6 + { 0x2204, 0x06d0 }, // 208, 6 + { 0x2205, 0x0648 }, // 72 , 6 + { 0x2207, 0x062b }, // 43 , 6 + { 0x2208, 0x060f }, // 15 , 6 + { 0x2209, 0x06d1 }, // 209, 6 + { 0x220B, 0x06db }, // 219, 6 + { 0x220D, 0x0647 }, // 71 , 6 + { 0x220F, 0x0629 }, // 41 , 6 + { 0x2210, 0x0672 }, // 114, 6 + { 0x2211, 0x0612 }, // 18 , 6 + { 0x2212, 0x0600 }, // 0 , 6 + { 0x2213, 0x062a }, // 42 , 6 + { 0x2214, 0x06ae }, // 174, 6 + { 0x2215, 0x0606 }, // 6 , 6 + { 0x2216, 0x0607 }, // 7 , 6 + { 0x2218, 0x0621 }, // 33 , 6 + { 0x2219, 0x0622 }, // 34 , 6 + { 0x221A, 0x0704 }, // 4 , 7 + { 0x221D, 0x0604 }, // 4 , 6 + { 0x221E, 0x0613 }, // 19 , 6 + { 0x221F, 0x06da }, // 218, 6 + { 0x2220, 0x064f }, // 79 , 6 + { 0x2221, 0x06a8 }, // 168, 6 + { 0x2222, 0x06a9 }, // 169, 6 + { 0x2223, 0x0609 }, // 9 , 6 + { 0x2224, 0x06ce }, // 206, 6 + { 0x2225, 0x0611 }, // 17 , 6 + { 0x2226, 0x06cd }, // 205, 6 + { 0x2227, 0x0655 }, // 85 , 6 + { 0x2228, 0x0656 }, // 86 , 6 + { 0x2229, 0x0610 }, // 16 , 6 + 
{ 0x222A, 0x0642 }, // 66 , 6 + { 0x222B, 0x0628 }, // 40 , 6 + { 0x222E, 0x0668 }, // 104, 6 + { 0x2234, 0x0666 }, // 102, 6 + { 0x2235, 0x0665 }, // 101, 6 + { 0x2237, 0x0667 }, // 103, 6 + { 0x223C, 0x060c }, // 12 , 6 + { 0x2241, 0x06bd }, // 189, 6 + { 0x2243, 0x0673 }, // 115, 6 + { 0x2244, 0x06be }, // 190, 6 + { 0x2245, 0x0674 }, // 116, 6 + { 0x2247, 0x06bf }, // 191, 6 + { 0x2248, 0x060d }, // 13 , 6 + { 0x2249, 0x06c0 }, // 192, 6 + { 0x224D, 0x06b3 }, // 179, 6 + { 0x224E, 0x06b2 }, // 178, 6 + { 0x2250, 0x06af }, // 175, 6 + { 0x2252, 0x06b0 }, // 176, 6 + { 0x2253, 0x06b1 }, // 177, 6 + { 0x225F, 0x06d9 }, // 217, 6 + { 0x2260, 0x0663 }, // 99 , 6 + { 0x2261, 0x060e }, // 14 , 6 + { 0x2262, 0x0664 }, // 100, 6 + { 0x2264, 0x0602 }, // 2 , 6 + { 0x2265, 0x0603 }, // 3 , 6 + { 0x226A, 0x064d }, // 77 , 6 + { 0x226B, 0x064e }, // 78 , 6 + { 0x226C, 0x06b6 }, // 182, 6 + { 0x226D, 0x06cf }, // 207, 6 + { 0x226E, 0x06b9 }, // 185, 6 + { 0x226F, 0x06bb }, // 187, 6 + { 0x2270, 0x06ba }, // 186, 6 + { 0x2271, 0x06bc }, // 188, 6 + { 0x2272, 0x06eb }, // 235, 6 + { 0x2273, 0x06ec }, // 236, 6 + { 0x227A, 0x0675 }, // 117, 6 + { 0x227B, 0x0677 }, // 119, 6 + { 0x227C, 0x0676 }, // 118, 6 + { 0x227D, 0x0678 }, // 120, 6 + { 0x2280, 0x06c1 }, // 193, 6 + { 0x2281, 0x06c3 }, // 195, 6 + { 0x2282, 0x0643 }, // 67 , 6 + { 0x2283, 0x0644 }, // 68 , 6 + { 0x2284, 0x06c5 }, // 197, 6 + { 0x2285, 0x06c6 }, // 198, 6 + { 0x2286, 0x0645 }, // 69 , 6 + { 0x2287, 0x0646 }, // 70 , 6 + { 0x2288, 0x06c7 }, // 199, 6 + { 0x2289, 0x06c8 }, // 200, 6 + { 0x228A, 0x067e }, // 126, 6 + { 0x228B, 0x067f }, // 127, 6 + { 0x228E, 0x067d }, // 125, 6 + { 0x228F, 0x0682 }, // 130, 6 + { 0x2290, 0x0685 }, // 133, 6 + { 0x2291, 0x0683 }, // 131, 6 + { 0x2292, 0x0686 }, // 134, 6 + { 0x2293, 0x0680 }, // 128, 6 + { 0x2294, 0x0681 }, // 129, 6 + { 0x2295, 0x0651 }, // 81 , 6 + { 0x2296, 0x0652 }, // 82 , 6 + { 0x2297, 0x0650 }, // 80 , 6 + { 0x2299, 0x0654 }, // 84 , 6 + { 0x229A, 0x06a4 
}, // 164, 6 + { 0x229B, 0x06a5 }, // 165, 6 + { 0x229D, 0x06a6 }, // 166, 6 + { 0x22A2, 0x065b }, // 91 , 6 + { 0x22A3, 0x065c }, // 92 , 6 + { 0x22A4, 0x0658 }, // 88 , 6 + { 0x22A5, 0x0659 }, // 89 , 6 + { 0x22A8, 0x06b4 }, // 180, 6 + { 0x22BB, 0x0657 }, // 87 , 6 + { 0x22C5, 0x061f }, // 31 , 6 + { 0x22C6, 0x0670 }, // 112, 6 + { 0x22C8, 0x068c }, // 140, 6 + { 0x22D0, 0x06a2 }, // 162, 6 + { 0x22D1, 0x06a3 }, // 163, 6 + { 0x22D2, 0x06a1 }, // 161, 6 + { 0x22D3, 0x06a0 }, // 160, 6 + { 0x22D8, 0x067b }, // 123, 6 + { 0x22D9, 0x067c }, // 124, 6 + { 0x22E0, 0x06c2 }, // 194, 6 + { 0x22E1, 0x06c4 }, // 196, 6 + { 0x22E2, 0x06cb }, // 203, 6 + { 0x22E3, 0x06cc }, // 204, 6 + { 0x22E4, 0x0684 }, // 132, 6 + { 0x22E5, 0x0687 }, // 135, 6 + { 0x22EE, 0x06de }, // 222, 6 + { 0x22EF, 0x06dc }, // 220, 6 + { 0x22F1, 0x06df }, // 223, 6 + { 0x2302, 0x050c }, // 12 , 5 + { 0x2308, 0x0649 }, // 73 , 6 + { 0x2309, 0x064a }, // 74 , 6 + { 0x230A, 0x064b }, // 75 , 6 + { 0x230B, 0x064c }, // 76 , 6 + { 0x2310, 0x0510 }, // 16 , 5 + { 0x2312, 0x065a }, // 90 , 6 + { 0x2319, 0x0511 }, // 17 , 5 + { 0x231A, 0x051f }, // 31 , 5 + { 0x231B, 0x0520 }, // 32 , 5 + { 0x2320, 0x0700 }, // 0 , 7 + { 0x2321, 0x0701 }, // 1 , 7 + { 0x2322, 0x068e }, // 142, 6 + { 0x2323, 0x068d }, // 141, 6 + { 0x2329, 0x060a }, // 10 , 6 + { 0x232A, 0x060b }, // 11 , 6 + { 0x2409, 0x044f }, // 79 , 4 + { 0x240A, 0x0452 }, // 82 , 4 + { 0x240B, 0x0454 }, // 84 , 4 + { 0x240C, 0x0450 }, // 80 , 4 + { 0x240D, 0x0451 }, // 81 , 4 + { 0x2424, 0x0453 }, // 83 , 4 + { 0x24C2, 0x0446 }, // 70 , 4 + { 0x24C5, 0x0447 }, // 71 , 4 + { 0x24CA, 0x0448 }, // 72 , 4, - circled U + { 0x2500, 0x0308 }, // 8 , 3 + { 0x2502, 0x0309 }, // 9 , 3 + { 0x250C, 0x030a }, // 10 , 3 + { 0x2510, 0x030b }, // 11 , 3 + { 0x2514, 0x030d }, // 13 , 3 + { 0x2518, 0x030c }, // 12 , 3 + { 0x251C, 0x030e }, // 14 , 3 + { 0x251E, 0x033e }, // 62 , 3 + { 0x251F, 0x033c }, // 60 , 3 + { 0x2521, 0x033f }, // 63 , 3 + { 0x2522, 0x033d }, // 
61 , 3 + { 0x2524, 0x0310 }, // 16 , 3 + { 0x2526, 0x0345 }, // 69 , 3 + { 0x2527, 0x0344 }, // 68 , 3 + { 0x2529, 0x0347 }, // 71 , 3 + { 0x252A, 0x0346 }, // 70 , 3 + { 0x252C, 0x030f }, // 15 , 3 + { 0x252D, 0x0342 }, // 66 , 3 + { 0x252E, 0x0340 }, // 64 , 3 + { 0x2531, 0x0343 }, // 67 , 3 + { 0x2532, 0x0341 }, // 65 , 3 + { 0x2534, 0x0311 }, // 17 , 3 + { 0x2535, 0x034a }, // 74 , 3 + { 0x2536, 0x0348 }, // 72 , 3 + { 0x2539, 0x034b }, // 75 , 3 + { 0x253A, 0x0349 }, // 73 , 3 + { 0x253C, 0x0312 }, // 18 , 3 + { 0x253D, 0x0352 }, // 82 , 3 + { 0x253E, 0x034e }, // 78 , 3 + { 0x2540, 0x034f }, // 79 , 3 + { 0x2541, 0x034c }, // 76 , 3 + { 0x2543, 0x0355 }, // 85 , 3 + { 0x2544, 0x0350 }, // 80 , 3 + { 0x2545, 0x0353 }, // 83 , 3 + { 0x2546, 0x034d }, // 77 , 3 + { 0x2547, 0x0357 }, // 87 , 3 + { 0x2548, 0x0354 }, // 84 , 3 + { 0x2549, 0x0356 }, // 86 , 3 + { 0x254A, 0x0351 }, // 81 , 3 + { 0x2550, 0x0313 }, // 19 , 3 + { 0x2551, 0x0314 }, // 20 , 3 + { 0x2552, 0x031e }, // 30 , 3 + { 0x2553, 0x0322 }, // 34 , 3 + { 0x2554, 0x0315 }, // 21 , 3 + { 0x2555, 0x031f }, // 31 , 3 + { 0x2556, 0x0323 }, // 35 , 3 + { 0x2557, 0x0316 }, // 22 , 3 + { 0x2558, 0x0321 }, // 33 , 3 + { 0x2559, 0x0325 }, // 37 , 3 + { 0x255A, 0x0318 }, // 24 , 3 + { 0x255B, 0x0320 }, // 32 , 3 + { 0x255C, 0x0324 }, // 36 , 3 + { 0x255D, 0x0317 }, // 23 , 3 + { 0x255E, 0x0326 }, // 38 , 3 + { 0x255F, 0x032a }, // 42 , 3 + { 0x2560, 0x0319 }, // 25 , 3 + { 0x2561, 0x0328 }, // 40 , 3 + { 0x2562, 0x032c }, // 44 , 3 + { 0x2563, 0x031b }, // 27 , 3 + { 0x2564, 0x032b }, // 43 , 3 + { 0x2565, 0x0327 }, // 39 , 3 + { 0x2566, 0x031a }, // 26 , 3 + { 0x2567, 0x032d }, // 45 , 3 + { 0x2568, 0x0329 }, // 41 , 3 + { 0x2569, 0x031c }, // 28 , 3 + { 0x256A, 0x032f }, // 47 , 3 + { 0x256B, 0x032e }, // 46 , 3 + { 0x256C, 0x031d }, // 29 , 3 + { 0x2574, 0x0330 }, // 48 , 3 + { 0x2575, 0x0331 }, // 49 , 3 + { 0x2576, 0x0332 }, // 50 , 3 + { 0x2577, 0x0333 }, // 51 , 3 + { 0x2578, 0x0334 }, // 52 , 3 + { 
0x2579, 0x0335 }, // 53 , 3 + { 0x257A, 0x0336 }, // 54 , 3 + { 0x257B, 0x0337 }, // 55 , 3 + { 0x257C, 0x0338 }, // 56 , 3 + { 0x257D, 0x033a }, // 58 , 3 + { 0x257E, 0x0339 }, // 57 , 3 + { 0x257F, 0x033b }, // 59 , 3 + { 0x2580, 0x0305 }, // 5 , 3 + { 0x2584, 0x0307 }, // 7 , 3 + { 0x2588, 0x0303 }, // 3 , 3 + { 0x258C, 0x0304 }, // 4 , 3 + { 0x2590, 0x0306 }, // 6 , 3 + { 0x2591, 0x0300 }, // 0 , 3 + { 0x2592, 0x0301 }, // 1 , 3 + { 0x2593, 0x0302 }, // 2 , 3 + { 0x25A0, 0x0402 }, // 2 , 4 + { 0x25A1, 0x0426 }, // 38 , 4 + { 0x25AA, 0x042f }, // 47 , 4 + { 0x25AB, 0x0431 }, // 49 , 4 + { 0x25AC, 0x050b }, // 11 , 5 + { 0x25B2, 0x0573 }, // 115, 5 + { 0x25B3, 0x0688 }, // 136, 6 + { 0x25B4, 0x061d }, // 29 , 6 + { 0x25B5, 0x06ac }, // 172, 6 + { 0x25B8, 0x061b }, // 27 , 6 + { 0x25B9, 0x068b }, // 139, 6 + { 0x25BC, 0x0574 }, // 116, 5 + { 0x25BD, 0x0689 }, // 137, 6 + { 0x25BE, 0x061e }, // 30 , 6 + { 0x25BF, 0x06ad }, // 173, 6 + { 0x25C2, 0x061c }, // 28 , 6 + { 0x25C3, 0x068a }, // 138, 6 + { 0x25C6, 0x0575 }, // 117, 5 + { 0x25C7, 0x066f }, // 111, 6 + { 0x25CA, 0x065f }, // 95 , 6 + { 0x25CB, 0x0401 }, // 1 , 4 + { 0x25CF, 0x0400 }, // 0 , 4 + { 0x25D6, 0x059e }, // 158, 5 + { 0x25D7, 0x0577 }, // 119, 5 + { 0x25D8, 0x0512 }, // 18 , 5 + { 0x25D9, 0x0513 }, // 19 , 5 + { 0x25E6, 0x042d }, // 45 , 4 + { 0x2605, 0x0548 }, // 72, 5 + { 0x260E, 0x051e }, // 30 , 5 + { 0x2610, 0x0518 }, // 24 , 5 + { 0x2612, 0x0519 }, // 25 , 5 + { 0x261B, 0x052a }, // 42 , 5 + { 0x261C, 0x0516 }, // 22 , 5 + { 0x261E, 0x052b }, // 43 , 5 + { 0x2639, 0x051a }, // 26 , 5 + { 0x263A, 0x0507 }, // 7 , 5 + { 0x263B, 0x0508 }, // 8 , 5 + { 0x263C, 0x0506 }, // 6 , 5 + { 0x2640, 0x0505 }, // 5 , 5 + { 0x2642, 0x0504 }, // 4 , 5 + { 0x2660, 0x05ab }, // 171, 5 + { 0x2661, 0x0500 }, // 0 , 5 + { 0x2662, 0x0501 }, // 1 , 5 + { 0x2663, 0x05a8 }, // 168, 5 + { 0x2664, 0x0503 }, // 3 , 5 + { 0x2665, 0x05aa }, // 170, 5 + { 0x2666, 0x05a9 }, // 169, 5 + { 0x2667, 0x0502 }, // 2 , 5 + { 
0x266A, 0x0509 }, // 9 , 5 + { 0x266C, 0x050a }, // 10 , 5 + { 0x266D, 0x051c }, // 28 , 5 + { 0x266E, 0x051d }, // 29 , 5 + { 0x266F, 0x051b }, // 27 , 5 + { 0x2701, 0x0521 }, // 33 , 5 + { 0x2702, 0x0522 }, // 34 , 5 + { 0x2703, 0x0523 }, // 35 , 5 + { 0x2704, 0x0524 }, // 36 , 5 + { 0x2706, 0x0526 }, // 38 , 5 + { 0x2707, 0x0527 }, // 39 , 5 + { 0x2708, 0x0528 }, // 40 , 5 + { 0x2709, 0x0529 }, // 41 , 5 + { 0x270C, 0x052c }, // 44 , 5 + { 0x270D, 0x052d }, // 45 , 5 + { 0x270E, 0x052e }, // 46 , 5 + { 0x270F, 0x052f }, // 47 , 5 + { 0x2710, 0x0530 }, // 48 , 5 + { 0x2711, 0x0531 }, // 49 , 5 + { 0x2712, 0x0532 }, // 50 , 5 + { 0x2713, 0x0533 }, // 51 , 5 + { 0x2714, 0x0534 }, // 52 , 5 + { 0x2715, 0x0535 }, // 53 , 5 + { 0x2716, 0x0536 }, // 54 , 5 + { 0x2717, 0x0537 }, // 55 , 5 + { 0x2718, 0x0538 }, // 56 , 5 + { 0x2719, 0x0539 }, // 57 , 5 + { 0x271A, 0x053a }, // 58 , 5 + { 0x271B, 0x053b }, // 59 , 5 + { 0x271C, 0x053c }, // 60 , 5 + { 0x271D, 0x053d }, // 61 , 5 + { 0x271E, 0x053e }, // 62 , 5 + { 0x271F, 0x053f }, // 63 , 5 + { 0x2720, 0x0540 }, // 64 , 5 + { 0x2721, 0x0541 }, // 65 , 5 + { 0x2722, 0x0542 }, // 66 , 5 + { 0x2723, 0x0543 }, // 67 , 5 + { 0x2724, 0x0544 }, // 68 , 5 + { 0x2725, 0x0545 }, // 69 , 5 + { 0x2726, 0x0546 }, // 70 , 5 + { 0x2727, 0x0547 }, // 71 , 5 + { 0x2729, 0x0549 }, // 73 , 5 + { 0x272A, 0x054a }, // 74 , 5 + { 0x272B, 0x054b }, // 75 , 5 + { 0x272C, 0x054c }, // 76 , 5 + { 0x272D, 0x054d }, // 77 , 5 + { 0x272E, 0x054e }, // 78 , 5 + { 0x272F, 0x054f }, // 79 , 5 + { 0x2730, 0x0550 }, // 80 , 5 + { 0x2731, 0x0551 }, // 81 , 5 + { 0x2732, 0x0552 }, // 82 , 5 + { 0x2733, 0x0553 }, // 83 , 5 + { 0x2734, 0x0554 }, // 84 , 5 + { 0x2735, 0x0555 }, // 85 , 5 + { 0x2736, 0x0556 }, // 86 , 5 + { 0x2737, 0x0557 }, // 87 , 5 + { 0x2738, 0x0558 }, // 88 , 5 + { 0x2739, 0x0559 }, // 89 , 5 + { 0x273A, 0x055a }, // 90 , 5 + { 0x273B, 0x055b }, // 91 , 5 + { 0x273C, 0x055c }, // 92 , 5 + { 0x273D, 0x055d }, // 93 , 5 + { 0x273E, 0x055e 
}, // 94 , 5 + { 0x273F, 0x055f }, // 95 , 5 + { 0x2740, 0x0560 }, // 96 , 5 + { 0x2741, 0x0561 }, // 97 , 5 + { 0x2742, 0x0562 }, // 98 , 5 + { 0x2743, 0x0563 }, // 99 , 5 + { 0x2744, 0x0564 }, // 100, 5 + { 0x2745, 0x0565 }, // 101, 5 + { 0x2746, 0x0566 }, // 102, 5 + { 0x2747, 0x0567 }, // 103, 5 + { 0x2748, 0x0568 }, // 104, 5 + { 0x2749, 0x0569 }, // 105, 5 + { 0x274A, 0x056a }, // 106, 5 + { 0x274B, 0x056b }, // 107, 5 + { 0x274D, 0x056d }, // 109, 5 + { 0x274F, 0x056f }, // 111, 5 + { 0x2750, 0x0570 }, // 112, 5 + { 0x2751, 0x0571 }, // 113, 5 + { 0x2752, 0x0572 }, // 114, 5 + { 0x2756, 0x0576 }, // 118, 5 + { 0x2758, 0x0578 }, // 120, 5 + { 0x2759, 0x0579 }, // 121, 5 + { 0x275A, 0x057a }, // 122, 5 + { 0x275B, 0x057b }, // 123, 5 + { 0x275C, 0x057c }, // 124, 5 + { 0x275D, 0x057d }, // 125, 5 + { 0x275E, 0x057e }, // 126, 5 + { 0x2761, 0x05a1 }, // 161, 5 + { 0x2762, 0x05a2 }, // 162, 5 + { 0x2763, 0x05a3 }, // 163, 5 + { 0x2764, 0x05a4 }, // 164, 5 + { 0x2765, 0x05a5 }, // 165, 5 + { 0x2766, 0x05a6 }, // 166, 5 + { 0x2767, 0x05a7 }, // 167, 5 + { 0x2776, 0x05b6 }, // 182, 5 + { 0x2777, 0x05b7 }, // 183, 5 + { 0x2778, 0x05b8 }, // 184, 5 + { 0x2779, 0x05b9 }, // 185, 5 + { 0x277A, 0x05ba }, // 186, 5 + { 0x277B, 0x05bb }, // 187, 5 + { 0x277C, 0x05bc }, // 188, 5 + { 0x277D, 0x05bd }, // 189, 5 + { 0x277E, 0x05be }, // 190, 5 + { 0x277F, 0x05bf }, // 191, 5 + { 0x2780, 0x05c0 }, // 192, 5 + { 0x2781, 0x05c1 }, // 193, 5 + { 0x2782, 0x05c2 }, // 194, 5 + { 0x2783, 0x05c3 }, // 195, 5 + { 0x2784, 0x05c4 }, // 196, 5 + { 0x2785, 0x05c5 }, // 197, 5 + { 0x2786, 0x05c6 }, // 198, 5 + { 0x2787, 0x05c7 }, // 199, 5 + { 0x2788, 0x05c8 }, // 200, 5 + { 0x2789, 0x05c9 }, // 201, 5 + { 0x278A, 0x05ca }, // 202, 5 + { 0x278B, 0x05cb }, // 203, 5 + { 0x278C, 0x05cc }, // 204, 5 + { 0x278D, 0x05cd }, // 205, 5 + { 0x278E, 0x05ce }, // 206, 5 + { 0x278F, 0x05cf }, // 207, 5 + { 0x2790, 0x05d0 }, // 208, 5 + { 0x2791, 0x05d1 }, // 209, 5 + { 0x2792, 0x05d2 }, // 210, 5 + 
{ 0x2793, 0x05d3 }, // 211, 5 + { 0x2794, 0x05d4 }, // 212, 5 + { 0x2798, 0x05d8 }, // 216, 5 + { 0x2799, 0x05d9 }, // 217, 5 + { 0x279A, 0x05da }, // 218, 5 + { 0x279B, 0x05db }, // 219, 5 + { 0x279C, 0x05dc }, // 220, 5 + { 0x279D, 0x05dd }, // 221, 5 + { 0x279E, 0x05de }, // 222, 5 + { 0x279F, 0x05df }, // 223, 5 + { 0x27A0, 0x05e0 }, // 224, 5 + { 0x27A1, 0x05e1 }, // 225, 5 + { 0x27A2, 0x05e2 }, // 226, 5 + { 0x27A3, 0x05e3 }, // 227, 5 + { 0x27A4, 0x05e4 }, // 228, 5 + { 0x27A5, 0x05e5 }, // 229, 5 + { 0x27A6, 0x05e6 }, // 230, 5 + { 0x27A7, 0x05e7 }, // 231, 5 + { 0x27A8, 0x05e8 }, // 232, 5 + { 0x27A9, 0x05e9 }, // 233, 5 + { 0x27AA, 0x05ea }, // 234, 5 + { 0x27AB, 0x05eb }, // 235, 5 + { 0x27AC, 0x05ec }, // 236, 5 + { 0x27AD, 0x05ed }, // 237, 5 + { 0x27AE, 0x05ee }, // 238, 5 + { 0x27AF, 0x05ef }, // 239, 5 + { 0x27B1, 0x05f1 }, // 241, 5 + { 0x27B2, 0x05f2 }, // 242, 5 + { 0x27B3, 0x05f3 }, // 243, 5 + { 0x27B4, 0x05f4 }, // 244, 5 + { 0x27B5, 0x05f5 }, // 245, 5 + { 0x27B6, 0x05f6 }, // 246, 5 + { 0x27B7, 0x05f7 }, // 247, 5 + { 0x27B8, 0x05f8 }, // 248, 5 + { 0x27B9, 0x05f9 }, // 249, 5 + { 0x27BA, 0x05fa }, // 250, 5 + { 0x27BB, 0x05fb }, // 251, 5 + { 0x27BC, 0x05fc }, // 252, 5 + { 0x27BD, 0x05fd }, // 253, 5 + { 0x27BE, 0x05fe }, // 254, 5 + + // Range 0xE000 through 0xF8FF is reserved for private use. + // We cannot try to interpret characters in this range nor + // assign any default collation or meaning. 
+ + { 0xFB00, 0x0433 }, // 51 , 4 + { 0xFB01, 0x0436 }, // 54 , 4 + { 0xFB02, 0x0437 }, // 55 , 4 + { 0xFB03, 0x0434 }, // 52 , 4 + { 0xFB04, 0x0435 }, // 53 , 4 + { 0xFB1E, 0x0930 }, // 48 , 9 + { 0xFF61, 0x0b00 }, // 0 , 11 + { 0xFF62, 0x0b01 }, // 1 , 11 + { 0xFF63, 0x0b02 }, // 2 , 11 + { 0xFF64, 0x0b03 }, // 3 , 11 + { 0xFF65, 0x0b04 }, // 4 , 11 + { 0xFF66, 0x0b05 }, // 5 , 11 + { 0xFF67, 0x0b06 }, // 6 , 11 + { 0xFF68, 0x0b07 }, // 7 , 11 + { 0xFF69, 0x0b08 }, // 8 , 11 + { 0xFF6A, 0x0b09 }, // 9 , 11 + { 0xFF6B, 0x0b0a }, // 10 , 11 + { 0xFF6C, 0x0b0b }, // 11 , 11 + { 0xFF6D, 0x0b0c }, // 12 , 11 + { 0xFF6E, 0x0b0d }, // 13 , 11 + { 0xFF6F, 0x0b0e }, // 14 , 11 + { 0xFF70, 0x0b0f }, // 15 , 11 + { 0xFF71, 0x0b10 }, // 16 , 11 + { 0xFF72, 0x0b11 }, // 17 , 11 + { 0xFF73, 0x0b12 }, // 18 , 11 + { 0xFF74, 0x0b13 }, // 19 , 11 + { 0xFF75, 0x0b14 }, // 20 , 11 + { 0xFF76, 0x0b15 }, // 21 , 11 + { 0xFF77, 0x0b16 }, // 22 , 11 + { 0xFF78, 0x0b17 }, // 23 , 11 + { 0xFF79, 0x0b18 }, // 24 , 11 + { 0xFF7A, 0x0b19 }, // 25 , 11 + { 0xFF7B, 0x0b1a }, // 26 , 11 + { 0xFF7C, 0x0b1b }, // 27 , 11 + { 0xFF7D, 0x0b1c }, // 28 , 11 + { 0xFF7E, 0x0b1d }, // 29 , 11 + { 0xFF7F, 0x0b1e }, // 30 , 11 + { 0xFF80, 0x0b1f }, // 31 , 11 + { 0xFF81, 0x0b20 }, // 32 , 11 + { 0xFF82, 0x0b21 }, // 33 , 11 + { 0xFF83, 0x0b22 }, // 34 , 11 + { 0xFF84, 0x0b23 }, // 35 , 11 + { 0xFF85, 0x0b24 }, // 36 , 11 + { 0xFF86, 0x0b25 }, // 37 , 11 + { 0xFF87, 0x0b26 }, // 38 , 11 + { 0xFF88, 0x0b27 }, // 39 , 11 + { 0xFF89, 0x0b28 }, // 40 , 11 + { 0xFF8A, 0x0b29 }, // 41 , 11 + { 0xFF8B, 0x0b2a }, // 42 , 11 + { 0xFF8C, 0x0b2b }, // 43 , 11 + { 0xFF8D, 0x0b2c }, // 44 , 11 + { 0xFF8E, 0x0b2d }, // 45 , 11 + { 0xFF8F, 0x0b2e }, // 46 , 11 + { 0xFF90, 0x0b2f }, // 47 , 11 + { 0xFF91, 0x0b30 }, // 48 , 11 + { 0xFF92, 0x0b31 }, // 49 , 11 + { 0xFF93, 0x0b32 }, // 50 , 11 + { 0xFF94, 0x0b33 }, // 51 , 11 + { 0xFF95, 0x0b34 }, // 52 , 11 + { 0xFF96, 0x0b35 }, // 53 , 11 + { 0xFF97, 0x0b36 }, // 54 , 11 
+ { 0xFF98, 0x0b37 }, // 55 , 11 + { 0xFF99, 0x0b38 }, // 56 , 11 + { 0xFF9A, 0x0b39 }, // 57 , 11 + { 0xFF9B, 0x0b3a }, // 58 , 11 + { 0xFF9C, 0x0b3b }, // 59 , 11 + { 0xFF9D, 0x0b3c }, // 60 , 11 + { 0xFF9E, 0x0b3d }, // 61 , 11 + { 0xFF9F, 0x0b3e } // 62 , 11 +}; + +/**************************************************************************** +Desc: +****************************************************************************/ +#define shiftN(data,size,distance) \ + f_memmove((FLMBYTE *)(data) + (FLMINT)(distance), \ + (FLMBYTE *)(data), (size_t)(size)) + +/**************************************************************************** +Desc: +****************************************************************************/ +FINLINE FLMUINT bytesInBits( + FLMUINT uiBits) +{ + return( (uiBits + 7) >> 3); +} + +/**************************************************************************** +Desc: +****************************************************************************/ +FINLINE FLMBOOL testOneBit( + const FLMBYTE * pucBuf, + FLMUINT uiBit) +{ + return( (((pucBuf[ uiBit >> 3]) >> (7 - (uiBit & 7))) & 1) + ? TRUE + : FALSE); +} + +/**************************************************************************** +Desc: +****************************************************************************/ +FINLINE FLMUINT getNBits( + FLMUINT uiNumBits, + const FLMBYTE * pucBuf, + FLMUINT uiBit) +{ + return(((FLMUINT)( + ((FLMUINT)pucBuf[ uiBit >> 3] << 8) | // append high bits (byte 1) to ... + (FLMUINT)pucBuf[ (uiBit >> 3) + 1]) >> // ... 
overflow bits in 2nd byte + (16 - uiNumBits - (uiBit & 7))) & // reposition to low end of value + ((1 << uiNumBits) - 1)); // mask off high bits +} + +/**************************************************************************** +Desc: +****************************************************************************/ +FINLINE void setBit( + FLMBYTE * pucBuf, + FLMUINT uiBit) +{ + pucBuf[ uiBit >> 3] |= (FLMBYTE)(1 << (7 - (uiBit & 7))); +} + +/**************************************************************************** +Desc: +****************************************************************************/ +FINLINE void setBits( + FLMUINT uiCount, + FLMBYTE * pucBuf, + FLMUINT uiBit, + FLMUINT uiVal) +{ + pucBuf[ uiBit >> 3] |= // 1st byte + (FLMBYTE)((uiVal << (8 - uiCount)) // Align to bit 0 + >> + (uiBit & 7)); // Re-align to actual bit position + + pucBuf[ (uiBit >> 3) + 1] = // 2nd byte + (FLMBYTE)(uiVal + << + (16 - uiCount - (uiBit & 7))); // Align spill-over bits +} + +/**************************************************************************** +Desc: Returns TRUE if the character is upper case, FALSE if lower case. +****************************************************************************/ +FINLINE FLMBOOL charIsUpper( + FLMUINT16 ui16Char) +{ + return( (FLMBOOL)((ui16Char < 0x7F) + ? (FLMBOOL)((ui16Char >= ASCII_LOWER_A && + ui16Char <= ASCII_LOWER_Z) + ? (FLMBOOL)FALSE + : (FLMBOOL)TRUE) + : flmWPIsUpper( ui16Char))); +} + +/**************************************************************************** +Desc: getNextCharState can be thought of as a 2 dimentional array with + i and j as the row and column indicators respectively. If a value + exists at the intersection of i and j, it is returned. Sparse array + techniques are used to minimize memory usage. 
+ +Return: 0 = no valid next state + non-zero = valid next state, offset for action, or collating value +****************************************************************************/ +FINLINE FLMUINT16 getNextCharState( + FLMUINT i, + FLMUINT j) +{ + FLMUINT k, x; + + for( k = fwp_indexi[ x = + (i > START_COL) ? (START_ALL) : i ]; // adjust so don't use full tables + k <= (FLMUINT) (fwp_indexi[ x + 1] - 1); + k++ ) + { + // FIXUP_AREA_SIZE should be 24. + if( j == fwp_indexj[ k]) + { + return( fwp_valuea[ (i > START_COL) + ? (k + (FIXUP_AREA_SIZE * (i - START_ALL))) + : k]); + } + } + + return(0); +} + +/**************************************************************************** +Desc: Convert a Unicode character to its WP equivalent +Ret: Returns TRUE if the character could be converted +****************************************************************************/ +FLMBOOL FLMAPI f_unicodeToWP( + FLMUNICODE uUniChar, // Unicode character to convert + FLMUINT16 * pui16WPChar) // Returns 0 or WPChar converted. 
+{ + if( uUniChar <= 127) + { + // Character is in the ASCII conversion range + + *pui16WPChar = uUniChar; + return( TRUE); + } + + if( uUniChar < gv_uiMinUniChar || uUniChar > gv_uiMaxUniChar) + { + *pui16WPChar = 0; + return( FALSE); + } + + if( (*pui16WPChar = gv_pUnicodeToWP60[ uUniChar - gv_uiMinUniChar]) != 0) + { + return( TRUE); + } + + return( FALSE); +} + +/**************************************************************************** +Desc: Convert a WP character to its Unicode equivalent +****************************************************************************/ +RCODE FLMAPI f_wpToUnicode( + FLMUINT16 ui16WPChar, + FLMUNICODE * puUniChar) +{ + if( ui16WPChar <= 127) + { + // Character is in the ASCII conversion range + + *puUniChar = (FLMUNICODE)ui16WPChar; + return( NE_FLM_OK); + } + + if( ui16WPChar < gv_uiMinWPChar || ui16WPChar > gv_uiMaxWPChar) + { + return( RC_SET_AND_ASSERT( NE_FLM_CONV_ILLEGAL)); + } + + *puUniChar = gv_pWP60ToUnicode[ ui16WPChar - gv_uiMinWPChar]; + return( NE_FLM_OK); +} + +/**************************************************************************** +Desc: Reads the next character from the storage buffer +****************************************************************************/ +FINLINE RCODE flmGetCharFromUTF8Buf( + const FLMBYTE ** ppucBuf, + const FLMBYTE * pucEnd, + FLMUNICODE * puChar) +{ + const FLMBYTE * pucBuf = *ppucBuf; + FLMUINT uiMaxLen = pucEnd ? 
(FLMUINT)(pucEnd - *ppucBuf) : 3; + + if( !uiMaxLen) + { + *puChar = 0; + return( NE_FLM_OK); + } + + if( pucBuf[ 0] <= 0x7F) + { + if( (*puChar = (FLMUNICODE)pucBuf[ 0]) != 0) + { + (*ppucBuf)++; + } + return( NE_FLM_OK); + } + + if( uiMaxLen < 2 || (pucBuf[ 1] >> 6) != 0x02) + { + return( RC_SET( NE_FLM_BAD_UTF8)); + } + + if( (pucBuf[ 0] >> 5) == 0x06) + { + *puChar = + (FLMUNICODE)(((FLMUNICODE)( pucBuf[ 0] - 0xC0) << 6) + + (FLMUNICODE)(pucBuf[ 1] - 0x80)); + (*ppucBuf) += 2; + return( NE_FLM_OK); + } + + if( uiMaxLen < 3 || + (pucBuf[ 0] >> 4) != 0x0E || + (pucBuf[ 2] >> 6) != 0x02) + { + return( RC_SET( NE_FLM_BAD_UTF8)); + } + + *puChar = + (FLMUNICODE)(((FLMUNICODE)(pucBuf[ 0] - 0xE0) << 12) + + ((FLMUNICODE)(pucBuf[ 1] - 0x80) << 6) + + (FLMUNICODE)(pucBuf[ 2] - 0x80)); + (*ppucBuf) += 3; + + return( NE_FLM_OK); +} + +/**************************************************************************** +Desc: Convert a Unicode character to UTF-8 +*****************************************************************************/ +FINLINE RCODE flmUni2UTF8( + FLMUNICODE uChar, + FLMBYTE * pucBuf, + FLMUINT * puiBufSize) +{ + if( uChar <= 0x007F) + { + if( pucBuf) + { + if( *puiBufSize < 1) + { + return( RC_SET( NE_FLM_CONV_DEST_OVERFLOW)); + } + + *pucBuf = (FLMBYTE)uChar; + } + *puiBufSize = 1; + } + else if( uChar <= 0x07FF) + { + if( pucBuf) + { + if( *puiBufSize < 2) + { + return( RC_SET( NE_FLM_CONV_DEST_OVERFLOW)); + } + + *pucBuf++ = (FLMBYTE)(0xC0 | (FLMBYTE)(uChar >> 6)); + *pucBuf = (FLMBYTE)(0x80 | (FLMBYTE)(uChar & 0x003F)); + } + *puiBufSize = 2; + } + else + { + if( pucBuf) + { + if( *puiBufSize < 3) + { + return( RC_SET( NE_FLM_CONV_DEST_OVERFLOW)); + } + + *pucBuf++ = (FLMBYTE)(0xE0 | (FLMBYTE)(uChar >> 12)); + *pucBuf++ = (FLMBYTE)(0x80 | (FLMBYTE)((uChar & 0x0FC0) >> 6)); + *pucBuf = (FLMBYTE)(0x80 | (FLMBYTE)(uChar & 0x003F)); + } + *puiBufSize = 3; + } + + return( NE_FLM_OK); +} + 
+/**************************************************************************** +Desc: Reads the next UTF-8 character from a UTF-8 buffer +Notes: This routine assumes that the destination buffer can hold at least + three bytes +****************************************************************************/ +FINLINE RCODE flmGetUTF8CharFromUTF8Buf( + FLMBYTE ** ppucBuf, + FLMBYTE * pucEnd, + FLMBYTE * pucDestBuf, + FLMUINT * puiLen) +{ + FLMBYTE * pucBuf = *ppucBuf; + FLMUINT uiMaxLen = pucEnd ? (FLMUINT)(pucEnd - *ppucBuf) : 3; + + if( !uiMaxLen || !pucBuf[ 0]) + { + *puiLen = 0; + return( NE_FLM_OK); + } + + if( pucBuf[ 0] <= 0x7F) + { + *pucDestBuf = pucBuf[ 0]; + (*ppucBuf)++; + *puiLen = 1; + return( NE_FLM_OK); + } + + if( uiMaxLen < 2 || (pucBuf[ 1] >> 6) != 0x02) + { + return( RC_SET( NE_FLM_BAD_UTF8)); + } + + if( (pucBuf[ 0] >> 5) == 0x06) + { + pucDestBuf[ 0] = pucBuf[ 0]; + pucDestBuf[ 1] = pucBuf[ 1]; + (*ppucBuf) += 2; + *puiLen = 2; + return( NE_FLM_OK); + } + + if( uiMaxLen < 3 || + (pucBuf[ 0] >> 4) != 0x0E || + (pucBuf[ 2] >> 6) != 0x02) + { + return( RC_SET( NE_FLM_BAD_UTF8)); + } + + pucDestBuf[ 0] = pucBuf[ 0]; + pucDestBuf[ 1] = pucBuf[ 1]; + pucDestBuf[ 2] = pucBuf[ 2]; + (*ppucBuf) += 3; + *puiLen = 3; + + return( NE_FLM_OK); +} + +/**************************************************************************** +Desc: +****************************************************************************/ +FINLINE RCODE flmGetUTF8Length( + const FLMBYTE * pucBuf, + FLMUINT uiBufLen, + FLMUINT * puiBytes, + FLMUINT * puiChars) +{ + const FLMBYTE * pucStart = pucBuf; + const FLMBYTE * pucEnd = uiBufLen ? 
(pucStart + uiBufLen) : NULL; + FLMUINT uiChars = 0; + + if (!pucBuf) + { + goto Exit; + } + + while( (!pucEnd || pucBuf < pucEnd) && *pucBuf) + { + if( *pucBuf <= 0x7F) + { + pucBuf++; + uiChars++; + continue; + } + + if( (pucEnd && pucBuf + 1 >= pucEnd) || + (pucBuf[ 1] >> 6) != 0x02) + { + return( RC_SET( NE_FLM_BAD_UTF8)); + } + + if( ((*pucBuf) >> 5) == 0x06) + { + pucBuf += 2; + uiChars++; + continue; + } + + if( (pucEnd && pucBuf + 2 >= pucEnd) || + (pucBuf[ 0] >> 4) != 0x0E || + (pucBuf[ 2] >> 6) != 0x02) + { + return( RC_SET( NE_FLM_BAD_UTF8)); + } + + pucBuf += 3; + uiChars++; + } + +Exit: + + *puiChars = uiChars; + if (pucEnd && pucBuf == pucEnd) + { + *puiBytes = (FLMUINT)(pucBuf - pucStart); + } + else + { + // Hit a null byte + *puiBytes = (FLMUINT)(pucBuf - pucStart) + 1; + } + + return( NE_FLM_OK); +} + +/**************************************************************************** +Desc: Converts a character to upper case (if possible) +****************************************************************************/ +FLMUINT16 flmWPUpper( + FLMUINT16 ui16WpChar) +{ + if( ui16WpChar < 256) + { + if( ui16WpChar >= ASCII_LOWER_A && ui16WpChar <= ASCII_LOWER_Z) + { + // Return ASCII upper case + + return( ui16WpChar & 0xdf); + } + } + else + { + FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + + if( ucCharSet == CHSMUL1) + { + FLMBYTE ucChar = (FLMBYTE)(ui16WpChar & 0xFF); + + if( ucChar >= fwp_caseConvertableRange[ (CHSMUL1-1) * 2] && + ucChar <= fwp_caseConvertableRange[ ((CHSMUL1-1) * 2) + 1]) + { + return( ui16WpChar & 0xFFFE); + } + } + else if( ucCharSet == CHSGREK) + { + if( (ui16WpChar & 0xFF) <= + fwp_caseConvertableRange[ ((CHSGREK-1) * 2) + 1]) + { + return( ui16WpChar & 0xFFFE); + } + } + else if( ucCharSet == CHSCYR) + { + if( (ui16WpChar & 0xFF) <= + fwp_caseConvertableRange[ ((CHSCYR-1) * 2) + 1]) + { + return( ui16WpChar & 0xFFFE); + } + } + else if( ui16WpChar >= Lower_JP_a) + { + // Possible double byte character set alphabetic 
character? + + if( ui16WpChar <= Lower_JP_z) + { + // Japanese? + + ui16WpChar = (ui16WpChar - Lower_JP_a) + Upper_JP_A; + } + else if( ui16WpChar >= Lower_KR_a && ui16WpChar <= Lower_KR_z) + { + // Korean? + + ui16WpChar = (ui16WpChar - Lower_KR_a) + Upper_KR_A; + } + else if( ui16WpChar >= Lower_CS_a && ui16WpChar <= Lower_CS_z) + { + // Chinese Simplified? + + ui16WpChar = (ui16WpChar - Lower_CS_a) + Upper_CS_A; + } + else if( ui16WpChar >= Lower_CT_a && ui16WpChar <= Lower_CT_z) + { + // Chinese Traditional? + + ui16WpChar = (ui16WpChar - Lower_CT_a) + Upper_CT_A; + } + } + } + + // Return original character - original not in lower case. + + return( ui16WpChar); +} + +/**************************************************************************** +Desc: Checks to see if WP character is upper case +****************************************************************************/ +FLMBOOL flmWPIsUpper( + FLMUINT16 ui16WpChar) +{ + FLMBYTE ucChar; + FLMBYTE ucCharSet; + + // Get character + + ucChar = (FLMBYTE)(ui16WpChar & 0xFF); + + // Test if ASCII character set + + if( !(ui16WpChar & 0xFF00)) + { + return( (ucChar >= ASCII_LOWER_A && ucChar <= ASCII_LOWER_Z) + ? FALSE + : TRUE); + } + + // Get the character set + + ucCharSet = (FLMBYTE) (ui16WpChar >> 8); + + // CHSMUL1 == Multinational 1 character set + // CHSGREK == Greek character set + // CHSCYR == Cyrillic character set + + if( (ucCharSet == CHSMUL1 && ucChar >= 26 && ucChar <= 241) || + (ucCharSet == CHSGREK && ucChar <= 69) || + (ucCharSet == CHSCYR && ucChar <= 199)) + { + return( (ucChar & 1) ? 
FALSE : TRUE); + } + + // Don't care that double ss is lower + + return( TRUE); +} + +/**************************************************************************** +Desc: Converts a character to lower case (if possible) +****************************************************************************/ +FLMUINT16 flmWPLower( + FLMUINT16 ui16WpChar) +{ + if( ui16WpChar < 256) + { + if( ui16WpChar >= ASCII_UPPER_A && ui16WpChar <= ASCII_UPPER_Z) + { + return( ui16WpChar | 0x20); + } + } + else + { + FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + + if( ucCharSet == CHSMUL1) + { + FLMBYTE ucChar = (FLMBYTE)(ui16WpChar & 0xFF); + + if( ucChar >= fwp_caseConvertableRange[ (CHSMUL1-1) * 2] && + ucChar <= fwp_caseConvertableRange[ ((CHSMUL1-1) * 2) + 1] ) + { + return( ui16WpChar | 1); + } + } + else if( ucCharSet == CHSGREK) + { + if( (ui16WpChar & 0xFF) <= + fwp_caseConvertableRange[ ((CHSGREK-1) * 2) + 1]) + { + return( ui16WpChar | 1); + } + } + else if( ucCharSet == CHSCYR) + { + if( (ui16WpChar & 0xFF) <= + fwp_caseConvertableRange[ ((CHSCYR-1) * 2) + 1]) + { + return( ui16WpChar | 1); + } + } + else if( ui16WpChar >= Upper_JP_A) + { + // Possible double byte character set alphabetic character? + + if( ui16WpChar <= Upper_JP_Z) + { + // Japanese? + + ui16WpChar = ui16WpChar - Upper_JP_A + Lower_JP_a; + } + else if( ui16WpChar >= Upper_KR_A && ui16WpChar <= Upper_KR_Z) + { + // Korean? + + ui16WpChar = ui16WpChar - Upper_KR_A + Lower_KR_a; + } + else if( ui16WpChar >= Upper_CS_A && ui16WpChar <= Upper_CS_Z) + { + // Chinese Simplified? + + ui16WpChar = ui16WpChar - Upper_CS_A + Lower_CS_a; + } + else if( ui16WpChar >= Upper_CT_A && ui16WpChar <= Upper_CT_Z) + { + // Chinese Traditional? 
+ + ui16WpChar = ui16WpChar - Upper_CT_A + Lower_CT_a; + } + } + } + + // Return original character, original not in upper case + + return( ui16WpChar); +} + +/**************************************************************************** +Desc: Break a WP character into a base and a diacritical char. +Ret: TRUE - if not found + FALSE - if found +****************************************************************************/ +FLMBOOL flmWPBrkcar( + FLMUINT16 ui16WpChar, + FLMUINT16 * pui16BaseChar, + FLMUINT16 * pui16DiacriticChar) +{ + BASE_DIACRIT * pBaseDiacritic; + FLMINT iTableIndex; + + if( (pBaseDiacritic = fwp_car60_c[ HI(ui16WpChar)]) == 0) + { + return( TRUE); + } + + iTableIndex = ((FLMBYTE)ui16WpChar) - pBaseDiacritic->start_char; + if( iTableIndex < 0 || + iTableIndex > pBaseDiacritic->char_count || + pBaseDiacritic->table [iTableIndex].base == (FLMBYTE)0xFF) + { + return( TRUE); + } + + if( (HI( ui16WpChar) != CHSMUL1) || + ((fwp_ml1_cb60[ ((FLMBYTE) ui16WpChar) >> 3] >> + (7 - (ui16WpChar & 0x07))) & 0x01)) + { + + // normal case, same base as same as characters + + *pui16BaseChar = (ui16WpChar & 0xFF00) | + pBaseDiacritic->table [iTableIndex].base; + *pui16DiacriticChar = (ui16WpChar & 0xFF00) | + pBaseDiacritic->table[iTableIndex].diacrit; + } + else + { + + // Multi-national where base is ascii value. 
+ + *pui16BaseChar = pBaseDiacritic->table [iTableIndex].base; + *pui16DiacriticChar = (ui16WpChar & 0xFF00) | + pBaseDiacritic->table[iTableIndex].diacrit; + } + + return( FALSE); +} + +/************************************************************************** +Desc: Find the collating value of a WP character +ret: Collating value (COLS0 is high value - undefined WP char) +***********************************************************************/ +FLMUINT16 flmWPGetCollation( + FLMUINT16 ui16WpChar, + FLMUINT uiLanguage) +{ + FLMUINT16 ui16State; + FLMBYTE ucCharVal; + FLMBYTE ucCharSet; + FLMBOOL bHebrewArabicFlag = FALSE; + TBL_B_TO_BP * pColTbl = fwp_col60Tbl; + + // State ONLY for non-US + + if( uiLanguage != FLM_US_LANG) + { + if( uiLanguage == FLM_AR_LANG || // Arabic + uiLanguage == FLM_FA_LANG || // Farsi - persian + uiLanguage == FLM_HE_LANG || // Hebrew + uiLanguage == FLM_UR_LANG) // Urdu + { + pColTbl = fwp_HebArabicCol60Tbl; + bHebrewArabicFlag = TRUE; + } + else + { + // check if uiLanguage candidate for alternate double collating + + ui16State = getNextCharState( START_COL, uiLanguage); + if( 0 != (ui16State = getNextCharState( (ui16State + ? ui16State // look at special case languages + : START_ALL), // look at US and European + (FLMUINT) ui16WpChar))) + { + return( ui16State); + } + } + } + + ucCharVal = (FLMBYTE)ui16WpChar; + ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + + do + { + if( pColTbl->key == ucCharSet) + { + FLMBYTE * pucColVals; // table of collating values + + pucColVals = pColTbl->charPtr; + + // Check if the value is in the range of collated chars + // Above lower range of table? + + if (ucCharVal >= *pucColVals) + { + // Make value zero based to index + + ucCharVal -= *pucColVals++; + + // Below maximum number of table entries? + + if( ucCharVal < *pucColVals++) + { + // Return collated value. 
+ + return( pucColVals[ ucCharVal]); + } + } + } + + // Go to next table entry + + pColTbl++; + } while( pColTbl->key != 0xFF); + + if( bHebrewArabicFlag) + { + if( ucCharSet == CHSHEB || + ucCharSet == CHSARB1 || + ucCharSet == CHSARB2) + { + // Same as COLS0_HEBREW + + return( COLS0_ARABIC); + } + } + + // Defaults for characters that don't have a collation value. + + return( COLS0); +} + +/**************************************************************************** +Desc: Check for double characters that sort as 1 (like ch in Spanish) or + 1 character that should sort as 2 (like ? sorts as ae in French). +Return: 0 = nothing changes + 1 if sorting 2 characters as 1 - *pui16WpChar is the one character. + second character value if 1 character sorts as 2, + *pui16WpChar changes to first character in sequence +****************************************************************************/ +RCODE FLMAPI f_wpCheckDoubleCollation( + IF_PosIStream * pIStream, + FLMBOOL bUnicodeStream, + FLMBOOL bAllowTwoIntoOne, + FLMUNICODE * puzChar, + FLMUNICODE * puzChar2, + FLMBOOL * pbTwoIntoOne, + FLMUINT uiLanguage) +{ + RCODE rc = NE_FLM_OK; + FLMUINT16 ui16CurState; + FLMUINT16 ui16WpChar; + FLMUNICODE uzLastChar = 0; + FLMUNICODE uChar = *puzChar; + FLMUNICODE uDummy; + FLMBOOL bUpperFlag; + FLMUINT64 ui64SavePosition = pIStream->getCurrPosition(); + + if (!f_unicodeToWP( *puzChar, &ui16WpChar)) + { + ui16WpChar = UNK_UNICODE_CODE; + } + bUpperFlag = flmWPIsUpper( ui16WpChar); + + if ((ui16CurState = getNextCharState( 0, uiLanguage)) == 0) + { + *pbTwoIntoOne = FALSE; + *puzChar2 = 0; + goto Exit; + } + + for (;;) + { + switch (ui16CurState) + { + case INSTSG: + *puzChar = *puzChar2 = (FLMUNICODE)f_toascii( 's'); + *pbTwoIntoOne = FALSE; + goto Exit; + case INSTAE: + if (bUpperFlag) + { + *puzChar = (FLMUNICODE)f_toascii( 'A'); + *puzChar2 = (FLMUNICODE)f_toascii( 'E'); + } + else + { + *puzChar = (FLMUNICODE)f_toascii( 'a'); + *puzChar2 = (FLMUNICODE)f_toascii( 'e'); + } + 
*pbTwoIntoOne = FALSE; + goto Exit; + case INSTIJ: + if (bUpperFlag) + { + *puzChar = (FLMUNICODE)f_toascii( 'I'); + *puzChar2 = (FLMUNICODE)f_toascii( 'J'); + } + else + { + *puzChar = (FLMUNICODE)f_toascii( 'i'); + *puzChar2 = (FLMUNICODE)f_toascii( 'j'); + } + *pbTwoIntoOne = FALSE; + goto Exit; + case INSTOE: + if (bUpperFlag) + { + *puzChar = (FLMUNICODE)f_toascii( 'O'); + *puzChar2 = (FLMUNICODE)f_toascii( 'E'); + } + else + { + *puzChar = (FLMUNICODE)f_toascii( 'o'); + *puzChar2 = (FLMUNICODE)f_toascii( 'e'); + } + *pbTwoIntoOne = FALSE; + goto Exit; + case WITHAA: + *puzChar = (FLMUNICODE)(bUpperFlag + ? (FLMUNICODE)0xC5 + : (FLMUNICODE)0xE5); + + if (RC_BAD( rc = pIStream->positionTo( ui64SavePosition))) + { + goto Exit; + } + + if( bUnicodeStream) + { + rc = pIStream->read( &uDummy, sizeof( FLMUNICODE), NULL); + } + else + { + rc = f_readUTF8CharAsUnicode( pIStream, &uDummy); + } + + if( RC_BAD( rc)) + { + if (rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + } + else + { + goto Exit; + } + } + + ui64SavePosition = pIStream->getCurrPosition(); + break; + case AFTERC: + *puzChar = (FLMUINT16)(bUpperFlag + ? (FLMUNICODE)f_toascii( 'C') + : (FLMUNICODE)f_toascii( 'c')); +Position_After_2nd: + + if( bAllowTwoIntoOne) + { + *puzChar2 = uzLastChar; + *pbTwoIntoOne = TRUE; + + if (RC_BAD( rc = pIStream->positionTo( ui64SavePosition))) + { + goto Exit; + } + + if( bUnicodeStream) + { + rc = pIStream->read( &uChar, sizeof( FLMUNICODE), NULL); + } + else + { + rc = f_readUTF8CharAsUnicode( pIStream, &uChar); + } + + if (RC_BAD( rc)) + { + if (rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + } + else + { + goto Exit; + } + } + + ui64SavePosition = pIStream->getCurrPosition(); + } + goto Exit; + case AFTERH: + *puzChar = (FLMUINT16)(bUpperFlag + ? (FLMUNICODE)f_toascii( 'H') + : (FLMUNICODE)f_toascii( 'h')); + goto Position_After_2nd; + case AFTERL: + *puzChar = (FLMUINT16)(bUpperFlag + ? 
(FLMUNICODE)f_toascii( 'L') + : (FLMUNICODE)f_toascii( 'l')); + goto Position_After_2nd; + default: + // Handles STATE1 through STATE11 also + break; + } + + if ((ui16CurState = getNextCharState( ui16CurState, + flmWPLower( ui16WpChar))) == 0) + { + break; + } + + uzLastChar = uChar; + + if( bUnicodeStream) + { + rc = pIStream->read( &uChar, sizeof( FLMUNICODE), NULL); + } + else + { + rc = f_readUTF8CharAsUnicode( pIStream, &uChar); + } + + if (RC_BAD( rc)) + { + if (rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + } + else + { + goto Exit; + } + } + + if (!f_unicodeToWP( uChar, &ui16WpChar)) + { + ui16WpChar = UNK_UNICODE_CODE; + } + } + +Exit: + + if (RC_OK( rc)) + { + rc = pIStream->positionTo( ui64SavePosition); + } + + return( rc); +} + +/**************************************************************************** +Desc: Returns the collation value of the input WP character. + If in charset 11 will convert the character to Zenkaku (double wide). +In: ui16WpChar - Char to collate off of - could be in CS0..14 or x24..up + ui16NextWpChar - next WP char for CS11 voicing marks + ui16PrevColValue - previous collating value - for repeat/vowel repeat + pui16ColValue - returns 2 byte collation value + pui16SubColVal - 0, 6 or 16 bit value for the latin sub collation + or the kana size & vowel voicing + 001 - set if large (upper) character + 010 - set if voiced + 100 - set if half voiced + + pucCaseBits - returns 2 bits + Latin/Greek/Cyrillic + 01 - case bit set if character is uppercase + 10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx + Japanese + 00 - double wide hiragana 0x255e..25b0 + 01 - double wide katakana 0x2600..2655 + 10 - double wide symbols that map to charset 11 + 11 - single wide katakana from charset 11 +Ret: 0 - no valid collation value + high values set for pui16ColValue + Sub-collation gets original WP character value + 1 - valid collation value + 2 - valid collation value and used the ui16NextWpChar + +Notes: Code taken from XCH2COL.ASM - 
routine xch2col_f + also from CMPWS.ASM - routine getcase +Terms: + HANKAKU - single wide characters in charsets 0..14 + ZENKAKU - double wide characters in charsets 0x24..end of kanji + KANJI - collation values are 0x2900 less than WPChar value + +****************************************************************************/ +FLMUINT16 flmWPAsiaGetCollation( + FLMUINT16 ui16WpChar, // WP char to get collation values + FLMUINT16 ui16NextWpChar, // Next WP char - for CS11 voicing marks + FLMUINT16 ui16PrevColValue, // Previous collating value + FLMUINT16 * pui16ColValue, // Returns collation value + FLMUINT16 * pui16SubColVal, // Returns sub-collation value + FLMBYTE * pucCaseBits, // Returns case bits value + FLMBOOL bUppercaseFlag) // Set if to convert to uppercase +{ + FLMUINT16 ui16ColValue; + FLMUINT16 ui16SubColVal; + FLMBYTE ucCaseBits = 0; + FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + FLMBYTE ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF); + FLMUINT16 ui16Hankaku; + FLMUINT uiLoop; + FLMUINT16 ui16ReturnValue = 1; + + ui16ColValue = ui16SubColVal = 0; + + // Kanji or above + + if( ucCharSet >= 0x2B) + { + // Puts 2 or above into high byte. + + ui16ColValue = ui16WpChar - 0x2900; + + // No subcollation or case bits need to be set + + goto Exit; + } + + // Single wide character? (HANKAKU) + + if( ucCharSet < 11) + { + // Get the values from a non-asian character + // LATIN, GREEK or CYRILLIC + // The width bit may have been set on a jump to + // label from below. + +Latin_Greek_Cyrillic: + + // YES: Pass FLM_US_LANG because this is what we want - + // Prevents double character sorting. + + ui16ColValue = flmWPGetCollation( ui16WpChar, FLM_US_LANG); + + if (bUppercaseFlag || flmWPIsUpper( ui16WpChar)) + { + // Uppercase - set case bit + + ucCaseBits |= SET_CASE_BIT; + } + + // Character for which there is no collation value? 
+ + if( ui16ColValue == COLS0) + { + ui16ReturnValue = 0; + if( !flmWPIsUpper( ui16WpChar)) + { + // Convert to uppercase + + ui16WpChar--; + } + ui16ColValue = 0xFFFF; + ui16SubColVal = ui16WpChar; + } + else if( ucCharSet) // Don't bother with ascii + { + if( !flmWPIsUpper( ui16WpChar)) + { + // Convert to uppercase + + ui16WpChar--; + } + + if( ucCharSet == CHSMUL1) + { + FLMUINT16 ui16Base; + FLMUINT16 ui16Diacritic; + + ui16SubColVal = !flmWPBrkcar( ui16WpChar, &ui16Base, + &ui16Diacritic) + ? fwp_dia60Tbl[ ui16Diacritic & 0xFF] + : ui16WpChar; + } + else if( ucCharSet == CHSGREK) // GREEK + { + if( ui16WpChar >= 0x834 || // [8,52] or above + ui16WpChar == 0x804 || // [8,4] BETA Medial | Terminal + ui16WpChar == 0x826) // [8,38] SIGMA terminal + { + ui16SubColVal = ui16WpChar; + } + } + else if( ucCharSet == CHSCYR) // CYRILLIC + { + if( ui16WpChar >= 0xA90) // [10, 144] or above + { + ui16SubColVal = ui16WpChar; // Dup collation values + } + } + // else don't need a sub collation value + } + goto Exit; + } + + // Single wide Japanese character? + + if( ucCharSet == 11) + { + FLMUINT16 ui16KanaChar; + + // Convert charset 11 to Zenkaku (double wide) CS24 or CS26 hex. + // All characters in charset 11 will convert to CS24 or CS26. + // when combining the collation and the sub-collation values. + + if( flmWPHanToZenkaku( ui16WpChar, + ui16NextWpChar, &ui16KanaChar ) == 2) + { + // Return 2 + + ui16ReturnValue++; + } + + ucCaseBits |= SET_WIDTH_BIT; // Set so will allow to go back + ui16WpChar = ui16KanaChar; // If in CS24 will fall through to ZenKaku + ucCharSet = (FLMBYTE)(ui16KanaChar >> 8); + ucCharVal = (FLMBYTE)(ui16KanaChar & 0xFF); + } + + if( ui16WpChar < 0x2400) + { + // In some other character set + + goto Latin_Greek_Cyrillic; + } + else if( ui16WpChar >= 0x255e && // Hiragana? + ui16WpChar <= 0x2655) // Katakana? 
+ { + if( ui16WpChar >= 0x2600) + { + ucCaseBits |= SET_KATAKANA_BIT; + } + + // HIRAGANA & KATAKANA + // Kana contains both hiragana and katakana. + // The tables contain the same characters in same order + + if( ucCharSet == 0x25) + { + // Change value to be in character set 26 + + ucCharVal -= 0x5E; + } + + ui16ColValue = 0x0100 + KanaColTbl[ ucCharVal ]; + ui16SubColVal = KanaSubColTbl[ ucCharVal ]; + goto Exit; + } + + // ZenKaku - means any double wide character + // Hankaku - single wide character + + // Inputs: 0x2400..2559 symbols..latin - Zenkaku + // 0x265B..2750 greek..cyrillic - Zenkaku + + // SET_WIDTH_BIT may have been set if original char + // was in 11 and got converted to CS24. [1,2,5,27(extendedVowel),53,54] + // Original chars from CS11 will have some collation value that when + // combined with the sub-collation value will format a character in + // CS24. The width bit will then convert back to CS11. + + if( (ui16Hankaku = flmWPZenToHankaku( ui16WpChar, NULL)) != 0) + { + if( (ui16Hankaku >> 8) != 11) // if CharSet11 was a CS24 symbol + { + ui16WpChar = ui16Hankaku; // May be CS24 symbol/latin/gk/cy + ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF); + ucCaseBits |= SET_WIDTH_BIT; // Latin symbols double wide + goto Latin_Greek_Cyrillic; + } + } + + // 0x2400..0x24bc Japanese symbols that cannot be converted to Hankaku. + // All 6 original symbol chars from 11 will also be here. + // First try to find a collation value of the symbol. + // The sub-collation value will be the position in the CS24 table + 1. 
+ + for( uiLoop = 0; + uiLoop < (sizeof( fwp_Ch24ColTbl) / sizeof( BYTE_WORD_TBL)); + uiLoop++ ) + { + if( ucCharVal == fwp_Ch24ColTbl[ uiLoop].ByteValue) + { + if( (ui16ColValue = fwp_Ch24ColTbl[ uiLoop].WordValue) < 0x100) + { + // Don't save for chuuten, dakuten, handakuten + + ui16SubColVal = (FLMUINT16)(uiLoop + 1); + } + break; + } + } + + if( !ui16ColValue) + { + // Now see if it's a repeat or repeat-vowel character + + if( (((ucCharVal >= 0x12) && (ucCharVal <= 0x15)) || + (ucCharVal == 0x17) || + (ucCharVal == 0x18)) && + ((ui16PrevColValue >> 8) == 1)) + { + ui16ColValue = ui16PrevColValue; + + // Store original WP character + + ui16SubColVal = ui16WpChar; + } + else if( (ucCharVal == 0x1B) && // repeat vowel? + (ui16PrevColValue >= 0x100) && + (ui16PrevColValue < COLS_ASIAN_MARKS)) // Previous kana char? + { + ui16ColValue = 0x0100 + KanaColToVowel[ ui16PrevColValue & 0xFF ]; + + // Store original WP character + + ui16SubColVal = ui16WpChar; + } + else + { + ui16ReturnValue = 0; + ui16ColValue = 0xFFFF; // No collation value + ui16SubColVal = ui16WpChar; // Never have changed if gets here + } + } + +Exit: + + // Set return values + + *pui16ColValue = ui16ColValue; + *pui16SubColVal = ui16SubColVal; + *pucCaseBits = ucCaseBits; + + return( ui16ReturnValue); +} + +/**************************************************************************** +Desc: Convert a zenkaku (double wide) char to a hankaku (single wide) char +Ret: Hankaku char or 0 if a conversion doesn't exist +Notes: Taken from CHAR.ASM - zen2han_f routine +****************************************************************************/ +FSTATIC FLMUINT16 flmWPZenToHankaku( + FLMUINT16 ui16WpChar, + FLMUINT16 * pui16DakutenOrHandakuten) +{ + FLMUINT16 ui16Hankaku = 0; + FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + FLMBYTE ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF); + FLMUINT uiLoop; + + switch( ucCharSet) + { + // SYMBOLS + + case 0x24: + { + for( uiLoop = 0; + uiLoop < (sizeof( Zen24ToHankaku) / 
sizeof( BYTE_WORD_TBL)); + uiLoop++) + { + // List is sorted so table entry is more you are done + + if( Zen24ToHankaku [uiLoop].ByteValue >= ucCharVal) + { + if( Zen24ToHankaku [uiLoop].ByteValue == ucCharVal) + { + ui16Hankaku = Zen24ToHankaku [uiLoop].WordValue; + } + break; + } + } + break; + } + + // ROMAN - 0x250F..2559 + // Hiragana - 0x255E..2580 + + case 0x25: + { + if( ucCharVal >= 0x0F && ucCharVal < 0x5E) + { + ui16Hankaku = ucCharVal + 0x21; + } + break; + } + + // Katakana - 0x2600..2655 + // Greek - 0x265B..2695 + + case 0x26: + { + if( ucCharVal <= 0x55) // Katakana range + { + FLMBYTE ucCS11CharVal; + FLMUINT16 ui16NextWpChar = 0; + + if( (ucCS11CharVal = MapCS26ToCharSet11[ ucCharVal ]) != 0xFF) + { + if( ucCS11CharVal & 0x80) + { + if( ucCS11CharVal & 0x40) + { + // Handakuten voicing + + ui16NextWpChar = 0xB3E; + } + else + { + // Dakuten voicing + + ui16NextWpChar = 0xB3D; + } + ucCS11CharVal &= 0x3F; + } + ui16Hankaku = 0x0b00 + ucCS11CharVal; + if( ui16NextWpChar && pui16DakutenOrHandakuten) + { + *pui16DakutenOrHandakuten = ui16NextWpChar; + } + } + } + else if( ucCharVal <= 0x95) // Greek + { + FLMBYTE ucGreekChar = ucCharVal; + + // Make a zero based number. + + ucGreekChar -= 0x5E; + + // Check for lowercase + if( ucGreekChar >= 0x20) + { + // Convert to upper case for now + + ucGreekChar -= 0x20; + } + + if( ucGreekChar >= 2) + { + ucGreekChar++; + } + + if (ucGreekChar >= 19) + { + ucGreekChar++; + } + + // Convert to character set 8 + + ui16Hankaku = (ucGreekChar << 1) + 0x800; + if( ucCharVal >= (0x5E + 0x20)) + { + // Adjust to lower case character + + ui16Hankaku++; + } + } + break; + } + + // Cyrillic + + case 0x27: + { + // Uppercase? 
+ + if( ucCharVal <= 0x20) + { + ui16Hankaku = (ucCharVal << 1) + 0xa00; + } + else if( ucCharVal >= 0x30 && ucCharVal <= 0x50) + { + // Lower case + + ui16Hankaku = ((ucCharVal - 0x30) << 1) + 0xa01; + } + break; + } + } + + return( ui16Hankaku); +} + +/**************************************************************************** +Desc: Convert a WPChar from hankaku (single wide) to zenkaku (double wide). + 1) Used to see if a char in CS11 can map to a double wide character + 2) Used to convert keys into original data. +Ret: 0 = no conversion + 1 = converted character to zenkaku + 2 = ui16NextWpChar dakuten or handakuten voicing got combined +Notes: Taken from char.asm - han2zen() + From8ToZen could be taken out and placed in code. +****************************************************************************/ +FSTATIC FLMUINT16 flmWPHanToZenkaku( + FLMUINT16 ui16WpChar, + FLMUINT16 ui16NextWpChar, + FLMUINT16 * pui16Zenkaku) +{ + FLMUINT16 ui16Zenkaku = 0; + FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + FLMBYTE ucCharVal = (FLMBYTE)(ui16WpChar & 0xFF); + FLMUINT uiLoop; + FLMUINT16 ui16CharsUsed = 1; + + switch( ucCharSet) + { + // Character set 0 - symbols + + case 0: + { + // Invalid? - all others are used. 
+ + if( ucCharVal < 0x20) + { + ; + } + else if( ucCharVal <= 0x2F) + { + // Symbols A + ui16Zenkaku = 0x2400 + From0AToZen[ ucCharVal - 0x20 ]; + } + else if( ucCharVal <= 0x39) + { + // 0..9 + ui16Zenkaku = 0x2500 + (ucCharVal - 0x21); + } + else if( ucCharVal <= 0x40) + { + // Symbols B + ui16Zenkaku = 0x2400 + From0BToZen[ ucCharVal - 0x3A ]; + } + else if( ucCharVal <= 0x5A) + { + // A..Z + ui16Zenkaku = 0x2500 + (ucCharVal - 0x21); + } + else if( ucCharVal <= 0x60) + { + // Symbols C + ui16Zenkaku = 0x2400 + From0CToZen[ ucCharVal - 0x5B ]; + } + else if( ucCharVal <= 0x7A) + { + // a..z + ui16Zenkaku = 0x2500 + (ucCharVal - 0x21); + } + else if( ucCharVal <= 0x7E) + { + // Symbols D + ui16Zenkaku = 0x2400 + From0DToZen[ ucCharVal - 0x7B ]; + } + break; + } + + // GREEK + + case 8: + { + if( (ucCharVal >= sizeof( From8ToZen)) || + ((ui16Zenkaku = 0x2600 + From8ToZen[ ucCharVal ]) == 0x26FF)) + { + ui16Zenkaku = 0; + } + break; + } + + // CYRILLIC + + case 10: + { + // Check range + + ui16Zenkaku = 0x2700 + (ucCharVal >> 1); // Uppercase value + + // Convert to lower case? + + if( ucCharVal & 0x01) + { + ui16Zenkaku += 0x30; + } + break; + } + + // JAPANESE + + case 11: + { + if( ucCharVal < 5) + { + ui16Zenkaku = 0x2400 + From11AToZen[ ucCharVal ]; + } + else if( ucCharVal < 0x3D) // katakana? + { + if( (ui16Zenkaku = 0x2600 + + From11BToZen[ ucCharVal - 5 ]) == 0x26FF) + { + // Dash - convert to this + ui16Zenkaku = 0x241b; + } + else + { + if( ui16NextWpChar == 0xB3D) // dakuten? - voicing + { + // First check exception(s) then + // check if voicing exists! - will NOT access out of table + + if( (ui16Zenkaku != 0x2652) && // is not 'N'? + (KanaSubColTbl[ ui16Zenkaku - 0x2600 + 1 ] == 3)) + { + ui16Zenkaku++; + + // Return 2 + + ui16CharsUsed++; + } + } + else if( ui16NextWpChar == 0xB3E) // handakuten? - voicing + { + // Check if voicing exists! 
- will NOT access out of table + + if( KanaSubColTbl [ui16Zenkaku - 0x2600 + 2 ] == 5) + { + ui16Zenkaku += 2; + + // Return 2 + + ui16CharsUsed++; + } + } + } + } + else if( ucCharVal == 0x3D) // dakuten? + { + // Convert to voicing symbol + + ui16Zenkaku = 0x240A; + } + else if( ucCharVal == 0x3E) // handakuten? + { + // Convert to voicing symbol + + ui16Zenkaku = 0x240B; + } + // else cannot convert + + break; + } + + // Other character sets + // CS 1,4,5,6 - symbols + + default: + { + // Instead of includes more tables from char.asm - look down the + // Zen24Tohankaku[] table for a matching value - not much slower. + + for( uiLoop = 0; + uiLoop < (sizeof(Zen24ToHankaku) / sizeof(BYTE_WORD_TBL)); + uiLoop++) + { + if( Zen24ToHankaku[ uiLoop].WordValue == ui16WpChar) + { + ui16Zenkaku = 0x2400 + Zen24ToHankaku[ uiLoop].ByteValue; + break; + } + } + break; + } + } + + if( !ui16Zenkaku) + { + // Change return value + + ui16CharsUsed = 0; + } + + *pui16Zenkaku = ui16Zenkaku; + return( ui16CharsUsed); +} + +/**************************************************************************** +Desc: Converts a 2-byte language code into its corresponding language ID +****************************************************************************/ +FLMUINT FLMAPI f_languageToNum( + const char * pszLanguage) +{ + FLMBYTE ucFirstChar = (FLMBYTE)(*pszLanguage); + FLMBYTE ucSecondChar = (FLMBYTE)(*(pszLanguage + 1)); + FLMUINT uiTablePos; + + for( uiTablePos = 0; + uiTablePos < (FLM_LAST_LANG + FLM_LAST_LANG); uiTablePos += 2) + { + if( f_langtbl [uiTablePos] == ucFirstChar && + f_langtbl [uiTablePos+1] == ucSecondChar) + { + + // Return uiTablePos div 2 + + return( uiTablePos >> 1); + } + } + + // Language not found, return default US language + + return( FLM_US_LANG); +} + +/**************************************************************************** +Desc: Converts a language ID to its corresponding 2-byte language code 
/***************************************************************************
Desc:	Return the sub-collation value of a WP character.  Unconverted
		unicode values always have a sub-collation value of
		11110+UnicodeChar
In:	ui16WPValue  - WP character (character set in the high byte)
		ui16ColValue - collation value already determined for the
		               character; consulted for multinational 1 and
		               arabic 1 characters
		uiLanguage   - WP language ID; consulted only for the umlaut
		               special case in multinational 1
Ret:	0 for values 0..127.  For everything else the original WP value is
		the default and is returned unless the character-set specific
		handling below replaces it.
***************************************************************************/
FLMUINT16 flmWPGetSubCol(
	FLMUINT16		ui16WPValue,		// [in] WP Character value.
	FLMUINT16		ui16ColValue,		// [in] Collation Value (for arabic)
	FLMUINT			uiLanguage)			// [in] WP Language ID.
{
	FLMUINT16		ui16SubColVal;
	FLMBYTE			ucCharVal;
	FLMBYTE			ucCharSet;
	FLMUINT16		ui16Base;

	// Easy case first - ascii characters.

	ui16SubColVal = 0;
	if (ui16WPValue <= 127)
	{
		goto Exit;
	}

	// From here down default ui16SubColVal is WP value.
	// Note that the default, ucCharVal and ucCharSet are all taken from
	// the value as passed in, i.e. BEFORE the case conversion below.

	ui16SubColVal = ui16WPValue;
	ucCharVal = (FLMBYTE) ui16WPValue;
	ucCharSet = (FLMBYTE) (ui16WPValue >> 8);

	// Convert char to uppercase because case information
	// is stored above.  This will help
	// ensure that the "ETA" doesn't sort before "eta"
	// could use is lower code here for added performance.

	// This just happens to work with all WP character values.
	// (Clearing the low bit turns the odd, lower-case member of a case
	// pair into its even, upper-case partner.)

	if (!flmWPIsUpper( ui16WPValue))
	{
		ui16WPValue &= ~1;
	}

	switch (ucCharSet)
	{
		case CHSMUL1:

			// If you cannot break down a char into base and
			// diacritic then you cannot combine the character
			// later when converting back the key.  So, write
			// the entire WP char in the sub-collation area.
			// We can ONLY SUPPORT MULTINATIONAL 1 for brkcar()

			// On success the diacritic is delivered in ui16SubColVal;
			// ui16Base is not used any further.

			if (flmWPBrkcar( ui16WPValue, &ui16Base, &ui16SubColVal))
			{

				// WordPerfect character cannot be broken down.
				// If we had a collation value other than 0xFF (COLS0), don't
				// return a sub-collation value.  This will allow things like
				// upper and lower AE digraphs to compare properly.

				if (ui16ColValue != COLS0)
				{
					ui16SubColVal = 0;
				}
				goto Exit;
			}

			// Write the FLAIM diacritic sub-collation value.
			// Prefix is 2 bits "10".  Remember to leave
			// "111" alone for the future.
			// Bug 11/16/92 = was only writing a "1" and not "10"

			// For the four languages tested below an umlaut gets the value
			// right after the ring instead of its own table value.

			ui16SubColVal = (
				(ui16SubColVal & 0xFF) == umlaut
				&& ( (uiLanguage == FLM_SU_LANG) ||
					  (uiLanguage == FLM_SV_LANG) ||
					  (uiLanguage == FLM_CZ_LANG) ||
					  (uiLanguage == FLM_SL_LANG)
					)
				)
				?	(FLMUINT16)(fwp_dia60Tbl[ ring] + 1)	// umlaut must be after ring above
				:	(FLMUINT16)(fwp_dia60Tbl[ ui16SubColVal & 0xFF]);

			break;

		case CHSGREK:

			// Greek

			if( (ucCharVal >= 52) ||			// Keep case bit for 52-69 else ignore
				 (ui16WPValue == 0x804) ||		// [ 8,4] BETA Medial | Terminal
				 (ui16WPValue == 0x826))		// [ 8,38] SIGMA terminal
			{
				ui16SubColVal = ui16WPValue;
			}
			// else the default assigned above (the WP value as passed in)
			// is kept
			break;

		case CHSCYR:
			if (ucCharVal >= 144)
			{
				ui16SubColVal = ui16WPValue;
			}
			// else the default assigned above (the WP value as passed in)
			// is kept

			// VISIT: Georgian covers 208-249 - no collation defined yet
			break;

		case CHSHEB:								// Hebrew

			// Three sections in Hebrew:
			//		0..26 - main characters
			//		27..83 - accents that appear over previous character
			//		84..118- dagesh (ancient) hebrew with accents

			// Because the ancient is only used for sayings & scriptures
			// we will support a collation value and in the sub-collation
			// store the actual character because sub-collation is in
			// character order.

			if (ucCharVal >= 84)					// Save ancient - value 84 and above
			{
				ui16SubColVal = ui16WPValue;
			}
			break;

		case CHSARB1:								// Arabic 1

			// Sections in Arabic 1:
			//		00..37  - accents that display OVER a previous character
			//		38..46  - symbols
			//		47..57  - numbers
			//		58..163 - characters
			//		164     - hamzah accent
			//		165..180- common characters with accents
			//		181..193- ligatures - common character combinations
			//		194..195- extensions - throw away when sorting

			if (ucCharVal <= 46)
			{
				ui16SubColVal = ui16WPValue;
			}
			else
			{
				if (ui16ColValue == COLS10a+1)	// Alef?
				{
					// NOTE(review): fwp_alefSubColTbl is indexed with
					// ucCharVal - 165 for every value >= 165; confirm the
					// table is large enough for all values that can collate
					// as alef.

					ui16SubColVal = (ucCharVal >= 165)
						? (FLMUINT16)(fwp_alefSubColTbl[ ucCharVal - 165 ])
						: (FLMUINT16)7;				// Alef subcol value
				}
				else
				{
					if (ucCharVal >= 181)			// Ligatures - char combination
					{
						ui16SubColVal = ui16WPValue;
					}
					else if (ucCharVal == 64)		// taa exception
					{
						ui16SubColVal = 8;
					}
				}
			}
			break;

		case CHSARB2:								// Arabic 2

			// There are some characters that share the same slot
			// Check the bit table if above character 64
			// (one bit per character starting at 64, most significant
			// bit first)

			if ((ucCharVal >= 64) &&
				 (fwp_ar2BitTbl[(ucCharVal-64)>> 3] & (0x80 >> (ucCharVal&0x07))))
			{
				ui16SubColVal = ui16WPValue;
			}
			break;

	}

Exit:

	return( ui16SubColVal);
}
double-byte (Asian) character set? + + bAsian = (m_uiLanguage >= FLM_FIRST_DBCS_LANG && + m_uiLanguage <= FLM_LAST_DBCS_LANG) + ? TRUE + : FALSE; + + // Get the next character from the stream + +GetNextChar: + + ui16WpChar = 0; + ui16NextWpChar = 0; + ui16Col = 0; + ui16SubCol = 0; + bTwoIntoOne = FALSE; + ucCase = 0; + + if (m_uNextChar) + { + uChar = m_uNextChar; + m_uNextChar = 0; + } + else + { + ui64CurrCharPos = m_pIStream->getCurrPosition(); + if( RC_BAD( rc = readCharFromStream( &uChar))) + { + if (rc != NE_FLM_EOF_HIT) + { + goto Exit; + } + + // If we were skipping spaces, we need to + // process a single space character, unless we are + // ignoring trailing white space. + + if (bLastCharWasSpace && + !(m_uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE)) + { + // bLastCharWasSpace flag can only be TRUE if either + // FLM_COMP_IGNORE_TRAILING_SPACE is set or + // FLM_COMP_COMPRESS_WHITESPACE is set. + + flmAssert( m_uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE); + uChar = ASCII_SPACE; + rc = NE_FLM_OK; + goto Process_Char; + } + goto Exit; + } + } + + if ((uChar = f_convertChar( uChar, m_uiCompareRules)) == 0) + { + goto GetNextChar; + } + + // Deal with spaces + + if (uChar == ASCII_SPACE) + { + if (m_uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE) + { + bLastCharWasSpace = TRUE; + ui64AfterLastSpacePos = m_pIStream->getCurrPosition(); + goto GetNextChar; + } + else if (m_uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE) + { + if (!bLastCharWasSpace) + { + bLastCharWasSpace = TRUE; + + // Save where we are at so that if this doesn't turn out + // to be trailing spaces, we can restore this position. + + ui64AfterLastSpacePos = m_pIStream->getCurrPosition(); + } + goto GetNextChar; + } + } + else + { + if (m_uiCompareRules & FLM_COMP_IGNORE_LEADING_SPACE) + { + m_ui64EndOfLeadingSpacesPos = ui64CurrCharPos; + m_uiCompareRules &= (~(FLM_COMP_IGNORE_LEADING_SPACE)); + } + + // If the last character was a space, we need to process it. 
+ + if (bLastCharWasSpace) + { + + // Position back to after the last space, and process a space + // character. + + if (RC_BAD( rc = m_pIStream->positionTo( ui64AfterLastSpacePos))) + { + goto Exit; + } + + uChar = ASCII_SPACE; + bLastCharWasSpace = FALSE; + } + else if (uChar == ASCII_BACKSLASH) + { + // If wildcards are allowed, the backslash should be treated + // as an escape character, and the next character is the one + // we want. Otherwise, it should be treated as + // the actual character we want returned. + + if (m_bMayHaveWildCards) + { + + // Got a backslash. Means the next character is to be taken + // no matter what because it is escaped. + + if (RC_BAD( rc = readCharFromStream( &uChar))) + { + if (rc != NE_FLM_EOF_HIT) + { + goto Exit; + } + rc = NE_FLM_OK; + uChar = ASCII_BACKSLASH; + } + } + } + else if (uChar == ASCII_WILDCARD) + { + if (m_bMayHaveWildCards && pbCharIsWild) + { + *pbCharIsWild = TRUE; + } + } + } + +Process_Char: + + if (!bAsian) + { + + // Must check for double characters if non-US and non-Asian + // character set + + if (m_uiLanguage != FLM_US_LANG) + { + if (RC_BAD( rc = f_wpCheckDoubleCollation( + m_pIStream, m_bUnicodeStream, bAllowTwoIntoOne, + &uChar, &m_uNextChar, &bTwoIntoOne, m_uiLanguage))) + { + goto Exit; + } + } + } + else + { + if (RC_BAD( rc = readCharFromStream( &m_uNextChar))) + { + if (rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + m_uNextChar = 0; + } + else + { + RC_UNEXPECTED_ASSERT( rc); + goto Exit; + } + } + } + + // Convert each character to its WP equivalent + + if (!f_unicodeToWP( uChar, &ui16WpChar)) + { + ui16WpChar = 0; + } + + if (!f_unicodeToWP( m_uNextChar, &ui16NextWpChar)) + { + ui16NextWpChar = 0; + } + + // If we have an unconvertible UNICODE character, the collation + // value for it will be COLS0 + + if (!ui16WpChar) + { + if (!bAsian) + { + ui16Col = COLS0; + } + else + { + if (uChar < 0x20) + { + ui16Col = 0xFFFF; + ui16SubCol = uChar; + } + else + { + ui16Col = uChar; + ui16SubCol = 0; + 
} + } + } + else + { + if (!bAsian) + { + ui16Col = flmWPGetCollation( ui16WpChar, m_uiLanguage); + if (bTwoIntoOne) + { + // Since two characters were merged into one, increment + // the collation value by one. In the case of something + // like 'ch', there is a collation value between 'c' and + // 'd'. flmWPGetCollation would have returned the + // collation value for 'c' ... incrementing by one gives + // us the proper collation value for 'ch' (i.e., the + // collation value between 'c' and 'd'). + + ui16Col++; + } + } + else + { + if (flmWPAsiaGetCollation( ui16WpChar, ui16NextWpChar, ui16Col, + &ui16Col, &ui16SubCol, &ucCase, !m_bCaseSensitive) == 2) + { + + // Next character was consumed by collation + + m_uNextChar = 0; + } + } + } + + if (pui16Col) + { + *pui16Col = ui16Col; + } + + // Consume m_uNextChar if two characters merged into one + + if (bTwoIntoOne) + { + m_uNextChar = 0; + } + + // Subcollation + + if( pui16SubCol) + { + if( uChar > 127 && !bAsian) + { + ui16SubCol = ui16WpChar + ? flmWPGetSubCol( ui16WpChar, ui16Col, m_uiLanguage) + : uChar; + + if( !m_bCaseSensitive) + { + // If the sub-collation value is the original + // character, it means that the collation could not + // distinguish the characters and sub-collation is being + // used to do it. However, this creates a problem when the + // characters are the same character except for case. In that + // scenario, we incorrectly return a not-equal when we are + // doing a case-insensitive comparison. So, at this point, + // we need to use the sub-collation for the upper-case of the + // character instead of the sub-collation for the character + // itself. 
+ + if( ui16WpChar && ui16SubCol == ui16WpChar) + { + ui16SubCol = flmWPGetSubCol( + flmWPUpper( ui16WpChar), + ui16Col, m_uiLanguage); + } + } + } + + *pui16SubCol = ui16SubCol; + } + + // Case + + if( pucCase) + { + if (!m_bCaseSensitive) + { + *pucCase = 0; + } + else + { + if (!bAsian && ui16WpChar) + { + // flmWPIsUpper() returns FALSE if the character is lower or + // TRUE if the character is not lower case. + + if( flmWPIsUpper( ui16WpChar)) + { + if( bTwoIntoOne) + { + if( flmWPIsUpper( ui16NextWpChar)) + { + ucCase = 0x03; + } + else + { + ucCase = 0x10; + } + } + else + { + ucCase = 0x01; + } + } + } + *pucCase = ucCase; + } + } + + if (puChar) + { + *puChar = uChar; + } + +Exit: + + return( rc); +} + +/*************************************************************************** +Desc: Compare two entire strings. +****************************************************************************/ +RCODE FLMAPI f_compareCollStreams( + IF_CollIStream * pLStream, + IF_CollIStream * pRStream, + FLMBOOL bOpIsMatch, + FLMUINT uiLanguage, + FLMINT * piResult) +{ + RCODE rc = NE_FLM_OK; + FLMUINT16 ui16RCol; + FLMUINT16 ui16LCol; + FLMUINT16 ui16RSubCol; + FLMUINT16 ui16LSubCol; + FLMBYTE ucRCase; + FLMBYTE ucLCase; + F_CollStreamPos savedRPos; + F_CollStreamPos savedLPos; + F_CollStreamPos startLPos; + FLMUNICODE uLChar = 0; + FLMBOOL bLCharIsWild = FALSE; + FLMUNICODE uRChar = 0; + FLMBOOL bRCharIsWild = FALSE; + FLMBOOL bPrevLWasWild = FALSE; + FLMBOOL bPrevRWasWild = FALSE; + FLMBOOL bAllowTwoIntoOne; + + // If we are doing a "match" operation, we don't want two + // character sequences like Ch, ae, etc. turned into a single + // a single collation, because then matches that involve wildcards + // like "aetna == a*" would not match properly. + // When not doing a match operation, we WANT two character sequences + // turned into a single collation value so that we can know if + // something is > or <. 
When doing match operations, all we care + // about is if they are equal or not, so there is no need to look + // at double character collation properties. + + bAllowTwoIntoOne = bOpIsMatch ? FALSE : TRUE; + + for( ;;) + { +GetNextLChar: + + if( bLCharIsWild) + { + bPrevLWasWild = TRUE; + } + + pLStream->getCurrPosition( &startLPos); + if( RC_BAD( rc = pLStream->read( + bAllowTwoIntoOne, + &uLChar, &bLCharIsWild, &ui16LCol, &ui16LSubCol, &ucLCase))) + { + if( rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + + // If the last character was a wildcard, we have a match! + + if( bPrevLWasWild) + { + *piResult = 0; + goto Exit; + } + + for( ;;) + { + if( RC_BAD( rc = pRStream->read( + bAllowTwoIntoOne, + &uRChar, &bRCharIsWild, &ui16RCol, &ui16RSubCol, &ucRCase))) + { + if( rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + *piResult = 0; + } + + goto Exit; + } + + // Break out when we hit a non-wild character + + if( !bRCharIsWild) + { + break; + } + } + + *piResult = -1; + } + + goto Exit; + } + + if( bLCharIsWild) + { + // Consume multiple wildcards + + if( bPrevLWasWild) + { + goto GetNextLChar; + } + + // See if we match anywhere on the remaining right string + + for( ;;) + { + pRStream->getCurrPosition( &savedRPos); + pLStream->getCurrPosition( &savedLPos); + + if( RC_BAD( rc = f_compareCollStreams( pLStream, pRStream, + bOpIsMatch, uiLanguage, piResult))) + { + goto Exit; + } + + if( !(*piResult)) + { + goto Exit; + } + + if( RC_BAD( rc = pRStream->positionTo( &savedRPos))) + { + goto Exit; + } + + if( RC_BAD( rc = pRStream->read( + bAllowTwoIntoOne, + NULL, NULL, NULL, NULL, NULL))) + { + if( rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + break; + } + goto Exit; + } + + if( RC_BAD( rc = pLStream->positionTo( &savedLPos))) + { + goto Exit; + } + } + + *piResult = 1; + goto Exit; + } + +GetNextRChar: + + if( bRCharIsWild) + { + bPrevRWasWild = TRUE; + } + + if( RC_BAD( rc = pRStream->read( + bAllowTwoIntoOne, + &uRChar, &bRCharIsWild, &ui16RCol, &ui16RSubCol, &ucRCase))) + 
{ + if( rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + + // If the last character was a wildcard, we have a match! + + if( bPrevRWasWild) + { + *piResult = 0; + } + else + { + *piResult = 1; + } + } + + goto Exit; + } + + if( bRCharIsWild) + { + if( bPrevRWasWild) + { + goto GetNextRChar; + } + + // See if we match anywhere on the remaining left string + + if( RC_BAD( rc = pLStream->positionTo( &startLPos))) + { + goto Exit; + } + + for( ;;) + { + pLStream->getCurrPosition( &savedLPos); + pRStream->getCurrPosition( &savedRPos); + + if( RC_BAD( rc = f_compareCollStreams( pLStream, pRStream, + bOpIsMatch, uiLanguage, piResult))) + { + goto Exit; + } + + if( !(*piResult)) + { + goto Exit; + } + + if( RC_BAD( rc = pRStream->positionTo( &savedRPos))) + { + goto Exit; + } + + if( RC_BAD( rc = pLStream->positionTo( &savedLPos))) + { + goto Exit; + } + + // Skip the character we just processed + + if( RC_BAD( rc = pLStream->read( + bAllowTwoIntoOne, + NULL, NULL, NULL, NULL, NULL))) + { + if( rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + break; + } + goto Exit; + } + } + + *piResult = -1; + goto Exit; + } + + if( ui16LCol != ui16RCol) + { + *piResult = ui16LCol < ui16RCol ? -1 : 1; + goto Exit; + } + else if( ui16LSubCol != ui16RSubCol) + { + *piResult = ui16LSubCol < ui16RSubCol ? -1 : 1; + goto Exit; + } + else if( ucLCase != ucRCase) + { + // NOTE: If we are doing a case insensitive comparison, + // ucLCase and ucRCase should be equal (both will have been + // set to zero + + *piResult = ucLCase < ucRCase ? 
-1 : 1; + goto Exit; + } + } + +Exit: + + return( rc); +} + +/*************************************************************************** +Desc: +****************************************************************************/ +FLMUNICODE FLMAPI f_convertChar( + FLMUNICODE uzChar, + FLMUINT uiCompareRules) +{ + if (uzChar == ASCII_SPACE || + (uzChar == ASCII_UNDERSCORE && + (uiCompareRules & FLM_COMP_NO_UNDERSCORES)) || + (f_isWhitespace( uzChar) && + (uiCompareRules & FLM_COMP_WHITESPACE_AS_SPACE))) + { + return( (FLMUNICODE)((uiCompareRules & + (FLM_COMP_NO_WHITESPACE | + FLM_COMP_IGNORE_LEADING_SPACE)) + ? (FLMUNICODE)0 + : (FLMUNICODE)ASCII_SPACE)); + } + else if (uzChar == ASCII_DASH && (uiCompareRules & FLM_COMP_NO_DASHES)) + { + return( (FLMUNICODE)0); + } + else + { + return( uzChar); + } +} + + +/**************************************************************************** +Desc: Called by ftkStartup, this routine initializes the Unicode to + WP and WP to Unicode mapping tables. +****************************************************************************/ +RCODE f_initCharMappingTables( void) +{ + FLMUINT16 * puStaticPtr; + FLMUINT uiLoop; + FLMUINT uiEntries; + FLMUINT uiOffset; + RCODE rc = NE_FLM_OK; + + if( gv_pUnicodeToWP60 || gv_pWP60ToUnicode) + { + rc = RC_SET_AND_ASSERT( NE_FLM_FAILURE); + goto Exit; + } + + gv_uiMinUniChar = 0; + gv_uiMaxUniChar = 0; + + gv_uiMinWPChar = 0; + gv_uiMaxWPChar = 0; + + // Make an initial pass over the table to determine + // what our allocation sizes will need to be. 
+ + for( uiLoop = 0, puStaticPtr = (FLMUINT16 *)WP_UTOWP60; + uiLoop < UTOWP60_ENTRIES; + uiLoop++, puStaticPtr += 2) + { + // Unicode + + if( (FLMUINT)puStaticPtr[ 0] < gv_uiMinUniChar || + !gv_uiMinUniChar) + { + flmAssert( puStaticPtr[ 0] != 0); + gv_uiMinUniChar = (FLMUINT)puStaticPtr[ 0]; + } + + if( (FLMUINT)puStaticPtr[ 0] > gv_uiMaxUniChar) + { + gv_uiMaxUniChar = (FLMUINT)puStaticPtr[ 0]; + } + + // WordPerfect + + if( (FLMUINT)puStaticPtr[ 1] < gv_uiMinWPChar || + !gv_uiMinWPChar) + { + flmAssert( puStaticPtr[ 1] != 0); + gv_uiMinWPChar = (FLMUINT)puStaticPtr[ 1]; + } + + if( (FLMUINT)puStaticPtr[ 1] > gv_uiMaxWPChar) + { + gv_uiMaxWPChar = (FLMUINT)puStaticPtr[ 1]; + } + } + + // Allocate the Unicode table + + uiEntries = (gv_uiMaxUniChar - gv_uiMinUniChar) + 1; + if (RC_BAD( rc = f_calloc( uiEntries * sizeof( FLMUINT16), + &gv_pUnicodeToWP60))) + { + goto Exit; + } + + // Populate the Unicode table + + for( uiLoop = 0, puStaticPtr = (FLMUINT16 *)WP_UTOWP60; + uiLoop < UTOWP60_ENTRIES; uiLoop++, puStaticPtr += 2) + { + uiOffset = (FLMUINT)puStaticPtr[ 0] - gv_uiMinUniChar; + + flmAssert( gv_pUnicodeToWP60[ uiOffset] == 0); + gv_pUnicodeToWP60[ uiOffset] = puStaticPtr[ 1]; + } + + // Allocate the WordPerfect table + + uiEntries = (gv_uiMaxWPChar - gv_uiMinWPChar) + 1; + if (RC_BAD( rc = f_calloc( uiEntries * sizeof( FLMUINT16), + &gv_pWP60ToUnicode))) + { + goto Exit; + } + + // Populate the WordPerfect table + + for( uiLoop = 0, puStaticPtr = (FLMUINT16 *)WP_UTOWP60; + uiLoop < UTOWP60_ENTRIES; uiLoop++, puStaticPtr += 2) + { + uiOffset = (FLMUINT)puStaticPtr[ 1] - gv_uiMinWPChar; + + flmAssert( gv_pWP60ToUnicode[ uiOffset] == 0); + gv_pWP60ToUnicode[ uiOffset] = puStaticPtr[ 0]; + } + +Exit: + + if( RC_BAD( rc)) + { + if( gv_pUnicodeToWP60) + { + f_free( &gv_pUnicodeToWP60); + } + + if( gv_pWP60ToUnicode) + { + f_free( &gv_pWP60ToUnicode); + } + + gv_uiMinUniChar = 0; + gv_uiMaxUniChar = 0; + + gv_uiMinWPChar = 0; + gv_uiMaxWPChar = 0; + } + + return( 
rc); +} + +/**************************************************************************** +Desc: Called by ftkShutdown, this routine frees the Unicode to WP and + WP to Unicode mapping tables. +****************************************************************************/ +void f_freeCharMappingTables( void) +{ + if( gv_pUnicodeToWP60) + { + f_free( &gv_pUnicodeToWP60); + } + + if( gv_pWP60ToUnicode) + { + f_free( &gv_pWP60ToUnicode); + } + + gv_uiMinUniChar = 0; + gv_uiMaxUniChar = 0; + + gv_uiMinWPChar = 0; + gv_uiMaxWPChar = 0; +} + +/************************************************************************** +Desc: Convert the WP string to lower case chars given low/up bit string +Out: WP characters that have been modified to their original case +Ret: Number of bytes used in the lower/upper buffer +Notes: Only WP to lower case conversion is done here for each bit NOT set. +***************************************************************************/ +FSTATIC FLMUINT flmWPToMixed( + FLMBYTE * pucWPStr, // Existing WP string to modify + FLMUINT uiWPStrLen, // Length of the WP string in bytes + const FLMBYTE * pucLowUpBitStr, // Lower/upper case bit string + FLMUINT uiLang) +{ + FLMUINT uiNumChars; + FLMUINT uiTempWord; + FLMBYTE ucTempByte = 0; + FLMBYTE ucMaskByte; + FLMBYTE ucXorByte; // Used to reverse GR, bits + + ucXorByte = (uiLang == FLM_US_LANG) // Do most common compare first + ? (FLMBYTE)0 + : (uiLang == FLM_GR_LANG) // Greek has uppercase first + ? (FLMBYTE)0xFF + : (FLMBYTE)0 ; + + // For each character (two bytes) in the word string ... 
+ for( uiNumChars = uiWPStrLen >> 1, + ucMaskByte = 0; // Force first time to get a byte + uiNumChars--; + pucWPStr += 2, // Next WP character - word + ucMaskByte >>= 1) // Next bit to mask and check + { + if( ucMaskByte == 0) + { + // Time to get another byte + + ucTempByte = ucXorByte ^ *pucLowUpBitStr++; + ucMaskByte = 0x80; + } + + // If lowercase convert, else is upper + + if( (ucTempByte & ucMaskByte) == 0) + { + // Convert to lower case - COLL -> WP is already in upper case + + uiTempWord = (FLMUINT) FB2UW( pucWPStr); + if( uiTempWord >= ASCII_UPPER_A && uiTempWord <= ASCII_UPPER_Z) + { + uiTempWord |= 0x20; + } + else + { + FLMBYTE ucCharVal = (FLMBYTE)( uiTempWord & 0xFF); + FLMBYTE ucCharSet = (FLMBYTE)( uiTempWord >> 8); + + // Check if charact within region of character set + + if( ((ucCharSet == CHSMUL1) && + ((ucCharVal >= 26) && (ucCharVal <= 241))) || + ((ucCharSet == CHSGREK) && (ucCharVal <= 69)) || + ((ucCharSet == CHSCYR) && (ucCharVal <= 199))) + { + uiTempWord |= 0x01; // Set the bit ... don't increment! + } + } + UW2FBA( (FLMUINT16)uiTempWord, pucWPStr); + } + } + + uiNumChars = uiWPStrLen >> 1; + return( bytesInBits( uiNumChars)); +} + +/**************************************************************************** +Desc: Take a base and a diacritic and compose a WP character. + Note on base character: i's and j's must be dotless i's and j's (for + those which use them) or they will not be found. +Ret: TRUE - if not found + FALSE - if found +Notes: ascii characters with diacriticals are in multi-national if anywhere; + all other base chars with diacritics are found in their own sets. 
+****************************************************************************/ +FSTATIC FLMBOOL flmWPCmbcar( + FLMUINT16 * pui16WpChar, + FLMUINT16 ui16BaseChar, + FLMINT16 ui16DiacriticChar) +{ + FLMUINT uiRemaining; + FLMBYTE ucCharSet; + FLMBYTE ucChar; + BASE_DIACRIT * pBaseDiacritic; + BASE_DIACRIT_TABLE * pTable; + + ucCharSet = HI( ui16BaseChar); + if( ucCharSet > WP_MAX_CAR60_SIZE) + { + return( TRUE); + } + + // Is base ASCII? If so, look in multinational 1 + + if( !ucCharSet) + { + ucCharSet = CHSMUL1; + } + + if( (pBaseDiacritic = fwp_car60_c[ucCharSet]) == 0) + { + return( TRUE); + } + + ucChar = LO( ui16BaseChar); + ui16DiacriticChar = LO( ui16DiacriticChar); + pTable = pBaseDiacritic->table; + for( uiRemaining = pBaseDiacritic->char_count; + uiRemaining; + uiRemaining--, pTable++ ) + { + // Same base? + + if( pTable->base == ucChar && + (pTable->diacrit & 0x7F) == ui16DiacriticChar) + { + // Same diacritic? + + *pui16WpChar = (FLMUINT16) (((FLMUINT16) ucCharSet << 8) + + (pBaseDiacritic->start_char + + (FLMUINT16)(pTable - pBaseDiacritic->table))); + return( FALSE); + } + } + + return( TRUE); +} + +/**************************************************************************** +Desc: Convert a text string to a collated string. + If NE_FLM_CONV_DEST_OVERFLOW is returned the string is truncated as + best as it can be. The caller must decide to return the error up + or deal with the truncation. +VISIT: If the string is EXACTLY the length of the truncation + length then it should, but doesn't, set the truncation flag. + The code didn't match the design intent. Fix next major + version. 
/****************************************************************************
Desc:	Convert a text string to a collated string.
		If NE_FLM_CONV_DEST_OVERFLOW is returned the string is truncated as
		best as it can be.  The caller must decide to return the error up
		or deal with the truncation.

		The output written to pucCollatedStr is laid out as:
			1. one collation byte per character (two for double characters
				and digraphs),
			2. COLL_FIRST_SUBSTRING, if bFirstSubstring is set,
			3. COLL_TRUNCATED, if the data was (or came in) truncated,
			4. (COLL_MARKER | SC_SUB_COL) followed by the sub-collation bits,
				only if any sub-collation was recorded,
			5. either (COLL_MARKER | SC_MIXED) followed by the case bits, or
				a single (COLL_MARKER | SC_UPPER / SC_LOWER) byte when no
				lower-case character was seen.

		*puiCollatedStrLen is the buffer size on input and the number of
		bytes written on output (it is stored on every exit path).
		*puiCollationLen, *puiCaseLen, *pbOriginalCharsLost and
		*pbDataTruncated are optional outputs.  *pbOriginalCharsLost is only
		ever set to TRUE here, never cleared.

VISIT:	If the string is EXACTLY the length of the truncation
		length then it should, but doesn't, set the truncation flag.
		The code didn't match the design intent.  Fix next major
		version.
****************************************************************************/
RCODE flmUTF8ToColText(
	IF_PosIStream *	pIStream,
	FLMBYTE *			pucCollatedStr,		// Returns collated string
	FLMUINT *			puiCollatedStrLen,	// Returns total collated string length
														// Input is maximum bytes in buffer
	FLMBOOL				bCaseInsensitive,		// Set if to convert to uppercase
	FLMUINT *			puiCollationLen,		// Returns the collation bytes length
	FLMUINT *			puiCaseLen,				// Returns length of case bytes
	FLMUINT				uiLanguage,				// Language
	FLMUINT				uiCharLimit,			// Max number of characters in this key piece
	FLMBOOL				bFirstSubstring,		// TRUE is this is the first substring key
	FLMBOOL				bDataTruncated,		// TRUE if data is coming in truncated.
	FLMBOOL *			pbOriginalCharsLost,
	FLMBOOL *			pbDataTruncated)
{
	RCODE			rc = NE_FLM_OK;
	FLMUINT16	ui16Base;				// Value of the base character
	FLMUINT16	ui16SubColVal;			// Sub-collated value (diacritic)
	FLMUINT 		uiLength;				// Temporary variable for length

	// NOTE(review): assuming FLMUINT is unsigned, a caller-supplied buffer
	// size below 8 wraps here and is then clamped to 248 further down -
	// confirm callers always pass at least 8 bytes.

	FLMUINT 		uiTargetColLen = *puiCollatedStrLen - 8;	// 4=ovhd,4=worse char

	// Need to increase the buffer sizes to not overflow.
	// Characters without COLL values will take up 3 bytes in
	// the ucSubColBuf[] and easily overflow the buffer.
	// Hard coded the values so as to minimize changes.

	FLMBYTE		ucSubColBuf[ MAX_SUBCOL_BUF + 301];	// Holds sub-collated values(diac)
	FLMBYTE		ucCaseBits[ MAX_CASE_BYTES + 81];	// Holds case bits
	FLMUINT16	ui16WpChr;			// Current WP character
	FLMUNICODE	uChar = 0;			// Current unconverted Unicode character
	FLMUNICODE	uChar2;				// Second character reported by f_wpCheckDoubleCollation
	FLMUINT16	ui16WpChr2;			// 2nd character if any; default 0 for US lang
	FLMUINT		uiColLen;			// Return value of collated length
	FLMUINT		uiSubColBitPos;	// Sub-collation bit position
	FLMUINT	 	uiCaseBitPos;		// Case bit position
	FLMUINT		uiFlags;				// HAD_SUB_COLLATION / HAD_LOWER_CASE bits
	FLMBOOL		bHebrewArabic = FALSE;	// Set if language is hebrew, arabic, farsi, urdu
	FLMBOOL		bTwoIntoOne = FALSE;
	FLMUINT		uiUppercaseFlag;

	uiColLen = 0;
	uiSubColBitPos = 0;
	uiCaseBitPos = 0;
	uiFlags = 0;
	ui16WpChr2 = 0;

	// We don't want any single key piece to "pig out" more
	// than 256 bytes of the key

	if( uiTargetColLen > 256 - 8)
	{
		uiTargetColLen = 256 - 8;
	}

	// Code below sets ucSubColBuf[] and ucCaseBits[] values to zero.

	if (uiLanguage != FLM_US_LANG)
	{
		if (uiLanguage == FLM_AR_LANG ||		// Arabic
			 uiLanguage == FLM_FA_LANG ||		// Farsi - persian
			 uiLanguage == FLM_HE_LANG ||		// Hebrew
			 uiLanguage == FLM_UR_LANG)			// Urdu
		{
			bHebrewArabic = TRUE;
		}
	}

	// One pass of this loop per input character.

	for (;;)
	{
		// Set the case bits and sub-collation bits to zero when
		// on the first bit of the byte.

		if (!(uiCaseBitPos & 0x07))
		{
			ucCaseBits [uiCaseBitPos >> 3] = 0;
		}
		if (!(uiSubColBitPos & 0x07))
		{
			ucSubColBuf [uiSubColBitPos >> 3] = 0;
		}

		ui16SubColVal = 0; // Default sub-collation value

		// Get the next character from the string.

		if( RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
		{
			if (rc == NE_FLM_EOF_HIT)
			{
				rc = NE_FLM_OK;
				break;
			}
			goto Exit;
		}

		// f_wpCheckDoubleCollation modifies ui16WpChr if a digraph or a double
		// character sequence is found.  If a double character is found, pucStr
		// is incremented past the next character and ui16WpChr2 is set to 1.
		// If a digraph is found, pucStr is not changed, but ui16WpChr
		// contains the first character and ui16WpChr2 contains the second
		// character of the digraph.
		//
		// NOTE(review): the paragraph above mentions pucStr, which does not
		// exist in this function; it appears to describe an earlier
		// interface.  In the code below the call works on uChar, uChar2 and
		// bTwoIntoOne, and ui16WpChr / ui16WpChr2 are derived from those.

		if (uiLanguage != FLM_US_LANG)
		{
			if( RC_BAD( rc = f_wpCheckDoubleCollation(
				pIStream, FALSE, TRUE, &uChar, &uChar2, &bTwoIntoOne, uiLanguage)))
			{
				goto Exit;
			}
			if (!f_unicodeToWP( uChar, &ui16WpChr))
			{
				ui16WpChr = UNK_UNICODE_CODE;
			}
			if (uChar2)
			{
				if (!f_unicodeToWP( uChar2, &ui16WpChr2))
				{
					ui16WpChr2 = UNK_UNICODE_CODE;
				}
			}
			else
			{
				ui16WpChr2 = 0;
			}
		}
		else
		{

			// Convert the character to its WP equivalent

			if( !f_unicodeToWP( uChar, &ui16WpChr))
			{
				ui16WpChr = UNK_UNICODE_CODE;
			}
		}

		// Save the case bit if not case-insensitive

		if (!bCaseInsensitive)
		{

			// charIsUpper returns TRUE if upper case, 0 if lower case.

			if (!charIsUpper( ui16WpChr))
			{
				uiFlags |= HAD_LOWER_CASE;
			}
			else
			{
				// Set if upper case.

				setBit( ucCaseBits, uiCaseBitPos);
			}
			uiCaseBitPos++;
		}

		// Handle non-collating characters with subcollating values,
		// Get the collated value from the WP character-if not collating value.
		// The collation byte is stored first and then tested.

		if ((pucCollatedStr[ uiColLen++] =
				(FLMBYTE)(flmWPGetCollation( ui16WpChr, uiLanguage))) >= COLS11)
		{
			FLMUINT	uiTemp;

			// If lower case, convert to upper case.

			if (!charIsUpper( ui16WpChr))
			{
				ui16WpChr &= ~1;
			}

			// No collating value given for this WP char.
			// Save original WP char (2 bytes) in subcollating
			// buffer.

			// 1110 is a new code that will store an insert over
			// the character OR a non-convertable unicode character.
			// Store with the same alignment as "store_extended_char"
			// below.

			// 11110 is code for unmappable UNICODE value.
			// A value 0xFE will be the collation value.  The sub-collation
			// value will be 0xFFFF followed by the UNICODE value.
			// Be sure to eat an extra case bit.

			// See specific Hebrew and Arabic comments in the
			// switch statement below.

			// Set the next byte that follows in the sub collation buffer.

			ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
			if (bHebrewArabic && (pucCollatedStr[ uiColLen - 1] == COLS0_ARABIC))
			{
				// Store first bit of 1110, fall through & store remaining 3 bits

				setBit( ucSubColBuf, uiSubColBitPos);
				uiSubColBitPos++;

				// Don't store collation value

				uiColLen--;
			}
			else if( uChar)
			{
				// NOTE(review): this tests uChar, not whether ui16WpChr is
				// UNK_UNICODE_CODE, so it is taken for any non-zero input
				// character that reaches this branch - confirm that is the
				// intended key format.  The Unicode value replaces the WP
				// value that is stored below.

				ui16WpChr = uChar;
				uChar = 0;

				// Store 11 out of 11110

				setBit( ucSubColBuf, uiSubColBitPos);
				uiSubColBitPos++;
				setBit( ucSubColBuf, uiSubColBitPos);
				uiSubColBitPos++;
				if (!bCaseInsensitive)
				{
					ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;

					// Set upper case bit.

					setBit( ucCaseBits, uiCaseBitPos);
					uiCaseBitPos++;
				}
			}

			// The character-set cases in the else branch below jump here
			// when the whole character has to be kept in the sub-collation
			// area.

store_extended_char:

			// Set the next byte that follows in the sub collation buffer.

			ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0;
			ucSubColBuf [(uiSubColBitPos + 16) >> 3] = 0;
			uiFlags |= HAD_SUB_COLLATION;

			// Set 110 bits in sub-collation - continued from above.
			// No need to explicitly set the zero, but must increment
			// for it.

			setBit( ucSubColBuf, uiSubColBitPos);
			uiSubColBitPos++;
			setBit( ucSubColBuf, uiSubColBitPos);
			uiSubColBitPos += 2;

			// store_aligned_word: This label is not referenced.
			// Go to the next byte boundary to write the character.

			uiSubColBitPos = (uiSubColBitPos + 7) & (~7);
			uiTemp = bytesInBits( uiSubColBitPos);

			// Need to big-endian - so it will sort correctly.

			ucSubColBuf [uiTemp] = (FLMBYTE)(ui16WpChr >> 8);
			ucSubColBuf [uiTemp + 1] = (FLMBYTE)(ui16WpChr);
			uiSubColBitPos += 16;
			ucSubColBuf [uiSubColBitPos >> 3] = 0;
		}
		else
		{
			// Had a collation value
			// Add the lower/uppercase bit if a mixed case output.
			// If not lower ASCII set - check diacritic value for sub-collation

			if( !(ui16WpChr & 0xFF00))
			{
				// ASCII character set - set a single 0 bit - just need to
				// increment to do this.

				uiSubColBitPos++;
			}
			else
			{
				FLMBYTE	ucChar = (FLMBYTE)ui16WpChr;
				FLMBYTE	ucCharSet = (FLMBYTE)(ui16WpChr >> 8);

				// Convert char to uppercase because case information
				// is stored above.  This will help
				// ensure that the "ETA" doesn't sort before "eta"

				if( !charIsUpper( ui16WpChr))
				{
					ui16WpChr &= ~1;
				}

				switch( ucCharSet)
				{
					case CHSMUL1:	// Multinational 1
					{
						// If we cannot break down a char into base and
						// diacritic we cannot combine the character
						// later when converting back the key.  In that case,
						// write the entire WP char in the sub-collation area.

						if( flmWPBrkcar( ui16WpChr, &ui16Base, &ui16SubColVal))
						{
							goto store_extended_char;
						}

						// Write the FLAIM diacritic sub-collation value.
						// Prefix is 2 bits "10".  Remember to leave
						// "111" alone for the future.
						// NOTE: The "umlaut" character must sort after the "ring"
						// character.

						ui16SubColVal = ((ui16SubColVal & 0xFF) == umlaut &&
												(uiLanguage == FLM_SU_LANG ||
												 uiLanguage == FLM_SV_LANG ||
												 uiLanguage == FLM_CZ_LANG ||
												 uiLanguage == FLM_SL_LANG))
							?	(FLMUINT16)(fwp_dia60Tbl[ ring] + 1)
							:	(FLMUINT16)(fwp_dia60Tbl[ ui16SubColVal & 0xFF]);

						// The Arabic 1 case below also jumps here with its own
						// ui16SubColVal.

store_sub_col:
						// Set the next byte that follows in the sub collation buffer.

						ucSubColBuf[ (uiSubColBitPos + 8) >> 3] = 0;
						uiFlags |= HAD_SUB_COLLATION;

						// Set the 10 bits - no need to explicitly set the zero, but
						// must increment for it.

						setBit( ucSubColBuf, uiSubColBitPos);
						uiSubColBitPos += 2;

						// Set sub-collation bits.

						setBits( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal);
						uiSubColBitPos += 5;
						break;
					}

					case CHSGREK:	// Greek
					{
						if (ucChar >= 52 ||			// Keep case bit for 52-69 else ignore
							 ui16WpChr == 0x804 ||	// [ 8,4] BETA Medial | Terminal
							 ui16WpChr == 0x826)		// [ 8,38] SIGMA terminal
						{
							goto store_extended_char;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;
					}

					case CHSCYR:
					{
						if (ucChar >= 144)
						{
							goto store_extended_char;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;

						// Georgian covers 208-249 - no collation defined yet

						break;
					}

					case CHSHEB:	// Hebrew
					{
						// Three sections in Hebrew:
						//		0..26 - main characters
						//		27..83 - accents that appear over previous character
						//		84..118- dagesh (ancient) hebrew with accents

						// Because the ancient is only used for sayings & scriptures
						// we will support a collation value and in the sub-collation
						// store the actual character because sub-collation is in
						// character order.

						if (ucChar >= 84)		// Save ancient - value 84 and above
						{
							goto store_extended_char;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;
					}

					case CHSARB1:	// Arabic 1
					{
						// Three sections in Arabic:
						//		00..37  - accents that display OVER a previous character
						//		38..46  - symbols
						//		47..57  - numbers
						//		58..163 - characters
						//		164     - hamzah accent
						//		165..180- common characters with accents
						//		181..193- ligatures - common character combinations
						//		194..195- extensions - throw away when sorting

						if( ucChar <= 46)
						{
							goto store_extended_char;	// save original character
						}

						if( pucCollatedStr[ uiColLen - 1] == COLS10a + 1)	// Alef?
						{
							ui16SubColVal = (ucChar >= 165)
								? (FLMUINT16)(fwp_alefSubColTbl[ ucChar - 165 ])
								: (FLMUINT16)7;			// Alef subcol value
							goto store_sub_col;
						}

						if (ucChar >= 181)			// Ligatures - char combination
						{
							goto store_extended_char;	// save original character
						}

						if (ucChar == 64)				// taa exception
						{
							ui16SubColVal = 8;
							goto store_sub_col;
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;
					}

					case CHSARB2:			// Arabic 2
					{
						// There are some characters that share the same slot
						// Check the bit table if above character 64

						if (ucChar >= 64 &&
							 fwp_ar2BitTbl[(ucChar-64)>> 3] & (0x80 >> (ucChar&0x07)))
						{
							goto store_extended_char;	// Will save original
						}

						// No subcollation to worry about - set a zero bit by
						// incrementing the bit position.

						uiSubColBitPos++;
						break;
					}

					default:
					{
						// Increment bit position to set a zero bit.

						uiSubColBitPos++;
						break;
					}
				}
			}

			// Now let's worry about double character sorting.  This part is
			// not reached by characters that jumped to store_extended_char.

			if (ui16WpChr2)
			{
				if (pbOriginalCharsLost)
				{
					*pbOriginalCharsLost = TRUE;
				}

				// Set the next byte that follows in the sub collation buffer.

				ucSubColBuf[ (uiSubColBitPos + 7) >> 3] = 0;

				if (bTwoIntoOne)
				{

					// Sorts after character in ui16WpChr after call to
					// f_wpCheckDoubleCollation
					// Write the char 2 times so lower/upper bits are correct.
					// Could write infinite times because of collation rules.
					// The collation byte already stored is incremented in
					// place and the incremented value is stored a second time.

					pucCollatedStr[ uiColLen] = ++pucCollatedStr[ uiColLen - 1];
					uiColLen++;

					// If original was upper case, set one more upper case bit

					if( !bCaseInsensitive)
					{
						ucCaseBits[ (uiCaseBitPos + 7) >> 3] = 0;
						if( !charIsUpper( ui16WpChr2))
						{
							uiFlags |= HAD_LOWER_CASE;
						}
						else
						{
							setBit( ucCaseBits, uiCaseBitPos);
						}
						uiCaseBitPos++;
					}

					// Take into account the diacritical space

					uiSubColBitPos++;
				}
				else
				{

					// We have a digraph, get second collation value

					pucCollatedStr[ uiColLen++] =
						(FLMBYTE)(flmWPGetCollation( ui16WpChr2, uiLanguage));

					// Normal case, assume no diacritics set

					uiSubColBitPos++;

					// If first was upper, set one more upper bit.

					if( !bCaseInsensitive)
					{
						ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0;
						if (charIsUpper( ui16WpChr))
						{
							setBit( ucCaseBits, uiCaseBitPos);
						}
						uiCaseBitPos++;

						// no need to reset the uiFlags
					}
				}
			}
		}

		// Check to see if uiColLen is at some overflow limit.

		if (uiColLen >= uiCharLimit ||
			 uiColLen + bytesInBits( uiSubColBitPos) +
					 bytesInBits( uiCaseBitPos) >= uiTargetColLen)
		{

			// We hit the maximum number of characters.  See if we hit the
			// end of the string.  If another character can still be read,
			// the data is being truncated.

			if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar)))
			{
				if (rc == NE_FLM_EOF_HIT)
				{
					rc = NE_FLM_OK;
				}
				else
				{
					goto Exit;
				}
			}
			else
			{
				bDataTruncated = TRUE;
			}
			break;
		}
	}

	// Reported before the substring / truncation markers are appended.

	if (puiCollationLen)
	{
		*puiCollationLen = uiColLen;
	}

	// Add the first substring marker - also serves as making the string non-null.

	if (bFirstSubstring)
	{
		pucCollatedStr[ uiColLen++] = COLL_FIRST_SUBSTRING;
	}

	if (bDataTruncated)
	{
		pucCollatedStr[ uiColLen++ ] = COLL_TRUNCATED;
	}

	// Return NOTHING if no values found

	if (!uiColLen && !uiSubColBitPos)
	{
		if (puiCaseLen)
		{
			*puiCaseLen = 0;
		}
		goto Exit;
	}

	// Store extra zero bit in the sub-collation area for Hebrew/Arabic

	if (bHebrewArabic)
	{
		uiSubColBitPos++;
	}

	// Done putting the string into 4 sections - build the COLLATED KEY
	// Don't set uiUppercaseFlag earlier than here because SC_LOWER may be zero

	uiUppercaseFlag = (uiLanguage == FLM_GR_LANG)
								? SC_LOWER
								: SC_UPPER;

	// Did we write anything to the subcollation area?
	// The default terminating characters is (COLL_MARKER|SC_UPPER)

	if (uiFlags & HAD_SUB_COLLATION)
	{
		// Writes out a 0x7

		pucCollatedStr[ uiColLen++] = COLL_MARKER | SC_SUB_COL;

		// Move the sub-collation into the collating string

		uiLength = bytesInBits( uiSubColBitPos);
		f_memcpy( &pucCollatedStr[ uiColLen], ucSubColBuf, uiLength);
		uiColLen += uiLength;
	}

	// Move the upper/lower case stuff - force bits for Greek ONLY
	// This is such a small size that a memcpy is not worth it

	if( uiFlags & HAD_LOWER_CASE)
	{
		FLMUINT		uiNumBytes = bytesInBits( uiCaseBitPos);
		FLMBYTE *	pucCasePtr = ucCaseBits;

		// Output the 0x5

		pucCollatedStr[ uiColLen++] = (FLMBYTE)(COLL_MARKER | SC_MIXED);
		if( puiCaseLen)
		{
			// Marker byte plus the case bytes.

			*puiCaseLen = uiNumBytes + 1;
		}

		if( uiUppercaseFlag == SC_LOWER)
		{
			// Negate case bits for languages (like GREEK) that sort
			// upper case before lower case.

			while( uiNumBytes--)
			{
				pucCollatedStr[ uiColLen++] = ~(*pucCasePtr++);
			}
		}
		else
		{
			while( uiNumBytes--)
			{
				pucCollatedStr[ uiColLen++] = *pucCasePtr++;
			}
		}
	}
	else
	{
		// All characters are either upper or lower case, as determined
		// by uiUppercaseFlag.

		pucCollatedStr[ uiColLen++] = (FLMBYTE)(COLL_MARKER | uiUppercaseFlag);
		if( puiCaseLen)
		{
			*puiCaseLen = 1;
		}
	}

Exit:

	// These two outputs are stored on every path, including errors.

	if( pbDataTruncated)
	{
		*pbDataTruncated = bDataTruncated;
	}

	*puiCollatedStrLen = uiColLen;
	return( rc);
}
+ + if( uiLang == FLM_US_LANG) + { + while( uiLength && (pucColStr[ uiPos] > MAX_COL_OPCODE)) + { + uiLength--; + + // Move in the WP value given uppercase collated value + + uiColChar = (FLMUINT)pucColStr[ uiPos++]; + if( uiColChar == COLS0) + { + uiColChar = (FLMUINT)0xFFFF; + uiUnconvChars++; + } + else + { + uiColChar = (FLMUINT)colToWPChr[ uiColChar - COLLS]; + } + + // Put the WP char in the word string + + if( pucWPPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( (FLMUINT16)uiColChar, pucWPPtr); + pucWPPtr += 2; + } + } + else // Non-US collation + { + if( (uiLang == FLM_AR_LANG ) || // Arabic + (uiLang == FLM_FA_LANG ) || // Farsi - Persian + (uiLang == FLM_HE_LANG ) || // Hebrew + (uiLang == FLM_UR_LANG)) // Urdu + { + bHebrewArabic = TRUE; + } + + while( uiLength && (pucColStr[ uiPos] > MAX_COL_OPCODE)) + { + uiLength--; + uiColChar = (FLMUINT)pucColStr[ uiPos++]; + + switch( uiColChar) + { + case COLS9+4: // ch in spanish + case COLS9+11: // ch in czech + { + // Put the WP char in the word string + + if( pucWPPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( (FLMUINT16) 'C', pucWPPtr); + pucWPPtr += 2; + uiColChar = (FLMUINT)'H'; + uiPos++; // Move past second duplicate char + break; + } + + case COLS9+17: // ll in spanish + { + // Put the WP char in the word string + + if( pucWPPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( (FLMUINT16)'L', pucWPPtr); + pucWPPtr += 2; + uiColChar = (FLMUINT)'L'; + uiPos++; // Move past duplicate character + break; + } + + case COLS0: // Non-collating character or OEM character + { + // Actual character is in sub-collation area + + uiColChar = (FLMUINT)0xFFFF; + uiUnconvChars++; + break; + } + + default: + { + // Watch out COLS10h has () around it for subtraction + + if( bHebrewArabic && (uiColChar >= COLS10h)) + { + uiColChar = (uiColChar < COLS10a) // Hebrew only? + ? 
(FLMUINT) (0x900 + (uiColChar - (COLS10h))) // Hebrew + : (FLMUINT) (HebArabColToWPChr[ uiColChar - (COLS10a)]); // Arabic + } + else + { + uiColChar = (FLMUINT)colToWPChr[ uiColChar - COLLS]; + } + break; + } + } + + // Put the WP char in the word string + + if( pucWPPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( (FLMUINT16)uiColChar, pucWPPtr); + pucWPPtr += 2; + } + } + + // Terminate the string + + if( pucWPPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( (FLMUINT16)0, pucWPPtr); + uiWPStrLen = uiPos + uiPos; // Multiply by 2 + + // Parse through the sub-collation and case information. + // Here are values for some of the codes: + // [ 0x04] - case information is all uppercase (IS,DK,GR) + // [ 0x05] - case bits follow + // [ 0x06] - case information is all uppercase + // [ 0x07] - beginning of sub-collation information + // [ 0x08] - first substring field that is made + // [ 0x09] - truncation marker for text and binary + // + // Below are some cases to consider... + // + // [ COLLATION][ 0x07 sub-collation][ 0x05 case info] + // [ COLLATION][ 0x07 sub-collation][ 0x05 case info] + // [ COLLATION][ 0x07 sub-collation] + // [ COLLATION][ 0x07 sub-collation] + // [ COLLATION][ 0x05 case info] + // [ COLLATION][ 0x05 case info] + // [ COLLATION] + // [ COLLATION] + // + // In the future still want[ 0x06] to be compressed out for uppercase + // only indexes. + + // Check first substring before truncated + + if( uiLength && pucColStr[ uiPos] == COLL_FIRST_SUBSTRING) + { + if( pbFirstSubstring) + { + *pbFirstSubstring = TRUE; // Don't need to initialize to FALSE. + } + uiLength--; + uiPos++; + } + + // Is the key truncated? + + if( uiLength && pucColStr[ uiPos] == COLL_TRUNCATED) + { + if( pbDataTruncated) + { + *pbDataTruncated = TRUE; // Don't need to initialize to FALSE. + } + uiLength--; + uiPos++; + } + + // Does sub-collation follow? 
+ // Still more to process - first work on the sub-collation (diacritics) + // Hebrew/Arabic may have empty collation area + + if( uiLength && (pucColStr[ uiPos] == (COLL_MARKER | SC_SUB_COL))) + { + FLMUINT uiTempLen; + + // Do another pass on the word string adding the diacritics + + if( RC_BAD( rc = flmWPCmbSubColBuf( pucWPStr, &uiWPStrLen, uiMaxWPBytes, + &pucColStr[ ++uiPos], bHebrewArabic, &uiBitPos))) + { + goto Exit; + } + + // Move pos to next byte value + + uiTempLen = bytesInBits( uiBitPos); + uiPos += uiTempLen; + uiLength -= uiTempLen + 1; // The 1 includes the 0x07 byte + } + + // Does the case info follow? + + if( uiLength && (pucColStr[ uiPos] >= 0x04)) + { + // Take care of the lower and upper case conversion + // If mixed case then convert using case bits + + if( pucColStr[ uiPos++] & SC_MIXED) // Increment pos here! + { + // Don't pre-increment pos on line below! + uiPos += flmWPToMixed( pucWPStr, uiWPStrLen, + &pucColStr[ uiPos], uiLang); + } + // else 0x04 or 0x06 - all characters already in uppercase + } + + // Should end perfectly at the end of the collation buffer. + + if (uiPos != uiColStrLen) + { + rc = RC_SET_AND_ASSERT( NE_FLM_DATA_ERROR); + goto Exit; + } + + *puiWPStrLen = uiWPStrLen; + *puiUnconvChars = uiUnconvChars; + +Exit: + + return( rc); +} + +/**************************************************************************** +Desc: Convert a text string to a collated string. 
+****************************************************************************/ +RCODE FLMAPI f_asiaUTF8ToColText( + IF_PosIStream * pIStream, + FLMBYTE * pucColStr, // Output collated string + FLMUINT * puiColStrLen, // Collated string length return value + // Input value is MAX num of bytes in buffer + FLMBOOL bCaseInsensitive, // Set if to convert to uppercase + FLMUINT * puiCollationLen, // Returns the collation bytes length + FLMUINT * puiCaseLen, // Returns length of case bytes + FLMUINT uiCharLimit, // Max number of characters in this key piece + FLMBOOL bFirstSubstring, // TRUE is this is the first substring key + FLMBOOL bDataTruncated, // Was input data already truncated. + FLMBOOL * pbDataTruncated) +{ + RCODE rc = NE_FLM_OK; + FLMBOOL bEndOfStr = FALSE; + FLMUINT uiLength; + FLMUINT uiTargetColLen = *puiColStrLen - 12; // 6=ovhd,6=worst char + FLMBYTE ucSubColBuf[ MAX_SUBCOL_BUF + 1]; // Holds Sub-col values (diac) + FLMBYTE ucLowUpBuf[ MAX_CASE_BYTES + MAX_CASE_BYTES + 2]; // 2 case bits/wpchar + FLMUINT uiColLen; + FLMUINT uiSubColBitPos; + FLMUINT uiLowUpBitPos; + FLMUINT uiFlags; + FLMUNICODE uChar; + FLMUINT16 ui16NextWpChar; + FLMUINT16 ui16ColValue; + + uiColLen = uiSubColBitPos = uiLowUpBitPos = uiFlags = 0; + uChar = ui16ColValue = 0; + + // We don't want any single key piece to "pig out" more + // than 256 bytes of the key + + if( uiTargetColLen > 256 - 12) + { + uiTargetColLen = 256 - 12; + } + + // Make sure ucSubColBuf and ucLowUpBuf are set to 0 + + f_memset( ucSubColBuf, 0, sizeof( ucSubColBuf)); + f_memset( ucLowUpBuf, 0, sizeof( ucLowUpBuf)); + + ui16NextWpChar = 0; + + while( !bEndOfStr || ui16NextWpChar || uChar) + { + FLMUINT16 ui16WpChar; // Current WP character + FLMUINT16 ui16SubColVal; // Sub-collated value (diacritic) + FLMBYTE ucCaseFlags; + FLMUINT16 ui16CurWpChar; + + // Get the next character from the string. 
+ + ui16WpChar = ui16NextWpChar; + for( ui16NextWpChar = 0; + (!ui16WpChar || !ui16NextWpChar) && + !uChar && !bEndOfStr;) + { + if (!bEndOfStr) + { + if( RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar))) + { + if (rc == NE_FLM_EOF_HIT) + { + rc = NE_FLM_OK; + bEndOfStr = TRUE; + } + else + { + goto Exit; + } + } + } + else + { + uChar = 0; + } + + if( f_unicodeToWP( uChar, &ui16CurWpChar)) + { + uChar = 0; + } + + if( !ui16WpChar) + { + ui16WpChar = ui16CurWpChar; + } + else + { + ui16NextWpChar = ui16CurWpChar; + } + } + + // If we didn't get a character, break out of the outer + // processing loop. + + if( !ui16WpChar && !uChar) + { + break; + } + + if( ui16WpChar) + { + if( flmWPAsiaGetCollation( ui16WpChar, ui16NextWpChar, ui16ColValue, + &ui16ColValue, &ui16SubColVal, &ucCaseFlags, bCaseInsensitive) == 2) + { + // Took the ui16NextWpChar value + // Force to skip this value + + ui16NextWpChar = 0; + } + } + else // Use the uChar value for this pass + { + // This handles all of the UNICODE characters that could not + // be converted to WP characters - which will include most + // of the Asian characters. + + ucCaseFlags = 0; + if( uChar < 0x20) + { + ui16ColValue = 0xFFFF; + + // Setting ui16SubColVal to a high code will ensure + // that the code that the uChar value will be stored + // in in the sub-collation area. + + ui16SubColVal = 0xFFFF; + + // NOTE: uChar SHOULD NOT be set to zero here. + // It will be set to zero below. 
+ } + else + { + ui16ColValue = uChar; + ui16SubColVal = 0; + uChar = 0; + } + } + + // Store the values in 2 bytes + + pucColStr[ uiColLen++] = (FLMBYTE)(ui16ColValue >> 8); + pucColStr[ uiColLen++] = (FLMBYTE)(ui16ColValue & 0xFF); + + if( ui16SubColVal) + { + uiFlags |= HAD_SUB_COLLATION; + if( ui16SubColVal <= 31) // 5 bit - store bits 10 + { + setBit( ucSubColBuf, uiSubColBitPos); + uiSubColBitPos += 1 + 1; // Stores a zero + setBits( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal); + uiSubColBitPos += 5; + } + else // 2 bytes - store bits 110 or 11110 + { + FLMUINT uiTemp; + + setBit( ucSubColBuf, uiSubColBitPos); + uiSubColBitPos++; + setBit( ucSubColBuf, uiSubColBitPos); + uiSubColBitPos++; + + if( !ui16WpChar && uChar) // Store as "11110" + { + ui16SubColVal = uChar; + uChar = 0; + setBit( ucSubColBuf, uiSubColBitPos); + uiSubColBitPos++; + setBit( ucSubColBuf, uiSubColBitPos); + uiSubColBitPos++; + } + uiSubColBitPos++; // Skip past the zero + + // Go to the next byte boundary to write the WP char + uiSubColBitPos = (uiSubColBitPos + 7) & (~7); + uiTemp = bytesInBits( uiSubColBitPos); + + // Need to store HIGH-Low - PC format is Low-high! + ucSubColBuf[ uiTemp ] = (FLMBYTE)(ui16SubColVal >> 8); + ucSubColBuf[ uiTemp + 1] = (FLMBYTE)(ui16SubColVal); + + uiSubColBitPos += 16; + } + } + else + { + uiSubColBitPos++; + } + + // Save case information - always 2 bits worth for Asian + + if( ucCaseFlags & 0x02) + { + setBit( ucLowUpBuf, uiLowUpBitPos); + } + + uiLowUpBitPos++; + + if( ucCaseFlags & 0x01) + { + setBit( ucLowUpBuf, uiLowUpBitPos); + } + uiLowUpBitPos++; + + // Check to see if uiColLen is within 1 byte of max + + if( (uiColLen >= uiCharLimit) || + (uiColLen + bytesInBits( uiSubColBitPos) + + bytesInBits( uiLowUpBitPos) >= uiTargetColLen)) + { + // Still something left? 
+ + if (ui16NextWpChar || uChar) + { + bDataTruncated = TRUE; + } + else if (!bEndOfStr) + { + if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar))) + { + if (rc == NE_FLM_EOF_HIT) + { + bEndOfStr = TRUE; + rc = NE_FLM_OK; + } + else + { + goto Exit; + } + } + else + { + bDataTruncated = TRUE; + } + } + break; // Hit the max. number of characters + } + } + + if( puiCollationLen) + { + *puiCollationLen = uiColLen; + } + + // Add the first substring marker - also serves + // as making the string non-null. + + if( bFirstSubstring) + { + pucColStr[ uiColLen++] = 0; + pucColStr[ uiColLen++] = COLL_FIRST_SUBSTRING; + } + + if( bDataTruncated) + { + pucColStr[ uiColLen++] = 0; + pucColStr[ uiColLen++] = COLL_TRUNCATED; + } + + // Return NOTHING if no values found + + if( !uiColLen && !uiSubColBitPos) + { + if( puiCaseLen) + { + *puiCaseLen = 0; + } + goto Exit; + } + + // Done putting the String into 3 sections - build the COLLATED KEY + + if( uiFlags & HAD_SUB_COLLATION) + { + pucColStr[ uiColLen++] = 0; + pucColStr[ uiColLen++] = COLL_MARKER | SC_SUB_COL; + + // Move the Sub-collation (diacritics) into the collating string + + uiLength = (FLMUINT)(bytesInBits( uiSubColBitPos)); + f_memcpy( &pucColStr[ uiColLen], ucSubColBuf, uiLength); + uiColLen += uiLength; + } + + // Always represent the marker as 2 bytes and case bits in Asia + + pucColStr[ uiColLen++] = 0; + pucColStr[ uiColLen++] = COLL_MARKER | SC_MIXED; + + uiLength = (FLMUINT)(bytesInBits( uiLowUpBitPos)); + f_memcpy( &pucColStr[ uiColLen ], ucLowUpBuf, uiLength); + + if( puiCaseLen) + { + *puiCaseLen = (FLMUINT)(uiLength + 2); + } + uiColLen += uiLength; + +Exit: + + if( pbDataTruncated) + { + *pbDataTruncated = bDataTruncated; + } + + *puiColStrLen = uiColLen; + return( rc); +} + +/**************************************************************************** +Desc: Combine the diacritic 5 and 16 bit values to an existing word string. 
+Ret: FLMUINT - Number of bytes parsed +Notes: For each bit in the sub-collation section: + 0 - no subcollation information + 10 - take next 5 bits - will tell about diacritics or japanese vowel + 110 - align to next byte & take word value as extended character + +****************************************************************************/ +FSTATIC RCODE flmAsiaParseSubCol( + FLMBYTE * pucWPStr, + FLMUINT * puiWPStrLen, + FLMUINT uiMaxWPBytes, + const FLMBYTE * pucSubColBuf, + FLMUINT * puiSubColBitPos) +{ + RCODE rc = NE_FLM_OK; + FLMUINT uiSubColBitPos = 0; + FLMUINT uiNumChars = *puiWPStrLen >> 1; + FLMUINT16 ui16Diac; + FLMUINT16 ui16WpChar; + + // For each character (16 bits) in the WP string ... + + while( uiNumChars--) + { + // Have to skip 0, because it is not accounted for + // in the sub-collation bits. It was inserted when we + // encountered unconverted unicode characters (Asian). + // Will be converted to something else later on. + // SEE NOTE ABOVE. + + if( FB2UW( pucWPStr) == 0) + { + pucWPStr += 2; + continue; + } + + // This macro DOESN'T increment uiBitPos + + if( testOneBit( pucSubColBuf, uiSubColBitPos)) + { + // Bits 10 - take next 5 bits + // Bits 110 align and take next word + // Bits 11110 align and take unicode value + + uiSubColBitPos++; + if( !testOneBit( pucSubColBuf, uiSubColBitPos)) + { + uiSubColBitPos++; + ui16Diac = (FLMUINT16)(getNBits( 5, pucSubColBuf, uiSubColBitPos)); + uiSubColBitPos += 5; + + if( (ui16WpChar = FB2UW( pucWPStr)) < 0x100) + { + if( (ui16WpChar >= 'A') && (ui16WpChar <= 'Z')) + { + // Convert to WP diacritic and combine characters + + flmWPCmbcar( &ui16WpChar, ui16WpChar, + (FLMUINT16)ml1_COLtoD[ ui16Diac]); + + // Even if cmbcar fails, WpChar is still set to a valid value + } + else + { + // Symbols from charset 0x24 + + ui16WpChar = (FLMUINT16)(0x2400 + + fwp_Ch24ColTbl[ ui16Diac - 1 ].ByteValue); + } + } + else if( ui16WpChar >= 0x2600) // Katakana + { + // Voicings - will allow to select original char + // 
000 - some 001 are changed to 000 to save space + // 001 - set if large char (uppercase) + // 010 - set if voiced + // 100 - set if half voiced + // + // Should NOT match voicing or wouldn't be here! + + FLMBYTE ucChar = (FLMBYTE)(ui16WpChar & 0xFF); + + // Try exceptions first so don't access out of bounds + + if( ucChar == 84) + { + ui16WpChar = (FLMUINT16)(0x2600 + + ((ui16Diac == 1) + ? (FLMUINT16)10 + : (FLMUINT16)11)); + } + else if( ucChar == 85) + { + ui16WpChar = (FLMUINT16)(0x2600 + + ((ui16Diac == 1) + ? (FLMUINT16)16 + : (FLMUINT16)17)); + } + + // Try the next 2 slots, if not then + // value is 83, 84 or 85 + + else if( KanaSubColTbl[ ucChar + 1 ] == ui16Diac) + { + ui16WpChar++; + } + else if( KanaSubColTbl[ ucChar + 2 ] == ui16Diac) + { + ui16WpChar += 2; + } + else if( ucChar == 4) // Last exception + { + ui16WpChar = 0x2600 + 83; + } + + // else, leave alone! - invalid storage + } + + UW2FBA( ui16WpChar, pucWPStr); // Set if changed or not + } + else // "110" + { + FLMUINT uiTemp; + + uiSubColBitPos++; // Skip second '1' + if( testOneBit( pucSubColBuf, uiSubColBitPos)) // 11?10 ? + { + if( (*puiWPStrLen) + 2 > uiMaxWPBytes) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + // Unconvertable UNICODE character + // The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode + + shiftN( pucWPStr, + (FLMUINT16)(uiNumChars + uiNumChars + 4), 2); + + pucWPStr += 2; // Skip the 0xFFFF for now + uiSubColBitPos += 2; // Skip next "11" + (*puiWPStrLen) += 2; + } + uiSubColBitPos++; // Skip the zero + + // Round up to next byte + uiSubColBitPos = (uiSubColBitPos + 7) & (~7); + uiTemp = bytesInBits( uiSubColBitPos); + pucWPStr[ 1] = pucSubColBuf[ uiTemp]; // Character set + pucWPStr[ 0] = pucSubColBuf[ uiTemp + 1]; // Character + uiSubColBitPos += 16; + } + } + else + { + uiSubColBitPos++; // Be sure to increment this! 
+ } + + pucWPStr += 2; // Next WP character + } + + *puiSubColBitPos = bytesInBits( uiSubColBitPos); + +Exit: + + return( rc); +} + +/**************************************************************************** +Desc: The case bits for asia are: + Latin/Greek/Cyrillic + 01 - case bit set if character is uppercase + 10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx + Japanese + 00 - double wide hiragana 0x255e..25b0 + 01 - double wide katakana 0x2600..2655 + 10 - single wide symbols from charset 11 that map to CS24?? + 11 - single wide katakana from charset 11 +Ret: +Notes: This is tricky to really understand the inputs. + This looks at the bits according to the current character value. +****************************************************************************/ +FSTATIC RCODE flmAsiaParseCase( + FLMBYTE * pucWPStr, + FLMUINT * puiWPStrLen, + FLMUINT uiMaxWPBytes, + const FLMBYTE * pucCaseBits, + FLMUINT * puiColBytesProcessed) +{ + RCODE rc = NE_FLM_OK; + FLMUINT uiWPStrLen = *puiWPStrLen; + FLMUINT uiCharCnt; + FLMUINT uiExtraBytes = 0; + FLMUINT16 ui16WpChar; + FLMBYTE ucTempByte = 0; + FLMBYTE ucMaskByte; + + // For each character (two bytes) in the string ... + + for( uiCharCnt = uiWPStrLen >> 1, // Total number of words in word string + ucMaskByte = 0; // Force first time to get a byte + uiCharCnt--;) + { + FLMBYTE ucChar; + FLMBYTE ucCharSet; + + ui16WpChar = FB2UW( pucWPStr); // Get the next character + + // Must skip any 0xFFFFs or zeroes that were inserted. + + if( ui16WpChar == 0xFFFF || ui16WpChar == 0) + { + // Put back 0xFFFF in case it was a zero. + + UW2FBA( 0xFFFF, pucWPStr); + pucWPStr += 2; + uiExtraBytes += 2; + continue; + } + + // Time to get another byte? + + if( ucMaskByte == 0) + { + ucTempByte = *pucCaseBits++; + ucMaskByte = 0x80; + } + + ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + ucChar = (FLMBYTE)(ui16WpChar & 0xFF); + + // SINGLE WIDE - NORMAL CHARACTERS + + if( ui16WpChar < 0x2400) + { + // Convert to double wide? 
+ + if( ucTempByte & ucMaskByte) + { + // Latin/greek/cyrillic + // Convert to uppercase double wide char + + if( ucCharSet == 0) // Latin - uppercase + { + // May convert to 0x250F (Latin) or CS24 + + if( ui16WpChar >= ASCII_UPPER_A && ui16WpChar <= ASCII_UPPER_Z) + { + // Convert to double wide + + ui16WpChar = (FLMUINT16)(ui16WpChar - 0x30 + 0x250F); + } + else + { + flmWPHanToZenkaku( ui16WpChar, 0, &ui16WpChar); + } + } + else if( ucCharSet == 8) // Greek + { + if( ucChar > 38) // Adjust for spaces in Greek + { + ucChar -= 2; + } + + if( ucChar > 4) + { + ucChar -= 2; + } + + ui16WpChar = (FLMUINT16)((ucChar >> 1) + 0x265E); + } + else if( ucCharSet == 10) // Cyrillic + { + ui16WpChar = (FLMUINT16)((ucChar >> 1) + 0x2700); + } + else + { + flmWPHanToZenkaku( ui16WpChar, 0, &ui16WpChar); + } + + ucCharSet = (FLMBYTE)(ui16WpChar >> 8); + ucChar = (FLMBYTE)(ui16WpChar & 0xFF); + } + + ucMaskByte >>= 1; // Next bit + + // Change to lower case? + + if( (ucTempByte & ucMaskByte) == 0) + { + // Convert ui16WpChar to lower case + + switch( ucCharSet) + { + case 0: + // Bit zero only if lower case + + ui16WpChar |= 0x20; + break; + + case 1: + // In upper/lower case region? 
+ + if( ucChar >= 26) + { + ui16WpChar++; + } + break; + + case 8: + // All lowercase after 69 + + if( ucChar <= 69) + { + ui16WpChar++; + } + break; + + case 10: + // No cases after 199 + + if( ucChar <= 199) + { + ui16WpChar++; + } + break; + + case 0x25: + case 0x26: + // Should be double wide latin or Greek + // Add offset to convert to lowercase + + ui16WpChar += 0x20; + break; + + case 0x27: + // Double wide cyrillic only + // Add offset to convert to lowercase + + ui16WpChar += 0x30; + break; + } + } + } + else // JAPANESE CHARACTERS + { + if( ucTempByte & ucMaskByte) // Original chars from CharSet 11 + { + if( ucCharSet == 0x26) // Convert to Zen to Hankaku + { + FLMUINT16 ui16NextChar = 0; + + ui16WpChar = flmWPZenToHankaku( ui16WpChar, &ui16NextChar); + if( ui16NextChar) // Move everyone down + { + if( (*puiWPStrLen) + 2 > uiMaxWPBytes) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + uiCharCnt++; + shiftN( pucWPStr, uiCharCnt + uiCharCnt + 2, 2); + UW2FBA( ui16WpChar, pucWPStr); + pucWPStr += 2; + ui16WpChar = ui16NextChar; // This will be stored below + + // Adjust the length + *puiWPStrLen = *puiWPStrLen + 2; + } + } + else if( ucCharSet == 0x24) + { + ui16WpChar = flmWPZenToHankaku( ui16WpChar, NULL); + } + ucMaskByte >>= 1; // Eat the next bit + } + else + { + ucMaskByte >>= 1; // Next bit + if( (ucTempByte & ucMaskByte) == 0) // Convert to Hiragana? + { + // Kanji will also fall through here + + if( ucCharSet == 0x26) + { + // Convert to Hiragana + ui16WpChar = (FLMUINT16)(0x255E + ucChar); + } + } + } + } + UW2FBA( ui16WpChar, pucWPStr); + pucWPStr += 2; + ucMaskByte >>= 1; + } + + uiCharCnt = uiWPStrLen - uiExtraBytes; // Should be 2 bits for each character. 
+ *puiColBytesProcessed = bytesInBits( uiCharCnt); + +Exit: + + return( rc); +} + +/*************************************************************************** +Desc: Get the original string from an asian collation string +Ret: Length of the word string in bytes +****************************************************************************/ +RCODE FLMAPI f_asiaColStr2WPStr( + const FLMBYTE * pucColStr, // Points to the collated string + FLMUINT uiColStrLen, // Length of the collated string + FLMBYTE * pucWPStr, // Output string to build - WP word string + FLMUINT * puiWPStrLen, + FLMUINT * puiUnconvChars, + FLMBOOL * pbDataTruncated, // Set to TRUE if truncated + FLMBOOL * pbFirstSubstring) // Sets to TRUE if first substring +{ + FLMBYTE * pucWPStrPtr = pucWPStr; + FLMBYTE * pucWPEnd = &pucWPStr[ *puiWPStrLen]; + FLMUINT uiLength = uiColStrLen; + FLMUINT uiMaxWPBytes = *puiWPStrLen; + FLMUINT uiColStrPos = 0; + FLMBOOL bHadExtended = FALSE; + FLMUINT uiWPStrLen; + FLMUINT16 ui16ColChar; + FLMUINT uiUnconvChars = 0; + FLMUINT uiColBytesProcessed; + RCODE rc = NE_FLM_OK; + + while( uiLength) + { + FLMBYTE ucChar = pucColStr[ uiColStrPos + 1]; + FLMBYTE ucCharSet = pucColStr[ uiColStrPos]; + + ui16ColChar = (FLMUINT16)((ucCharSet << 8) + ucChar); + if( ui16ColChar <= MAX_COL_OPCODE) + { + break; + } + + uiColStrPos += 2; + uiLength -= 2; + if( ucCharSet == 0) // Normal Latin/Greek/Cyrillic value + { + ui16ColChar = colToWPChr[ ucChar - COLLS]; + } + else if( ucCharSet == 1) // Katakana or Hiragana character + { + if( ucChar > sizeof( ColToKanaTbl)) // Special cases below + { + if( ucChar == COLS_ASIAN_MARK_VAL) // Dakuten + { + ui16ColChar = 0x240a; + } + else if( ucChar == COLS_ASIAN_MARK_VAL + 1) // Handakuten + { + ui16ColChar = 0x240b; + } + else if( ucChar == COLS_ASIAN_MARK_VAL + 2) // Chuuten + { + ui16ColChar = 0x2405; + } + else + { + ui16ColChar = 0xFFFF; // Error + } + } + else + { + ui16ColChar = (FLMUINT16)(0x2600 + ColToKanaTbl[ ucChar]); + } + } + else 
if( ucCharSet != 0xFF || ucChar != 0xFF) // Asian characters + { + // Insert zeroes that will be treated as a signal for + // uncoverted unicode characters later on. NOTE: Cannot + // use 0xFFFF, because we need to be able to detect this + // case in the sub-collation stuff, and we don't want + // to confuse it with the 0xFFFF that may have been inserted + // in another case. + // THIS IS A REALLY BAD HACK, BUT IT IS THE BEST WE CAN DO + // FOR NOW! + + if( pucWPStrPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + *pucWPStrPtr++ = 0; + *pucWPStrPtr++ = 0; + uiUnconvChars++; + bHadExtended = TRUE; + } + // else, there is no collation value - found in sub-collation part + + if( pucWPStrPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( ui16ColChar, pucWPStrPtr); // Put the uncollation value back + pucWPStrPtr += 2; + } + + if( pucWPStrPtr + 2 >= pucWPEnd) + { + rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW); + goto Exit; + } + + UW2FBA( 0, pucWPStrPtr); // Terminate the string + uiWPStrLen = (FLMUINT)(pucWPStrPtr - pucWPStr); + + // Parse through the sub-collation and case information. + // Here are values for some of the codes: + // [ 0x05] - case bits follow + // [ 0x06] - case information is all uppercase + // [ 0x07] - beginning of sub-collation information + // [ 0x08] - first substring field that is made + // [ 0x09] - truncation marker for text and binary + // + // Asian chars the case information should always be there and not + // compressed out. This is because the case information could change + // the actual width of the character from 0x26xx to charset 11. + + // Does truncation marker or sub-collation follow? + + if( uiLength) + { + ui16ColChar = (FLMUINT16)((pucColStr[ uiColStrPos] << 8) + + pucColStr[ uiColStrPos + 1]); + + // First substring is before truncated. 
+ if( ui16ColChar == COLL_FIRST_SUBSTRING) + { + if( pbFirstSubstring) + { + *pbFirstSubstring = TRUE; // Don't need to initialize to FALSE. + } + + uiLength -= 2; + uiColStrPos += 2; + ui16ColChar = (FLMUINT16)((pucColStr[ uiColStrPos] << 8) + + pucColStr[ uiColStrPos + 1]); + } + + if( ui16ColChar == COLL_TRUNCATED) + { + if( pbDataTruncated) + { + *pbDataTruncated = TRUE; // Don't need to initialize to FALSE. + } + uiLength -= 2; + uiColStrPos += 2; + ui16ColChar = (FLMUINT16)((pucColStr[ uiColStrPos] << 8) + + pucColStr[ uiColStrPos+1]); + } + + if( ui16ColChar == (COLL_MARKER | SC_SUB_COL)) + { + FLMUINT uiTempLen; + + // Do another pass on the word string adding diacritics/voicings + + uiColStrPos += 2; + uiLength -= 2; + if( RC_BAD( rc = flmAsiaParseSubCol( pucWPStr, &uiWPStrLen, + uiMaxWPBytes, &pucColStr[ uiColStrPos], &uiTempLen))) + { + goto Exit; + } + + uiColStrPos += uiTempLen; + uiLength -= uiTempLen; + } + else + { + goto check_case; + } + } + + // Does the case info follow? + + if( uiLength) + { + ui16ColChar = (FLMUINT16)((pucColStr[ uiColStrPos] << 8) + + pucColStr[ uiColStrPos + 1]); +check_case: + + if( ui16ColChar == (COLL_MARKER | SC_MIXED)) + { + uiColStrPos += 2; + + if( RC_BAD( rc = flmAsiaParseCase( pucWPStr, &uiWPStrLen, + uiMaxWPBytes, &pucColStr[ uiColStrPos], &uiColBytesProcessed))) + { + goto Exit; + } + + uiColStrPos += uiColBytesProcessed; + + // Set bHadExtended to FALSE, because they will have + // been taken care of in this pass. 
/**************************************************************************
Desc:	Combine the diacritic 5-bit values to an existing WP string.
		Walks the WP word string and consumes one sub-collation entry per
		character.  Each entry starts with a bit prefix:

			0			nothing stored for this character
			10			the next 5 bits hold a diacritic sub-collation value
			110		align to the next byte; the following word replaces
						the character
			1110		as 110, but the word is inserted in front of the
						current character (string grows by two bytes)
			11110		as 110, but the word is inserted behind the current
						character (string grows by two bytes)

Parms:	pucWPStr				Existing WP string to modify in place
		puiWPStrLen			In/Out: WP string length in bytes; grows by two
								for every inserted word
		uiMaxWPBytes		Size of the pucWPStr buffer in bytes
		pucSubColBuf		Sub-collation bits
		bHebrewArabic		Set if language is Hebrew or Arabic.  Entries may
								then follow the last character, and one extra
								terminating zero bit is consumed.
		puiSubColBitPos	Out: number of BITS consumed.  Success only.

Ret:	NE_FLM_OK, or NE_FLM_CONV_DEST_OVERFLOW when an insertion would not
		fit into uiMaxWPBytes.

Notes:	The Hebrew/Arabic handling at the bottom jumps back INTO the body
		of the character loop (label after_last_character).  The order of
		the statements in the loop matters for that to work - be careful
		when changing it.
***************************************************************************/
FSTATIC RCODE flmWPCmbSubColBuf(
	FLMBYTE *			pucWPStr,
	FLMUINT *			puiWPStrLen,
	FLMUINT				uiMaxWPBytes,
	const FLMBYTE *	pucSubColBuf,
	FLMBOOL				bHebrewArabic,
	FLMUINT *			puiSubColBitPos)
{
	RCODE			rc = NE_FLM_OK;
	FLMUINT		uiSubColBitPos = 0;
	FLMUINT		uiNumChars = *puiWPStrLen >> 1;
	FLMUINT16	ui16Diac;
	FLMUINT16	ui16WPChar;
	FLMUINT		uiTemp;

	// For each character (two bytes) in the WP string ...
	// uiNumChars is post-decremented in the loop test, so inside the body
	// it is the number of characters behind the current one.

	while( uiNumChars--)
	{
		// testOneBit DOESN'T increment the bit position

		if( testOneBit( pucSubColBuf, uiSubColBitPos))
		{
			// If "11110" - unmappable unicode char - 0xFFFF is before it
			// If "1110" then INDEX extended char is inserted
			// If "110" then extended char follows that replaces collation
			// If "10" then take next 5 bits which
			// contain the diacritic subcollation value.

			// Entry point for Hebrew/Arabic entries that follow the last
			// character (see the goto below).  The first '1' bit has been
			// tested but not yet consumed when control arrives here.

after_last_character:

			uiSubColBitPos++;		// Eat the first 1 bit
			if( !testOneBit( pucSubColBuf, uiSubColBitPos))
			{
				uiSubColBitPos++;	// Eat the 0 bit
				ui16Diac = (FLMUINT16)(getNBits( 5, pucSubColBuf, uiSubColBitPos));
				uiSubColBitPos += 5;

				// If not extended base

				if( (ui16WPChar = FB2UW( pucWPStr)) < 0x100)
				{
					// Convert to WP diacritic and combine characters

					flmWPCmbcar( &ui16WPChar, ui16WPChar,
						(FLMUINT16)ml1_COLtoD[ ui16Diac]);

					// Even if cmbcar fails, wpchar is still set to a valid value

					UW2FBA( ui16WPChar, pucWPStr);
				}
				else if( (ui16WPChar & 0xFF00) == 0x0D00)	// Arabic?
				{
					ui16WPChar = ArabSubColToWPChr[ ui16Diac];
					UW2FBA( ui16WPChar, pucWPStr);
				}
				// else diacritic is extra info
				// cmbcar should not handle extended chars for this design
			}
			else		// "110" or "1110" or "11110"
			{
				uiSubColBitPos++;		// Eat the 2nd '1' bit
				if( testOneBit( pucSubColBuf, uiSubColBitPos))	// Test the 3rd bit
				{
					// Both remaining forms insert a word, so make sure
					// there is room for two more bytes first.

					if( (*puiWPStrLen) + 2 > uiMaxWPBytes)
					{
						rc = RC_SET( NE_FLM_CONV_DEST_OVERFLOW);
						goto Exit;
					}

					// 1110 - shift wpchars down 1 word and insert value below
					uiSubColBitPos++;		// Eat the 3rd '1' bit
					*puiWPStrLen += 2;	// Return 2 more bytes

					if( testOneBit( pucSubColBuf, uiSubColBitPos))	// Test 4th bit
					{
						// Unconvertable UNICODE character
						// The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode

						shiftN( pucWPStr, uiNumChars + uiNumChars + 4, 2);
						uiSubColBitPos++;		// Eat the 4th '1' bit
						pucWPStr += 2;			// Skip the 0xFFFF for now
					}
					else
					{
						// Move down 2 byte NULL and rest of the 2 byte characters
						// The extended character does not have a 0xFF col value

						shiftN( pucWPStr, uiNumChars + uiNumChars + 2, 2);

						// The character that was moved down still has to be
						// visited, so one more pass of the loop is needed.

						uiNumChars++;		// Increment because inserted

						// Fall through reading the actual character value
					}
				}

				uiSubColBitPos++;											// Skip past the zero bit
				uiSubColBitPos = (uiSubColBitPos + 7) & (~7);	// roundup to next byte
				uiTemp = bytesInBits( uiSubColBitPos);				// compute position

				// The word is stored high byte first in the sub-collation

				pucWPStr[ 1] = pucSubColBuf[ uiTemp];				// Character set
				pucWPStr[ 0] = pucSubColBuf[ uiTemp + 1];			// Character
				uiSubColBitPos += 16;
			}
		}
		else
		{
			// "0" - nothing stored for this character

			uiSubColBitPos++;
		}

		pucWPStr += 2;		// Next WP character
	}

	if( bHebrewArabic)
	{
		if( testOneBit( pucSubColBuf, uiSubColBitPos))
		{
			// Hebrew/Arabic can have trailing accents that
			// don't have a matching collation value.
			// Keep looping in this case.
			// Note that subColBitPos isn't incremented above.

			uiNumChars = 0;					// Set so we won't loop forever!
			goto after_last_character;		// process trailing bit
		}
		uiSubColBitPos++;		// Eat the last '0' bit
	}

	*puiSubColBitPos = uiSubColBitPos;

Exit:

	return( rc);
}
compute position + pucWPStr[ 1] = pucSubColBuf[ uiTemp]; // Character set + pucWPStr[ 0] = pucSubColBuf[ uiTemp + 1]; // Character + uiSubColBitPos += 16; + } + } + else + { + uiSubColBitPos++; + } + + pucWPStr += 2; // Next WP character + } + + if( bHebrewArabic) + { + if( testOneBit( pucSubColBuf, uiSubColBitPos)) + { + // Hebrew/Arabic can have trailing accents that + // don't have a matching collation value. + // Keep looping in this case. + // Note that subColBitPos isn't incremented above. + + uiNumChars = 0; // Set so we won't loop forever! + goto after_last_character; // process trailing bit + } + uiSubColBitPos++; // Eat the last '0' bit + } + + *puiSubColBitPos = uiSubColBitPos; + +Exit: + + return( rc); +} + +void FLMAPI F_CollIStream::getCurrPosition( + F_CollStreamPos * pPos) +{ + pPos->uNextChar = m_uNextChar; + pPos->ui64Position = m_pIStream->getCurrPosition(); +}