//------------------------------------------------------------------------- // Desc: FLAIM collation routines and tables // Tabs: 3 // // Copyright (c) 1990-2006 Novell, Inc. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of version 2 of the GNU General Public // License as published by the Free Software Foundation. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, contact Novell, Inc. // // To contact Novell about this file by physical or electronic mail, // you may find current contact information at www.novell.com // // $Id: kycollat.cpp 12313 2006-01-19 15:14:44 -0700 (Thu, 19 Jan 2006) dsanders $ //------------------------------------------------------------------------- #include "flaimsys.h" // Collating sequence defines #define COLLS 32 // first collating number (space/end of line) #define COLS1 (COLLS+9) // quotes #define COLS2 (COLS1+5) // parens #define COLS3 (COLS2+6) // money #define COLS4 (COLS3+6) // math ops #define COLS5 (COLS4+8) // math others #define COLS6 (COLS5+14) // others: %#&@\_|~ #define COLS7 (COLS6+13) // greek #define COLS8 (COLS7+25) // numbers #define COLS9 (COLS8+10) // alphabet #define COLS10 (COLS9+60) // cyrillic #define COLS10h (COLS9+42) // hebrew - writes over european and cyrilic #define COLS10a (COLS10h+28) // arabic - inclusive from 198(C6)- 252(FC) #define COLS11 253 // End of list - arabic goes to the end #define COLS0_ARABIC COLS11 // Set if arabic accent marking #define COLS0_HEBREW COLS11 // Set if hebrew accent marking #define COLSOEM 254 // OEM character in upper range - non-collatable #define COLS0_UNICODE 254 // Use this for UNICODE #define COLS0 255 // graphics/misc - chars without a collate value // State table information for double character sorting #define STATE1 1 #define STATE2 2 #define STATE3 3 #define STATE4 4 #define STATE5 5 #define STATE6 6 #define STATE7 7 #define STATE8 8 #define STATE9 9 #define STATE10 10 #define STATE11 11 #define AFTERC 12 #define AFTERH 13 #define AFTERL 14 #define INSTAE 15 #define INSTOE 16 #define INSTSG 17 #define INSTIJ 18 #define WITHAA 19 #define START_COL 12 #define START_ALL (START_COL + 1) // all US and European #define START_DK (START_COL + 2) // Danish #define START_IS (START_COL + 3) // Icelandic #define START_NO (START_COL + 4) // Norwegian #define START_SU (START_COL + 5) // Finnish #define START_SV (START_COL + 5) // Swedish #define START_YK (START_COL + 6) // Ukrain #define START_TK (START_COL + 7) // Turkish #define START_CZ (START_COL + 8) // Czech #define START_SL (START_COL + 8) // Slovak #define FIXUP_AREA_SIZE 24 #define ASCTBLLEN 95 #define MNTBLLEN 219 #define SYMTBLLEN 9 #define GRKTBLLEN 219 #define CYRLTBLLEN 200 #define HEBTBL1LEN 27 #define HEBTBL2LEN 35 #define AR1TBLLEN 158 #define AR2TBLLEN 179 #define COMPARE_COLLATION 1 #define COMPARE_COL_AND_SUBCOL 2 #define COMPARE_VALUE 3 #define NULL_SUB_COL_CHECK NULL #define NULL_CASE_CHECK NULL #define NULL_WILD_CARD_CHECK NULL #define MAX_SUBCOL_BUF 500 // (((MAX_KEY_SIZ / 4) * 3 + fluff #define MAX_LOWUP_BUF 150 // ((MAX_KEY_SIZ - (MAX_KEY_SIZ / 8)) / 8) * 2 #define SET_CASE_BIT 0x01 #define SET_KATAKANA_BIT 0x01 #define SET_WIDTH_BIT 0x02 #define COLS_ASIAN_MARK_VAL 0x40 #define COLS_ASIAN_MARKS 0x140 FSTATIC RCODE KYCmpKeyElmBld( FDB * pDb, IXD * pIxd, FLMUINT uiContainerNum, IFD * pIfd, FLMUINT uiAction, FLMUINT uiDrn, FLMBOOL * pbHadUniqueKeys, FLMUINT uiCdlEntry, FLMUINT uiCompoundPos, FLMBYTE * pKeyBuf, FLMUINT uiKeyLen, FLMBYTE * pLowUpBuf, FLMUINT uiLuLen, FlmRecord * pRecord, FLD_CONTEXT * pFldContext); FSTATIC RCODE KYFormatText( const FLMBYTE * psVal, FLMUINT uiSrcLen, FLMBOOL bMinSpaces, FLMBOOL bNoUnderscore, FLMBOOL bNoSpace, FLMBOOL bNoDash, FLMBOOL bEscChar, FLMBOOL bInputTruncated, FLMBYTE * psDestBuf, FLMUINT * puiDestLen); FSTATIC RCODE AsiaFlmTextToColStr( const FLMBYTE * Str, FLMUINT uiStrLen, FLMBYTE * ColStr, FLMUINT * puiColStrLenRV, FLMUINT uiUppercaseFlag, FLMUINT * puiCollationLen, FLMUINT * puiCaseLenRV, FLMUINT uiCharLimit, FLMBOOL bFirstSubstring, FLMBOOL * pbDataTruncated); FSTATIC FLMUINT16 flmTextGetSubCol( FLMUINT16 ui16WPValue, FLMUINT16 ui16ColValue, FLMUINT uiLangId); FSTATIC FLMINT flmTextCompareSingleChar( FLMBYTE ** ppLeftText, FLMUINT * puiLeftLen, FLMUINT * puiLeftWpChar2, FLMBYTE ** ppRightText, FLMUINT * puiRightLen, FLMUINT * puiRightWpChar2, FLMINT * piSubColCompare, FLMINT * piCaseCompare, FLMBOOL * pbHitWildCard, FLMINT iCompareType, FLMUINT16 * pui16ColVal, FLMUINT uiFlags, FLMUINT uiLangId); FSTATIC FLMUINT FWWSGetColStr( FLMBYTE * fColStr, FLMUINT * fcStrLenRV, FLMBYTE * wordStr, FLMUINT fWPLang, FLMBOOL * pbDataTruncated, FLMBOOL * pbFirstSubstring); FSTATIC FLMUINT FWWSCmbSubColBuf( FLMBYTE * wordStr, FLMUINT * wdStrLenRV, FLMBYTE * subColBuf, FLMBOOL hebrewArabicFlag); FSTATIC FLMUINT FWWSToMixed( FLMBYTE * wordStr, FLMUINT wdStrLen, FLMBYTE * lowUpBitStr, FLMUINT fWPLang); FSTATIC FLMUINT AsiaParseCase( FLMBYTE * WordStr, FLMUINT * uiWordStrLenRV, FLMBYTE * pCaseBits); FSTATIC RCODE FTextToColStr( const FLMBYTE * pucStr, FLMUINT uiStrLen, FLMBYTE * pucCollatedStr, FLMUINT * puiCollatedStrLen, FLMUINT uiUppercaseFlag, FLMUINT * puiCollationLen, FLMUINT * puiCaseLen, FLMUINT uiLanguage, FLMUINT uiCharLimit, FLMBOOL bFirstSubstring, FLMBOOL * pbOriginalCharsLost, FLMBOOL * pbDataTruncated); FSTATIC FLMUINT16 flmAsiaGetCollation( FLMUINT16 ui16WpChar, FLMUINT16 ui16NextWpChar, FLMUINT16 ui16PrevColValue, FLMUINT16 * pui16ColValue, FLMUINT16 * pui16SubColVal, FLMBYTE * pucCaseBits, FLMUINT16 uiUppercaseFlag); FSTATIC FLMUINT16 flmCheckDoubleCollation( FLMUINT16 * pui16WpChar, FLMBOOL * pbTwoIntoOne, const FLMBYTE ** ppucInputStr, FLMUINT uiLanguage); FSTATIC FLMUINT AsiaParseSubCol( FLMBYTE * WordStr, FLMUINT * puiWordStrLen, FLMBYTE * SubColBuf); FSTATIC FLMUINT FColStrToText( FLMBYTE * fColStr, FLMUINT * fcStrLenRV, FLMBYTE * textStr, FLMUINT fWPLang, FLMBYTE * postBuf, FLMUINT * postBytesRV, FLMBOOL * pbDataTruncated, FLMBOOL * pbFirstSubstring); FSTATIC FLMUINT16 ZenToHankaku( FLMUINT16 ui16WpChar, FLMUINT16 * DakutenOrHandakutenRV); FSTATIC FLMUINT16 HanToZenkaku( FLMUINT16 ui16WpChar, FLMUINT16 ui16NextWpChar, FLMUINT16 * pui16Zenkaku); /**************************************************************************** Desc: ****************************************************************************/ typedef struct BYTE_WORD_TBL { FLMBYTE ByteValue; FLMUINT16 WordValue; } BYTE_WORD_TBL; /**************************************************************************** Desc: ****************************************************************************/ typedef struct TBL_B_TO_BP { FLMBYTE key; FLMBYTE * charPtr; } TBL_B_TO_BP; /**************************************************************************** Desc: ****************************************************************************/ #define BYTES_IN_BITS( bits) \ ((bits + 7) >> 3) /**************************************************************************** Desc: ****************************************************************************/ #define TEST1BIT( buf, bPos) \ ((((buf)[ (bPos) >> 3]) >> (7 - ((bPos) & 7))) & 1) /**************************************************************************** Desc: ****************************************************************************/ #define GET1BIT( buf, bPos) \ ((((buf)[ (bPos) >> 3]) >> (7 - ((bPos) & 7))) & 1) /**************************************************************************** Desc: ****************************************************************************/ #define GETnBITS( n, bf, bit) \ (((unsigned int)( \ ((unsigned char)bf[ (bit) >> 3] << 8) \ | \ (unsigned char)bf[ ((bit) >> 3) + 1] \ ) >> (16 - (n) - ((bit) & 7)) \ ) & ((1 << (n)) - 1) \ ) /**************************************************************************** Desc: ****************************************************************************/ #define SET_BIT( buf, bPos) \ ((buf)[(bPos) >> 3] |= (FLMBYTE)((1 << (7 - ((bPos) & 7))))) /**************************************************************************** Desc: ****************************************************************************/ #define RESET_BIT( buf, bPos) \ ((buf)[(bPos) >> 3] &= (FLMBYTE)(~(1 << (7 - ((bPos) & 7))))) /**************************************************************************** Desc: ****************************************************************************/ #define SETnBITS( n, bf, bit, v) \ { (bf)[ (bit) >> 3] |= \ (FLMBYTE)(((v) << (8 - (n))) \ >> \ ((bit) & 7)); \ (bf)[ ((bit) >> 3) + 1] = \ (FLMBYTE)((v) \ << \ (16 - (n) - ((bit) & 7))); \ } /**************************************************************************** Desc: ****************************************************************************/ FLMUINT16 flmValuea[] = { STATE1, STATE3, STATE2, STATE2, STATE8, STATE8, STATE1, STATE3, STATE9, STATE10, // No longer in use STATE10, // No longer in use STATE4, STATE6, STATE6, STATE5, INSTAE, INSTOE, AFTERC, AFTERH, AFTERL, STATE7, STATE6, INSTSG, // ss for German INSTIJ, STATE11, // aa - no longer in use WITHAA, // aa - no longer in use START_CZ, // Czech START_DK, // Danish START_NO, // Norwegian START_SL, // Slovak START_TK, // Turkish START_SU, // Finnish START_IS, // Icelandic START_SV, // Swedish START_YK, // Ukrainian // Single character fixups COLS9, COLS9, COLS9, COLS9, // US and others COLS9+1, COLS9+1, COLS9+21, COLS9+21, COLS9+30, COLS9+30, COLS9+21, COLS9+21, COLS10+43, COLS10+43, COLS9+12, COLS9+12, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9+45, COLS9+45, COLS9+55, COLS9+55, // DANISH COLS9+42, COLS9+42, COLS9+53, COLS9+53, COLS9+30, COLS9+30, COLS9+49, COLS9+49, COLS10+43, COLS10+43, COLS9+12, COLS9+12, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9, COLS9, COLS9, COLS9, // Icelandic COLS9+46, COLS9+46, COLS9+50, COLS9+50, COLS9+30, COLS9+30, COLS9+54, COLS9+54, COLS10+43, COLS10+43, COLS9+12, COLS9+12, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9, COLS9, COLS9+51, COLS9+51, // Norwegian COLS9+43, COLS9+43, COLS9+21, COLS9+21, COLS9+30, COLS9+30, COLS9+47, COLS9+47, COLS10+43, COLS10+43, COLS9+12, COLS9+12, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9+48, COLS9+48, COLS9+44, COLS9+44, // Finnish/Swedish COLS9+1, COLS9+1, COLS9+52, COLS9+52, COLS9+30, COLS9+30, COLS9+21, COLS9+21, COLS10+43, COLS10+43, COLS9+12, COLS9+12, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9, COLS9, COLS9, COLS9, // Ukrain COLS9+1, COLS9+1, COLS9+21, COLS9+21, COLS9+30, COLS9+30, COLS9+21, COLS9+21, COLS10+48, COLS10+48, COLS9+12, COLS9+12, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9, COLS9, COLS9, COLS9, // Turkish COLS9+1, COLS9+1, COLS9+21, COLS9+21, COLS9+30, COLS9+30, COLS9+21, COLS9+21, COLS9+43, COLS9+43, COLS9+11, COLS9+11, COLS9+3, COLS9+3, COLS9+25, COLS9+25, COLS9+27, COLS9+27, COLS9+35, COLS9+35, COLS9, COLS9, COLS9, COLS9, // Czech / Slovak COLS9+1, COLS9+1, COLS9+21, COLS9+21, COLS9+30, COLS9+30, COLS9+21, COLS9+21, COLS10+43, COLS10+43, COLS9+12, COLS9+12, COLS9+5, COLS9+5, COLS9+26, COLS9+26, COLS9+28, COLS9+28, COLS9+36, COLS9+36 }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmAsc60Tbl[ ASCTBLLEN + 2] = { 0x20, // Initial character offset ASCTBLLEN, // Length of this table COLLS, // COLLS+5, // ! COLS1, // " COLS6+1, // # COLS3, // $ COLS6, // % COLS6+2, // & COLS1+1, // ' COLS2, // ( COLS2+1, // ) COLS4+2, // * COLS4, // + COLLS+2, // , COLS4+1, // - COLLS+1, // . COLS4+3, // / COLS8, // 0 COLS8+1, // 1 COLS8+2, // 2 COLS8+3, // 3 COLS8+4, // 4 COLS8+5, // 5 COLS8+6, // 6 COLS8+7, // 7 COLS8+8, // 8 COLS8+9, // 9 COLLS+3, // : COLLS+4, // ; COLS5, // < COLS5+2, // = COLS5+4, // > COLLS+7, // ? COLS6+3, // @ COLS9, // A COLS9+2, // B COLS9+3, // C COLS9+6, // D COLS9+7, // E COLS9+8, // F COLS9+9, // G COLS9+10, // H COLS9+12, // I COLS9+14, // J COLS9+15, // K COLS9+16, // L COLS9+18, // M COLS9+19, // N COLS9+21, // O COLS9+23, // P COLS9+24, // Q COLS9+25, // R COLS9+27, // S COLS9+29, // T COLS9+30, // U COLS9+31, // V COLS9+32, // W COLS9+33, // X COLS9+34, // Y COLS9+35, // Z COLS9+40, // [ COLS6+4, // backslash COLS9+41, // ] COLS4+4, // ^ COLS6+5, // _ COLS1+2, // ` COLS9, // a COLS9+2, // b COLS9+3, // c COLS9+6, // d COLS9+7, // e COLS9+8, // f COLS9+9, // g COLS9+10, // h COLS9+12, // i COLS9+14, // j COLS9+15, // k COLS9+16, // l COLS9+18, // m COLS9+19, // n COLS9+21, // o COLS9+23, // p COLS9+24, // q COLS9+25, // r COLS9+27, // s COLS9+29, // t COLS9+30, // u COLS9+31, // v COLS9+32, // w COLS9+33, // x COLS9+34, // y COLS9+35, // z COLS2+4, // { COLS6+6, // | COLS2+5, // } COLS6+7 // ~ }; /**************************************************************************** Desc: Multinational table ****************************************************************************/ FLMBYTE flmMn60Tbl[ MNTBLLEN + 2] = { 23, // Initial character offset MNTBLLEN, // Length of this table COLS9+27, // German Double s COLS9+15, // Icelandic k COLS9+14, // Dotless j // IBM Charset COLS9, // A Acute COLS9, // a Acute COLS9, // A Circumflex COLS9, // a Circumflex COLS9, // A Diaeresis or Umlaut COLS9, // a Diaeresis or Umlaut COLS9, // A Grave COLS9, // a Grave COLS9, // A Ring COLS9, // a Ring COLS9+1, // AE digraph COLS9+1, // ae digraph COLS9+3, // C Cedilla COLS9+3, // c Cedilla COLS9+7, // E Acute COLS9+7, // e Acute COLS9+7, // E Circumflex COLS9+7, // e Circumflex COLS9+7, // E Diaeresis or Umlaut COLS9+7, // e Diaeresis or Umlaut COLS9+7, // E Grave COLS9+7, // e Grave COLS9+12, // I Acute COLS9+12, // i Acute COLS9+12, // I Circumflex COLS9+12, // i Circumflex COLS9+12, // I Diaeresis or Umlaut COLS9+12, // i Diaeresis or Umlaut COLS9+12, // I Grave COLS9+12, // i Grave COLS9+20, // N Tilde COLS9+20, // n Tilde COLS9+21, // O Acute COLS9+21, // o Acute COLS9+21, // O Circumflex COLS9+21, // o Circumflex COLS9+21, // O Diaeresis or Umlaut COLS9+21, // o Diaeresis or Umlaut COLS9+21, // O Grave COLS9+21, // o Grave COLS9+30, // U Acute COLS9+30, // u Acute COLS9+30, // U Circumflex COLS9+30, // u Circumflex COLS9+30, // U Diaeresis or Umlaut COLS9+30, // u Diaeresis or Umlaut COLS9+30, // U Grave COLS9+30, // u Grave COLS9+34, // Y Diaeresis or Umlaut COLS9+34, // y Diaeresis or Umlaut // IBM foreign COLS9, // A Tilde COLS9, // a Tilde COLS9+6, // D Cross Bar COLS9+6, // d Cross Bar COLS9+21, // O Slash COLS9+21, // o Slash COLS9+21, // O Tilde COLS9+21, // o Tilde COLS9+34, // Y Acute COLS9+34, // y Acute COLS9+6, // Uppercase Eth COLS9+6, // Lowercase Eth COLS9+37, // Uppercase Thorn COLS9+37, // Lowercase Thorn // Teletex chars COLS9, // A Breve COLS9, // a Breve COLS9, // A Macron COLS9, // a Macron COLS9, // A Ogonek COLS9, // a Ogonek COLS9+3, // C Acute COLS9+3, // c Acute COLS9+3, // C Caron or Hachek COLS9+3, // c Caron or Hachek COLS9+3, // C Circumflex COLS9+3, // c Circumflex COLS9+3, // C Dot Above COLS9+3, // c Dot Above COLS9+6, // D Caron or Hachek (Apostrophe Beside) COLS9+6, // d Caron or Hachek (Apostrophe Beside) COLS9+7, // E Caron or Hachek COLS9+7, // e Caron or Hachek COLS9+7, // E Dot Above COLS9+7, // e Dot Above COLS9+7, // E Macron COLS9+7, // e Macron COLS9+7, // E Ogonek COLS9+7, // e Ogonek COLS9+9, // G Acute COLS9+9, // g Acute COLS9+9, // G Breve COLS9+9, // g Breve COLS9+9, // G Caron or Hachek COLS9+9, // g Caron or Hachek COLS9+9, // G Cedilla (Apostrophe Under) COLS9+9, // g Cedilla (Apostrophe Over) COLS9+9, // G Circumflex COLS9+9, // g Circumflex COLS9+9, // G Dot Above COLS9+9, // g Dot Above COLS9+10, // H Circumflex COLS9+10, // h Circumflex COLS9+10, // H Cross Bar COLS9+10, // h Cross Bar COLS9+12, // I Dot Above (Sharp Accent) COLS9+12, // i Dot Above (Sharp Accent) COLS9+12, // I Macron COLS9+12, // i Macron COLS9+12, // I Ogonek COLS9+12, // i Ogonek COLS9+12, // I Tilde COLS9+12, // i Tilde COLS9+13, // IJ Digraph COLS9+13, // ij Digraph COLS9+14, // J Circumflex COLS9+14, // j Circumflex COLS9+15, // K Cedilla (Apostrophe Under) COLS9+15, // k Cedilla (Apostrophe Under) COLS9+16, // L Acute COLS9+16, // l Acute COLS9+16, // L Caron or Hachek (Apostrophe Beside) COLS9+16, // l Caron or Hachek (Apostrophe Beside) COLS9+16, // L Cedilla (Apostrophe Under) COLS9+16, // l Cedilla (Apostrophe Under) COLS9+16, // L Center Dot COLS9+16, // l Center Dot COLS9+16, // L Stroke COLS9+16, // l Stroke COLS9+19, // N Acute COLS9+19, // n Acute COLS9+19, // N Apostrophe COLS9+19, // n Apostrophe COLS9+19, // N Caron or Hachek COLS9+19, // n Caron or Hachek COLS9+19, // N Cedilla (Apostrophe Under) COLS9+19, // n Cedilla (Apostrophe Under) COLS9+21, // O Double Acute COLS9+21, // o Double Acute COLS9+21, // O Macron COLS9+21, // o Macron COLS9+22, // OE digraph COLS9+22, // oe digraph COLS9+25, // R Acute COLS9+25, // r Acute COLS9+25, // R Caron or Hachek COLS9+25, // r Caron or Hachek COLS9+25, // R Cedilla (Apostrophe Under) COLS9+25, // r Cedilla (Apostrophe Under) COLS9+27, // S Acute COLS9+27, // s Acute COLS9+27, // S Caron or Hachek COLS9+27, // s Caron or Hachek COLS9+27, // S Cedilla COLS9+27, // s Cedilla COLS9+27, // S Circumflex COLS9+27, // s Circumflex COLS9+29, // T Caron or Hachek (Apostrophe Beside) COLS9+29, // t Caron or Hachek (Apostrophe Beside) COLS9+29, // T Cedilla (Apostrophe Under) COLS9+29, // t Cedilla (Apostrophe Under) COLS9+29, // T Cross Bar COLS9+29, // t Cross Bar COLS9+30, // U Breve COLS9+30, // u Breve COLS9+30, // U Double Acute COLS9+30, // u Double Acute COLS9+30, // U Macron COLS9+30, // u Macron COLS9+30, // U Ogonek COLS9+30, // u Ogonek COLS9+30, // U Ring COLS9+30, // u Ring COLS9+30, // U Tilde COLS9+30, // u Tilde COLS9+32, // W Circumflex COLS9+32, // w Circumflex COLS9+34, // Y Circumflex COLS9+34, // y Circumflex COLS9+35, // Z Acute COLS9+35, // z Acute COLS9+35, // Z Caron or Hachek COLS9+35, // z Caron or Hachek COLS9+35, // Z Dot Above COLS9+35, // z Dot Above COLS9+19, // Uppercase Eng COLS9+19, // Lowercase Eng // other COLS9+6, // D Macron COLS9+6, // d Macron COLS9+16, // L Macron COLS9+16, // l Macron COLS9+19, // N Macron COLS9+19, // n Macron COLS9+25, // R Grave COLS9+25, // r Grave COLS9+27, // S Macron COLS9+27, // s Macron COLS9+29, // T Macron COLS9+29, // t Macron COLS9+34, // Y Breve COLS9+34, // y Breve COLS9+34, // Y Grave COLS9+34, // y Grave COLS9+6, // D Apostrophe Beside COLS9+6, // d Apostrophe Beside COLS9+21, // O Apostrophe Beside COLS9+21, // o Apostrophe Beside COLS9+30, // U Apostrophe Beside COLS9+30, // u Apostrophe Beside COLS9+7, // E breve COLS9+7, // e breve COLS9+12, // I breve COLS9+12, // i breve COLS9+12, // dotless I COLS9+12, // dotless i COLS9+21, // O breve COLS9+21 // o breve }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmSym60Tbl[ SYMTBLLEN + 2] = { 11, // Initial character offset SYMTBLLEN, // Length of this table COLS3+2, // pound COLS3+3, // yen COLS3+4, // pacetes COLS3+5, // floren COLS0, COLS0, COLS0, COLS0, COLS3+1, // cent }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmGrk60Tbl[ GRKTBLLEN + 2] = { 0, // Starting offset GRKTBLLEN, // Length COLS7, // Uppercase Alpha COLS7, // Lowercase Alpha COLS7+1, // Uppercase Beta COLS7+1, // Lowercase Beta COLS7+1, // Uppercase Beta Medial COLS7+1, // Lowercase Beta Medial COLS7+2, // Uppercase Gamma COLS7+2, // Lowercase Gamma COLS7+3, // Uppercase Delta COLS7+3, // Lowercase Delta COLS7+4, // Uppercase Epsilon COLS7+4, // Lowercase Epsilon COLS7+5, // Uppercase Zeta COLS7+5, // Lowercase Zeta COLS7+6, // Uppercase Eta COLS7+6, // Lowercase Eta COLS7+7, // Uppercase Theta COLS7+7, // Lowercase Theta COLS7+8, // Uppercase Iota COLS7+8, // Lowercase Iota COLS7+9, // Uppercase Kappa COLS7+9, // Lowercase Kappa COLS7+10, // Uppercase Lambda COLS7+10, // Lowercase Lambda COLS7+11, // Uppercase Mu COLS7+11, // Lowercase Mu COLS7+12, // Uppercase Nu COLS7+12, // Lowercase Nu COLS7+13, // Uppercase Xi COLS7+13, // Lowercase Xi COLS7+14, // Uppercase Omicron COLS7+14, // Lowercase Omicron COLS7+15, // Uppercase Pi COLS7+15, // Lowercase Pi COLS7+16, // Uppercase Rho COLS7+16, // Lowercase Rho COLS7+17, // Uppercase Sigma COLS7+17, // Lowercase Sigma COLS7+17, // Uppercase Sigma Terminal COLS7+17, // Lowercase Sigma Terminal COLS7+18, // Uppercase Tau COLS7+18, // Lowercase Tau COLS7+19, // Uppercase Upsilon COLS7+19, // Lowercase Upsilon COLS7+20, // Uppercase Phi COLS7+20, // Lowercase Phi COLS7+21, // Uppercase Chi COLS7+21, // Lowercase Chi COLS7+22, // Uppercase Psi COLS7+22, // Lowercase Psi COLS7+23, // Uppercase Omega COLS7+23, // Lowercase Omega // Other Modern Greek Characters [8,52] COLS7, // Uppercase ALPHA Tonos high prime COLS7, // Lowercase Alpha Tonos - acute COLS7+4, // Uppercase EPSILON Tonos - high prime COLS7+4, // Lowercase Epslion Tonos - acute COLS7+6, // Uppercase ETA Tonos - high prime COLS7+6, // Lowercase Eta Tonos - acute COLS7+8, // Uppercase IOTA Tonos - high prime COLS7+8, // Lowercase iota Tonos - acute COLS7+8, // Uppercase IOTA Diaeresis COLS7+8, // Lowercase iota diaeresis COLS7+14, // Uppercase OMICRON Tonos - high prime COLS7+14, // Lowercase Omicron Tonos - acute COLS7+19, // Uppercase UPSILON Tonos - high prime COLS7+19, // Lowercase Upsilon Tonos - acute COLS7+19, // Uppercase UPSILON Diaeresis COLS7+19, // Lowercase Upsilon diaeresis COLS7+23, // Uppercase OMEGA Tonos - high prime COLS7+23, // Lowercase Omega Tonso - acute // Variants [8,70] COLS7+4, // epsilon (variant) COLS7+7, // theta (variant) COLS7+9, // kappa (variant) COLS7+15, // pi (variant) COLS7+16, // rho (variant) COLS7+17, // sigma (variant) COLS7+19, // upsilon (variant) COLS7+20, // phi (variant) COLS7+23, // omega (variant) // Greek Diacritic marks [8,79] COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, COLS0, // 8,108 end of diacritic marks // Ancient Greek [8,109] COLS7, // alpha grave COLS7, // alpha circumflex COLS7, // alpha w/iota COLS7, // alpha acute w/iota COLS7, // alpha grave w/iota COLS7, // alpha circumflex w/Iota COLS7, // alpha smooth COLS7, // alpha smooth acute COLS7, // alpha smooth grave COLS7, // alpha smooth circumflex COLS7, // alpha smooth w/Iota COLS7, // alpha smooth acute w/Iota COLS7, // alpha smooth grave w/Iota COLS7, // alpha smooth circumflex w/Iota COLS7, // alpha rough COLS7, // alpha rough acute COLS7, // alpha rough grave COLS7, // alpha rough circumflex COLS7, // alpha rough w/Iota COLS7, // alpha rough acute w/Iota COLS7, // alpha rough grave w/Iota COLS7, // alpha rough circumflex w/Iota COLS7+4, // epsilon grave COLS7+4, // epsilon smooth COLS7+4, // epsilon smooth acute COLS7+4, // epsilon smooth grave COLS7+4, // epsilon rough COLS7+4, // epsilon rough acute COLS7+4, // epsilon rough grave COLS7+6, // eta grave COLS7+6, // eta circumflex COLS7+6, // eta w/iota COLS7+6, // eta acute w/iota COLS7+6, // eta grave w/Iota COLS7+6, // eta circumflex w/Iota COLS7+6, // eta smooth COLS7+6, // eta smooth acute COLS7+6, // eta smooth grave COLS7+6, // eta smooth circumflex COLS7+6, // eta smooth w/Iota COLS7+6, // eta smooth acute w/Iota COLS7+6, // eta smooth grave w/Iota COLS7+6, // eta smooth circumflex w/Iota COLS7+6, // eta rough COLS7+6, // eta rough acute COLS7+6, // eta rough grave COLS7+6, // eta rough circumflex COLS7+6, // eta rough w/Iota COLS7+6, // eta rough acute w/Iota COLS7+6, // eta rough grave w/Iota COLS7+6, // eta rough circumflex w/Iota COLS7+8, // iota grave COLS7+8, // iota circumflex COLS7+8, // iota acute diaeresis COLS7+8, // iota grave diaeresis COLS7+8, // iota smooth COLS7+8, // iota smooth acute COLS7+8, // iota smooth grave COLS7+8, // iota smooth circumflex COLS7+8, // iota rough COLS7+8, // iota rough acute COLS7+8, // iota rough grave COLS7+8, // iota rough circumflex COLS7+14, // omicron grave COLS7+14, // omicron smooth COLS7+14, // omicron smooth acute COLS7+14, // omicron smooth grave COLS7+14, // omicron rough COLS7+14, // omicron rough acute COLS7+14, // omicron rough grave COLS7+16, // rho smooth COLS7+16, // rho rough COLS7+19, // upsilon grave COLS7+19, // upsilon circumflex COLS7+19, // upsilon acute diaeresis COLS7+19, // upsilon grave diaeresis COLS7+19, // upsilon smooth COLS7+19, // upsilon smooth acute COLS7+19, // upsilon smooth grave COLS7+19, // upsilon smooth circumflex COLS7+19, // upsilon rough COLS7+19, // upsilon rough acute COLS7+19, // upsilon rough grave COLS7+19, // upsilon rough circumflex COLS7+23, // omega grave COLS7+23, // omega circumflex COLS7+23, // omega w/Iota COLS7+23, // omega acute w/Iota COLS7+23, // omega grave w/Iota COLS7+23, // omega circumflex w/Iota COLS7+23, // omega smooth COLS7+23, // omega smooth acute COLS7+23, // omega smooth grave COLS7+23, // omega smooth circumflex COLS7+23, // omega smooth w/Iota COLS7+23, // omega smooth acute w/Iota COLS7+23, // omega smooth grave w/Iota COLS7+23, // omega smooth circumflex w/Iota COLS7+23, // omega rough COLS7+23, // omega rough acute COLS7+23, // omega rough grave COLS7+23, // omega rough circumflex COLS7+23, // omega rough w/Iota COLS7+23, // omega rough acute w/Iota COLS7+23, // omega rough grave w/Iota COLS7+23, // omega rough circumflex w/Iota COLS7+24, // Uppercase Stigma--the number 6 COLS7+24, // Uppercase Digamma--Obsolete letter used as 6 COLS7+24, // Uppercase Koppa--Obsolete letter used as 90 COLS7+24 // Uppercase Sampi--Obsolete letter used as 900 }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmCyrl60Tbl[ CYRLTBLLEN + 2] = { 0, // Starting offset CYRLTBLLEN, // Length of table COLS10, // Russian uppercase A COLS10, // Russian lowercase A COLS10+1, // Russian uppercase BE COLS10+1, // Russian lowercase BE COLS10+2, // Russian uppercase VE COLS10+2, // Russian lowercase VE COLS10+3, // Russian uppercase GHE COLS10+3, // Russian lowercase GHE COLS10+5, // Russian uppercase DE COLS10+5, // Russian lowercase DE COLS10+8, // Russian uppercase E COLS10+8, // Russian lowercase E COLS10+9, // Russian lowercase YO COLS10+9, // Russian lowercase YO COLS10+11, // Russian uppercase ZHE COLS10+11, // Russian lowercase ZHE COLS10+12, // Russian uppercase ZE COLS10+12, // Russian lowercase ZE COLS10+14, // Russian uppercase I COLS10+14, // Russian lowercase I COLS10+17, // Russian uppercase SHORT I COLS10+17, // Russian lowercase SHORT I COLS10+19, // Russian uppercase KA COLS10+19, // Russian lowercase KA COLS10+20, // Russian uppercase EL COLS10+20, // Russian lowercase EL COLS10+22, // Russian uppercase EM COLS10+22, // Russian lowercase EM COLS10+23, // Russian uppercase EN COLS10+23, // Russian lowercase EN COLS10+25, // Russian uppercase O COLS10+25, // Russian lowercase O COLS10+26, // Russian uppercase PE COLS10+26, // Russian lowercase PE COLS10+27, // Russian uppercase ER COLS10+27, // Russian lowercase ER COLS10+28, // Russian uppercase ES COLS10+28, // Russian lowercase ES COLS10+29, // Russian uppercase TE COLS10+29, // Russian lowercase TE COLS10+32, // Russian uppercase U COLS10+32, // Russian lowercase U COLS10+34, // Russian uppercase EF COLS10+34, // Russian lowercase EF COLS10+35, // Russian uppercase HA COLS10+35, // Russian lowercase HA COLS10+36, // Russian uppercase TSE COLS10+36, // Russian lowercase TSE COLS10+37, // Russian uppercase CHE COLS10+37, // Russian lowercase CHE COLS10+39, // Russian uppercase SHA COLS10+39, // Russian lowercase SHA COLS10+40, // Russian uppercase SHCHA COLS10+40, // Russian lowercase SHCHA COLS10+41, // Russian lowercase ER (also hard sign) COLS10+41, // Russian lowercase ER (also hard sign) COLS10+42, // Russian lowercase ERY COLS10+42, // Russian lowercase ERY COLS10+43, // Russian lowercase SOFT SIGN COLS10+43, // Russian lowercase SOFT SIGN COLS10+45, // Russian uppercase REVERSE E COLS10+45, // Russian lowercase REVERSE E COLS10+46, // Russian uppercase YU COLS10+46, // Russian lowercase yu COLS10+47, // Russian uppercase YA COLS10+47, // Russian lowercase ya COLS0, // Russian uppercase EH COLS0, // Russian lowercase eh COLS10+7, // Macedonian uppercase SOFT DJ COLS10+7, // Macedonian lowercase soft dj COLS10+4, // Ukrainian uppercase HARD G COLS10+4, // Ukrainian lowercase hard g COLS0, // GE bar COLS0, // ge bar COLS10+6, // Serbian uppercase SOFT DJ COLS10+6, // Serbian lowercase SOFT DJ COLS0, // IE (variant) COLS0, // ie (variant) COLS10+10, // Ukrainian uppercase YE COLS10+10, // Ukrainian lowercase YE COLS0, // ZHE with right descender COLS0, // zhe with right descender COLS10+13, // Macedonian uppercase ZELO COLS10+13, // Macedonian lowercase ZELO COLS0, // Old Slovanic uppercase Z COLS0, // Old Slovanic uppercase z COLS0, // II with macron COLS0, // ii with mscron COLS10+15, // Ukrainian uppercase I COLS10+15, // Ukrainian lowercase I COLS10+16, // Ukrainian uppercase I with Two Dots COLS10+16, // Ukrainian lowercase I with Two Dots COLS0, // Old Slovanic uppercase I ligature COLS0, // Old Slovanic lowercase I ligature COLS10+18, // Serbian--Macedonian uppercase JE COLS10+18, // Serbian--Macedonian lowercase JE COLS10+31, // Macedonian uppercase SOFT K COLS10+31, // Macedonian lowercase SOFT K COLS0, // KA with right descender COLS0, // ka with right descender COLS0, // KA ogonek COLS0, // ka ogonek COLS0, // KA vertical bar COLS0, // ka vertical bar COLS10+21, // Serbian--Macedonian uppercase SOFT L COLS10+21, // Serbian--Macedonian lowercase SOFT L COLS0, // EN with right descender COLS0, // en with right descender COLS10+24, // Serbian--Macedonian uppercase SOFT N COLS10+24, // Serbian--Macedonian lowercase SOFT N COLS0, // ROUND OMEGA COLS0, // round omega COLS0, // OMEGA COLS0, // omega COLS10+30, // Serbian uppercase SOFT T COLS10+30, // Serbian lowercase SOFT T COLS10+33, // Byelorussian uppercase SHORT U COLS10+33, // Byelorussian lowercase SHORT U COLS0, // U with macron COLS0, // u with macron COLS0, // STRAIGHT U COLS0, // straight u COLS0, // STRAIGHT U bar COLS0, // straight u bar COLS0, // OU ligature COLS0, // ou ligature COLS0, // KHA with right descender COLS0, // kha with right descender COLS0, // KHA ogonek COLS0, // kha ogonek COLS0, // H COLS0, // h COLS0, // OMEGA titlo COLS0, // omega titlo COLS10+38, // Serbian uppercase HARD DJ COLS10+38, // Serbian lowercase HARD DJ COLS0, // CHE with right descender COLS0, // che with right descender COLS0, // CHE vertical bar COLS0, // che vertical bar COLS0, // Old Slavonic SHCHA (variant) COLS0, // old SLAVONIC shcha (variant) COLS10+44, // Old Russian uppercase YAT COLS10+44, // Old Russian lowercase YAT // END OF UNIQUE COLLATED BYTES // CHARACTERS BELOW MUST HAVE HAVE THEIR OWN // SUB-COLLATION VALUE TO COMPARE CORRECTLY. COLS0, // Old Bulgarian uppercase YUS COLS0, // Old Bulgarian lowercase YUS COLS0, // Old Slovanic uppercase YUS MALYI COLS0, // Old Slovanic uppercase YUS MALYI COLS0, // KSI COLS0, // ksi COLS0, // PSI COLS0, // psi COLS0, // Old Russian uppercase FITA COLS0, // Old Russian lowercase FITA COLS0, // Old Russian uppercase IZHITSA COLS0, // Old Russian lowercase IZHITSA COLS0, // Russian uppercase A acute COLS0, // Russian lowercase A acute COLS10+8, // Russian uppercase E acute COLS10+8, // Russian lowercase E acute COLS0, // E acute COLS0, // e acute COLS10+14, // II acute COLS10+14, // ii acute COLS0, // I acute COLS0, // i acute COLS0, // YI acute COLS0, // yi acute COLS10+25, // O acute COLS10+25, // o acute COLS10+32, // U acute COLS10+32, // u acute COLS10+42, // YERI acute COLS10+42, // YERI acute COLS10+45, // REVERSED E acute COLS10+45, // reversed e acute COLS10+46, // YU acute COLS10+46, // yu acute COLS10+47, // YA acute COLS10+47, // ya acute COLS10, // A grave COLS10, // a grave COLS10+8, // E grave COLS10+8, // e grave COLS10+9, // YO grave COLS10+9, // yo grave COLS10+14, // I grave COLS10+14, // i grave COLS10+25, // O grave COLS10+25, // o grave COLS10+32, // U grave COLS10+32, // u grave COLS10+42, // YERI grave COLS10+42, // yeri grave COLS10+45, // REVERSED E grave COLS10+45, // reversed e grave COLS10+46, // IU (YU) grave COLS10+46, // iu (yu) grave COLS10+47, // ia (YA) grave COLS10+47, // ia (ya) grave ******* [10,199] }; /**************************************************************************** Desc: The Hebrew characters are collated over the Russian characters. Therefore sorting both Hebrew and Russian is impossible to do. ****************************************************************************/ FLMBYTE flmHeb60TblA[ HEBTBL1LEN + 2] = { 0, // Starting offset HEBTBL1LEN, // Length of table COLS10h+0, // Alef COLS10h+1, // Bet COLS10h+2, // Gimel COLS10h+3, // Dalet COLS10h+4, // He COLS10h+5, // Vav COLS10h+6, // Zayin COLS10h+7, // Het COLS10h+8, // Tet COLS10h+9, // Yod COLS10h+10, // Kaf (final) [9,10] COLS10h+11, // Kaf COLS10h+12, // Lamed COLS10h+13, // Mem (final) COLS10h+14, // Mem COLS10h+15, // Nun (final) COLS10h+16, // Nun COLS10h+17, // Samekh COLS10h+18, // Ayin COLS10h+19, // Pe (final) COLS10h+20, // Pe [9,20] COLS10h+21, // Tsadi (final) COLS10h+22, // Tsadi COLS10h+23, // Qof COLS10h+24, // Resh COLS10h+25, // Shin COLS10h+26 // Tav [9,26] }; /**************************************************************************** Desc: This is the ANCIENT HEBREW SCRIPT piece. The actual value will be stored in the subcollation. This way we don't play diacritic/subcollation games. ****************************************************************************/ FLMBYTE flmHeb60TblB[ HEBTBL2LEN + 2] = { 84, HEBTBL2LEN, COLS10h+0, // Alef Dagesh [9,84] COLS10h+1, // Bet Dagesh COLS10h+1, // Vez - looks like a bet COLS10h+2, // Gimel Dagesh COLS10h+3, // Dalet Dagesh COLS10h+4, // He Dagesh COLS10h+5, // Vav Dagesh [9,90] COLS10h+5, // Vav Holem COLS10h+6, // Zayin Dagesh COLS10h+7, // Het Dagesh COLS10h+8, // Tet Dagesh COLS10h+9, // Yod Dagesh COLS10h+9, // Yod Hiriq [9,96] - not on my list COLS10h+11, // Kaf Dagesh COLS10h+10, // Kaf Dagesh (final) COLS10h+10, // Kaf Sheva (final) COLS10h+10, // Kaf Tsere (final) [9,100] COLS10h+10, // Kaf Segol (final) COLS10h+10, // Kaf Patah (final) COLS10h+10, // Kaf Qamats (final) COLS10h+10, // Kaf Dagesh Qamats (final) COLS10h+12, // Lamed Dagesh COLS10h+14, // Mem Dagesh COLS10h+16, // Nun Dagesh COLS10h+15, // Nun Qamats (final) COLS10h+17, // Samekh Dagesh COLS10h+20, // Pe Dagesh [9,110] COLS10h+20, // Fe - just guessing this is like Pe - was +21 COLS10h+22, // Tsadi Dagesh COLS10h+23, // Qof Dagesh COLS10h+25, // Sin (with sin dot) COLS10h+25, // Sin Dagesh (with sin dot) COLS10h+25, // Shin COLS10h+25, // Shin Dagesh COLS10h+26 // Tav Dagesh [9,118] }; /**************************************************************************** Desc: The Arabic characters are collated OVER the Russian characters Therefore sorting both Arabic and Russian in the same database is not supported. Arabic starts with a bunch of accents/diacritic marks that are Actually placed OVER a preceeding character. These accents are ignored while sorting the first pass - when collation == COLS0. There are 4 possible states for all/most arabic characters: ÚÙ - occurs as the only character in a word ÄÙ - appears at the first of the word ÄÄ - appears at the middle of a word ÚÄ - appears at the end of the word Usually only the simple version of the letter is stored. Therefore we should not have to worry about sub-collation of these characters. The arabic characters with diacritics differ however. The alef has sub-collation values to sort correctly. There is not any more room to add more collation values. Some chars in CS14 are combined when urdu, pashto and sindhi characters overlap. ****************************************************************************/ FLMBYTE flmAr160Tbl[ AR1TBLLEN + 2] = { 38, // Starting offset AR1TBLLEN, // Length of table COLLS+2, // , comma COLLS+3, // : colon COLLS+7, // ? question mark COLS4+2, // * asterick COLS6, // % percent COLS9+41, // >> alphabetic - end of list) COLS9+40, // << alphabetic - end of list) COLS2, // ( COLS2+1, // ) COLS8+1, // ÚÙ One COLS8+2, // ÚÙ Two COLS8+3, // ÚÙ Three COLS8+4, // ÚÙ Four COLS8+5, // ÚÙ Five COLS8+6, // ÚÙ Six COLS8+7, // ÚÙ Seven COLS8+8, // ÚÙ Eight COLS8+9, // ÚÙ Nine COLS8+0, // ÚÙ Zero COLS8+2, // ÚÙ Two (Handwritten) COLS10a+1, // ÚÙ alif COLS10a+1, // ÚÄ alif COLS10a+2, // ÚÙ ba COLS10a+2, // ÄÙ ba COLS10a+2, // ÄÄ ba COLS10a+2, // ÚÄ ba COLS10a+6, // ÚÙ ta COLS10a+6, // ÄÙ ta COLS10a+6, // ÄÄ ta COLS10a+6, // ÚÄ ta COLS10a+8, // ÚÙ tha COLS10a+8, // ÄÙ tha COLS10a+8, // ÄÄ tha COLS10a+8, // ÚÄ tha COLS10a+12, // ÚÙ jiim COLS10a+12, // ÄÙ jiim COLS10a+12, // ÄÄ jiim COLS10a+12, // ÚÄ jiim COLS10a+16, // ÚÙ Ha COLS10a+16, // ÄÙ Ha COLS10a+16, // ÄÄ Ha COLS10a+16, // ÚÄ Ha COLS10a+17, // ÚÙ kha COLS10a+17, // ÄÙ kha COLS10a+17, // ÄÄ kha COLS10a+17, // ÚÄ kha COLS10a+20, // ÚÙ dal COLS10a+20, // ÚÄ dal COLS10a+22, // ÚÙ dhal COLS10a+22, // ÚÄ dhal COLS10a+27, // ÚÙ ra COLS10a+27, // ÚÄ ra COLS10a+29, // ÚÙ ziin COLS10a+29, // ÚÄ ziin COLS10a+31, // ÚÙ siin COLS10a+31, // ÄÙ siin COLS10a+31, // ÄÄ siin COLS10a+31, // ÚÄ siin COLS10a+32, // ÚÙ shiin COLS10a+32, // ÄÙ shiin COLS10a+32, // ÄÄ shiin COLS10a+32, // ÚÄ shiin COLS10a+34, // ÚÙ Sad COLS10a+34, // ÄÙ Sad COLS10a+34, // ÄÄ Sad COLS10a+34, // ÚÄ Sad COLS10a+35, // ÚÙ Dad COLS10a+35, // ÄÙ Dad COLS10a+35, // ÄÄ Dad COLS10a+35, // ÚÄ Dad COLS10a+36, // ÚÙ Ta COLS10a+36, // ÄÙ Ta COLS10a+36, // ÄÄ Ta COLS10a+36, // ÚÄ Ta COLS10a+37, // ÚÙ Za COLS10a+37, // ÄÙ Za COLS10a+37, // ÄÄ Za COLS10a+37, // ÚÄ Za COLS10a+38, // ÚÙ 'ain COLS10a+38, // ÄÙ 'ain COLS10a+38, // ÄÄ 'ain COLS10a+38, // ÚÄ 'ain COLS10a+39, // ÚÙ ghain COLS10a+39, // ÄÙ ghain COLS10a+39, // ÄÄ ghain COLS10a+39, // ÚÄ ghain COLS10a+40, // ÚÙ fa COLS10a+40, // ÄÙ fa COLS10a+40, // ÄÄ fa COLS10a+40, // ÚÄ fa COLS10a+42, // ÚÙ Qaf COLS10a+42, // ÄÙ Qaf COLS10a+42, // ÄÄ Qaf COLS10a+42, // ÚÄ Qaf COLS10a+43, // ÚÙ kaf COLS10a+43, // ÄÙ kaf COLS10a+43, // ÄÄ kaf COLS10a+43, // ÚÄ kaf COLS10a+46, // ÚÙ lam COLS10a+46, // ÄÙ lam COLS10a+46, // ÄÄ lam COLS10a+46, // ÚÄ lam COLS10a+47, // ÚÙ miim COLS10a+47, // ÄÙ miim COLS10a+47, // ÄÄ miim COLS10a+47, // ÚÄ miim COLS10a+48, // ÚÙ nuun COLS10a+48, // ÄÙ nuun COLS10a+48, // ÄÄ nuun COLS10a+48, // ÚÄ nuun COLS10a+49, // ÚÙ ha COLS10a+49, // ÄÙ ha COLS10a+49, // ÄÄ ha COLS10a+49, // ÚÄ ha COLS10a+6, // ÚÙ ta marbuuTah COLS10a+6, // ÚÄ ta marbuuTah COLS10a+50, // ÚÙ waw COLS10a+50, // ÚÄ waw COLS10a+53, // ÚÙ ya COLS10a+53, // ÄÙ ya COLS10a+53, // ÄÄ ya COLS10a+53, // ÚÄ ya COLS10a+52, // ÚÙ alif maqSuurah COLS10a+52, // ÄÙ ya maqSuurah? COLS10a+52, // ÄÄ ya maqSuurah? COLS10a+52, // ÚÄ alif maqSuurah COLS10a+0, // ÚÙ hamzah accent - never appears alone // Store the sub-collation as the actual // character value from this point on COLS10a+1, // ÚÙ alif hamzah COLS10a+1, // ÚÄ alif hamzah COLS10a+1, // ÚÙ hamzah-under-alif COLS10a+1, // ÚÄ hamzah-under-alif COLS10a+1, // ÚÙ waw hamzah COLS10a+1, // ÚÄ waw hamzah COLS10a+1, // ÚÙ ya hamzah COLS10a+1, // ÄÙ ya hamzah COLS10a+1, // ÄÄ ya hamzah COLS10a+1, // ÚÄ ya hamzah COLS10a+1, // ÚÙ alif fatHataan COLS10a+1, // ÚÄ alif fatHataan COLS10a+1, // ÚÙ alif maddah COLS10a+1, // ÚÄ alif maddah COLS10a+1, // ÚÙ alif waSlah COLS10a+1, // ÚÄ alif waSlah (final) // LIGATURES // Should NEVER be stored so will not worry // about breaking up into pieces for collation. // NOTE: // Let's store the "Lam" collation value (+42) // below and in the sub-collation store the // actual character. This will sort real close. // The best implementation is to // break up ligatures into its base pieces. COLS10a+46, // ÚÙ lamalif COLS10a+46, // ÚÄ lamalif COLS10a+46, // ÚÙ lamalif hamzah COLS10a+46, // ÚÄ lamalif hamzah COLS10a+46, // ÚÙ hamzah-under-lamalif COLS10a+46, // ÚÄ hamzah-under-lamalif COLS10a+46, // ÚÙ lamalif fatHataan COLS10a+46, // ÚÄ lamalif fatHataan COLS10a+46, // ÚÙ lamalif maddah COLS10a+46, // ÚÄ lamalif maddah COLS10a+46, // ÚÙ lamalif waSlah COLS10a+46, // ÚÄ lamalif waSlah COLS10a+46, // ÚÙ Allah - khaDalAlif COLS0_ARABIC, // ÄÄ taTwiil COLS0_ARABIC // ÄÄ taTwiil }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmAlefSubColTbl[] = { 1, // ÚÙ alif hamzah 1, // ÚÄ alif hamzah 3, // ÚÙ hamzah-under-alif 3, // ÚÄ hamzah-under-alif 2, // ÚÙ waw hamzah 2, // ÚÄ waw hamzah 4, // ÚÙ ya hamzah 4, // ÄÙ ya hamzah 4, // ÄÄ ya hamzah 4, // ÚÄ ya hamzah 5, // ÚÙ alif fatHataan 5, // ÚÄ alif fatHataan 0, // ÚÙ alif maddah 0, // ÚÄ alif maddah 6, // ÚÙ alif waSlah 6 // ÚÄ alif waSlah (final) }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmAr260Tbl[ AR2TBLLEN + 2] = { 41, // Starting offset AR2TBLLEN, // Length of table COLS8+4, // Farsi and Urdu Four COLS8+4, // Urdu Four COLS8+5, // Farsi and Urdu Five COLS8+6, // Farsi Six COLS8+6, // Farsi and Urdu Six COLS8+7, // Urdu Seven COLS8+8, // Urdu Eight COLS10a+3, // Sindhi bb - baa /w 2 dots below (67b) COLS10a+3, COLS10a+3, COLS10a+3, COLS10a+4, // Sindhi bh - baa /w 4 dots below (680) COLS10a+4, COLS10a+4, COLS10a+4, COLS10a+5, // Malay, Kurdish, Pashto, Farsi, Sindhi, and Urdu p COLS10a+5, // =peh - taa /w 3 dots below (67e) COLS10a+5, COLS10a+5, COLS10a+7, // Urdu T - taa /w small tah COLS10a+7, COLS10a+7, COLS10a+7, COLS10a+7, // Pashto T - taa /w ring (forced to combine) COLS10a+7, COLS10a+7, COLS10a+7, COLS10a+9, // Sindhi th - taa /w 4 dots above (67f) COLS10a+9, COLS10a+9, COLS10a+9, COLS10a+10, // Sindhi Tr - taa /w 3 dots above (67d) COLS10a+10, COLS10a+10, COLS10a+10, COLS10a+11, // Sindhi Th - taa /w 2 dots above (67a) COLS10a+11, COLS10a+11, COLS10a+11, COLS10a+13, // Sindhi jj - haa /w 2 middle dots verticle (684) COLS10a+13, COLS10a+13, COLS10a+13, COLS10a+14, // Sindhi ny - haa /w 2 middle dots (683) COLS10a+14, COLS10a+14, COLS10a+14, COLS10a+15, // Malay, Kurdish, Pashto, Farsi, Sindhi, and Urdu ch COLS10a+15, // =tcheh (686) COLS10a+15, COLS10a+15, COLS10a+15, // Sindhi chh - haa /w middle 4 dots (687) COLS10a+15, // forced to combine COLS10a+15, COLS10a+15, COLS10a+18, // Pashto ts - haa /w 3 dots above (685) COLS10a+18, COLS10a+18, COLS10a+18, COLS10a+19, // Pashto dz - hamzah on haa (681) COLS10a+19, COLS10a+19, COLS10a+19, COLS10a+21, // Urdu D - dal /w small tah (688) COLS10a+21, COLS10a+21, // Pashto D - dal /w ring (689) forced to combine COLS10a+21, COLS10a+23, // Sindhi dh - dal /w 2 dots above (68c) COLS10a+23, COLS10a+24, // Sindhi D - dal /w 3 dots above (68e) COLS10a+24, COLS10a+25, // Sindhi Dr - dal /w dot below (68a) COLS10a+25, COLS10a+26, // Sindhi Dh - dal /w 2 dots below (68d) COLS10a+26, COLS10a+28, // Pashto r - ra /w ring (693) COLS10a+28, COLS10a+28, // Urdu R - ra /w small tah (691) forced to combine COLS10a+28, COLS10a+28, // Sindhi r - ra /w 4 dots above (699) forced to combine COLS10a+28, COLS10a+27, // Kurdish rolled r - ra /w 'v' below (695) COLS10a+27, COLS10a+27, COLS10a+27, COLS10a+30, // Kurdish, Pashto, Farsi, Sindhi, and Urdu Z COLS10a+30, // = jeh - ra /w 3 dots above (698) COLS10a+30, // Pashto zz - ra /w dot below & dot above (696) COLS10a+30, // forced to combine COLS10a+30, // Pashto g - not in unicode! - forced to combine COLS10a+30, COLS10a+33, // Pashto x - seen dot below & above (69a) COLS10a+33, COLS10a+33, COLS10a+33, COLS10a+39, // Malay ng - old maly ain /w 3 dots above (6a0) COLS10a+39, // forced to combine COLS10a+39, COLS10a+39, COLS10a+41, // Malay p, Kurdish v - Farsi ? - fa /w 3 dots above COLS10a+41, // = veh - means foreign words (6a4) COLS10a+41, COLS10a+41, COLS10a+41, // Sindhi ph - fa /w 4 dots above (6a6) forced to combine COLS10a+41, COLS10a+41, COLS10a+41, COLS10a+43, // Misc k - open caf (6a9) COLS10a+43, COLS10a+43, COLS10a+43, COLS10a+43, // misc k - no unicode - forced to combine COLS10a+43, COLS10a+43, COLS10a+43, COLS10a+43, // Sindhi k - swash caf (various) (6aa) -forced to combine COLS10a+43, COLS10a+43, COLS10a+43, COLS10a+44, // Persian/Urdu g - gaf (6af) COLS10a+44, COLS10a+44, COLS10a+44, COLS10a+44, // Persian/Urdu g - no unicode COLS10a+44, COLS10a+44, COLS10a+44, COLS10a+44, // malay g - gaf /w ring (6b0) COLS10a+44, COLS10a+44, COLS10a+44, COLS10a+44, // Sindhi ng - gaf /w 2 dots above (6ba) COLS10a+44, // forced to combine ng only COLS10a+44, COLS10a+44, COLS10a+45, // Sindhi gg - gaf /w 2 dots vertical below (6b3) COLS10a+45, COLS10a+45, COLS10a+45, COLS10a+46, // Kurdish velar l - lam /w small v (6b5) COLS10a+46, COLS10a+46, COLS10a+46, COLS10a+46, // Kurdish Lamalif with diacritic - no unicode COLS10a+46, COLS10a+48, // Urdu n - dotless noon (6ba) COLS10a+48, COLS10a+48, COLS10a+48, COLS10a+48, // Pashto N - noon /w ring (6bc) - forced to combine COLS10a+48, COLS10a+48, COLS10a+48, COLS10a+48, // Sindhi N - dotless noon/w small tah (6bb) COLS10a+48, // forced to combine COLS10a+48, COLS10a+48, COLS10a+50, // Kurdish o - waw /w small v (6c6) COLS10a+50, COLS10a+50, // Kurdish o - waw /w bar above (6c5) COLS10a+50, COLS10a+50, // Kurdish o - waw /w 2 dots above (6ca) COLS10a+50, COLS10a+51, // Urdu h - no unicode COLS10a+51, COLS10a+51, COLS10a+51, COLS10a+52, // Kurdish ˆ - ya /w small v (6ce) COLS10a+52, COLS10a+52, COLS10a+52, COLS10a+54, // Urdu y - ya barree (6d2) COLS10a+54, COLS10a+54, // Malay ny - ya /w 3 dots below (6d1) forced to combine COLS10a+54, COLS10a+54, COLS10a+54, COLS10a+51, // Farsi hamzah - hamzah on ha (6c0) forced to combine COLS10a+51 }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE flmAr2BitTbl[] = { 0xF0, // 64..71 0x00, // 72..79 0x00, // 80..87 0x0F, // 88..95 - 92..95 0x00, // 96..103 0x00, // 104..111 0x03, // 112..119 0xFC, // 120..127 0xFF, // 128..135 0xF0, // 136..143 - 136..139 0xFF, // 144..151 - 144..147, 148..159 0xFF, // 152..159 0x0F, // 160..167 - 164..175 0xFF, // 168..175 0x0F, // 176..183 - 180..185 0xFF, // 184..191 - 186..197 0xFF, // 192..199 - 198..203 0xFF, // 200..207 - 204..207 0xF3, // 208..215 - 208..211 , 214..217 0xF0 // 216..219 - 218..219 }; /**************************************************************************** Desc: This table describes and gives addresses for collating character sets. Each line corresponds to a character set. ****************************************************************************/ TBL_B_TO_BP flmCol60Tbl[] = { {CHSASCI, flmAsc60Tbl}, // ascii - " " - "~" {CHSMUL1, flmMn60Tbl}, // multinational {CHSSYM1, flmSym60Tbl}, // symbols {CHSGREK, flmGrk60Tbl}, // Greek {CHSCYR, flmCyrl60Tbl}, // Cyrillic - Russian {0xFF, 0} // table terminator }; /**************************************************************************** Desc: ****************************************************************************/ TBL_B_TO_BP flmHebArabicCol60Tbl[] = { {CHSASCI, flmAsc60Tbl}, // ascii - " " - "~" {CHSMUL1, flmMn60Tbl}, // multinational {CHSSYM1, flmSym60Tbl}, // symbols {CHSGREK, flmGrk60Tbl}, // Greek {CHSHEB, flmHeb60TblA}, // Hebrew {CHSHEB, flmHeb60TblB}, // Hebrew {CHSARB1, flmAr160Tbl}, // Arabic Set 1 {CHSARB2, flmAr260Tbl}, // Arabic Set 2 {0xff, 0} // table terminator }; /**************************************************************************** Desc: The diacritical to collated table translates the first 26 characters of character set #1 into a 5 bit value for "correct" sorting sequence for that diacritical (DCV) - diacritic collated value. The attempt here is to convert the collated character value along with the DCV to form the original character. The diacriticals are in an order to fit the most languages. Czech, Swedish, and Finnish will have to manual reposition the ring above (assign it a value greater then the umlaut) This table is index by the diacritical value. ****************************************************************************/ FLMBYTE flmDia60Tbl[] = { 2, // grave offset = 0 16, // centerd offset = 1 7, // tilde offset = 2 4, // circum offset = 3 12, // crossb offset = 4 10, // slash offset = 5 1, // acute offset = 6 6, // umlaut offset = 7 // In SU, SV and CZ will = 9 17, // macron offset = 8 18, // aposab offset = 9 19, // aposbes offset = 10 20, // aposba offset = 11 21, // aposbc offset = 12 22, // abosbl offset = 13 8, // ring offset = 14 13, // dota offset = 15 23, // dacute offset = 16 11, // cedilla offset = 17 14, // ogonek offset = 18 5, // caron offset = 19 15, // stroke offset = 20 24, // bara offset = 21 3, // breve offset = 22 0, // dbls offset = 23 sorts as 'ss' 25, // dotlesi offset = 24 26 // dotlesj offset = 25 }; /**************************************************************************** Desc: Map special chars in CharSet (x24) to collation values ****************************************************************************/ BYTE_WORD_TBL flmCh24ColTbl[] = { {1, COLLS + 2}, // comma {2, COLLS + 1}, // maru {5, COLS_ASIAN_MARKS + 2}, // chuuten {10, COLS_ASIAN_MARKS}, // dakuten {11, COLS_ASIAN_MARKS + 1}, // handakuten {43, COLS2 + 2}, // angled brackets {44, COLS2 + 3}, {49, COLS2 + 2}, // pointy brackets {50, COLS2 + 3}, {51, COLS2 + 2}, // double pointy brackets {52, COLS2 + 3}, {53, COLS1}, // Japanese quotes {54, COLS1}, {55, COLS1}, // hollow Japanese quotes {56, COLS1}, {57, COLS2 + 2}, // filled rounded brackets {58, COLS2 + 3} }; /**************************************************************************** Desc: ****************************************************************************/ FLMUINT16 colToWPChr[ COLS11 - COLLS] = { 0x20, // colls - 0x2e, // colls+1 - . 0x2c, // colls+2 - , 0x3a, // colls+3 - : 0x3b, // colls+4 - ; 0x21, // colls+5 - ! 0, // colls+6 - NO VALUE 0x3f, // colls+7 - ? 0, // colls+8 - NO VALUE 0x22, // cols1 - " 0x27, // cols1+1 - ' 0x60, // cols1+2 - ` 0, // cols1+3 - NO VALUE 0, // cols1+4 - NO VALUE 0x28, // cols2 - ( 0x29, // cols2+1 - ) 0x5b, // cols2+2 - japanese angle brackets 0x5d, // cols2+3 - japanese angle brackets 0x7b, // cols2+4 - { 0x7d, // cols2+5 - } 0x24, // cols3 - $ 0x413, // cols3+1 - cent 0x40b, // cols3+2 - pound 0x40c, // cols3+3 - yen 0x40d, // cols3+4 - pacetes 0x40e, // cols3+5 - floren 0x2b, // cols4 - + 0x2d, // cols4+1 - - 0x2a, // cols4+2 - * 0x2f, // cols4+3 - / 0x5e, // cols4+4 - ^ 0, // cols4+5 - NO VALUE 0, // cols4+6 - NO VALUE 0, // cols4+7 - NO VALUE 0x3c, // cols5 - < 0, // cols5+1 - NO VALUE 0x3d, // cols5+2 - = 0, // cols5+3 - NO VALUE 0x3e, // cols5+4 - > 0, // cols5+5 - NO VALUE 0, // cols5+6 - NO VALUE 0, // cols5+7 - NO VALUE 0, // cols5+8 - NO VALUE 0, // cols5+9 - NO VALUE 0, // cols5+10 - NO VALUE 0, // cols5+11 - NO VALUE 0, // cols5+12 - NO VALUE 0, // cols5+13 - NO VALUE 0x25, // cols6 - % 0x23, // cols6+1 - # 0x26, // cols6+2 - & 0x40, // cols6+3 - @ 0x5c, // cols6+4 - backslash 0x5f, // cols6+5 - _ 0x7c, // cols6+6 - | 0x7e, // cols6+7 - ~ 0, // cols6+8 - NO VALUE 0, // cols6+9 - NO VALUE 0, // cols6+10 - NO VALUE 0, // cols6+11 - NO VALUE 0, // cols6+12 - NO VALUE 0x800, // cols7 - Uppercase Alpha 0x802, // cols7+1 - Uppercase Beta 0x806, // cols7+2 - Uppercase Gamma 0x808, // cols7+3 - Uppercase Delta 0x80a, // cols7+4 - Uppercase Epsilon 0x80c, // cols7+5 - Uppercase Zeta 0x80e, // cols7+6 - Uppercase Eta 0x810, // cols7+7 - Uppercase Theta 0x812, // cols7+8 - Uppercase Iota 0x814, // cols7+9 - Uppercase Kappa 0x816, // cols7+10 - Uppercase Lambda 0x818, // cols7+11 - Uppercase Mu 0x81a, // cols7+12 - Uppercase Nu 0x81c, // cols7+13 - Uppercase Xi 0x81e, // cols7+14 - Uppercase Omicron 0x820, // cols7+15 - Uppercase Pi 0x822, // cols7+16 - Uppercase Rho 0x824, // cols7+17 - Uppercase Sigma 0x828, // cols7+18 - Uppercase Tau 0x82a, // cols7+19 - Uppercase Upsilon 0x82c, // cols7+20 - Uppercase Phi 0x82e, // cols7+21 - Uppercase Chi 0x830, // cols7+22 - Uppercase Psi 0x832, // cols7+23 - Uppercase Omega 0, // cols7+24 - NO VALUE 0x30, // cols8 - 0 0x31, // cols8+1 - 1 0x32, // cols8+2 - 2 0x33, // cols8+3 - 3 0x34, // cols8+4 - 4 0x35, // cols8+5 - 5 0x36, // cols8+6 - 6 0x37, // cols8+7 - 7 0x38, // cols8+8 - 8 0x39, // cols8+9 - 9 0x41, // cols9 - A 0x124, // cols9+1 - AE digraph 0x42, // cols9+2 - B 0x43, // cols9+3 - C 0xffff, // cols9+4 - CH in spanish 0x162, // cols9+5 - Holder for C caron in Czech 0x44, // cols9+6 - D 0x45, // cols9+7 - E 0x46, // cols9+8 - F 0x47, // cols9+9 - G 0x48, // cols9+10 - H 0xffff, // cols9+11 - CH in czech or dotless i in turkish 0x49, // cols9+12 - I 0x18a, // cols9+13 - IJ Digraph 0x4a, // cols9+14 - J 0x4b, // cols9+15 - K 0x4c, // cols9+16 - L 0xffff, // cols9+17 - LL in spanish 0x4d, // cols9+18 - M 0x4e, // cols9+19 - N 0x138, // cols9+20 - N Tilde 0x4f, // cols9+21 - O 0x1a6, // cols9+22 - OE digraph 0x50, // cols9+23 - P 0x51, // cols9+24 - Q 0x52, // cols9+25 - R 0x1aa, // cols9+26 - Holder for R caron in Czech 0x53, // cols9+27 - S 0x1b0, // cols9+28 - Holder for S caron in Czech 0x54, // cols9+29 - T 0x55, // cols9+30 - U 0x56, // cols9+31 - V 0x57, // cols9+32 - W 0x58, // cols9+33 - X 0x59, // cols9+34 - Y 0x5a, // cols9+35 - Z 0x1ce, // cols9+36 - Holder for Z caron in Czech 0x158, // cols9+37 - Uppercase Thorn 0, // cols9+38 - ??? 0, // cols9+39 - ??? 0x5b, // cols9+40 - [ (note: alphabetic - end of list) 0x5d, // cols9+41 - ] (note: alphabetic - end of list) 0x124, // cols9+42 - AE diagraph - DK 0x124, // cols9+43 - AE diagraph - NO 0x122, // cols9+44 - A ring - SW 0x11E, // cols9+45 - A diaeresis - DK 0x124, // cols9+46 - AE diagraph - IC 0x150, // cols9+47 - O slash - NO 0x11e, // cols9+48 - A diaeresis - SW 0x150, // cols9+49 - O slash - DK 0x13E, // cols9+50 - O Diaeresis - IC 0x122, // cols9+51 - A ring - NO 0x13E, // cols9+52 - O Diaeresis - SW 0x13E, // cols9+53 - O Diaeresis - DK 0x150, // cols9+54 - O slash - IC 0x122, // cols9+55 - A ring - DK 0x124, // cols9+56 - AE diagraph future 0x13E, // cols9+57 - O Diaeresis future 0x150, // cols9+58 - O slash future 0, // cols9+59 - NOT USED future 0xA00, // cols10 - Russian A 0xA02, // cols10+1 - Russian BE 0xA04, // cols10+2 - Russian VE 0xA06, // cols10+3 - Russian GHE 0xA46, // cols10+4 - Ukrainian HARD G 0xA08, // cols10+5 - Russian DE 0xA4a, // cols10+6 - Serbian SOFT DJ 0xA44, // cols10+7 - Macedonian SOFT DJ 0xA0a, // cols10+8 - Russian E 0xA0c, // cols10+9 - Russian YO 0xA4e, // cols10+10 - Ukrainian YE 0xA0e, // cols10+11 - Russian ZHE 0xA10, // cols10+12 - Russian ZE 0xA52, // cols10+13 - Macedonian ZELO 0xA12, // cols10+14 - Russian I 0xA58, // cols10+15 - Ukrainian I 0xA5a, // cols10+16 - Ukrainian I with Two dots 0xA14, // cols10+17 - Russian SHORT I 0xA5e, // cols10+18 - Serbian--Macedonian JE 0xA16, // cols10+19 - Russian KA 0xA18, // cols10+20 - Russian EL 0xA68, // cols10+21 - Serbian--Macedonian SOFT L 0xA1a, // cols10+22 - Russian EM 0xA1c, // cols10+23 - Russian EN 0xA6c, // cols10+24 - Serbian--Macedonian SOFT N 0xA1e, // cols10+25 - Russian O 0xA20, // cols10+26 - Russian PE 0xA22, // cols10+27 - Russian ER 0xA24, // cols10+28 - Russian ES 0xA26, // cols10+29 - Russian TE 0xA72, // cols10+30 - Serbian SOFT T 0xA60, // cols10+31 - Macedonian SOFT K 0xA28, // cols10+32 - Russian U 0xA74, // cols10+33 - Byelorussian SHORT U 0xA2a, // cols10+34 - Russian EF 0xA2c, // cols10+35 - Russian HA 0xA2e, // cols10+36 - Russian TSE 0xA30, // cols10+37 - Russian CHE 0xA86, // cols10+38 - Serbian HARD DJ 0xA32, // cols10+39 - Russian SHA 0xA34, // cols10+40 - Russian SHCHA 0xA36, // cols10+41 - Russian ER 0xA38, // cols10+42 - Russian ERY 0xA3a, // cols10+43 - Russian SOFT SIGN 0xA8e, // cols10+44 - Old Russian YAT 0xA3c, // cols10+45 - Russian uppercase REVERSE E 0xA3e, // cols10+46 - Russian YU 0xA40, // cols10+47 - Russian YA 0xA3a, // cols10+48 - Russian SOFT SIGN - UKRAIN ONLY 0 // cols10+49 - future }; /**************************************************************************** Desc: ****************************************************************************/ FLMUINT16 HebArabColToWPChr[ ] = { 0x0D00 +164, // hamzah 0x0D00 + 58, // [13,177] alef maddah 0x0D00 + 60, // baa 0x0E00 + 48, // Sindhi bb 0x0E00 + 52, // Sindhi bh 0x0E00 + 56, // Misc p = peh 0x0D00 +152, // taa marbuuTah 0x0E00 + 60, // Urdu T 0x0D00 + 68, // thaa 0x0E00 + 68, // Sindhi th 0x0E00 + 72, // Sindhi tr 0x0E00 + 76, // Sindhi Th 0x0D00 + 72, // jiim - jeem 0x0E00 + 80, // Sindhi jj 0x0E00 + 84, // Sindhi ny 0x0E00 + 88, // Misc ch 0x0D00 + 76, // Haa 0x0D00 + 80, // khaa 0x0E00 + 96, // Pashto ts 0x0E00 +100, // Pashto dz 0x0D00 + 84, // dal 0x0E00 +104, // Urdu D 0x0D00 + 86, // thal 0x0E00 +108, // Sindhi dh 0x0E00 +110, // Sindhi D 0x0E00 +112, // Sindhi Dr 0x0E00 +114, // Sindhi Dh 0x0D00 + 88, // ra 0x0E00 +116, // Pashto r 0x0D00 + 90, // zain 0x0E00 +126, // Mizc Z 0x0D00 + 92, // seen 0x0D00 + 96, // sheen 0x0E00 +132, // Pashto x 0x0D00 +100, // Sad 0x0D00 +104, // Dad 0x0D00 +108, // Tah 0x0D00 +112, // Za (dhah) 0x0D00 +116, // 'ain 0x0D00 +120, // ghain 0x0D00 +124, // fa 0x0E00 +140, // Malay p, kurdish v = veh 0x0D00 +128, // Qaf 0x0D00 +132, // kaf (caf) 0x0E00 +160, // Persian/Urdu gaf 0x0E00 +176, // Singhi gg 0x0D00 +136, // lam - all ligature variants 0x0D00 +140, // meem 0x0D00 +144, // noon 0x0D00 +148, // ha - arabic language only 0x0D00 +154, // waw 0x0D00 +148, // ha - non-arabic language 0x0D00 +160, // alef maqsurah 0x0D00 +156, // ya 0x0E00 +212 // Urdu ya barree }; /**************************************************************************** Desc: ****************************************************************************/ FLMUINT16 ArabSubColToWPChr[] = { 0x0D00 +177, // Alef maddah 0x0D00 +165, // Alef Hamzah 0x0D00 +169, // Waw hamzah 0x0D00 +167, // Hamzah under alef 0x0D00 +171, // ya hamzah 0x0D00 +175, // alef fathattan 0x0D00 +179, // alef waslah 0x0D00 + 58, // alef 0x0D00 + 64 // taa - after taa marbuuTah }; /**************************************************************************** Desc: Turns a collated diacritic value into the original diacritic value ****************************************************************************/ FLMBYTE ml1_COLtoD[ 27] = { 23, // dbls sort value = 0 sorts as 'ss' 6, // acute sort value = 1 0, // grave sort value = 2 22, // breve sort value = 3 3, // circum sort value = 4 19, // caron sort value = 5 7, // umlaut sort value = 6 2, // tilde sort value = 7 14, // ring sort value = 8 7, // umlaut in SU, SV and CZ after ring = 9 5, // slash sort value = 10 17, // cedilla sort value = 11 4, // crossb sort value = 12 15, // dota sort value = 13 18, // ogonek sort value = 14 20, // stroke sort value = 15 1, // centerd sort value = 16 8, // macron sort value = 17 9, // aposab sort value = 18 10, // aposbes sort value = 19 11, // aposba sort value = 20 12, // aposbc sort value = 21 13, // abosbl sort value = 22 16, // dacute sort value = 23 21, // bara sort value = 24 24, // dotlesi sort value = 25 25 // dotlesj sort value = 26 }; /**************************************************************************** Desc: ****************************************************************************/ FLMUINT16 flmIndexi[] = { 0, 11, 14, 15, 17, 18, 19, 21, 22, 23, 24, 25, 26, 35, 59 }; /**************************************************************************** Desc: ****************************************************************************/ FLMUINT16 flmIndexj[] = { CA_LANG, // Catalan CF_LANG, // Canadian French CZ_LANG, // Czech SL_LANG, // Slovak DE_LANG, // German SD_LANG, // Swiss German ES_LANG, // Spanish (Spain) FR_LANG, // French _NL_LANG, // Netherlands 0xFFFF, // DK_LANG, Danish 0xFFFF, // NO_LANG, Norwegian 0x0063, // c 0x006c, // l 0x0197, // l with center dot 0x0063, // c 0x0125, // ae digraph 0x01a7, // oe digraph 0x0068, // h 0x0068, // h 0x006c, // l 0x0101, // center dot alone 0x006c, // l 0x0117, // á (for German) 0x018b, // ij digraph 0x0000, // was 'a' - will no longer map 'aa' to a-ring 0x0000, // was 'a' CZ_LANG, DK_LANG, NO_LANG, SL_LANG, TK_LANG, SU_LANG, IS_LANG, SV_LANG, YK_LANG, 0x011e, // A diaeresis 0x011f, // a diaeresis 0x0122, // A ring 0x0123, // a ring 0x0124, // AE diagraph 0x0125, // ae diagraph 0x013e, // O diaeresis 0x013f, // o diaeresis 0x0146, // U diaeresis 0x0147, // u diaeresis 0x0150, // O slash 0x0151, // o slash 0x0A3a, // CYRILLIC SOFT SIGN 0x0A3b, // CYRILLIC soft sign 0x01ee, // dotless i - turkish 0x01ef, // dotless I - turkish 0x0162, // C Hacek/caron 0x0163, // c Hacek/caron 0x01aa, // R Hacek/caron 0x01ab, // r Hacek/caron 0x01b0, // S Hacek/caron 0x01b1, // s Hacek/caron 0x01ce, // Z Hacek/caron 0x01cf, // z Hacek/caron }; /**************************************************************************** Desc: Kana subcollation values BIT 0: set if large char BIT 1: set if voiced BIT 2: set if half voiced Notes: To save space should be nibbles IMPORTANT: The '1' entries that do not have a matching '0' entry have been changed to zero to save space in the subcollation area. ****************************************************************************/ FLMBYTE flmKanaSubColTbl[] = { 0,1,0,1,0,1,0,1,0,1, // a A i I u U e E o O 1,3,0,3,0,3,1,3,0,3, // KA GA KI GI KU GU KE GE KO GO 0,3,0,3,0,3,0,3,0,3, // SA ZA SHI JI SU ZU SE ZE SO ZO 0,3,0,3,0,1,3,0,3,0,3, // TA DA CHI JI tsu TSU ZU TE DE TO DO 0,0,0,0,0, // NA NI NU NE NO 0,3,5,0,3,5,0,3,5, // HA BA PA HI BI PI FU BU PU 0,3,5,0,3,5, // HE BE PE HO BO PO 0,0,0,0,0, // MA MI MU ME MO 0,1,0,1,0,1, // ya YA yu YU yo YO 0,0,0,0,0, // RA RI RU RE RO 0,1,0,0,0, // wa WA WI WE WO 0,3,0,0 // N VU ka ke }; /**************************************************************************** Desc: Map KataKana (CharSet x26) to collation values. Kana collating values are two byte values where the high byte is 0x01. ****************************************************************************/ FLMBYTE KanaColTbl[] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, // a A i I u U e E o O 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, // KA GA KI GI KU GU KE GE KO GO 10,10,11,11,12,12,13,13,14,14, // SA ZA SHI JI SU ZU SE ZE SO ZO 15,15,16,16,17,17,17,18,18,19,19,// TA DA CHI JI tsu TSU ZU TE DE TO DO 20,21,22,23,24, // NA NI NU NE NO 25,25,25,26,26,26,27,27,27, // HA BA PA HI BI PI FU BU PU 28,28,28,29,29,29, // HE BE PE HO BO PO 30,31,32,33,34, // MA MI MU ME MO 35,35,36,36,37,37, // ya YA yu YU yo YO 38,39,40,41,42, // RA RI RU RE RO 43,43,44,45,46, // wa WA WI WE WO 47, 2, 5, 8 // N VU ka ke }; /**************************************************************************** Desc: Map KataKana collated value to vowel value for use for the previous char. ****************************************************************************/ FLMBYTE KanaColToVowel[] = { 0,1,2,3,4, // a i u e o 0,1,2,3,4, // ka ki ku ke ko 0,1,2,3,4, // sa shi su se so 0,1,2,3,4, // ta chi tsu te to 0,1,2,3,4, // na ni nu ne no 0,1,2,3,4, // ha hi hu he ho 0,1,2,3,4, // ma mi mu me mo 0,2,4, // ya yu yo 0,1,2,3,4, // ra ri ru re ro 0,1,3,4, // wa wi we wo }; /**************************************************************************** Desc: ****************************************************************************/ static FLMBYTE ColToKanaTbl[ 48] = { 0, // a=0, A=1 2, // i=2, I=3 4, // u=4, U=5, VU=83 6, // e=6, E=7 8, // o=8, O=9 84, // KA=10, GA=11, ka=84 12, // KI=12, GI=13 14, // KU=14, GU=15 85, // KE=16, GE=17, ke=85 18, // KO=18, GO=19 20, // SA=20, ZA=21 22, // SHI=22, JI=23 24, // SU=24, ZU=25 26, // SE=26, ZE=27 28, // SO=28, ZO=29 30, // TA=30, DA=31 32, // CHI=32, JI=33 34, // tsu=34, TSU=35, ZU=36 37, // TE=37, DE=38 39, // TO=39, DO=40 41, // NA 42, // NI 43, // NU 44, // NE 45, // NO 46, // HA, BA, PA 49, // HI, BI, PI 52, // FU, BU, PU 55, // HE, BE, PE 58, // HO, BO, PO 61, // MA 62, // MI 63, // MU 64, // ME 65, // MO 66, // ya, YA 68, // yu, YU 70, // yo, YO 72, // RA 73, // RI 74, // RU 75, // RE 76, // RO 77, // wa, WA 79, // WI 80, // WE 81, // WO 82 // N }; /**************************************************************************** Desc: Maps from charset 11 to CS24 (punctuation) (starting from 11,0) ****************************************************************************/ FLMBYTE From0AToZen[] = { 0, 9, 40, 0x53, // sp ! " # 0x4f, 0x52, 0x54, 38, // $ % & ' 0x29, 0x2a, 0x55, 0x3b, // ( ) * + 3, 0x1d, 4, 0x1e // , - . / }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE From0BToZen[] = { 6, 7, 0x42, 0x40, // : ; < = 0x43, 8, 0x56 // > ? @ }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE From0CToZen[] = { 0x2d, 0x1f, 0x2e, // [ \ ] 0x0f, 0x11, 0x0d // ^ _ ` }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE From0DToZen[] = { 0x2f, 0x22, 0x30, 0x20 // { | } ~ }; /**************************************************************************** Desc: ****************************************************************************/ FLMBYTE From8ToZen[] = { 0x5e, 0x7e, 0x5f, 0x7f, 0x5f, 0xFF, 0x60, 0x80, 0x61, 0x81, 0x62, 0x82, 0x63, 0x83, 0x64, 0x84, 0x65, 0x85, 0x66, 0x86, 0x67, 0x87, 0x68, 0x88, 0x69, 0x89, 0x6a, 0x8a, 0x6b, 0x8b, 0x6c, 0x8c, 0x6d, 0x8d, 0x6e, 0x8e, 0x6f, 0x8f, 0x6f, 0xFF, 0x70, 0x90, 0x71, 0x91, 0x72, 0x92, 0x73, 0x93, 0x74, 0x94, 0x75, 0x95 }; /**************************************************************************** Desc: ****************************************************************************/ static FLMBYTE From11AToZen[] = { 2, // japanese period 0x35, // left bracket 0x36, // right bracket 0x01, // comma 0x05 // chuuten }; /**************************************************************************** Desc: ****************************************************************************/ static FLMBYTE From11BToZen[] = { 0x51, // wo 0,2,4,6,8,0x42,0x44,0x46,0x22, // small a i u e o ya yu yo tsu 0xFF, 1, 3, 5, 7, 9, // dash (x241b) a i u e o 0x0a, 0x0c, 0x0e, 0x10, 0x12, // ka ki ku ke ko 0x14, 0x16, 0x18, 0x1a, 0x1c, // sa shi su se so 0x1e, 0x20, 0x23, 0x25, 0x27, // ta chi tsu te to 0x29, 0x2a, 0x2b, 0x2c, 0x2d, // na ni nu ne no 0x2e, 0x31, 0x34, 0x37, 0x3a, // ha hi fu he ho 0x3d, 0x3e, 0x3f, 0x40, 0x41, // ma mi mu me mo 0x43, 0x45, 0x47, // ya yu yo 0x48, 0x49, 0x4a, 0x4b, 0x4c, // ra ri ru re ro 0x4e, 0x52 // WA N }; /**************************************************************************** Desc: Convert Zenkaku (double wide) to Hankaku (single wide) Character set 0x24 maps to single wide chars in other char sets. This enables collation values to be found on some symbols. This is also used to convert symbols from hankaku to Zen24. ****************************************************************************/ BYTE_WORD_TBL Zen24ToHankaku[] = { { 0 ,0x0020 }, // space { 1 ,0x0b03 }, // japanese comma { 2 ,0x0b00 }, // circle period { 3 , 44 }, // comma { 4 , 46 }, // period { 5 ,0x0b04 }, // center dot { 6 , 58 }, // colon { 7 , 59 }, // semicolon { 8 , 63 }, // question mark { 9 , 33 }, // exclamation mark { 10 ,0x0b3d }, // dakuten { 11 ,0x0b3e }, // handakuten { 12 ,0x0106 }, // accent mark { 13 , 96 }, // accent mark { 14 ,0x0107 }, // umlat { 15 , 94 }, // caret { 16 ,0x0108 }, // macron { 17 , 95 }, // underscore { 27 ,0x0b0f }, // extend vowel { 28 ,0x0422 }, // mdash { 29 , 45 }, // hyphen { 30 , 47 }, // slash { 31 ,0x0607 }, // backslash { 32 , 126 }, // tilde { 33 ,0x0611 }, // doubleline { 34 ,0x0609 }, // line { 37 ,0x041d }, // left apostrophe { 38 ,0x041c }, // right apostrophe { 39 ,0x0420 }, // left quote { 40 ,0x041f }, // right quote { 41 , 40 }, // left paren { 42 , 41 }, // right paren { 45 , 91 }, // left bracket { 46 , 93 }, // right bracket { 47 , 123 }, // left curly bracket { 48 , 125 }, // right curly bracket { 53 ,0x0b01 }, // left j quote { 54 ,0x0b02 }, // right j quote { 59 , 43 }, // plus { 60 ,0x0600 }, // minus { 61 ,0x0601 }, // plus/minus { 62 ,0x0627 }, // times { 63 ,0x0608 }, // divide { 64 , 61 }, // equal { 65 ,0x0663 }, // unequal { 66 , 60 }, // less { 67 , 62 }, // greater { 68 ,0x0602 }, // less/equal { 69 ,0x0603 }, // greater/equal { 70 ,0x0613 }, // infinity { 71 ,0x0666 }, // traingle dots { 72 ,0x0504 }, // man { 73 ,0x0505 }, // woman { 75 ,0x062d }, // prime { 76 ,0x062e }, // double prime { 78 ,0x040c }, // yen { 79 , 36 }, // dollar { 80 ,0x0413 }, // cent { 81 ,0x040b }, // pound { 82 , 37 }, // percent { 83 , 35 }, // # { 84 , 38 }, // & { 85 , 42 }, // * { 86 , 64 }, // @ { 87 ,0x0406 }, // squiggle { 89 ,0x06b8 }, // filled star { 90 ,0x0425 }, // hollow circle { 91 ,0x042c }, // filled circle { 93 ,0x065f }, // hollow diamond { 94 ,0x0660 }, // filled diamond { 95 ,0x0426 }, // hollow box { 96 ,0x042e }, // filled box { 97 ,0x0688 }, // hollow triangle { 99 ,0x0689 }, // hollow upside down triangle { 103,0x0615 }, // right arrow { 104,0x0616 }, // left arrow { 105,0x0617 }, // up arrow { 106,0x0622 }, // down arrow { 119,0x060f }, { 121,0x0645 }, { 122,0x0646 }, { 123,0x0643 }, { 124,0x0644 }, { 125,0x0642 }, // union { 126,0x0610 }, // intersection { 135,0x0655 }, { 136,0x0656 }, { 138,0x0638 }, // right arrow { 139,0x063c }, // left/right arrow { 140,0x067a }, { 141,0x0679 }, { 153,0x064f }, // angle { 154,0x0659 }, { 155,0x065a }, { 156,0x062c }, { 157,0x062b }, { 158,0x060e }, { 159,0x06b0 }, { 160,0x064d }, { 161,0x064e }, { 162,0x050e }, // square root { 164,0x0604 }, { 175,0x0623 }, // angstrom { 176,0x044b }, // percent { 177,0x051b }, // sharp { 178,0x051c }, // flat { 179,0x0509 }, // musical note { 180,0x0427 }, // dagger { 181,0x0428 }, // double dagger { 182,0x0405 }, // paragraph { 187,0x068f } // big hollow circle }; /**************************************************************************** Desc: Maps CS26 to CharSet 11 ****************************************************************************/ FLMBYTE MapCS26ToCharSet11[ 86 ] = { 0x06, // 0 a 0x10, // 1 A 0x07, // 2 i 0x11, // 3 I 0x08, // 4 u 0x12, // 5 U 0x09, // 6 e 0x13, // 7 E 0x0a, // 8 o 0x14, // 9 O 0x15, // 0x0a KA 0x95, // GA - 21 followed by 0x3D dakuten 0x16, // 0x0c KI 0x96, // GI 0x17, // 0x0e KU 0x97, // GU 0x18, // 0x10 KE 0x98, // GE 0x19, // 0x12 KO 0x99, // GO 0x1a, // 0x14 SA 0x9a, // ZA 0x1b, // 0x16 SHI 0x9b, // JI 0x1c, // 0x18 SU 0x9c, // ZU 0x1d, // 0x1a SE 0x9d, // ZE 0x1e, // 0x1c SO 0x9e, // ZO 0x1f, // 0x1e TA 0x9f, // DA 0x20, // 0x20 CHI 0xa0, // JI 0x0e, // 0x22 small tsu 0x21, // 0x23 TSU 0xa1, // ZU 0x22, // 0x25 TE 0xa2, // DE 0x23, // 0x27 TO 0xa3, // DO 0x24, // 0x29 NA 0x25, // 0x2a NI 0x26, // 0x2b NU 0x27, // 0x2c NE 0x28, // 0x2d NO 0x29, // 0x2e HA 0xa9, // 0x2f BA 0xe9, // 0x30 PA 0x2a, // 0x31 HI 0xaa, // 0x32 BI 0xea, // 0x33 PI 0x2b, // 0x34 FU 0xab, // 0x35 BU 0xeb, // 0x36 PU 0x2c, // 0x37 HE 0xac, // 0x38 BE 0xec, // 0x39 PE 0x2d, // 0x3a HO 0xad, // 0x3b BO 0xed, // 0x3c PO 0x2e, // 0x3d MA 0x2f, // 0x3e MI 0x30, // 0x3f MU 0x31, // 0x40 ME 0x32, // 0x41 MO 0x0b, // 0x42 small ya 0x33, // 0x43 YA 0x0c, // 0x44 small yu 0x34, // 0x45 YU 0x0d, // 0x46 small yo 0x35, // 0x47 YO 0x36, // 0x48 RA 0x37, // 0x49 RI 0x38, // 0x4a RU 0x39, // 0x4b RE 0x3a, // 0x4c RO 0xff, // 0x4d small wa 0x3b, // 0x4e WA 0xff, // 0x4f WI 0xff, // 0x50 WE 0x05, // 0x51 WO 0x3c, // 0x52 N 0xff, // 0x53 VU 0xff, // 0x54 ka 0xff // 0x55 ke }; /**************************************************************************** Desc: ****************************************************************************/ FINLINE FLMBOOL charIsUpper( FLMUINT16 ui16Char) { return( (FLMBOOL)((ui16Char < 0x7F) ? (FLMBOOL)((ui16Char >= ASCII_LOWER_A && ui16Char <= ASCII_LOWER_Z) ? (FLMBOOL)FALSE : (FLMBOOL)TRUE) : flmIsUpper( ui16Char))); } /**************************************************************************** Desc: getNextCharState can be thought of as a 2 dimentional array with i and j as the row and column indicators respectively. If a value exists at the intersection of i and j, it is returned. Sparse array techniques are used to minimize memory usage. ****************************************************************************/ FINLINE FLMUINT16 getNextCharState( FLMUINT i, FLMUINT j) { FLMUINT k; FLMUINT x; for( k = flmIndexi[ x = (i > START_COL) ? (START_ALL) : i ]; k <= (FLMUINT) (flmIndexi[ x + 1] - 1); k++ ) { if( j == flmIndexj[ k]) { return( flmValuea[ (i > START_COL) ? (k + (FIXUP_AREA_SIZE * (i - START_ALL))) : k]); } } return(0); } /************************************************************************** Desc: Find the collating value of a WP character Ret: Collating value (COLS0 is high value - undefined WP char) ***************************************************************************/ FLMUINT16 flmGetCollation( FLMUINT16 ui16WpChar, FLMUINT uiLanguage) { FLMUINT16 ui16State; FLMBYTE ucCharVal; FLMBYTE ucCharSet; FLMBOOL bHebrewArabicFlag = FALSE; TBL_B_TO_BP * pColTbl = flmCol60Tbl; // State ONLY for non-US if (uiLanguage != US_LANG) { if (uiLanguage == AR_LANG || // Arabic uiLanguage == FA_LANG || // Farsi - persian uiLanguage == HE_LANG || // Hebrew uiLanguage == UR_LANG) // Urdu { pColTbl = flmHebArabicCol60Tbl; bHebrewArabicFlag = TRUE; } else { // Check if uiLanguage candidate for alternate double collating ui16State = getNextCharState( START_COL, uiLanguage); if (0 != (ui16State = getNextCharState( (ui16State ? ui16State // look at special case languages : START_ALL), // look at US and European (FLMUINT) ui16WpChar))) { return( ui16State); } } } ucCharVal = (FLMBYTE)ui16WpChar; ucCharSet = (FLMBYTE)(ui16WpChar >> 8); // This is an optimized version of f_b_bp_citrp() inline for performance do { if (pColTbl->key == ucCharSet) { FLMBYTE * pucColVals; pucColVals = pColTbl->charPtr; // Above lower range of table? if (ucCharVal >= *pucColVals) { // Make value zero based to index ucCharVal -= *pucColVals++; // Below maximum number of table entries? if (ucCharVal < *pucColVals++) { // Return collated value. return( pucColVals[ ucCharVal]); } } } // Go to next table entry pColTbl++; } while (pColTbl->key != 0xFF); if (bHebrewArabicFlag) { if (ucCharSet == CHSHEB || ucCharSet == CHSARB1 || ucCharSet == CHSARB2) { // Same as COLS0_HEBREW return( COLS0_ARABIC); } } // Defaults for characters that don't have a collation value. return( COLS0); } /**************************************************************************** Desc: Add an field into the CDL (Compound Data List) for this ISK. ****************************************************************************/ RCODE KYCmpKeyAdd2Lst( FDB * pDb, IXD * pIxd, // Index definition IFD * pIfd, // Index field definition void * pvField, // Field whose value is part of the key void * pRootContext) // Points to root context of field path { RCODE rc = FERR_OK; CDL * pCdl; KREF_CNTRL * pKrefCntrl; CDL ** ppCdlTbl; FLMUINT uiCdlEntry; FLMUINT uiIxEntry; pKrefCntrl = &pDb->KrefCntrl; ppCdlTbl = pKrefCntrl->ppCdlTbl; flmAssert( ppCdlTbl != NULL); // Figure out which CDL and index entry to use uiIxEntry = (FLMUINT) (pIxd - pDb->pDict->pIxdTbl); uiCdlEntry = (FLMUINT) (pIfd - pDb->pDict->pIfdTbl); if( (pCdl = (CDL *)GedPoolAlloc( &pDb->TempPool, sizeof( CDL))) == NULL) { rc = RC_SET( FERR_MEM); goto Exit; } flmAssert( pKrefCntrl->pIxHasCmpKeys != NULL); pKrefCntrl->pIxHasCmpKeys [uiIxEntry] = TRUE; pCdl->pField = pvField; pCdl->pRootContext = pRootContext; // Insert at first of CDL list pCdl->pNext = ppCdlTbl [uiCdlEntry]; ppCdlTbl [uiCdlEntry] = pCdl; pKrefCntrl->bHaveCompoundKey = TRUE; Exit: return( rc); } /**************************************************************************** Desc: Called when an entire record has been processed by the key building functions. Builds and add all compound keys to the table. ****************************************************************************/ RCODE KYBuildCmpKeys( FDB * pDb, FLMUINT uiAction, FLMUINT uiContainerNum, FLMUINT uiDrn, FLMBOOL * pbHadUniqueKeys, FlmRecord * pRecord) { RCODE rc = FERR_OK; KREF_CNTRL * pKrefCntrl = &pDb->KrefCntrl; CDL ** ppCdlTbl = pKrefCntrl->ppCdlTbl; FLMBYTE * pKeyBuf = pKrefCntrl->pKrefKeyBuf; FLMBYTE * pIxHasCmpKeys = pKrefCntrl->pIxHasCmpKeys; IXD * pIxd; IFD * pIfd; IFD * pFirstIfd; FLMUINT uiFirstCdlEntry; FLMUINT uiCdlEntry; FLMBOOL bBuildCmpKeys; FLMUINT uiIxEntry; FLMUINT uiTotalIndexes; FLMUINT uiIfdCnt; FLMUINT uiKeyLen; FLMBYTE LowUpBuf [MAX_LOWUP_BUF]; FLD_CONTEXT fldContext; FDICT * pDict = pDb->pDict; LowUpBuf[0] = '\0'; if( pKrefCntrl->bHaveCompoundKey == FALSE) { goto Exit; } flmAssert( pKeyBuf != NULL && pIxHasCmpKeys != NULL); pKrefCntrl->bHaveCompoundKey = FALSE; // Loop through all of the indexes looking for a CDL entry. // VISIT: We need to find the indexes faster than looping! uiTotalIndexes = pDict->uiIxdCnt; for (uiIxEntry = 0; uiIxEntry < uiTotalIndexes; uiIxEntry++) { // See if the index has compound keys to build. if( !pIxHasCmpKeys [uiIxEntry]) { continue; } pIxd = pDict->pIxdTbl + uiIxEntry; pIxHasCmpKeys [uiIxEntry] = FALSE; bBuildCmpKeys = TRUE; // Make sure that all required fields are present. pFirstIfd = pIfd = pIxd->pFirstIfd; uiCdlEntry = uiFirstCdlEntry = (FLMUINT) (pFirstIfd - pDict->pIfdTbl); for (uiIfdCnt = 0; uiIfdCnt < pIxd->uiNumFlds; pIfd++, uiCdlEntry++, uiIfdCnt++) { FLMUINT uiCompoundPos; FLMBOOL bHitFound; // Loop on each compound field piece looking for REQUIRED field // without any data - then we don't have to build a key. bHitFound = (pIfd->uiFlags & IFD_REQUIRED_PIECE) ? FALSE : TRUE; uiCompoundPos = pIfd->uiCompoundPos; for(;;) { if( !bHitFound) { if( ppCdlTbl [uiCdlEntry]) { bHitFound = TRUE; } } if( (pIfd->uiFlags & IFD_LAST) || ((pIfd+1)->uiCompoundPos != uiCompoundPos)) { break; } pIfd++; uiCdlEntry++; uiIfdCnt++; } if( !bHitFound) { bBuildCmpKeys = FALSE; break; } } // Build the individual compound keys. if( bBuildCmpKeys) { uiKeyLen = 0; f_memset( &fldContext, 0, sizeof(FLD_CONTEXT)); if( RC_BAD(rc = KYCmpKeyElmBld( pDb, pIxd, uiContainerNum, pFirstIfd, uiAction, uiDrn, pbHadUniqueKeys, uiFirstCdlEntry, 0, pKeyBuf, uiKeyLen, LowUpBuf, 0, pRecord, &fldContext))) { goto Exit; } } // Reset the CDL pointers to NULL f_memset( (void *) (&ppCdlTbl [ uiFirstCdlEntry ]), 0, sizeof(CDL *) * pIxd->uiNumFlds); } Exit: return( rc); } /**************************************************************************** Desc: Build all compound keys for a record. ****************************************************************************/ RCODE KYCmpKeyElmBld( FDB * pDb, IXD * pIxd, // Index definition. FLMUINT uiContainerNum, IFD * pIfd, // Index field definition. FLMUINT uiAction, FLMUINT uiDrn, FLMBOOL * pbHadUniqueKeys, FLMUINT uiCdlEntry, // CDL entry for the IFD. FLMUINT uiCompoundPos, // Compound Piece number - zero based FLMBYTE * pKeyBuf, // Key buffer to build the key in FLMUINT uiKeyLen, // Total length left in the key buffer FLMBYTE * pLowUpBuf, // For POST compound keys place bits here. FLMUINT uiLuLen, // Length used in pLowUpBuf. FlmRecord * pRecord, // Record being indexed. FLD_CONTEXT * pFldContext) // State to verify all fields are siblings. { RCODE rc = FERR_OK; CDL ** pCdlTbl = pDb->KrefCntrl.ppCdlTbl; CDL * pCdl = pCdlTbl [uiCdlEntry]; FLMBYTE * pTmpBuf = NULL; void * pvMark = NULL; IFD * pNextIfdPiece; void * pvField; void * pSaveParentAnchor; FLMUINT uiNextCdlEntry; FLMBOOL bBuiltKeyPiece; FLMUINT uiElmLen; FLMUINT uiPostFlag; FLMUINT uiPostLen; FLMUINT uiTempLuLen; FLMUINT uiPieceLuLen; FLMUINT uiNextPiecePos; FLMUINT uiLanguage; FLMUINT uiMaxKeySize = (pIxd->uiContainerNum) ? MAX_KEY_SIZ : MAX_KEY_SIZ - getIxContainerPartLen( pIxd); FLMBOOL bFldIsEncrypted = FALSE; if ((uiLanguage = pIxd->uiLanguage) == 0xFFFF) { uiLanguage = pDb->pFile->FileHdr.uiDefaultLanguage; } // Test for compound key being tons of levels flmAssert( uiCompoundPos < MAX_COMPOUND_PIECES); // Set if this piece is part of post uiPostFlag = IFD_IS_POST_TEXT( pIfd); // Add the DELIMITER, except on the first key element if( uiCompoundPos != 0) { IFD * pPrevIfd = pIfd - 1; if( (uiLanguage >= FIRST_DBCS_LANG) && (uiLanguage <= LAST_DBCS_LANG) && (IFD_GET_FIELD_TYPE( pPrevIfd) == FLM_TEXT_TYPE) && (!(pPrevIfd->uiFlags & IFD_CONTEXT))) { pKeyBuf [uiKeyLen++] = 0; } pKeyBuf [uiKeyLen++] = COMPOUND_MARKER; } // Determine the next IFD compound piece. for( pNextIfdPiece = (IFD *)NULL, uiNextCdlEntry = uiCdlEntry + 1, uiNextPiecePos = 0; ((pIfd+uiNextPiecePos)->uiFlags & IFD_LAST) == 0; ) { if( (pIfd+uiNextPiecePos)->uiCompoundPos != (pIfd+uiNextPiecePos+1)->uiCompoundPos) { pNextIfdPiece = pIfd + uiNextPiecePos + 1; uiNextCdlEntry = uiCdlEntry + uiNextPiecePos + 1; break; } if( !pCdl) { pIfd++; pCdl = pCdlTbl [ ++uiCdlEntry]; uiNextCdlEntry = uiCdlEntry + 1; } else { uiNextPiecePos++; } } pSaveParentAnchor = pFldContext->pParentAnchor; bBuiltKeyPiece = FALSE; // Loop on each CDL, but do at least once while( pCdl || !bBuiltKeyPiece) { // Restore context values for each iteration pFldContext->pParentAnchor = pSaveParentAnchor; // If there is a field to process, verify that its path is // relative to the previous non-null compound pieces if( pCdl) { pvField = pCdl->pField; // Validate the current and previous root contexts if( KYValidatePathRelation( pRecord, pCdl->pRootContext, pvField, pFldContext, uiCompoundPos) == FERR_FAILURE) { // This field didn't pass the test, get the next field. goto Next_CDL_Field; } } else { pvField = NULL; } bBuiltKeyPiece = TRUE; uiPostLen = uiElmLen = 0; uiTempLuLen = uiLuLen; if( pCdl && (pIfd->uiFlags & (IFD_EACHWORD | IFD_SUBSTRING)) && (pRecord->getDataType( pvField) == FLM_TEXT_TYPE) && pRecord->getDataLength( pvField) && ((!pRecord->isEncryptedField( pvField) || (pRecord->isEncryptedField( pvField) && pDb->pFile->bInLimitedMode)))) { const FLMBYTE * pText = pRecord->getDataPtr( pvField); FLMUINT uiTextLen = pRecord->getDataLength( pvField); FLMUINT uiWordLen; FLMBOOL bReturn; FLMBOOL bFirstSubstring = (pIfd->uiFlags & IFD_SUBSTRING) ? TRUE : FALSE; if( !pTmpBuf) { pvMark = GedPoolMark( &pDb->TempPool); if( (pTmpBuf = (FLMBYTE *)GedPoolAlloc( &pDb->TempPool, (FLMUINT)MAX_KEY_SIZ + 8)) == NULL) { rc = RC_SET( FERR_MEM); goto Cleanup1; } } // Loop on each WORD in the value for(;;) { bReturn = (pIfd->uiFlags & IFD_EACHWORD) ? (FLMBOOL) KYEachWordParse( &pText, &uiTextLen, pIfd->uiLimit, pTmpBuf, &uiWordLen) : (FLMBOOL) KYSubstringParse( &pText, &uiTextLen, pIfd->uiFlags, pIfd->uiLimit, pTmpBuf, &uiWordLen); if( !bReturn) { break; } uiTempLuLen = uiLuLen; // Compute number of bytes left uiElmLen = uiMaxKeySize - uiKeyLen - uiTempLuLen; if( RC_BAD( rc = KYCollateValue( &pKeyBuf [uiKeyLen], &uiElmLen, pTmpBuf, uiWordLen, pIfd->uiFlags, pIfd->uiLimit, NULL, &uiPieceLuLen, uiLanguage, TRUE, bFirstSubstring, FALSE, NULL))) { goto Exit; } bFirstSubstring = FALSE; if( uiPostFlag) { uiElmLen -= uiPieceLuLen; f_memcpy( &pLowUpBuf [uiTempLuLen], &pKeyBuf[ uiKeyLen + uiElmLen ], uiPieceLuLen); uiTempLuLen += uiPieceLuLen; } if( !pNextIfdPiece) { // All ISKs have been added so now output the key if( uiTempLuLen ) { uiPostLen = KYCombPostParts( pKeyBuf, (FLMUINT)(uiKeyLen + uiElmLen), pLowUpBuf, uiTempLuLen, uiLanguage, (FLMUINT)(pIfd->uiFlags) ); } if( RC_BAD( rc = KYAddToKrefTbl( pDb, pIxd, uiContainerNum, pIfd, uiAction, uiDrn, pbHadUniqueKeys, pKeyBuf, (FLMUINT)(uiKeyLen + uiElmLen + uiPostLen), TRUE, FALSE, FALSE))) { goto Cleanup1; } } else if( RC_BAD( rc)) { goto Cleanup1; } else { // RECURSIVE CALL to the Next ISK provided no overflow if( RC_BAD( rc = KYCmpKeyElmBld( pDb, pIxd, uiContainerNum, pNextIfdPiece, uiAction, uiDrn, pbHadUniqueKeys, uiNextCdlEntry, uiCompoundPos + 1, pKeyBuf, (FLMUINT)(uiKeyLen + uiElmLen), pLowUpBuf, uiTempLuLen, pRecord, pFldContext))) { goto Cleanup1; } } if( (pIfd->uiFlags & IFD_SUBSTRING) && (uiTextLen == 1 && !(uiLanguage >= FIRST_DBCS_LANG && uiLanguage <= LAST_DBCS_LANG))) { break; } } Cleanup1: if (RC_BAD( rc)) { goto Exit; } } else { if( pvField) { if( pIfd->uiFlags & IFD_CONTEXT) { pKeyBuf [uiKeyLen] = KY_CONTEXT_PREFIX; flmUINT16ToBigEndian( (FLMUINT16)pRecord->getFieldID( pvField), &pKeyBuf [uiKeyLen + 1]); uiKeyLen += KY_CONTEXT_LEN; } else if( pRecord->getDataLength( pvField)) { const FLMBYTE * pExportValue = pRecord->getDataPtr( pvField); FLMUINT uiDataLength = pRecord->getDataLength( pvField); if (pRecord->isEncryptedField( pvField) && pDb->pFile->bInLimitedMode) { pExportValue = pRecord->getEncryptionDataPtr( pvField); uiDataLength = pRecord->getEncryptedDataLength( pvField); bFldIsEncrypted = TRUE; } // Compute number of bytes left uiElmLen = uiMaxKeySize - uiKeyLen - uiLuLen; if( RC_BAD( rc = KYCollateValue( &pKeyBuf [uiKeyLen], &uiElmLen, pExportValue, uiDataLength, pIfd->uiFlags, pIfd->uiLimit, NULL, &uiPieceLuLen, uiLanguage, TRUE, FALSE, FALSE, NULL, NULL, bFldIsEncrypted))) { goto Exit; } if( uiPostFlag ) { uiElmLen -= uiPieceLuLen; f_memcpy( &pLowUpBuf [uiTempLuLen], &pKeyBuf [uiKeyLen + uiElmLen], uiPieceLuLen); uiTempLuLen += uiPieceLuLen; } } } if( !pNextIfdPiece) { // All IFDs have been added so now output the key if( uiTempLuLen) { uiPostLen = KYCombPostParts( pKeyBuf, (FLMUINT)(uiKeyLen + uiElmLen), pLowUpBuf, uiTempLuLen, uiLanguage, (FLMUINT)(pIfd->uiFlags)); } if( RC_BAD( rc = KYAddToKrefTbl( pDb, pIxd, uiContainerNum, pIfd, uiAction, uiDrn, pbHadUniqueKeys, pKeyBuf, (FLMUINT)(uiKeyLen + uiElmLen + uiPostLen), TRUE, FALSE, bFldIsEncrypted))) { goto Exit; } } else if( RC_BAD( rc)) { goto Exit; } else { if( RC_BAD( rc = KYCmpKeyElmBld( pDb, pIxd, uiContainerNum, pNextIfdPiece, uiAction, uiDrn, pbHadUniqueKeys, uiNextCdlEntry, uiCompoundPos + 1, pKeyBuf, (FLMUINT)(uiKeyLen + uiElmLen), pLowUpBuf, uiTempLuLen, pRecord, pFldContext))) { goto Exit; } } } Next_CDL_Field: if( pCdl) { pCdl = pCdl->pNext; } // If the CDL list is empty, goto the next IFD if same uiCompoundPos. while ((!pCdl) && ((pIfd->uiFlags & IFD_LAST) == 0) && (pIfd->uiCompoundPos == (pIfd+1)->uiCompoundPos)) { pIfd++; pCdl = pCdlTbl [++uiCdlEntry]; } // If all fields failed the validate field path test and this piece of // the compound key is required, then goto exit NOW which will not // build any key with the previous built key pieces. if( !pCdl && !bBuiltKeyPiece && ((pIfd->uiFlags & IFD_OPTIONAL) == 0)) { goto Exit; } } Exit: if( pvMark) { GedPoolReset( &pDb->TempPool, pvMark); } return( rc); } /**************************************************************************** Desc: Validate that the current field is related to the other fields in the compound key index. The context (left-most) fields of the field paths must all be siblings of each other in order to be related. ****************************************************************************/ RCODE KYValidatePathRelation( FlmRecord * pRecord, void * pCurContext, void * pCurFld, FLD_CONTEXT * pFldContext, FLMUINT uiCompoundPos) { RCODE rc = FERR_OK; void * pCurParent; FLMUINT uiPrevCompoundPos; FLMBOOL bMatchedContext; // If too many compound levels, just exit and don't check. if( uiCompoundPos >= MAX_COMPOUND_PIECES) { goto Exit; } pCurParent = pRecord->parent( pCurContext); // First time in is the easy case - just set the parent anchor. // A value of NULL is OK. if( uiCompoundPos == 0) { pFldContext->pParentAnchor = pCurParent; goto Exit; } bMatchedContext = FALSE; uiPrevCompoundPos = uiCompoundPos; while( uiPrevCompoundPos--) { if( pFldContext->rootContexts[ uiPrevCompoundPos] == pCurContext) { // Check this field against the current field values. rc = KYVerifyMatchingPaths( pRecord, pCurContext, pCurFld, pFldContext->leafFlds[ uiPrevCompoundPos]); // Return failure on any failure. Otherwise continue. if( rc == FERR_FAILURE) { goto Exit; } bMatchedContext = TRUE; } } if( bMatchedContext) { // If we had some base relation match, there is no need to // verify that the parents are the same. goto Exit; } // Verify that the parent anchor equals the parent of pCurContext. if( pFldContext->pParentAnchor != pCurParent) { rc = RC_SET( FERR_FAILURE); goto Exit; } Exit: // Set the state variables for this compound position. if( RC_OK(rc)) { pFldContext->rootContexts[ uiCompoundPos ] = pCurContext; pFldContext->leafFlds[ uiCompoundPos] = pCurFld; } return( rc); } /**************************************************************************** Desc: Verify that two paths with a common context match paths. If the tag of pCurContext has a previous match in the compound key, the field should also match (more of a relational validation). This means that for keys (A.B.C.D AND A.B.C.E) the 'A.B.C' fields should be the same field. ALL previous field pieces must be checked for this. This could be (but isn't being) done by finding the best match" and only comparing the current with the best match. Hard Example: Do these fields match - A.B.D.E.F and A.C.D.E.G? We don't want to keep the field path of the two fields around because this is more state than we need right now. These match only if the 'A's are the same field. A A B C D D E E F G ****************************************************************************/ RCODE KYVerifyMatchingPaths( FlmRecord * pRecord, void * pCurContext, // Same value as pMatchFld's context. void * pCurFld, // Current field void * pMatchFld) // Some field from a previous piece. { RCODE rc = FERR_OK; FLMUINT uiCurLevel; FLMUINT uiMatchLevel; FLMBOOL bMismatchFound = FALSE; // If a field equals a context then don't bother to check. if( (pCurContext == pCurFld) || (pCurContext == pMatchFld)) { goto Exit; } // Go up the parent line until levels match. uiCurLevel = pRecord->getLevel( pCurFld); uiMatchLevel = pRecord->getLevel( pMatchFld); flmAssert( pRecord->getLevel( pCurContext) < uiCurLevel); while( uiCurLevel != uiMatchLevel) { if( uiCurLevel > uiMatchLevel) { pCurFld = pRecord->parent( pCurFld); uiCurLevel--; } else { pMatchFld = pRecord->parent( pMatchFld); uiMatchLevel--; } } // Go up until you hit the matching context. while( pCurFld != pCurContext) { if( pRecord->getFieldID( pCurFld) == pRecord->getFieldID( pMatchFld)) { // If the fields are NOT the same we MAY have a mismatch. if( pCurFld != pMatchFld) { bMismatchFound = TRUE; } } else { // Tags are different - start over checking bMismatchFound = FALSE; } // Go to the next parent. pCurFld = pRecord->parent( pCurFld); pMatchFld = pRecord->parent( pMatchFld); } if( bMismatchFound) { rc = RC_SET( FERR_FAILURE); goto Exit; } Exit: return( rc); } /**************************************************************************** Desc: Combine the bits from all POST text keys. ****************************************************************************/ FLMUINT KYCombPostParts( FLMBYTE * pKeyBuf, FLMUINT uiKeyLen, FLMBYTE * pLowUpBuf, FLMUINT uiLuLen, FLMUINT uiLanguage, FLMUINT uiIfdAttr) { FLMUINT wReturnLen; if( !uiLuLen) { return( 0); } wReturnLen = (FLMUINT)(uiLuLen + 2); if( (uiLanguage >= FIRST_DBCS_LANG) && (uiLanguage <= LAST_DBCS_LANG) && ((uiIfdAttr & 0x0F) == FLM_TEXT_TYPE) && (!(uiIfdAttr & IFD_CONTEXT ))) { pKeyBuf [uiKeyLen++] = 0; wReturnLen++; } pKeyBuf [uiKeyLen++] = END_COMPOUND_MARKER; f_memcpy( &pKeyBuf [uiKeyLen], pLowUpBuf, uiLuLen); pKeyBuf [uiKeyLen + uiLuLen] = (FLMBYTE) uiLuLen; return( wReturnLen ); } /**************************************************************************** Desc: Create an index key given a keyTree and index definition. This routine works on a normal data tree - used in FlmKeyBuild. where a data record is traversed with field paths being checked. ****************************************************************************/ RCODE KYTreeToKey( FDB * pDb, IXD * pIxd, FlmRecord * pRecord, FLMUINT uiContainerNum, FLMBYTE * pKeyBuf, FLMUINT * puiKeyLenRV, FLMUINT uiFlags) { RCODE rc = FERR_OK; IFD * pIfd; void * pvMatchField; FLMBYTE * pToKey = pKeyBuf; const FLMBYTE * pExportPtr; FLMUINT uiToKeyLen; FLMUINT uiTotalLen; FLMINT nth; FLMINT iMissingFlds; FLMUINT uiIskPostFlag; FLMUINT uiLuLen; FLMUINT uiPieceLuLen; FLMUINT uiLanguage; FLMUINT uiIsPost = 0; FLMBOOL bIsAsianCompound; FLMBOOL bIsCompound; FLMBYTE LowUpBuf [MAX_LOWUP_BUF]; FLMUINT uiMaxKeySize = (pIxd->uiContainerNum) ? MAX_KEY_SIZ : MAX_KEY_SIZ - getIxContainerPartLen( pIxd); if ((uiLanguage = pIxd->uiLanguage) == 0xFFFF) { uiLanguage = pDb->pFile->FileHdr.uiDefaultLanguage; } uiLuLen = 0; iMissingFlds = 0; uiTotalLen = 0; pIfd = pIxd->pFirstIfd; bIsCompound = (pIfd->uiFlags & IFD_COMPOUND) ? TRUE : FALSE; for (;;pIfd++) { uiIsPost |= (FLMUINT) (uiIskPostFlag = (FLMUINT)IFD_IS_POST_TEXT( pIfd)); bIsAsianCompound =((uiLanguage >= FIRST_DBCS_LANG) && (uiLanguage <= LAST_DBCS_LANG) && (IFD_GET_FIELD_TYPE( pIfd) == FLM_TEXT_TYPE) && (!(pIfd->uiFlags & IFD_CONTEXT))); nth = 1; uiToKeyLen = 0; // Find matching node in the tree - if not found skip and continue FIND_NXT: if( (pvMatchField = pRecord->find( pRecord->root(), pIfd->uiFldNum, nth)) != NULL) { // Match was found, now if flagged, validate its parent path if( uiFlags & KY_PATH_CHK_FLAG) { FLMUINT * puiFieldPath; void * pTempField = pvMatchField; FLMUINT uiCurrentFld; puiFieldPath = pIfd->pFieldPathCToP; for( uiCurrentFld = 1; puiFieldPath [uiCurrentFld]; uiCurrentFld++) { if( ((pTempField = pRecord->parent( pTempField)) == NULL) || (pRecord->getFieldID( pTempField) != puiFieldPath [uiCurrentFld])) { nth++; goto FIND_NXT; } } } // Convert the node's key value to the index type. // Compute maximum bytes remaining. uiToKeyLen = uiMaxKeySize - uiTotalLen; // Take the tag and make it the key if( pIfd->uiFlags & IFD_CONTEXT) { // Output the tag number. *pToKey = KY_CONTEXT_PREFIX; flmUINT16ToBigEndian( (FLMUINT16) pRecord->getFieldID( pvMatchField), &pToKey [1]); uiToKeyLen = KY_CONTEXT_LEN; } else { pExportPtr = pRecord->getDataPtr( pvMatchField); if( RC_BAD( rc = KYCollateValue( pToKey, &uiToKeyLen, pExportPtr, pRecord->getDataLength( pvMatchField), pIfd->uiFlags, pIfd->uiLimit, NULL, &uiPieceLuLen, uiLanguage, bIsCompound, (FLMBOOL) ((pIfd->uiFlags & IFD_SUBSTRING) ? (pRecord->isLeftTruncated( pvMatchField) ? FALSE : TRUE) : FALSE), pRecord->isRightTruncated( pvMatchField), NULL))) { goto Exit; } if( pRecord->isRightTruncated( pvMatchField)) { // If the string is EXACTLY the length of the truncation // length then it should, but doesn't, set the truncation flag. // The code didn't match the design intent. f_memmove( &pToKey[ uiToKeyLen - uiPieceLuLen + 1], &pToKey[ uiToKeyLen - uiPieceLuLen], uiPieceLuLen); pToKey[ uiToKeyLen - uiPieceLuLen] = COLL_TRUNCATED; uiToKeyLen++; } if( uiIskPostFlag) { uiToKeyLen -= uiPieceLuLen; f_memcpy( &LowUpBuf [uiLuLen], &pToKey [uiToKeyLen], uiPieceLuLen ); uiLuLen += uiPieceLuLen; } } } // Check here if key found else the fields are missing. if( uiToKeyLen) { iMissingFlds = 0; pToKey += uiToKeyLen; uiTotalLen += uiToKeyLen; // Go to the last IFD with the same compound position. while( ((pIfd->uiFlags & IFD_LAST) == 0) && (pIfd->uiCompoundPos == (pIfd+1)->uiCompoundPos)) { pIfd++; } } else { // Continue if there are still fields with same compound position. if( ((pIfd->uiFlags & IFD_LAST) == 0) && (pIfd->uiCompoundPos == (pIfd+1)->uiCompoundPos)) { continue; } iMissingFlds++; if( bIsAsianCompound) { iMissingFlds++; } } // Check if done. if( pIfd->uiFlags & IFD_LAST) { break; } if( bIsCompound) { if( bIsAsianCompound) { *pToKey++ = 0; uiTotalLen++; } *pToKey++ = COMPOUND_MARKER; uiTotalLen++; } else if( uiToKeyLen) { break; } } // Back up iMissingFlds-1 because last // field does not have compound marker. // Add 4 bytes of foxes for high values. if( iMissingFlds && (uiFlags & KY_HIGH_FLAG) && bIsCompound) { // Ignore the last one or two iMissingFlds values because a compound // marker was not added to the end of the key. if( bIsAsianCompound) { iMissingFlds--; } uiTotalLen -= --iMissingFlds; pToKey -= iMissingFlds; // Fill with high values to the end of the buffer. // It is easy for double byte ASIAN collation values to all be 0xFF. if( uiTotalLen < uiMaxKeySize) { f_memset( pToKey, 0xFF, uiMaxKeySize - uiTotalLen ); pToKey += (uiMaxKeySize - uiTotalLen); uiTotalLen += (uiMaxKeySize - uiTotalLen); } } else if( uiIsPost) { uiTotalLen += KYCombPostParts( pKeyBuf, uiTotalLen, LowUpBuf, uiLuLen, uiLanguage, (FLMUINT)(pIfd->uiFlags)); } // Add container number to the key if the index is on all containers. if (!pIxd->uiContainerNum) { appendContainerToKey( pIxd, uiContainerNum, pKeyBuf, &uiTotalLen); } *puiKeyLenRV = uiTotalLen; Exit: return( rc); } /**************************************************************************** Desc: Build a collated key value piece. ****************************************************************************/ RCODE KYCollateValue( FLMBYTE * pDest, FLMUINT * puiDestLenRV, const FLMBYTE * pSrc, FLMUINT uiSrcLen, FLMUINT uiFlags, FLMUINT uiLimit, FLMUINT * puiCollationLen, FLMUINT * puiLuLenRV, FLMUINT uiLanguage, FLMBOOL bCompoundPiece, FLMBOOL bFirstSubstring, FLMBOOL bInputTruncated, FLMBOOL * pbDataTruncated, FLMBOOL * pbOriginalCharsLost, FLMBOOL bFldIsEncrypted) { RCODE rc = FERR_OK; FLMUINT uiDestLen; FLMUINT uiDataType = uiFlags & 0x0F; // Treat an encrypted field as binary for collation purposes. if (bFldIsEncrypted) { uiDataType = FLM_BINARY_TYPE; } if( puiLuLenRV) { *puiLuLenRV = 0; } if( (uiDestLen = *puiDestLenRV) == 0) { return( RC_SET( FERR_KEY_OVERFLOW)); } if( uiDataType == FLM_TEXT_TYPE) { FLMUINT uiCharLimit; FLMBYTE byTmpBuf[ MAX_KEY_SIZ + 8]; if( uiFlags & (IFD_MIN_SPACES | IFD_NO_UNDERSCORE | IFD_NO_SPACE | IFD_NO_DASH | IFD_ESC_CHAR)) { if( RC_BAD( rc = KYFormatText( pSrc, uiSrcLen, (uiFlags & IFD_MIN_SPACES) ? TRUE : FALSE, (uiFlags & IFD_NO_UNDERSCORE) ? TRUE : FALSE, (uiFlags & IFD_NO_SPACE) ? TRUE : FALSE, (uiFlags & IFD_NO_DASH) ? TRUE : FALSE, (uiFlags & IFD_ESC_CHAR) ? TRUE : FALSE, bInputTruncated, byTmpBuf, &uiSrcLen))) { goto Exit; } pSrc = (FLMBYTE *) byTmpBuf; } uiCharLimit = uiLimit ? uiLimit : IFD_DEFAULT_LIMIT; if( (uiLanguage >= FIRST_DBCS_LANG ) && (uiLanguage <= LAST_DBCS_LANG)) { rc = AsiaFlmTextToColStr( pSrc, uiSrcLen, pDest, &uiDestLen, (uiFlags & IFD_UPPER), puiCollationLen, puiLuLenRV, uiCharLimit, bFirstSubstring, pbDataTruncated); } else { rc = FTextToColStr( pSrc, uiSrcLen, pDest, &uiDestLen, (uiFlags & IFD_UPPER), puiCollationLen, puiLuLenRV, uiLanguage, uiCharLimit, bFirstSubstring, pbOriginalCharsLost, pbDataTruncated); } } // uiDestLen could be set to zero if text and no value. if( !uiSrcLen || !uiDestLen) { if( !bCompoundPiece) { // Zero length key. Any value under 0x1F would work. if( (uiLanguage >= FIRST_DBCS_LANG ) && (uiLanguage <= LAST_DBCS_LANG)) { pDest [0] = 0; pDest [1] = NULL_KEY_MARKER; uiDestLen = 2; } else { pDest [0] = NULL_KEY_MARKER; uiDestLen = 1; } } else { uiDestLen = 0; } goto Exit; } switch (uiDataType) { case FLM_TEXT_TYPE: { break; } case FLM_NUMBER_TYPE: { FLMBYTE * pOutput = pDest + 1; const FLMBYTE * pTempSrc = pSrc; FLMUINT uiBytesOutput = 1; FLMUINT uiMaxOutLen = uiDestLen; FLMINT iHiInNibble = 1; FLMINT iHiOutNibble = 1; FLMUINT uiSigSign = SIG_POS; FLMUINT uiMagnitude = COLLATED_NUM_EXP_BIAS - 1; FLMBYTE byValue; for (rc = FERR_OK;;) { switch( byValue = (iHiInNibble++ & 1) ? (FLMBYTE)(*pTempSrc >> 4) : (FLMBYTE)(*pTempSrc++ & 0x0F)) { case 0x0B: // Negative Sign code { uiSigSign = 0; continue; } case 0x0A: // Ignore for now - not implemented case 0x0C: case 0x0D: case 0x0E: { continue; } case 0x0F: // Terminator { *pDest = (FLMBYTE)(uiSigSign | ((uiSigSign ? uiMagnitude : ~uiMagnitude) & 0x7F)); goto NumDone; } default: { uiMagnitude++; if( uiSigSign) { byValue += COLLATED_DIGIT_OFFSET; } else { // Invert for key collation byValue = (FLMBYTE)((COLLATED_DIGIT_OFFSET + 9) - byValue); } if( iHiOutNibble++ & 1) { if( uiBytesOutput++ == uiMaxOutLen) { uiBytesOutput = 0; rc = RC_SET( FERR_KEY_OVERFLOW); goto NumDone; } *pOutput = (FLMBYTE)((byValue << 4) | 0x0F); } else { *pOutput++ &= (FLMBYTE)(byValue | 0xF0); } continue; } } } NumDone: uiDestLen = uiBytesOutput; break; } case FLM_BINARY_TYPE: { FLMUINT uiLength = uiSrcLen; const FLMBYTE * tmpSrc = pSrc; FLMBYTE * tmpDest = pDest; FLMBOOL bTruncated = FALSE; if( uiLength >= uiLimit) { uiLength = uiLimit; bTruncated = TRUE; } if( uiDestLen < (uiLength << 1)) { // Compute length so will not overflow uiLength = (FLMUINT)(uiDestLen >> 1); } else { uiDestLen = (FLMUINT)(uiLength << 1); } // Convert each byte to two bytes while( uiLength--) { *tmpDest++ = (FLMBYTE)(COLLS + ((*tmpSrc) >> 4)); *tmpDest++ = (FLMBYTE)(COLLS + ((*tmpSrc++) & 0x0F)); } if( bTruncated) { *tmpDest++ = COLL_TRUNCATED; } break; } case FLM_CONTEXT_TYPE: { if( uiDestLen < 5) { uiDestLen = 0; rc = RC_SET( FERR_KEY_OVERFLOW); } else { *pDest = 0x1F; flmUINT32ToBigEndian( FB2UD( pSrc), pDest + 1); uiDestLen = 5; rc = FERR_OK; } break; } default: { rc = RC_SET( FERR_CONV_BAD_DEST_TYPE); break; } } Exit: *puiDestLenRV = uiDestLen; return( rc); } /**************************************************************************** Desc: Format text removing leading and trailing spaces. Treat underscores as spaces. As options, remove all spaces and dashes. Ret: FERR_OK always. WIll truncate so text will fill MAX_KEY_SIZ. Allocate 8 more than MAX_KEY_SIZ for psDestBuf. Visit: Pass in uiLimit and pass back a truncated flag when the string is truncated. This was not done because we will have to get the exact truncated count that is done in f_tocoll.cpp and that could introduce some bugs. ****************************************************************************/ RCODE KYFormatText( const FLMBYTE * psVal, // Points to value source FLMUINT uiSrcLen, // Length of the key-NOT NULL TERMINATED // Booleans below are zero or NON-zero FLMBOOL bMinSpaces, // Remove leading/trailing/multiple spaces FLMBOOL bNoUnderscore, // Convert underscore to space FLMBOOL bNoSpace, // Remove all spaces FLMBOOL bNoDash, // Remove all dashes (hyphens) FLMBOOL bEscChar, // Literal '*' or '\\' char after '\\' esc char FLMBOOL bInputTruncated,// TRUE if input key data is truncated. FLMBYTE * psDestBuf, // (out) Destination buffer FLMUINT * puuiDestLen) // (out) Length of key in destination buffer. { RCODE rc = FERR_OK; FLMBYTE * psDestPtr = psDestBuf; FLMBYTE ucValue; FLMBYTE objType; FLMUINT uiCurPos = 0; FLMUINT uiDestPos = 0; FLMUINT uiOldDestPos = 0; FLMUINT objLength; FLMBOOL bLastCharWasSpace = bMinSpaces; for( ; uiCurPos < uiSrcLen && uiDestPos < MAX_KEY_SIZ - 1; uiCurPos += objLength) { ucValue = psVal [uiCurPos]; objLength = 1; uiOldDestPos = uiDestPos; objType = (FLMBYTE)(flmTextObjType( ucValue)); switch( objType) { case ASCII_CHAR_CODE: // 0nnnnnnn { if( (ucValue == ASCII_SPACE) || ((ucValue == ASCII_UNDERSCORE) && bNoUnderscore)) { if( bLastCharWasSpace || bNoSpace) { break; } // Sets to true if we want to minimize spaces. bLastCharWasSpace = bMinSpaces; ucValue = ASCII_SPACE; } else if( (ucValue == ASCII_DASH) && bNoDash) { break; } else { if( (ucValue == ASCII_BACKSLASH) && bEscChar && (psVal [uiCurPos+1] == ASCII_WILDCARD || psVal [uiCurPos+1] == ASCII_BACKSLASH)) { ucValue = psVal [uiCurPos+1]; objLength++; } bLastCharWasSpace = FALSE; } psDestPtr[ uiDestPos++] = ucValue; break; } case WHITE_SPACE_CODE: // 110nnnnn { if( bLastCharWasSpace || bNoSpace) { break; } // Sets to true if we want to minimize spaces. bLastCharWasSpace = bMinSpaces; psDestPtr[ uiDestPos++] = ASCII_SPACE; break; } case CHAR_SET_CODE: // 10nnnnnn case UNK_EQ_1_CODE: case OEM_CODE: { bLastCharWasSpace = FALSE; psDestPtr[ uiDestPos++] = psVal [uiCurPos]; psDestPtr[ uiDestPos++] = psVal [uiCurPos+1]; objLength = 2; break; } case UNICODE_CODE: // Unconvertable UNICODE code case EXT_CHAR_CODE: // Full extended character { bLastCharWasSpace = FALSE; psDestPtr[ uiDestPos++] = psVal [uiCurPos]; psDestPtr[ uiDestPos++] = psVal [uiCurPos+1]; psDestPtr[ uiDestPos++] = psVal [uiCurPos+2]; objLength = 3; break; } case UNK_GT_255_CODE: { bLastCharWasSpace = FALSE; objLength = 1 + sizeof( FLMUINT) + FB2UW( &psVal [uiCurPos + 1]); break; } case UNK_LE_255_CODE: { bLastCharWasSpace = FALSE; objLength = 2 + (FLMUINT) (psVal [uiCurPos+1]); break; } default: { psDestPtr[ uiDestPos++] = psVal [uiCurPos]; bLastCharWasSpace = FALSE; break; } } } // On overflow - back out of the last character. if( uiDestPos >= MAX_KEY_SIZ - 1) { uiDestPos = uiOldDestPos; bLastCharWasSpace = FALSE; } // Handle the trailing space if present. // bLastCharWasSpace cannot be set to true if bNoSpace is true. if( bLastCharWasSpace && uiDestPos && !bInputTruncated) { uiDestPos--; } psDestPtr[ uiDestPos] = '\0'; *puuiDestLen = (FLMUINT) uiDestPos; return( rc); } /**************************************************************************** Desc: Convert a text string to a collated string. ****************************************************************************/ RCODE AsiaFlmTextToColStr( const FLMBYTE * Str, // Points to the internal TEXT string FLMUINT StrLen, // Length of the internal TEXT string FLMBYTE * ColStr, // Output collated string FLMUINT * ColStrLenRV, // Collated string length return value // Input value is MAX num of bytes in buffer FLMUINT UppercaseFlag, // Set if to convert to uppercase FLMUINT * puiCollationLen, // Returns the collation bytes length FLMUINT * puiCaseLen, // Returns length of case bytes FLMUINT uiCharLimit, // Max number of characters in this key piece FLMBOOL bFirstSubstring, // TRUE is this is the first substring key FLMBOOL * pbDataTruncated) { RCODE rc = FERR_OK; const FLMBYTE * pszStrEnd; FLMUINT Length; FLMUINT uiTargetColLen = *ColStrLenRV - 12; FLMBYTE SubColBuf[MAX_SUBCOL_BUF + 1]; FLMBYTE LowUpBuf[MAX_LOWUP_BUF + MAX_LOWUP_BUF + 2]; FLMUINT ColLen; FLMUINT SubColBitPos; FLMUINT LowUpBitPos; FLMUINT Flags; FLMUINT16 NextWpChar; FLMUINT16 UnicodeChar; FLMUINT16 ColValue; FLMBOOL bDataTruncated = FALSE; ColLen = 0; SubColBitPos = 0; LowUpBitPos = 0; Flags = 0; UnicodeChar = 0; ColValue = 0; // Don't allow any key component to exceed 256 bytes regardless of the // user-specified character or byte limit. The goal is to prevent any // single key piece from consuming too much of the key (which is // limited to 640 bytes) and thus "starving" other pieces, resulting in // a key overflow error. if (uiTargetColLen > 256) { uiTargetColLen = 256; } // Make sure SubColBuf and LowUpBuf are set to 0's f_memset( SubColBuf, 0, sizeof( SubColBuf)); f_memset( LowUpBuf, 0, sizeof( LowUpBuf)); pszStrEnd = &Str[StrLen]; NextWpChar = 0; while ((Str < pszStrEnd) || NextWpChar || UnicodeChar) { FLMUINT16 WpChar; // Current WP character FLMUINT ObjLength; FLMUINT16 SubColVal; // Sub-collated value (diacritic) FLMBYTE CaseFlags; // Get the next character from the TEXT String. NOTE: OEM // characters will be returned as character set ZERO, the character // will be greater than 127. WpChar = NextWpChar; for (NextWpChar = 0; (!WpChar || !NextWpChar) && !UnicodeChar && (Str < pszStrEnd); Str += ObjLength) { FLMBYTE ObjType; FLMBYTE CurByte; FLMUINT16 CurWpChar = 0; CurByte = *Str; ObjType = (FLMBYTE) (flmTextObjType( CurByte)); ObjLength = 1; switch (ObjType) { case ASCII_CHAR_CODE: { CurWpChar = (FLMUINT16) CurByte; break; } case CHAR_SET_CODE: { ObjLength = 2; CurWpChar = (FLMUINT16) (((FLMUINT16) (CurByte & (~CHAR_SET_MASK)) << 8) + (FLMUINT16) *(Str + 1)); break; } case WHITE_SPACE_CODE: { CurByte &= (~WHITE_SPACE_MASK); CurWpChar = ((CurByte == HARD_HYPHEN) || (CurByte == HARD_HYPHEN_EOL) || (CurByte == HARD_HYPHEN_EOP)) ? 0x2D // Minus sign - character set zero : 0x20; // Space -- character set zero break; } case UNK_GT_255_CODE: { ObjLength = 1 + sizeof(FLMUINT16) + FB2UW( Str + 1); break; } case UNK_LE_255_CODE: { ObjLength = 2 + (FLMUINT16) * (Str + 1); break; } case UNK_EQ_1_CODE: { ObjLength = 2; break; } case EXT_CHAR_CODE: { ObjLength = 3; CurWpChar = (FLMUINT16) (((FLMUINT16) *(Str + 1) << 8) + (FLMUINT16) *(Str + 2)); break; } case OEM_CODE: { ObjLength = 2; // OEM characters are always >= 128. // We use character set zero to process them. CurWpChar = (FLMUINT16) * (Str + 1); break; } case UNICODE_CODE: { ObjLength = 3; UnicodeChar = (FLMUINT16) (((FLMUINT16) *(Str + 1) << 8) + (FLMUINT16) *(Str + 2)); CurWpChar = 0; break; } default: { // Shouldn't ever get to this point continue; } } if (!WpChar) { WpChar = CurWpChar; } else { NextWpChar = CurWpChar; } } // If we didn't get a character, break out of the outer processing // loop. if (!WpChar && !UnicodeChar) { break; } if (WpChar) { if (flmAsiaGetCollation( WpChar, NextWpChar, ColValue, &ColValue, &SubColVal, &CaseFlags, (FLMUINT16) UppercaseFlag) == 2) { // Took the NextWpChar value NextWpChar = 0; } } else { // This handles all of the UNICODE characters that could not be // converted to WP characters - which will include most of the // Asian characters. CaseFlags = 0; if (UnicodeChar < 0x20) { ColValue = 0xFFFF; // Setting SubColVal to a high code will ensure that the code // that the UnicodeChar will be stored in its full 16 bits in // the sub-collation area. SubColVal = 0xFFFF; // NOTE: UnicodeChar SHOULD NOT be set to zero here. It will // be set to zero below. } else { ColValue = UnicodeChar; SubColVal = 0; UnicodeChar = 0; } } // Store the values in 2 bytes ColStr[ColLen++] = (FLMBYTE) (ColValue >> 8); ColStr[ColLen++] = (FLMBYTE) (ColValue & 0xFF); if (SubColVal) { Flags |= HAD_SUB_COLLATION; if (SubColVal <= 31) // 5 bit - store bits 10 { SET_BIT( SubColBuf, SubColBitPos); SubColBitPos += 1 + 1; // Stores a zero SETnBITS( 5, SubColBuf, SubColBitPos, SubColVal); SubColBitPos += 5; } else // 2 bytes - store bits 110 or 11110 { FLMUINT Temp; SET_BIT( SubColBuf, SubColBitPos); SubColBitPos++; SET_BIT( SubColBuf, SubColBitPos); SubColBitPos++; if (!WpChar && UnicodeChar) // Store as "11110" { SubColVal = UnicodeChar; UnicodeChar = 0; SET_BIT( SubColBuf, SubColBitPos); SubColBitPos++; SET_BIT( SubColBuf, SubColBitPos); SubColBitPos++; } SubColBitPos++; // Skip past the zero // Go to the next byte boundary to write the WP char SubColBitPos = (SubColBitPos + 7) & (~7); Temp = BYTES_IN_BITS( SubColBitPos); // Need to store HIGH-Low - PC format is Low-high! SubColBuf[Temp] = (FLMBYTE) (SubColVal >> 8); SubColBuf[Temp + 1] = (FLMBYTE) (SubColVal); SubColBitPos += 16; } } else { SubColBitPos++; } // Save case information - always 2 bits worth for asian if (CaseFlags & 0x02) { SET_BIT( LowUpBuf, LowUpBitPos); } LowUpBitPos++; if (CaseFlags & 0x01) { SET_BIT( LowUpBuf, LowUpBitPos); } LowUpBitPos++; // Check to see if ColLen is within 1 byte of max if ((ColLen >= uiCharLimit) || (ColLen + BYTES_IN_BITS( SubColBitPos) + BYTES_IN_BITS( LowUpBitPos) >= uiTargetColLen)) { // Still something left? if ((Str < pszStrEnd) || NextWpChar || UnicodeChar) { bDataTruncated = TRUE; } // Hit the max. number of characters break; } } if (puiCollationLen) { *puiCollationLen = ColLen; } // Add the first substring marker - also serves as making the string // non-null. if (bFirstSubstring) { ColStr[ColLen++] = 0; ColStr[ColLen++] = COLL_FIRST_SUBSTRING; } if (bDataTruncated) { ColStr[ColLen++] = 0; ColStr[ColLen++] = COLL_TRUNCATED; } if (!ColLen && !SubColBitPos) { if (puiCaseLen) { *puiCaseLen = 0; } goto Exit; } // Done putting the String into 3 sections - build the COLLATED KEY if (Flags & HAD_SUB_COLLATION) { ColStr[ColLen++] = 0; ColStr[ColLen++] = COLL_MARKER | SC_SUB_COL; // Move the Sub-collation (diacritics) into the collating String Length = (FLMUINT) (BYTES_IN_BITS( SubColBitPos)); f_memcpy( &ColStr[ColLen], SubColBuf, Length); ColLen += Length; } // Always represent the marker as 2 bytes and case bits in asia ColStr[ColLen++] = 0; ColStr[ColLen++] = COLL_MARKER | SC_MIXED; Length = (FLMUINT) (BYTES_IN_BITS( LowUpBitPos)); f_memcpy( &ColStr[ColLen], LowUpBuf, Length); if (puiCaseLen) { *puiCaseLen = (FLMUINT) (Length + 2); } ColLen += Length; Exit: if (pbDataTruncated) { *pbDataTruncated = bDataTruncated; } *ColStrLenRV = (FLMUINT) ColLen; return (rc); } /**************************************************************************** Desc: Convert a text string to a collated string. If FERR_CONV_DEST_OVERFLOW is returned the string is truncated as best as it can be. The caller must decide to return the error up or deal with the truncation. Return: RCODE = SUCCESS or FERR_CONV_DEST_OVERFLOW VISIT: If the string is EXACTLY the length of the truncation length then it should, but doesn't, set the truncation flag. The code didn't match the design intent. Fix next major version. ****************************************************************************/ RCODE FTextToColStr( const FLMBYTE * pucStr, // Points to the internal TEXT string FLMUINT uiStrLen, // Length of the internal TEXT string FLMBYTE * pucCollatedStr, // Returns collated string FLMUINT * puiCollatedStrLen, // Returns total collated string length // Input is maximum bytes in buffer FLMUINT uiUppercaseFlag, // Set if to convert to uppercase FLMUINT * puiCollationLen, // Returns the collation bytes length FLMUINT * puiCaseLen, // Returns length of case bytes FLMUINT uiLanguage, // Language FLMUINT uiCharLimit, // Max number of characters in this key piece FLMBOOL bFirstSubstring, // TRUE is this is the first substring key FLMBOOL * pbOriginalCharsLost, FLMBOOL * pbDataTruncated) { RCODE rc = FERR_OK; const FLMBYTE * pucStrEnd; // Points to the end of the string FLMUINT16 ui16Base; // Value of the base character FLMUINT16 ui16SubColVal; // Sub-collated value (diacritic) FLMUINT uiObjLength = 0; FLMUINT uiLength; // Temporary variable for length FLMUINT uiTargetColLen = *puiCollatedStrLen - 8; // 4=ovhd,4=worse char FLMUINT uiObjType; FLMBOOL bDataTruncated = FALSE; // Need to increase the buffer sizes to not overflow. // Characaters without COLL values will take up 3 bytes in // the ucSubColBuf[] and easily overflow the buffer. // Hard coded the values so as to minimize changes. FLMBYTE ucSubColBuf[ MAX_SUBCOL_BUF + 301]; // Holds sub-collated values(diac) FLMBYTE ucCaseBits[ MAX_LOWUP_BUF + 81]; // Holds case bits FLMUINT16 ui16WpChr; // Current WP character FLMUNICODE unichr = 0; // Current unconverted Unicode character FLMUINT16 ui16WpChr2; // 2nd character if any; default 0 for US lang FLMUINT uiColLen; // Return value of collated length FLMUINT uiSubColBitPos; // Sub-collation bit position FLMUINT uiCaseBitPos; // Case bit position FLMUINT uiFlags; // Clear all bit flags FLMBOOL bHebrewArabic = FALSE; // Set if language is hebrew, arabic, farsi FLMBOOL bTwoIntoOne; uiColLen = 0; uiSubColBitPos = 0; uiCaseBitPos = 0; uiFlags = 0; ui16WpChr2 = 0; // Don't allow any key component to exceed 256 bytes regardless of the // user-specified character or byte limit. The goal is to prevent // any single key piece from consuming too much of the key (which is // limited to 640 bytes) and thus "starving" other pieces, resulting // in a key overflow error. if( uiTargetColLen > 256) { uiTargetColLen = 256; } // Code below sets ucSubColBuf[] and ucCaseBits[] values to zero. if (uiLanguage != US_LANG) { if (uiLanguage == AR_LANG || // Arabic uiLanguage == FA_LANG || // Farsi - persian uiLanguage == HE_LANG || // Hebrew uiLanguage == UR_LANG) // Urdu { bHebrewArabic = TRUE; } } pucStrEnd = &pucStr [uiStrLen]; while (pucStr < pucStrEnd) { // Set the case bits and sub-collation bits to zero when // on the first bit of the byte. if (!(uiCaseBitPos & 0x07)) { ucCaseBits [uiCaseBitPos >> 3] = 0; } if (!(uiSubColBitPos & 0x07)) { ucSubColBuf [uiSubColBitPos >> 3] = 0; } // Get the next character from the TEXT string. for (ui16WpChr = ui16SubColVal = 0; // Default sub-collation value !ui16WpChr && pucStr < pucStrEnd; pucStr += uiObjLength) { FLMBYTE ucChar = *pucStr; uiObjType = flmTextObjType( ucChar); switch (uiObjType) { case ASCII_CHAR_CODE: // 0nnnnnnn { uiObjLength = 1; // Character set zero is assumed. ui16WpChr = (FLMUINT16)ucChar; continue; } case CHAR_SET_CODE: // 10nnnnnn { uiObjLength = 2; // Character set followed by character ui16WpChr = (((FLMUINT16)(ucChar & (~CHAR_SET_MASK)) << 8) + (FLMUINT16)*(pucStr + 1)); continue; } case WHITE_SPACE_CODE: // 110nnnnn { uiObjLength = 1; ucChar &= (~WHITE_SPACE_MASK); ui16WpChr = (ucChar == HARD_HYPHEN || ucChar == HARD_HYPHEN_EOL || ucChar == HARD_HYPHEN_EOP) ? (FLMUINT16)0x2D // Minus sign -- character set 0 : (FLMUINT16)0x20;// Space -- character set zero continue; } case UNK_GT_255_CODE: { uiObjLength = 3 + FB2UW( pucStr + 1); continue; } case UNK_LE_255_CODE: { uiObjLength = 2 + (FLMUINT16)*(pucStr + 1); continue; } case UNK_EQ_1_CODE: { uiObjLength = 2; continue; } case EXT_CHAR_CODE: { uiObjLength = 3; // Character set followed by character ui16WpChr = (((FLMUINT16)*(pucStr + 1) << 8) + (FLMUINT16)*(pucStr + 2)); continue; } case OEM_CODE: { // OEM characters are always >= 128 // Use character set zero to process them. uiObjLength = 2; ui16WpChr = (FLMUINT16)*(pucStr + 1); continue; } case UNICODE_CODE: // Unconvertable UNICODE code { uiObjLength = 3; // Unicode character followed by unicode character set unichr = (FLMUINT16)(((FLMUINT16)*(pucStr + 1) << 8) + (FLMUINT16)*(pucStr + 2)); ui16WpChr = UNK_UNICODE_CODE; continue; } default: { // Should not happen, but don't return an error flmAssert( 0); continue; } } } // If we didn't get a character, break out of while loop. if (!ui16WpChr) { break; } // flmCheckDoubleCollation modifies ui16WpChr if a digraph or a double // character sequence is found. If a double character is found, pucStr // is incremented and ui16WpChr2 is set to 1. If a digraph is found, // pucStr is not changed, but ui16WpChr contains the first character and // ui16WpChr2 contains the second character of the digraph. if (uiLanguage != US_LANG) { ui16WpChr2 = flmCheckDoubleCollation( &ui16WpChr, &bTwoIntoOne, &pucStr, uiLanguage); } // Save the case bit if (!uiUppercaseFlag) { // charIsUpper returns TRUE if upper case, 0 if lower case. if (!charIsUpper( ui16WpChr)) { uiFlags |= HAD_LOWER_CASE; } else { // Set if upper case. SET_BIT( ucCaseBits, uiCaseBitPos); } uiCaseBitPos++; } // Handle OEM characters, non-collating characters, // characters with subcollating values, double collating // values. // Get the collated value from the WP character-if not collating value if ((pucCollatedStr[ uiColLen++] = (FLMBYTE)(flmGetCollation( ui16WpChr, uiLanguage))) >= COLS11) { FLMUINT uiTemp; // Save OEM characters just like non-collating characters // If lower case, convert to upper case. if (!charIsUpper( ui16WpChr)) { ui16WpChr &= ~1; } // No collating value given for this WP char. // Save original WP char (2 bytes) in subcollating // buffer. // 1110 is a new code that will store an insert over // the character OR a non-convertable unicode character. // Store with the same alignment as "store_extended_char" // below. // 11110 is code for unmappable UNICODE value. // A value 0xFE will be the collation value. The sub-collation // value will be 0xFFFF followed by the UNICODE value. // Be sure to eat an extra case bit. // See specific Hebrew and Arabic comments in the // switch statement below. // Set the next byte that follows in the sub collation buffer. ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0; if (bHebrewArabic && (pucCollatedStr [uiColLen-1] == COLS0_ARABIC)) { // Store first bit of 1110, fall through & store remaining 3 bits SET_BIT( ucSubColBuf, uiSubColBitPos); uiSubColBitPos++; // Don't store collation value uiColLen--; } else if (unichr) { ui16WpChr = unichr; unichr = 0; // Store 11 out of 11110 SET_BIT( ucSubColBuf, uiSubColBitPos); uiSubColBitPos++; SET_BIT( ucSubColBuf, uiSubColBitPos); uiSubColBitPos++; if (!uiUppercaseFlag) { ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0; // Set upper case bit. SET_BIT( ucCaseBits, uiCaseBitPos); uiCaseBitPos++; } } store_extended_char: // Set the next byte that follows in the sub collation buffer. ucSubColBuf [(uiSubColBitPos + 8) >> 3] = 0; ucSubColBuf [(uiSubColBitPos + 16) >> 3] = 0; uiFlags |= HAD_SUB_COLLATION; // Set 110 bits in sub-collation - continued from above. // No need to explicitly set the zero, but must increment // for it. SET_BIT( ucSubColBuf, uiSubColBitPos); uiSubColBitPos++; SET_BIT( ucSubColBuf, uiSubColBitPos); uiSubColBitPos += 2; // store_aligned_word: This label is not referenced. // Go to the next byte boundary to write the character. uiSubColBitPos = (uiSubColBitPos + 7) & (~7); uiTemp = BYTES_IN_BITS( uiSubColBitPos); // Need to big-endian - so it will sort correctly. ucSubColBuf [uiTemp] = (FLMBYTE)(ui16WpChr >> 8); ucSubColBuf [uiTemp + 1] = (FLMBYTE)(ui16WpChr); uiSubColBitPos += 16; ucSubColBuf [uiSubColBitPos >> 3] = 0; } else { // Had a collation value // Add the lower/uppercase bit if a mixed case output. // If not lower ASCII set - check diacritic value for sub-collation if (!(ui16WpChr & 0xFF00)) { // ASCII character set - set a single 0 bit - just need to // increment to do this. uiSubColBitPos++; } else { FLMBYTE ucTmpChar = (FLMBYTE)ui16WpChr; FLMBYTE ucCharSet = (FLMBYTE)(ui16WpChr >> 8); // Convert char to uppercase because case information // is stored above. This will help // ensure that the "ETA" doesn't sort before "eta" if (!charIsUpper(ui16WpChr)) { ui16WpChr &= ~1; } switch (ucCharSet) { case CHSMUL1: // Multinational 1 { // If we cannot break down a char into base and // diacritic we cannot combine the charaacter // later when converting back the key. In that case, // write the entire WP char in the sub-collation area. if (flmCh6Brkcar( ui16WpChr, &ui16Base, &ui16SubColVal)) { goto store_extended_char; } // Write the FLAIM diacritic sub-collation value. // Prefix is 2 bits "10". Remember to leave // "111" alone for the future. // NOTE: The "unlaut" character must sort after the "ring" // character. ui16SubColVal = ((ui16SubColVal & 0xFF) == umlaut && (uiLanguage == SU_LANG || uiLanguage == SV_LANG || uiLanguage == CZ_LANG || uiLanguage == SL_LANG)) ? (FLMUINT16)(flmDia60Tbl[ ring] + 1) : (FLMUINT16)(flmDia60Tbl[ ui16SubColVal & 0xFF]); store_sub_col: // Set the next byte that follows in the sub collation buffer. ucSubColBuf[ (uiSubColBitPos + 8) >> 3] = 0; uiFlags |= HAD_SUB_COLLATION; // Set the 10 bits - no need to explicitly set the zero, but // must increment for it. SET_BIT( ucSubColBuf, uiSubColBitPos); uiSubColBitPos += 2; // Set sub-collation bits. SETnBITS( 5, ucSubColBuf, uiSubColBitPos, ui16SubColVal); uiSubColBitPos += 5; break; } case CHSGREK: // Greek { if (ucTmpChar >= 52 || // Keep case bit for 52-69 else ignore ui16WpChr == 0x804 || // [ 8,4] BETA Medial | Terminal ui16WpChr == 0x826) // [ 8,38] SIGMA terminal { goto store_extended_char; } // No subcollation to worry about - set a zero bit by // incrementing the bit position. uiSubColBitPos++; break; } case CHSCYR: { if (ucTmpChar >= 144) { goto store_extended_char; } // No subcollation to worry about - set a zero bit by // incrementing the bit position. uiSubColBitPos++; // VISIT: Georgian covers 208-249 - no collation defined yet break; } case CHSHEB: // Hebrew { // Three sections in Hebrew: // 0..26 - main characters // 27..83 - accents that apear over previous character // 84..118- dagesh (ancient) hebrew with accents // Because the ancient is only used for sayings & scriptures // we will support a collation value and in the sub-collation // store the actual character because sub-collation is in // character order. if (ucTmpChar >= 84) // Save ancient - value 84 and above { goto store_extended_char; } // No subcollation to worry about - set a zero bit by // incrementing the bit position. uiSubColBitPos++; break; } case CHSARB1: // Arabic 1 { // Three sections in Arabic: // 00..37 - accents that display OVER a previous character // 38..46 - symbols // 47..57 - numbers // 58..163 - characters // 164 - hamzah accent // 165..180- common characters with accents // 181..193- ligatures - common character combinations // 194..195- extensions - throw away when sorting if (ucTmpChar <= 46) { goto store_extended_char; // save original character } if (pucCollatedStr[ uiColLen-1] == COLS10a+1) // Alef? { ui16SubColVal = (ucTmpChar >= 165) ? (FLMUINT16)(flmAlefSubColTbl[ ucTmpChar - 165 ]) : (FLMUINT16)7; // Alef subcol value goto store_sub_col; } if (ucTmpChar >= 181) // Ligatures - char combination { goto store_extended_char; // save original character } if (ucTmpChar == 64) // taa exception { ui16SubColVal = 8; goto store_sub_col; } // No subcollation to worry about - set a zero bit by // incrementing the bit position. uiSubColBitPos++; break; } case CHSARB2: // Arabic 2 { // There are some characters that share the same slot // Check the bit table if above character 64 if (ucTmpChar >= 64 && flmAr2BitTbl[(ucTmpChar - 64) >> 3] & (0x80 >> (ucTmpChar & 0x07))) { goto store_extended_char; // Will save original } // No subcollation to worry about - set a zero bit by // incrementing the bit position. uiSubColBitPos++; break; } default: { // Increment bit position to set a zero bit. uiSubColBitPos++; break; } } } // Now let's worry about double character sorting if (ui16WpChr2) { if (pbOriginalCharsLost) { *pbOriginalCharsLost = TRUE; } // Set the next byte that follows in the sub collation buffer. ucSubColBuf [(uiSubColBitPos + 7) >> 3] = 0; if (bTwoIntoOne) { // Sorts after character in ui16WpChr after call to // flmCheckDoubleCollation // Write the char 2 times so lower/upper bits are correct. // Could write infinite times because of collation rules. pucCollatedStr[ uiColLen] = ++pucCollatedStr[ uiColLen-1]; uiColLen++; // If original was upper case, set one more upper case bit if (!uiUppercaseFlag) { ucCaseBits[ (uiCaseBitPos + 7) >> 3] = 0; if (!charIsUpper( (FLMUINT16) *(pucStr - 1))) { uiFlags |= HAD_LOWER_CASE; } else { SET_BIT( ucCaseBits, uiCaseBitPos); } uiCaseBitPos++; } // Take into account the diacritical space uiSubColBitPos++; } else { // We have a digraph, get second collation value pucCollatedStr[ uiColLen++] = (FLMBYTE)(flmGetCollation( ui16WpChr2, uiLanguage)); // Normal case, assume no diacritics set uiSubColBitPos++; // If first was upper, set one more upper bit. if (!uiUppercaseFlag) { ucCaseBits [(uiCaseBitPos + 7) >> 3] = 0; if (charIsUpper( ui16WpChr)) { SET_BIT( ucCaseBits, uiCaseBitPos); } uiCaseBitPos++; // no need to reset the uiFlags } } } } // Check to see if uiColLen is at some overflow limit. if (uiColLen >= uiCharLimit || uiColLen + BYTES_IN_BITS( uiSubColBitPos) + BYTES_IN_BITS( uiCaseBitPos) >= uiTargetColLen) { // We hit the maximum number of characters. if (pucStr < pucStrEnd) { bDataTruncated = TRUE; } break; } } if (puiCollationLen) { *puiCollationLen = uiColLen; } // Add the first substring marker - also serves as making the string non-null. if (bFirstSubstring) { pucCollatedStr [uiColLen++] = COLL_FIRST_SUBSTRING; } if (bDataTruncated) { pucCollatedStr[ uiColLen++ ] = COLL_TRUNCATED; } if (!uiColLen && !uiSubColBitPos) { if (puiCaseLen) { *puiCaseLen = 0; } goto Exit; } // Store extra zero bit in the sub-collation area for Hebrew/Arabic if (bHebrewArabic) { uiSubColBitPos++; } // Done putting the string into 4 sections - build the COLLATED KEY // Don't set uiUppercaseFlag earlier than here because SC_LOWER may be zero uiUppercaseFlag = (uiLanguage == GR_LANG) ? SC_LOWER : SC_UPPER; // The default terminating characters is (COLL_MARKER|SC_UPPER) // Did we write anything to the subcollation area? if (uiFlags & HAD_SUB_COLLATION) { // Writes out a 0x7 pucCollatedStr [uiColLen++] = COLL_MARKER | SC_SUB_COL; // Move the sub-collation into the collating string uiLength = BYTES_IN_BITS( uiSubColBitPos); f_memcpy( &pucCollatedStr[uiColLen], ucSubColBuf, uiLength); uiColLen += uiLength; } // Move the upper/lower case stuff - force bits for Greek ONLY // This is such a small size that a memcpy is not worth it if (uiFlags & HAD_LOWER_CASE) { FLMUINT uiNumBytes = BYTES_IN_BITS( uiCaseBitPos); FLMBYTE * pucCasePtr = ucCaseBits; // Output the 0x5 pucCollatedStr [uiColLen++] = (FLMBYTE)(COLL_MARKER | SC_MIXED); if (puiCaseLen) { *puiCaseLen = uiNumBytes + 1; } if (uiUppercaseFlag == SC_LOWER) { // Negate case bits for languages (like GREEK) that sort // upper case before lower case. while (uiNumBytes--) { pucCollatedStr [uiColLen++] = ~(*pucCasePtr++); } } else { while (uiNumBytes--) { pucCollatedStr [uiColLen++] = *pucCasePtr++; } } } else { // All characters are either upper or lower case, as determined // by uiUppercaseFlag. pucCollatedStr [uiColLen++] = (FLMBYTE)(COLL_MARKER | uiUppercaseFlag); if( puiCaseLen) { *puiCaseLen = 1; } } Exit: if( pbDataTruncated) { *pbDataTruncated = bDataTruncated; } *puiCollatedStrLen = uiColLen; return( rc); } /**************************************************************************** Desc: Return the sub-collation value of a WPText character. Unconvered Unicode values always have a sub-collation value of 11110 + Unicode Value. ****************************************************************************/ FLMUINT16 flmTextGetSubCol( FLMUINT16 ui16WPValue, // WP Character value. FLMUINT16 ui16ColValue, // Collation Value (for arabic) FLMUINT uiLangId) // WP Language ID. { FLMUINT16 ui16SubColVal; FLMBYTE byCharVal; FLMBYTE byCharSet; FLMUINT16 ui16Base; // Easy case first. ui16SubColVal = 0; if( (ui16WPValue & 0xFF00 ) == 0) { goto Exit; } // From here down default ui16SubColVal is WP value. ui16SubColVal = ui16WPValue; byCharVal = (FLMBYTE) ui16WPValue; byCharSet = (FLMBYTE) (ui16WPValue >> 8); // Convert char to uppercase because case information // is stored above. This will help // insure that the "ETA" doesn't sort before "eta" // could use is lower code here for added performance. // This just happens to work with all WP character values if (!flmIsUpper( ui16WPValue)) { ui16WPValue &= ~1; } switch( byCharSet) { case CHSMUL1: { // If you cannot break down a char into base and // diacritic then you cannot combine the charaacter // later when converting back the key. So, write // the entire WP char in the sub-collation area. // We can ONLY SUPPORT MULTINATIONAL 1 for brkcar() if( flmCh6Brkcar( ui16WPValue, &ui16Base, &ui16SubColVal)) { // WordPerfect character cannot be broken down. // If we had a collation value other than 0xFF (COLS0), don't // return a sub-collation value. This will allow things like // upper and lower AE digraphs to compare properly. if (ui16ColValue != COLS0) { ui16SubColVal = 0; } goto Exit; } // Write the FLAIM diacritic sub-collation value. // Prefix is 2 bits "10". Remember to leave // "111" alone for the future. ui16SubColVal = ((ui16SubColVal & 0xFF) == umlaut && ((uiLangId == SU_LANG) || (uiLangId == SV_LANG) || (uiLangId == CZ_LANG) || (uiLangId == SL_LANG))) ? (FLMUINT16)(flmDia60Tbl[ ring] + 1) : (FLMUINT16)(flmDia60Tbl[ ui16SubColVal & 0xFF]); break; } case CHSGREK: { if( (byCharVal >= 52) || (ui16WPValue == 0x804) || (ui16WPValue == 0x826)) { ui16SubColVal = ui16WPValue; } break; } case CHSCYR: { if( byCharVal >= 144) { ui16SubColVal = ui16WPValue; } break; } case CHSHEB: { // Three sections in Hebrew: // 0..26 - main characters // 27..83 - accents that apear over previous character // 84..118- dagesh (ancient) hebrew with accents // // Because the ancient is only used for sayings & scriptures // we will support a collation value and in the sub-collation // store the actual character because sub-collation is in // character order. if( byCharVal >= 84) { ui16SubColVal = ui16WPValue; } break; } case CHSARB1: { // Three sections in Arabic: // 00..37 - accents that display OVER a previous character // 38..46 - symbols // 47..57 - numbers // 58..163 - characters // 164 - hamzah accent // 165..180- common characters with accents // 181..193- ligatures - common character combinations // 194..195- extensions - throw away when sorting if( byCharVal <= 46 ) { ui16SubColVal = ui16WPValue; } else { if( ui16ColValue == COLS10a + 1) { ui16SubColVal = (byCharVal >= 165) ? (FLMUINT16)(flmAlefSubColTbl[ byCharVal - 165 ]) : (FLMUINT16)7; // Alef subcol value } else { if( byCharVal >= 181) // Ligatures - char combination { ui16SubColVal = ui16WPValue; } else if( byCharVal == 64) // taa exception { ui16SubColVal = 8; } } } break; } case CHSARB2: { // There are some characters that share the same slot // Check the bit table if above character 64 if ((byCharVal >= 64) && (flmAr2BitTbl[(byCharVal-64)>> 3] & (0x80 >> (byCharVal&0x07)))) { ui16SubColVal = ui16WPValue; } break; } } Exit: return( ui16SubColVal); } /**************************************************************************** Desc: Get the original string from an asian collation string Ret: Length of the word string in bytes ****************************************************************************/ FLMUINT AsiaConvertColStr( FLMBYTE* CollatedStr, // Points to the Flaim collated string FLMUINT* CollatedStrLenRV, // Length of the Flaim collated string FLMBYTE* WordStr, // Output string to build - WP word string FLMBOOL* pbDataTruncated, // Set to TRUE if data was truncated FLMBOOL* pbFirstSubstring) // Set to TRUE if marker exists { FLMBYTE* pWordStr = WordStr; FLMUINT Length = *CollatedStrLenRV; // May optimize as a register FLMUINT CollStrPos = 0; // Position in CollatedStr[] FLMBOOL bHadExtended = FALSE; FLMUINT WordStrLen; FLMUINT16 ColChar; // 2 byte value for asian while (Length) { FLMBYTE CharVal; FLMBYTE CharSet; CharSet = CollatedStr[CollStrPos]; CharVal = CollatedStr[CollStrPos + 1]; ColChar = (FLMUINT16) ((CharSet << 8) + CharVal); if (ColChar <= MAX_COL_OPCODE) { break; } CollStrPos += 2; Length -= 2; if (CharSet == 0) // Normal Latin/Greek/Cyrillic value { ColChar = colToWPChr[CharVal - COLLS]; } else if (CharSet == 1) // katakana or hiragana character { if (CharVal > sizeof(ColToKanaTbl)) // Special cases below { if (CharVal == COLS_ASIAN_MARK_VAL) { // dakuten ColChar = 0x240a; } else if (CharVal == COLS_ASIAN_MARK_VAL + 1) { // handakuten ColChar = 0x240b; } else if (CharVal == COLS_ASIAN_MARK_VAL + 2) { // chuuten ColChar = 0x2405; } else { ColChar = 0xFFFF; // error } } else { ColChar = (FLMUINT16) (0x2600 + ColToKanaTbl[CharVal]); } } else if (CharSet != 0xFF || CharVal != 0xFF) // Asian characters { // Insert zeroes that will be treated as a signal for // uncoverted unicode characters later on. NOTE: Cannot use // 0xFFFF, because we need to be able to detect this case in // the sub-collation stuff, and we don't want to confuse it // with the 0xFFFF that may have been inserted in another // case. THIS IS A REALLY BAD HACK, BUT IT IS THE BEST WE CAN // DO FOR NOW! *pWordStr++ = 0; *pWordStr++ = 0; bHadExtended = TRUE; } // else does not have a collation value - found in sub-collation // part UW2FBA( ColChar, pWordStr); // Put the uncollation value back pWordStr += 2; } UW2FBA( 0, pWordStr); // NULL Terminate the string WordStrLen = (FLMUINT) (pWordStr - WordStr); // Parse through the sub-collation and case information. // Watch out for COMP CollStrPosT indexes-doesn't have case info after // Here are values for some of the codes: // [ 0x01] - end for fields case info follows - for COMP POST indexes // [ 0x02] - compound marker // [ 0x05] - case bits follow // [ 0x06] - case information is all uppercase // [ 0x07] - beginning of sub-collation information // [ 0x08] - first substring field that is made // [ 0x09] - truncation marker for text and binary // // Asian chars the case information should always be there and not // compressed out. This is because the case information could change // the actual width of the character from 0x26xx to charset 11. if (Length) { ColChar = (FLMUINT16) ((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos + 1]); // First substring is before truncated. if (ColChar == COLL_FIRST_SUBSTRING) { if (pbFirstSubstring) { *pbFirstSubstring = TRUE; // Don't need to initialize to FALSE. } Length -= 2; CollStrPos += 2; ColChar = (FLMUINT16) ((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos + 1]); } if (ColChar == COLL_TRUNCATED) { if (pbDataTruncated) { *pbDataTruncated = TRUE; // Don't need to initialize to FALSE. } Length -= 2; CollStrPos += 2; ColChar = (FLMUINT16) ((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos + 1]); } if (ColChar == (COLL_MARKER | SC_SUB_COL)) { FLMUINT TempLen; // Do another pass on the word string adding diacritics/voicings CollStrPos += 2; Length -= 2; TempLen = AsiaParseSubCol( WordStr, &WordStrLen, &CollatedStr[CollStrPos]); CollStrPos += TempLen; Length -= TempLen; } else { goto check_case; } } // Does the case info follow? - It may not because of post indexes if (Length) { ColChar = (FLMUINT16) ((CollatedStr[CollStrPos] << 8) + CollatedStr[CollStrPos + 1]); check_case: if (ColChar == (COLL_MARKER | SC_MIXED)) { CollStrPos += 2; CollStrPos += AsiaParseCase( WordStr, &WordStrLen, &CollatedStr[CollStrPos]); // Set bHadExtended to FALSE, because they will have been taken // care of in this pass. bHadExtended = FALSE; } } // Change embedded zeroes to 0xFFFFs if (bHadExtended) { FLMUINT uiCnt; FLMBYTE* pucTmp; for (uiCnt = 0, pucTmp = WordStr; uiCnt < WordStrLen; uiCnt += 2, pucTmp += 2) { if (FB2UW( pucTmp) == 0) { UW2FBA( 0xFFFF, pucTmp); } } } // Following marker is 2 bytes if post otherwise will be 1 byte ; // Should make a pass and count the extended characters *CollatedStrLenRV = CollStrPos; // value should be on 0x01 or 0x02 flag return (WordStrLen); // Return the length of the word string } /**************************************************************************** Desc: Combine the diacritic 5 and 16 bit values to an existing word string. Ret: Number of bytes parsed Notes: For each bit in the sub-collation section: 0 - no subcollation information 10 - take next 5 bits - will tell about diacritics or japanese vowel 110 - align to next byte and take word value as extended character ****************************************************************************/ FLMUINT AsiaParseSubCol( FLMBYTE * WordStr, // Existing word string to modify FLMUINT * puiWordStrLen, // Wordstring length in bytes FLMBYTE * SubColBuf) // Diacritic values in 5 bit sets { FLMUINT SubColBitPos = 0; FLMUINT NumWords = *puiWordStrLen >> 1; FLMUINT16 Diac; FLMUINT16 WpChar; // For each word in the word string ... while (NumWords--) { // Have to skip 0, because it is not accounted for in the // sub-collation bits. It was inserted when we encountered // unconverted unicode characters (Asian). Will be converted to // something else later on. SEE NOTE ABOVE. if (FB2UW( WordStr) == 0) { WordStr += 2; continue; } // This macro DOESN'T increment bitPos if (TEST1BIT( SubColBuf, SubColBitPos)) { // Bits 10 - take next 5 bits Bits 110 align and take next word // Bits 11110 align and take unicode value // SubColBitPos++; if (!TEST1BIT( SubColBuf, SubColBitPos)) { SubColBitPos++; Diac = (FLMUINT16) (GETnBITS( 5, SubColBuf, SubColBitPos)); SubColBitPos += 5; if ((WpChar = FB2UW( WordStr)) < 0x100) { if ((WpChar >= 'A') && (WpChar <= 'Z')) { // Convert to WP diacritic and combine characters flmCh6Cmbcar( &WpChar, WpChar, (FLMUINT16) ml1_COLtoD[Diac]); // Even if cmbcar fails, WpChar is still set to a valid // value } else // Symbols from charset 0x24 { WpChar = (FLMUINT16) (0x2400 + flmCh24ColTbl[Diac - 1].ByteValue); } } else if (WpChar >= 0x2600) // Katakana { // Voicings - will allow to select original char // 000 - some 001 are changed to 000 to save space // 001 - set if large char (uppercase) // 010 - set if voiced // 100 - set if half voiced // // Should NOT match voicing or wouldn't be here! FLMBYTE CharVal = (FLMBYTE) (WpChar & 0xFF); // Try exceptions first so don't access out of bounds if (CharVal == 84) { WpChar = (FLMUINT16) (0x2600 + ((Diac == 1) ? (FLMUINT16) 10 : (FLMUINT16) 11)); } else if (CharVal == 85) { WpChar = (FLMUINT16) (0x2600 + ((Diac == 1) ? (FLMUINT16) 16 : (FLMUINT16) 17)); } // Try the next 2 slots, if not then value is 83,84 or 85 else if (flmKanaSubColTbl[CharVal + 1] == Diac) { WpChar++; } else if ((flmKanaSubColTbl[CharVal + 2] == Diac)) { WpChar += 2; } // last exception below else if (CharVal == 4) { WpChar = 0x2600 + 83; } // else leave alone! - invalid storage } UW2FBA( WpChar, WordStr); // Set if changed or not } else // "110" { FLMUINT Temp; SubColBitPos++; // Skip second '1' if (TEST1BIT( SubColBuf, SubColBitPos)) { // Unconvertable UNICODE character ; // The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode shiftN( WordStr, (FLMUINT16) (NumWords + NumWords + 4), 2); WordStr += 2; // Skip the 0xFFFF for now SubColBitPos += 2; // Skip next "11" (*puiWordStrLen) += 2; } SubColBitPos++; // Skip the zero // Round up to next byte SubColBitPos = (SubColBitPos + 7) & (~7); Temp = BYTES_IN_BITS( SubColBitPos); WordStr[1] = SubColBuf[Temp]; // Character set WordStr[0] = SubColBuf[Temp + 1]; // Character SubColBitPos += 16; } } else { SubColBitPos++; // Be sure to increment this! } WordStr += 2; // Next WP character } return (BYTES_IN_BITS( SubColBitPos)); } /**************************************************************************** Desc: The case bits for asia are: Latin/Greek/Cyrillic 01 - case bit set if character is uppercase 10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx Japanese 00 - double wide hiragana 0x255e..25b0 01 - double wide katakana 0x2600..2655 10 - single wide symbols from charset 11 that map to CS24?? 11 - single wide katakana from charset 11 ****************************************************************************/ FLMUINT AsiaParseCase( FLMBYTE * WordStr, // Existing word string to modify FLMUINT * WordStrLenRV, // Length of the WordString in bytes FLMBYTE * pCaseBits) // Lower/upper case bit string { FLMUINT WordStrLen = *WordStrLenRV; FLMUINT uiWordCnt; FLMUINT uiExtraBytes = 0; FLMUINT16 WpChar; FLMBYTE TempByte = 0; FLMBYTE MaskByte; // For each character in the word string ... for (uiWordCnt = WordStrLen >> 1, MaskByte = 0; uiWordCnt--;) { FLMBYTE CharSet; FLMBYTE CharVal; WpChar = FB2UW( WordStr); // Get the next character // Must skip any 0xFFFFs or zeroes that were inserted. if (WpChar == 0xFFFF || WpChar == 0) { // Put back 0xFFFF in case it was a zero. UW2FBA( 0xFFFF, WordStr); WordStr += 2; uiExtraBytes += 2; continue; } if (MaskByte == 0) // Time to get another byte { TempByte = *pCaseBits++; MaskByte = 0x80; } CharSet = (FLMBYTE) (WpChar >> 8); CharVal = (FLMBYTE) (WpChar & 0xFF); if (WpChar < 0x2400) // SINGLE WIDE - NORMAL CHARACTERS { if (TempByte & MaskByte) // convert to double wide? { // Latin/greek/cyrillic Convert to uppercase double wide char if (CharSet == 0) // Latin - uppercase { // May convert to 0x250F (Latin) or CS24 if (WpChar >= 'A' && WpChar <= 'Z') { // Convert to double wide WpChar = (FLMUINT16) (WpChar - 0x30 + 0x250F); } else { HanToZenkaku( WpChar, 0, &WpChar); } } else if (CharSet == 8) // Greek { if (CharVal > 38) { // Adjust for spaces in greek CharVal -= 2; } if (CharVal > 4) { CharVal -= 2; } WpChar = (FLMUINT16) ((CharVal >> 1) + 0x265E); } else if (CharSet == 10) // Cyrillic { WpChar = (FLMUINT16) ((CharVal >> 1) + 0x2700); } else { HanToZenkaku( WpChar, 0, &WpChar); } CharSet = (FLMBYTE) (WpChar >> 8); // Less code this way CharVal = (FLMBYTE) (WpChar & 0xFF); } MaskByte >>= 1; // Next bit if ((TempByte & MaskByte) == 0) // Change to lower case? { switch (CharSet) // Convert WpChar to lower case { case 0: { WpChar |= 0x20; // Bit zero only if lower case break; } case 1: { if (CharVal >= 26) { WpChar++; } break; } case 8: { if (CharVal <= 69) { // All lowercase after 69 WpChar++; } break; } case 10: { if (CharVal <= 199) { // No cases after 199 WpChar++; } break; } case 0x25: case 0x26: { // should be double wide latin or greek WpChar += 0x20; // Add offset to convert to lowercase break; } case 0x27: // double wide cyrillic only { WpChar += 0x30; // Add offset to convert to lowercase break; } } } } else // JAPANESE CHARACTERS { if (TempByte & MaskByte) // Original chars from // CharSet 11 { if (CharSet == 0x26) // Convert to ZenToHankaku { FLMUINT16 NextChar = 0; WpChar = ZenToHankaku( WpChar, &NextChar); if (NextChar) // Move everyone down { uiWordCnt++; shiftN( WordStr, uiWordCnt + uiWordCnt + 2, 2); UW2FBA( WpChar, WordStr); WordStr += 2; WpChar = NextChar; // Store this below *WordStrLenRV = *WordStrLenRV + 2; // Adjust length // Don't change WordStrLen - returns number of bits used } } else if (CharSet == 0x24) { WpChar = ZenToHankaku( WpChar, (FLMUINT16*) 0); } MaskByte >>= 1; // Eat next bit } else { MaskByte >>= 1; // Next bit if ((TempByte & MaskByte) == 0) // Convert to hiragana? { // kanji will also fall through here if (CharSet == 0x26) { WpChar = (FLMUINT16) (0x255E + CharVal); // Convert to // hiragana } } } } UW2FBA( WpChar, WordStr); WordStr += 2; MaskByte >>= 1; } // Should be 2 bits for each character uiWordCnt = WordStrLen - uiExtraBytes; return (BYTES_IN_BITS( uiWordCnt)); } /**************************************************************************** Desc: Returns the collation value of the input WP character. If in charset 11 will convert the character to Zenkaku (double wide). In: ui16WpChar - Char to collate off of - could be in CS0..14 or x24..up ui16NextWpChar - next WP char for CS11 voicing marks ui16PrevColValue - previous collating value - for repeat/vowel repeat pui16ColValue - returns 2 byte collation value pui16SubColVal - 0, 6 or 16 bit value for the latin sub collation or the kana size & vowel voicing 001 - set if large (upper) character 010 - set if voiced 100 - set if half voiced pucCaseBits - returns 2 bits Latin/Greek/Cyrillic 01 - case bit set if character is uppercase 10 - double wide character in CS 0x25xx, 0x26xx and 0x27xx Japanese 00 - double wide hiragana 0x255e..25b0 01 - double wide katakana 0x2600..2655 10 - double wide symbols that map to charset 11 11 - single wide katakana from charset 11 Ret: 0 - no valid collation value high values set for pui16ColValue Sub-collation gets original WP character value 1 - valid collation value 2 - valid collation value and used the ui16NextWpChar Terms: HANKAKU - single wide characters in charsets 0..14 ZENKAKU - double wide characters in charsets 0x24..end of kanji KANJI - collation values are 0x2900 less than WPChar value ****************************************************************************/ FLMUINT16 flmAsiaGetCollation( FLMUINT16 ui16WpChar, // WP char to get collation values FLMUINT16 ui16NextWpChar, // Next WP char - for CS11 voicing marks FLMUINT16 ui16PrevColValue, // Previous collating value FLMUINT16 * pui16ColValue, // Returns collation value FLMUINT16 * pui16SubColVal, // Returns sub-collation value FLMBYTE * pucCaseBits, // Returns case bits value FLMUINT16 uiUppercaseFlag) // Set if to convert to uppercase { FLMUINT16 ui16ColValue; FLMUINT16 ui16SubColVal; FLMBYTE ucCaseBits = 0; FLMBYTE ucCharSet = ui16WpChar >> 8; FLMBYTE ucCharVal = ui16WpChar & 0xFF; FLMUINT16 ui16Hankaku; FLMUINT uiLoop; FLMUINT16 ui16ReturnValue = 1; ui16ColValue = ui16SubColVal = 0; // Kanji or above if (ucCharSet >= 0x2B) { // Puts 2 or above into high byte. ui16ColValue = ui16WpChar - 0x2900; // No subcollation or case bits need to be set goto Exit; } // Single wide character? (HANKAKU) if (ucCharSet < 11) { // Get the values from a non-asian character LATIN, GREEK or // CYRILLIC. The width bit may have been set on a jump to label from // below. Latin_Greek_Cyrillic: // YES: Pass US_LANG because this is what we want - Prevents double // character sorting. ui16ColValue = flmGetCollation( ui16WpChar, US_LANG); if (uiUppercaseFlag || flmIsUpper( ui16WpChar)) { // Uppercase - set case bit ucCaseBits |= SET_CASE_BIT; } // Character for which there is no collation value? if (ui16ColValue == COLS0) { ui16ReturnValue = 0; if (!flmIsUpper( ui16WpChar)) { // Convert to uppercase ui16WpChar--; } ui16ColValue = 0xFFFF; ui16SubColVal = ui16WpChar; } else if (ucCharSet) // Don't bother with ascii { if (!flmIsUpper( ui16WpChar)) { // Convert to uppercase ui16WpChar--; } if (ucCharSet == CHSMUL1) { FLMUINT16 ui16Base; FLMUINT16 ui16Diacritic; ui16SubColVal = !flmCh6Brkcar( ui16WpChar, &ui16Base, &ui16Diacritic) ? flmDia60Tbl[ui16Diacritic & 0xFF] : ui16WpChar; } else if (ucCharSet == CHSGREK) // GREEK { if (ui16WpChar >= 0x834 || // [8,52] or above ui16WpChar == 0x804 || // [8,4] BETA Medial | Terminal ui16WpChar == 0x826) { // [8,38] SIGMA terminal ui16SubColVal = ui16WpChar; } } else if (ucCharSet == CHSCYR) // CYRILLIC { if (ui16WpChar >= 0xA90) // [10, 144] or above { ui16SubColVal = ui16WpChar; // Dup collation values } } // else don't need a sub collation value } goto Exit; } // Single wide Japanese character? if (ucCharSet == 11) { FLMUINT16 ui16KanaChar; // Convert charset 11 to Zenkaku (double wide) CS24 or CS26 hex. // All characters in charset 11 will convert to CS24 or CS26. When // combining the collation and the sub-collation values. if (HanToZenkaku( ui16WpChar, ui16NextWpChar, &ui16KanaChar) == 2) { // Return 2 ui16ReturnValue++; } ucCaseBits |= SET_WIDTH_BIT; // Set so will allow to go back ui16WpChar = ui16KanaChar; // If in CS24 will fall through // to ZenKaku ucCharSet = ui16KanaChar >> 8; ucCharVal = ui16KanaChar & 0xFF; } if (ui16WpChar < 0x2400) { // In some other character set goto Latin_Greek_Cyrillic; } else if (ui16WpChar >= 0x255e && ui16WpChar <= 0x2655) { if (ui16WpChar >= 0x2600) { ucCaseBits |= SET_KATAKANA_BIT; } // HIRAGANA and KATAKANA Kana contains both hiragana and katakana. // The tables contain the same characters in same order if (ucCharSet == 0x25) { // Change value to be in character set 26 ucCharVal -= 0x5E; } ui16ColValue = 0x0100 + KanaColTbl[ucCharVal]; ui16SubColVal = flmKanaSubColTbl[ucCharVal]; goto Exit; } if ((ui16Hankaku = ZenToHankaku( ui16WpChar, (FLMUINT16*) 0)) != 0) { if ((ui16Hankaku >> 8) != 11) { ui16WpChar = ui16Hankaku; ucCharSet = ui16WpChar >> 8; ucCharVal = ui16WpChar & 0xFF; ucCaseBits |= SET_WIDTH_BIT; goto Latin_Greek_Cyrillic; } } // 0x2400..0x24bc Japanese symbols that cannot be converted to // Hankaku. All 6 original symbol chars from 11 will also be here. // First try to find a collation value of the symbol. The sub-collation // value will be the position in the CS24 table + 1. for (uiLoop = 0; uiLoop < (sizeof(flmCh24ColTbl) / sizeof(BYTE_WORD_TBL)); uiLoop++) { if (ucCharVal == flmCh24ColTbl[uiLoop].ByteValue) { if ((ui16ColValue = flmCh24ColTbl[uiLoop].WordValue) < 0x100) { // Don't save for chuuten, dakuten, handakuten ui16SubColVal = (FLMUINT16) (uiLoop + 1); } break; } } if (!ui16ColValue) { // Now see if it's a repeat or repeat-vowel character if ((((ucCharVal >= 0x12) && (ucCharVal <= 0x15)) || (ucCharVal == 0x17) || (ucCharVal == 0x18)) && ((ui16PrevColValue >> 8) == 1)) { ui16ColValue = ui16PrevColValue; // Store original WP character ui16SubColVal = ui16WpChar; } else if ((ucCharVal == 0x1B) && // repeat vowel? (ui16PrevColValue >= 0x100) && (ui16PrevColValue < COLS_ASIAN_MARKS)) // Previous kana char? { ui16ColValue = 0x0100 + KanaColToVowel[ui16PrevColValue & 0xFF]; // Store original WP character ui16SubColVal = ui16WpChar; } else { ui16ReturnValue = 0; ui16ColValue = 0xFFFF; // No collation value ui16SubColVal = ui16WpChar; // Never have changed if gets here } } Exit: *pui16ColValue = ui16ColValue; *pui16SubColVal = ui16SubColVal; *pucCaseBits = ucCaseBits; return (ui16ReturnValue); } /***************************************************************************** Desc: Get the Flaim collating string and convert back to a WP word string Ret: Length of new WP word string *****************************************************************************/ FLMUINT FWWSGetColStr( FLMBYTE * fColStr, // Points to the Flaim collated string FLMUINT * fcStrLenRV, // Length of the Flaim collated string FLMBYTE * wordStr, // Output string to build - WP word string FLMUINT fWPLang, // FLAIM WP language number FLMBOOL * pbDataTruncated, // Set to TRUE if truncated FLMBOOL * pbFirstSubstring) // Sets to TRUE if first substring { FLMBYTE * wsPtr = wordStr; // Points to the word string data area FLMUINT length = *fcStrLenRV; // May optimize as a register FLMUINT pos = 0; // Position in fColStr[] FLMUINT bitPos; // Computed bit position FLMUINT colChar; // Not portable if a FLMBYTE value FLMUINT wdStrLen; FLMBOOL hebrewArabicFlag = 0; // Set if hebrew/arabic language // WARNING: The code is duplicated for performance reasons. The US code // below is much more optimized so any changes must be done twice. if (fWPLang != US_LANG) // Code for NON-US languages { if ((fWPLang == AR_LANG) || // Arabic (fWPLang == FA_LANG) || // Farsi - persian (fWPLang == HE_LANG) || // Hebrew (fWPLang == UR_LANG)) // Urdu { hebrewArabicFlag++; } while (length && (fColStr[pos] > MAX_COL_OPCODE)) { length--; colChar = (FLMUINT) fColStr[pos++]; switch (colChar) { case COLS9 + 4: // ch in spanish case COLS9 + 11: // ch in czech { // Put the WP char in the word string UW2FBA( (FLMUINT16) 'C', wsPtr); wsPtr += 2; colChar = (FLMUINT) 'H'; pos++; // move past second duplicate char break; } case COLS9 + 17: // ll in spanish { // Put the WP char in the word string UW2FBA( (FLMUINT16) 'L', wsPtr); wsPtr += 2; colChar = (FLMUINT) 'L'; pos++; // move past duplicate character break; } case COLS0: // Non collating character { // Actual character is in sub-collation area colChar = (FLMUINT) 0xFFFF; break; } default: { if (hebrewArabicFlag && (colChar >= COLS10h)) { colChar = (colChar < COLS10a) ? (FLMUINT) (0x900 + (colChar - (COLS10h))) // Hebrew : (FLMUINT) (HebArabColToWPChr[colChar - (COLS10a)]); // Arabic } else { colChar = (FLMUINT) colToWPChr[colChar - COLLS]; } break; } } UW2FBA( (FLMUINT16) colChar, wsPtr); wsPtr += 2; } } else { while (length && (fColStr[pos] > MAX_COL_OPCODE)) { length--; // Move in the WP value given uppercase collated value colChar = (FLMUINT) fColStr[pos++]; if (colChar == COLS0) { colChar = (FLMUINT) 0xFFFF; } else { colChar = (FLMUINT) colToWPChr[colChar - COLLS]; } UW2FBA( (FLMUINT16) colChar, wsPtr); wsPtr += 2; } } // NULL Terminate the string UW2FBA( (FLMUINT16) 0, wsPtr); wdStrLen = pos + pos; // Parse through the sub-collation and case information. // Watch out for COMP CollStrPosT indexes-doesn't have case info after // Here are values for some of the codes: // [ 0x01] - end for fields case info follows - for COMP POST indexes // [ 0x02] - compound marker // [ 0x05] - case bits follow // [ 0x06] - case information is all uppercase // [ 0x07] - beginning of sub-collation information // [ 0x08] - first substring field that is made // [ 0x09] - truncation marker for text and binary // // Asian chars the case information should always be there and not // compressed out. This is because the case information could change // the actual width of the character from 0x26xx to charset 11. if (length && fColStr[pos] == COLL_FIRST_SUBSTRING) { if (pbFirstSubstring) { *pbFirstSubstring = TRUE; // Don't need to initialize to FALSE. } length--; pos++; } if (length && fColStr[pos] == COLL_TRUNCATED) { if (pbDataTruncated) { *pbDataTruncated = TRUE; // Don't need to initialize to FALSE. } length--; pos++; } if (length && (fColStr[pos] == (COLL_MARKER | SC_SUB_COL))) { FLMUINT tempLen; // Do another pass on the word string adding the diacritics bitPos = FWWSCmbSubColBuf( wordStr, &wdStrLen, &fColStr[++pos], hebrewArabicFlag); // Move pos to next byte value tempLen = BYTES_IN_BITS( bitPos); pos += tempLen; length -= tempLen + 1; // The 1 includes the 0x07 byte } // Does the case info follow? if (length && (fColStr[pos] > COMPOUND_MARKER)) { // Take care of the lower and upper case conversion If mixed case // then convert using case bits if (fColStr[pos++] & SC_MIXED) // Increment pos here! { // Don't pre-increment pos on line below! pos += FWWSToMixed( wordStr, wdStrLen, &fColStr[pos], fWPLang); } // else 0x04 or 0x06 - all characters already in uppercase } *fcStrLenRV = pos; // pos should be on the 0x01 or 0x02 flag return (wdStrLen); // Return the length of the word string } /************************************************************************** Desc: Combine the diacritic 5 bit values to an existing word string Todo: May want to check flmCh6Cmbcar() for CY return value ***************************************************************************/ FLMUINT FWWSCmbSubColBuf( FLMBYTE* wordStr, // Existing word string to modify FLMUINT* wdStrLenRV, // Wordstring length in bytes FLMBYTE* subColBuf, // Diacritic values in 5 bit sets FLMBOOL hebrewArabicFlag) // Set if language is Hebrew or Arabic { FLMUINT subColBitPos = 0; FLMUINT numWords = *wdStrLenRV >> 1; FLMUINT16 diac; FLMUINT16 wpchar; FLMUINT temp; // For each word in the word string ... while (numWords--) { // Label used for hebrew/arabic - additional subcollation can follow ; // This macro DOESN'T increment bitPos if (TEST1BIT( subColBuf, subColBitPos)) { // If "11110" - unmappable unicode char - 0xFFFF is before it // If "1110" then INDEX extended char is inserted // If "110" then extended char follows that replaces collation // If "10" then take next 5 bits which contain the diacritic // subcollation value. after_last_character: subColBitPos++; // Eat the first 1 bit if (!TEST1BIT( subColBuf, subColBitPos)) { subColBitPos++; // Eat the 0 bit diac = (FLMUINT16) (GETnBITS( 5, subColBuf, subColBitPos)); subColBitPos += 5; if ((wpchar = FB2UW( wordStr)) < 0x100) // If not extended base.. { // Convert to WP diacritic and combine characters flmCh6Cmbcar( &wpchar, wpchar, (FLMUINT16) ml1_COLtoD[diac]); // Even if cmbcar fails, wpchar is still set to a valid // value UW2FBA( wpchar, wordStr); } else if ((wpchar & 0xFF00) == 0x0D00) // arabic? { wpchar = ArabSubColToWPChr[diac]; UW2FBA( wpchar, wordStr); } // else diacritic is extra info ; // cmbcar should not handle extended chars for this design } else // "110" or "1110" or "11110" { subColBitPos++; // Eat the 2nd '1' bit if (TEST1BIT( subColBuf, subColBitPos)) // Test the 3rd bit { // 1110 - shift wpchars down 1 word and insert value below subColBitPos++; // Eat the 3rd '1' bit *wdStrLenRV += 2; // Return 2 more bytes if (TEST1BIT( subColBuf, subColBitPos)) // Test 4th bit { // Unconvertable UNICODE character. // // The format will be 4 bytes, 0xFF, 0xFF, 2 byte Unicode shiftN( wordStr, numWords + numWords + 4, 2); subColBitPos++; // Eat the 4th '1' bit wordStr += 2; // Skip the 0xFFFF for now } else { // Move down 2 byte NULL and rest of the 2 byte characters. // The extended character does not have a 0xFF col value shiftN( wordStr, numWords + numWords + 2, 2); numWords++; } } subColBitPos++; // Skip past the zero bit subColBitPos = (subColBitPos + 7) & (~7); // roundup to next byte temp = BYTES_IN_BITS( subColBitPos); // compute position wordStr[1] = subColBuf[temp]; // Character set wordStr[0] = subColBuf[temp + 1]; // Character subColBitPos += 16; } } else { subColBitPos++; } wordStr += 2; // Next WP character } if (hebrewArabicFlag) { if (TEST1BIT( subColBuf, subColBitPos)) { // Hebrew/Arabic can have trailing accents that don't have a // matching collation value. Keep looping in this case. Note that // subColBitPos isn't incremented above. numWords = 0; // set so won't loop forever! goto after_last_character; // process trailing bit } subColBitPos++; // Eat the last '0' bit } return (subColBitPos); } /************************************************************************** Desc: Convert the word string to lower case chars given low/upp bit string Out: WP characters have modified to their original case Ret: Number of bytes used in the lower/upper buffer Notes: Only WP to lower case conversion is done here for each bit NOT set. ***************************************************************************/ FLMUINT FWWSToMixed( FLMBYTE * wordStr, // Existing word string to modify FLMUINT wdStrLen, // Length of the wordstring in bytes FLMBYTE* lowUpBitStr, // Lower/upper case bit string FLMUINT fWPLang) { FLMUINT numWords; FLMUINT tempWord; FLMBYTE tempByte = 0; FLMBYTE maskByte; FLMBYTE xorByte; xorByte = (fWPLang == US_LANG) ? (FLMBYTE) 0 : (fWPLang == GR_LANG) ? (FLMBYTE) 0xFF : (FLMBYTE) 0; // For each word in the word string ... for (numWords = wdStrLen >> 1, maskByte = 0; numWords--; wordStr += 2, maskByte >>= 1) { if (maskByte == 0) // Time to get another byte { tempByte = xorByte ^ *lowUpBitStr++; maskByte = 0x80; } if ((tempByte & maskByte) == 0) // If lowercase conver - else is upper { // Convert to lower case - COLL -> WP is already in upper case tempWord = (FLMUINT) FB2UW( wordStr); if ((tempWord >= ASCII_UPPER_A) && (tempWord <= ASCII_UPPER_Z)) { tempWord |= 0x20; } else { FLMBYTE charVal = (FLMBYTE) (tempWord & 0xFF); FLMBYTE charSet = (FLMBYTE) (tempWord >> 8); // check if charact within region of character set if (((charSet == CHSMUL1) && ((charVal >= 26) && (charVal <= 241))) || ((charSet == CHSGREK) && (charVal <= 69)) || ((charSet == CHSCYR) && (charVal <= 199))) { tempWord |= 0x01; } } UW2FBA( (FLMUINT16) tempWord, wordStr); } } numWords = wdStrLen >> 1; return (BYTES_IN_BITS( numWords)); } /******************************************************************** Desc: Build a responce tree of NODEs for the key output. *********************************************************************/ RCODE flmIxKeyOutput( IXD * pIxd, FLMBYTE * pucFromKey, FLMUINT uiKeyLen, FlmRecord ** ppKeyRV, // Returns key FLMBOOL bFullFldPaths) // If true add full field paths { RCODE rc = FERR_OK; FlmRecord * pKey = NULL; void * pvField; FLMBYTE ucKeyBuf[ MAX_KEY_SIZ + 12]; FLMBYTE * pucToKey = &ucKeyBuf[ 0]; FLMBYTE * pucPostBuf = NULL; IFD * pIfd; FLMUINT uiLongValue; FLMUINT uiToKeyLen; FLMUINT uiLanguage = pIxd->uiLanguage; FLMUINT uiFromKeyLen; FLMUINT uiFromRemaining; FLMUINT uiPostLen; FLMUINT uiPostPos; FLMUINT uiTempFromKeyLen; FLMUINT uiFldType; FLMUINT uiDataType; FLMBOOL bDataRightTruncated; FLMBOOL bFirstSubstring; FLMBOOL bSigSign; FLMBYTE ucTemp; FLMUINT uiContainer; FLMUINT uiMaxKeySize; // If the index is on all containers, see if this key has // a container component. If so, strip it off. if( (uiContainer = pIxd->uiContainerNum) == 0) { FLMUINT uiContainerPartLen = getIxContainerPartLen( pIxd); if (uiKeyLen <= uiContainerPartLen) { flmAssert( 0); rc = RC_SET( FERR_BTREE_ERROR); goto Exit; } uiContainer = getContainerFromKey( pucFromKey, uiKeyLen); // Subtract off the bytes for the container part. uiKeyLen -= uiContainerPartLen; uiMaxKeySize = MAX_KEY_SIZ - uiContainerPartLen; } else { uiMaxKeySize = MAX_KEY_SIZ; } flmAssert( uiLanguage != 0xFFFF); if (*ppKeyRV) { if( (*ppKeyRV)->isReadOnly() || (*ppKeyRV)->isCached()) { (*ppKeyRV)->Release(); *ppKeyRV = NULL; } else { (*ppKeyRV)->clear(); } } if( (pKey = *ppKeyRV) == NULL) { if( (pKey = f_new FlmRecord) == NULL) { rc = RC_SET( FERR_MEM); goto Exit; } *ppKeyRV = pKey; } pKey->setContainerID( uiContainer); uiFromKeyLen = uiFromRemaining = uiKeyLen; pIfd = pIxd->pFirstIfd; // If post index, get post low/up section. if( pIfd->uiFlags & IFD_POST ) { // Last byte has low/upper length uiPostLen = pucFromKey[ uiFromKeyLen - 1 ]; pucPostBuf = &pucFromKey[ uiFromKeyLen - uiPostLen - 1 ]; uiPostPos = 0; } if (RC_BAD( rc = pKey->insertLast( 0, FLM_KEY_TAG, FLM_CONTEXT_TYPE, NULL))) { goto Exit; } // Loop for each compound piece of key for( ;;) { FLMBOOL bIsAsianCompound; FLMUINT uiMarker; bDataRightTruncated = bFirstSubstring = FALSE; bIsAsianCompound = (FLMBOOL)(((uiLanguage >= FIRST_DBCS_LANG) && (uiLanguage <= LAST_DBCS_LANG) && (IFD_GET_FIELD_TYPE( pIfd) == FLM_TEXT_TYPE) && (!(pIfd->uiFlags & IFD_CONTEXT))) ? (FLMBOOL)TRUE : (FLMBOOL)FALSE); uiMarker = (FLMUINT)((bIsAsianCompound) ? (FLMUINT)((FLMUINT)(*pucFromKey << 8) + *(pucFromKey+1)) : (FLMUINT) *pucFromKey); uiFldType = (FLMUINT) IFD_GET_FIELD_TYPE( pIfd); uiDataType = IFD_GET_FIELD_TYPE( pIfd); // Hit a compound marker or end of key marker // Check includes COMPOUND_MARKER & END_COMPOUND_MARKER if( uiMarker <= NULL_KEY_MARKER) { // If the field is required or single field then generate an empty node. if( ((pIfd->uiFlags & IFD_OPTIONAL) == 0) || (uiFldType == FLM_TEXT_TYPE) || (uiFldType == FLM_BINARY_TYPE) || ((pIfd->uiFlags & IFD_LAST) && !pIfd->uiCompoundPos )) { if( RC_BAD( rc = flmBuildKeyPaths( pIfd, pIfd->uiFldNum, uiDataType, bFullFldPaths, pKey, &pvField))) goto Exit; } if( uiMarker == END_COMPOUND_MARKER) // Used for post keys break; uiFromKeyLen = 0; // This piece is zero - skip it - may be others } else { // If compound key or if only field used in index // output the key elements field number or else 'NA' if( pIfd->uiFlags & IFD_CONTEXT) { if( RC_BAD( rc = flmBuildKeyPaths( pIfd, flmBigEndianToUINT16( &pucFromKey [1]), uiDataType, bFullFldPaths, pKey, &pvField))) { goto Exit; } uiFromKeyLen = KY_CONTEXT_LEN; } else { if( RC_BAD( rc = flmBuildKeyPaths( pIfd, pIfd->uiFldNum, uiDataType, bFullFldPaths, pKey, &pvField))) { goto Exit; } // Grab only the Nth section of key if compound key // Null out key if uiToKeyLen gets 0 UD2FBA( 0, pucToKey); switch( uiDataType) { case FLM_TEXT_TYPE: uiTempFromKeyLen = uiFromKeyLen; uiToKeyLen = FColStrToText( pucFromKey, &uiTempFromKeyLen, pucToKey, uiLanguage, pucPostBuf, &uiPostPos, &bDataRightTruncated, &bFirstSubstring); uiFromKeyLen = uiTempFromKeyLen; break; case FLM_NUMBER_TYPE: { FLMUINT uiFirstColNibble; // Current collated nibble FLMUINT uiFirstNumNibble; // Current output nibble FLMBYTE * pucOutPtr; // Output pointer FLMBYTE * pucColPtr; FLMUINT uiBytesProcessed; // Start at byte after sign/magnitude byte pucColPtr = pucFromKey + 1; uiBytesProcessed = 1; uiFirstColNibble = 1; // Determine the sign of the number pucOutPtr = pucToKey; if( (bSigSign = (*pucFromKey & SIG_POS)) == 0) { *pucOutPtr = 0xB0; uiFirstNumNibble = 0; } else { uiFirstNumNibble = 1; } // Parse through the collated number outputting data // to the buffer as we go. for( ;;) { // Determine what we are pointing at if( (ucTemp = *pucColPtr) <= COMPOUND_MARKER) { break; } if( uiFirstColNibble++ & 1) { ucTemp >>= 4; } else { ucTemp &= 0x0F; pucColPtr++; uiBytesProcessed++; } // A hex F signifies the end of a collated number with an // odd number of nibbles if( ucTemp == 0x0F) { break; } // Convert collated number nibble to BCD nibble // and lay it in buffer ucTemp -= COLLATED_DIGIT_OFFSET; // Is number negative? if( !bSigSign) { // Negative values are ~ed ucTemp = (FLMBYTE)(10 -(ucTemp + 1)); } if( uiFirstNumNibble++ & 1) { *pucOutPtr = (FLMBYTE)(ucTemp << 4); } else { *pucOutPtr++ += ucTemp; } if( uiBytesProcessed == uiFromKeyLen) { break; } } // Append Terminator code to internal number *pucOutPtr++ |= (uiFirstNumNibble & 1) ? 0xFF : 0x0F; uiToKeyLen = (FLMUINT) (pucOutPtr - pucToKey); uiFromKeyLen = uiBytesProcessed; rc = FERR_OK; break; } case FLM_BINARY_TYPE: { FLMUINT uiMaxLength; FLMBYTE * pucSrc = pucFromKey; uiMaxLength = ((uiFromKeyLen >> 1) < uiMaxKeySize) ? (FLMUINT)(uiFromKeyLen >> 1) : (FLMUINT)uiMaxKeySize; uiToKeyLen = 0; while( (uiToKeyLen < uiMaxLength) && ((ucTemp = *pucSrc) >= COLLS)) { // Take two bytes from source to make one byte in dest pucToKey[ uiToKeyLen++] = (FLMBYTE)(((ucTemp - COLLS) << 4) + (*(pucSrc + 1) - COLLS)); pucSrc += 2; } if( (uiToKeyLen < (uiFromKeyLen >> 1)) && (*pucSrc >= COLLS)) { rc = RC_SET( FERR_CONV_DEST_OVERFLOW); } else { rc = FERR_OK; uiFromKeyLen = uiToKeyLen << 1; // FLAIM has a bug where the binary fields don't have // the COLL_TRUNCATED value on truncated values. // The good news is that we know the true length of // binary fields. if( *pucSrc == COLL_TRUNCATED) { uiFromKeyLen++; bDataRightTruncated = TRUE; } else if( uiToKeyLen >= pIfd->uiLimit) { bDataRightTruncated = TRUE; } } break; } case FLM_CONTEXT_TYPE: default: uiFromKeyLen = 5; uiLongValue = flmBigEndianToUINT32( pucFromKey + 1); UD2FBA( (FLMUINT32)uiLongValue, pucToKey); uiToKeyLen = 4; break; } if( RC_BAD( rc)) { goto Exit; } // Allocate and Copy Value into the node if( uiToKeyLen) { FLMBYTE * pucValue; if( RC_BAD(rc = pKey->allocStorageSpace( pvField, uiDataType, uiToKeyLen, 0, 0, 0, &pucValue, NULL))) { goto Exit; } f_memcpy( pucValue, pucToKey, uiToKeyLen); } // Set first sub-string and truncated flags. if( (pIfd->uiFlags & IFD_SUBSTRING) && !bFirstSubstring) { pKey->setLeftTruncated( pvField, TRUE); } if( bDataRightTruncated) { pKey->setRightTruncated( pvField, TRUE); } } } // Compute variables for next section of compound key // Add 1 for compound marker if still is stuff in key if( uiFromRemaining != uiFromKeyLen) { uiFromKeyLen += (FLMUINT)(bIsAsianCompound ? (FLMUINT)2 : (FLMUINT)1); } pucFromKey += uiFromKeyLen; if( (uiFromKeyLen = (uiFromRemaining -= uiFromKeyLen)) == 0) { break; } while( ((pIfd->uiFlags & IFD_LAST) == 0) && (pIfd->uiCompoundPos == (pIfd+1)->uiCompoundPos)) { pIfd++; } if( pIfd->uiFlags & IFD_LAST) { break; } pIfd++; } // Check if we have one field left. if( (pIfd->uiFlags & IFD_LAST) == 0) { while( (pIfd->uiFlags & IFD_LAST) == 0) { pIfd++; } if( (pIfd->uiFlags & IFD_OPTIONAL) == 0) { if( RC_BAD( rc = flmBuildKeyPaths( pIfd, pIfd->uiFldNum, uiDataType, bFullFldPaths, pKey, &pvField))) { goto Exit; } } } Exit: return( rc); } /**************************************************************************** Desc: This module will read all references of an index key. The references will be output number defined as REFS_PER_NODE ****************************************************************************/ RCODE flmBuildKeyPaths( IFD * pIfd, FLMUINT uiFldNum, FLMUINT uiDataType, FLMBOOL bFullFldPaths, FlmRecord * pKey, void ** ppvField) { RCODE rc = FERR_OK; void * pvField; void * pvParentField; void * pvChildField; FLMUINT * pFieldPath; FLMUINT uiTempDataType; FLMUINT uiFieldPos; FLMUINT uiTargetFieldID; if( !bFullFldPaths) { rc = pKey->insertLast( 1, uiFldNum, uiDataType, &pvField); goto Exit; } pFieldPath = pIfd->pFieldPathPToC; pvParentField = pKey->root(); uiFieldPos = 0; // Loop finding field matches. pvField = pKey->find( pvParentField, pFieldPath[ uiFieldPos]); if( pvField) { pvParentField = pvField; uiFieldPos++; uiTargetFieldID = pFieldPath[ uiFieldPos]; // Loop finding matching children from this point on. for( pvChildField = pKey->firstChild( pvParentField); pvChildField; ) { if( pKey->getFieldID( pvChildField) == uiTargetFieldID) { // On the child field? if( pFieldPath[ uiFieldPos + 1] == 0) { pvField = pvChildField; // Set the data type in case the data length is zero. pKey->allocStorageSpace( pvField, uiDataType, 0, 0, 0, 0, NULL, NULL); break; } pvParentField = pvChildField; uiFieldPos++; uiTargetFieldID = pFieldPath[ uiFieldPos]; pvChildField = pKey->firstChild( pvParentField); } else { pvChildField = pKey->nextSibling( pvChildField); } } } // Insert the rest of the field path down to the value field (uiFieldPos==0). uiTempDataType = FLM_CONTEXT_TYPE; for( ; pFieldPath[ uiFieldPos]; uiFieldPos++) { // Add the real data type for the last field, otherwise set as context. if( pFieldPath[ uiFieldPos + 1] == 0) { uiTempDataType = uiDataType; } if( RC_BAD( rc = pKey->insert( pvParentField, INSERT_LAST_CHILD, pFieldPath[ uiFieldPos], uiTempDataType, &pvField))) { goto Exit; } pvParentField = pvField; } Exit: *ppvField = pvField; return( rc); } /**************************************************************************** Desc: Compare only the leading left and right characters according to the many flags that are passed in. This routine operates to save and set state for the calling routine. TODO: This routine does NOT support Asian, Hebrew, or Arabic language collations. In addition, flmCheckDoubleCollation() is not called for other non-US lanagues. There is still a lot of work to do! This is our default US compare and it is not very good for JP. Return: Signed value of compare. <0 if less than, 0 if equal, >0 if greater than. Asian Notes: The asian compare takes two characters and may use one or both. This makes the algorithm complex so we may have to build full tests to see what we broke. NDS Notes: The right side (search string) is already formatted according to the space/dash rules of the syntax. ****************************************************************************/ FLMINT flmTextCompareSingleChar( FLMBYTE ** ppLeftText, // [in] Points to current value. // [out] Points to next character if equals. FLMUINT * puiLeftLen, // [in] Bytes remaining in text string. // [out] Bytes remaining in text string. FLMUINT * puiLeftWpChar2,// Second left character - for double characters FLMBYTE ** ppRightText, // [in] Points to current value. // [out] Points to next character if equals. FLMUINT * puiRightLen, // [in] Bytes remaining in text string. // [out] Bytes remaining in text string. FLMUINT * puiRightWpChar2,// Second right character - for double characters. FLMINT * piSubColCompare,//[in] If NULL disregard the subcollation // values if collation values are equal. // [out] If equals is returned, value is // set ONLY if the signed value of comparing // the sub-collation values is not equal. // See lengthy unicode compare below. FLMINT * piCaseCompare, // [in] If NULL disregard the case bits // if collation values are equal. Japanese // values are an exception to this rule. // [out] If equals is returned, value is // set ONLY if the signed value of comparing // the case values is not equal. FLMBOOL * pbHitWildCard, // [in] If NULL then do not look for wild // cards in the right text string. // [out] If non-null, a wild card (*,?) will // be looked for on the RIGHT SIDE ONLY. // If '?' is found 0 will be returned and // pointers are advanced. If '*' is found, // this value will be set to TRUE and the // right side is advanced. If no wild // card is found the value will not be set. FLMINT iCompareType, // COMPARE_COLLATION, COMPARE_COL_AND_SUBCOL, COMPARE_VALUE FLMUINT16 * pui16ColVal, // Needed for asian collation compare. FLMUINT uiFlags, // FLM_* flags FLMUINT uiLangId) // FLAIM/WordPerfect Lanaguge id. { FLMBYTE * pLeftText = *ppLeftText; FLMBYTE * pRightText = *ppRightText; FLMINT iCompare = 0; FLMUINT uiRightFlags = uiFlags; FLMUINT16 ui16LeftWPChar; FLMUINT16 ui16LeftUniChar; FLMUINT16 ui16RightWPChar; FLMUINT16 ui16RightUniChar; FLMUINT uiLeftValueLen; FLMUINT uiRightValueLen; FLMUINT16 ui16LeftCol; FLMUINT16 ui16RightCol; FLMUINT uiLeftWpChar2 = *puiLeftWpChar2; FLMUINT uiRightWpChar2 = *puiRightWpChar2; FLMBOOL bLeftTwoIntoOne; FLMBOOL bRightTwoIntoOne; // Get the next character from the TEXT string. NOTE: OEM characters // will be returned as a UNICODE character. A unicode character here // is a value that cannot be converted to the WP set (no good collation value).. uiLeftValueLen = flmTextGetValue( pLeftText, *puiLeftLen, &uiLeftWpChar2, uiFlags, &ui16LeftWPChar, &ui16LeftUniChar); uiRightValueLen = flmTextGetValue( pRightText, *puiRightLen, &uiRightWpChar2, uiRightFlags, &ui16RightWPChar, &ui16RightUniChar); // At this point, the double character, if any, should have been consumed. flmAssert( !uiLeftWpChar2 && !uiRightWpChar2); // Check for the following escape characters: "\\" "*" and "\\" "\\" if( ui16RightWPChar == ASCII_BACKSLASH) { if( pRightText[ uiRightValueLen ] == ASCII_BACKSLASH) { uiRightValueLen++; } else if( pRightText[ uiRightValueLen ] == ASCII_WILDCARD) { ui16RightWPChar = ASCII_WILDCARD; uiRightValueLen++; } } // Checking for wild cards in the right string? (Always a WP character) else if( pbHitWildCard) { // The '*' wildcard means to match zero or many characters. // The sick case of "A*B" compared to "A**B" should be considered. if( ui16RightWPChar == ASCII_WILDCARD) { // Eat all duplicate wild cards. while( pRightText[ uiRightValueLen] == ASCII_WILDCARD) { uiRightValueLen++; } // Advance the right value. Keep left value alone. // Return equals (default). *pbHitWildCard = TRUE; // Don't advance the left value. uiLeftValueLen = 0; uiLeftWpChar2 = *puiLeftWpChar2; goto Exit; } } // First section is to compare just WP values. if( ui16LeftWPChar && ui16RightWPChar) { FLMUINT16 ui16LeftSubCol; FLMUINT16 ui16RightSubCol; if (iCompareType == COMPARE_VALUE) { // Check the obvious case of equal WP values. if( ui16LeftWPChar != ui16RightWPChar) { iCompare = -1; } goto Exit; } // JP compare code. if (uiLangId >= FIRST_DBCS_LANG && uiLangId <= LAST_DBCS_LANG) { FLMUINT uiNextLeftLen; FLMUINT uiNextRightLen; FLMUINT16 ui16NextLeftWPChar; FLMUINT16 ui16NextRightWPChar; FLMUINT16 ui16ColVal = pui16ColVal ? *pui16ColVal : 0; FLMBYTE ucLeftCaseValue; FLMBYTE ucRightCaseValue; // Should have already consumed double character, if any flmAssert( !uiLeftWpChar2 && !uiRightWpChar2); uiNextLeftLen = flmTextGetValue( pLeftText+uiLeftValueLen, *puiLeftLen, &uiLeftWpChar2, uiFlags, &ui16NextLeftWPChar, &ui16LeftUniChar); uiNextRightLen = flmTextGetValue( pRightText+uiRightValueLen, *puiRightLen, &uiRightWpChar2, uiFlags, &ui16NextRightWPChar, &ui16RightUniChar); // nextL/R WPChar may be zero. if (flmAsiaGetCollation( ui16LeftWPChar, ui16NextLeftWPChar, ui16ColVal, &ui16LeftCol, &ui16LeftSubCol, &ucLeftCaseValue, FALSE) == 2) { uiLeftValueLen += uiNextLeftLen; } if (flmAsiaGetCollation( ui16RightWPChar, ui16NextRightWPChar, ui16ColVal, &ui16RightCol, &ui16RightSubCol, &ucRightCaseValue, FALSE) == 2) { uiRightValueLen += uiNextRightLen; } // Compare all of the stuff now. if (ui16LeftCol == ui16RightCol) { if( (iCompareType == COMPARE_COL_AND_SUBCOL) || (piSubColCompare && (*piSubColCompare == 0))) { if( ui16LeftSubCol != ui16RightSubCol) { if( iCompareType == COMPARE_COL_AND_SUBCOL) { iCompare = -1; goto Exit; } // At this point piSubColCompare cannot be NULL. *piSubColCompare = (ui16LeftSubCol < ui16RightSubCol) ? -1 : 1; // Write over the case compare value if( piCaseCompare ) { *piCaseCompare = *piSubColCompare; } } } if (iCompareType != COMPARE_COL_AND_SUBCOL) { // Check case? if (piCaseCompare && (*piCaseCompare == 0)) { if( ucLeftCaseValue != ucRightCaseValue) { *piCaseCompare = ucLeftCaseValue < ucRightCaseValue ? -1 : 1; } } } } else { iCompare = (ui16LeftCol < ui16RightCol) ? -1 : 1; } goto Exit; } flmAssert( !uiLeftWpChar2 && !uiRightWpChar2); if (uiLangId != US_LANG) { const FLMBYTE * pucTmp; pucTmp = pLeftText + uiLeftValueLen; uiLeftWpChar2 = flmCheckDoubleCollation( &ui16LeftWPChar, &bLeftTwoIntoOne, &pucTmp, uiLangId); uiLeftValueLen = (FLMUINT)(pucTmp - pLeftText); pucTmp = pRightText + uiRightValueLen; uiRightWpChar2 = flmCheckDoubleCollation( &ui16RightWPChar, &bRightTwoIntoOne, &pucTmp, uiLangId); uiRightValueLen = (FLMUINT)(pucTmp - pRightText); // See if we got the same double character if (uiLeftWpChar2 == uiRightWpChar2 && ui16LeftWPChar == ui16RightWPChar) { uiLeftWpChar2 = 0; uiRightWpChar2 = 0; goto Exit; } } else if (ui16LeftWPChar == ui16RightWPChar) { // Same WP character goto Exit; } ui16LeftCol = flmGetCollation( ui16LeftWPChar, uiLangId); // Handle two characters collating as one. if (uiLeftWpChar2 && bLeftTwoIntoOne) { ui16LeftCol++; } ui16RightCol = flmGetCollation( ui16RightWPChar, uiLangId); // Handle two characters collating as one. if (uiRightWpChar2 && bRightTwoIntoOne) { ui16RightCol++; } if( ui16LeftCol == ui16RightCol) { // Should we bother to check subcollation? - don't bother with 7-bit if( ((iCompareType == COMPARE_COL_AND_SUBCOL) || (piSubColCompare && (*piSubColCompare == 0))) && ((ui16LeftWPChar | ui16RightWPChar) & 0xFF00)) { ui16LeftSubCol = flmTextGetSubCol( ui16LeftWPChar, ui16LeftCol, uiLangId); ui16RightSubCol= flmTextGetSubCol( ui16RightWPChar, ui16RightCol, uiLangId); if (!piCaseCompare) { // If the sub-collation value is the original // character, it means that the collation could not // distinguish the characters and sub-collation is being // used to do it. However, this creates a problem when the // characters are the same character except for case. In that // scenario, we incorrectly return a not-equal when we are // doing a case-insensitive comparison. So, at this point, // we need to use the sub-collation for the upper-case of the // character instead of the sub-collation for the character // itself. if (ui16LeftSubCol == ui16LeftWPChar) { ui16LeftSubCol = flmTextGetSubCol( flmCh6Upper( ui16LeftWPChar), ui16LeftCol, uiLangId); } if (ui16RightSubCol == ui16RightWPChar) { ui16RightSubCol= flmTextGetSubCol( flmCh6Upper( ui16RightWPChar), ui16RightCol, uiLangId); } } // YES - go for it... if( ui16LeftSubCol != ui16RightSubCol) { if( iCompareType == COMPARE_COL_AND_SUBCOL) { iCompare = (ui16LeftSubCol < ui16RightSubCol) ? -1 : 1; goto Exit; } // At this point piSubColCompare cannot be NULL. *piSubColCompare = (ui16LeftSubCol < ui16RightSubCol) ? -1 : 1; // Write over the case compare value if( piCaseCompare ) { *piCaseCompare = *piSubColCompare; } } } if( iCompareType == COMPARE_COL_AND_SUBCOL) { goto Exit; } if( piCaseCompare && (*piCaseCompare == 0)) { // flmIsUpper() only returns FALSE (lower) or TRUE (not-lower) FLMBOOL bLeftUpper = flmIsUpper( ui16LeftWPChar); FLMBOOL bRightUpper = flmIsUpper( ui16RightWPChar); if (bLeftUpper != bRightUpper) { *piCaseCompare = !bLeftUpper ? -1 : 1; } } } else { iCompare = (ui16LeftCol < ui16RightCol) ? -1 : 1; } goto Exit; } if( ui16LeftUniChar && ui16RightUniChar) { // Compare two (non-convertable) UNICODE values. // Check the obvious case of equal UNICODE values. if( ui16LeftUniChar == ui16RightUniChar) { goto Exit; } // Compare subcollation or compare value? if( iCompareType != COMPARE_COLLATION) { iCompare = -1; goto Exit; } // For non-asian - we store these values in the sub-collcation area. // We should return the differece in sub-collation values - but this // may not work for all compares. // // For asian compares, most values we have a collation value. // This is a BIG differece in comparing asian values. // // If we want sub-collation compare then set it, otherwise set main // iCompare value. if( piSubColCompare ) { if( *piSubColCompare == 0) { *piSubColCompare = ui16LeftUniChar < ui16RightUniChar ? -1 : 1; } } else { // Treat as the collation value - this is different than the index. iCompare = ui16LeftUniChar < ui16RightUniChar ? -1 : 1; } goto Exit; } // Compare subcollation or compare value? if( iCompareType != COMPARE_COLLATION) { iCompare = -1; goto Exit; } // Check for no left character. if( !ui16LeftWPChar && !ui16LeftUniChar) { // No left character. check if no right character. if( ui16RightWPChar || ui16RightUniChar) { iCompare = -1; } } // Check for no right character. else if( !ui16RightWPChar && !ui16RightUniChar) { iCompare = 1; } // What remains is one WP char and one Unicode char. // Remember the sub-collation comment above. Some WP char may not // have a collation value (COLS0) so in US sort these values may be // equal and have different sub-collation values. YECH!!!! // // The unicode value will always have collation value of COLS0 (0xFF) // and subcollation value of 11110 [unicodeValue] // The WP value could be anything & if collation value is COLS0 will // have a subcollation value os 1110 [WPValue] // // So, we have to check to see of the WP collation value is COLS0. // If not iCompare is used. If both represent high collation then // the WP value will always have a lower sub-collation value. // // The (not so obvious) code would be to code up... // iCompare = ui16LeftWPChar ? -1 : 1; // if we didn't care about sub-collation (and we may not care). // // This is easier to over code than have ?: operators for the two cases. else if( ui16LeftWPChar) { // Remember - unicode subcol is always COLS0. if( flmGetCollation( ui16LeftWPChar, uiLangId) == COLS0) { if( piSubColCompare && (*piSubColCompare == 0)) { *piSubColCompare = -1; } } else { iCompare = -1; } } else { // left=unicode, right=WP // Remember - unicode subcol is always COLS0 for non-asian. if( flmGetCollation( ui16RightWPChar, uiLangId) == COLS0) { if( piSubColCompare && (*piSubColCompare == 0)) { *piSubColCompare = 1; } } else { iCompare = 1; } } Exit: if( !iCompare) { // Position to the next values if equal *puiLeftLen -= uiLeftValueLen; *ppLeftText = pLeftText + uiLeftValueLen; *puiLeftWpChar2 = uiLeftWpChar2; *puiRightLen -= uiRightValueLen; *ppRightText = pRightText + uiRightValueLen; *puiRightWpChar2 = uiRightWpChar2; } return( iCompare); } /************************************************************************** Desc: Get the Flaim collating string and convert back to a text string Ret: Length of new wpStr Notes: Allocates the area for the word string buffer if will be over 256. ***************************************************************************/ FLMUINT FColStrToText( FLMBYTE * fColStr, // Points to the Flaim collated string FLMUINT * fcStrLenRV, // Length of the Flaim collated string FLMBYTE * textStr, // Output string to build - TEXT string FLMUINT fWPLang, // FLAIM WP language number FLMBYTE * postBuf, // Lower/upper POST buffer or NULL FLMUINT * postBytesRV, // Return next position to use in postBuf FLMBOOL * pbDataTruncated, // Sets to TRUE if data had been truncated FLMBOOL * pbFirstSubstring) // Sets to TRUE if first substring { #define LOCAL_CHARS 150 FLMBYTE wordStr[LOCAL_CHARS * 2 + LOCAL_CHARS / 5]; // Sample + 20% FLMBYTE * wsPtr = NULL; FLMBYTE * wsAllocatedWsPtr = NULL; FLMUINT wsLen; FLMUINT textLen; FLMBYTE * textPtr; if (*fcStrLenRV > LOCAL_CHARS) // If won't fit allocate 1280 { if (RC_BAD( f_alloc( MAX_KEY_SIZ * 2, &wsPtr))) { return (0); } wsAllocatedWsPtr = wsPtr; } else { wsPtr = wordStr; } if ((fWPLang >= FIRST_DBCS_LANG) && (fWPLang <= LAST_DBCS_LANG)) { wsLen = AsiaConvertColStr( fColStr, fcStrLenRV, wsPtr, pbDataTruncated, pbFirstSubstring); if (postBuf) { FLMUINT postBytes = *postBytesRV + 2; // Skip past marker // may change wsLen postBytes += AsiaParseCase( wsPtr, &wsLen, &postBuf[postBytes]); *postBytesRV = postBytes; } } else { wsLen = FWWSGetColStr( fColStr, fcStrLenRV, wsPtr, fWPLang, pbDataTruncated, pbFirstSubstring); // If a post buffer is sent - turn unflagged chars to lower case if (postBuf) { FLMUINT postBytes = *postBytesRV; // Check if mixed case chars follow and always increment // postBytes // if (postBuf[postBytes++] == (COLL_MARKER | SC_MIXED)) { postBytes += FWWSToMixed( wsPtr, wsLen, &postBuf[postBytes], fWPLang); } *postBytesRV = postBytes; } } // Copy word string to TEXT string area wsLen >>= 1; // Convert # of bytes to # of words textPtr = textStr; while (wsLen--) { register FLMBYTE ch; register FLMBYTE cSet; // Put the character in a local variable for speed ch = *wsPtr++; cSet = *wsPtr++; if ((!cSet) && (ch <= 127)) { // Character set zero only needs one byte if the character is <= // 127. Otherwise, it is handled like all other extended // characters below. // *textPtr++ = ch; } // If the character set is > 63 it takes three bytes to store, // otherwise only two bytes are needed. else if (cSet < 63) { *textPtr++ = (FLMBYTE) (CHAR_SET_CODE | cSet); *textPtr++ = ch; } else if (cSet == 0xFF && ch == 0xFF) { *textPtr++ = UNICODE_CODE; *textPtr++ = *(wsPtr + 1); // Character set *textPtr++ = *wsPtr; // Character wsPtr += 2; wsLen--; // Skip past 4 bytes for UNICODE } else { *textPtr++ = EXT_CHAR_CODE; *textPtr++ = cSet; *textPtr++ = ch; } } textLen = (textPtr - textStr); // Compute total length if (wsAllocatedWsPtr != NULL) { f_free( &wsAllocatedWsPtr); } return (textLen); } /**************************************************************************** Desc: Compare two entire strings. There is some debate how this routine should compare the sub-collation values when wild cards are used. THIS DOES NOT ALLOW WILD CARDS. Return: Signed value of compare. <0 if less than, 0 if equal, >0 if greater than The case of returning 1 may be in using wild cards which only need to return a does not match value. ****************************************************************************/ FLMINT flmTextCompare( FLMBYTE * pLeftBuf, FLMUINT uiLeftLen, FLMBYTE * pRightBuf, FLMUINT uiRightLen, FLMUINT uiFlags, FLMUINT uiLang) { FLMINT iCompare = 0; FLMINT iSubColCompare = 0; FLMINT * pSubColCompare; FLMINT iCaseCompare = 0; FLMINT * pCaseCompare; FLMUINT uiLeadingSpace; FLMUINT uiTrailingSpace; FLMUINT16 ui16ColVal = 0; FLMUINT16 ui16WPChar; FLMUINT16 ui16UniChar; FLMUINT uiLeftWpChar2 = 0; FLMUINT uiRightWpChar2 = 0; uiTrailingSpace = uiLeadingSpace = (uiFlags & FLM_MIN_SPACES) ? FLM_NO_SPACE : 0; pCaseCompare = (uiFlags & FLM_NOCASE) ? NULL : &iCaseCompare; pSubColCompare = &iSubColCompare; // Handle NULL buffers first. if (!pLeftBuf) { if (pRightBuf) { iCompare = -1; } goto Exit; } while ((uiLeftLen || uiLeftWpChar2) && (uiRightLen || uiRightWpChar2)) { if ((iCompare = flmTextCompareSingleChar( &pLeftBuf, &uiLeftLen, &uiLeftWpChar2, &pRightBuf, &uiRightLen, &uiRightWpChar2, pSubColCompare, pCaseCompare, NULL_WILD_CARD_CHECK, COMPARE_COLLATION, &ui16ColVal, uiFlags | uiLeadingSpace, uiLang)) != 0) { goto Exit; } uiLeadingSpace = 0; } // EQUAL - as far as the collation values are concerned and one // or both of the strings is at the end. if (uiLeftLen || uiLeftWpChar2) { uiLeftLen -= flmTextGetValue( pLeftBuf, uiLeftLen, &uiLeftWpChar2, uiFlags | uiTrailingSpace, &ui16WPChar, &ui16UniChar); if (uiLeftLen || ui16WPChar || ui16UniChar) { iCompare = 1; } } else if (uiRightLen || uiRightWpChar2) { uiRightLen -= flmTextGetValue( pRightBuf, uiRightLen, &uiRightWpChar2, uiFlags | uiTrailingSpace, &ui16WPChar, &ui16UniChar); if (uiRightLen || ui16WPChar || ui16UniChar) { iCompare = -1; } } if (iCompare == 0) { // All collation bytes equal - return subcollation/case difference. iCompare = (iSubColCompare != 0) ? iSubColCompare : iCaseCompare; } Exit: return iCompare; } /**************************************************************************** Desc: Match two entire strings. Return: FLM_TRUE or FLM_FALSE Notes: This code calls the collation routine because in the future there will be equal conditions with different unicode characters. DOCUMENTATION DEALING WITH WILD CARDS AND SPACE RULES. The space rules are not obvious when dealing with wild cards. This will outline the rules that are being applied so that we can do a regression test when this code changes. Rule #1: Return same result if leading or trailing wild card is added. The underscore is also the space character in these examples and the MIN_SPACES rule is being applied. Format: DataString Operator SearchString Example: if A == A A_ == A A == A_ A_ == A_ then A == A* A_ == A* A == A_* A_ == A_* and A == *A A_ == *A A == *A_ A_ == *A_ and A == *A* A_ == *A* A == *A_* A_ == *A_* where 'A' represent a string of any characters. Strictly put, the query Field == A_* can be broken down to Field == A || Field == A_* where the space after 'A' should not be treated as a trailing space. In addition we can apply the space before the string with the same results, but we are not going to handle the case of *_A correctly. This is because the query *_A should be expanded to Field == A || Field == *_A where the space before 'A' should not be treated as a leading space. When we need to find "_A" in a search string then we will expand the query to handle this. Rule #2: The spaces before a trailing truncation are NOT to be treated as trailing spaces if there are remaining bytes in the data string. Example: (A_B == A_*) but (AB != A_*) Rule #3: Space value(s) without anything other value are equal to no values. Example: (" " == "") Rule #4: Trim leading/trailing spaces before and after wild cards. SMI does this when formatting. _* and *_ same as * so A == _* and A = *_ but A != *_* Additional wildcard cases to test for: Wildcard cases to handle. (ABBBBC == A*BC) Hits the goto Compare_Again case three times. (ABBBBD != A*B) Stuff still remains in dataString (ABBBBC != A*BCD) Stuff still remains in searchString ****************************************************************************/ FLMUINT flmTextMatch( FLMBYTE * pLeftBuf, FLMUINT uiLeftLen, FLMBYTE * pRightBuf, FLMUINT uiRightLen, FLMUINT uiFlags, FLMBOOL bLeadingWildCard, FLMBOOL bTrailingWildCard, FLMUINT uiLang) { FLMINT iCompare = 0; FLMUINT uiLeadingSpace; FLMUINT uiTrailingSpace; FLMBOOL bHitWildCard; FLMBOOL bHasWildCardPos; FLMBOOL * pbHitWildCard; FLMUINT uiValueLen; FLMUINT16 ui16WPChar; FLMUINT16 ui16UniChar; FLMUINT16 ui16Tmp1; FLMUINT16 ui16Tmp2; FLMINT iCompareType; FLMUINT uiLeftWpChar2 = 0; FLMUINT uiRightWpChar2 = 0; // LWCP = Last Wild Card Position - used for wild card state FLMBYTE * pLWCPLeftBuf = NULL; FLMBYTE * pLWCPRightBuf = NULL; FLMUINT uiLWCPLeftLen = 0; FLMUINT uiLWCPRightLen = 0; FLMUINT uiLWCPLeftWpChar2 = 0; FLMUINT uiLWCPRightWpChar2 = 0; if( uiFlags & FLM_COMPARE_COLLATED_VALUES) { iCompareType = COMPARE_COLLATION; } else { iCompareType = (uiFlags & FLM_NOCASE) ? COMPARE_COL_AND_SUBCOL : COMPARE_VALUE; } // Handle NULL buffers first - don't test for zero length values yet. if (!pLeftBuf) { if (pRightBuf) { iCompare = -1; } goto Exit; } bHitWildCard = bHasWildCardPos = FALSE; uiLeadingSpace = uiTrailingSpace = (uiFlags & FLM_MIN_SPACES) ? FLM_NO_SPACE : 0; pbHitWildCard = (uiFlags & FLM_WILD) ? &bHitWildCard : NULL; if (bLeadingWildCard) { goto Leading_Wild_Card; } while (!iCompare && (uiLeftLen || uiLeftWpChar2) && (uiRightLen || uiRightWpChar2)) { iCompare = flmTextCompareSingleChar( &pLeftBuf, &uiLeftLen, &uiLeftWpChar2, &pRightBuf, &uiRightLen, &uiRightWpChar2, NULL_SUB_COL_CHECK, NULL_CASE_CHECK, pbHitWildCard, iCompareType, NULL, uiFlags | uiLeadingSpace, uiLang); uiLeadingSpace = 0; if (bHitWildCard) { Leading_Wild_Card: bHitWildCard = FALSE; bHasWildCardPos = FALSE; // Turn off last wildcard. // If right side is done, we are done. if (!uiRightLen && !uiRightWpChar2) { uiLeftLen = 0; uiLeftWpChar2 = 0; break; } // Save state on the RIGHT to handle the sick case of search key // "b*aH" being able to match "baaaaaaaaaH" (Lambda Case) // LWCP = LastWildCardPosition pLWCPRightBuf = pRightBuf; uiLWCPRightLen = uiRightLen; uiLWCPRightWpChar2 = uiRightWpChar2; // Find first matching character on the left side. Compare_Again: iCompare = -1; while (iCompare && (uiLeftLen || uiLeftWpChar2)) { iCompare = flmTextCompareSingleChar( &pLeftBuf, &uiLeftLen, &uiLeftWpChar2, &pRightBuf, &uiRightLen, &uiRightWpChar2, NULL_SUB_COL_CHECK, NULL_CASE_CHECK, NULL_WILD_CARD_CHECK, iCompareType, NULL, uiFlags | uiLeadingSpace, uiLang); uiLeadingSpace = 0; // Done with the right side? Return iCompare value. if (!uiRightLen && !uiRightWpChar2) { break; } // Values different and still have stuff on left? if (iCompare && (uiLeftLen || uiLeftWpChar2)) { // Advance the left if there is anything left uiValueLen = flmTextGetValue( pLeftBuf, uiLeftLen, &uiLeftWpChar2, uiFlags, &ui16Tmp1, &ui16Tmp2); pLeftBuf += uiValueLen; uiLeftLen -= uiValueLen; } } // Save state on the LEFT if (uiLeftLen || uiLeftWpChar2) { pLWCPLeftBuf = pLeftBuf; uiLWCPLeftLen = uiLeftLen; uiLWCPLeftWpChar2 = uiLeftWpChar2; bHasWildCardPos = TRUE; } // EQUAL - as far as the collation values are concerned. } } if (iCompare == 0) { // In here because LEFT and/or RIGHT are out of bytes. // Check for trailing spaces if MIN_SPACES. if (uiLeftLen || uiLeftWpChar2) { if (!bTrailingWildCard) { uiLeftLen -= flmTextGetValue( pLeftBuf, uiLeftLen, &uiLeftWpChar2, uiFlags | uiTrailingSpace, &ui16WPChar, &ui16UniChar); if (uiLeftLen || ui16WPChar || ui16UniChar) { iCompare = 1; } } } else if (uiRightLen || uiRightWpChar2) { uiRightLen -= flmTextGetValue( pRightBuf, uiRightLen, &uiRightWpChar2, uiFlags | uiTrailingSpace, &ui16WPChar, &ui16UniChar); // Equals if right just had a trailing wild card. (else case) if (uiRightLen || !pbHitWildCard || ui16WPChar != '*') { if (uiRightLen || ui16WPChar || ui16UniChar) { iCompare = -1; } } } } // Handle the embedded wild card case. if (iCompare != 0 && bHasWildCardPos) { // Restore wild card state. pLeftBuf = pLWCPLeftBuf; uiLeftLen = uiLWCPLeftLen; uiLeftWpChar2 = uiLWCPLeftWpChar2; pRightBuf = pLWCPRightBuf; uiRightLen = uiLWCPRightLen; uiRightWpChar2 = uiLWCPRightWpChar2; bHasWildCardPos = FALSE; goto Compare_Again; } Exit: return (!iCompare ? FLM_TRUE : FLM_FALSE); } /**************************************************************************** Desc: Check for double characters that sort as 1 (like ch in Spanish) or 1 character that should sort as 2 (like � sorts as ae in French). Return: 0 = nothing changes. Otherwise, *pui16WpChar is the first character, and the return value contains the 2nd character. In addition, *pbTwoIntoOne will be TRUE if we should take two characters and treat as one (i.e, change the collation on the outside to one more than the collation of the first character). ****************************************************************************/ FLMUINT16 flmCheckDoubleCollation( FLMUINT16 * pui16WpChar, FLMBOOL * pbTwoIntoOne, const FLMBYTE ** ppucInputStr, FLMUINT uiLanguage) { FLMUINT16 ui16CurState; FLMUINT16 ui16WpChar; FLMUINT16 ui16SecondChar; FLMUINT16 ui16LastChar = 0; FLMUINT uiInLen; FLMBOOL bUpperFlag; ui16WpChar = *pui16WpChar; bUpperFlag = flmIsUpper( ui16WpChar); uiInLen = 0; ui16SecondChar = 0; // Primer read if ((ui16CurState = getNextCharState( 0, uiLanguage)) == 0) { goto Exit; } for (;;) { switch (ui16CurState) { case INSTSG: { *pui16WpChar = ui16SecondChar = (FLMUINT16) f_toascii( 's'); *pbTwoIntoOne = FALSE; goto Exit; } case INSTAE: { if (bUpperFlag) { *pui16WpChar = (FLMUINT16) f_toascii( 'A'); ui16SecondChar = (FLMUINT16) f_toascii( 'E'); } else { *pui16WpChar = (FLMUINT16) f_toascii( 'a'); ui16SecondChar = (FLMUINT16) f_toascii( 'e'); } *pbTwoIntoOne = FALSE; goto Exit; } case INSTIJ: { if (bUpperFlag) { *pui16WpChar = (FLMUINT16) f_toascii( 'I'); ui16SecondChar = (FLMUINT16) f_toascii( 'J'); } else { *pui16WpChar = (FLMUINT16) f_toascii( 'i'); ui16SecondChar = (FLMUINT16) f_toascii( 'j'); } *pbTwoIntoOne = FALSE; goto Exit; } case INSTOE: { if (bUpperFlag) { *pui16WpChar = (FLMUINT16) f_toascii( 'O'); ui16SecondChar = (FLMUINT16) f_toascii( 'E'); } else { *pui16WpChar = (FLMUINT16) f_toascii( 'o'); ui16SecondChar = (FLMUINT16) f_toascii( 'e'); } *pbTwoIntoOne = FALSE; goto Exit; } case WITHAA: { *pui16WpChar = (FLMUINT16) (bUpperFlag ? (FLMUINT16) 0x122 : (FLMUINT16) 0x123); (*ppucInputStr)++; break; } case AFTERC: { *pui16WpChar = (FLMUINT16) (bUpperFlag ? (FLMUINT16) f_toascii( 'C') : (FLMUINT16) f_toascii( 'c')); ui16SecondChar = ui16LastChar; *pbTwoIntoOne = TRUE; (*ppucInputStr)++; goto Exit; } case AFTERH: { *pui16WpChar = (FLMUINT16) (bUpperFlag ? (FLMUINT16) f_toascii( 'H') : (FLMUINT16) f_toascii( 'h')); ui16SecondChar = ui16LastChar; *pbTwoIntoOne = TRUE; (*ppucInputStr)++; goto Exit; } case AFTERL: { *pui16WpChar = (FLMUINT16) (bUpperFlag ? (FLMUINT16) f_toascii( 'L') : (FLMUINT16) f_toascii( 'l')); ui16SecondChar = ui16LastChar; *pbTwoIntoOne = TRUE; (*ppucInputStr)++; goto Exit; } default: { // Handles STATE1 through STATE11 also break; } } if ((ui16CurState = getNextCharState( ui16CurState, flmCh6Lower( ui16WpChar))) == 0) { goto Exit; } ui16LastChar = ui16WpChar; ui16WpChar = (FLMUINT16) * ((*ppucInputStr) + (uiInLen++)); } Exit: return (ui16SecondChar); } /**************************************************************************** Desc: Convert a WPChar from hankaku (single wide) to zenkaku (double wide). Ret: 0 = no conversion 1 = converted character to zenkaku 2 = ui16NextWpChar dakuten or handakuten voicing got combined ****************************************************************************/ FLMUINT16 HanToZenkaku( FLMUINT16 ui16WpChar, FLMUINT16 ui16NextWpChar, FLMUINT16 * pui16Zenkaku) { FLMUINT16 ui16Zenkaku = 0; FLMBYTE ucCharSet = ui16WpChar >> 8; FLMBYTE ucCharVal = ui16WpChar & 0xFF; FLMUINT uiLoop; FLMUINT16 ui16CharsUsed = 1; switch (ucCharSet) { // Character set 0 - symbols case 0: { // Invalid? - all others are used. if (ucCharVal < 0x20) { ; } else if (ucCharVal <= 0x2F) { // Symbols A ui16Zenkaku = 0x2400 + From0AToZen[ucCharVal - 0x20]; } else if (ucCharVal <= 0x39) { // 0..9 ui16Zenkaku = 0x2500 + (ucCharVal - 0x21); } else if (ucCharVal <= 0x40) { // Symbols B ui16Zenkaku = 0x2400 + From0BToZen[ucCharVal - 0x3A]; } else if (ucCharVal <= 0x5A) { // A..Z ui16Zenkaku = 0x2500 + (ucCharVal - 0x21); } else if (ucCharVal <= 0x60) { // Symbols C ui16Zenkaku = 0x2400 + From0CToZen[ucCharVal - 0x5B]; } else if (ucCharVal <= 0x7A) { // a..z ui16Zenkaku = 0x2500 + (ucCharVal - 0x21); } else if (ucCharVal <= 0x7E) { // Symbols D ui16Zenkaku = 0x2400 + From0DToZen[ucCharVal - 0x7B]; } break; } // GREEK case 8: { if ((ucCharVal >= sizeof(From8ToZen)) || ((ui16Zenkaku = 0x2600 + From8ToZen[ucCharVal]) == 0x26FF)) { ui16Zenkaku = 0; } break; } // CYRILLIC case 10: { // Check range ui16Zenkaku = 0x2700 + (ucCharVal >> 1); // Uppercase value // Convert to lower case? if (ucCharVal & 0x01) { ui16Zenkaku += 0x30; } break; } // JAPANESE case 11: { if (ucCharVal < 5) { ui16Zenkaku = 0x2400 + From11AToZen[ucCharVal]; } else if (ucCharVal < 0x3D) // katakana? { if ((ui16Zenkaku = 0x2600 + From11BToZen[ucCharVal - 5]) == 0x26FF) { // Dash - convert to this ui16Zenkaku = 0x241b; } else { if (ui16NextWpChar == 0xB3D) // dakuten? - voicing { // First check exception(s) then check if voicing // exists! if ((ui16Zenkaku != 0x2652) && // is not 'N'? (flmKanaSubColTbl[ui16Zenkaku - 0x2600 + 1] == 3)) { ui16Zenkaku++; // Return 2 ui16CharsUsed++; } } else if (ui16NextWpChar == 0xB3E) // handakuten? - voicing { // Check if voicing exists! - will NOT access out of // table if (flmKanaSubColTbl[ui16Zenkaku - 0x2600 + 2] == 5) { ui16Zenkaku += 2; // Return 2 ui16CharsUsed++; } } } } else if (ucCharVal == 0x3D) // dakuten? { // Convert to voicing symbol ui16Zenkaku = 0x240A; } else if (ucCharVal == 0x3E) // handakuten? { // Convert to voicing symbol ui16Zenkaku = 0x240B; } // else cannot convert break; } // Other character sets CS 1,4,5,6 - symbols default: { // Instead of includes more tables from char.asm - look down the // Zen24Tohankaku[] table for a matching value - not much slower. for (uiLoop = 0; uiLoop < (sizeof(Zen24ToHankaku) / sizeof(BYTE_WORD_TBL)); uiLoop++) { if (Zen24ToHankaku[uiLoop].WordValue == ui16WpChar) { ui16Zenkaku = 0x2400 + Zen24ToHankaku[uiLoop].ByteValue; break; } } break; } } if (!ui16Zenkaku) { // Change return value ui16CharsUsed = 0; } *pui16Zenkaku = ui16Zenkaku; return (ui16CharsUsed); } /**************************************************************************** Desc: Convert a zenkaku (double wide) char to a hankaku (single wide) char Ret: Hankaku char or 0 if a conversion doesn't exist ****************************************************************************/ FLMUINT16 ZenToHankaku( FLMUINT16 ui16WpChar, FLMUINT16 * DakutenOrHandakutenRV) { FLMUINT16 ui16Hankaku = 0; FLMBYTE ucCharSet = ui16WpChar >> 8; FLMBYTE ucCharVal = ui16WpChar & 0xFF; FLMUINT uiLoop; switch (ucCharSet) { // SYMBOLS case 0x24: { for (uiLoop = 0; uiLoop < (sizeof(Zen24ToHankaku) / sizeof(BYTE_WORD_TBL)); uiLoop++) { // List is sorted so table entry is more you are done if (Zen24ToHankaku[uiLoop].ByteValue >= ucCharVal) { if (Zen24ToHankaku[uiLoop].ByteValue == ucCharVal) { ui16Hankaku = Zen24ToHankaku[uiLoop].WordValue; } break; } } break; } // ROMAN - 0x250F..2559 Hiragana - 0x255E..2580 case 0x25: { if (ucCharVal >= 0x0F && ucCharVal < 0x5E) { ui16Hankaku = ucCharVal + 0x21; } break; } // Katakana - 0x2600..2655 Greek - 0x265B..2695 case 0x26: { if (ucCharVal <= 0x55) // Katakana range { FLMBYTE ucCS11CharVal; FLMUINT16 ui16NextWpChar = 0; if ((ucCS11CharVal = MapCS26ToCharSet11[ucCharVal]) != 0xFF) { if (ucCS11CharVal & 0x80) { if (ucCS11CharVal & 0x40) { // Handakuten voicing ui16NextWpChar = 0xB3E; } else { // Dakuten voicing ui16NextWpChar = 0xB3D; } ucCS11CharVal &= 0x3F; } ui16Hankaku = 0x0b00 + ucCS11CharVal; if (ui16NextWpChar && DakutenOrHandakutenRV) { *DakutenOrHandakutenRV = ui16NextWpChar; } } } else if (ucCharVal <= 0x95) // Greek { FLMBYTE ucGreekChar = ucCharVal; // Make a zero based number. ucGreekChar -= 0x5E; // Check for lowercase if (ucGreekChar >= 0x20) { // Convert to upper case for now ucGreekChar -= 0x20; } if (ucGreekChar >= 2) { ucGreekChar++; } if (ucGreekChar >= 19) { ucGreekChar++; } // Convert to character set 8 ui16Hankaku = (ucGreekChar << 1) + 0x800; if (ucCharVal >= (0x5E + 0x20)) { // Adjust to lower case character ui16Hankaku++; } } break; } // Cyrillic case 0x27: { // Uppercase? if (ucCharVal <= 0x20) { ui16Hankaku = (ucCharVal << 1) + 0xa00; } else if (ucCharVal >= 0x30 && ucCharVal <= 0x50) { // Lower case ui16Hankaku = ((ucCharVal - 0x30) << 1) + 0xa01; } break; } } return (ui16Hankaku); }