//------------------------------------------------------------------------------
// Desc:	This file contains the code to parse out individual words and
//			substrings in a text string.
// Tabs:	3
//
// Copyright (c) 1990-2000, 2002-2007 Novell, Inc. All Rights Reserved.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; version 2.1
// of the License.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Library Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com.
// // $Id$ //------------------------------------------------------------------------------ #include "flaimsys.h" FSTATIC RCODE flmGetCharacter( IF_PosIStream * pIStream, FLMUINT * puiCompareRules, FLMUINT16 * pui16WPValue, FLMUNICODE * puUniValue); FSTATIC RCODE flmTextGetCharType( IF_PosIStream * pIStream, FLMUINT * puiCompareRules, FLMUNICODE * puUniValue, FLMUINT * puiType); /***************************************************************************** Desc: *****************************************************************************/ FINLINE FLMUINT flmCharTypeAnsi7( FLMUINT16 ui16Char) { if( (ui16Char >= ASCII_LOWER_A && ui16Char <= ASCII_LOWER_Z) || (ui16Char >= ASCII_UPPER_A && ui16Char <= ASCII_UPPER_Z) || (ui16Char >= ASCII_ZERO && ui16Char <= ASCII_NINE)) { return SDWD_CHR; } if( ui16Char == 0x27) { return WDJN_CHR; } if( ui16Char <= 0x2B) { return DELI_CHR; } if( ui16Char == ASCII_COMMA || ui16Char == ASCII_DASH || ui16Char == ASCII_DOT || ui16Char == ASCII_SLASH || ui16Char == ASCII_COLON || ui16Char == ASCII_AT || ui16Char == ASCII_BACKSLASH || ui16Char == ASCII_UNDERSCORE) { return WDJN_CHR; } return DELI_CHR; } /***************************************************************************** Desc: Return the next WP or unicode character value. Return: Number of bytes formatted to return the character value. 
*****************************************************************************/ FSTATIC RCODE flmGetCharacter( IF_PosIStream * pIStream, FLMUINT * puiCompareRules, FLMUINT16 * pui16WPValue, FLMUNICODE * puUniValue) { RCODE rc = NE_SFLM_OK; FLMUNICODE uChar = 0; FLMUINT64 ui64AfterLastSpacePos = 0; FLMBOOL bLastCharWasSpace = FALSE; FLMUINT uiCompareRules = *puiCompareRules; for( ;;) { if (RC_BAD( rc = f_readUTF8CharAsUnicode( pIStream, &uChar))) { if (rc != NE_SFLM_EOF_HIT) { goto Exit; } rc = NE_SFLM_OK; if (bLastCharWasSpace && !(uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE)) { // bLastCharWasSpace flag can only be TRUE if either // FLM_COMP_IGNORE_TRAILING_SPACE is set or // FLM_COMP_COMPRESS_WHITESPACE is set. flmAssert( uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE); uChar = ASCII_SPACE; } else { uChar = 0; } break; } if ((uChar = f_convertChar( uChar, uiCompareRules)) == 0) { continue; } if (uChar == ASCII_SPACE) { if (uiCompareRules & FLM_COMP_COMPRESS_WHITESPACE) { bLastCharWasSpace = TRUE; ui64AfterLastSpacePos = pIStream->getCurrPosition(); } else if (uiCompareRules & FLM_COMP_IGNORE_TRAILING_SPACE) { // If the ignore trailing space flag is set, but the compress // space flag is not set, remember the position of the // first space character. If we hit a non-space character, // we will reposition to after this space character. if (!bLastCharWasSpace) { bLastCharWasSpace = TRUE; ui64AfterLastSpacePos = pIStream->getCurrPosition(); } } else { break; } } else { // Disable the ignore leading space flag, because we are now // past all leading space, and we don't want spaces ignored // now on account of that flag. 
uiCompareRules &= (~(FLM_COMP_IGNORE_LEADING_SPACE)); if (bLastCharWasSpace) { // Position to after the last space if (RC_BAD( rc = pIStream->positionTo( ui64AfterLastSpacePos))) { goto Exit; } uChar = ASCII_SPACE; bLastCharWasSpace = FALSE; } break; } } if (pui16WPValue) { if (!f_unicodeToWP( uChar, pui16WPValue)) { *pui16WPValue = 0; } } if (puUniValue) { *puUniValue = uChar; } Exit: *puiCompareRules = uiCompareRules; return( rc); } /**************************************************************************** Desc: Substring-ize the string in a node. Normalize spaces and hyphens if told to. Example: ABC DEF ABC DEF BC DEF C DEF DEF ****************************************************************************/ RCODE KYSubstringParse( IF_PosIStream * pIStream, FLMUINT * puiCompareRules, // [in/out] comparison rules FLMUINT uiLimitParm, // [in] Max characters FLMBYTE * pucSubstrBuf, // [out] buffer to fill FLMUINT * puiSubstrBytes, // [out] returns length FLMUINT * puiSubstrChars) { RCODE rc = NE_SFLM_OK; FLMUINT uiDestOffset = 0; FLMUINT uiDestSize = *puiSubstrBytes; FLMUINT uiLimit = uiLimitParm ? uiLimitParm : ICD_DEFAULT_SUBSTRING_LIMIT; FLMUINT uiCharCnt = 0; FLMUINT uiSize; FLMBOOL bFirstCharacter = TRUE; FLMUINT64 ui64SavePosition = pIStream->getCurrPosition(); // The limit must return one more than requested in order // for the text to collation routine to set the truncated flag. uiLimit++; while (uiLimit--) { FLMUNICODE uChar; if( RC_BAD( rc = flmGetCharacter( pIStream, puiCompareRules, NULL, &uChar))) { goto Exit; } if (!uChar) { break; } uiCharCnt++; uiSize = uiDestSize - uiDestOffset; if (RC_BAD( rc = f_uni2UTF8( uChar, &pucSubstrBuf[ uiDestOffset], &uiSize))) { goto Exit; } uiDestOffset += uiSize; // If on the first word, position to start on next character // for the next call. if (bFirstCharacter) { bFirstCharacter = FALSE; // First character - save position so we can restore it // upon leaving the routine. 
ui64SavePosition = pIStream->getCurrPosition(); } } if (uiDestOffset) { pucSubstrBuf[ uiDestOffset++] = 0; } *puiSubstrBytes = (FLMUINT)uiDestOffset; *puiSubstrChars = uiCharCnt; // Restore position of stream to first character after the first // character we found - to ready for next call. if (RC_BAD( rc = pIStream->positionTo( ui64SavePosition))) { goto Exit; } Exit: return( rc); } /**************************************************************************** Desc: ****************************************************************************/ RCODE KYEachWordParse( IF_PosIStream * pIStream, FLMUINT * puiCompareRules, FLMUINT uiLimit, // [in] Max characters FLMBYTE * pucWordBuf, // [out] Buffer of at least SFLM_MAX_KEY_SIZE FLMUINT * puiWordLen) { RCODE rc = NE_SFLM_OK; FLMBOOL bSkippingDelim = TRUE; FLMUINT uiWordLen = 0; FLMUINT uiWordBufSize = *puiWordLen; FLMUNICODE uChar; FLMUINT uiType = 0; FLMUINT uiSize; if (!uiLimit) { uiLimit = ICD_DEFAULT_SUBSTRING_LIMIT; } while (uiLimit) { if (RC_BAD( rc = flmTextGetCharType( pIStream, puiCompareRules, &uChar, &uiType))) { goto Exit; } if (!uChar) { break; } // Determine how to handle what we got. if (bSkippingDelim) { // If we were skipping delimiters, and we run into a non-delimiter // character, set the bSkippingDelim flag to FALSE to indicate the // beginning of a word. if (uiType & SDWD_CHR) { bSkippingDelim = FALSE; uiLimit--; uiSize = uiWordBufSize - uiWordLen; if (RC_BAD( rc = f_uni2UTF8( uChar, &pucWordBuf [uiWordLen], &uiSize))) { goto Exit; } uiWordLen += uiSize; } } else { // If we were NOT skipping delimiters, and we run into a delimiter // output the word. 
if (uiType & (DELI_CHR | WDJN_CHR)) { break; } uiSize = uiWordBufSize - uiWordLen; if (RC_BAD( rc = f_uni2UTF8( uChar, &pucWordBuf [uiWordLen], &uiSize))) { goto Exit; } uiWordLen += uiSize; } } // Return the word, if any if (uiWordLen) { pucWordBuf [uiWordLen++] = 0; } *puiWordLen = uiWordLen; Exit: return( rc); } /***************************************************************************** Desc: Return the next WP or unicode character value and parsing type. *****************************************************************************/ FSTATIC RCODE flmTextGetCharType( IF_PosIStream * pIStream, FLMUINT * puiCompareRules, FLMUNICODE * puUniValue, // [out] Unicode value FLMUINT * puiType // Char attribute type. ) { RCODE rc = NE_SFLM_OK; FLMUINT16 ui16WPValue; FLMUINT uiCharSet; // We add on compress white space flag because we really want to ignore // spaces anyway - we are trying to get the "words" from this stream. if( RC_BAD( rc = flmGetCharacter( pIStream, puiCompareRules, &ui16WPValue, puUniValue))) { goto Exit; } if (ui16WPValue) { if (ui16WPValue < 0x080) { *puiType = flmCharTypeAnsi7( ui16WPValue); goto Exit; } uiCharSet = (FLMUINT)(ui16WPValue >> 8); if (uiCharSet == 1 || uiCharSet == 2 || (uiCharSet >= 8 && uiCharSet <= 11)) { *puiType = SDWD_CHR; goto Exit; } *puiType = DELI_CHR; } else { // For now all unmapped unicode characters are treated // as delimeters *puiType = DELI_CHR; } Exit: return( rc); }