git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
225 lines
6.3 KiB
C++
225 lines
6.3 KiB
C++
//-------------------------------------------------------------------------
|
|
// Desc: Eachword/substring parsing for eachword/substring indexing.
|
|
// Tabs: 3
|
|
//
|
|
// Copyright (c) 1990-2000,2003-2006 Novell, Inc. All Rights Reserved.
|
|
//
|
|
// This program is free software; you can redistribute it and/or
|
|
// modify it under the terms of version 2 of the GNU General Public
|
|
// License as published by the Free Software Foundation.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, contact Novell, Inc.
|
|
//
|
|
// To contact Novell about this file by physical or electronic mail,
|
|
// you may find current contact information at www.novell.com
|
|
//
|
|
// $Id: kyeword.cpp 12313 2006-01-19 15:14:44 -0700 (Thu, 19 Jan 2006) dsanders $
|
|
//-------------------------------------------------------------------------
|
|
|
|
#include "flaimsys.h"
|
|
|
|
/****************************************************************************
|
|
Desc: Substring-ize the string in a node. Normalize spaces and hyphens if
|
|
told to. Example: ABC DEF
|
|
ABC DEF
|
|
BC DEF
|
|
C DEF
|
|
DEF
|
|
VISIT: This needs a lot of word to decide what to do with Kanji and
|
|
word joining charactings. Need to use the routines in fqtextc.cpp
|
|
to determine the character type.
|
|
****************************************************************************/
|
|
FLMBOOL KYSubstringParse(
|
|
const FLMBYTE ** ppText, // [in][out] points to text
|
|
FLMUINT * puiTextLen, // [in][out] length of text
|
|
FLMUINT uiIfdFlags, // [in] flags
|
|
FLMUINT uiLimitParm, // [in] Max characters
|
|
FLMBYTE * pKeyBuf, // [out] key buffer to fill
|
|
FLMUINT * puiKeyLen) // [out] returns length
|
|
{
|
|
const FLMBYTE * pText = *ppText;
|
|
FLMUINT uiLen = *puiTextLen;
|
|
FLMUINT uiWordLen = 0;
|
|
FLMUINT uiLimit = uiLimitParm ? uiLimitParm : IFD_DEFAULT_SUBSTRING_LIMIT;
|
|
FLMUINT uiFlags = 0;
|
|
FLMUINT uiLeadingSpace = FLM_NO_SPACE;
|
|
|
|
FLMBOOL bIgnoreSpaceDefault = (uiIfdFlags & IFD_NO_SPACE) ? TRUE : FALSE;
|
|
FLMBOOL bIgnoreSpace = TRUE;
|
|
FLMBOOL bIgnoreDash = (uiIfdFlags & IFD_NO_DASH) ? TRUE : FALSE;
|
|
FLMBOOL bMinSpaces = (uiIfdFlags & (IFD_MIN_SPACES | IFD_NO_SPACE)) ? TRUE : FALSE;
|
|
FLMBOOL bNoUnderscore = (uiIfdFlags & IFD_NO_UNDERSCORE) ? TRUE : FALSE;
|
|
FLMBOOL bFirstCharacter = TRUE;
|
|
|
|
// Set uiFlags
|
|
if( bIgnoreSpaceDefault)
|
|
uiFlags |= FLM_NO_SPACE;
|
|
if( bIgnoreDash)
|
|
uiFlags |= FLM_NO_DASH;
|
|
if( bNoUnderscore)
|
|
uiFlags |= FLM_NO_UNDERSCORE;
|
|
if( uiIfdFlags & IFD_MIN_SPACES)
|
|
uiFlags |= FLM_MIN_SPACES;
|
|
|
|
/*
|
|
The limit must return one more than requested in order
|
|
for the text to collation routine to set the truncated flag.
|
|
*/
|
|
uiLimit++;
|
|
|
|
while( uiLen && uiLimit--)
|
|
{
|
|
FLMBYTE ch = *pText;
|
|
FLMUINT16 ui16WPValue;
|
|
FLMUNICODE ui16UniValue;
|
|
FLMUINT uiCharLen;
|
|
|
|
if( (ch & ASCII_CHAR_MASK) == ASCII_CHAR_CODE)
|
|
{
|
|
if( ch == ASCII_UNDERSCORE && bNoUnderscore)
|
|
{
|
|
ch = ASCII_SPACE;
|
|
}
|
|
if( ch == ASCII_SPACE && bMinSpaces)
|
|
{
|
|
if( !bIgnoreSpace)
|
|
{
|
|
pKeyBuf[ uiWordLen++ ] = ASCII_SPACE;
|
|
}
|
|
bIgnoreSpace = TRUE;
|
|
pText++;
|
|
uiLen--;
|
|
continue;
|
|
}
|
|
ui16WPValue = (FLMUINT16) ch;
|
|
uiCharLen = 1;
|
|
}
|
|
else
|
|
{
|
|
if( (uiCharLen = flmTextGetValue( pText, uiLen, NULL,
|
|
uiFlags | uiLeadingSpace,
|
|
&ui16WPValue, &ui16UniValue)) == 0)
|
|
break;
|
|
flmAssert( uiCharLen <= uiLen);
|
|
}
|
|
uiLeadingSpace = 0;
|
|
bIgnoreSpace = bIgnoreSpaceDefault;
|
|
uiLen -= uiCharLen;
|
|
while( uiCharLen--)
|
|
{
|
|
pKeyBuf[ uiWordLen++ ] = *pText++;
|
|
}
|
|
|
|
// If on the first word position to start on next character
|
|
// for the next call.
|
|
if( bFirstCharacter)
|
|
{
|
|
bFirstCharacter = FALSE;
|
|
// First character - set return value.
|
|
*ppText = pText;
|
|
*puiTextLen = uiLen;
|
|
}
|
|
}
|
|
pKeyBuf[ uiWordLen ] = '\0';
|
|
// Case of all spaces - the FALSE will trigger indexing is done.
|
|
*puiKeyLen = (FLMUINT)uiWordLen;
|
|
return( ( uiWordLen) ? TRUE : FALSE);
|
|
}
|
|
|
|
/****************************************************************************
|
|
Desc: Keyword-ize the information in a node - node is assumed to be a
|
|
TEXT node.
|
|
VISIT: This needs a lot of work to decide what to do with Kanji and
|
|
word joining charactings. Need to use the routines in fqtextc.cpp
|
|
to determine the character type. Also, the code should be redone to
|
|
be like the substring code above instead of count the buffer.
|
|
****************************************************************************/
|
|
FLMBOOL KYEachWordParse(
|
|
const FLMBYTE ** pText,
|
|
FLMUINT * puiTextLen,
|
|
FLMUINT uiLimitParm, // [in] Max characters
|
|
FLMBYTE * pKeyBuf, // [out] Buffer of at least MAX_KEY_SIZ
|
|
FLMUINT * puiKeyLen)
|
|
{
|
|
const FLMBYTE * pKey = NULL;
|
|
const FLMBYTE * pTmpKey;
|
|
FLMUINT uiLimit = uiLimitParm ? uiLimitParm : IFD_DEFAULT_SUBSTRING_LIMIT;
|
|
FLMUINT uiLen;
|
|
FLMUINT uiBytesProcessed = 0;
|
|
FLMBOOL bSkippingDelim = TRUE;
|
|
FLMBOOL bHaveWord = FALSE;
|
|
FLMUINT uiWordLen = 0;
|
|
FLMUINT16 ui16WPValue;
|
|
FLMUNICODE ui16UniValue;
|
|
FLMUINT uiCharLen;
|
|
FLMUINT uiType;
|
|
|
|
uiLen = *puiTextLen;
|
|
pTmpKey = *pText;
|
|
while ((uiBytesProcessed < uiLen) && (!bHaveWord) && uiLimit)
|
|
{
|
|
uiCharLen = flmTextGetCharType( pTmpKey, uiLen,
|
|
&ui16WPValue, &ui16UniValue, &uiType);
|
|
|
|
/* Determine how to handle what we got. */
|
|
|
|
if (bSkippingDelim)
|
|
{
|
|
|
|
/*
|
|
If we were skipping delimiters, and we run into a non-delimiter
|
|
character, set the bSkippingDelim flag to FALSE to indicate the
|
|
beginning of a word.
|
|
*/
|
|
|
|
if (uiType & SDWD_CHR)
|
|
{
|
|
pKey = pTmpKey;
|
|
uiWordLen = uiCharLen;
|
|
bSkippingDelim = FALSE;
|
|
uiLimit--;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
|
|
/*
|
|
If we were NOT skipping delimiters, and we run into a delimiter
|
|
output the word.
|
|
*/
|
|
|
|
if (uiType & (DELI_CHR | WDJN_CHR))
|
|
bHaveWord = TRUE;
|
|
else
|
|
{
|
|
uiWordLen += uiCharLen;
|
|
uiLimit--;
|
|
}
|
|
}
|
|
|
|
/* Increment str to skip past what we are pointing at. */
|
|
|
|
pTmpKey += uiCharLen;
|
|
uiBytesProcessed += uiCharLen;
|
|
}
|
|
|
|
*pText = pTmpKey;
|
|
*puiTextLen -= uiBytesProcessed;
|
|
|
|
/* Return the word, if any. */
|
|
|
|
if (uiWordLen)
|
|
{
|
|
*puiKeyLen = uiWordLen;
|
|
f_memcpy( pKeyBuf, pKey, uiWordLen);
|
|
}
|
|
|
|
return( ( uiWordLen) ? TRUE : FALSE);
|
|
}
|