Files
mars-flaim/flaim/src/kyeword.cpp
dsandersoremutah c55dab446f Renamed version4 to flaim and version5 to xflaim
git-svn-id: https://svn.code.sf.net/p/flaim/code/trunk@7 0109f412-320b-0410-ab79-c3e0c5ffbbe6
2006-01-27 21:06:39 +00:00

225 lines
6.3 KiB
C++

//-------------------------------------------------------------------------
// Desc: Eachword/substring parsing for eachword/substring indexing.
// Tabs: 3
//
// Copyright (c) 1990-2000,2003-2006 Novell, Inc. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of version 2 of the GNU General Public
// License as published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, contact Novell, Inc.
//
// To contact Novell about this file by physical or electronic mail,
// you may find current contact information at www.novell.com
//
// $Id: kyeword.cpp 12313 2006-01-19 15:14:44 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------
#include "flaimsys.h"
/****************************************************************************
Desc:	Build the next substring key from a text value.  Starting at *ppText,
		up to (limit + 1) loop iterations copy characters into pKeyBuf; the
		extra iteration lets the text-to-collation routine detect truncation.
		After the first character has been copied, *ppText and *puiTextLen
		are updated to point just past it, so the next call begins one
		character later.

		When IFD_MIN_SPACES or IFD_NO_SPACE is set, leading ASCII spaces are
		skipped and runs of spaces are collapsed (with IFD_NO_SPACE they are
		dropped entirely).  When IFD_NO_UNDERSCORE is also set, ASCII
		underscores are treated as spaces for that purpose.

		Example, space normalization on, successive calls on "ABC DEF":
			ABC DEF
			BC DEF
			C DEF
			DEF
			...

VISIT:	This needs a lot of work to decide what to do with Kanji and
		word joining characters.  Need to use the routines in fqtextc.cpp
		to determine the character type.

Ret:	TRUE if at least one byte was placed in pKeyBuf, FALSE otherwise
		(empty text, or text made up only of skipped spaces).  pKeyBuf is
		always null terminated and *puiKeyLen always receives the key length.
****************************************************************************/
FLMBOOL KYSubstringParse(
	const FLMBYTE **	ppText,			// [in][out] points to text
	FLMUINT *			puiTextLen,		// [in][out] length of text
	FLMUINT				uiIfdFlags,		// [in] flags
	FLMUINT				uiLimitParm,	// [in] Max characters
	FLMBYTE *			pKeyBuf,			// [out] key buffer to fill
	FLMUINT *			puiKeyLen)		// [out] returns length
{
	const FLMBYTE *	pCursor = *ppText;
	FLMUINT				uiBytesLeft = *puiTextLen;
	FLMUINT				uiOut = 0;
	FLMUINT				uiCharBudget;
	FLMUINT				uiValueFlags = 0;
	FLMUINT				uiLeadFlag = FLM_NO_SPACE;
	FLMBOOL				bDropSpaces = (uiIfdFlags & IFD_NO_SPACE) ? TRUE : FALSE;
	FLMBOOL				bCollapseSpaces =
								(uiIfdFlags & (IFD_MIN_SPACES | IFD_NO_SPACE)) ? TRUE : FALSE;
	FLMBOOL				bUnderscoreIsSpace =
								(uiIfdFlags & IFD_NO_UNDERSCORE) ? TRUE : FALSE;
	FLMBOOL				bSwallowSpace = TRUE;		// TRUE until a real character is seen
	FLMBOOL				bAdvancePending = TRUE;		// caller's position not yet advanced

	// Translate the index definition flags into the flags that
	// flmTextGetValue understands.

	if( bDropSpaces)
	{
		uiValueFlags |= FLM_NO_SPACE;
	}
	if( uiIfdFlags & IFD_NO_DASH)
	{
		uiValueFlags |= FLM_NO_DASH;
	}
	if( bUnderscoreIsSpace)
	{
		uiValueFlags |= FLM_NO_UNDERSCORE;
	}
	if( uiIfdFlags & IFD_MIN_SPACES)
	{
		uiValueFlags |= FLM_MIN_SPACES;
	}

	// One more than the requested limit is allowed through so that the
	// text-to-collation routine can set the truncated flag.

	uiCharBudget = (uiLimitParm ? uiLimitParm : IFD_DEFAULT_SUBSTRING_LIMIT) + 1;

	while( uiBytesLeft != 0 && uiCharBudget != 0)
	{
		FLMBYTE			ucByte = *pCursor;
		FLMUINT16		ui16WPValue;
		FLMUNICODE		ui16UniValue;
		FLMUINT			uiCharLen;
		FLMUINT			uiLoop;

		// Every pass through the loop - including one that only skips a
		// space - uses up one unit of the budget.

		uiCharBudget--;

		if( (ucByte & ASCII_CHAR_MASK) != ASCII_CHAR_CODE)
		{
			// Not a plain ASCII byte: ask the text decoder how many bytes
			// this character occupies.  Zero means nothing more to take.

			uiCharLen = flmTextGetValue( pCursor, uiBytesLeft, NULL,
									uiValueFlags | uiLeadFlag,
									&ui16WPValue, &ui16UniValue);
			if( uiCharLen == 0)
			{
				break;
			}
			flmAssert( uiCharLen <= uiBytesLeft);
		}
		else
		{
			if( bUnderscoreIsSpace && ucByte == ASCII_UNDERSCORE)
			{
				ucByte = ASCII_SPACE;
			}

			if( bCollapseSpaces && ucByte == ASCII_SPACE)
			{
				// Emit at most one space per run, and none at all while
				// spaces are being swallowed.

				if( !bSwallowSpace)
				{
					pKeyBuf[ uiOut] = ASCII_SPACE;
					uiOut++;
				}
				bSwallowSpace = TRUE;
				pCursor++;
				uiBytesLeft--;
				continue;
			}

			uiCharLen = 1;
		}

		// A real character: the leading-space state is over.

		uiLeadFlag = 0;
		bSwallowSpace = bDropSpaces;
		uiBytesLeft -= uiCharLen;

		// Copy the character's raw bytes from the text into the key.

		for( uiLoop = 0; uiLoop < uiCharLen; uiLoop++)
		{
			pKeyBuf[ uiOut++] = *pCursor++;
		}

		// After the first character, record where the next call should
		// start.

		if( bAdvancePending)
		{
			bAdvancePending = FALSE;
			*ppText = pCursor;
			*puiTextLen = uiBytesLeft;
		}
	}

	pKeyBuf[ uiOut] = '\0';

	// An empty key (e.g. all spaces) returns FALSE, which tells the caller
	// that indexing is done.

	*puiKeyLen = uiOut;
	return( (uiOut != 0) ? TRUE : FALSE);
}
/****************************************************************************
Desc:	Keyword-ize the information in a node - node is assumed to be a
		TEXT node.  Each call extracts the next word from the text:

		* Characters are skipped until one whose type includes SDWD_CHR is
		  found; that character starts the word.
		* The word grows until a character whose type includes DELI_CHR or
		  WDJN_CHR is met (that character is consumed but not made part of
		  the word), until the character limit is used up, or until the
		  text is exhausted.
		* The word's raw bytes are copied to pKeyBuf (no terminator is
		  added) and *puiKeyLen receives the byte count (0 if no word).
		* *pText is advanced and *puiTextLen reduced by the bytes consumed.

		Scanning also stops if the character decoder reports a length of
		zero or a length larger than the bytes that remain; the offending
		bytes are left unconsumed.

VISIT:	This needs a lot of work to decide what to do with Kanji and
		word joining characters.  Need to use the routines in fqtextc.cpp
		to determine the character type.  Also, the code should be redone to
		be like the substring code above instead of counting the buffer.

Ret:	TRUE if a word was returned, FALSE if no word was found.
****************************************************************************/
FLMBOOL KYEachWordParse(
	const FLMBYTE **	pText,
	FLMUINT *			puiTextLen,
	FLMUINT				uiLimitParm,	// [in] Max characters
	FLMBYTE *			pKeyBuf,			// [out] Buffer of at least MAX_KEY_SIZ
	FLMUINT *			puiKeyLen)
{
	const FLMBYTE *	pKey = NULL;
	const FLMBYTE *	pTmpKey = *pText;
	FLMUINT				uiLimit = uiLimitParm ? uiLimitParm : IFD_DEFAULT_SUBSTRING_LIMIT;
	FLMUINT				uiLen = *puiTextLen;
	FLMUINT				uiBytesProcessed = 0;
	FLMBOOL				bSkippingDelim = TRUE;
	FLMBOOL				bHaveWord = FALSE;
	FLMUINT				uiWordLen = 0;
	FLMUINT16			ui16WPValue;
	FLMUNICODE			ui16UniValue;
	FLMUINT				uiCharLen;
	FLMUINT				uiType;

	while( (uiBytesProcessed < uiLen) && !bHaveWord && uiLimit)
	{
		// Only the bytes from pTmpKey to the end of the text may be
		// examined, so hand the decoder the remaining length rather than
		// the length of the whole text.

		FLMUINT	uiRemaining = uiLen - uiBytesProcessed;

		uiCharLen = flmTextGetCharType( pTmpKey, uiRemaining,
								&ui16WPValue, &ui16UniValue, &uiType);

		// A zero length would never advance the loop, and a length past
		// the end of the text would wrap the remaining-length arithmetic
		// below.  Stop scanning in either case.

		if( uiCharLen == 0 || uiCharLen > uiRemaining)
		{
			break;
		}

		// Determine how to handle what we got.

		if( bSkippingDelim)
		{
			// If we were skipping delimiters, and we run into a
			// non-delimiter character, clear bSkippingDelim to mark the
			// beginning of a word.

			if( uiType & SDWD_CHR)
			{
				pKey = pTmpKey;
				uiWordLen = uiCharLen;
				bSkippingDelim = FALSE;
				uiLimit--;
			}
		}
		else
		{
			// If we were NOT skipping delimiters, and we run into a
			// delimiter, output the word.

			if( uiType & (DELI_CHR | WDJN_CHR))
			{
				bHaveWord = TRUE;
			}
			else
			{
				uiWordLen += uiCharLen;
				uiLimit--;
			}
		}

		// Skip past the character we are pointing at.

		pTmpKey += uiCharLen;
		uiBytesProcessed += uiCharLen;
	}

	*pText = pTmpKey;
	*puiTextLen -= uiBytesProcessed;

	// Always report the key length so the caller never sees a stale or
	// uninitialized value when no word was found.

	*puiKeyLen = uiWordLen;

	// Return the word, if any.

	if( uiWordLen)
	{
		f_memcpy( pKeyBuf, pKey, uiWordLen);
	}

	return( (uiWordLen) ? TRUE : FALSE);
}