mars-flaim/flaim/src/kyeword.cpp

//-------------------------------------------------------------------------
// Desc:	Eachword/substring parsing for eachword/substring indexing.
// Tabs:	3
//
//		Copyright (c) 1990-2000,2003-2006 Novell, Inc. All Rights Reserved.
//
//		This program is free software; you can redistribute it and/or
//		modify it under the terms of version 2 of the GNU General Public
//		License as published by the Free Software Foundation.
//
//		This program is distributed in the hope that it will be useful,
//		but WITHOUT ANY WARRANTY; without even the implied warranty of
//		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//		GNU General Public License for more details.
//
//		You should have received a copy of the GNU General Public License
//		along with this program; if not, contact Novell, Inc.
//
//		To contact Novell about this file by physical or electronic mail,
//		you may find current contact information at www.novell.com
//
// $Id: kyeword.cpp 12313 2006-01-19 15:14:44 -0700 (Thu, 19 Jan 2006) dsanders $
//-------------------------------------------------------------------------

#include "flaimsys.h"

/****************************************************************************
Desc:	Build the next substring key from a text value.  Characters are
		copied from the current text position into pKeyBuf until the text
		or the character limit (plus one) runs out.  The caller's text
		pointer and length are then moved to just past the first character
		that was copied, so repeated calls return ever shorter tails of the
		text.  Runs of spaces are reduced if the IFD flags ask for it; the
		dash and underscore flags are handed on to flmTextGetValue.
		Example: ABC  DEF
			ABC DEF
			BC DEF
			C DEF
			DEF
Ret:	TRUE if anything was placed in the key buffer, FALSE otherwise.
VISIT: This needs a lot of work to decide what to do with Kanji and
		word joining characters.  Need to use the routines in fqtextc.cpp
		to determine the character type.
****************************************************************************/
FLMBOOL KYSubstringParse(
	const FLMBYTE **	ppText,				// [in][out] points to text
	FLMUINT  *	 		puiTextLen,			// [in][out] length of text
	FLMUINT				uiIfdFlags,			// [in] flags
	FLMUINT				uiLimitParm,		// [in] Max characters
	FLMBYTE *	  		pKeyBuf,				// [out] key buffer to fill
  	FLMUINT *			puiKeyLen)			// [out] returns length
{
	const FLMBYTE *	pSrc = *ppText;
	FLMUINT				uiBytesLeft = *puiTextLen;
	FLMUINT				uiKeyBytes = 0;
	FLMUINT				uiPasses;
	FLMUINT				uiTextFlags = 0;
	FLMUINT				uiLeadFlag = FLM_NO_SPACE;
	FLMBOOL				bDropSpaces = FALSE;
	FLMBOOL				bSquashSpaces = FALSE;
	FLMBOOL				bUnderscoreIsSpace = FALSE;
	FLMBOOL				bSuppressSpace = TRUE;
	FLMBOOL				bStartPublished = FALSE;

	// Translate the index definition flags into local switches and into
	// the flag word that is passed to flmTextGetValue.

	if( uiIfdFlags & IFD_NO_SPACE)
	{
		bDropSpaces = TRUE;
		bSquashSpaces = TRUE;
		uiTextFlags |= FLM_NO_SPACE;
	}
	if( uiIfdFlags & IFD_NO_DASH)
	{
		uiTextFlags |= FLM_NO_DASH;
	}
	if( uiIfdFlags & IFD_NO_UNDERSCORE)
	{
		bUnderscoreIsSpace = TRUE;
		uiTextFlags |= FLM_NO_UNDERSCORE;
	}
	if( uiIfdFlags & IFD_MIN_SPACES)
	{
		bSquashSpaces = TRUE;
		uiTextFlags |= FLM_MIN_SPACES;
	}

	// One pass more than the requested limit is allowed so that the
	// text-to-collation routine is able to set the truncated flag.

	uiPasses = (uiLimitParm ? uiLimitParm : IFD_DEFAULT_SUBSTRING_LIMIT) + 1;

	// Every pass counts against the limit, including the ones that only
	// swallow a space.

	for( ; uiBytesLeft != 0 && uiPasses != 0; uiPasses--)
	{
		FLMUINT16		ui16WPValue;
		FLMUNICODE		ui16UniValue;
		FLMUINT			uiCharLen;
		FLMBYTE			byFirst = *pSrc;

		if( (byFirst & ASCII_CHAR_MASK) != ASCII_CHAR_CODE)
		{
			// Not a plain ASCII byte - ask flmTextGetValue how many bytes
			// make up this character.  A zero length ends the parse.

			uiCharLen = flmTextGetValue( pSrc, uiBytesLeft, NULL,
									uiTextFlags | uiLeadFlag,
									&ui16WPValue, &ui16UniValue);
			if( uiCharLen == 0)
			{
				break;
			}
			flmAssert( uiCharLen <= uiBytesLeft);
		}
		else
		{
			// A space (or an underscore that is being treated as a space)
			// is handled here when spaces are being reduced: at most one
			// space is emitted, and none before the first character.

			if( bSquashSpaces &&
				 (byFirst == ASCII_SPACE ||
				  (bUnderscoreIsSpace && byFirst == ASCII_UNDERSCORE)))
			{
				if( !bSuppressSpace)
				{
					pKeyBuf[ uiKeyBytes++] = ASCII_SPACE;
					bSuppressSpace = TRUE;
				}
				pSrc++;
				uiBytesLeft--;
				continue;
			}
			uiCharLen = 1;
		}

		// A real character was found: stop the leading-space handling and
		// copy the character's bytes through unchanged.

		uiLeadFlag = 0;
		bSuppressSpace = bDropSpaces;
		uiBytesLeft -= uiCharLen;
		for( ; uiCharLen != 0; uiCharLen--)
		{
			pKeyBuf[ uiKeyBytes++] = *pSrc++;
		}

		// After the first copied character, record where the next call
		// should begin.

		if( !bStartPublished)
		{
			bStartPublished = TRUE;
			*ppText = pSrc;
			*puiTextLen = uiBytesLeft;
		}
	}

	// Terminate the key.  An empty key (for example, text that was all
	// spaces) returns FALSE, which tells the caller indexing is done.

	pKeyBuf[ uiKeyBytes] = 0;
	*puiKeyLen = uiKeyBytes;
	return( uiKeyBytes ? TRUE : FALSE);
}

/****************************************************************************
Desc:	Keyword-ize the information in a node - node is assumed to be a
		TEXT node.  Each call returns the next word: characters are skipped
		until one is classified as SDWD_CHR by flmTextGetCharType, and the
		word then runs until a DELI_CHR or WDJN_CHR character, the end of
		the text, or the character limit is reached.  The word's bytes are
		copied to pKeyBuf, and the caller's text pointer and length are
		advanced past everything that was consumed (including the
		character that ended the word).
Ret:	TRUE if a word was returned, FALSE if no word was found.  On FALSE
		*puiKeyLen is set to zero and pKeyBuf is not written.
VISIT: This needs a lot of work to decide what to do with Kanji and
		word joining characters.  Need to use the routines in fqtextc.cpp
		to determine the character type.  Also, the code should be redone to
		be like the substring code above instead of counting the buffer.
****************************************************************************/
FLMBOOL KYEachWordParse(
	const FLMBYTE **	pText,				// [in][out] points to text
	FLMUINT  *	 		puiTextLen,			// [in][out] length of text
	FLMUINT				uiLimitParm,		// [in] Max characters
	FLMBYTE *	  		pKeyBuf,				// [out] Buffer of at least MAX_KEY_SIZ
  	FLMUINT  *			puiKeyLen)			// [out] returns length of the word
{
	const FLMBYTE *	pKey = NULL;
	const FLMBYTE *	pTmpKey;
	FLMUINT				uiLimit = uiLimitParm ? uiLimitParm : IFD_DEFAULT_SUBSTRING_LIMIT;
	FLMUINT				uiLen;
	FLMUINT				uiBytesProcessed = 0;
	FLMUINT				uiBytesLeft;
	FLMBOOL				bSkippingDelim = TRUE;
	FLMBOOL				bHaveWord = FALSE;
	FLMUINT				uiWordLen = 0;
	FLMUINT16			ui16WPValue;
	FLMUNICODE			ui16UniValue;
	FLMUINT				uiCharLen;
	FLMUINT				uiType;

	uiLen = *puiTextLen;
	pTmpKey = *pText;
	while ((uiBytesProcessed < uiLen) && (!bHaveWord) && uiLimit)
	{

		/*
		Only the bytes that have not been consumed yet may be examined,
		so pass the remaining length rather than the original length.
		*/

		uiBytesLeft = uiLen - uiBytesProcessed;
		uiCharLen = flmTextGetCharType( pTmpKey, uiBytesLeft,
									&ui16WPValue, &ui16UniValue, &uiType);

		/*
		A zero length would never advance and would loop forever; a length
		larger than what is left would make the returned text length wrap.
		In either case stop parsing here and return what we have so far.
		*/

		if (uiCharLen == 0 || uiCharLen > uiBytesLeft)
		{
			break;
		}

		/* Determine how to handle what we got. */

		if (bSkippingDelim)
		{

			/*
			If we were skipping delimiters, and we run into a non-delimiter
			character, set the bSkippingDelim flag to FALSE to indicate the
			beginning of a word.
			*/

			if (uiType & SDWD_CHR)
			{
				pKey = pTmpKey;
				uiWordLen = uiCharLen;
				bSkippingDelim = FALSE;
				uiLimit--;
			}
		}
		else
		{

			/*
			If we were NOT skipping delimiters, and we run into a delimiter
			output the word.
			*/

			if (uiType & (DELI_CHR | WDJN_CHR))
			{
				bHaveWord = TRUE;
			}
			else
			{
				uiWordLen += uiCharLen;
				uiLimit--;
			}
		}

		/* Skip past the character we just looked at. */

		pTmpKey += uiCharLen;
		uiBytesProcessed += uiCharLen;
	}

	/* Tell the caller how far we got. */

	*pText = pTmpKey;
	*puiTextLen -= uiBytesProcessed;

	/* Return the word, if any.  The key length is always set. */

	if (uiWordLen)
	{
		*puiKeyLen = uiWordLen;
		f_memcpy( pKeyBuf, pKey, uiWordLen);
	}
	else
	{
		*puiKeyLen = 0;
	}

	return( ( uiWordLen) ? TRUE : FALSE);
}