
ts_parse.c
/*-------------------------------------------------------------------------
 *
 * ts_parse.c
 *          main parse functions for tsearch
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *      $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.12.2.1 2009/08/30 16:53:37 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_cache.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"

#define IGNORE_LONGLEXEME     1

/*
 * Lexize subsystem
 */

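/* A single token produced by the parser, kept in a singly linked list */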
typedef struct ParsedLex
{
      int               type;
      char     *lemm;
      int               lenlemm;
      bool        resfollow;
      struct ParsedLex *next;
} ParsedLex;

typedef struct ListParsedLex
{
      ParsedLex  *head;
      ParsedLex  *tail;
} ListParsedLex;

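/* State of the lexize pass over the parser's token stream */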
typedef struct
{
      TSConfigCacheEntry *cfg;
      Oid               curDictId;
      int               posDict;
      DictSubState dictState;
      ParsedLex  *curSub;
      ListParsedLex towork;         /* current list to work on */
      ListParsedLex waste;          /* lexemes that have already been lexized */

      /*
       * Fields to store the last variant to lexize (used by thesaurus and
       * similar dictionaries, which want several lexemes)
       */

      ParsedLex  *lastRes;
      TSLexeme   *tmpRes;
} LexizeData;

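/* Initialize lexizer state for the given text search configuration */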
static void
LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
{
      ld->cfg = cfg;
      ld->curDictId = InvalidOid;
      ld->posDict = 0;
      ld->towork.head = ld->towork.tail = ld->curSub = NULL;
      ld->waste.head = ld->waste.tail = NULL;
      ld->lastRes = NULL;
      ld->tmpRes = NULL;
}

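/* Append newpl to the end of the list */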
static void
LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
{
      if (list->tail)
      {
            list->tail->next = newpl;
            list->tail = newpl;
      }
      else
            list->head = list->tail = newpl;
      newpl->next = NULL;
}

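/* Detach and return the head of the list, or NULL if the list is empty */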
static ParsedLex *
LPLRemoveHead(ListParsedLex *list)
{
      ParsedLex  *res = list->head;

      if (list->head)
            list->head = list->head->next;

      if (list->head == NULL)
            list->tail = NULL;

      return res;
}

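/* Add one parser token to the to-work list; curSub points at the new entry */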
static void
LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
{
      ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));

      newpl->type = type;
      newpl->lemm = lemm;
      newpl->lenlemm = lenlemm;
      LPLAddTail(&ld->towork, newpl);
      ld->curSub = ld->towork.tail;
}

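/* Move the head of the to-work list onto the waste list */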
static void
RemoveHead(LexizeData *ld)
{
      LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));

      ld->posDict = 0;
}

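/*
 * Hand the accumulated waste list back to the caller (if correspondLexem is
 * given) or free its nodes, then reset the list.
 */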
static void
setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
{
      if (correspondLexem)
      {
            *correspondLexem = ld->waste.head;
      }
      else
      {
            ParsedLex  *tmp,
                           *ptr = ld->waste.head;

            while (ptr)
            {
                  tmp = ptr->next;
                  pfree(ptr);
                  ptr = tmp;
            }
      }
      ld->waste.head = ld->waste.tail = NULL;
}

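/*
 * Move items from the to-work list to the waste list, up to and including
 * the "stop" element, advancing curSub past it.
 */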
static void
moveToWaste(LexizeData *ld, ParsedLex *stop)
{
      bool        go = true;

      while (ld->towork.head && go)
      {
            if (ld->towork.head == stop)
            {
                  ld->curSub = stop->next;
                  go = false;
            }
            RemoveHead(ld);
      }
}

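/*
 * Remember the latest partial result from a multi-input dictionary,
 * freeing any previously stored one.
 */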
static void
setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
{
      if (ld->tmpRes)
      {
            TSLexeme   *ptr;

            for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
                  pfree(ptr->lexeme);
            pfree(ld->tmpRes);
      }
      ld->tmpRes = res;
      ld->lastRes = lex;
}

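/*
 * Run the pending tokens through the configuration's dictionary maps.
 *
 * Returns a palloc'd, null-terminated TSLexeme array as soon as some
 * dictionary produces output, or NULL when no more output can be produced
 * from the tokens seen so far.  If correspondLexem is not NULL, the consumed
 * ParsedLex items are returned there (and not freed here).
 */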
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
      int               i;
      ListDictionary *map;
      TSDictionaryCacheEntry *dict;
      TSLexeme   *res;

      if (ld->curDictId == InvalidOid)
      {
            /*
             * usual mode: the dictionary wants only one word, but we should keep
             * in mind that we have to go through the whole stack
             */

            while (ld->towork.head)
            {
                  ParsedLex  *curVal = ld->towork.head;

                  map = ld->cfg->map + curVal->type;

                  if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
                  {
                        /* skip this type of lexeme */
                        RemoveHead(ld);
                        continue;
                  }

                  for (i = ld->posDict; i < map->len; i++)
                  {
                        dict = lookup_ts_dictionary_cache(map->dictIds[i]);

                        ld->dictState.isend = ld->dictState.getnext = false;
                        ld->dictState.private = NULL;
                        res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                                                           &(dict->lexize),
                                                                   PointerGetDatum(dict->dictData),
                                                                     PointerGetDatum(curVal->lemm),
                                                                    Int32GetDatum(curVal->lenlemm),
                                                                    PointerGetDatum(&ld->dictState)
                                                                                                 ));

                        if (ld->dictState.getnext)
                        {
                              /*
                               * dictionary wants the next word, so set up and store the
                               * current position and go to multiword mode
                               */

                              ld->curDictId = DatumGetObjectId(map->dictIds[i]);
                              ld->posDict = i + 1;
                              ld->curSub = curVal->next;
                              if (res)
                                    setNewTmpRes(ld, curVal, res);
                              return LexizeExec(ld, correspondLexem);
                        }

                        if (!res)         /* dictionary doesn't know this lexeme */
                              continue;

                        RemoveHead(ld);
                        setCorrLex(ld, correspondLexem);
                        return res;
                  }

                  RemoveHead(ld);
            }
      }
      else
      {                                         /* curDictId is valid */
            dict = lookup_ts_dictionary_cache(ld->curDictId);

            /*
             * Dictionary ld->curDictId asks us about the following words
             */

            while (ld->curSub)
            {
                  ParsedLex  *curVal = ld->curSub;

                  map = ld->cfg->map + curVal->type;

                  if (curVal->type != 0)
                  {
                        bool        dictExists = false;

                        if (curVal->type >= ld->cfg->lenmap || map->len == 0)
                        {
                              /* skip this type of lexeme */
                              ld->curSub = curVal->next;
                              continue;
                        }

                        /*
                         * We should be sure that the current type of lexeme is
                         * recognized by our dictionary: we just check whether it
                         * exists in the list of dictionaries.
                         */
                        for (i = 0; i < map->len && !dictExists; i++)
                              if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
                                    dictExists = true;

                        if (!dictExists)
                        {
                              /*
                               * Dictionary can't work with the current type of lexeme;
                               * return to basic mode and redo all stored lexemes
                               */
                              ld->curDictId = InvalidOid;
                              return LexizeExec(ld, correspondLexem);
                        }
                  }

                  ld->dictState.isend = (curVal->type == 0) ? true : false;
                  ld->dictState.getnext = false;

                  res = (TSLexeme *) DatumGetPointer(FunctionCall4(
                                                                                           &(dict->lexize),
                                                                   PointerGetDatum(dict->dictData),
                                                                     PointerGetDatum(curVal->lemm),
                                                                    Int32GetDatum(curVal->lenlemm),
                                                                    PointerGetDatum(&ld->dictState)
                                                                                           ));

                  if (ld->dictState.getnext)
                  {
                        /* Dictionary wants one more */
                        ld->curSub = curVal->next;
                        if (res)
                              setNewTmpRes(ld, curVal, res);
                        continue;
                  }

                  if (res || ld->tmpRes)
                  {
                        /*
                         * Dictionary normalizes lexemes, so we remove all used lexemes
                         * from the stack, return to basic mode and redo the end of the
                         * stack (if it exists)
                         */
                        if (res)
                        {
                              moveToWaste(ld, ld->curSub);
                        }
                        else
                        {
                              res = ld->tmpRes;
                              moveToWaste(ld, ld->lastRes);
                        }

                        /* reset to initial state */
                        ld->curDictId = InvalidOid;
                        ld->posDict = 0;
                        ld->lastRes = NULL;
                        ld->tmpRes = NULL;
                        setCorrLex(ld, correspondLexem);
                        return res;
                  }

                  /*
                   * Dictionary doesn't want the next lexeme and didn't recognize
                   * anything; redo from ld->towork.head
                   */
                  ld->curDictId = InvalidOid;
                  return LexizeExec(ld, correspondLexem);
            }
      }

      setCorrLex(ld, correspondLexem);
      return NULL;
}

/*
 * Parse string and lexize words.
 *
 * prs will be filled in.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
      int               type,
                        lenlemm;
      char     *lemm = NULL;
      LexizeData  ldata;
      TSLexeme   *norms;
      TSConfigCacheEntry *cfg;
      TSParserCacheEntry *prsobj;
      void     *prsdata;

      cfg = lookup_ts_config_cache(cfgId);
      prsobj = lookup_ts_parser_cache(cfg->prsId);

      prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
                                                                               PointerGetDatum(buf),
                                                                               Int32GetDatum(buflen)));

      LexizeInit(&ldata, cfg);

      do
      {
            type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                                               PointerGetDatum(prsdata),
                                                               PointerGetDatum(&lemm),
                                                               PointerGetDatum(&lenlemm)));

            if (type > 0 && lenlemm >= MAXSTRLEN)
            {
#ifdef IGNORE_LONGLEXEME
                  ereport(NOTICE,
                              (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                               errmsg("word is too long to be indexed"),
                               errdetail("Words longer than %d characters are ignored.",
                                             MAXSTRLEN)));
                  continue;
#else
                  ereport(ERROR,
                              (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                               errmsg("word is too long to be indexed"),
                               errdetail("Words longer than %d characters are ignored.",
                                             MAXSTRLEN)));
#endif
            }

            LexizeAddLemm(&ldata, type, lemm, lenlemm);

            while ((norms = LexizeExec(&ldata, NULL)) != NULL)
            {
                  TSLexeme   *ptr = norms;

                  prs->pos++;             /* set pos */

                  while (ptr->lexeme)
                  {
                        if (prs->curwords == prs->lenwords)
                        {
                              prs->lenwords *= 2;
                              prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
                        }

                        if (ptr->flags & TSL_ADDPOS)
                              prs->pos++;
                        prs->words[prs->curwords].len = strlen(ptr->lexeme);
                        prs->words[prs->curwords].word = ptr->lexeme;
                        prs->words[prs->curwords].nvariant = ptr->nvariant;
                        prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
                        prs->words[prs->curwords].alen = 0;
                        prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
                        ptr++;
                        prs->curwords++;
                  }
                  pfree(norms);
            }
      } while (type > 0);

      FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

/*
 * Headline framework
 */
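
/* Append one source token to the headline word array, enlarging it as needed */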
static void
hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
{
      while (prs->curwords >= prs->lenwords)
      {
            prs->lenwords *= 2;
            prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
      }
      memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
      prs->words[prs->curwords].type = (uint8) type;
      prs->words[prs->curwords].len = buflen;
      prs->words[prs->curwords].word = palloc(buflen);
      memcpy(prs->words[prs->curwords].word, buf, buflen);
      prs->curwords++;
}

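/*
 * Check the lexeme in buf against all value items of the query; mark the
 * most recently added headline word with each matching query operand,
 * duplicating the word entry when it already matches another operand.
 */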
static void
hlfinditem(HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
      int               i;
      QueryItem  *item = GETQUERY(query);
      HeadlineWordEntry *word;

      while (prs->curwords + query->size >= prs->lenwords)
      {
            prs->lenwords *= 2;
            prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
      }

      word = &(prs->words[prs->curwords - 1]);
      for (i = 0; i < query->size; i++)
      {
            if (item->type == QI_VAL &&
                  tsCompareString(GETOPERAND(query) + item->operand.distance, item->operand.length,
                                          buf, buflen, item->operand.prefix) == 0)
            {
                  if (word->item)
                  {
                        memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
                        prs->words[prs->curwords].item = &item->operand;
                        prs->words[prs->curwords].repeated = 1;
                        prs->curwords++;
                  }
                  else
                        word->item = &item->operand;
            }
            item++;
      }
}

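/*
 * Add the original tokens in lexs to the headline word list and mark any
 * that match the query via their normalized forms in norms; both lists are
 * freed here.
 */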
static void
addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
{
      ParsedLex  *tmplexs;
      TSLexeme   *ptr;

      while (lexs)
      {

            if (lexs->type > 0)
                  hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

            ptr = norms;
            while (ptr && ptr->lexeme)
            {
                  hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
                  ptr++;
            }

            tmplexs = lexs->next;
            pfree(lexs);
            lexs = tmplexs;
      }

      if (norms)
      {
            ptr = norms;
            while (ptr->lexeme)
            {
                  pfree(ptr->lexeme);
                  ptr++;
            }
            pfree(norms);
      }
}

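/*
 * Parse and lexize the given text for headline generation, filling prs
 * with the original words and marking those that match the query.
 */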
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
      int               type,
                        lenlemm;
      char     *lemm = NULL;
      LexizeData  ldata;
      TSLexeme   *norms;
      ParsedLex  *lexs;
      TSConfigCacheEntry *cfg;
      TSParserCacheEntry *prsobj;
      void     *prsdata;

      cfg = lookup_ts_config_cache(cfgId);
      prsobj = lookup_ts_parser_cache(cfg->prsId);

      prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
                                                                               PointerGetDatum(buf),
                                                                               Int32GetDatum(buflen)));

      LexizeInit(&ldata, cfg);

      do
      {
            type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
                                                               PointerGetDatum(prsdata),
                                                               PointerGetDatum(&lemm),
                                                               PointerGetDatum(&lenlemm)));

            if (type > 0 && lenlemm >= MAXSTRLEN)
            {
#ifdef IGNORE_LONGLEXEME
                  ereport(NOTICE,
                              (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                               errmsg("word is too long to be indexed"),
                               errdetail("Words longer than %d characters are ignored.",
                                             MAXSTRLEN)));
                  continue;
#else
                  ereport(ERROR,
                              (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                               errmsg("word is too long to be indexed"),
                               errdetail("Words longer than %d characters are ignored.",
                                             MAXSTRLEN)));
#endif
            }

            LexizeAddLemm(&ldata, type, lemm, lenlemm);

            do
            {
                  if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
                        addHLParsedLex(prs, query, lexs, norms);
                  else
                        addHLParsedLex(prs, query, lexs, NULL);
            } while (norms);

      } while (type > 0);

      FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}

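/*
 * Assemble the headline text from the selected words in prs, wrapping
 * selected words in startsel/stopsel markers and separating fragments
 * with fragdelim.
 */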
text *
generateHeadline(HeadlineParsedText *prs)
{
      text     *out;
      char     *ptr;
      int               len = 128;
      int               numfragments = 0;
      int2        infrag = 0;

      HeadlineWordEntry *wrd = prs->words;

      out = (text *) palloc(len);
      ptr = ((char *) out) + VARHDRSZ;

      while (wrd - prs->words < prs->curwords)
      {
            while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
            {
                  int               dist = ptr - ((char *) out);

                  len *= 2;
                  out = (text *) repalloc(out, len);
                  ptr = ((char *) out) + dist;
            }

            if (wrd->in && !wrd->repeated)
            {
                  if (!infrag)
                  {

                        /* start of a new fragment */
                        infrag = 1;
                        numfragments++;
                        /* add a fragment delimiter if this is after the first one */
                        if (numfragments > 1)
                        {
                              memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
                              ptr += prs->fragdelimlen;
                        }

                  }
                  if (wrd->replace)
                  {
                        *ptr = ' ';
                        ptr++;
                  }
                  else if (!wrd->skip)
                  {
                        if (wrd->selected)
                        {
                              memcpy(ptr, prs->startsel, prs->startsellen);
                              ptr += prs->startsellen;
                        }
                        memcpy(ptr, wrd->word, wrd->len);
                        ptr += wrd->len;
                        if (wrd->selected)
                        {
                              memcpy(ptr, prs->stopsel, prs->stopsellen);
                              ptr += prs->stopsellen;
                        }
                  }
            }
            else if (!wrd->repeated)
            {
                  if (infrag)
                        infrag = 0;
                  pfree(wrd->word);
            }

            wrd++;
      }

      SET_VARSIZE(out, ptr - ((char *) out));
      return out;
}
