/*
 * Residue from the source-browsing web page this file was extracted from:
 *
 *    Logo Search packages:
 *    Sourcecode: postgresql-8.4 version File versions
 *
 *    regexp.c
 */

/*-------------------------------------------------------------------------
 *
 * regexp.c
 *      Postgres' interface to the regular expression package.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      $PostgreSQL$
 *
 *          Alistair Crooks added the code for the regex caching
 *          agc - cached the regular expressions used - there's a good chance
 *          that we'll get a hit, so this saves a compile step for every
 *          attempted match. I haven't actually measured the speed improvement,
 *          but it `looks' a lot quicker visually when watching regression
 *          test output.
 *
 *          agc - incorporated Keith Bostic's Berkeley regex code into
 *          the tree for all ports. To distinguish this regex code from any that
 *          is existent on a platform, I've prepended the string "pg_" to
 *          the functions regcomp, regerror, regexec and regfree.
 *          Fixed a bug that was originally a typo by me, where `i' was used
 *          instead of `oldest' when compiling regular expressions - benign
 *          results mostly, although occasionally it bit you...
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "catalog/pg_type.h"
#include "funcapi.h"
#include "regex/regex.h"
#include "utils/builtins.h"
#include "utils/guc.h"

/*
 * Fetch argument _n with PG_GETARG_TEXT_PP when the function was called with
 * more than _n arguments; otherwise yield NULL.  This lets a single C
 * function serve SQL signatures with and without a trailing optional
 * argument.
 */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
      (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)


/*
 * GUC-settable flavor parameter.  The operators and functions below use it
 * as the base set of compile flags for every pattern they compile.
 */
int   regex_flavor = REG_ADVANCED;


/*
 * All the options of interest for regex functions, as filled in by
 * parse_re_flags() from an option-letter string.
 */
typedef struct pg_re_flags
{
      int               cflags;                 /* compile flags for Spencer's regex code */
      bool        glob;             /* do it globally (for each occurrence) */
} pg_re_flags;

/*
 * Cross-call state for regexp_matches(), also regexp_split().
 *
 * The structure is returned by setup_regexp_matches(); regexp_matches() then
 * hands out one match per call, advancing next_match until it reaches
 * nmatches.
 */
typedef struct regexp_matches_ctx
{
      text     *orig_str;           /* data string in original TEXT form */
      int               nmatches;         /* number of places where pattern matched */
      int               npatterns;        /* number of capturing subpatterns */
      /* We store start char index and end+1 char index for each match */
      /* so the number of entries in match_locs is nmatches * npatterns * 2 */
      int            *match_locs;         /* 0-based character indexes */
      int               next_match;       /* 0-based index of next match to process */
      /* workspace for build_regexp_matches_result() */
      Datum    *elems;              /* has npatterns elements */
      bool     *nulls;              /* has npatterns elements */
} regexp_matches_ctx;

/*
 * We cache precompiled regular expressions using a "self organizing list"
 * structure, in which recently-used items tend to be near the front.
 * Whenever we use an entry, it's moved up to the front of the list.
 * Over time, an item's average position corresponds to its frequency of use.
 *
 * When we first create an entry, it's inserted at the front of
 * the array, dropping the entry at the end of the array if necessary to
 * make room.  (This might seem to be weighting the new entry too heavily,
 * but if we insert new entries further back, we'll be unable to adjust to
 * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
 * never-before-seen items used circularly.  We ought to be able to handle
 * that case, so we have to insert at the front.)
 *
 * Knuth mentions a variant strategy in which a used item is moved up just
 * one place in the list.  Although he says this uses fewer comparisons on
 * average, it seems not to adapt very well to the situation where you have
 * both some reusable patterns and a steady stream of non-reusable patterns.
 * A reusable pattern that isn't used at least as often as non-reusable
 * patterns are seen will "fail to keep up" and will drop off the end of the
 * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
 * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
 */

/*
 * This is the maximum number of cached regular expressions.  The #ifndef
 * guard lets a build supply its own value ahead of this point.
 */
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES  32
#endif

/*
 * This structure describes one cached regular expression.  A cache lookup
 * matches on the pattern bytes, their length, and the compile flags.
 */
typedef struct cached_re_str
{
      char     *cre_pat;            /* original RE (malloc'd, not null terminated!) */
      int               cre_pat_len;      /* length of original RE, in bytes */
      int               cre_flags;        /* compile flags: extended,icase etc */
      regex_t           cre_re;                 /* the compiled regular expression */
} cached_re_str;

/*
 * The cache itself.  Entries re_array[0 .. num_res-1] are valid;
 * RE_compile_and_cache() keeps the most recently used entry at index 0.
 */
static int  num_res = 0;            /* # of cached re's */
static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */


/* Local functions */
static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
                               text *flags,
                               bool force_glob,
                               bool use_subpatterns,
                               bool ignore_degenerate);
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);


/*
 * RE_compile_and_cache - return a compiled form of a pattern, using the cache
 *
 *    text_re --- the pattern, as a TEXT object in the database encoding
 *    cflags --- compile options for the pattern
 *
 * Returns a pointer to the regex_t stored in the first cache slot.  On a
 * cache hit the matching entry is rotated to the front; on a miss the
 * pattern is converted to pg_wchar (which is what Spencer's regex package
 * wants), compiled, and inserted at the front, evicting the last entry when
 * the cache is already full.
 *
 * Raises ERROR if the pattern does not compile or if memory for the saved
 * pattern text cannot be obtained.
 */
static regex_t *
RE_compile_and_cache(text *text_re, int cflags)
{
      char     *pat_bytes = VARDATA_ANY(text_re);
      int               pat_nbytes = VARSIZE_ANY_EXHDR(text_re);
      cached_re_str fresh;
      pg_wchar   *wide_pat;
      int               wide_len;
      int               rc;
      int               slot;
      char        errbuf[100];

      /*
       * Scan the cache from the front, where the most recently used entries
       * live.  An entry matches only if length, flags and bytes all agree.
       */
      for (slot = 0; slot < num_res; slot++)
      {
            cached_re_str *cand = &re_array[slot];

            if (cand->cre_pat_len != pat_nbytes)
                  continue;
            if (cand->cre_flags != cflags)
                  continue;
            if (memcmp(cand->cre_pat, pat_bytes, pat_nbytes) != 0)
                  continue;

            /* Hit: rotate the entry to the front unless it is already there */
            if (slot != 0)
            {
                  fresh = *cand;
                  memmove(&re_array[1], &re_array[0],
                              slot * sizeof(cached_re_str));
                  re_array[0] = fresh;
            }

            return &re_array[0].cre_re;
      }

      /*
       * Miss.  Build the new entry in the local "fresh" so that a failure
       * part-way through leaves the cache array untouched.
       */
      wide_pat = (pg_wchar *) palloc((pat_nbytes + 1) * sizeof(pg_wchar));
      wide_len = pg_mb2wchar_with_len(pat_bytes, wide_pat, pat_nbytes);

      rc = pg_regcomp(&fresh.cre_re, wide_pat, wide_len, cflags);

      /* the wide copy is only needed for the compile step */
      pfree(wide_pat);

      if (rc != REG_OKAY)
      {
            /* pattern was rejected; report the regex library's message */
            pg_regerror(rc, &fresh.cre_re, errbuf, sizeof(errbuf));
            /* XXX should we pg_regfree here? */
            ereport(ERROR,
                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                         errmsg("invalid regular expression: %s", errbuf)));
      }

      fresh.cre_flags = cflags;
      fresh.cre_pat_len = pat_nbytes;

      /*
       * The saved pattern text is malloc'd rather than palloc'd: it must
       * outlive the current transaction, and we want a NULL return (not an
       * error exit) on out-of-memory so the compiled RE can be released.
       * Max() guards against malloc(0) returning NULL on some platforms.
       */
      fresh.cre_pat = malloc(Max(pat_nbytes, 1));
      if (fresh.cre_pat == NULL)
      {
            pg_regfree(&fresh.cre_re);
            ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory")));
      }
      memcpy(fresh.cre_pat, pat_bytes, pat_nbytes);

      /* Cache full?  Release the entry at the tail to make room. */
      if (num_res >= MAX_CACHED_RES)
      {
            cached_re_str *victim;

            num_res--;
            Assert(num_res < MAX_CACHED_RES);
            victim = &re_array[num_res];
            pg_regfree(&victim->cre_re);
            free(victim->cre_pat);
      }

      /* Shift the surviving entries down one place and install at the front */
      if (num_res > 0)
            memmove(&re_array[1], &re_array[0],
                        num_res * sizeof(cached_re_str));

      re_array[0] = fresh;
      num_res++;

      return &re_array[0].cre_re;
}

/*
 * RE_wchar_execute - run a compiled RE against pg_wchar data
 *
 *    re --- the compiled pattern as returned by RE_compile_and_cache
 *    data --- the data to match against (need not be null-terminated)
 *    data_len --- the length of the data string
 *    start_search -- the offset in the data to start searching
 *    nmatch, pmatch    --- optional return area for match details
 *
 * Returns TRUE on match, FALSE on no match.  Any other result from the
 * regex engine is reported as an ERROR.
 *
 * Data is given as array of pg_wchar which is what Spencer's regex package
 * wants.
 */
static bool
RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
                         int start_search, int nmatch, regmatch_t *pmatch)
{
      int               rc;
      char        errbuf[100];

      rc = pg_regexec(re, data, data_len, start_search,
                              NULL,       /* no details */
                              nmatch, pmatch,
                              0);

      if (rc == REG_OKAY)
            return true;
      if (rc == REG_NOMATCH)
            return false;

      /* anything else means the engine itself failed */
      pg_regerror(rc, re, errbuf, sizeof(errbuf));
      ereport(ERROR,
                  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                   errmsg("regular expression failed: %s", errbuf)));

      return false;                 /* not reached; keeps the compiler quiet */
}

/*
 * RE_execute - run a compiled RE against a string in the database encoding
 *
 *    re --- the compiled pattern as returned by RE_compile_and_cache
 *    dat --- the data to match against (need not be null-terminated)
 *    dat_len --- the length of the data string
 *    nmatch, pmatch    --- optional return area for match details
 *
 * Returns TRUE on match, FALSE on no match.
 *
 * The input is converted to a temporary pg_wchar array, which is what
 * Spencer's regex package wants; the search always starts at offset 0.
 */
static bool
RE_execute(regex_t *re, char *dat, int dat_len,
               int nmatch, regmatch_t *pmatch)
{
      pg_wchar   *wide;
      int               wide_len;
      bool        matched;

      /* worst case is one wide character per input byte, plus a spare slot */
      wide = (pg_wchar *) palloc(sizeof(pg_wchar) * (dat_len + 1));
      wide_len = pg_mb2wchar_with_len(dat, wide, dat_len);

      matched = RE_wchar_execute(re, wide, wide_len, 0, nmatch, pmatch);

      pfree(wide);

      return matched;
}

/*
 * RE_compile_and_execute - compile (or fetch from cache) a RE and run it
 *
 *    text_re --- the pattern, expressed as a TEXT object
 *    dat --- the data to match against (need not be null-terminated)
 *    dat_len --- the length of the data string
 *    cflags --- compile options for the pattern
 *    nmatch, pmatch    --- optional return area for match details
 *
 * Returns TRUE on match, FALSE on no match.
 *
 * Both pattern and data are given in the database encoding; the conversion
 * to pg_wchar happens inside RE_compile_and_cache and RE_execute.
 */
static bool
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
                                 int cflags, int nmatch, regmatch_t *pmatch)
{
      return RE_execute(RE_compile_and_cache(text_re, cflags),
                                dat, dat_len,
                                nmatch, pmatch);
}


/*
 * parse_re_flags - parse the options argument of regexp_matches and friends
 *
 *    flags --- output argument, filled with desired options
 *    opts --- TEXT object, or NULL for defaults
 *
 * The result starts out as the current regex_flavor setting with "glob"
 * off.  Option letters are then applied left to right, so a later letter
 * overrides an earlier conflicting one.  An unrecognized letter raises
 * ERROR with ERRCODE_INVALID_PARAMETER_VALUE.
 *
 * This accepts all the options allowed by any of the callers; callers that
 * don't want some have to reject them after the fact.
 */
static void
parse_re_flags(pg_re_flags *flags, text *opts)
{
      /* regex_flavor is always folded into the compile flags */
      flags->cflags = regex_flavor;
      flags->glob = false;

      if (opts)
      {
            char     *opt_p = VARDATA_ANY(opts);
            int               opt_len = VARSIZE_ANY_EXHDR(opts);
            int               i;

            for (i = 0; i < opt_len; i++)
            {
                  switch (opt_p[i])
                  {
                        case 'g':
                              flags->glob = true;
                              break;
                        case 'b':         /* BREs (but why???) */
                              flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
                              break;
                        case 'c':         /* case sensitive */
                              flags->cflags &= ~REG_ICASE;
                              break;
                        case 'e':         /* plain EREs */

                              /*
                               * Clear the other flavor bits first and set
                               * REG_EXTENDED afterwards.  In regex/regex.h
                               * REG_ADVANCED includes the REG_EXTENDED bit, so
                               * clearing REG_ADVANCED after setting
                               * REG_EXTENDED would wipe it out again and leave
                               * us with a BRE.
                               */
                              flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
                              flags->cflags |= REG_EXTENDED;
                              break;
                        case 'i':         /* case insensitive */
                              flags->cflags |= REG_ICASE;
                              break;
                        case 'm':         /* Perloid synonym for n */
                        case 'n':         /* \n affects ^ $ . [^ */
                              flags->cflags |= REG_NEWLINE;
                              break;
                        case 'p':         /* ~Perl, \n affects . [^ */
                              flags->cflags |= REG_NLSTOP;
                              flags->cflags &= ~REG_NLANCH;
                              break;
                        case 'q':         /* literal string */
                              flags->cflags |= REG_QUOTE;
                              flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
                              break;
                        case 's':         /* single line, \n ordinary */
                              flags->cflags &= ~REG_NEWLINE;
                              break;
                        case 't':         /* tight syntax */
                              flags->cflags &= ~REG_EXPANDED;
                              break;
                        case 'w':         /* weird, \n affects ^ $ only */
                              flags->cflags &= ~REG_NLSTOP;
                              flags->cflags |= REG_NLANCH;
                              break;
                        case 'x':         /* expanded syntax */
                              flags->cflags |= REG_EXPANDED;
                              break;
                        default:
                              ereport(ERROR,
                                          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                           errmsg("invalid regexp option: \"%c\"",
                                                      opt_p[i])));
                              break;
                  }
            }
      }
}


/*
 * regex_flavor_is_basic - is the regex_flavor setting currently REG_BASIC?
 */
bool
regex_flavor_is_basic(void)
{
      bool        is_basic = (regex_flavor == REG_BASIC);

      return is_basic;
}


/*
 *    interface routines called by the function manager
 */

Datum
nameregexeq(PG_FUNCTION_ARGS)
{
      Name        n = PG_GETARG_NAME(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(RE_compile_and_execute(p,
                                                              NameStr(*n),
                                                              strlen(NameStr(*n)),
                                                              regex_flavor,
                                                              0, NULL));
}

Datum
nameregexne(PG_FUNCTION_ARGS)
{
      Name        n = PG_GETARG_NAME(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(!RE_compile_and_execute(p,
                                                               NameStr(*n),
                                                               strlen(NameStr(*n)),
                                                               regex_flavor,
                                                               0, NULL));
}

Datum
textregexeq(PG_FUNCTION_ARGS)
{
      text     *s = PG_GETARG_TEXT_PP(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(RE_compile_and_execute(p,
                                                              VARDATA_ANY(s),
                                                              VARSIZE_ANY_EXHDR(s),
                                                              regex_flavor,
                                                              0, NULL));
}

Datum
textregexne(PG_FUNCTION_ARGS)
{
      text     *s = PG_GETARG_TEXT_PP(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(!RE_compile_and_execute(p,
                                                               VARDATA_ANY(s),
                                                               VARSIZE_ANY_EXHDR(s),
                                                               regex_flavor,
                                                               0, NULL));
}


/*
 *    routines that use the regexp stuff, but ignore the case.
 *    for this, we use the REG_ICASE flag to pg_regcomp
 */


Datum
nameicregexeq(PG_FUNCTION_ARGS)
{
      Name        n = PG_GETARG_NAME(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(RE_compile_and_execute(p,
                                                              NameStr(*n),
                                                              strlen(NameStr(*n)),
                                                              regex_flavor | REG_ICASE,
                                                              0, NULL));
}

Datum
nameicregexne(PG_FUNCTION_ARGS)
{
      Name        n = PG_GETARG_NAME(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(!RE_compile_and_execute(p,
                                                               NameStr(*n),
                                                               strlen(NameStr(*n)),
                                                               regex_flavor | REG_ICASE,
                                                               0, NULL));
}

Datum
texticregexeq(PG_FUNCTION_ARGS)
{
      text     *s = PG_GETARG_TEXT_PP(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(RE_compile_and_execute(p,
                                                              VARDATA_ANY(s),
                                                              VARSIZE_ANY_EXHDR(s),
                                                              regex_flavor | REG_ICASE,
                                                              0, NULL));
}

Datum
texticregexne(PG_FUNCTION_ARGS)
{
      text     *s = PG_GETARG_TEXT_PP(0);
      text     *p = PG_GETARG_TEXT_PP(1);

      PG_RETURN_BOOL(!RE_compile_and_execute(p,
                                                               VARDATA_ANY(s),
                                                               VARSIZE_ANY_EXHDR(s),
                                                               regex_flavor | REG_ICASE,
                                                               0, NULL));
}


/*
 * textregexsubstr()
 *          Return the part of a string matched by a regular expression.
 *
 * If the pattern contains a parenthesized subexpression, the text matched
 * by the first one is returned; otherwise the text matched by the whole
 * pattern.  The result is NULL when there is no match at all, or when the
 * selected subexpression did not take part in the match.
 */
Datum
textregexsubstr(PG_FUNCTION_ARGS)
{
      text     *subject = PG_GETARG_TEXT_PP(0);
      text     *pattern = PG_GETARG_TEXT_PP(1);
      regex_t    *compiled;
      regmatch_t  locs[2];
      regmatch_t *chosen;
      int               start,
                        stop;

      compiled = RE_compile_and_cache(pattern, regex_flavor);

      /*
       * Ask for two regmatch_t entries: [0] is the overall match, [1] the
       * first parenthesized subexpression (if the pattern has one).
       */
      if (!RE_execute(compiled,
                              VARDATA_ANY(subject), VARSIZE_ANY_EXHDR(subject),
                              2, locs))
            PG_RETURN_NULL();       /* definitely no match */

      /* prefer the first subexpression when the pattern has any */
      chosen = (compiled->re_nsub > 0) ? &locs[1] : &locs[0];
      start = chosen->rm_so;
      stop = chosen->rm_eo;

      /*
       * The pattern as a whole can match while the subexpression does not:
       * 'foo(bar)?' matches 'foo' with nothing captured.  Hence this second
       * failure check is needed.
       */
      if (start < 0 || stop < 0)
            PG_RETURN_NULL();

      /* text_substr takes a 1-based start position and a length */
      return DirectFunctionCall3(text_substr,
                                             PointerGetDatum(subject),
                                             Int32GetDatum(start + 1),
                                             Int32GetDatum(stop - start));
}

/*
 * textregexreplace_noopt()
 *          Replace the part of a string matched by a regular expression.
 *
 * Three-argument form without an options string: the pattern is compiled
 * with the current regex_flavor and only the first match is replaced.
 */
Datum
textregexreplace_noopt(PG_FUNCTION_ARGS)
{
      text     *subject = PG_GETARG_TEXT_PP(0);
      text     *pattern = PG_GETARG_TEXT_PP(1);
      text     *replacement = PG_GETARG_TEXT_PP(2);
      regex_t    *compiled = RE_compile_and_cache(pattern, regex_flavor);
      text     *replaced;

      replaced = replace_text_regexp(subject, (void *) compiled,
                                                   replacement, false);

      PG_RETURN_TEXT_P(replaced);
}

/*
 * textregexreplace()
 *          Replace the part(s) of a string matched by a regular expression.
 *
 * Four-argument form: the last argument is an option-letter string decoded
 * by parse_re_flags(), which supplies both the compile flags and whether
 * every match (rather than just the first) is replaced.
 */
Datum
textregexreplace(PG_FUNCTION_ARGS)
{
      text     *subject = PG_GETARG_TEXT_PP(0);
      text     *pattern = PG_GETARG_TEXT_PP(1);
      text     *replacement = PG_GETARG_TEXT_PP(2);
      text     *options = PG_GETARG_TEXT_PP(3);
      pg_re_flags parsed;
      regex_t    *compiled;
      text     *replaced;

      parse_re_flags(&parsed, options);
      compiled = RE_compile_and_cache(pattern, parsed.cflags);

      replaced = replace_text_regexp(subject, (void *) compiled,
                                                   replacement, parsed.glob);

      PG_RETURN_TEXT_P(replaced);
}

/*
 * similar_escape()
 * Translate a SQL99 regexp pattern into the POSIX style understood by
 * our regexp engine.
 *
 * Argument 0 is the pattern, argument 1 the ESCAPE string.  A NULL pattern
 * yields NULL; a NULL escape means "use backslash"; an empty escape means
 * "no escape character"; an escape longer than one byte is an error.
 */
Datum
similar_escape(PG_FUNCTION_ARGS)
{
      /*----------
       * The translated pattern is wrapped as
       *                ***:^(?: ... )$
       * "***:" is a director prefix that forces ARE interpretation no matter
       * what regex_flavor says.  "^" and "$" make the pattern match the whole
       * input string, as the SQL99 spec requires.  The "(?:" ... ")" pair is
       * needed so that a "|" inside the pattern cannot capture the anchors
       * into just the first and last alternatives, and it is non-capturing
       * so that it does not count when SUBSTRING picks its output.
       *----------
       */
      static const char prefix[] = "***:^(?:";
      static const char suffix[] = ")$";
      text     *pat_text;
      text     *result;
      char     *src;
      char     *src_end;
      char     *dst;
      int               plen;
      bool        have_escape = true;
      char        esc_char = '\\';
      bool        afterescape = false;
      int               nquotes = 0;

      /* This function is not strict, so must test explicitly */
      if (PG_ARGISNULL(0))
            PG_RETURN_NULL();
      pat_text = PG_GETARG_TEXT_PP(0);
      src = VARDATA_ANY(pat_text);
      plen = VARSIZE_ANY_EXHDR(pat_text);
      src_end = src + plen;

      /* With no ESCAPE clause the defaults above (backslash) stay in force */
      if (!PG_ARGISNULL(1))
      {
            text     *esc_text = PG_GETARG_TEXT_PP(1);
            char     *esc_data = VARDATA_ANY(esc_text);
            int               esc_len = VARSIZE_ANY_EXHDR(esc_text);

            if (esc_len == 0)
            {
                  /* no escape character */
                  have_escape = false;
            }
            else if (esc_len != 1)
            {
                  ereport(ERROR,
                              (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                               errmsg("invalid escape string"),
                          errhint("Escape string must be empty or one character.")));
            }
            else
                  esc_char = esc_data[0];
      }

      /*
       * Room for the prefix and postfix (10 bytes together) plus at most two
       * output bytes per input byte.
       */
      result = (text *) palloc(VARHDRSZ + 10 + 2 * plen);
      dst = VARDATA(result);

      memcpy(dst, prefix, sizeof(prefix) - 1);
      dst += sizeof(prefix) - 1;

      for (; src < src_end; src++)
      {
            char        c = *src;

            if (afterescape)
            {
                  afterescape = false;
                  if (c == '"')
                  {
                        /* for SUBSTRING patterns: alternate "(" and ")" */
                        *dst++ = ((nquotes % 2) == 0) ? '(' : ')';
                        nquotes++;
                  }
                  else
                  {
                        *dst++ = '\\';
                        *dst++ = c;
                  }
                  continue;
            }

            if (have_escape && c == esc_char)
            {
                  /* SQL99 escape character; do not send to output */
                  afterescape = true;
                  continue;
            }

            switch (c)
            {
                  case '%':
                        *dst++ = '.';
                        *dst++ = '*';
                        break;
                  case '_':
                        *dst++ = '.';
                        break;
                  case '\\':
                  case '.':
                  case '?':
                  case '{':
                        /* backslash-quote these on output */
                        *dst++ = '\\';
                        *dst++ = c;
                        break;
                  default:
                        *dst++ = c;
                        break;
            }
      }

      memcpy(dst, suffix, sizeof(suffix) - 1);
      dst += sizeof(suffix) - 1;

      SET_VARSIZE(result, dst - ((char *) result));

      PG_RETURN_TEXT_P(result);
}

/*
 * regexp_matches()
 *          Set-returning function: one result array per match of a pattern
 *          within a string.
 *
 * Arguments are the string, the pattern, and (optionally) an option-letter
 * string.  All matching is done on the first call; subsequent calls just
 * hand back the next stored match.
 */
Datum
regexp_matches(PG_FUNCTION_ARGS)
{
      FuncCallContext *srf;
      regexp_matches_ctx *state;
      ArrayType  *row;

      if (SRF_IS_FIRSTCALL())
      {
            text     *re_text = PG_GETARG_TEXT_PP(1);
            text     *opt_text = PG_GETARG_TEXT_PP_IF_EXISTS(2);
            MemoryContext caller_ctx;

            srf = SRF_FIRSTCALL_INIT();
            caller_ctx = MemoryContextSwitchTo(srf->multi_call_memory_ctx);

            /* the input string is copied while in the multi-call context */
            state = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), re_text,
                                                       opt_text, false, true, false);

            /* workspace that build_regexp_matches_result expects to find */
            state->elems = (Datum *) palloc(state->npatterns * sizeof(Datum));
            state->nulls = (bool *) palloc(state->npatterns * sizeof(bool));

            srf->user_fctx = (void *) state;

            MemoryContextSwitchTo(caller_ctx);
      }

      srf = SRF_PERCALL_SETUP();
      state = (regexp_matches_ctx *) srf->user_fctx;

      if (state->next_match >= state->nmatches)
      {
            /* release space in multi-call ctx to avoid intraquery memory leak */
            cleanup_regexp_matches(state);

            SRF_RETURN_DONE(srf);
      }

      /* still have a stored match to hand out */
      row = build_regexp_matches_result(state);
      state->next_match++;
      SRF_RETURN_NEXT(srf, PointerGetDatum(row));
}

/*
 * Two-argument entry point for regexp_matches().  It exists as a separate
 * function only to keep the opr_sanity regression test from complaining;
 * all the work is delegated.
 */
Datum
regexp_matches_no_flags(PG_FUNCTION_ARGS)
{
      Datum       result = regexp_matches(fcinfo);

      return result;
}

/*
 * setup_regexp_matches --- do the initial matching for regexp_matches()
 *          or regexp_split()
 *
 * To avoid having to re-find the compiled pattern on each call, we do
 * all the matching in one swoop.  The returned regexp_matches_ctx contains
 * the locations of all the substrings matching the pattern.
 *
 * Parameters:
 *    orig_str          string to search; the pointer (not a copy) is saved in
 *                      the result, so it must live as long as the result does
 *    pattern           regular expression source text
 *    flags             option letters, or NULL if the caller had no flags
 *                      argument
 *    force_glob        find all matches regardless of flags, and reject an
 *                      explicit 'g' flag with an error
 *    use_subpatterns   if the pattern has parenthesized subexpressions,
 *                      record their locations instead of the whole match's
 *    ignore_degenerate drop zero-length matches at the start or end of the
 *                      string or just after a previous match
 *
 * The three bool parameters have only two patterns (one for each caller)
 * but it seems clearer to distinguish the functionality this way than to
 * key it all off one "is_split" flag.
 *
 * Result layout: match_locs holds, for each of the nmatches matches,
 * npatterns (start, end) pairs.  Offsets are measured in characters of the
 * pg_wchar form of orig_str, not in bytes.  Everything in the result is
 * palloc'd, so callers that need it to survive across calls must switch
 * memory contexts before calling.
 */
static regexp_matches_ctx *
setup_regexp_matches(text *orig_str, text *pattern, text *flags,
                               bool force_glob, bool use_subpatterns,
                               bool ignore_degenerate)
{
      regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
      int               orig_len;
      pg_wchar   *wide_str;
      int               wide_len;
      pg_re_flags re_flags;
      regex_t    *cpattern;
      regmatch_t *pmatch;
      int               pmatch_len;
      int               array_len;
      int               array_idx;
      int               prev_match_end;
      int               start_search;

      /* save original string --- we'll extract result substrings from it */
      matchctx->orig_str = orig_str;

      /* convert string to pg_wchar form for matching */
      orig_len = VARSIZE_ANY_EXHDR(orig_str);
      wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
      wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

      /* determine options */
      parse_re_flags(&re_flags, flags);
      if (force_glob)
      {
            /* user mustn't specify 'g' for regexp_split */
            if (re_flags.glob)
                  ereport(ERROR,
                              (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("regexp_split does not support the global option")));
            /* but we find all the matches anyway */
            re_flags.glob = true;
      }

      /* set up the compiled pattern */
      cpattern = RE_compile_and_cache(pattern, re_flags.cflags);

      /* do we want to remember subpatterns? */
      if (use_subpatterns && cpattern->re_nsub > 0)
      {
            matchctx->npatterns = cpattern->re_nsub;
            /* slot 0 of pmatch is the whole match, hence the extra entry */
            pmatch_len = cpattern->re_nsub + 1;
      }
      else
      {
            use_subpatterns = false;
            matchctx->npatterns = 1;
            pmatch_len = 1;
      }

      /* temporary output space for RE package */
      pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

      /* the real output space (grown dynamically if needed) */
      array_len = re_flags.glob ? 256 : 32;
      matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
      array_idx = 0;

      /* search for the pattern, perhaps repeatedly */
      prev_match_end = 0;
      start_search = 0;
      while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
                                          pmatch_len, pmatch))
      {
            /*
             * If requested, ignore degenerate matches, which are zero-length
             * matches occurring at the start or end of a string or just after a
             * previous match.
             */
            if (!ignore_degenerate ||
                  (pmatch[0].rm_so < wide_len &&
                   pmatch[0].rm_eo > prev_match_end))
            {
                  /* enlarge output space if needed */
                  while (array_idx + matchctx->npatterns * 2 > array_len)
                  {
                        array_len *= 2;
                        matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
                                                                              sizeof(int) * array_len);
                  }

                  /* save this match's locations */
                  if (use_subpatterns)
                  {
                        int               i;

                        for (i = 1; i <= matchctx->npatterns; i++)
                        {
                              matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
                              matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
                        }
                  }
                  else
                  {
                        matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
                        matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
                  }
                  matchctx->nmatches++;
            }
            prev_match_end = pmatch[0].rm_eo;

            /* if not glob, stop after one match */
            if (!re_flags.glob)
                  break;

            /*
             * Advance search position.  Normally we start the next search at
             * the end of the previous match; but if the match was of zero
             * length, we have to advance by one character, or we'd just find
             * the same match again.  (Testing only whether start_search had
             * already reached the match end is not sufficient: a zero-length
             * match located beyond the old search start would be reported
             * twice.)
             */
            start_search = prev_match_end;
            if (pmatch[0].rm_so == pmatch[0].rm_eo)
                  start_search++;
            if (start_search > wide_len)
                  break;
      }

      /* Clean up temp storage */
      pfree(wide_str);
      pfree(pmatch);

      return matchctx;
}

/*
 * cleanup_regexp_matches - release memory of a regexp_matches_ctx
 *
 * Frees the context struct along with everything it points to, including
 * the saved input string.  The elems/nulls workspace arrays are optional:
 * the struct is zero-filled at creation and only some callers allocate
 * them, so they are released only when present.
 */
static void
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
{
      /* optional per-match workspace */
      if (matchctx->nulls != NULL)
            pfree(matchctx->nulls);
      if (matchctx->elems != NULL)
            pfree(matchctx->elems);

      /* always-present pieces, then the container itself */
      pfree(matchctx->match_locs);
      pfree(matchctx->orig_str);
      pfree(matchctx);
}

/*
 * build_regexp_matches_result - build output array for current match
 *
 * Reads the npatterns (start, end) location pairs belonging to match number
 * next_match, converts each into a text substring of the original string
 * (or a NULL element when either offset is negative), and returns them as a
 * one-dimensional text array with lower bound 1.  The caller-provided
 * elems/nulls workspace in matchctx is overwritten on every call.
 */
static ArrayType *
build_regexp_matches_result(regexp_matches_ctx *matchctx)
{
      int               base = matchctx->next_match * matchctx->npatterns * 2;
      int               dims[1];
      int               lbs[1];
      int               k;

      /* Pull out one substring (or NULL) per recorded pattern */
      for (k = 0; k < matchctx->npatterns; k++)
      {
            int               start = matchctx->match_locs[base + 2 * k];
            int               end = matchctx->match_locs[base + 2 * k + 1];

            if (start >= 0 && end >= 0)
            {
                  /* text_substr wants a 1-based start and a length */
                  matchctx->elems[k] = DirectFunctionCall3(text_substr,
                                                             PointerGetDatum(matchctx->orig_str),
                                                             Int32GetDatum(start + 1),
                                                             Int32GetDatum(end - start));
                  matchctx->nulls[k] = false;
            }
            else
            {
                  matchctx->elems[k] = (Datum) 0;
                  matchctx->nulls[k] = true;
            }
      }

      /* Assemble the 1-D array */
      lbs[0] = 1;
      dims[0] = matchctx->npatterns;
      /* XXX: this hardcodes assumptions about the text type */
      return construct_md_array(matchctx->elems, matchctx->nulls, 1, dims, lbs,
                                            TEXTOID, -1, false, 'i');
}

/*
 * regexp_split_to_table()
 *          Split the string at matches of the pattern, returning the
 *          split-out substrings as a table.
 *
 * All matching is done on the first call; each call then emits one piece.
 * There are nmatches + 1 pieces: one before each match plus the remainder
 * after the last match.
 */
Datum
regexp_split_to_table(PG_FUNCTION_ARGS)
{
      FuncCallContext *funcctx;
      regexp_matches_ctx *state;
      Datum       piece;

      if (SRF_IS_FIRSTCALL())
      {
            MemoryContext caller_ctx;
            text     *re_text = PG_GETARG_TEXT_PP(1);
            text     *re_opts = PG_GETARG_TEXT_PP_IF_EXISTS(2);

            funcctx = SRF_FIRSTCALL_INIT();
            caller_ctx = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

            /* be sure to copy the input string into the multi-call ctx */
            state = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), re_text,
                                                    re_opts, true, false, true);
            funcctx->user_fctx = (void *) state;

            MemoryContextSwitchTo(caller_ctx);
      }

      funcctx = SRF_PERCALL_SETUP();
      state = (regexp_matches_ctx *) funcctx->user_fctx;

      if (state->next_match > state->nmatches)
      {
            /* release space in multi-call ctx to avoid intraquery memory leak */
            cleanup_regexp_matches(state);

            SRF_RETURN_DONE(funcctx);
      }

      /* still have a piece to hand back */
      piece = build_regexp_split_result(state);
      state->next_match++;
      SRF_RETURN_NEXT(funcctx, piece);
}

/*
 * regexp_split_to_table_no_flags()
 *          Pass-through entry point to regexp_split_to_table(), which treats
 *          a missing third argument as "no flags".
 *
 * It exists as a distinct C function only to keep the opr_sanity
 * regression test from complaining.
 */
Datum
regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
{
      Datum       result = regexp_split_to_table(fcinfo);

      return result;
}

/*
 * regexp_split_to_array()
 *          Split the string at matches of the pattern, returning the
 *          split-out substrings as an array.
 *
 * The result has nmatches + 1 text elements: the piece before each match,
 * followed by whatever remains after the last match.
 */
Datum
regexp_split_to_array(PG_FUNCTION_ARGS)
{
      ArrayBuildState *accum = NULL;
      regexp_matches_ctx *ctx;

      ctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
                                           PG_GETARG_TEXT_PP(1),
                                           PG_GETARG_TEXT_PP_IF_EXISTS(2),
                                           true, false, true);

      /* append one array element per split piece */
      for (; ctx->next_match <= ctx->nmatches; ctx->next_match++)
      {
            Datum       piece = build_regexp_split_result(ctx);

            accum = accumArrayResult(accum, piece, false,
                                                 TEXTOID, CurrentMemoryContext);
      }

      /*
       * We don't call cleanup_regexp_matches here; it would try to pfree the
       * input string, which we didn't copy.  The space is not in a long-lived
       * memory context anyway.
       */

      PG_RETURN_ARRAYTYPE_P(makeArrayResult(accum, CurrentMemoryContext));
}

/*
 * regexp_split_to_array_no_flags()
 *          Pass-through entry point to regexp_split_to_array(), which treats
 *          a missing third argument as "no flags".
 *
 * It exists as a distinct C function only to keep the opr_sanity
 * regression test from complaining.
 */
Datum
regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
{
      Datum       result = regexp_split_to_array(fcinfo);

      return result;
}

/*
 * build_regexp_split_result - build output string for current match
 *
 * The piece returned runs from the end of the previous match (or the start
 * of the string when next_match is 0) up to the start of match number
 * next_match.  Once next_match == nmatches there is no further match, and
 * the remainder of the string is returned instead.
 */
static Datum
build_regexp_split_result(regexp_matches_ctx *splitctx)
{
      int               cur = splitctx->next_match;
      int               from;
      int               upto;

      /* piece begins where the preceding match ended, if there was one */
      from = (cur > 0) ? splitctx->match_locs[cur * 2 - 1] : 0;
      if (from < 0)
            elog(ERROR, "invalid match ending position");

      if (cur >= splitctx->nmatches)
      {
            /* no more matches, return rest of string */
            return DirectFunctionCall2(text_substr_no_len,
                                                   PointerGetDatum(splitctx->orig_str),
                                                   Int32GetDatum(from + 1));
      }

      /* piece ends where the current match begins */
      upto = splitctx->match_locs[cur * 2];
      if (upto < from)
            elog(ERROR, "invalid match starting position");

      return DirectFunctionCall3(text_substr,
                                             PointerGetDatum(splitctx->orig_str),
                                             Int32GetDatum(from + 1),
                                             Int32GetDatum(upto - from));
}

Generated by  Doxygen 1.6.0   Back to index