/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *      dynamic hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory.  For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking.  The simplest convention is
 * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock.  For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table.  To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create.  This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * A partitioned table uses a spinlock to guard changes of those two fields.
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition.  We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.79 2009/01/01 17:23:51 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
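
/*
 * A minimal sketch of the partitioned calling pattern described above.
 * NUM_MY_PARTITIONS and my_partition_locks are hypothetical names (real
 * callers such as the lock manager follow this shape with their own):
 *
 *      uint32  hashcode = get_hash_value(htab, &key);
 *      int     partition = hashcode % NUM_MY_PARTITIONS;
 *
 *      LWLockAcquire(my_partition_locks[partition], LW_EXCLUSIVE);
 *      entry = hash_search_with_hash_value(htab, &key, hashcode,
 *                                          HASH_ENTER, &found);
 *      ... fill in entry ...
 *      LWLockRelease(my_partition_locks[partition]);
 *
 * Because NUM_MY_PARTITIONS must be a power of 2, the modulo selects the
 * hash value's low-order bits, matching calc_bucket()'s mapping.
 */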

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * DEBUG controls some informative traces, mainly for debugging.
 * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
 * when combined with HASH_DEBUG, these are displayed by hdestroy().
 *
 * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimisation'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *          added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *          changed ctl structure for shared memory
 */

#include "postgres.h"

#include "access/xact.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"


/*
 * Constants
 *
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.      The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE                    256
#define DEF_SEGSIZE_SHIFT        8  /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE                    256
#define DEF_FFACTOR                    1  /* default fill factor */
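
/*
 * Worked example with the defaults above: a directory of DEF_DIRSIZE = 256
 * entries, each pointing to a segment of DEF_SEGSIZE = 256 buckets,
 * addresses 256 * 256 = 65536 buckets before the directory itself must be
 * enlarged.  And with DEF_FFACTOR = 1, bucket splits aim to keep roughly
 * one entry per bucket.
 */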


/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
      /* In a partitioned table, take this lock to touch nentries or freeList */
      slock_t           mutex;                  /* unused if not partitioned table */

      /* These fields change during entry addition/deletion */
      long        nentries;         /* number of entries in hash table */
      HASHELEMENT *freeList;        /* linked list of free elements */

      /* These fields can change, but not in a partitioned table */
      /* Also, dsize can't change in a shared table, even if unpartitioned */
      long        dsize;                  /* directory size */
      long        nsegs;                  /* number of allocated segments (<= dsize) */
      uint32            max_bucket;       /* ID of maximum bucket in use */
      uint32            high_mask;        /* mask to modulo into entire table */
      uint32            low_mask;         /* mask to modulo into lower half of table */

      /* These fields are fixed at hashtable creation */
      Size        keysize;          /* hash key length in bytes */
      Size        entrysize;        /* total user element size in bytes */
      long        num_partitions; /* # partitions (must be power of 2), or 0 */
      long        ffactor;          /* target fill factor */
      long        max_dsize;        /* 'dsize' limit if directory is fixed size */
      long        ssize;                  /* segment size --- must be power of 2 */
      int               sshift;                 /* segment shift = log2(ssize) */
      int               nelem_alloc;      /* number of entries to allocate at once */

#ifdef HASH_STATISTICS

      /*
       * Count statistics here.  NB: stats code doesn't bother with mutex, so
       * counts could be corrupted a bit in a partitioned table.
       */
      long        accesses;
      long        collisions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
      HASHHDR    *hctl;             /* => shared control information */
      HASHSEGMENT *dir;             /* directory of segment starts */
      HashValueFunc hash;                 /* hash function */
      HashCompareFunc match;        /* key comparison function */
      HashCopyFunc keycopy;         /* key copying function */
      HashAllocFunc alloc;          /* memory allocator */
      MemoryContext hcxt;                 /* memory context if default allocator used */
      char     *tabname;            /* table name (for error messages) */
      bool        isshared;         /* true if table is in shared memory */

      /* freezing a shared table isn't allowed, so we can keep state here */
      bool        frozen;                 /* true = no more inserts allowed */

      /* We keep local copies of these fixed values to reduce contention */
      Size        keysize;          /* hash key length in bytes */
      long        ssize;                  /* segment size --- must be power of 2 */
      int               sshift;                 /* segment shift = log2(ssize) */
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)                 ((x) & ((y)-1))

#if HASH_STATISTICS
static long hash_accesses,
                  hash_collisions,
                  hash_expansions;
#endif

/*
 * Private function prototypes
 */
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp);
static void hdefault(HTAB *hashp);
static int  choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
static void hash_corrupted(HTAB *hashp);
static void register_seq_scan(HTAB *hashp);
static void deregister_seq_scan(HTAB *hashp);
static bool has_seq_scans(HTAB *hashp);


/*
 * memory allocation support
 */
static MemoryContext CurrentDynaHashCxt = NULL;

static void *
DynaHashAlloc(Size size)
{
      Assert(MemoryContextIsValid(CurrentDynaHashCxt));
      return MemoryContextAlloc(CurrentDynaHashCxt, size);
}


/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
      return strncmp(key1, key2, keysize - 1);
}
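
/*
 * For instance, with keysize = 4, strlcpy stores at most 3 characters plus
 * a terminating NUL, so the keys "abcd" and "abcz" are both stored as "abc"
 * and must be treated as equal --- hence comparing only keysize - 1 bytes.
 */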


/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 *    tabname: a name for the table (for debugging purposes)
 *    nelem: maximum number of elements expected
 *    *info: additional table parameters, as indicated by flags
 *    flags: bitmask indicating which parameters to take from *info
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
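/*
 * A minimal usage sketch, where MyEntry is a hypothetical caller-defined
 * struct whose first member is an Oid key:
 *
 *      HASHCTL         info;
 *      HTAB       *htab;
 *
 *      MemSet(&info, 0, sizeof(info));
 *      info.keysize = sizeof(Oid);
 *      info.entrysize = sizeof(MyEntry);
 *      info.hash = oid_hash;
 *      htab = hash_create("My lookup table", 128, &info,
 *                         HASH_ELEM | HASH_FUNCTION);
 */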
HTAB *
hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
{
      HTAB     *hashp;
      HASHHDR    *hctl;

      /*
       * For shared hash tables, we have a local hash header (HTAB struct) that
       * we allocate in TopMemoryContext; all else is in shared memory.
       *
       * For non-shared hash tables, everything including the hash header is in
       * a memory context created specially for the hash table --- this makes
       * hash_destroy very simple.  The memory context is made a child of either
       * a context specified by the caller, or TopMemoryContext if nothing is
       * specified.
       */
      if (flags & HASH_SHARED_MEM)
      {
            /* Set up to allocate the hash header */
            CurrentDynaHashCxt = TopMemoryContext;
      }
      else
      {
            /* Create the hash table's private memory context */
            if (flags & HASH_CONTEXT)
                  CurrentDynaHashCxt = info->hcxt;
            else
                  CurrentDynaHashCxt = TopMemoryContext;
            CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
                                                                           tabname,
                                                                           ALLOCSET_DEFAULT_MINSIZE,
                                                                           ALLOCSET_DEFAULT_INITSIZE,
                                                                           ALLOCSET_DEFAULT_MAXSIZE);
      }

      /* Initialize the hash header, plus a copy of the table name */
      hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) +1);
      MemSet(hashp, 0, sizeof(HTAB));

      hashp->tabname = (char *) (hashp + 1);
      strcpy(hashp->tabname, tabname);

      if (flags & HASH_FUNCTION)
            hashp->hash = info->hash;
      else
            hashp->hash = string_hash;          /* default hash function */

      /*
       * If you don't specify a match function, it defaults to string_compare if
       * you used string_hash (either explicitly or by default) and to memcmp
       * otherwise.  (Prior to PostgreSQL 7.4, memcmp was always used.)
       */
      if (flags & HASH_COMPARE)
            hashp->match = info->match;
      else if (hashp->hash == string_hash)
            hashp->match = (HashCompareFunc) string_compare;
      else
            hashp->match = memcmp;

      /*
       * Similarly, the key-copying function defaults to strlcpy or memcpy.
       */
      if (flags & HASH_KEYCOPY)
            hashp->keycopy = info->keycopy;
      else if (hashp->hash == string_hash)
            hashp->keycopy = (HashCopyFunc) strlcpy;
      else
            hashp->keycopy = memcpy;

      if (flags & HASH_ALLOC)
            hashp->alloc = info->alloc;
      else
            hashp->alloc = DynaHashAlloc;

      if (flags & HASH_SHARED_MEM)
      {
            /*
             * ctl structure and directory are preallocated for shared memory
             * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
             * well.
             */
            hashp->hctl = info->hctl;
            hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
            hashp->hcxt = NULL;
            hashp->isshared = true;

            /* hash table already exists, we're just attaching to it */
            if (flags & HASH_ATTACH)
            {
                  /* make local copies of some heavily-used values */
                  hctl = hashp->hctl;
                  hashp->keysize = hctl->keysize;
                  hashp->ssize = hctl->ssize;
                  hashp->sshift = hctl->sshift;

                  return hashp;
            }
      }
      else
      {
            /* setup hash table defaults */
            hashp->hctl = NULL;
            hashp->dir = NULL;
            hashp->hcxt = CurrentDynaHashCxt;
            hashp->isshared = false;
      }

      if (!hashp->hctl)
      {
            hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
            if (!hashp->hctl)
                  ereport(ERROR,
                              (errcode(ERRCODE_OUT_OF_MEMORY),
                               errmsg("out of memory")));
      }

      hashp->frozen = false;

      hdefault(hashp);

      hctl = hashp->hctl;

      if (flags & HASH_PARTITION)
      {
            /* Doesn't make sense to partition a local hash table */
            Assert(flags & HASH_SHARED_MEM);
            /* # of partitions had better be a power of 2 */
            Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));

            hctl->num_partitions = info->num_partitions;
      }

      if (flags & HASH_SEGMENT)
      {
            hctl->ssize = info->ssize;
            hctl->sshift = my_log2(info->ssize);
            /* ssize had better be a power of 2 */
            Assert(hctl->ssize == (1L << hctl->sshift));
      }
      if (flags & HASH_FFACTOR)
            hctl->ffactor = info->ffactor;

      /*
       * SHM hash tables have fixed directory size passed by the caller.
       */
      if (flags & HASH_DIRSIZE)
      {
            hctl->max_dsize = info->max_dsize;
            hctl->dsize = info->dsize;
      }

      /*
       * The hash table now allocates space for key and data, but the caller
       * must say how much space to allocate.
       */
      if (flags & HASH_ELEM)
      {
            Assert(info->entrysize >= info->keysize);
            hctl->keysize = info->keysize;
            hctl->entrysize = info->entrysize;
      }

      /* make local copies of heavily-used constant fields */
      hashp->keysize = hctl->keysize;
      hashp->ssize = hctl->ssize;
      hashp->sshift = hctl->sshift;

      /* Build the hash directory structure */
      if (!init_htab(hashp, nelem))
            elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);

      /*
       * For a shared hash table, preallocate the requested number of elements.
       * This reduces problems with run-time out-of-shared-memory conditions.
       *
       * For a non-shared hash table, preallocate the requested number of
       * elements if it's less than our chosen nelem_alloc.  This avoids wasting
       * space if the caller correctly estimates a small table size.
       */
      if ((flags & HASH_SHARED_MEM) ||
            nelem < hctl->nelem_alloc)
      {
            if (!element_alloc(hashp, (int) nelem))
                  ereport(ERROR,
                              (errcode(ERRCODE_OUT_OF_MEMORY),
                               errmsg("out of memory")));
      }

      return hashp;
}

/*
 * Set default HASHHDR parameters.
 */
static void
hdefault(HTAB *hashp)
{
      HASHHDR    *hctl = hashp->hctl;

      MemSet(hctl, 0, sizeof(HASHHDR));

      hctl->nentries = 0;
      hctl->freeList = NULL;

      hctl->dsize = DEF_DIRSIZE;
      hctl->nsegs = 0;

      /* rather pointless defaults for key & entry size */
      hctl->keysize = sizeof(char *);
      hctl->entrysize = 2 * sizeof(char *);

      hctl->num_partitions = 0;     /* not partitioned */

      hctl->ffactor = DEF_FFACTOR;

      /* table has no fixed maximum size */
      hctl->max_dsize = NO_MAX_DSIZE;

      hctl->ssize = DEF_SEGSIZE;
      hctl->sshift = DEF_SEGSIZE_SHIFT;

#ifdef HASH_STATISTICS
      hctl->accesses = hctl->collisions = 0;
#endif
}

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */
static int
choose_nelem_alloc(Size entrysize)
{
      int               nelem_alloc;
      Size        elementSize;
      Size        allocSize;

      /* Each element has a HASHELEMENT header plus user data. */
      /* NB: this had better match element_alloc() */
      elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);

      /*
       * The idea here is to choose nelem_alloc at least 32, but round up so
       * that the allocation request will be a power of 2 or just less. This
       * makes little difference for hash tables in shared memory, but for hash
       * tables managed by palloc, the allocation request will be rounded up to
       * a power of 2 anyway.  If we fail to take this into account, we'll waste
       * as much as half the allocated space.
       */
      allocSize = 32 * 4;                 /* assume elementSize at least 8 */
      do
      {
            allocSize <<= 1;
            nelem_alloc = allocSize / elementSize;
      } while (nelem_alloc < 32);

      return nelem_alloc;
}
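
/*
 * Worked example, assuming a 64-bit machine where MAXALIGN is 8: for
 * entrysize = 40, elementSize = 16 + 40 = 56 bytes.  The loop doubles
 * allocSize through 256, 512, 1024, 2048; 2048 / 56 = 36 is the first
 * quotient >= 32, so we allocate 36 elements (2016 bytes) at a time,
 * staying just under the 2048-byte power of 2 that palloc would round
 * the request up to.
 */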

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */
static bool
init_htab(HTAB *hashp, long nelem)
{
      HASHHDR    *hctl = hashp->hctl;
      HASHSEGMENT *segp;
      long        lnbuckets;
      int               nbuckets;
      int               nsegs;

      /*
       * initialize mutex if it's a partitioned table
       */
      if (IS_PARTITIONED(hctl))
            SpinLockInit(&hctl->mutex);

      /*
       * Divide number of elements by the fill factor to determine a desired
       * number of buckets.  Allocate space for the next greater power of two
       * number of buckets
       */
      lnbuckets = (nelem - 1) / hctl->ffactor + 1;

      nbuckets = 1 << my_log2(lnbuckets);

      /*
       * In a partitioned table, nbuckets must be at least equal to
       * num_partitions; were it less, keys with apparently different partition
       * numbers would map to the same bucket, breaking partition independence.
       * (Normally nbuckets will be much bigger; this is just a safety check.)
       */
      while (nbuckets < hctl->num_partitions)
            nbuckets <<= 1;

      hctl->max_bucket = hctl->low_mask = nbuckets - 1;
      hctl->high_mask = (nbuckets << 1) - 1;

      /*
       * Figure number of directory segments needed, round up to a power of 2
       */
      nsegs = (nbuckets - 1) / hctl->ssize + 1;
      nsegs = 1 << my_log2(nsegs);

      /*
       * Make sure directory is big enough. If pre-allocated directory is too
       * small, choke (caller screwed up).
       */
      if (nsegs > hctl->dsize)
      {
            if (!(hashp->dir))
                  hctl->dsize = nsegs;
            else
                  return false;
      }

      /* Allocate a directory */
      if (!(hashp->dir))
      {
            CurrentDynaHashCxt = hashp->hcxt;
            hashp->dir = (HASHSEGMENT *)
                  hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
            if (!hashp->dir)
                  return false;
      }

      /* Allocate initial segments */
      for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
      {
            *segp = seg_alloc(hashp);
            if (*segp == NULL)
                  return false;
      }

      /* Choose number of entries to allocate at a time */
      hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);

#if HASH_DEBUG
      fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n%s%ld\n",
                  "TABLE POINTER   ", hashp,
                  "DIRECTORY SIZE  ", hctl->dsize,
                  "SEGMENT SIZE    ", hctl->ssize,
                  "SEGMENT SHIFT   ", hctl->sshift,
                  "FILL FACTOR     ", hctl->ffactor,
                  "MAX BUCKET      ", hctl->max_bucket,
                  "HIGH MASK       ", hctl->high_mask,
                  "LOW  MASK       ", hctl->low_mask,
                  "NSEGS           ", hctl->nsegs,
                  "NENTRIES        ", hctl->nentries);
#endif
      return true;
}

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
Size
hash_estimate_size(long num_entries, Size entrysize)
{
      Size        size;
      long        nBuckets,
                        nSegments,
                        nDirEntries,
                        nElementAllocs,
                        elementSize,
                        elementAllocCnt;

      /* estimate number of buckets wanted */
      nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
      /* # of segments needed for nBuckets */
      nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
      /* directory entries */
      nDirEntries = DEF_DIRSIZE;
      while (nDirEntries < nSegments)
            nDirEntries <<= 1;            /* dir_alloc doubles dsize at each call */

      /* fixed control info */
      size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
      /* directory */
      size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
      /* segments */
      size = add_size(size, mul_size(nSegments,
                                                MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
      /* elements --- allocated in groups of choose_nelem_alloc() entries */
      elementAllocCnt = choose_nelem_alloc(entrysize);
      nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
      elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
      size = add_size(size,
                              mul_size(nElementAllocs,
                                           mul_size(elementAllocCnt, elementSize)));

      return size;
}

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */
long
hash_select_dirsize(long num_entries)
{
      long        nBuckets,
                        nSegments,
                        nDirEntries;

      /* estimate number of buckets wanted */
      nBuckets = 1L << my_log2((num_entries - 1) / DEF_FFACTOR + 1);
      /* # of segments needed for nBuckets */
      nSegments = 1L << my_log2((nBuckets - 1) / DEF_SEGSIZE + 1);
      /* directory entries */
      nDirEntries = DEF_DIRSIZE;
      while (nDirEntries < nSegments)
            nDirEntries <<= 1;            /* dir_alloc doubles dsize at each call */

      return nDirEntries;
}

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters.  We need space for the HASHHDR
 * and for the (non expansible) directory.
 */
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
      Assert(flags & HASH_DIRSIZE);
      Assert(info->dsize == info->max_dsize);
      return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}
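
/*
 * In practice, shared tables are built via ShmemInitHash() in shmem.c,
 * which supplies HASH_SHARED_MEM, HASH_ALLOC, and HASH_DIRSIZE.  The
 * sizing side of the protocol looks like this sketch, where MyShmemEntry
 * is a hypothetical entry struct:
 *
 *      size = add_size(size,
 *                      hash_estimate_size(max_entries,
 *                                         sizeof(MyShmemEntry)));
 */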


/********************** DESTROY ROUTINES ************************/

void
hash_destroy(HTAB *hashp)
{
      if (hashp != NULL)
      {
            /* allocation method must be one we know how to free, too */
            Assert(hashp->alloc == DynaHashAlloc);
            /* so this hashtable must have its own context */
            Assert(hashp->hcxt != NULL);

            hash_stats("destroy", hashp);

            /*
             * Free everything by destroying the hash table's memory context.
             */
            MemoryContextDelete(hashp->hcxt);
      }
}

void
hash_stats(const char *where, HTAB *hashp)
{
#if HASH_STATISTICS
      fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
                  where, hashp->hctl->accesses, hashp->hctl->collisions);

      fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
                  hashp->hctl->nentries, (long) hashp->hctl->keysize,
                  hashp->hctl->max_bucket, hashp->hctl->nsegs);
      fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
                  where, hash_accesses, hash_collisions);
      fprintf(stderr, "hash_stats: total expansions %ld\n",
                  hash_expansions);
#endif
}

/*******************************SEARCH ROUTINES *****************************/


/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
      return hashp->hash(keyPtr, hashp->keysize);
}

/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
{
      uint32            bucket;

      bucket = hash_val & hctl->high_mask;
      if (bucket > hctl->max_bucket)
            bucket = bucket & hctl->low_mask;

      return bucket;
}
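
/*
 * For instance, with max_bucket = 5, low_mask = 3, and high_mask = 7, a
 * hash value of 14 masks to 14 & 7 = 6; since bucket 6 doesn't exist yet,
 * it re-masks to 14 & 3 = 2.  Entries that would belong in bucket 6 thus
 * live in bucket 2 until expand_table splits it to create bucket 6.
 */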

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *          HASH_FIND: look up key in table
 *          HASH_ENTER: look up key in table, creating entry if not present
 *          HASH_ENTER_NULL: same, but return NULL if out of memory
 *          HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if
 * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
 * the same except it will return NULL if out of memory.  Note that
 * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
 * since palloc internally ereports on out-of-memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
 * existing entry in the table, FALSE otherwise.  This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
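/*
 * Typical find-or-create pattern, with MyEntry again a hypothetical entry
 * struct:
 *
 *      bool    found;
 *      MyEntry *entry;
 *
 *      entry = (MyEntry *) hash_search(htab, &key, HASH_ENTER, &found);
 *      if (!found)
 *      {
 *          ... initialize the non-key fields of *entry ...
 *      }
 */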
void *
hash_search(HTAB *hashp,
                  const void *keyPtr,
                  HASHACTION action,
                  bool *foundPtr)
{
      return hash_search_with_hash_value(hashp,
                                                         keyPtr,
                                                         hashp->hash(keyPtr, hashp->keysize),
                                                         action,
                                                         foundPtr);
}

void *
hash_search_with_hash_value(HTAB *hashp,
                                          const void *keyPtr,
                                          uint32 hashvalue,
                                          HASHACTION action,
                                          bool *foundPtr)
{
      HASHHDR    *hctl = hashp->hctl;
      Size        keysize;
      uint32            bucket;
      long        segment_num;
      long        segment_ndx;
      HASHSEGMENT segp;
      HASHBUCKET  currBucket;
      HASHBUCKET *prevBucketPtr;
      HashCompareFunc match;

#if HASH_STATISTICS
      hash_accesses++;
      hctl->accesses++;
#endif

      /*
       * Do the initial lookup
       */
      bucket = calc_bucket(hctl, hashvalue);

      segment_num = bucket >> hashp->sshift;
      segment_ndx = MOD(bucket, hashp->ssize);

      segp = hashp->dir[segment_num];

      if (segp == NULL)
            hash_corrupted(hashp);

      prevBucketPtr = &segp[segment_ndx];
      currBucket = *prevBucketPtr;

      /*
       * Follow collision chain looking for matching key
       */
      match = hashp->match;         /* save one fetch in inner loop */
      keysize = hashp->keysize;     /* ditto */

      while (currBucket != NULL)
      {
            if (currBucket->hashvalue == hashvalue &&
                  match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
                  break;
            prevBucketPtr = &(currBucket->link);
            currBucket = *prevBucketPtr;
#if HASH_STATISTICS
            hash_collisions++;
            hctl->collisions++;
#endif
      }

      if (foundPtr)
            *foundPtr = (bool) (currBucket != NULL);

      /*
       * OK, now what?
       */
      switch (action)
      {
            case HASH_FIND:
                  if (currBucket != NULL)
                        return (void *) ELEMENTKEY(currBucket);
                  return NULL;

            case HASH_REMOVE:
                  if (currBucket != NULL)
                  {
                        /* use volatile pointer to prevent code rearrangement */
                        volatile HASHHDR *hctlv = hctl;

                        /* if partitioned, must lock to touch nentries and freeList */
                        if (IS_PARTITIONED(hctlv))
                              SpinLockAcquire(&hctlv->mutex);

                        Assert(hctlv->nentries > 0);
                        hctlv->nentries--;

                        /* remove record from hash bucket's chain. */
                        *prevBucketPtr = currBucket->link;

                        /* add the record to the freelist for this table.  */
                        currBucket->link = hctlv->freeList;
                        hctlv->freeList = currBucket;

                        if (IS_PARTITIONED(hctlv))
                              SpinLockRelease(&hctlv->mutex);

                        /*
                         * better hope the caller is synchronizing access to this
                         * element, because someone else is going to reuse it the next
                         * time something is added to the table
                         */
                        return (void *) ELEMENTKEY(currBucket);
                  }
                  return NULL;

            case HASH_ENTER_NULL:
                  /* ENTER_NULL does not work with palloc-based allocator */
                  Assert(hashp->alloc != DynaHashAlloc);
                  /* FALL THRU */

            case HASH_ENTER:
                  /* Return existing element if found, else create one */
                  if (currBucket != NULL)
                        return (void *) ELEMENTKEY(currBucket);

                  /* disallow inserts if frozen */
                  if (hashp->frozen)
                        elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                               hashp->tabname);

                  currBucket = get_hash_entry(hashp);
                  if (currBucket == NULL)
                  {
                        /* out of memory */
                        if (action == HASH_ENTER_NULL)
                              return NULL;
                        /* report a generic message */
                        if (hashp->isshared)
                              ereport(ERROR,
                                          (errcode(ERRCODE_OUT_OF_MEMORY),
                                           errmsg("out of shared memory")));
                        else
                              ereport(ERROR,
                                          (errcode(ERRCODE_OUT_OF_MEMORY),
                                           errmsg("out of memory")));
                  }

                  /* link into hashbucket chain */
                  *prevBucketPtr = currBucket;
                  currBucket->link = NULL;

                  /* copy key into record */
                  currBucket->hashvalue = hashvalue;
                  hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);

                  /* caller is expected to fill the data field on return */

                  /*
                   * Check if it is time to split a bucket.  Can't split if running
                   * in partitioned mode, nor if table is the subject of any active
                   * hash_seq_search scans.  Strange order of these tests is to try
                   * to check cheaper conditions first.
                   */
                  if (!IS_PARTITIONED(hctl) &&
                  hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
                        !has_seq_scans(hashp))
                  {
                        /*
                         * NOTE: failure to expand table is not a fatal error, it just
                         * means we have to run at higher fill factor than we wanted.
                         */
                        expand_table(hashp);
                  }

                  return (void *) ELEMENTKEY(currBucket);
      }

      elog(ERROR, "unrecognized hash action code: %d", (int) action);

      return NULL;                        /* keep compiler quiet */
}

/*
 * create a new entry if possible
 */
static HASHBUCKET
get_hash_entry(HTAB *hashp)
{
      /* use volatile pointer to prevent code rearrangement */
      volatile HASHHDR *hctlv = hashp->hctl;
      HASHBUCKET  newElement;

      for (;;)
      {
            /* if partitioned, must lock to touch nentries and freeList */
            if (IS_PARTITIONED(hctlv))
                  SpinLockAcquire(&hctlv->mutex);

            /* try to get an entry from the freelist */
            newElement = hctlv->freeList;
            if (newElement != NULL)
                  break;

            /* no free elements.  allocate another chunk of buckets */
            if (IS_PARTITIONED(hctlv))
                  SpinLockRelease(&hctlv->mutex);

            if (!element_alloc(hashp, hctlv->nelem_alloc))
            {
                  /* out of memory */
                  return NULL;
            }
      }

      /* remove entry from freelist, bump nentries */
      hctlv->freeList = newElement->link;
      hctlv->nentries++;

      if (IS_PARTITIONED(hctlv))
            SpinLockRelease(&hctlv->mutex);

      return newElement;
}

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */
long
hash_get_num_entries(HTAB *hashp)
{
      /*
       * We currently don't bother with the mutex; it's only sensible to call
       * this function if you've got lock on all partitions of the table.
       */
      return hashp->hctl->nentries;
}

/*
 * hash_seq_init/_search/_term
 *                Sequentially search through hash table and return
 *                all the elements one by one, returning NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curEntry is pointing at!).  Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
 * wherein it is inconvenient to track whether a scan is still open, and
 * there's no possibility of further insertions after readout has begun.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
 */
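/*
 * Typical scan loop (MyEntry is a hypothetical entry struct):
 *
 *      HASH_SEQ_STATUS status;
 *      MyEntry    *entry;
 *
 *      hash_seq_init(&status, htab);
 *      while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *      {
 *          ... process *entry; if breaking out of the loop early,
 *          call hash_seq_term(&status) first ...
 *      }
 */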
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
{
      status->hashp = hashp;
      status->curBucket = 0;
      status->curEntry = NULL;
      if (!hashp->frozen)
            register_seq_scan(hashp);
}

void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
      HTAB     *hashp;
      HASHHDR    *hctl;
      uint32            max_bucket;
      long        ssize;
      long        segment_num;
      long        segment_ndx;
      HASHSEGMENT segp;
      uint32            curBucket;
      HASHELEMENT *curElem;

      if ((curElem = status->curEntry) != NULL)
      {
            /* Continuing scan of curBucket... */
            status->curEntry = curElem->link;
            if (status->curEntry == NULL) /* end of this bucket */
                  ++status->curBucket;
            return (void *) ELEMENTKEY(curElem);
      }

      /*
       * Search for next nonempty bucket starting at curBucket.
       */
      curBucket = status->curBucket;
      hashp = status->hashp;
      hctl = hashp->hctl;
      ssize = hashp->ssize;
      max_bucket = hctl->max_bucket;

      if (curBucket > max_bucket)
      {
            hash_seq_term(status);
            return NULL;                  /* search is done */
      }

      /*
       * first find the right segment in the table directory.
       */
      segment_num = curBucket >> hashp->sshift;
      segment_ndx = MOD(curBucket, ssize);

      segp = hashp->dir[segment_num];

      /*
       * Pick up the first item in this bucket's chain.  If chain is not empty
       * we can begin searching it.  Otherwise we have to advance to find the
       * next nonempty bucket.  We try to optimize that case since searching a
       * near-empty hashtable has to iterate this loop a lot.
       */
      while ((curElem = segp[segment_ndx]) == NULL)
      {
            /* empty bucket, advance to next */
            if (++curBucket > max_bucket)
            {
                  status->curBucket = curBucket;
                  hash_seq_term(status);
                  return NULL;            /* search is done */
            }
            if (++segment_ndx >= ssize)
            {
                  segment_num++;
                  segment_ndx = 0;
                  segp = hashp->dir[segment_num];
            }
      }

      /* Begin scan of curBucket... */
      status->curEntry = curElem->link;
      if (status->curEntry == NULL)       /* end of this bucket */
            ++curBucket;
      status->curBucket = curBucket;
      return (void *) ELEMENTKEY(curElem);
}

void
hash_seq_term(HASH_SEQ_STATUS *status)
{
      if (!status->hashp->frozen)
            deregister_seq_scan(status->hashp);
}

/*
 * hash_freeze
 *                Freeze a hashtable against future insertions (deletions are
 *                still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */
void
hash_freeze(HTAB *hashp)
{
      if (hashp->isshared)
            elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
      if (!hashp->frozen && has_seq_scans(hashp))
            elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
                   hashp->tabname);
      hashp->frozen = true;
}


/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */
static bool
expand_table(HTAB *hashp)
{
      HASHHDR    *hctl = hashp->hctl;
      HASHSEGMENT old_seg,
                        new_seg;
      long        old_bucket,
                        new_bucket;
      long        new_segnum,
                        new_segndx;
      long        old_segnum,
                        old_segndx;
      HASHBUCKET *oldlink,
                     *newlink;
      HASHBUCKET  currElement,
                        nextElement;

      Assert(!IS_PARTITIONED(hctl));

#ifdef HASH_STATISTICS
      hash_expansions++;
#endif

      new_bucket = hctl->max_bucket + 1;
      new_segnum = new_bucket >> hashp->sshift;
      new_segndx = MOD(new_bucket, hashp->ssize);

      if (new_segnum >= hctl->nsegs)
      {
            /* Allocate new segment if necessary -- could fail if dir full */
            if (new_segnum >= hctl->dsize)
                  if (!dir_realloc(hashp))
                        return false;
            if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
                  return false;
            hctl->nsegs++;
      }

      /* OK, we created a new bucket */
      hctl->max_bucket++;

      /*
       * *Before* changing masks, find old bucket corresponding to same hash
       * values; values in that bucket may need to be relocated to new bucket.
       * Note that new_bucket is certainly larger than low_mask at this point,
       * so we can skip the first step of the regular hash mask calc.
       */
      old_bucket = (new_bucket & hctl->low_mask);

      /*
       * If we crossed a power of 2, readjust masks.
       */
      if ((uint32) new_bucket > hctl->high_mask)
      {
            hctl->low_mask = hctl->high_mask;
            hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
      }
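
      /*
       * (For instance, when new_bucket reaches 8 with high_mask = 7,
       * low_mask becomes 7 and high_mask becomes 8 | 7 = 15, doubling the
       * addressable range for the next round of splits.)
       */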

      /*
       * Relocate records to the new bucket.  NOTE: because of the way the hash
       * masking is done in calc_bucket, only one old bucket can need to be
       * split at this point.  With a different way of reducing the hash value,
       * that might not be true!
       */
      old_segnum = old_bucket >> hashp->sshift;
      old_segndx = MOD(old_bucket, hashp->ssize);

      old_seg = hashp->dir[old_segnum];
      new_seg = hashp->dir[new_segnum];

      oldlink = &old_seg[old_segndx];
      newlink = &new_seg[new_segndx];

      for (currElement = *oldlink;
             currElement != NULL;
             currElement = nextElement)
      {
            nextElement = currElement->link;
            if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
            {
                  *oldlink = currElement;
                  oldlink = &currElement->link;
            }
            else
            {
                  *newlink = currElement;
                  newlink = &currElement->link;
            }
      }
      /* don't forget to terminate the rebuilt hash chains... */
      *oldlink = NULL;
      *newlink = NULL;

      return true;
}


static bool
dir_realloc(HTAB *hashp)
{
      HASHSEGMENT *p;
      HASHSEGMENT *old_p;
      long        new_dsize;
      long        old_dirsize;
      long        new_dirsize;

      if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
            return false;

      /* Reallocate directory */
      new_dsize = hashp->hctl->dsize << 1;
      old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
      new_dirsize = new_dsize * sizeof(HASHSEGMENT);

      old_p = hashp->dir;
      CurrentDynaHashCxt = hashp->hcxt;
      p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);

      if (p != NULL)
      {
            memcpy(p, old_p, old_dirsize);
            MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
            hashp->dir = p;
            hashp->hctl->dsize = new_dsize;

            /* XXX assume the allocator is palloc, so we know how to free */
            Assert(hashp->alloc == DynaHashAlloc);
            pfree(old_p);

            return true;
      }

      return false;
}


static HASHSEGMENT
seg_alloc(HTAB *hashp)
{
      HASHSEGMENT segp;

      CurrentDynaHashCxt = hashp->hcxt;
      segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);

      if (!segp)
            return NULL;

      MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);

      return segp;
}

/*
 * allocate some new elements and link them into the free list
 */
static bool
element_alloc(HTAB *hashp, int nelem)
{
      /* use volatile pointer to prevent code rearrangement */
      volatile HASHHDR *hctlv = hashp->hctl;
      Size        elementSize;
      HASHELEMENT *firstElement;
      HASHELEMENT *tmpElement;
      HASHELEMENT *prevElement;
      int               i;

      /* Each element has a HASHELEMENT header plus user data. */
      elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);

      CurrentDynaHashCxt = hashp->hcxt;
      firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);

      if (!firstElement)
            return false;

      /* prepare to link all the new entries into the freelist */
      prevElement = NULL;
      tmpElement = firstElement;
      for (i = 0; i < nelem; i++)
      {
            tmpElement->link = prevElement;
            prevElement = tmpElement;
            tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
      }

      /* if partitioned, must lock to touch freeList */
      if (IS_PARTITIONED(hctlv))
            SpinLockAcquire(&hctlv->mutex);

      /* freelist could be nonempty if two backends did this concurrently */
      firstElement->link = hctlv->freeList;
      hctlv->freeList = prevElement;

      if (IS_PARTITIONED(hctlv))
            SpinLockRelease(&hctlv->mutex);

      return true;
}

/* complain when we have detected a corrupted hashtable */
static void
hash_corrupted(HTAB *hashp)
{
      /*
       * If the corruption is in a shared hashtable, we'd better force a
       * systemwide restart.  Otherwise, just shut down this one backend.
       */
      if (hashp->isshared)
            elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
      else
            elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
}

/* calculate ceil(log base 2) of num */
int
my_log2(long num)
{
      int               i;
      long        limit;

      for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
            ;
      return i;
}


/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here.  The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits).  Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into.  This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow.  Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both.  It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us.  The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address.  We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
static int  seq_scan_level[MAX_SEQ_SCANS];            /* subtransaction nest level */
static int  num_seq_scans = 0;


/* Register a table as having an active hash_seq_search scan */
static void
register_seq_scan(HTAB *hashp)
{
      if (num_seq_scans >= MAX_SEQ_SCANS)
            elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
                   hashp->tabname);
      seq_scan_tables[num_seq_scans] = hashp;
      seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
      num_seq_scans++;
}

/* Deregister an active scan */
static void
deregister_seq_scan(HTAB *hashp)
{
      int               i;

      /* Search backward since it's most likely at the stack top */
      for (i = num_seq_scans - 1; i >= 0; i--)
      {
            if (seq_scan_tables[i] == hashp)
            {
                  seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
                  seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
                  num_seq_scans--;
                  return;
            }
      }
      elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
             hashp->tabname);
}

/* Check if a table has any active scan */
static bool
has_seq_scans(HTAB *hashp)
{
      int               i;

      for (i = 0; i < num_seq_scans; i++)
      {
            if (seq_scan_tables[i] == hashp)
                  return true;
      }
      return false;
}

/* Clean up any open scans at end of transaction */
void
AtEOXact_HashTables(bool isCommit)
{
      /*
       * During abort cleanup, open scans are expected; just silently clean 'em
       * out.  An open scan at commit means someone forgot a hash_seq_term()
       * call, so complain.
       *
       * Note: it's tempting to try to print the tabname here, but refrain for
       * fear of touching deallocated memory.  This isn't a user-facing message
       * anyway, so it needn't be pretty.
       */
      if (isCommit)
      {
            int               i;

            for (i = 0; i < num_seq_scans; i++)
            {
                  elog(WARNING, "leaked hash_seq_search scan for hash table %p",
                         seq_scan_tables[i]);
            }
      }
      num_seq_scans = 0;
}

/* Clean up any open scans at end of subtransaction */
void
AtEOSubXact_HashTables(bool isCommit, int nestDepth)
{
      int               i;

      /*
       * Search backward to make cleanup easy.  Note we must check all entries,
       * not only those at the end of the array, because deletion technique
       * doesn't keep them in order.
       */
      for (i = num_seq_scans - 1; i >= 0; i--)
      {
            if (seq_scan_level[i] >= nestDepth)
            {
                  if (isCommit)
                        elog(WARNING, "leaked hash_seq_search scan for hash table %p",
                               seq_scan_tables[i]);
                  seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
                  seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
                  num_seq_scans--;
            }
      }
}
