/*-------------------------------------------------------------------------
 *
 * freespace.c
 *      POSTGRES free space map for quickly finding free space in relations
 *
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *      $PostgreSQL$
 *
 *
 * NOTES:
 *
 *  Free Space Map keeps track of the amount of free space on pages, and
 *  allows quickly searching for a page with enough free space. The FSM is
 *  stored in a dedicated relation fork of all heap relations, and those
 *  index access methods that need it (see also indexfsm.c). See README for
 *  more information.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup.h"
#include "access/xlogutils.h"
#include "storage/bufpage.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/fsm_internals.h"
#include "storage/lmgr.h"
#include "storage/lwlock.h"
#include "storage/smgr.h"
#include "utils/rel.h"
#include "utils/inval.h"
#include "miscadmin.h"

/*
 * We use just one byte to store the amount of free space on a page, so we
 * divide the amount of free space a page can have into 256 different
 * categories. The highest category, 255, represents a page with at least
 * MaxFSMRequestSize bytes of free space, and the second highest category
 * represents the range from 254 * FSM_CAT_STEP, inclusive, to
 * MaxFSMRequestSize, exclusive.
 *
 * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
 * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
 * categories look like this:
 *
 *
 * Range     Category
 * 0    - 31   0
 * 32   - 63   1
 * ...    ...  ...
 * 8096 - 8127 253
 * 8128 - 8163 254
 * 8164 - 8192 255
 *
 * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
 * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
 * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
 * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
 * completely empty page, that would mean that we could never satisfy a
 * request of exactly MaxFSMRequestSize bytes.
 */
#define FSM_CATEGORIES  256
#define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
#define MaxFSMRequestSize     MaxHeapTupleSize
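
/*
 * Worked example (an illustration, assuming the default 8k BLCKSZ as in the
 * table above, so FSM_CAT_STEP = 8192 / 256 = 32 and MaxFSMRequestSize =
 * 8164): a page with 8150 bytes free is stored as category 8150 / 32 = 254,
 * a page with 8164 or more bytes free is stored as category 255, and a
 * request for MaxFSMRequestSize bytes maps to category 255, so only pages
 * known to hold at least MaxFSMRequestSize bytes can satisfy it.
 */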

/*
 * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
 * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
 * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
 * this means that 4096 bytes is the smallest BLCKSZ that we can get away
 * with a 3-level tree, and 512 is the smallest we support.
 */
#define FSM_TREE_DEPTH  ((SlotsPerFSMPage >= 1626) ? 3 : 4)

#define FSM_ROOT_LEVEL  (FSM_TREE_DEPTH - 1)
#define FSM_BOTTOM_LEVEL 0
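
/*
 * For illustration (SlotsPerFSMPage comes from fsm_internals.h and depends
 * on BLCKSZ): with the default 8k BLCKSZ, each FSM page has roughly 4000
 * leaf slots, so a 3-level tree can address roughly 4000^3 heap blocks,
 * well over 2^32-1. With a BLCKSZ below 4096, SlotsPerFSMPage falls below
 * 1626 and a fourth level is needed.
 */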

/*
 * The internal FSM routines work on a logical addressing scheme. Each
 * level of the tree can be thought of as a separately addressable file.
 */
typedef struct
{
      int level;        /* level */
      int logpageno;    /* page number within the level */
} FSMAddress;

/* Address of the root page. */
static const FSMAddress FSM_ROOT_ADDRESS = { FSM_ROOT_LEVEL, 0 };
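
/*
 * For example (illustration only), with the default three-level tree:
 * level 2 is the root and consists of a single page (logpageno 0), each
 * level-1 page summarizes up to SlotsPerFSMPage level-0 pages, and each
 * level-0 (bottom) page stores the free space categories of up to
 * SlotsPerFSMPage consecutive heap blocks. fsm_logical_to_physical() maps
 * these (level, logpageno) pairs onto blocks of the single physical FSM fork.
 */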

/* functions to navigate the tree */
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
static BlockNumber fsm_logical_to_physical(FSMAddress addr);

static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);

/* functions to convert amount of free space to a FSM category */
static uint8 fsm_space_avail_to_cat(Size avail);
static uint8 fsm_space_needed_to_cat(Size needed);
static Size  fsm_space_cat_to_avail(uint8 cat);

/* workhorse functions for various operations */
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
                                            uint8 newValue, uint8 minValue);
static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);


/******** Public API ********/

/*
 * GetPageWithFreeSpace - try to find a page in the given relation with
 *          at least the specified amount of free space.
 *
 * If successful, return the block number; if not, return InvalidBlockNumber.
 *
 * The caller must be prepared for the possibility that the returned page
 * will turn out to have too little space available by the time the caller
 * gets a lock on it.  In that case, the caller should report the actual
 * amount of free space available on that page and then try again (see
 * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
 * extend the relation.
 */
BlockNumber
GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
{
      uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
      return fsm_search(rel, min_cat);
}

/*
 * RecordAndGetPageWithFreeSpace - update info about a page and try again.
 *
 * We provide this combo form to save some locking overhead, compared to
 * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
 * also some effort to return a page close to the old page; if there's a
 * page with enough free space on the same FSM page where the old page is
 * located, it is preferred.
 */
BlockNumber
RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
                                            Size oldSpaceAvail, Size spaceNeeded)
{
      int               old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
      int               search_cat = fsm_space_needed_to_cat(spaceNeeded);
      FSMAddress  addr;
      uint16            slot;
      int               search_slot;

      /* Get the location of the FSM byte representing the heap block */
      addr = fsm_get_location(oldPage, &slot);

      search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);

      /*
       * If fsm_set_and_search found a suitable new block, return that.
       * Otherwise, search as usual.
       */
      if (search_slot != -1)
            return fsm_get_heap_blk(addr, search_slot);
      else
            return fsm_search(rel, search_cat);
}
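
/*
 * A simplified sketch (not part of this file) of the caller protocol
 * described above, loosely modelled on the heap-insertion code path;
 * "len" stands for the amount of space the caller needs:
 *
 *    targetBlock = GetPageWithFreeSpace(rel, len);
 *    while (targetBlock != InvalidBlockNumber)
 *    {
 *          buffer = ReadBuffer(rel, targetBlock);
 *          LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *          pageFreeSpace = PageGetHeapFreeSpace(BufferGetPage(buffer));
 *          if (len <= pageFreeSpace)
 *                break;
 *          UnlockReleaseBuffer(buffer);
 *          targetBlock = RecordAndGetPageWithFreeSpace(rel, targetBlock,
 *                                                      pageFreeSpace, len);
 *    }
 *
 * If the loop ends with InvalidBlockNumber, no suitable page exists in the
 * FSM and the caller extends the relation instead.
 */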

/*
 * RecordPageWithFreeSpace - update info about a page.
 *
 * Note that if the new spaceAvail value is higher than the old value stored
 * in the FSM, the space might not become visible to searchers until the next
 * FreeSpaceMapVacuum call, which updates the upper level pages.
 */
void
RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
{
      int               new_cat = fsm_space_avail_to_cat(spaceAvail);
      FSMAddress  addr;
      uint16            slot;

      /* Get the location of the FSM byte representing the heap block */
      addr = fsm_get_location(heapBlk, &slot);

      fsm_set_and_search(rel, addr, slot, new_cat, 0);
}
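
/*
 * Typical usage sketch (illustration; variable names are hypothetical):
 * after VACUUM has pruned a heap page it reports the remaining free space
 * so that later insertions can find it, along the lines of
 *
 *    RecordPageWithFreeSpace(onerel, blkno,
 *                            PageGetHeapFreeSpace(BufferGetPage(buf)));
 */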

/*
 * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
 *          WAL replay
 */
void
XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
                                          Size spaceAvail)
{
      int               new_cat = fsm_space_avail_to_cat(spaceAvail);
      FSMAddress  addr;
      uint16            slot;
      BlockNumber blkno;
      Buffer            buf;
      Page        page;

      /* Get the location of the FSM byte representing the heap block */
      addr = fsm_get_location(heapBlk, &slot);
      blkno = fsm_logical_to_physical(addr);

      /* If the page doesn't exist already, extend */
      buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
      LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

      page = BufferGetPage(buf);
      if (PageIsNew(page))
            PageInit(page, BLCKSZ, 0);

      if (fsm_set_avail(page, slot, new_cat))
            MarkBufferDirty(buf);
      UnlockReleaseBuffer(buf);
}
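
/*
 * Sketch of a typical caller (illustration; variable names are hypothetical):
 * a heap redo routine that frees space on a page during WAL replay can
 * report it with something like
 *
 *    XLogRecordPageWithFreeSpace(xlrec->node, blkno,
 *                                PageGetHeapFreeSpace(page));
 */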

/*
 * GetRecordedFreeSpace - return the amount of free space on a particular page,
 *          according to the FSM.
 */
Size
GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
{
      FSMAddress  addr;
      uint16            slot;
      Buffer            buf;
      uint8       cat;

      /* Get the location of the FSM byte representing the heap block */
      addr = fsm_get_location(heapBlk, &slot);

      buf = fsm_readbuf(rel, addr, false);
      if (!BufferIsValid(buf))
            return 0;
      cat = fsm_get_avail(BufferGetPage(buf), slot);
      ReleaseBuffer(buf);

      return fsm_space_cat_to_avail(cat);
}
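
/*
 * Note that this returns the lower bound of the stored category, not an
 * exact byte count: for example (assuming 8k BLCKSZ), a stored category of 3
 * is reported as 3 * FSM_CAT_STEP = 96 bytes. The contrib/pg_freespacemap
 * module exposes these recorded values for inspection.
 */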

/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure
 * that other backends receive the relcache invalidation event that this
 * function sends, before accessing the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
      BlockNumber new_nfsmblocks;
      FSMAddress  first_removed_address;
      uint16            first_removed_slot;
      Buffer            buf;

      RelationOpenSmgr(rel);

      /*
       * If no FSM has been created yet for this relation, there's nothing to
       * truncate.
       */
      if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
            return;

      /* Get the location in the FSM of the first removed heap block */
      first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

      /*
       * Zero out the tail of the last remaining FSM page. If the slot
       * representing the first removed heap block is at a page boundary (that
       * is, it is the first slot on the FSM page that first_removed_address
       * points to), we can just truncate that page altogether.
       */
      if (first_removed_slot > 0)
      {
            buf = fsm_readbuf(rel, first_removed_address, false);
            if (!BufferIsValid(buf))
                  return; /* nothing to do; the FSM was already smaller */
            LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
            fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
            MarkBufferDirty(buf);
            UnlockReleaseBuffer(buf);

            new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
      }
      else
      {
            new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
            if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
                  return; /* nothing to do; the FSM was already smaller */
      }

      /* Truncate the unused FSM pages */
      smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks, rel->rd_istemp);

      /*
       * Need to invalidate the relcache entry, because rd_fsm_nblocks
       * seen by other backends is no longer valid.
       */
      if (!InRecovery)
            CacheInvalidateRelcache(rel);

      rel->rd_fsm_nblocks = new_nfsmblocks;
}
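
/*
 * Worked example (illustration only): if the heap is truncated to
 * nblocks = SlotsPerFSMPage + 10, the first removed block corresponds to
 * slot 10 on bottom-level page 1, so slots 10 and above on that page are
 * zeroed and all later physical FSM pages are truncated away. If nblocks is
 * an exact multiple of SlotsPerFSMPage, first_removed_slot is 0 and that
 * bottom-level page can be truncated as well.
 */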

/*
 * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
 */
void
FreeSpaceMapVacuum(Relation rel)
{
      bool dummy;

      /*
       * Traverse the tree in depth-first order. The tree is stored physically
       * in depth-first order, so this should be pretty I/O efficient.
       */
      fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
}

/******** Internal routines ********/

/*
 * Return the category corresponding to x bytes of free space
 */
static uint8
fsm_space_avail_to_cat(Size avail)
{
      int cat;

      Assert(avail < BLCKSZ);

      if (avail >= MaxFSMRequestSize)
            return 255;

      cat = avail / FSM_CAT_STEP;

      /*
       * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
       * more.
       */
      if (cat > 254)
            cat = 254;

      return (uint8) cat;
}

/*
 * Return the lower bound of the range of free space represented by given
 * category.
 */
static Size
fsm_space_cat_to_avail(uint8 cat)
{
      /* The highest category represents exactly MaxFSMRequestSize bytes. */
      if (cat == 255)
            return MaxFSMRequestSize;
      else
            return cat * FSM_CAT_STEP;
}

/*
 * Which category does a page need to have, to accommodate x bytes of data?
 * While fsm_space_avail_to_cat() rounds down, this needs to round up.
 */
static uint8
fsm_space_needed_to_cat(Size needed)
{
      int cat;

      /* Can't ask for more space than the highest category represents */
      if (needed > MaxFSMRequestSize)
                  elog(ERROR, "invalid FSM request size %lu",
                         (unsigned long) needed);

      if (needed == 0)
            return 1;

      cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;

      if (cat > 255)
            cat = 255;

      return (uint8) cat;
}
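
/*
 * Example of the opposite rounding directions (assuming 8k BLCKSZ, so
 * FSM_CAT_STEP = 32): a page with 100 bytes free is recorded as category
 * 100 / 32 = 3, while a request for 100 bytes asks for category
 * (100 + 31) / 32 = 4. A category-3 page might have as little as 96 bytes
 * free, so only categories 4 and above are guaranteed to satisfy the request.
 */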

/*
 * Return the physical block number of an FSM page
 */
static BlockNumber
fsm_logical_to_physical(FSMAddress addr)
{
      BlockNumber pages;
      int leafno;
      int l;

      /*
       * Calculate the logical page number of the first leaf page below the
       * given page.
       */
      leafno = addr.logpageno;
      for (l = 0; l < addr.level; l++)
            leafno *= SlotsPerFSMPage;

      /* Count upper level nodes required to address the leaf page */
      pages = 0;
      for (l = 0; l < FSM_TREE_DEPTH; l++)
      {
            pages += leafno + 1;
            leafno /= SlotsPerFSMPage;
      }

      /*
       * If the page we were asked for wasn't at the bottom level, subtract
       * the additional lower level pages we counted above.
       */
      pages -= addr.level;

      /* Turn the page count into 0-based block number */
      return pages - 1;
}
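
/*
 * Worked example (assuming the default 3-level tree): for the root,
 * addr = {level 2, logpageno 0}, leafno stays 0, the loop adds 1 three times
 * (pages = 3), the 2 lower levels are subtracted and block 0 is returned.
 * The first level-1 page maps to block 1 and the first two bottom-level
 * pages to blocks 2 and 3, so the FSM fork is laid out in depth-first order:
 * the root, the first upper page, then the leaf pages below it, and so on.
 */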

/*
 * Return the FSM location corresponding to given heap block.
 */
static FSMAddress
fsm_get_location(BlockNumber heapblk, uint16 *slot)
{
      FSMAddress addr;

      addr.level = FSM_BOTTOM_LEVEL;
      addr.logpageno = heapblk / SlotsPerFSMPage;
      *slot = heapblk % SlotsPerFSMPage;

      return addr;
}
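
/*
 * For example (illustration only): heap block 0 maps to slot 0 on
 * bottom-level page 0, heap block SlotsPerFSMPage - 1 to the last slot of
 * that page, and heap block SlotsPerFSMPage to slot 0 on bottom-level
 * page 1.
 */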

/*
 * Return the heap block number corresponding to given location in the FSM.
 */
static BlockNumber
fsm_get_heap_blk(FSMAddress addr, uint16 slot)
{
      Assert(addr.level == FSM_BOTTOM_LEVEL);
      return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
}

/*
 * Given a logical address of a child page, get the logical page number of
 * the parent, and the slot within the parent corresponding to the child.
 */
static FSMAddress
fsm_get_parent(FSMAddress child, uint16 *slot)
{
      FSMAddress parent;

      Assert(child.level < FSM_ROOT_LEVEL);

      parent.level = child.level + 1;
      parent.logpageno = child.logpageno / SlotsPerFSMPage;
      *slot = child.logpageno % SlotsPerFSMPage;

      return parent;
}

/*
 * Given a logical address of a parent page, and a slot number get the
 * logical address of the corresponding child page.
 */
static FSMAddress
fsm_get_child(FSMAddress parent, uint16 slot)
{
      FSMAddress child;

      Assert(parent.level > FSM_BOTTOM_LEVEL);

      child.level = parent.level - 1;
      child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;

      return child;
}

/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, unless 'extend' is
 * true, in which case the FSM file is extended to include the page.
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
      BlockNumber blkno = fsm_logical_to_physical(addr);
      Buffer buf;

      RelationOpenSmgr(rel);

      /* If we haven't cached the size of the FSM yet, check it first */
      if (rel->rd_fsm_nblocks == InvalidBlockNumber)
      {
            if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
                  rel->rd_fsm_nblocks = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
            else
                  rel->rd_fsm_nblocks = 0;
      }

      /* Handle requests beyond EOF */
      if (blkno >= rel->rd_fsm_nblocks)
      {
            if (extend)
                  fsm_extend(rel, blkno + 1);
            else
                  return InvalidBuffer;
      }

      /*
       * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
       * information is not accurate anyway, so it's better to clear corrupt
       * pages than error out. Since the FSM changes are not WAL-logged, the
       * so-called torn page problem on crash can lead to pages with corrupt
       * headers, for example.
       */
      buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
      if (PageIsNew(BufferGetPage(buf)))
            PageInit(BufferGetPage(buf), BLCKSZ, 0);
      return buf;
}

/*
 * Ensure that the FSM fork is at least fsm_nblocks long, extending
 * it if necessary with empty pages. And by empty, I mean pages filled
 * with zeros, meaning there's no free space.
 */
static void
fsm_extend(Relation rel, BlockNumber fsm_nblocks)
{
      BlockNumber fsm_nblocks_now;
      Page pg;

      pg = (Page) palloc(BLCKSZ);
      PageInit(pg, BLCKSZ, 0);

      /*
       * We use the relation extension lock to lock out other backends
       * trying to extend the FSM at the same time. It also locks out
       * extension of the main fork, unnecessarily, but extending the
       * FSM happens seldom enough that it doesn't seem worthwhile to
       * have a separate lock tag type for it.
       *
       * Note that another backend might have extended or created the
       * relation before we get the lock.
       */
      LockRelationForExtension(rel, ExclusiveLock);

      /* Create the FSM file first if it doesn't exist */
      if ((rel->rd_fsm_nblocks == 0 || rel->rd_fsm_nblocks == InvalidBlockNumber)
            && !smgrexists(rel->rd_smgr, FSM_FORKNUM))
      {
            smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
            fsm_nblocks_now = 0;
      }
      else
            fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);

      while (fsm_nblocks_now < fsm_nblocks)
      {
            smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
                           (char *) pg, rel->rd_istemp);
            fsm_nblocks_now++;
      }

      UnlockRelationForExtension(rel, ExclusiveLock);

      pfree(pg);

      /* Update the relcache with the up-to-date size */
      if (!InRecovery)
            CacheInvalidateRelcache(rel);
      rel->rd_fsm_nblocks = fsm_nblocks_now;
}

/*
 * Set value in given FSM page and slot.
 *
 * If minValue > 0, the updated page is also searched for a page with at
 * least minValue of free space. If one is found, its slot number is
 * returned, -1 otherwise.
 */
static int
fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
                           uint8 newValue, uint8 minValue)
{
      Buffer            buf;
      Page        page;
      int               newslot = -1;

      buf = fsm_readbuf(rel, addr, true);
      LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

      page = BufferGetPage(buf);

      if (fsm_set_avail(page, slot, newValue))
            MarkBufferDirty(buf);

      if (minValue != 0)
      {
            /* Search while we still hold the lock */
            newslot = fsm_search_avail(buf, minValue,
                                                   addr.level == FSM_BOTTOM_LEVEL,
                                                   true);
      }

      UnlockReleaseBuffer(buf);

      return newslot;
}

/*
 * Search the tree for a heap page with at least min_cat of free space
 */
static BlockNumber
fsm_search(Relation rel, uint8 min_cat)
{
      int restarts = 0;
      FSMAddress addr = FSM_ROOT_ADDRESS;

      for (;;)
      {
            int slot;
            Buffer buf;
            uint8 max_avail = 0;

            /* Read the FSM page. */
            buf = fsm_readbuf(rel, addr, false);

            /* Search within the page */
            if (BufferIsValid(buf))
            {
                  LockBuffer(buf, BUFFER_LOCK_SHARE);
                  slot = fsm_search_avail(buf, min_cat,
                                                      (addr.level == FSM_BOTTOM_LEVEL),
                                                      false);
                  if (slot == -1)
                        max_avail = fsm_get_max_avail(BufferGetPage(buf));
                  UnlockReleaseBuffer(buf);
            }
            else
                  slot = -1;

            if (slot != -1)
            {
                  /*
                   * Descend the tree, or return the found block if we're at the
                   * bottom.
                   */
                  if (addr.level == FSM_BOTTOM_LEVEL)
                        return fsm_get_heap_blk(addr, slot);

                  addr = fsm_get_child(addr, slot);
            }
            else if (addr.level == FSM_ROOT_LEVEL)
            {
                  /*
                   * At the root, failure means there's no page with enough free
                   * space in the FSM. Give up.
                   */
                  return InvalidBlockNumber;
            }
            else
            {
                  uint16 parentslot;
                  FSMAddress parent;

                  /*
                   * At a lower level, failure can happen if the value in the upper-
                   * level node didn't reflect the value on the lower page. Update
                   * the upper node, to avoid falling into the same trap again, and
                   * start over.
                   *
                   * There's a race condition here, if another backend updates this
                   * page right after we release it, and gets the lock on the parent
                   * page before us. We'll then update the parent page with the now
                   * stale information we had. It's OK, because it should happen
                   * rarely, and will be fixed by the next vacuum.
                   */
                  parent = fsm_get_parent(addr, &parentslot);
                  fsm_set_and_search(rel, parent, parentslot, max_avail, 0);

                  /*
                   * If the upper pages are badly out of date, we might need to
                   * loop quite a few times, updating them as we go. Any
                   * inconsistencies should eventually be corrected and the loop
                   * should end. Looping indefinitely is nevertheless scary, so
                   * provide an emergency valve.
                   */
                  if (restarts++ > 10000)
                        return InvalidBlockNumber;

                  /* Start search all over from the root */
                  addr = FSM_ROOT_ADDRESS;
            }
      }
}
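
/*
 * Illustrative scenario for the retry logic above: an upper-level slot may
 * still advertise category 200 for a bottom-level page whose actual maximum
 * has dropped to 50, because RecordPageWithFreeSpace only updates the bottom
 * level. A search for category 100 then descends into that page, finds
 * nothing, writes the true maximum (50) back into the parent slot via
 * fsm_set_and_search(), and restarts from the root, so the same stale path
 * isn't taken again.
 */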


/*
 * Recursive guts of FreeSpaceMapVacuum
 */
static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
{
      Buffer buf;
      Page page;
      uint8 max_avail;

      /* Read the page if it exists, or return EOF */
      buf = fsm_readbuf(rel, addr, false);
      if (!BufferIsValid(buf))
      {
            *eof_p = true;
            return 0;
      }
      else
            *eof_p = false;

      page = BufferGetPage(buf);

      /*
       * Recurse into children, and fix the information stored about them
       * at this level.
       */
      if (addr.level > FSM_BOTTOM_LEVEL)
      {
            int slot;
            bool eof = false;

            for (slot = 0; slot < SlotsPerFSMPage; slot++)
            {
                  int child_avail;

                  /* After we hit end-of-file, just clear the rest of the slots */
                  if (!eof)
                        child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
                  else
                        child_avail = 0;

                  /* Update information about the child */
                  if (fsm_get_avail(page, slot) != child_avail)
                  {
                        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                        fsm_set_avail(BufferGetPage(buf), slot, child_avail);
                        MarkBufferDirty(buf);
                        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                  }
            }
      }

      max_avail = fsm_get_max_avail(BufferGetPage(buf));

      /*
       * Reset the next slot pointer. This encourages the use of low-numbered
       * pages, increasing the chances that a later vacuum can truncate the
       * relation.
       */
      ((FSMPage) PageGetContents(page))->fp_next_slot = 0;

      ReleaseBuffer(buf);

      return max_avail;
}
