/* Source listing: smgr.c, from the postgresql-8.4 package */

/*-------------------------------------------------------------------------
 *
 * smgr.c
 *      public interface routines to storage manager switch.
 *
 *      All file system operations in POSTGRES dispatch through these
 *      routines.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.117 2009/06/11 14:49:02 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "commands/tablespace.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"


/*
 * This struct of function pointers defines the API between smgr.c and
 * any individual storage manager module.  Note that smgr subfunctions are
 * generally expected to report problems via elog(ERROR).  An exception is
 * that smgr_unlink should use elog(WARNING), rather than erroring out,
 * because we normally unlink relations during post-commit/abort cleanup,
 * and so it's too late to raise an error.  Also, various conditions that
 * would normally be errors should be allowed during bootstrap and/or WAL
 * recovery --- see comments in md.c for details.
 */
00039 typedef struct f_smgr
{
      void        (*smgr_init) (void);    /* may be NULL */
      void        (*smgr_shutdown) (void);            /* may be NULL */
      void        (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
      void        (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
                                                                  bool isRedo);
      bool        (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
      void        (*smgr_unlink) (RelFileNode rnode, ForkNumber forknum,
                                                                  bool isRedo);
      void        (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
                                          BlockNumber blocknum, char *buffer, bool isTemp);
      void        (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
                                                                    BlockNumber blocknum);
      void        (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
                                                              BlockNumber blocknum, char *buffer);
      void        (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
                                          BlockNumber blocknum, char *buffer, bool isTemp);
      BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
      void        (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
                                                               BlockNumber nblocks, bool isTemp);
      void        (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
      void        (*smgr_pre_ckpt) (void);            /* may be NULL */
      void        (*smgr_sync) (void);    /* may be NULL */
      void        (*smgr_post_ckpt) (void);           /* may be NULL */
} f_smgr;


/*
 * The storage manager switch itself.  Initializers are positional, so each
 * line is labelled with the f_smgr member it fills.
 */
static const f_smgr smgrsw[] = {
      /* magnetic disk */
      {
            mdinit,                 /* smgr_init */
            NULL,                   /* smgr_shutdown */
            mdclose,                /* smgr_close */
            mdcreate,               /* smgr_create */
            mdexists,               /* smgr_exists */
            mdunlink,               /* smgr_unlink */
            mdextend,               /* smgr_extend */
            mdprefetch,             /* smgr_prefetch */
            mdread,                 /* smgr_read */
            mdwrite,                /* smgr_write */
            mdnblocks,              /* smgr_nblocks */
            mdtruncate,             /* smgr_truncate */
            mdimmedsync,            /* smgr_immedsync */
            mdpreckpt,              /* smgr_pre_ckpt */
            mdsync,                 /* smgr_sync */
            mdpostckpt              /* smgr_post_ckpt */
      }
};

/* Number of entries in smgrsw[] */
static const int NSmgr = lengthof(smgrsw);


/*
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
 *
 * The table is built lazily by smgropen() on its first call, so this pointer
 * stays NULL until then.  smgrcloseall() and smgrclosenode() treat a NULL
 * pointer as "nothing to do".
 */
static HTAB *SMgrRelationHash = NULL;

/* local function prototypes */

/* on_proc_exit callback registered by smgrinit() */
static void smgrshutdown(int code, Datum arg);

/* common unlink path called from smgrdounlink() */
static void smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
                               int which, bool isTemp, bool isRedo);


/*
 *    smgrinit(), smgrshutdown() -- Initialize or shut down storage
 *                                                managers.
 *
 * Note: smgrinit is called during backend startup (normal or standalone
 * case), *not* during postmaster start.  Therefore, any resources created
 * here or destroyed in smgrshutdown are backend-local.
 */
void
smgrinit(void)
{
      int               i;

      for (i = 0; i < NSmgr; i++)
      {
            if (smgrsw[i].smgr_init)
                  (*(smgrsw[i].smgr_init)) ();
      }

      /* register the shutdown proc */
      on_proc_exit(smgrshutdown, 0);
}

/*
 * Backend-shutdown cleanup for smgr, installed as an on_proc_exit hook.
 *
 * Runs the shutdown hook of every storage manager that supplies one.
 * Neither argument is looked at.
 */
static void
smgrshutdown(int code, Datum arg)
{
      int               which = 0;

      while (which < NSmgr)
      {
            void        (*shutdown_hook) (void) = smgrsw[which].smgr_shutdown;

            if (shutdown_hook != NULL)
                  shutdown_hook();
            which++;
      }
}

/*
 *    smgropen() -- Hand back the SMgrRelation object for rnode, making a
 *                        new one if none exists yet.
 *
 *          Nothing is actually opened here; a freshly made object has every
 *          fork marked as not open.
 */
SMgrRelation
smgropen(RelFileNode rnode)
{
      SMgrRelation entry;
      bool        already_there;
      int               fork;

      /* The hash table is built lazily, on the first call */
      if (SMgrRelationHash == NULL)
      {
            HASHCTL           hashinfo;

            MemSet(&hashinfo, 0, sizeof(hashinfo));
            hashinfo.keysize = sizeof(RelFileNode);
            hashinfo.entrysize = sizeof(SMgrRelationData);
            hashinfo.hash = tag_hash;
            SMgrRelationHash = hash_create("smgr relation table", 400,
                                           &hashinfo,
                                           HASH_ELEM | HASH_FUNCTION);
      }

      /* Find the entry for rnode, adding one when there is none */
      entry = (SMgrRelation) hash_search(SMgrRelationHash,
                                         (void *) &rnode,
                                         HASH_ENTER, &already_there);

      /* A pre-existing entry is returned as it stands */
      if (already_there)
            return entry;

      /*
       * Brand-new entry: hash_search has already stored the lookup key, so
       * only the remaining fields need setting up.
       */
      entry->smgr_owner = NULL;
      entry->smgr_which = 0;  /* we only have md.c at present */

      /* no fork is open yet */
      for (fork = 0; fork <= MAX_FORKNUM; fork++)
            entry->md_fd[fork] = NULL;

      return entry;
}

/*
 * smgrsetowner() -- Set up a long-lived reference to an SMgrRelation object
 *
 * An object has at most one owner at any time.  That suffices because, at
 * present, the only such owners live in the relcache.
 */
void
smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
{
      SMgrRelation *prev_owner = reln->smgr_owner;

      /*
       * Detach a previous owner first, if there is one.  (Ordinarily there
       * won't be, but it seems possible for this to occur during
       * swap_relation_files(), depending on processing order.  Closing the
       * old relcache entry early is fine in that case.)
       */
      if (prev_owner != NULL)
            *prev_owner = NULL;

      /* Link the object and its new owner to each other. */
      reln->smgr_owner = owner;
      *owner = reln;
}

/*
 *    smgrexists() -- Report whether the underlying file for a fork exists.
 *
 *          The answer comes from the relation's storage manager.
 */
bool
smgrexists(SMgrRelation reln, ForkNumber forknum)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];

      return mgr->smgr_exists(reln, forknum);
}

/*
 *    smgrclose() -- Close an SMgrRelation object and delete it.
 *
 *          Every fork is closed, the object is removed from the hash table,
 *          and the owner's pointer to it (if any) is reset to NULL.
 */
void
smgrclose(SMgrRelation reln)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];
      SMgrRelation *owner;
      ForkNumber  forknum;
      void       *removed;

      /* Close each fork in turn */
      for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
            mgr->smgr_close(reln, forknum);

      /* Note the owner now; the entry is about to leave the table */
      owner = reln->smgr_owner;

      removed = hash_search(SMgrRelationHash,
                            (void *) &(reln->smgr_rnode),
                            HASH_REMOVE, NULL);
      if (removed == NULL)
            elog(ERROR, "SMgrRelation hashtable corrupted");

      /*
       * Detach the owner pointer, if there is one.  This comes last because,
       * in the unlikely event that the removal above fails, the SMgrRelation
       * object still exists.
       */
      if (owner != NULL)
            *owner = NULL;
}

/*
 *    smgrcloseall() -- Close all existing SMgrRelation objects.
 */
void
smgrcloseall(void)
{
      HASH_SEQ_STATUS status;
      SMgrRelation reln;

      /* Nothing to do if hashtable not set up */
      if (SMgrRelationHash == NULL)
            return;

      hash_seq_init(&status, SMgrRelationHash);

      while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
            smgrclose(reln);
}

/*
 *    smgrclosenode() -- Close the SMgrRelation object for the given
 *                               RelFileNode, provided one exists.
 *
 * The effect matches smgrclose(smgropen(rnode)), except that when no entry
 * exists yet, none is pointlessly created in the hash table just to be
 * dropped again.
 */
void
smgrclosenode(RelFileNode rnode)
{
      SMgrRelation existing;

      /* No hash table means there cannot be an entry */
      if (SMgrRelationHash == NULL)
            return;

      /* Plain lookup: never creates an entry */
      existing = (SMgrRelation) hash_search(SMgrRelationHash,
                                            (void *) &rnode,
                                            HASH_FIND, NULL);
      if (existing == NULL)
            return;

      smgrclose(existing);
}

/*
 *    smgrcreate() -- Create a new relation.
 *
 *          Takes an SMgrRelation that already exists (but is presumably not
 *          in use yet) and has the disk file, or other storage, underlying
 *          the given fork created.
 *
 *          With isRedo true we are replaying WAL, so the underlying file is
 *          allowed to exist already.
 */
void
smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
      const f_smgr *mgr;
      bool        already_open;

      /*
       * During WAL replay, leave at once if the file has been opened before:
       * a file that is open surely must exist.
       */
      already_open = isRedo && (reln->md_fd[forknum] != NULL);
      if (already_open)
            return;

      /*
       * This may be the first use of the target table space in this
       * database, so make the per-database subdirectory if it is missing.
       *
       * XXX this violates module layering in a fairly ugly way, yet it seems
       * the best place for the check.  Perhaps TablespaceCreateDbspace
       * belongs here instead of in commands/tablespace.c?  That, however,
       * would mean importing a lot of stuff smgr.c oughtn't know either.
       */
      TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
                              reln->smgr_rnode.dbNode,
                              isRedo);

      /* Let the relation's storage manager do the actual creation */
      mgr = &smgrsw[reln->smgr_which];
      mgr->smgr_create(reln, forknum, isRedo);
}

/*
 *    smgrdounlink() -- Unlink a relation right away.
 *
 *          Removes the given fork of the relation from the store.  Because
 *          this cannot be undone, it should not be used during transactional
 *          operations.
 *
 *          With isRedo true, the underlying file is allowed to be gone
 *          already.
 */
void
smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool isTemp, bool isRedo)
{
      RelFileNode target;
      int               mgr_index;

      /* Copy what the unlink step needs out of reln before closing */
      target = reln->smgr_rnode;
      mgr_index = reln->smgr_which;

      /* Close the fork, then unlink using the copies */
      smgrsw[mgr_index].smgr_close(reln, forknum);

      smgr_internal_unlink(target, forknum, mgr_index, isTemp, isRedo);
}

/*
 * Common subroutine that carries out the unlink itself: drop the fork's
 * buffers, then have storage manager number "which" remove the files.
 */
static void
smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
                               int which, bool isTemp, bool isRedo)
{
      const f_smgr *mgr = &smgrsw[which];

      /*
       * Discard whatever buffers remain for the relation.  bufmgr simply
       * drops them and does not bother writing their contents out.
       */
      DropRelFileNodeBuffers(rnode, forknum, isTemp, 0);

      /*
       * Telling the stats collector to forget the relation right away would
       * be nice as well.  We can't, though: the OID is not known here (and
       * where relfilenode swaps are involved it isn't always clear which
       * table OID should be forgotten anyway).
       */

      /*
       * Finally remove the physical files.
       *
       * Note: a failed deletion has to be a WARNING from smgr_unlink, not an
       * ERROR, since the decision to commit or abort the current xact has
       * already been taken.
       */
      mgr->smgr_unlink(rnode, forknum, isRedo);
}

/*
 *    smgrextend() -- Append a new block to a file.
 *
 *          Behaves almost like smgrwrite(): the buffer is written at the
 *          given position.  This entry point, though, is meant for extending
 *          a relation (i.e., blocknum is at or beyond the current EOF).  We
 *          assume that writing a block beyond the current EOF makes the
 *          intervening file space become filled with zeroes.
 */
void
smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
               char *buffer, bool isTemp)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];

      mgr->smgr_extend(reln, forknum, blocknum, buffer, isTemp);
}

/*
 *    smgrprefetch() -- Start an asynchronous read of the given block of a
 *                              relation.
 */
void
smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];

      mgr->smgr_prefetch(reln, forknum, blocknum);
}

/*
 *    smgrread() -- Fetch one particular block of a relation into the
 *                        caller's buffer.
 *
 *          The buffer manager calls this to instantiate pages in the shared
 *          buffer cache.  Every storage manager returns pages in the format
 *          POSTGRES expects.
 */
void
smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
             char *buffer)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];

      mgr->smgr_read(reln, forknum, blocknum, buffer);
}

/*
 *    smgrwrite() -- Write out the supplied buffer.
 *
 *          Only for updating blocks that already exist in the relation (ie,
 *          blocks before the current EOF).  Use smgrextend() to extend a
 *          relation.
 *
 *          The write is not synchronous: at return the block has only been
 *          dumped out to the kernel and is not necessarily on disk.
 *          Provisions will be made, however, to fsync the write before the
 *          next checkpoint.
 *
 *          isTemp says the relation is a temp table (ie, one managed by the
 *          local-buffer manager); for those, no provisions need be made to
 *          fsync the write before checkpointing.
 */
void
smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
              char *buffer, bool isTemp)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];

      mgr->smgr_write(reln, forknum, blocknum, buffer, isTemp);
}

/*
 *    smgrnblocks() -- Work out how many blocks the supplied relation fork
 *                             contains.
 */
BlockNumber
smgrnblocks(SMgrRelation reln, ForkNumber forknum)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];
      BlockNumber nblocks;

      nblocks = mgr->smgr_nblocks(reln, forknum);

      return nblocks;
}

/*
 *    smgrtruncate() -- Cut the supplied relation down to the given number
 *                              of blocks
 */
void
smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,
                   bool isTemp)
{
      /*
       * Discard the buffers for the blocks that are about to disappear.
       * bufmgr simply drops them and does not bother writing their contents.
       */
      DropRelFileNodeBuffers(reln->smgr_rnode, forknum, isTemp, nblocks);

      /* Now truncate the storage itself */
      smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks, isTemp);
}

/*
 *    smgrimmedsync() -- Push the specified relation out to stable storage.
 *
 *          All writes made so far to the specified relation are forced down
 *          to disk synchronously.
 *
 *          The main use is building completely new relations (new indexes,
 *          for instance).  Rather than WAL-logging each step of the index
 *          build, the completed index pages can simply be written to disk
 *          with smgrwrite or smgrextend and the finished index file fsync'd
 *          before the transaction commits.  (For crash recovery that is
 *          enough, because it effectively duplicates forcing a checkpoint
 *          for the completed index.  It is *not* enough when the WAL log is
 *          to be used for PITR or replication purposes: then WAL entries
 *          have to be made as well.)
 *
 *          Pass isTemp = true on the preceding writes so that fsyncs are
 *          not duplicated.
 *
 *          If the relation could possibly have dirty buffers,
 *          FlushRelationBuffers() has to be done first; otherwise the sync
 *          is not very meaningful.
 */
void
smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
{
      const f_smgr *mgr = &smgrsw[reln->smgr_which];

      mgr->smgr_immedsync(reln, forknum);
}


/*
 *    smgrpreckpt() -- Prepare for checkpoint.
 */
void
smgrpreckpt(void)
{
      int               i;

      for (i = 0; i < NSmgr; i++)
      {
            if (smgrsw[i].smgr_pre_ckpt)
                  (*(smgrsw[i].smgr_pre_ckpt)) ();
      }
}

/*
 *    smgrsync() -- Sync files to disk during checkpoint.
 */
void
smgrsync(void)
{
      int               i;

      for (i = 0; i < NSmgr; i++)
      {
            if (smgrsw[i].smgr_sync)
                  (*(smgrsw[i].smgr_sync)) ();
      }
}

/*
 *    smgrpostckpt() -- Post-checkpoint cleanup.
 */
void
smgrpostckpt(void)
{
      int               i;

      for (i = 0; i < NSmgr; i++)
      {
            if (smgrsw[i].smgr_post_ckpt)
                  (*(smgrsw[i].smgr_post_ckpt)) ();
      }
}

/* Generated by Doxygen 1.6.0 */