Logo Search packages:      
Sourcecode: postgresql-8.4 version File versions  Download package

fd.c

/*-------------------------------------------------------------------------
 *
 * fd.c
 *      Virtual file descriptor code.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *      $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.149.2.1 2009/12/03 11:03:35 heikki Exp $
 *
 * NOTES:
 *
 * This code manages a cache of 'virtual' file descriptors (VFDs).
 * The server opens many file descriptors for a variety of reasons,
 * including base tables, scratch files (e.g., sort and hash spool
 * files), and random calls to C library routines like system(3); it
 * is quite easy to exceed system limits on the number of open files a
 * single process can have.  (This is around 256 on many modern
 * operating systems, but can be as low as 32 on others.)
 *
 * VFDs are managed as an LRU pool, with actual OS file descriptors
 * being opened and closed as needed.  Obviously, if a routine is
 * opened using these interfaces, all subsequent operations must also
 * be through these interfaces (the File type is not a real file
 * descriptor).
 *
 * For this scheme to work, most (if not all) routines throughout the
 * server should use these interfaces instead of calling the C library
 * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
 * may find ourselves short of real file descriptors anyway.
 *
 * This file used to contain a bunch of stuff to support RAID levels 0
 * (jbod), 1 (duplex) and 5 (xor parity).  That stuff is all gone
 * because the parallel query processing code that called it is all
 * gone.  If you really need it you could get it from the original
 * POSTGRES source.
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <sys/file.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#ifdef HAVE_SYS_RESOURCE_H
#include <sys/resource.h>           /* for getrlimit */
#endif

#include "miscadmin.h"
#include "access/xact.h"
#include "catalog/pg_tablespace.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/resowner.h"


/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 */
#define NUM_RESERVED_FDS            10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke.
 */
#define FD_MINFREE                        10


/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 */
int               max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir operations.  This is initialized to a conservative
 * value, and remains that way indefinitely in bootstrap or standalone-backend
 * cases.  In normal postmaster operation, the postmaster calls
 * set_max_safe_fds() late in initialization to update the value, and that
 * value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 */
static int  max_safe_fds = 32;      /* default if not changed */


/* Debugging.... */

#ifdef FDDEBUG
#define DO_DB(A) A
#else
#define DO_DB(A)                    /* A */
#endif

/* Value stored in Vfd.fd while the VFD has no kernel descriptor assigned */
#define VFD_CLOSED (-1)

/*
 * True if 'file' is an in-range index of an in-use VFD, i.e. one whose
 * fileName is set.  Index 0 is never valid.
 */
#define FileIsValid(file) \
      ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/*
 * True if the VFD currently has no kernel FD.  No range check is done here,
 * so 'file' must already be a usable index into VfdCache.
 */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* NOTE(review): presumably marks an unknown seekPos; not referenced in the visible part of the file */
#define FileUnknownPos ((off_t) -1)

/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY          (1 << 0)    /* T = delete when closed */
#define FD_XACT_TEMPORARY     (1 << 1)    /* T = delete at eoXact */

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files to
 * close
 */
static bool have_xact_temporary_files = false;

/*
 * One slot of the VfdCache array.  A slot is in use when fileName is
 * non-NULL; it additionally holds a kernel descriptor when fd is not
 * VFD_CLOSED.  The File links below are indexes into VfdCache, not pointers.
 */
typedef struct vfd
{
      int               fd;                     /* current FD, or VFD_CLOSED if none */
      unsigned short fdstate;       /* bitflags for VFD's state */
      ResourceOwner resowner;       /* owner, for automatic cleanup */
      File        nextFree;         /* link to next free VFD, if in freelist */
      File        lruMoreRecently;  /* doubly linked recency-of-use list */
      File        lruLessRecently;
      off_t       seekPos;          /* current logical file position */
      char     *fileName;           /* name of file, or NULL for unused VFD */
      /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
      int               fileFlags;        /* open(2) flags for (re)opening the file */
      int               fileMode;         /* mode to pass to open(2) */
} Vfd;

/*
 * Virtual File Descriptor array pointer and size.    This grows as
 * needed.  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 */
static int  nfile = 0;

/*
 * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
 * and AllocateDir.
 *
 * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
 * it seems OK to put a pretty small maximum limit on the number of
 * simultaneously allocated descs.
 */
#define MAX_ALLOCATED_DESCS  32

/* Discriminator telling which member of AllocateDesc.desc is in use */
typedef enum
{
      AllocateDescFile,             /* desc.file holds a stdio FILE */
      AllocateDescDir               /* desc.dir holds a DIR */
} AllocateDescKind;

/*
 * One entry of the allocatedDescs[] array declared below.
 */
typedef struct
{
      AllocateDescKind kind;
      union
      {
            FILE     *file;
            DIR            *dir;
      }                 desc;
      /* NOTE(review): presumably the subtransaction that allocated the desc; confirm against AllocateFile/AllocateDir */
      SubTransactionId create_subid;
} AllocateDesc;

static int  numAllocatedDescs = 0;
static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];

/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  When numTempTableSpaces is -1,
 * this has not been set in the current transaction.
 */
static Oid *tempTableSpaces = NULL;
static int  numTempTableSpaces = -1;
static int  nextTempTableSpace = 0;


/*--------------------
 *
 * Private Routines
 *
 * Delete            - delete a file from the Lru ring
 * LruDelete         - remove a file from the Lru ring and close its FD
 * Insert            - put a file at the front of the Lru ring
 * LruInsert         - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * AllocateVfd       - grab a free (or new) file record (from VfdCache)
 * FreeVfd           - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero.  Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED.    Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring.  Elements that are "virtually" open can be recognized
 * by having a non-null fileName field.
 *
 * example:
 *
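 *     (read "A --more--> B" as A.lruMoreRecently == B, and
 *      "A <--less-- B" as B.lruLessRecently == A)
 *
 *     #0 --more--> LeastRecentlyUsed --more--> MostRecentlyUsedFile --more--> #0
 *     #0 <--less-- LeastRecentlyUsed <--less-- MostRecentlyUsedFile <--less-- #0
 *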
 *
 *--------------------
 */
/* LRU ring and VFD slot management (see the overview comment above) */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int  LruInsert(File file);
static bool ReleaseLruFile(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

/* Other private helpers */
static int  FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);


/*
 * pg_fsync --- do fsync with or without writethrough
 */
int
pg_fsync(int fd)
{
      /*
       * Where the platform is not restricted to write-through fsync, every
       * sync_method other than fsync_writethrough takes the ordinary path.
       */
#ifndef HAVE_FSYNC_WRITETHROUGH_ONLY
      if (sync_method != SYNC_METHOD_FSYNC_WRITETHROUGH)
            return pg_fsync_no_writethrough(fd);
#endif

      return pg_fsync_writethrough(fd);
}


/*
 * pg_fsync_no_writethrough --- same as fsync except does nothing if
 *    enableFsync is off
 */
int
pg_fsync_no_writethrough(int fd)
{
      /* With fsync disabled there is nothing to do; report success. */
      if (!enableFsync)
            return 0;

      return fsync(fd);
}

/*
 * pg_fsync_writethrough --- request a write-through flush of fd
 *
 * Does nothing and returns 0 if enableFsync is off.  Otherwise uses
 * _commit() on WIN32 or fcntl(F_FULLFSYNC) where that is defined, returning
 * 0 on success and -1 on failure.
 *
 * On platforms offering neither primitive the call always fails with -1;
 * errno is set to ENOSYS in that case so that callers reporting the failure
 * with %m print a meaningful reason rather than a stale errno value.
 */
int
pg_fsync_writethrough(int fd)
{
      if (enableFsync)
      {
#ifdef WIN32
            return _commit(fd);
#elif defined(F_FULLFSYNC)
            return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
#else
            errno = ENOSYS;
            return -1;
#endif
      }
      else
            return 0;
}

/*
 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
 *
 * Not all platforms have fdatasync; treat as fsync if not available.
 */
int
pg_fdatasync(int fd)
{
      /* Syncing is disabled: succeed without touching the descriptor. */
      if (!enableFsync)
            return 0;

#ifdef HAVE_FDATASYNC
      return fdatasync(fd);
#else
      /* no fdatasync on this platform; fsync is the substitute */
      return fsync(fd);
#endif
}

/*
 * InitFileAccess --- initialize this module during backend startup
 *
 * This is called during either normal or standalone backend start.
 * It is *not* called in the postmaster.
 */
void
InitFileAccess(void)
{
      Assert(SizeVfdCache == 0);    /* call me only once */

      /* initialize cache header entry */
      VfdCache = (Vfd *) malloc(sizeof(Vfd));
      if (VfdCache == NULL)
            ereport(FATAL,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory")));

      MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
      VfdCache->fd = VFD_CLOSED;

      SizeVfdCache = 1;

      /* register proc-exit hook to ensure temp files are dropped at exit */
      on_proc_exit(AtProcExit_Files, 0);
}

/*
 * count_usable_fds --- count how many FDs the system will let us open,
 *          and estimate how many are already open.
 *
 * We stop counting if usable_fds reaches max_to_probe.  Note: a small
 * value of max_to_probe might result in an underestimate of already_open;
 * we must fill in any "gaps" in the set of used FDs before the calculation
 * of already_open will give the right answer.  In practice, max_to_probe
 * of a couple of dozen should be enough to ensure good results.
 *
 * We assume stdin (FD 0) is available for dup'ing
 */
/*
 * Probes by repeatedly dup()'ing FD 0 and remembering every descriptor
 * obtained, then closes them all again.
 *
 * max_to_probe: stop after this many successful dups.
 * *usable_fds:  set to the number of successful dups.
 * *already_open: set to highestfd + 1 - *usable_fds.
 */
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{
      int            *fd;                 /* array of descriptors obtained so far */
      int               size;             /* allocated length of fd[] */
      int               used = 0;         /* number of entries filled in fd[] */
      int               highestfd = 0;    /* largest descriptor number seen */
      int               j;

#ifdef HAVE_GETRLIMIT
      struct rlimit rlim;
      int               getrlimit_status;
#endif

      size = 1024;
      fd = (int *) palloc(size * sizeof(int));

#ifdef HAVE_GETRLIMIT
#ifdef RLIMIT_NOFILE                /* most platforms use RLIMIT_NOFILE */
      getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
#else /* but BSD doesn't ... */
      getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
#endif   /* RLIMIT_NOFILE */
      /* a getrlimit failure is only warned about; the probe loop then ignores rlim */
      if (getrlimit_status != 0)
            ereport(WARNING, (errmsg("getrlimit failed: %m")));
#endif   /* HAVE_GETRLIMIT */

      /* dup until failure or probe limit reached */
      for (;;)
      {
            int               thisfd;

#ifdef HAVE_GETRLIMIT

            /*
             * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
             * some platforms
             */
            /* NOTE(review): compares an int with rlim_cur - 1; rlim_cur is presumably unsigned -- confirm the conversion is intended */
            if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
                  break;
#endif

            thisfd = dup(0);
            if (thisfd < 0)
            {
                  /* Expect EMFILE or ENFILE, else it's fishy */
                  if (errno != EMFILE && errno != ENFILE)
                        elog(WARNING, "dup(0) failed after %d successes: %m", used);
                  break;
            }

            /* double the array whenever it is full */
            if (used >= size)
            {
                  size *= 2;
                  fd = (int *) repalloc(fd, size * sizeof(int));
            }
            fd[used++] = thisfd;

            if (highestfd < thisfd)
                  highestfd = thisfd;

            if (used >= max_to_probe)
                  break;
      }

      /* release the files we opened */
      for (j = 0; j < used; j++)
            close(fd[j]);

      pfree(fd);

      /*
       * Return results.      usable_fds is just the number of successful dups. We
       * assume that the system limit is highestfd+1 (remember 0 is a legal FD
       * number) and so already_open is highestfd+1 - usable_fds.
       */
      *usable_fds = used;
      *already_open = highestfd + 1 - used;
}

/*
 * set_max_safe_fds
 *          Determine number of filedescriptors that fd.c is allowed to use
 */
void
set_max_safe_fds(void)
{
      int               probed_fds;
      int               preopened_fds;
      int               guc_headroom;

      /*
       * Probe how many descriptors can actually be opened, never probing
       * past max_files_per_process.
       */
      count_usable_fds(max_files_per_process,
                               &probed_fds, &preopened_fds);

      /*
       * The budget is whichever is smaller: what the probe found, or what
       * max_files_per_process leaves once already-open files are counted.
       */
      guc_headroom = max_files_per_process - preopened_fds;
      if (probed_fds < guc_headroom)
            max_safe_fds = probed_fds;
      else
            max_safe_fds = guc_headroom;

      /* Keep back the descriptors reserved for code that bypasses fd.c. */
      max_safe_fds -= NUM_RESERVED_FDS;

      /* Refuse to run with too small a budget. */
      if (max_safe_fds < FD_MINFREE)
            ereport(FATAL,
                        (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                         errmsg("insufficient file descriptors available to start server process"),
                         errdetail("System allows %d, we need at least %d.",
                                       max_safe_fds + NUM_RESERVED_FDS,
                                       FD_MINFREE + NUM_RESERVED_FDS)));

      elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
             max_safe_fds, probed_fds, preopened_fds);
}

/*
 * BasicOpenFile --- same as open(2) except can free other FDs if needed
 *
 * This is exported for use by places that really want a plain kernel FD,
 * but need to be proof against running out of FDs.  Once an FD has been
 * successfully returned, it is the caller's responsibility to ensure that
 * it will not be leaked on ereport()!    Most users should *not* call this
 * routine directly, but instead use the VFD abstraction level, which
 * provides protection against descriptor leaks as well as management of
 * files that need to be open for more than a short period of time.
 *
 * Ideally this should be the *only* direct call of open() in the backend.
 * In practice, the postmaster calls open() directly, and there are some
 * direct open() calls done early in backend startup.  Those are OK since
 * this module wouldn't have any open files to close at that point anyway.
 */
int
BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
{
      /*
       * Keep trying as long as each failure is due to descriptor exhaustion
       * and we can still give one of our own descriptors back.
       */
      for (;;)
      {
            int               open_errno;
            int               fd = open(fileName, fileFlags, fileMode);

            if (fd >= 0)
                  return fd;                    /* success! */

            /* any other kind of failure is reported to the caller as-is */
            if (errno != EMFILE && errno != ENFILE)
                  return -1;

            open_errno = errno;

            ereport(LOG,
                        (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                         errmsg("out of file descriptors: %m; release and retry")));
            errno = 0;
            if (!ReleaseLruFile())
            {
                  /* nothing left to release: fail with open()'s errno */
                  errno = open_errno;
                  return -1;
            }
      }
}

#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the VFD indexes found by walking the LRU
 * ring from the most recently used entry via the lruLessRecently links until
 * the header (index 0) is reached.
 */
static void
_dump_lru(void)
{
      int               mru = VfdCache[0].lruLessRecently;
      Vfd            *vfdP = &VfdCache[mru];
      char        buf[2048];

      snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
      while (mru != 0)
      {
            mru = vfdP->lruLessRecently;
            vfdP = &VfdCache[mru];
            snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
      }
      snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");

      /* never hand a computed string to elog as the format itself */
      elog(LOG, "%s", buf);
}
#endif   /* FDDEBUG */

static void
Delete(File file)
{
      Vfd            *entry;
      File        older;
      File        newer;

      Assert(file != 0);

      DO_DB(elog(LOG, "Delete %d (%s)",
                     file, VfdCache[file].fileName));
      DO_DB(_dump_lru());

      entry = &VfdCache[file];
      older = entry->lruLessRecently;
      newer = entry->lruMoreRecently;

      /* splice the two neighbours together, bypassing this entry */
      VfdCache[older].lruMoreRecently = newer;
      VfdCache[newer].lruLessRecently = older;

      DO_DB(_dump_lru());
}

/*
 * LruDelete --- unlink 'file' from the LRU ring and close its kernel FD,
 * remembering the current file offset in seekPos first.  The VFD slot
 * itself stays allocated.
 */
static void
LruDelete(File file)
{
      Vfd            *vfdP;

      Assert(file != 0);

      DO_DB(elog(LOG, "LruDelete %d (%s)",
                     file, VfdCache[file].fileName));

      vfdP = &VfdCache[file];

      /* delete the vfd record from the LRU ring */
      Delete(file);

      /* save the seek position */
      vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
      Assert(vfdP->seekPos != (off_t) -1);

      /* close the file */
      /*
       * NOTE(review): if elog(ERROR) does not return here, the entry is
       * already out of the ring while nfile and vfdP->fd still describe it as
       * open -- confirm this is acceptable.
       */
      if (close(vfdP->fd))
            elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);

      /* one fewer kernel FD in use; mark the VFD as only virtually open */
      --nfile;
      vfdP->fd = VFD_CLOSED;
}

static void
Insert(File file)
{
      Vfd            *entry;
      Vfd            *anchor;

      Assert(file != 0);

      DO_DB(elog(LOG, "Insert %d (%s)",
                     file, VfdCache[file].fileName));
      DO_DB(_dump_lru());

      entry = &VfdCache[file];
      anchor = &VfdCache[0];

      /* the new entry sits between the previous front entry and the anchor */
      entry->lruLessRecently = anchor->lruLessRecently;
      entry->lruMoreRecently = 0;

      /* ... and both of those neighbours now point at it */
      anchor->lruLessRecently = file;
      VfdCache[entry->lruLessRecently].lruMoreRecently = file;

      DO_DB(_dump_lru());
}

/*
 * LruInsert --- make sure 'file' has a kernel FD, then put it at the front
 * of the LRU ring.
 *
 * Returns 0 on success, or the negative result of the failed re-open (with
 * errno set).
 */
static int
LruInsert(File file)
{
      Vfd            *entry;

      Assert(file != 0);

      DO_DB(elog(LOG, "LruInsert %d (%s)",
                     file, VfdCache[file].fileName));

      entry = &VfdCache[file];

      if (FileIsNotOpen(file))
      {
            /* give back descriptors until under budget or none remain */
            while (nfile + numAllocatedDescs >= max_safe_fds &&
                     ReleaseLruFile())
                  ;

            /*
             * Even now the open can fail for lack of descriptors, e.g. when
             * the system-wide file table is full; BasicOpenFile copes with
             * that by releasing further FDs as needed.
             */
            entry->fd = BasicOpenFile(entry->fileName, entry->fileFlags,
                                                  entry->fileMode);
            if (entry->fd < 0)
            {
                  DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
                  return entry->fd;
            }

            DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
            ++nfile;

            /* restore the remembered file position, if it isn't the start */
            if (entry->seekPos != (off_t) 0)
            {
                  off_t       newPos;

                  newPos = lseek(entry->fd, entry->seekPos, SEEK_SET);
                  Assert(newPos != (off_t) -1);
            }
      }

      /* it is now the most recently used file */
      Insert(file);

      return 0;
}

static bool
ReleaseLruFile(void)
{
      DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));

      /* no files available to free */
      if (nfile <= 0)
            return false;

      /*
       * Some file is open, so the ring must contain at least one real entry;
       * the one just after the anchor is the least recently used.
       */
      Assert(VfdCache[0].lruMoreRecently != 0);
      LruDelete(VfdCache[0].lruMoreRecently);

      return true;                        /* freed a file */
}

/*
 * AllocateVfd --- take a VFD slot off the free list and return its index.
 *
 * The free list is chained through nextFree, starting at VfdCache[0].  When
 * it is empty the array is enlarged (doubled, to a minimum of 32 entries)
 * and the new entries are linked into the free list.  Raises ERROR if the
 * array cannot be enlarged.
 *
 * Note that enlarging may move VfdCache, so pointers into the array obtained
 * before this call must not be used afterwards.
 */
static File
AllocateVfd(void)
{
      Index       i;
      File        file;

      /* Size need not be unsigned long, so cast to match the %lu format */
      DO_DB(elog(LOG, "AllocateVfd. Size %lu", (unsigned long) SizeVfdCache));

      Assert(SizeVfdCache > 0);     /* InitFileAccess not called? */

      if (VfdCache[0].nextFree == 0)
      {
            /*
             * The free list is empty so it is time to increase the size of the
             * array.  We choose to double it each time this happens. However,
             * there's not much point in starting *real* small.
             */
            Size        newCacheSize = SizeVfdCache * 2;
            Vfd            *newVfdCache;

            if (newCacheSize < 32)
                  newCacheSize = 32;

            /*
             * Be careful not to clobber VfdCache ptr if realloc fails.
             */
            newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
            if (newVfdCache == NULL)
                  ereport(ERROR,
                              (errcode(ERRCODE_OUT_OF_MEMORY),
                               errmsg("out of memory")));
            VfdCache = newVfdCache;

            /*
             * Initialize the new entries and link them into the free list.
             */
            for (i = SizeVfdCache; i < newCacheSize; i++)
            {
                  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
                  VfdCache[i].nextFree = i + 1;
                  VfdCache[i].fd = VFD_CLOSED;
            }
            /* terminate the chain, then hang it off the list header */
            VfdCache[newCacheSize - 1].nextFree = 0;
            VfdCache[0].nextFree = SizeVfdCache;

            /*
             * Record the new size
             */
            SizeVfdCache = newCacheSize;
      }

      /* pop the first free slot */
      file = VfdCache[0].nextFree;

      VfdCache[0].nextFree = VfdCache[file].nextFree;

      return file;
}

static void
FreeVfd(File file)
{
      Vfd            *slot = &VfdCache[file];

      DO_DB(elog(LOG, "FreeVfd: %d (%s)",
                     file, slot->fileName ? slot->fileName : ""));

      /* release the malloc'd name; free(NULL) is a harmless no-op */
      free(slot->fileName);
      slot->fileName = NULL;
      slot->fdstate = 0x0;

      /* push the slot onto the front of the free list */
      slot->nextFree = VfdCache[0].nextFree;
      VfdCache[0].nextFree = file;
}

/*
 * FileAccess --- make 'file' really open and most-recently-used.
 *
 * Returns 0 on success, or the negative result of a failed re-open (with
 * errno set).
 */
static int
FileAccess(File file)
{
      DO_DB(elog(LOG, "FileAccess %d (%s)",
                     file, VfdCache[file].fileName));

      /*
       * A file without a kernel FD has to be re-opened; LruInsert does that
       * (closing the least recently used file if an FD is needed) and puts
       * it at the head of the ring.  Its result is ours.
       */
      if (FileIsNotOpen(file))
            return LruInsert(file);

      /*
       * The file is open.  Unless it is already the most recently used one,
       * move it to the head of the ring.
       */
      if (VfdCache[0].lruLessRecently != file)
      {
            Delete(file);
            Insert(file);
      }

      return 0;
}

/*
 *    Called when we get a shared invalidation message on some relation.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
      Assert(FileIsValid(file));

      /* nothing to give back if the file holds no kernel FD */
      if (FileIsNotOpen(file))
            return;

      LruDelete(file);
}
#endif

/*
 * open a file in an arbitrary directory
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 *
 * Returns the new VFD index on success.  If the file cannot be opened,
 * returns -1 with errno still describing the open failure, so callers can
 * report it with %m.  Raises ERROR on out-of-memory.
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
      char     *fnamecopy;
      File        file;
      Vfd            *vfdP;

      DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
                     fileName, fileFlags, fileMode));

      /*
       * Grab the VFD slot before copying the name: AllocateVfd can raise
       * ERROR, and a malloc'd copy made beforehand would then be leaked.
       */
      file = AllocateVfd();

      /*
       * We need a malloc'd copy of the file name; fail cleanly if no room,
       * handing the slot back to the free list first.
       */
      fnamecopy = strdup(fileName);
      if (fnamecopy == NULL)
      {
            FreeVfd(file);
            ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory")));
      }

      vfdP = &VfdCache[file];

      /* give back descriptors until under budget or none remain */
      while (nfile + numAllocatedDescs >= max_safe_fds)
      {
            if (!ReleaseLruFile())
                  break;
      }

      vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

      if (vfdP->fd < 0)
      {
            /* the cleanup calls below may change errno; keep open()'s value */
            int               save_errno = errno;

            FreeVfd(file);
            free(fnamecopy);
            errno = save_errno;
            return -1;
      }
      ++nfile;
      DO_DB(elog(LOG, "PathNameOpenFile: success %d",
                     vfdP->fd));

      Insert(file);

      vfdP->fileName = fnamecopy;
      /* Saved flags are adjusted to be OK for re-opening file */
      vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
      vfdP->fileMode = fileMode;
      vfdP->seekPos = 0;
      vfdP->fdstate = 0x0;
      vfdP->resowner = NULL;

      return file;
}

/*
 * Open a temporary file that will disappear when we close it.
 *
 * This routine takes care of generating an appropriate tempfile name.
 * There's no need to pass in fileFlags or fileMode either, since only
 * one setting makes any sense for a temp file.
 *
 * Unless interXact is true, the file is remembered by CurrentResourceOwner
 * to ensure it's closed and deleted when it's no longer needed, typically at
 * the end-of-transaction. In most cases, you don't want temporary files to
 * outlive the transaction that created them, so this should be false -- but
 * if you need "somewhat" temporary storage, this might be useful. In either
 * case, the file is removed when the File is explicitly closed.
 */
File
OpenTemporaryFile(bool interXact)
{
      File        tempFile = 0;

      /*
       * A transaction-lifespan file goes into the next configured temp
       * tablespace, if there are any.  A file meant to outlive the
       * transaction never does: it is kept in the database's default
       * tablespace so that it cannot get in the way of a tablespace drop.
       * A tablespace that turns out to be unusable is silently skipped.
       */
      if (!interXact && numTempTableSpaces > 0)
      {
            Oid               nextSpace = GetNextTempTableSpace();

            if (OidIsValid(nextSpace))
                  tempFile = OpenTemporaryFileInTablespace(nextSpace, false);
      }

      /*
       * Nothing opened yet: use the database's default tablespace.  That is
       * normally known by now; should it not be, use pg_default.
       */
      if (tempFile <= 0)
      {
            Oid               fallbackSpace;

            if (MyDatabaseTableSpace)
                  fallbackSpace = MyDatabaseTableSpace;
            else
                  fallbackSpace = DEFAULTTABLESPACE_OID;

            tempFile = OpenTemporaryFileInTablespace(fallbackSpace, true);
      }

      /* The file is to be deleted when closed */
      VfdCache[tempFile].fdstate |= FD_TEMPORARY;

      if (!interXact)
      {
            /* Tie its lifetime to the current resource owner */
            VfdCache[tempFile].fdstate |= FD_XACT_TEMPORARY;

            ResourceOwnerEnlargeFiles(CurrentResourceOwner);
            ResourceOwnerRememberFile(CurrentResourceOwner, tempFile);
            VfdCache[tempFile].resowner = CurrentResourceOwner;

            /* there is now something for end-of-xact cleanup to look for */
            have_xact_temporary_files = true;
      }

      return tempFile;
}

/*
 * Open a temporary file in a specific tablespace.
 * Subroutine for OpenTemporaryFile, which see for details.
 */
static File
OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
{
      char        tempdirpath[MAXPGPATH];
      char        tempfilepath[MAXPGPATH];
      File        file;
      bool        inDefaultSpace;

      /*
       * Work out the tempfile directory.  pg_default lives under base/;
       * a request for pg_global is redirected there as well.  Every other
       * tablespace is reached through its pg_tblspc/ symlink.
       */
      inDefaultSpace = (tblspcOid == DEFAULTTABLESPACE_OID ||
                                tblspcOid == GLOBALTABLESPACE_OID);

      if (inDefaultSpace)
            snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
                         PG_TEMP_FILES_DIR);
      else
            snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s",
                         tblspcOid, PG_TEMP_FILES_DIR);

      /*
       * Build a name from our PID and a per-session counter; that should be
       * unique within the current database instance.
       */
      snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
                   tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);

      /*
       * First attempt.  O_EXCL is deliberately not used, so that an orphaned
       * temp file of the same name is simply reused.
       */
      file = PathNameOpenFile(tempfilepath,
                                          O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
                                          0600);
      if (file > 0)
            return file;

      /*
       * Perhaps nobody has created the tempfile directory yet.  mkdir's
       * result is ignored on purpose: someone else may just have created it,
       * and any real problem will show up in the second open attempt.
       */
      mkdir(tempdirpath, S_IRWXU);

      file = PathNameOpenFile(tempfilepath,
                                          O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
                                          0600);
      if (file <= 0 && rejectError)
            elog(ERROR, "could not create temporary file \"%s\": %m",
                   tempfilepath);

      return file;
}

/*
 * close a file when done with it
 */
/*
 * Closes the kernel FD if one is assigned, unlinks the file if it is marked
 * FD_TEMPORARY, detaches it from its resource owner if it has one, and
 * finally returns the VFD slot to the free list.
 */
void
FileClose(File file)
{
      Vfd            *vfdP;
      struct stat filestats;

      Assert(FileIsValid(file));

      DO_DB(elog(LOG, "FileClose: %d (%s)",
                     file, VfdCache[file].fileName));

      vfdP = &VfdCache[file];

      if (!FileIsNotOpen(file))
      {
            /* remove the file from the lru ring */
            Delete(file);

            /* close the file */
            if (close(vfdP->fd))
                  elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);

            --nfile;
            vfdP->fd = VFD_CLOSED;
      }

      /*
       * Delete the file if it was temporary
       */
      if (vfdP->fdstate & FD_TEMPORARY)
      {
            /* reset flag so that die() interrupt won't cause problems */
            vfdP->fdstate &= ~FD_TEMPORARY;
            /* a negative log_temp_files skips the size check and logging */
            if (log_temp_files >= 0)
            {
                  if (stat(vfdP->fileName, &filestats) == 0)
                  {
                        /* NOTE(review): st_size (bytes) is compared directly with log_temp_files -- confirm the setting's units */
                        if (filestats.st_size >= log_temp_files)
                              ereport(LOG,
                                          (errmsg("temporary file: path \"%s\", size %lu",
                                                      vfdP->fileName,
                                                      (unsigned long) filestats.st_size)));
                  }
                  else
                        elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
            }
            /* failure to unlink is only logged; the close still completes */
            if (unlink(vfdP->fileName))
                  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
      }

      /* Unregister it from the resource owner */
      if (vfdP->resowner)
            ResourceOwnerForgetFile(vfdP->resowner, file);

      /*
       * Return the Vfd slot to the free list
       */
      FreeVfd(file);
}

/*
 * FilePrefetch - initiate asynchronous read of a given range of the file.
 * The logical seek position is unaffected.
 *
 * Currently the only implementation of this function is using posix_fadvise
 * which is the simplest standardized interface that accomplishes this.
 * We could add an implementation using libaio in the future; but note that
 * this API is inappropriate for libaio, which wants to have a buffer provided
 * to read into.
 */
int
FilePrefetch(File file, off_t offset, int amount)
{
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
      int               status;
#endif

      Assert(FileIsValid(file));

#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
      DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
                     file, VfdCache[file].fileName,
                     (int64) offset, amount));

      /* the file needs a real descriptor before advice can be given */
      status = FileAccess(file);
      if (status < 0)
            return status;

      /* pass posix_fadvise's result straight back to the caller */
      return posix_fadvise(VfdCache[file].fd, offset, amount,
                                     POSIX_FADV_WILLNEED);
#else
      /* no prefetch support compiled in: succeed without doing anything */
      return 0;
#endif
}

/*
 * FileRead - read up to "amount" bytes from a virtual file into "buffer",
 * starting at the file's current position.
 *
 * Returns the result of read(): the number of bytes read, or a negative
 * value on failure.  A negative result from FileAccess() is returned as-is.
 * On success the cached seek position is advanced by the number of bytes
 * read; on a non-retryable failure it is set to FileUnknownPos.
 */
int
FileRead(File file, char *buffer, int amount)
{
      int               nread;

      Assert(FileIsValid(file));

      DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
                     file, VfdCache[file].fileName,
                     (int64) VfdCache[file].seekPos,
                     amount, buffer));

      nread = FileAccess(file);
      if (nread < 0)
            return nread;

      /* Loop only to retry after an interrupted (EINTR) read */
      for (;;)
      {
            nread = read(VfdCache[file].fd, buffer, amount);

            if (nread >= 0)
            {
                  /* Success: keep the cached position in step with the kernel */
                  VfdCache[file].seekPos += nread;
                  break;
            }

            /*
             * Windows may run out of kernel buffers and return "Insufficient
             * system resources" error.  Wait a bit and retry to solve it.
             *
             * It is rumored that EINTR is also possible on some Unix
             * filesystems, in which case immediate retry is indicated.
             */
#ifdef WIN32
            {
                  DWORD       error = GetLastError();

                  if (error == ERROR_NO_SYSTEM_RESOURCES)
                  {
                        pg_usleep(1000L);
                        errno = EINTR;
                  }
                  else
                        _dosmaperr(error);
            }
#endif

            /* OK to retry if interrupted */
            if (errno == EINTR)
                  continue;

            /* Trouble, so assume we don't know the file position anymore */
            VfdCache[file].seekPos = FileUnknownPos;
            break;
      }

      return nread;
}

/*
 * FileWrite - write "amount" bytes from "buffer" to a virtual file at its
 * current position.
 *
 * Returns the result of write(): the number of bytes written, or a negative
 * value on failure.  A negative result from FileAccess() is returned as-is.
 * If fewer than "amount" bytes were written and write() left errno at zero,
 * errno is set to ENOSPC.  On success the cached seek position is advanced;
 * on a non-retryable failure it is set to FileUnknownPos.
 */
int
FileWrite(File file, char *buffer, int amount)
{
      int               nwritten;

      Assert(FileIsValid(file));

      DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
                     file, VfdCache[file].fileName,
                     (int64) VfdCache[file].seekPos,
                     amount, buffer));

      nwritten = FileAccess(file);
      if (nwritten < 0)
            return nwritten;

      /* Loop only to retry after an interrupted (EINTR) write */
      for (;;)
      {
            /* Clear errno so we can tell whether write() itself set it */
            errno = 0;
            nwritten = write(VfdCache[file].fd, buffer, amount);

            /* if write didn't set errno, assume problem is no disk space */
            if (nwritten != amount && errno == 0)
                  errno = ENOSPC;

            if (nwritten >= 0)
            {
                  VfdCache[file].seekPos += nwritten;
                  break;
            }

            /*
             * Same Windows "Insufficient system resources" workaround as in
             * FileRead(): pause briefly and treat it like EINTR.
             */
#ifdef WIN32
            {
                  DWORD       error = GetLastError();

                  if (error == ERROR_NO_SYSTEM_RESOURCES)
                  {
                        pg_usleep(1000L);
                        errno = EINTR;
                  }
                  else
                        _dosmaperr(error);
            }
#endif

            /* OK to retry if interrupted */
            if (errno == EINTR)
                  continue;

            /* Trouble, so assume we don't know the file position anymore */
            VfdCache[file].seekPos = FileUnknownPos;
            break;
      }

      return nwritten;
}

/*
 * FileSync - flush a virtual file to disk via pg_fsync().
 *
 * Returns the negative FileAccess() result if the file could not be
 * accessed, otherwise the result of pg_fsync() on the file's descriptor.
 */
int
FileSync(File file)
{
      int               status;

      Assert(FileIsValid(file));

      DO_DB(elog(LOG, "FileSync: %d (%s)",
                     file, VfdCache[file].fileName));

      status = FileAccess(file);
      if (status >= 0)
            status = pg_fsync(VfdCache[file].fd);

      return status;
}

/*
 * FileSeek - set the seek position of a virtual file.
 *
 * "whence" is SEEK_SET, SEEK_CUR or SEEK_END, as for lseek().  Returns the
 * resulting value of the VFD's cached seek position, or a negative
 * FileAccess() result if a SEEK_END request needed the file and it could
 * not be accessed.
 *
 * When FileIsNotOpen() reports the file as not open, SEEK_SET and SEEK_CUR
 * are handled purely by updating the cached position, so no lseek() call is
 * made.  Because the kernel is not there to validate the request in that
 * case, we must do it ourselves:
 *  - SEEK_SET with a negative offset raises an error;
 *  - SEEK_CUR that would produce a negative position raises an error;
 *  - SEEK_CUR when the cached position is FileUnknownPos has nothing to be
 *    relative to, so it fails by returning FileUnknownPos with errno set to
 *    EINVAL rather than inventing a bogus position.
 *
 * An unrecognized "whence" raises an error.
 */
off_t
FileSeek(File file, off_t offset, int whence)
{
      int               returnCode;

      Assert(FileIsValid(file));

      DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
                     file, VfdCache[file].fileName,
                     (int64) VfdCache[file].seekPos,
                     (int64) offset, whence));

      if (FileIsNotOpen(file))
      {
            switch (whence)
            {
                  case SEEK_SET:
                        if (offset < 0)
                              elog(ERROR, "invalid seek offset: " INT64_FORMAT,
                                     (int64) offset);
                        VfdCache[file].seekPos = offset;
                        break;
                  case SEEK_CUR:
                        /* Cannot seek relative to a position we do not know */
                        if (VfdCache[file].seekPos == FileUnknownPos)
                        {
                              errno = EINVAL;
                              return FileUnknownPos;
                        }
                        /* Reject a result before start of file, as for SEEK_SET */
                        if (offset < 0 && VfdCache[file].seekPos + offset < 0)
                              elog(ERROR, "invalid seek offset: " INT64_FORMAT,
                                     (int64) offset);
                        VfdCache[file].seekPos += offset;
                        break;
                  case SEEK_END:
                        /* Need the real file to find out where its end is */
                        returnCode = FileAccess(file);
                        if (returnCode < 0)
                              return returnCode;
                        VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                                                     offset, whence);
                        break;
                  default:
                        elog(ERROR, "invalid whence: %d", whence);
                        break;
            }
      }
      else
      {
            switch (whence)
            {
                  case SEEK_SET:
                        if (offset < 0)
                              elog(ERROR, "invalid seek offset: " INT64_FORMAT,
                                     (int64) offset);
                        /* Skip the system call if we are already there */
                        if (VfdCache[file].seekPos != offset)
                              VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                                                           offset, whence);
                        break;
                  case SEEK_CUR:
                        /*
                         * A zero-offset request is a no-op unless the cached
                         * position is unknown, in which case ask the kernel.
                         */
                        if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
                              VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                                                           offset, whence);
                        break;
                  case SEEK_END:
                        VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                                                     offset, whence);
                        break;
                  default:
                        elog(ERROR, "invalid whence: %d", whence);
                        break;
            }
      }
      return VfdCache[file].seekPos;
}

/*
 * FileTell - report the cached seek position of a virtual file.
 *
 * XXX not actually used but here for completeness; compiled only when
 * NOT_USED is defined.
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
      off_t       position;

      Assert(FileIsValid(file));
      DO_DB(elog(LOG, "FileTell %d (%s)",
                     file, VfdCache[file].fileName));

      position = VfdCache[file].seekPos;
      return position;
}
#endif

/*
 * FileTruncate - truncate a virtual file to "offset" bytes via ftruncate().
 *
 * Returns the negative FileAccess() result if the file could not be
 * accessed, otherwise the result of ftruncate().
 */
int
FileTruncate(File file, off_t offset)
{
      int               status;

      Assert(FileIsValid(file));

      DO_DB(elog(LOG, "FileTruncate %d (%s)",
                     file, VfdCache[file].fileName));

      status = FileAccess(file);
      if (status >= 0)
            status = ftruncate(VfdCache[file].fd, offset);

      return status;
}


/*
 * AllocateFile - fopen() replacement for code that wants a stdio FILE*.
 *
 * Going through this routine instead of calling fopen() directly lets fd.c
 * release other file descriptors when that is needed to open the file.
 * Callers must close the result with FreeFile, not fclose.
 *
 * Files that stay open for any significant length of time should NOT be
 * opened this way: they cannot share kernel file descriptors with other
 * files, so holding on to many of them risks running out of FDs.  Most
 * callers simply read a config file and close it again right away.
 *
 * fd.c closes every file opened with AllocateFile at transaction commit or
 * abort, so nothing leaks if the caller is cut short by ereport(ERROR).
 *
 * Ideally this should be the *only* direct call of fopen() in the backend.
 *
 * Returns the opened stream, or NULL with errno set if fopen() failed.
 */
FILE *
AllocateFile(const char *name, const char *mode)
{
      DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
                     numAllocatedDescs, name));

      /*
       * The MAX_ALLOCATED_DESCS limit keeps us from running off the end of
       * allocatedDescs[]; the max_safe_fds limit keeps AllocateFile from
       * hogging every one of the available FDs, which'd lead to infinite
       * looping.
       */
      if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
            numAllocatedDescs >= max_safe_fds - 1)
            elog(ERROR, "too many private files demanded");

      /* Each iteration is one fopen() attempt */
      for (;;)
      {
            FILE     *stream;
            int               save_errno;

            stream = fopen(name, mode);
            if (stream != NULL)
            {
                  /* Remember the stream so it can be cleaned up later */
                  AllocateDesc *slot = &allocatedDescs[numAllocatedDescs];

                  slot->kind = AllocateDescFile;
                  slot->desc.file = stream;
                  slot->create_subid = GetCurrentSubTransactionId();
                  numAllocatedDescs++;
                  return slot->desc.file;
            }

            /* Only descriptor exhaustion is worth a retry */
            if (errno != EMFILE && errno != ENFILE)
                  return NULL;

            save_errno = errno;

            ereport(LOG,
                        (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                         errmsg("out of file descriptors: %m; release and retry")));
            errno = 0;
            if (!ReleaseLruFile())
            {
                  /* Nothing could be released: report the original failure */
                  errno = save_errno;
                  return NULL;
            }
      }
}

/*
 * FreeDesc - close the object behind an AllocateDesc (stdio file or
 * directory) and remove the entry from allocatedDescs[].
 *
 * The argument *must* point into the allocatedDescs[] array, because the
 * entry is overwritten in place with the array's last entry.
 *
 * Returns the result of fclose() or closedir().
 */
static int
FreeDesc(AllocateDesc *desc)
{
      int               status;

      /* Close the underlying object, according to its kind */
      if (desc->kind == AllocateDescFile)
            status = fclose(desc->desc.file);
      else if (desc->kind == AllocateDescDir)
            status = closedir(desc->desc.dir);
      else
      {
            elog(ERROR, "AllocateDesc kind not recognized");
            status = 0;             /* keep compiler quiet */
      }

      /* Keep the array dense: move the last entry into the vacated slot */
      --numAllocatedDescs;
      *desc = allocatedDescs[numAllocatedDescs];

      return status;
}

/*
 * FreeFile - close a stream that was returned by AllocateFile.
 *
 * The close result (from fclose) is handed back unexamined --- it is up to
 * the caller to handle close errors.  A stream that is not found among the
 * allocated descriptors draws a WARNING and is closed with fclose() anyway.
 */
int
FreeFile(FILE *file)
{
      int               idx;

      DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));

      /* Search from the most recently added entry backwards */
      for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
      {
            AllocateDesc *entry = &allocatedDescs[idx];

            if (entry->kind != AllocateDescFile)
                  continue;
            if (entry->desc.file == file)
                  return FreeDesc(entry);
      }

      /* Only get here if someone passes us a file not in allocatedDescs */
      elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");

      return fclose(file);
}


/*
 * AllocateDir - opendir() replacement for code that uses <dirent.h> (DIR*).
 *
 * Going through this routine instead of calling opendir() directly lets
 * fd.c release other file descriptors when that is needed to open the
 * directory, and lets it close the directory after an elog.  Callers must
 * close the result with FreeDir, not closedir.
 *
 * Ideally this should be the *only* direct call of opendir() in the backend.
 *
 * Returns the opened directory, or NULL with errno set if opendir() failed.
 */
DIR *
AllocateDir(const char *dirname)
{
      DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
                     numAllocatedDescs, dirname));

      /*
       * The MAX_ALLOCATED_DESCS limit keeps us from running off the end of
       * allocatedDescs[]; the max_safe_fds limit keeps AllocateDir from
       * hogging every one of the available FDs, which'd lead to infinite
       * looping.
       */
      if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
            numAllocatedDescs >= max_safe_fds - 1)
            elog(ERROR, "too many private dirs demanded");

      /* Each iteration is one opendir() attempt */
      for (;;)
      {
            DIR            *handle;
            int               save_errno;

            handle = opendir(dirname);
            if (handle != NULL)
            {
                  /* Remember the directory so it can be cleaned up later */
                  AllocateDesc *slot = &allocatedDescs[numAllocatedDescs];

                  slot->kind = AllocateDescDir;
                  slot->desc.dir = handle;
                  slot->create_subid = GetCurrentSubTransactionId();
                  numAllocatedDescs++;
                  return slot->desc.dir;
            }

            /* Only descriptor exhaustion is worth a retry */
            if (errno != EMFILE && errno != ENFILE)
                  return NULL;

            save_errno = errno;

            ereport(LOG,
                        (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                         errmsg("out of file descriptors: %m; release and retry")));
            errno = 0;
            if (!ReleaseLruFile())
            {
                  /* Nothing could be released: report the original failure */
                  errno = save_errno;
                  return NULL;
            }
      }
}

/*
 * ReadDir - fetch the next entry of a directory opened with AllocateDir,
 * reporting any failure through ereport(ERROR).
 *
 * This wraps readdir() and does the errno bookkeeping that is otherwise
 * tedious and easy to get wrong.  A NULL "dir" is taken to mean that
 * AllocateDir failed, and a generic "could not open directory" error is
 * raised, so a caller content with that message can simply write
 *
 *          dir = AllocateDir(path);
 *          while ((dirent = ReadDir(dir, path)) != NULL)
 *                process dirent;
 *          FreeDir(dir);
 *
 * (When relying on this shortcut, make sure errno is not changed between
 * the AllocateDir call and the ReadDir call.)
 *
 * "dirname" must be the path that was given to AllocateDir; it is used only
 * in error messages.
 *
 * Returns the next entry, or NULL once the directory is exhausted.
 */
struct dirent *
ReadDir(DIR *dir, const char *dirname)
{
      struct dirent *entry;

      /* Give a generic message for AllocateDir failure, if caller didn't */
      if (dir == NULL)
            ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open directory \"%s\": %m",
                                    dirname)));

      /* Zero errno first so a NULL result can be told apart from an error */
      errno = 0;
      entry = readdir(dir);

      if (entry == NULL)
      {
#ifdef WIN32

            /*
             * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but
             * not in released version
             */
            if (GetLastError() == ERROR_NO_MORE_FILES)
                  errno = 0;
#endif

            if (errno != 0)
                  ereport(ERROR,
                              (errcode_for_file_access(),
                               errmsg("could not read directory \"%s\": %m",
                                          dirname)));
      }

      return entry;
}

/*
 * FreeDir - close a directory that was opened with AllocateDir.
 *
 * The close result (from closedir) is handed back unexamined --- it is up
 * to the caller to handle close errors.  A directory that is not found
 * among the allocated descriptors draws a WARNING and is closed with
 * closedir() anyway.
 */
int
FreeDir(DIR *dir)
{
      int               idx;

      DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));

      /* Search from the most recently added entry backwards */
      for (idx = numAllocatedDescs - 1; idx >= 0; idx--)
      {
            AllocateDesc *entry = &allocatedDescs[idx];

            if (entry->kind != AllocateDescDir)
                  continue;
            if (entry->desc.dir == dir)
                  return FreeDesc(entry);
      }

      /* Only get here if someone passes us a dir not in allocatedDescs */
      elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");

      return closedir(dir);
}


/*
 * closeAllVfds
 *
 * Put every VFD into the physically-closed state so that as few kernel
 * file descriptors as possible remain in use.  The logical state of the
 * VFDs is not changed.
 */
void
closeAllVfds(void)
{
      Index       vfd;

      /* Nothing to do while the VFD cache is still empty */
      if (!(SizeVfdCache > 0))
            return;

      Assert(FileIsNotOpen(0));           /* Make sure ring not corrupted */

      /* Entry 0 is skipped; scanning starts at 1 */
      for (vfd = 1; vfd < SizeVfdCache; vfd++)
      {
            if (FileIsNotOpen(vfd))
                  continue;
            LruDelete(vfd);
      }
}


/*
 * SetTempTablespaces
 *
 * Install an array of tablespace OIDs to be used for temporary files.  The
 * array stays in effect until end of transaction unless this function is
 * called again first.  Only the pointer is stored, so the caller must make
 * sure the array lives long enough (typically by allocating it in
 * TopTransactionContext).
 */
void
SetTempTablespaces(Oid *tableSpaces, int numSpaces)
{
      Assert(numSpaces >= 0);

      numTempTableSpaces = numSpaces;
      tempTableSpaces = tableSpaces;

      /*
       * Start at a random place in the list, to reduce conflicts between
       * backends that most likely share the same list of temp tablespaces.
       * Later temp files created in the same transaction advance circularly
       * through the list, which spreads large temporary sort files across
       * all available tablespaces.  With zero or one entry there is nothing
       * to randomize.
       */
      nextTempTableSpace = (numSpaces > 1) ? random() % numSpaces : 0;
}

/*
 * TempTablespacesAreSet
 *
 * Report whether SetTempTablespaces has been called in the current
 * transaction, i.e. whether numTempTableSpaces is non-negative (end of
 * transaction resets it to -1).  This exists just so that tablespaces.c
 * doesn't need its own per-transaction state.
 */
bool
TempTablespacesAreSet(void)
{
      if (numTempTableSpaces >= 0)
            return true;

      return false;
}

/*
 * GetNextTempTableSpace
 *
 * Pick the temp tablespace to use next, cycling through the list installed
 * by SetTempTablespaces.  A result of InvalidOid means to use the current
 * database's default tablespace; that is returned when the list is empty
 * or has not been set.
 */
Oid
GetNextTempTableSpace(void)
{
      /* No list in effect */
      if (numTempTableSpaces <= 0)
            return InvalidOid;

      /* Step to the following entry, wrapping around at the end */
      nextTempTableSpace++;
      if (nextTempTableSpace >= numTempTableSpaces)
            nextTempTableSpace = 0;

      return tempTableSpaces[nextTempTableSpace];
}


/*
 * AtEOSubXact_Files
 *
 * Subtransaction commit/abort processing for "allocated" descriptors.  On
 * abort, every descriptor created by the subtransaction (create_subid ==
 * mySubid) is closed.  On commit, those descriptors are handed over to the
 * parent by relabeling them with parentSubid.
 */
void
AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
                          SubTransactionId parentSubid)
{
      Index       pos = 0;

      while (pos < numAllocatedDescs)
      {
            AllocateDesc *entry = &allocatedDescs[pos];

            /* Not ours: leave it alone */
            if (entry->create_subid != mySubid)
            {
                  pos++;
                  continue;
            }

            if (isCommit)
            {
                  entry->create_subid = parentSubid;
                  pos++;
            }
            else
            {
                  /*
                   * FreeDesc moves the array's last entry into this slot, so
                   * do not advance: the same position must be examined again.
                   */
                  FreeDesc(entry);
            }
      }
}

/*
 * AtEOXact_Files
 *
 * This routine is called during transaction commit or abort (it doesn't
 * particularly care which).  All still-open per-transaction temporary file
 * VFDs are closed, which also causes the underlying files to be deleted
 * (although they should've been closed already by the ResourceOwner
 * cleanup). Furthermore, all "allocated" stdio files and directories are
 * closed. We also forget any transaction-local temp tablespace list.
 */
void
AtEOXact_Files(void)
{
      /* false = not a process exit, so only transaction-local temp files go */
      CleanupTempFiles(false);
      /* Drop the tablespace list; -1 makes TempTablespacesAreSet() false */
      tempTableSpaces = NULL;
      numTempTableSpaces = -1;
}

/*
 * AtProcExit_Files
 *
 * on_proc_exit hook to clean up temp files during backend shutdown.
 * Here, we want to clean up *all* temp files including interXact ones.
 *
 * Both arguments (code, arg) are part of the callback signature and are
 * not used here.
 */
static void
AtProcExit_Files(int code, Datum arg)
{
      /* true = process exit, so every temporary file is closed */
      CleanupTempFiles(true);
}

/*
 * CleanupTempFiles - close temporary files (which deletes the underlying
 * files) and release all "allocated" stdio files and dirs.
 *
 * isProcExit: true when called as the backend process is exiting, in which
 * case every temporary file is closed.  Otherwise we are being called for
 * transaction commit/abort and only transaction-local temp files are
 * closed.  The "allocated" descriptors are released in either case.
 */
static void
CleanupTempFiles(bool isProcExit)
{
      /*
       * Careful here: at proc_exit we need extra cleanup, not just
       * xact_temporary files, so the scan cannot be skipped merely because
       * no transaction-local temp files were registered.
       */
      if (isProcExit || have_xact_temporary_files)
      {
            Index       vfd;

            Assert(FileIsNotOpen(0));           /* Make sure ring not corrupted */

            for (vfd = 1; vfd < SizeVfdCache; vfd++)
            {
                  unsigned short state = VfdCache[vfd].fdstate;

                  /* Skip entries that are not temp files or have no file name */
                  if (!(state & FD_TEMPORARY) || VfdCache[vfd].fileName == NULL)
                        continue;

                  if (!isProcExit)
                  {
                        /* At end of transaction, leave cross-transaction files */
                        if (!(state & FD_XACT_TEMPORARY))
                              continue;

                        /*
                         * Transaction-local temp files should have been closed
                         * by the ResourceOwner mechanism already, so finding
                         * one here is just a debugging cross-check.
                         */
                        elog(WARNING,
                               "temporary file %s not closed at end-of-transaction",
                               VfdCache[vfd].fileName);
                  }

                  FileClose(vfd);
            }

            have_xact_temporary_files = false;
      }

      /*
       * Clean up "allocated" stdio files and dirs.  FreeDesc shrinks the
       * array each time, so keep freeing slot 0 until it is empty.
       */
      while (numAllocatedDescs > 0)
            FreeDesc(&allocatedDescs[0]);
}


/*
 * RemovePgTempFiles - remove temporary files left over from a prior
 * postmaster session.
 *
 * This should be called during postmaster startup.  It forcibly removes any
 * leftover files created by OpenTemporaryFile.
 *
 * NOTE: we could, but don't, call this during a post-backend-crash restart
 * cycle.  The argument for not doing it is that someone might want to
 * examine the temp files for debugging purposes.  This does however mean
 * that OpenTemporaryFile had better allow for collision with an existing
 * temp file name.
 */
void
RemovePgTempFiles(void)
{
      char        path[MAXPGPATH];
      DIR            *tblspc_dir;
      struct dirent *entry;

      /*
       * Start with the temp files of pg_default ($PGDATA/base).
       */
      snprintf(path, sizeof(path), "base/%s", PG_TEMP_FILES_DIR);
      RemovePgTempFilesInDir(path);

      /*
       * Then visit the temp directory of each entry under pg_tblspc, i.e.
       * every non-default tablespace.
       */
      tblspc_dir = AllocateDir("pg_tblspc");

      for (entry = ReadDir(tblspc_dir, "pg_tblspc");
             entry != NULL;
             entry = ReadDir(tblspc_dir, "pg_tblspc"))
      {
            /* Ignore the self and parent links */
            if (strcmp(entry->d_name, ".") == 0)
                  continue;
            if (strcmp(entry->d_name, "..") == 0)
                  continue;

            snprintf(path, sizeof(path), "pg_tblspc/%s/%s",
                         entry->d_name, PG_TEMP_FILES_DIR);
            RemovePgTempFilesInDir(path);
      }

      FreeDir(tblspc_dir);

      /*
       * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level
       * of DataDir as well.
       */
#ifdef EXEC_BACKEND
      RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
#endif
}

/*
 * RemovePgTempFilesInDir - process one pgsql_tmp directory on behalf of
 * RemovePgTempFiles.
 *
 * Every entry whose name starts with PG_TEMP_FILE_PREFIX is unlinked; any
 * other entry is reported in the log and left in place.  A directory that
 * does not exist (ENOENT) is silently skipped; any other failure to open it
 * is logged.
 */
static void
RemovePgTempFilesInDir(const char *tmpdirname)
{
      const size_t prefix_len = strlen(PG_TEMP_FILE_PREFIX);
      char        victim[MAXPGPATH];
      DIR            *dir;
      struct dirent *entry;

      dir = AllocateDir(tmpdirname);
      if (dir == NULL)
      {
            /* anything except ENOENT is fishy */
            if (errno != ENOENT)
                  elog(LOG,
                         "could not open temporary-files directory \"%s\": %m",
                         tmpdirname);
            return;
      }

      for (entry = ReadDir(dir, tmpdirname);
             entry != NULL;
             entry = ReadDir(dir, tmpdirname))
      {
            /* Ignore the self and parent links */
            if (strcmp(entry->d_name, ".") == 0)
                  continue;
            if (strcmp(entry->d_name, "..") == 0)
                  continue;

            /* The full path is needed both for unlink and for the log message */
            snprintf(victim, sizeof(victim), "%s/%s",
                         tmpdirname, entry->d_name);

            if (strncmp(entry->d_name, PG_TEMP_FILE_PREFIX, prefix_len) != 0)
            {
                  elog(LOG,
                         "unexpected file found in temporary-files directory: \"%s\"",
                         victim);
                  continue;
            }

            unlink(victim);         /* note we ignore any error */
      }

      FreeDir(dir);
}

Generated by  Doxygen 1.6.0   Back to index