Date: Mon, 2 Jun 1997 17:42:10 -0700 (PDT)
From: Mark Zimmermann


this seems to merge multiple indices into a single pair of *.k and *.p
files for browsing ... apparently I tried doing this without heap data
structures and it took FOREVER, so I rewrote it here in a more
efficient way ... ^z

-----zmrgr46.c-----


/* revised zmerger experiments - ^z - (c) 1991
 * mods to use heap data structure for merge buffer indices - 1991/01/26
 * next:  variable buffer sizes, proportional to input file sizes...
 *
 * work on zmerger.c - new program to do an N-way merge of raw
 * index files created by zndxr.c (or by qndxr.c, TEXAS, TEX, FREE TEXT, etc.)
 *
 * This is free software and comes with NO WARRANTY WHATSOEVER!
 * See GNU GPL for licensing information.
 *
 * This program takes as input a list of files (a 'filelist' or '.f' file,
 * specifying which files go into the database and their lengths, etc.) and
 * produces a merged pair of index files corresponding to the contents
 * of that filelist -- a unified database to allow multi-file browsing
 * by (a forthcoming program) zbrwsr.c ...
 *
 * On the command line to zmerger, a single thing is required -- the
 * name of the filelist which holds the names of the documents 
 * (text files, generally) and their associated index files ('.k' and
 * '.p', also known as 'key' and 'ptr' files).  The filelist itself
 * contains the name of the special alphabetization ('alphafile') and
 * the names and locations to store the merged database files into.
 *
 * Thus, the usage of zmerger is:  zmerger filelist
 *
 * It makes sense to put all this stuff in the filelist master file
 * itself, since in real use multi-file browsing will (I hope!) be
 * applied to large collections of files (possibly thousands or more
 * different documents) -- too many to put onto a command line!
 *
 * The format of the filelist file is specified as follows:
 *
 * - a filelist shall consist of lines of printable characters,
 *   no line of which may be longer than 255 bytes.
 *
 * - all non-empty lines of the filelist file which do not begin
 *   with a '#' shall consist of a file name (possibly a full path)
 *   of a text 'document' file to be indexed, followed
 *   by a location for the index key file, followed
 *   by the index ptr file's location, followed by the offset to
 *   apply to the byte locations of words in that file.  That offset
 *   value begins at 0 for the first file, and is actually the
 *   cumulative length of all previous files in the filelist.
 *    ((note --- want to be sure to allow for segmentation of input
 *      text files later on ... must think about this offset question
 *      then...??))
 *
 * - the customary name of a filelist file shall end with '.f'; other
 *   names are allowed, but may cause conflicts or inconsistencies
 *   unless the default names for key and ptr files are explicitly
 *   overriden.
 *
 * - the default for the names of the filelist's associated master key
 *   and ptr files shall be the name of the filelist with 'k'
 *   and 'p' respectively replacing the last letter of the
 *   filelist's own file name, in the same directory as the
 *   filelist itself.
 *
 * - to override the defaults for the filelist's key and ptr files,
 *   the directive "#keyfile: fname" may be placed in the filelist
 *   (before the first document file name) to indicate where
 *   the key file is to be found; the directive "#ptrfile: fname" may
 *   similarly be used to locate the ptr file (where the '#' occurs
 *   in the first column and where 'fname' may include a full path,
 *   e.g., /usr/mark/data.k).
 *
 * - the default alphabetization order shall be that of the ASCII character
 *   set; the default mapping of characters in a text file is to treat
 *   all non-alphanumeric characters as delimiters, and to turn all
 *   lower-case alphabetic characters to upper-case.
 *
 * - to override the default alphabetization order and character map,
 *   the directive "#alphafile: fname" may be placed in the filelist
 *   before the first document file name.  (The format of
 *   the alphafile is described in detail elsewhere; it essentially
 *   consists of a list of the characters to be kept in indexed documents,
 *   in the order in which they are to be alphabetized, with all
 *   characters on each line of the alphafile mapped into the first
 *   on that line.)
 *
 * - a filelist line beginning with '# ' (a number-sign followed by a space)
 *   shall be a comment and shall be ignored by all programs using the
 *   filelist; in fact, any line beginning with a '#' but not with a
 *   recognized directive shall be ignored.
 *
 * - unprintable characters shall be included in a filelist as a
 *   decimal number after a '#'; thus, ^Z is written '#26'.  The '#'
 *   symbol itself shall be written '#35' when it is needed.
 *
 * - the '#' (number-sign, sharp, pound-sign, tic-tac-toe board)
 *   shall be the only special 'escape' character; all other printable
 *   symbols shall represent themselves.
 *
 */
 
/* if one needs to convert program for compilation by an old, dumb cc,
 * for generic use, this is what happens:
 *
 * - prototypes turned into function head type declarations (via gawk
 *    program to take everything out from within the () in the prototypes)
 * - function heads turned back to old form (for things that don't
 *    call functions as arguments, single line head, etc.), using gawk
 * - <stdlib.h> reference removed, as it offends...
 * - #define time_t, difftime(), SEEK_SET, and SEEK_END
 */
 
#ifdef DUMB_COMPILER
#define time_t              long
#define difftime(t1,t0)     (t1 - t0)
#define SEEK_SET            0
#define SEEK_END            2
#else
#include <stdlib.h>
#endif
 
 
#include <stdio.h>
#include <string.h>
#include <time.h>
 
/* for Mac (Think C) ... need to put this into a project with the ANSI
 * and the MacTraps libraries ... the <console.h> header lets main()
 * take command-line UNIX-style arguments ... very un-Macish, sorry!
 */
#ifdef THINK_C
#include <console.h>
#endif
 
/* define some parameters used by the following routines ... MAXSTRLEN is the
 * longest string expected to be seen in an alphafile, a filelist, etc. ...
 * KEYLENGTH is the number of letters to be kept (at most) for an indexed
 * word (28 is chosen for compatibility with my earlier index-building programs;
 * it's long enough to avoid truncating most words, adds up to a power of two
 * when the 4-byte long integer 'cumulative count' is appended to make an index
 * record in the key file, and is short enough that (for reasonably large input
 * files) the key file isn't exorbitantly huge ...
 */
#define MAXSTRLEN              256
#define KEYLENGTH               28
 
/* for the moment, use fixed-size buffers, but do this right later with
 * variable-sized (proportional to the file size)
 */
#define DEFAULTBUFFERSIZE    10240
 
#define TRUE                     1
#define FALSE                    0
 
 
 
/* define data type to hold keyfile records
 */
typedef struct
  {
    char key[KEYLENGTH];
    long ccount;
  }     ZKEY;
 
/* define data type to hold in-memory information essential to merger ...
 * for each file in the filelist, must hold:
 *  - offset to apply to the ptr records
 *  - keyfile name
 *  - ptrfile name
 *  - pointer to beginning of the keyfile buffer
 *  - pointer to next record in the keyfile buffer
 *  - pointer to end of the keyfile buffer
 *  - pointer to beginning of the ptrfile buffer
 *  - pointer to next record in the ptrfile buffer
 *  - pointer to end of the ptrfile buffer
 *  - number of the key record to start the next keybuf load
 *  - number of the ptr record to start the next ptrbuf load
 *  - total number of remaining ptrfile records
 *  - previous cumulative count value (which tells where ptrs for
 *      the current key begin)
 */
typedef struct
  {
    long offset;
    char *kfname;
    char *pfname;
    ZKEY *keyrecbufp;
    ZKEY *current_keyrecp;
    ZKEY *endkeyrecbufp;
    long *ptrbufp;
    long *current_ptrp;
    long *endptrbufp;
    long next_keybuf_start_num;
    long next_ptrbuf_start_num;
    long remaining_ptrcount;
    long prev_ccount;
  }     FLST;
 
/* begin prototypes */
 
int main (int argc, char *argv[]);
void print_help (void);
void load_alpha (char *fname);
void read_filelist (char *fname);
void print_switch_error (char *s);
long filesize (FILE *file);
long count_listed_files (FILE *filelistfile);
void *reserve_memory (long len);
void error_exit (int errnum, char *msg, long errarg);
void add_to_flist (char *filename, char *kfilename, char *pfilename, long offset);
void merge_indices (void);
void initial_load_inbuffers (long bufnum);
void fetch_next_keyrec (long flistnum);
void fetch_next_ptr (long *pbuf, long flistnum);
void push_index_onto_heap (long bufnum);
void open_outfiles (void);
int alpha_greater_than (long bufnum1, long bufnum2);
void write_out_smallest_remaining_record (void);
void close_outfiles (void);
 
/* end prototypes */
 
/* global variables */
 
char keyfilename[MAXSTRLEN];
FILE *kfp;
char ptrfilename[MAXSTRLEN];
FILE *pfp;
 
long nfiles     =   0;
long maxfiles   =   0;
long buffersize =   DEFAULTBUFFERSIZE;
 
FLST *flist = NULL;
long first_active_file;
ZKEY current_keyrec;
 
long *heap;
long last_heap_slot;
 
 
 
/* alphabetization order:  the array alpha_order[] determines
 * the order of alphabetization of characters during indexing.
 * The default is simply ASCII order, with numbers preceding
 * letters.  This default behavior may be changed using the
 * '-f mapfile' command line option.  See the documentation
 * accompanying function load_alpha() for details.
 *
 * Here, although many other patterns are possible, I
 * simply repeat the contents of the alpha_map[] array
 * (as used in zndxr.c) as the default...
 *
 * Note that the value of alpha_order entries should never
 * be zero for any characters which are actually not mapped
 * to null -- otherwise, alphabetization errors may result!
 */
unsigned char alpha_order[] =
  {
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
     '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
     '8',  '9', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
     'X',  'Y',  'Z', '\0', '\0', '\0', '\0', '\0',
    '\0',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
     'X',  'Y',  'Z', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'
  };
 
 
 
 
/*********************************************************/
 
 
main (int argc, char *argv[])
  {
/* get the command line arguments for Macintosh users ...
 */
#ifdef THINK_C
    argc = ccommand (&argv);
#endif
 
    printf ("\nWelcome to zmerger... - ^z\n");
 
    if (argc == 1)
        print_help ();
    else
      {
        read_filelist (argv[1]);
        merge_indices ();
      }
 
    printf ("\nGood-bye! - ^z\n");
 
    return;
  }
 
 
/* print out minimally helpful information for user... */
 
void print_help (void)
  {
    printf ("\nzmerger:  merge index files for free-text information retrieval\n");
    printf ("\nusage:  zmerger filelistname ... \n");
    printf ("\nzmerger (c) 1991 Mark Zimmermann - ^z\n");
    printf ("this program comes with NO WARRANTY whatsoever\n");
    printf ("zmerger is free software, under the GNU General Public License\n");
    
    return;
  }
 
 
/* load alpha file to redefine alphabetization order if necessary....
 *
 * The alpha file is optional.  It is an ASCII text file consisting of
 * a succession of lines, giving the characters to appear in the index
 * file in the order in which they are to be alphabetized. 
 * Alternative alphabetizations and character mappings are most
 * likely to be useful to those who need to work with non-Latin
 * alphabets or who wish to handle non-alphanumeric characters
 * in a special way.
 *
 * See the source code of zndxr.c for additional commentary and
 * explicit examples of alphafile formatting ... most of which is
 * not required for the index merge operation (e.g., for merging,
 * the character mapping and embedded-punctuation handling don't
 * actually matter -- all that should ever be needed is the alpha
 * order...).
 *
 * To include a nonprintable or 'special' character in an alpha file, write
 * a number-sign ('#') followed by the DECIMAL value of the character.
 * Thus, #126 = ~ = <tilde>, #35 = #, #10 = ^J = '\n' = newline, etc.
 * (The '~' character is special in that it flags embedded punctuation
 * for the index-builder, so write it as #126 if it's needed...)
 *
 * Comments may be included in an alphafile on a line beginning with a '# ';
 * any line with '# ' as its first two characters is ignored.
 *
 * No guarantees if an alphafile line is longer than MAXSTRLEN-1 characters!!!
 */
 
void load_alpha (char *fname)
  {
    FILE *alphafile;
    int c, i, len, order = 0;
    char buf[MAXSTRLEN];
 
    printf ("loading alphafile '%s'\n", fname);
    alphafile = fopen (fname, "r");
    if (alphafile == NULL)
        error_exit (200, "fopen() error in load_alpha()", 0L);
    
    for (i = 0; i < 256; ++i)
        alpha_order[i] = '\0';
    
    while (fgets (buf, MAXSTRLEN, alphafile) != NULL)
      {
        /* skip a comment line (any line which begins with '# ') and
         * skip a completely empty line (containing only '\n')
         */
        if ((buf[0] == '#' && buf[1] == ' ') || buf[0] == '\n')
            continue;
 
        /* increment the counter that keeps track of where to put this
         * line's characters alphabetically ...
         */
        if (++order > 255)
            error_exit (201, "++order error in load_alpha()", (long)order);
 
        len = strlen (buf);
        if (buf[len - 1] == '\n')
            /* remove any final '\n' from lines that fgets() supplies...
             */
            --len;
        
        for (i = 0; i < len; ++i)
          {
            c = buf[i];
            
            if (c == '~')
                /* skip '~' since we don't care about embedded punctuation
                 * for index merging purposes
                 */
                continue;
            
            /* convert escaped characters after '#' into proper form
             */
            if (c == '#')
              {
                c = 0;
                ++i;
                while (buf[i] >= '0' && buf[i] <= '9')
                    c = 10 * c + buf[i++] - '0';
                --i;
              }
            
            /* safety net -- following *should* never happen!
             */
            if (c < 0 || c > 255)
                error_exit (202, "character error in load_alpha()", (long)c);
            
            alpha_order[c] = order;
          }
      }
    fclose (alphafile);
    
    return;
  }
 
 
/* read a filelist and process the information in it... reserve memory for
 * our flist data structures, and for the heap of indices into the flist
 * items, to keep track of which input file comes next alphabetically...
 *
 * The format of the filelist file is designed to be compatible with
 * the '.f' file used by my other free text information retrieval
 * ("indexer/browser") programs.  A filelist file consists of a
 * succesion of lines each containing:
 *    the name of each document file
 *    the name of the document's '.k' index key file
 *    the name of the document's '.p' index ptr file
 *    the offset to apply to this document's word locations
 *
 * A line beginning with a '#' character is treated as a comment and is
 * ignored in the filelist, unless it precedes a special directive, such
 * as in the case of "#alphafile:", "#keyfile:", "#ptrfile:"...
 *
 * If you really want to begin a filename with a '#', put a space in front
 * of it in the filelist entry for that file, maybe....
 */
 
void read_filelist (char *fname)
  {
    FILE *filelistfile;
    int n, len;
    long fileoffset;
    char buf[MAXSTRLEN], filename[MAXSTRLEN], kfilename[MAXSTRLEN],
        pfilename[MAXSTRLEN];
 
    printf ("loading filelist file '%s'\n", fname);
    filelistfile = fopen (fname, "r");
    if (filelistfile == NULL)
        error_exit (212, "error opening filelist in read_filelist()", 0L);
    
    maxfiles = count_listed_files (filelistfile);
    printf ("reserving %ld bytes for %ld in-memory filelist structures\n",
                    sizeof (FLST) * maxfiles, maxfiles);
    flist = reserve_memory (sizeof (FLST) * maxfiles);
    /* for heap of indices, use subscripts beginning with 1, so need an extra slot;
     * also set count of heap slots in use to zero
     */
    heap = reserve_memory (sizeof (long) * (maxfiles + 1));
    last_heap_slot = 0;
    rewind (filelistfile);
    printf ("loading filelist...\n");
 
    len = strlen (fname);
    strcpy (keyfilename, fname);
    keyfilename[len - 1] = 'k';
    strcpy (ptrfilename, fname);
    ptrfilename[len - 1] = 'p';
 
    while (fgets (buf, MAXSTRLEN, filelistfile) != NULL)
      {
        /* skip over a comment line (any line which begins with '#' not
         * immediately followed by a recognized directive); process a
         * directive (a line beginning "#alphafile:", "#keyfile:",
         * or "#ptrfile:", followed by
         * a filename or numerical parameter)
         */
        if (buf[0] == '#')
          {
            n = sscanf (buf, "#alphafile: %s", filename);
            if (n == 1)
              {
                load_alpha (filename);
                continue;
              }
 
            n = sscanf (buf, "#keyfile: %s", filename);
            if (n == 1)
              {
                strcpy (keyfilename, filename);
                printf ("merged keyfile name = '%s'\n", keyfilename);
                continue;
              }
 
            n = sscanf (buf, "#ptrfile: %s", filename);
            if (n == 1)
              {
                strcpy (ptrfilename, filename);
                printf ("merged ptrfile name = '%s'\n", ptrfilename);
                continue;
              }
 
            continue;
          }
 
        /* now process 'normal' filelist lines, and add the relevant
         * information from them to the flist
         */
        n = sscanf (buf, " %s %s %s %ld",
                    filename, kfilename, pfilename, &fileoffset);
        
        /* skip over empty or mysteriously-bad lines
         */
        if (n == EOF || n != 4)
            continue;
            
        add_to_flist (filename, kfilename, pfilename, fileoffset);
      }
    
    fclose (filelistfile);
    
    printf ("filelist loading completed\n");
 
    return;
  }
 
 
/* print out error message in response to unrecognized switch
 */
 
void print_switch_error (char *s)
  {
    printf ("zmerger: %s: switch not recognized!\n", s);
    
    return;
  }
 
 
/* determine the size of the file
 */
 
long filesize (FILE *file)
  {
    long len;
 
    if (fseek (file, 0L, SEEK_END) != 0)
        error_exit (204, "fseek() error in filesize()", 0L);
    len = ftell (file);
    if (len == -1L)
        error_exit (205, "ftell() error in filesize()", 0L);
 
    return (len);
  }
 
 
/* count the number of files in the filelist ...
 */
 
long count_listed_files (FILE *filelistfile)
  {
    int n;
    long numfiles = 0, fileoffset;
    char buf[MAXSTRLEN], filename[MAXSTRLEN], kfilename[MAXSTRLEN],
        pfilename[MAXSTRLEN];
    
    while (fgets (buf, MAXSTRLEN, filelistfile) != NULL)
      {
        /* skip over a comment line (or other directive)
         */
        if (buf[0] == '#')
            continue;
 
        /* count filelist lines
         */
        n = sscanf (buf, " %s %s %s %ld",
                    filename, kfilename, pfilename, &fileoffset);
 
        /* skip over empty or mysteriously-bad lines, as elsewhere
         */
        if (n == EOF || n != 4)
            continue;
        
        /* ok, looks, good, count this as a file
         */
        ++numfiles;
      }
 
    return (numfiles);
  }
 
 
/* reserve memory for holding len bytes
 */
 
void *reserve_memory (long len)
  {
    void *p;
 
    p = (void *)malloc (len);
    if (p == NULL)
        error_exit (206, "out of memory error in reserve_memory()", len);
 
    return (p);
  }
 
 
/* exit with an error message -- some fatal mistake has been made
 */
 
void error_exit (int errnum, char *msg, long errarg)
  {
    fprintf (stderr, "\nFATAL ERROR -- very sorry\n");
    fprintf (stderr, "%d zmerger %s %ld\n", errnum, msg, errarg);
    exit (errnum);
    
    return;
  }
 
 
/* append new information to the flist, the in-memory data structure which
 * holds all the essential information for the merge_indices() to find files,
 * change ptr offsets, etc.; simultaneously, add the index in its proper location
 * to the heap which will be used to pull out words in alphabetical order
 */
 
void add_to_flist (char *filename, char *kfilename, char *pfilename, long offset)
  {
    char *fnp;
    FILE *fp;
 
    flist[nfiles].offset = offset;
 
    fnp = reserve_memory (strlen (kfilename) + 1);
    strcpy (fnp, kfilename);
    flist[nfiles].kfname = fnp;
 
    fnp = reserve_memory (strlen (pfilename) + 1);
    strcpy (fnp, pfilename);
    flist[nfiles].pfname = fnp;
 
    /* reserve memory for and initialize key and ptr buffers for
     * this file...
     */
 
    flist[nfiles].keyrecbufp = reserve_memory (buffersize);
    flist[nfiles].endkeyrecbufp = flist[nfiles].keyrecbufp +
        buffersize / sizeof (ZKEY);
    flist[nfiles].current_keyrecp = flist[nfiles].endkeyrecbufp;
    flist[nfiles].ptrbufp = reserve_memory (buffersize);
    flist[nfiles].endptrbufp = flist[nfiles].ptrbufp +
        buffersize / sizeof (long);
    flist[nfiles].current_ptrp = flist[nfiles].endptrbufp;
    initial_load_inbuffers (nfiles);
 
    /* things look good, so count the file and return ...
     */
 
    ++nfiles;
    if (nfiles > maxfiles)
        error_exit (213, "mysterious error in add_to_filelist(), sorry!\n",
            nfiles);
 
    return;
  }
 
 
/* do the job -- merge the index files together and write the results out
 * to disk as a unified database ... report on elapsed time, etc.
 *
 * maybe use buffer for output in next major revision?? ... for now, write out
 * records one by one and trust to the operating system to handle buffering on
 * the pair of output streams (.k and .p merged files)....
 */
 
void merge_indices (void)
  {
    time_t t0, t1;
    double secs;
 
    printf ("\nbeginning to merge index files\n");
    time (&t0);
    printf ("starting time = %s", ctime (&t0));
 
    if (nfiles < 2)
      {
        printf ("sorry, %ld is too few files to merge!\n", nfiles);
        return;
      }
 
    open_outfiles ();
    
    /* initialize essential globals for merge process */
    strncpy (current_keyrec.key, flist[heap[1]].current_keyrecp->key, KEYLENGTH);
    current_keyrec.ccount = 0;
 
    /* the inner loop:  until heap of files to be merged is empty, just keep
     * writing out the smallest remaining record!
     */
    while (last_heap_slot)
        write_out_smallest_remaining_record ();
 
    close_outfiles ();
 
    time (&t1);
    printf ("\ncongratulations! -- index file merger complete\n");
    printf ("ending time = %s", ctime (&t1));
    secs = difftime (t1, t0);
    printf ("elapsed time = %g seconds\n", secs);
 
    return;
  }
 
 
/* initialize the buffers to hold in memory the key and ptr records
 * about to be merged for a particular filelist item "i" ... also, put
 * this item's serial number (index) in its proper place in the heap
 * alphabetically...
 *
 * At the moment, in the FLST structure the offset, kfname, pfname, and
 * key/ptr buffer pointers have already been filled in ... so here, we
 * just need to check to make sure the keyfile and ptrfile are both available
 * (skip over the file & put 0 into remaining_ptrcount if not available)
 * and if ok, set value of next_keybuf_start_num to 0, fetch current_keyrec,
 * put 0 into next_ptrbuf_start_num and into prev_ccount, and put
 * (length of the ptrfile) / sizeof (ZPTR) into remaining_ptrcount....
 */
 
void initial_load_inbuffers (long i)
  {
    long pfsize;
    FILE *fp;
 
    if ((fp = fopen (flist[i].kfname, "rb")) == NULL)
          {
            printf ("WARNING:  zmerger couldn't find key file '%s'!\n",
                        flist[i].kfname);
            printf ("   ... its document will be omitted from the merged database!\n");
            flist[i].remaining_ptrcount = 0;
            return;                 /* exit in middle of function --- sorry! */
          }
    else
            fclose (fp);
 
    if ((fp = fopen (flist[i].pfname, "rb")) == NULL)
          {
            printf ("WARNING:  zmerger couldn't find ptr file '%s'!\n",
                        flist[i].pfname);
            printf ("   ... its document will be omitted from the merged database!\n");
            flist[i].remaining_ptrcount = 0;
            return;                 /* exit in middle of function --- sorry! */
          }
    else
          {
            pfsize = filesize (fp);
            fclose (fp);
          }
 
    flist[i].next_keybuf_start_num = 0;
    fetch_next_keyrec (i);
    flist[i].prev_ccount = 0;
    flist[i].next_ptrbuf_start_num = 0;
    flist[i].remaining_ptrcount = pfsize / sizeof (long);
    
    push_index_onto_heap (i);
 
    return;
  }
 
 
/* get the next key record for flist[i] ... check for errors
 * and give up if one is encountered.... do it using buffered approach
 */
 
void fetch_next_keyrec (long i)
  {
    FILE *fp;
    long keysread;
 
    ++flist[i].current_keyrecp;
 
    /* if we're still in the buffer, all done; if not, must fetch from
     * disk, reset pointers, etc....
     */
 
    if (flist[i].current_keyrecp >= flist[i].endkeyrecbufp)
      {
 
        if ((fp = fopen (flist[i].kfname, "rb")) == NULL)
            error_exit (214, "fetch_current_keyrec couldn't open keyfile #", i);
 
        if (fseek(fp, flist[i].next_keybuf_start_num * sizeof (ZKEY),
                    SEEK_SET) != 0)
            error_exit (215, "fetch_current_keyrec fseek() error for keyfile #", i);
 
        keysread = fread (flist[i].keyrecbufp, sizeof (ZKEY),
                            flist[i].endkeyrecbufp - flist[i].keyrecbufp, fp);
 
        if (keysread == 0)
            error_exit (216, "fetch_current_keyrec fread() error for keyfile #", i);
 
        flist[i].next_keybuf_start_num += keysread;
        flist[i].endkeyrecbufp = flist[i].keyrecbufp + keysread;
 
        flist[i].current_keyrecp = flist[i].keyrecbufp;
 
        fclose (fp);
      }
 
    return;
  }
 
/* bring the next ptr from filelist ptr file number flistnum into
 * buffer pbuf which holds it for further processing ...
 */
 
void fetch_next_ptr (long *pbuf, long i)
  {
    FILE *fp;
    long ptrsread;
 
    ++flist[i].current_ptrp;
    
    /* if still in the buffer, all done; if not, must reload
     */
    if (flist[i].current_ptrp >= flist[i].endptrbufp)
      {
        if ((fp = fopen (flist[i].pfname, "rb")) == NULL)
            error_exit (218, "fetch_next_ptr() fopen() error on ptrfile #",
                        i);
        if (fseek (fp, flist[i].next_ptrbuf_start_num * sizeof (long),
                    SEEK_SET) != 0)
            error_exit (219, "fetch_next_ptr() fseek() error on ptrfile #",
                    i);
        ptrsread = fread (flist[i].ptrbufp, sizeof (long),
                flist[i].endptrbufp - flist[i].ptrbufp, fp);
        if (ptrsread == 0)
            error_exit (220, "fetch_next_ptr() fread() error on ptrfile #",
                    i);
        flist[i].next_ptrbuf_start_num += ptrsread;
        flist[i].endptrbufp = flist[i].ptrbufp + ptrsread;
        flist[i].current_ptrp = flist[i].ptrbufp;
        fclose (fp);
      }
 
    *pbuf = *flist[i].current_ptrp;
 
    return;
  }
 
 
/* push a new index onto the heap, maintaining alphabetical order and heap
 * condition; see standard textbooks for description of heap algorithm, whereby
 * the new index is put into the (newly-incremented) last slot, and then
 * sifted upwards until the heap condition (parent node precedes both children
 * alphabetically, with my alpha_map) holds...
 */
 
void push_index_onto_heap (long bufnum)
  {
    long current_slot, test_slot;
    
    current_slot = ++last_heap_slot;
    test_slot = current_slot / 2;
    
    while (test_slot > 0 && alpha_greater_than (heap[test_slot], bufnum))
      { /* must move parent index down to make room for new bufnum insertion */
        heap[current_slot] = heap[test_slot];
        current_slot = test_slot;
        test_slot /= 2;
      }
    
    /* now have found the right place to put the new bufnum into */
    heap[current_slot] = bufnum;
    
    return;
  }
 
 
 
/* open the master index key and ptr files for writing out the results
 * of the index merging operation...
 */
 
void open_outfiles (void)
  {
    kfp = fopen (keyfilename, "wb");
    if (kfp == NULL)
        error_exit (209, "error opening key file in open_outfiles()", 0L);
 
    pfp = fopen (ptrfilename, "wb");
    if (pfp == NULL)
        error_exit (210, "error opening ptr file in open_outfiles()", 0L);
 
    return;
  }
 
 
/* compare the keys in two buffers and return true if the second one is greater
 * alphabetically than the first ... ((consider implementing this as a macro
 * somehow, to avoid overhead of a function call?)) ... break ties in favor of
 * the smaller buffer number, to maintain stability of the merge (so as to get the
 * data from earlier files in the filelist to come out first when browsing); thus
 * when the words are identical (as may happen pretty often), bufnum2 counts as
 * "greater than" bufnum1.  Note that once one hits a delimiter in the inner
 * loop, mapped to zero by alpha_order, there's no need to compare further....
 */
 
int alpha_greater_than (long bufnum1, long bufnum2)
  {
    int n;
    unsigned char *s1, *s2;
    
    s1 = (unsigned char*)flist[bufnum1].current_keyrecp->key;
    s2 = (unsigned char*)flist[bufnum2].current_keyrecp->key;
    
    /* make the next statement efficient, since it's going to be executed
     * pretty often!! This is the innermost loop :-)
     */
    for ( n = KEYLENGTH; --n && alpha_order[*s1] &&
            alpha_order[*s1] == alpha_order[*s2]; ++s1, ++s2)
        ;
 
    return ( (alpha_order[*s1] == alpha_order[*s2]) ? ( bufnum1 > bufnum2)
            : (alpha_order[*s1] > alpha_order[*s2]) );
  }
 
 
/* send the smallest remaining records to the output stream ... specifically,
 * check the key that is on the top of the heap of buffers, and if it
 * differs from the previously-recorded global current_keyrec.key, write that
 * previous current_keyrec (key and associated cumulative count) out to the merged
 * keyfile, and update current_keyrec.key.  In any event, continue by writing out
 * to the merged ptrfile all the ptr records corresponding to the now-current
 * smallest key (adding in the proper offset to the absolute values stored
 * in the individual ptrfile), increment the overall cumulative count
 * current_keyrec.ccount, decrement this flist item's remaining_ptrcount,
 * and (if there are any keys and ptrs left to be dealt with in this file)
 * move the current ccount for this flist item to prev_ccount, and
 * fetch the next key to be current_keyrec for this flist item; then sift
 * the current buffer down to its proper place in the heap.  If the current
 * buffer is now empty, remove it from the heap and put the formerly-last
 * heap buffer on top, then sift it down to make a legal heap again.
 *
 * it may sound complex, but it's really just dealing the right number of cards off
 * the top of the smallest remaining record's deck ...
 */
 
void write_out_smallest_remaining_record (void)
  {
    long i, smallest, ptrcount, current_ptr, smallest_offset, current_slot, test_slot;
    
    smallest = heap[1];
 
    /* it's ok to use strncmp here instead of custom alpha_order[], since
     * all we care about is equality test, and all identical characters
     * have already been mapped to their equivalents in creating the
     * keyfiles by zndxr...
     */
    if (strncmp (current_keyrec.key, flist[smallest].current_keyrecp->key,
                        KEYLENGTH) != 0)
      {
        if (fwrite (&current_keyrec, sizeof (ZKEY), 1, kfp) != 1)
            error_exit (217,
                "write_out() fwrite() error for key record at keyfile #",
                smallest);
        strncpy (current_keyrec.key, flist[smallest].current_keyrecp->key,
                    KEYLENGTH);
      }
 
    ptrcount = flist[smallest].current_keyrecp->ccount -
           flist[smallest].prev_ccount;
 
    smallest_offset = flist[smallest].offset;
    for (i = 0; i < ptrcount; ++i)
      {
        fetch_next_ptr (&current_ptr, smallest);
 
        current_ptr += smallest_offset;
 
        if (fwrite (&current_ptr, sizeof (long), 1, pfp) != 1)
            error_exit (221,
                "write_out() fwrite() error for ptr record from ptrfile #",
                smallest);
      }
 
    current_keyrec.ccount += ptrcount;
    flist[smallest].remaining_ptrcount -= ptrcount;
    
    /* if records remain in this file, set up to process next one; if not,
     * cut down the size of the heap and put the final heap item on top,
     * prior to the sifting down operation...
     */
    if (flist[smallest].remaining_ptrcount > 0)
      {
        flist[smallest].prev_ccount = flist[smallest].current_keyrecp->ccount;
        fetch_next_keyrec (smallest);
      }
    else
      {
        smallest = heap[last_heap_slot--];
        heap[1] = smallest;    /* must do this in case last_heap_slot == 1 */
      }
    
    /* now restore heap condition by sifting down the top, formerly smallest,
     * item to its proper place
     */
    current_slot = 1;
    test_slot = 2;
    while (test_slot <= last_heap_slot)
      {
        if (test_slot < last_heap_slot &&
              alpha_greater_than (heap[test_slot], heap[test_slot + 1]))
            ++test_slot;   /* now test_slot is smaller of the child nodes */
 
        if (alpha_greater_than (smallest, heap[test_slot]))
          {
            /* must move test_slot item up and commence work on next level down */
            heap[current_slot] = heap[test_slot];
            current_slot = test_slot;
            test_slot *= 2;
          }
        else
            break;
      }
 
    heap[current_slot] = smallest;
 
    return;
  }
 
 
/* close the output key and ptr files ... but first, must write out the very
 * last key record to the merged keyfile ... then the merger is completed...
 */
 
void close_outfiles (void)
  {
    if (fwrite (&current_keyrec, sizeof (ZKEY), 1, kfp) != 1)
        error_exit (222,
            "oops, close_outfiles() fwrite() error on final key record", 0L);
 
    printf ("closing output files...\n");
 
    fclose (kfp);
    fclose (pfp);
 
    return;
  }
 

