Date: Mon, 2 Jun 1997 17:40:02 -0700 (PDT)
From: Mark Zimmermann
Subject: zndxr41.c + comments


WOW! --- found some files from 1991, a real mother lode of stuff,
which I had mostly forgotten about ... seems to be heavily commented
with descriptions of alphabetization, multi-file indexing, etc.
issues! --- as I recall, these did compile & run pretty well, but were
by no means totally debugged ... so in next 4 msgs for you there will
be:

-rw-------  1 zimm         9618 Jun  2 17:35 zcmpndx04.c
-rw-------  1 zimm        39753 Jun  2 17:33 zbrwsr58.c
-rw-------  1 zimm        36129 Jun  2 17:26 zmrgr46.c
-rw-------  1 zimm        34197 Jun  2 17:19 zndxr41.c


For starters, here's "zndxr41.c", an experiment in rewriting the
generic indexer that includes some more explicit choices about
character mapping and alphabetization during index-building ... I
think I used this as the basis for the 1995 Cyrillic version
(maybe).... ^z

-------zndxr41.c-----

/* zndxr.c - new multifile index-builder by ^z
 * (c) 1991 - Mark Zimmermann
 *
 * this is free software and comes with no warranty WHATSOEVER!
 * See the GNU General Public License for details
 *
 * This new indexer simplifies and revises the qndxr.c, etc. programs
 * from 1987-90 in several ways -- the new indexer is memory-based
 * and separates out the index-merging functions to another program.
 * This new indexer allows user control of alphabetization
 * and character mapping, optional preservation of embedded punctuation
 * symbols, and the ability to use a filelist file to index
 * multiple files as a batch((, or to segment large files into parts
 * which are separately indexed --- yet to be implemented, again!)).  
 * Finally, the new indexer is written with transportability to MS-DOS,
 * UNIX, and Macintosh in mind ... though I do insist on using the new
 * ANSI-style function declarations, prototypes, etc.
 *
 * ((Note -- after frustrating experiences with the Intel segmented
 * 8088-style architecture, far pointers, huge memory models, etc.,
 * I have reverted to a very generic approach (i.e., I ignore the
 * DOS problems entirels!!!) --- thus, in order to make
 * this program work on files larger than 32kB or so (I speculate)
 * under MS-DOS may require some changes (e.g., 'large memory
 * model' compilation) for which I am neither equipped nor expert
 * enough .... help, somebody!??))
 */
 
/* if one needs to convert program for compilation by an old, dumb cc,
 * for generic use, this is what happens:
 *
 * - prototypes turned into function head type declarations (via gawk
 *    program to take everything out from within the () in the prototypes)
 * - function heads turned back to old form (for things that don't
 *    call functions as arguments, single line head, etc.), using gawk
 * - <stdlib.h> reference removed, as it offends...
 * - #define time_t, difftime(), SEEK_SET, and SEEK_END
 */
 
#ifdef DUMB_COMPILER
#define time_t              long
#define difftime(t1,t0)     (t1 - t0)
#define SEEK_SET            0
#define SEEK_END            2
#else
#include <stdlib.h>
#endif
 
#include <stdio.h>
#include <string.h>
#include <time.h>
 
/* for Mac (Think C) ... need to put this into a project with the ANSI
 * and the MacTraps libraries ... the <console.h> header lets main()
 * take command-line UNIX-style arguments ... very un-Macish, sorry,
 * but it works!!
 */
#ifdef THINK_C
#include <console.h>
#endif
 
#define MAXSTRLEN    256
#define KEYLENGTH     28
#define TRUE           1
#define FALSE          0
 
/* define data type for my index pointers, which hold the offset
 * into the document file where each word is found
 */
typedef unsigned char *  zndxptr;
 
/* begin prototypes */
/* keep all prototypes here, for my awk filter program to modify as needed
 * for dumb old compilers ... keep then each single line long, beginning with
 * lower-case letter in first column, and ending with ");" ....
 * converter program eats the contents of the parens....
 */
 
int main (int argc, char *argv[]);
void print_help (void);
void load_alpha (char *fname);
void index_listed_files (char *fname);
void build_index (void);
long filesize (FILE *file);
void *reserve_memory (long len);
void load_docfile (FILE* docfile, unsigned char *docp, long doclength);
long filter_doc (unsigned char *doc0, long doclength);
void make_index_ptrs (unsigned char *doc0, long doclength, zndxptr *ptr0, long ptrcount);
void sort_index_ptrs (zndxptr *ptr0, zndxptr *ptr1);
int  compare_words (zndxptr *p1, zndxptr *p2);
void open_index_files (char *kfilename, FILE **kfp, char *pfilename, FILE **pfp);
long write_index_files (unsigned char *doc0, zndxptr *ptr0, long ptrcount, FILE *kfile, FILE *pfile);
void fwrite_key (zndxptr pp, FILE *kfile);
void error_exit (int errnum, char *msg, long errarg);
 
/* end prototypes */
 
/* global variables */
 
/* info about file to be indexed:
 *  docfilename[] = name of document file (call it <docfile> below)
 *  keyfilename[] = name of key file (default <docfile>.k)
 *  ptrfilename[] = name of ptr file (default <docfile>.p)
 */
 
char docfilename[MAXSTRLEN];
char keyfilename[MAXSTRLEN];
char ptrfilename[MAXSTRLEN];
 
/* mapping to be applied to characters in the input file:
 * the array alpha_map[] translates characters by default
 * into upper-case alphabetic, preserves numbers, and
 * turns everything else into a delimiter (null).  The
 * default character mapping can be overridden by using
 * the '-f mapfile' option in the command line.  See the
 * documentation for function load_alpha() for details.
 *
 * Note that mapping characters to null allows for use of
 * strcmp(), strcpy(), etc. throughout the program, since
 * the words in the file are null-terminated strings ...
 * very convenient.
 */
unsigned char alpha_map[] =
  {
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
     '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
     '8',  '9', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
     'X',  'Y',  'Z', '\0', '\0', '\0', '\0', '\0',
    '\0',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
     'X',  'Y',  'Z', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'
  };
 
/* alphabetization order:  the array alpha_order[] determines
 * the order of alphabetization of characters during indexing.
 * The default is simply ASCII order, with numbers preceding
 * letters.  This default behavior may be changed using the
 * '-f mapfile' command line option.  See the documentation
 * accompanying function load_alpha() for details.
 *
 * Here, although many other patterns are possible, I
 * simply repeat the contents of the alpha_map[] array
 * above as the default...
 *
 * Note that the value of alpha_order entries should never
 * be zero for any characters which are actually not mapped
 * to null -- otherwise, alphabetization errors may result!
 */
unsigned char alpha_order[] =
  {
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
     '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
     '8',  '9', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
     'X',  'Y',  'Z', '\0', '\0', '\0', '\0', '\0',
    '\0',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
     'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
     'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
     'X',  'Y',  'Z', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'
  };
 
 
/* embedded punctuation handling:  a character found in
 * embedded_punct[] with a nonzero entry is treated specially:
 * the characters on either side in the document file (after
 * mapping by alpha_map[]) are checked, and if they are both
 * non-delimiters and non-punctuation, then the current character
 * is an 'embedded punctuation' symbol and is kept; otherwise, it
 * is set to null (a delimiter).
 *
 * Thus, the alpha_map[] array contains all characters which *may*
 * survive to be included in indexed words; the embedded_punct[]
 * array specifies characters which are in the alpha_map as
 * 'second-class' symbols, and which can only be kept if surrounded
 * on both sides by 'first-class' symbols.
 *
 * This is an extremely simple parser, obviously, but it is fast,
 * easy to understand, and (I hope) provides enough flexibility
 * to handle many users' requirements for special handling of
 * symbols used in marked-up texts, linguistic symbols, etc.
 * This approach also only requires a single character of lookahead,
 * which getc() and ungetc() can do for us ... though in this
 * implementation, the document is already in memory and there
 * is no problem with arbitrary look-ahead....
 *
 * The default behavior, for compatibility with earlier indexer
 * programs, is NOT to keep any embedded punctuation; hence, this
 * array embedded_punct[] is full of nulls....
 */
 
unsigned char embedded_punct[] =
  {
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0'
  };
 
 
 
 
/*********************************************************/
 
main (int argc, char *argv[])
  {
    int i;
    
    /* get the command line arguments for Macintosh users ...
     */
#ifdef THINK_C
    argc = ccommand (&argv);
#endif
 
    printf ("\nWelcome to zndxr version 0.37 - (c) 1991 - ^z\n");
 
    if (argc == 1)
        print_help ();
    else
        index_listed_files (argv[1]);
 
    printf ("\nGood-bye! - ^z\n");
    
    return;
  }
 
 
/* print out minimally helpful information for user... */
 
void print_help (void)
  {
    printf ("\nzndxr:  build index files for free-text information retrieval\n");
    printf ("\nusage:  zndxr filelist\n");
    printf ("\nzndxr (c) 1990 Mark Zimmermann - ^z\n");
    printf ("this program comes with NO WARRANTY whatsoever\n");
    printf ("zndxr is free software, under terms of the GNU General Public License\n");
    printf ("for additional information, read the source code, or if\n");
    printf ("absolutely necessary, write to me at P.O.Box 598,\n");
    printf ("Kensington, Maryland 20895-0598, USA.\n");
    
    return;
  }
 
 
/* load alpha file to redefine character mapping and/or alphabetization
 * order and/or handling of embedded delimiters within words.
 *
 * The mapping defined by the alpha file is applied to all characters
 * in the document to be indexed as that document is read into memory.
 * Further filtering, such as detection of embedded punctuation, is
 * done in a subsequent stage....
 *
 * The alpha file is optional.  It is an ASCII text file consisting of
 * a succession of lines, giving the characters to appear in the index
 * file in the order in which they are to be alphabetized. If several
 * characters are listed on one line, then all of them are mapped
 * (translated) into the first character on that line.  Thus
 * the default alphabetization order (ASCII) and character mapping
 * (lower-case alphabetic to upper-case) is equivalent to an alpha
 * file which looks like:
0
1
2
3
4
5
6
7
8
9
Aa
Bb
 <<etc.>>
Yy
Zz
 * Alternative alphabetizations and character mappings are most
 * likely to be useful to those who need to work with non-Latin
 * alphabets or who wish to handle non-alphanumeric characters
 * in a special way.
 *
 * There are two 'special' characters in an alphafile:
 *  # <number-sign>
 *  ~ <tilde>
 *
 * To include a nonprintable or 'special' character in an alpha file, write
 * a number-sign ('#') followed by the DECIMAL value of the character.
 * Thus, #126 = ~ = <tilde>, #35 = #, #10 = ^J = '\n' = newline, etc.
 *
 * Comments may be included in an alphafile on a line beginning with a '# ';
 * any line with '# ' as its first two characters is ignored.
 *
 * If it is desired to treat some delimiters specially, so that
 * they are retained in index words only when embedded in between
 * ordinary non-delimiters, then those special symbols are
 * listed after a '~' (tilde) character on the line.
 * Thus, to keep the embedded characters '.' (as in '3.14159'),
 * '-' (as in 'good-bye'), and ',' (as in '1,234,567'), and to
 * map all ',' commas into '.' periods for index listing, and to
 * alphabetize the '.' (and ','), and '-' in between Y and Z,
 * in that order, the final few lines of the alpha file might look like:
Yy
~.,
~-
Zz
 * Notwithstanding the great and powerful potential of the alphafile,
 * it is generally wisest (in my experience) to use the default
 * practice of omitting all punctuation symbols from the index.
 * This avoids problems with words occurring in scattered places
 * in the index lists due to various haphazard hyphenations or
 * accidents of punctuation.  Thus, if the expression 'free-text'
 * were hyphenated when used as a modifier, it would not be possible
 * to search for the word 'text' in that occurrence.  On the other
 * hand, it is unaesthetic to see the word 'don't' split into 'don'
 * and 't' in an index list, I agree....
 *
 * So, the option to keep punctuation characters as second-class
 * citizens, so that they only are retained when framed on both
 * sides by first-class indexed characters, is provided as an
 * alternative index-building strategy for those who don't want
 * to keep all punctuation in the index, but who dislike omitting
 * all punctuation as well.
 *
 * If you have specially formatted input data files (perhaps using
 * some symbols to indicate extra linguistic meanings, or fonts,
 * or data types, or whatever) you may be able to use a customized
 * alpha file to build particularly useful index structures.  Many 
 * linguists and others with unusual needs have asked me for this
 * capability -- here it is!
 *
 * This alpha file structure is used during index building; it is
 * also needed during index browsing, if non-default mappings or
 * alphabetization or delimiter-handling features are used, so that
 * the index browser program can correctly search the index files.
 *
 * No guarantees if a line is longer than MAXSTRLEN-1 characters!!!
 */
 
void load_alpha (char *fname)
  {
    FILE *alphafile;
    int c, i, len, firstc, flag, order = 0;
    char buf[MAXSTRLEN];
 
    printf ("opening alphafile '%s'\n", fname);
    alphafile = fopen (fname, "r");
    if (alphafile == NULL)
        error_exit (100, "fopen() error in load_alpha()", 0L);
    
    for (i = 0; i < 256; ++i)
      {
        alpha_map[i] = '\0';
        alpha_order[i] = '\0';
        embedded_punct[i] = '\0';
      }
    
    while (fgets (buf, MAXSTRLEN, alphafile) != NULL)
      {
        /* skip a comment line (any line which begins with '# ') and
         * skip a completely empty line (containing only '\n')
         */
        if ((buf[0] == '#' && buf[1] == ' ') || buf[0] == '\n')
            continue;
 
        /* increment the counter that keeps track of where to put this
         * line's characters alphabetically ...
         */
        if (++order > 255)
            error_exit (101, "++order error in load_alpha()", (long)order);
 
        firstc = -1;
        flag = FALSE;
        len = strlen (buf);
        if (buf[len - 1] == '\n')
            /* remove pesky final '\n' from lines that fgets() provides...
             */
            --len;
        
        for (i = 0; i < len; ++i)
          {
            c = buf[i];
            
            if (c == '~')
              {
                /* set flag to make rest of this line 'special'
                 */
                flag = TRUE;
                continue;
              }
            
            /* convert escaped characters after '#' into proper form
             */
            if (c == '#')
              {
                c = 0;
                ++i;
                while (buf[i] >= '0' && buf[i] <= '9')
                    c = 10 * c + buf[i++] - '0';
                --i;
              }
            
            /* safety net -- following *should* never happen!
             */
            if (c < 0 || c > 255)
                error_exit (102, "character error in load_alpha()", (long)c);
            
            if (firstc == -1)
                firstc = c;
 
            alpha_map[c] = firstc;
            alpha_order[c] = order;
            if (flag)
                embedded_punct[c] = TRUE;
          }
      }
    fclose (alphafile);
    
    return;
  }
 
 
/* index a list of files.  The format of the filelist file
 * is designed to be compatible with the '.f' file (to be) used by
 * my free text information retrieval ("browser") programs.  A
 * filelist file consists of a succesion of lines containing:
 *    the name of the document file itself
 *    the name of the '*.k' key file
 *    the name of the '*.p' ptr file
 *
 * All parameters after the document file name are optional;
 * if they are omitted, the entire document file is indexed and
 * the default .k and .p suffixes are appended to that file name
 * to get the key and ptr file names.
 * (Thus, in MS-DOS systems, the original file must have no extension
 * unless explicit names for key and ptr files are provided.)
 * This default behavior happens for file names given on the command
 * line...
 *
 * A line beginning with a '#' character is treated as a comment and is
 * ignored in the filelist.  The single exception, as far as zndxr is
 * concerned, is a line beginning with "#alphafile: ", which is read and
 * used to load an alphafile for subsequent index-building.  Thus, the line:
#alphafile:  greeble.a
 * says to use the custom alphabetization in file "greeble.a" hereafter.
 *
 * Note that it doesn't make much sense, ordinarily, to use different
 * alphabetizations within a database, and thus the alphafile indication
 * should appear early in the filelist, before any files have been indexed.
 * Note also that there may be other lines beginning with '#' in the
 * filelist, specifying things which the zndxr program doesn't care about.
 *
 * If you really want to begin a filename with a '#', put a space in front
 * of it in the filelist entry for that file; leading whitespace on a
 * normal filelist line is ignored....
 */
 
void index_listed_files (char *fname)
  {
    FILE *filelistfile;
    int n;
    char buf[MAXSTRLEN], alphafilename[MAXSTRLEN];
 
    printf ("opening filelist file '%s'\n", fname);
    filelistfile = fopen (fname, "r");
    if (filelistfile == NULL)
        error_exit (112, "error opening filelist in index_listed_files()", 0L);
 
    while (fgets (buf, MAXSTRLEN, filelistfile) != NULL)
      {
        /* skip over a comment line (any line which begins with '#' not
         * immediately followed by "alphafile:") and process an alphafile
         * inclusion directive (a line beginning "#alphafile:" followed by
         * an alphafile filename)
         */
        if (buf[0] == '#')
          {
            n = sscanf (buf, "#alphafile: %s", alphafilename);
            if (n == 1)
                load_alpha (alphafilename);
            continue;
          }
 
        n = sscanf (buf, " %s %s %s",
                    docfilename, keyfilename, ptrfilename);
        
        /* skip over empty or mysteriously-bad lines
         */
        if (n == EOF || n < 1 || n > 3)
            continue;
            
        if (n < 3)
          {
            strcpy (ptrfilename, docfilename);
            strcat (ptrfilename, ".p");
          }
        
        if (n < 2)
          {
            strcpy (keyfilename, docfilename);
            strcat (keyfilename, ".k");
          }
        
        build_index ();
      }
    
    fclose (filelistfile);
    
    return;
  }
 
 
/* build the index for a file
 */
 
void build_index (void)
  {
    long doclength, ptrcount, different_words;
    FILE *docfile, *kfile, *pfile;
    unsigned char *doc0;
    zndxptr *ptr0;
    time_t t0, t1;
    double secs;
 
    printf ("\nbeginning to build index for file '%s'\n", docfilename);
    time (&t0);
    printf ("starting time = %s", ctime (&t0));
 
    docfile = fopen (docfilename, "rb");
    if (docfile == NULL)
        error_exit (103, "fopen() error in build_index()", 0L);
 
    doclength = filesize (docfile);
    if (doclength <= 0)
        error_exit (117, "bad doclength in build_index()", doclength);
        
    printf ("reserving %ld bytes for document\n", doclength);
    
    /* actually, get two extra bytes, and set the first and last to null,
     * to avoid end-effect problems in following routines ... make the
     * doc0 pointer refer to the true document starting byte, however
     * (and don't forget to fix doc0 again before freeing the block of
     * of memory!!)
     */
    doc0 = reserve_memory (doclength + 2);
    *doc0++ = '\0';
    *(doc0 + doclength) = '\0';
    
    printf ("loading document & remapping characters\n");
    load_docfile (docfile, doc0, doclength);
    
    fclose (docfile);
    
    printf ("filtering characters & counting words to index\n");
    ptrcount = filter_doc (doc0, doclength);
    
    printf ("document contains %ld words\n", ptrcount);
    printf ("reserving %ld bytes for index pointers\n",
                sizeof (zndxptr) * ptrcount);
    ptr0 = reserve_memory (sizeof (zndxptr) * ptrcount);
    
    printf ("generating unsorted index pointers to every word\n");
    make_index_ptrs (doc0, doclength, ptr0, ptrcount);
    
    printf ("sorting index pointers\n");
    sort_index_ptrs (ptr0, ptr0 + ptrcount); /* check on this --- is last right?? */
    
    printf ("creating index files\n");
    open_index_files (keyfilename, &kfile, ptrfilename, &pfile);
    
    printf ("writing index files to disk\n");
    different_words = write_index_files (doc0, ptr0, ptrcount, kfile, pfile);
    
    printf ("document %s contained %ld different words\n", docfilename,
                different_words);
    printf ("index files '%s' and '%s' successfully written\n",
            keyfilename, ptrfilename);
    
    printf ("freeing memory allocated for document and pointers\n");
    free (--doc0);
    free (ptr0);
    
    printf ("\ncongratulations! -- indexing complete for %s\n", docfilename);
    time (&t1);
    printf ("ending time = %s", ctime (&t1));
    secs = difftime (t1, t0);
    printf ("elapsed time = %g seconds\n", secs);
    if (secs > 0)
        printf ("net indexing rate = %.2f megabytes/hour\n",
                    doclength * .0036 / secs);
 
    return;
  }
 
 
/* determine the size of the file
 */
 
long filesize (FILE *file)
  {
    long len;
 
    if (fseek (file, 0L, SEEK_END) != 0)
        error_exit (104, "fseek() error in filesize()", 0L);
    len = ftell (file);
    if (len == -1L)
        error_exit (105, "ftell() error in filesize()", 0L);
 
    return (len);
  }
 
 
/* reserve memory for holding len bytes
 */
 
void *reserve_memory (long len)
  {
    void *p;
 
    p = malloc (len);
    if (p == NULL)
        error_exit (106, "out of memory error in reserve_memory()", len);
 
    return (p);
  }
 
 
/* load document file, transforming characters as per alphabetic
 * mapping table alpha_map[]...
 */
 
void load_docfile (FILE* docfile, unsigned char *docp, long doclength)
  {
    long ptrcount = 0L, i;
    int inaword = FALSE;
 
    if (fseek (docfile, 0, SEEK_SET) != 0)
        error_exit (107, "fseek() error in load_docfile()", 0);
 
    for (i = 0; i < doclength; ++i)
        *docp++ = alpha_map[getc (docfile)];
 
    return;
  }
 
 
/* filter the document as loaded into memory (after the character
 * mapping has already been applied!) and count the number of words
 * to generate pointers to, so that we know how much memory to reserve
 * for storage of those ptrs....
 *
 * ((This routine may be replaced in order to do fancier parsing
 * and/or transformation of the text -- recognition of 'stop words', etc.,
 * as suggested by roskos & others ... note that it seems unaesthetic to
 * do two passes through the document in memory, here in filter_doc() and
 * below in make_index_ptrs() ... perhaps the two routines could be
 * combined into one?  On the other hand, it is very nice to know exactly
 * how many pointers are going to be created, so that just the right
 * amount of memory may be reserved....))
 *
 * In this generic version of the index-building program, however,
 * the only option during filtering is to preserve "second-class"
 * (e.g., punctuation) characters which are found embedded between
 * "first-class" (e.g., alphanumeric) characters.
 *
 * I realize that the "embedded_punct[*(doc0 - 1)]" clause in the
 * embedded-punctuation-testing "if" statement is unnecessary, but I keep
 * it in place for simplicity and reliability.
 */
 
long filter_doc (unsigned char *doc0, long doclength)
  {
    int inaword = FALSE;
    long i, ptrcount = 0L;
 
    for (i = 0; i < doclength; ++i)
      {
        if (embedded_punct[*doc0] && (! *(doc0 - 1) || ! *(doc0 + 1) ||
                embedded_punct[*(doc0 - 1)] || embedded_punct[*(doc0 + 1)]))
            *doc0 = '\0';
 
        if (*doc0 && ! inaword)
          {
            inaword = TRUE;
            ++ptrcount;
          }
        else if (! *doc0 && inaword)
            inaword = FALSE;
 
        ++doc0;
      }
 
    return (ptrcount);
  }
 
 
/* make the pointers to every word in the document by scanning
 * along, recognizing when each word begins....
 *
 * during the generation of the pointers, check that the number
 * of them agrees with the count provided by the filter_doc()
 * operation.... for safety's sake....
 */
 
void make_index_ptrs (unsigned char *doc0, long doclength, zndxptr *ptr0, long ptrcount)
  {
    int inaword = FALSE;
    long i;
 
    for (i = 0; i < doclength; ++i)
      {
        if (*doc0 && ! inaword)
          {
            inaword = TRUE;
            *ptr0 = doc0;
            ++ptr0;
            if (--ptrcount < 0)
                error_exit (108, "ptr count error in make_index_ptrs()", i);
          }
        else if (! *doc0 && inaword)
            inaword = FALSE;
 
        ++doc0;
      }
 
    return;
  }
 
 
/* sort the index pointers in memory based on the alphabetization
 * chosen for the document's words ...
 *
 * use my standard old reliable quicksort, simple implementation; no
 * attempt to apply optimizations yet!  Probably can improve indexing
 * performance by 20% or so with some effort here....
 */
 
void sort_index_ptrs (zndxptr *first, zndxptr *last)
  {
    register zndxptr *i, *j, tmp;
 
    while (last - first > 1)
      {
        i = first;
        j = last;
        for (;;)
          {
            while (++i < last && compare_words (i, first) < 0)
                ;
            while (--j > first && compare_words (j, first) > 0)
                ;
            if (i >= j)
                break;
            tmp = *i;
            *i = *j;
            *j = tmp;
          }
        tmp = *first;
        *first = *j;
        *j = tmp;
        if (j - first < last - (j + 1))
          {
            sort_index_ptrs (first, j);
            first = j + 1;
          }
        else
          {
            sort_index_ptrs (j + 1, last);
            last = j;
          }
      }
 
    return;
  }
 
 
/* compare the two words pointed to by pointers p1 and p2, using
 * the alphabetization order as defined by alpha_order[] array...
 * much like the strcmp() function, modified to return the right
 * result to make quicksort "stable" if words pointed to are tied
 * (in which case, return the sign of the difference of the ptrs,
 * so that the earlier pointer comes out first in the sort)....
 */
 
int compare_words (zndxptr *p1, zndxptr *p2)
  {
    int diff, n = KEYLENGTH;
    unsigned char *s1, *s2;
 
    s1 = *p1;
    s2 = *p2;
 
    for ( ; --n && (alpha_order[*s1] == alpha_order[*s2]);
                ++s1, ++s2)
        if (! *s1)
            break;
 
    diff = alpha_order[*s1] - alpha_order[*s2];
 
    if (diff == 0)
        diff = (*p1 - *p2 < 0) ? -1 : 1;
 
    return (diff);
  }
 
 
/* open the index key and ptr files for writing out the results
 * of the index building operations...
 */
 
void open_index_files (char *kfilename, FILE **kfp, char *pfilename, FILE **pfp)
  {
    *kfp = fopen (kfilename, "wb");
    if (*kfp == NULL)
        error_exit (109, "error opening key file in open_index_files()", 0L);
 
    *pfp = fopen (pfilename, "wb");
    if (*pfp == NULL)
        error_exit (110, "error opening ptr file in open_index_files()", 0L);
 
    return;
  }
 
 
/* write out the key and ptr files to disk... return the number of different
 * words in the document, as written to the keyfile
 *
 * note that the zndxptr pointers in memory must have the "base" value doc0 - start
 * subtracted from them in order to get absolute byte offsets into the document
 * file, before they are written out to the ".p" ptr file....
 */
 
long write_index_files (unsigned char *doc0, zndxptr *ptr0, long ptrcount, FILE *kfile, FILE *pfile)
  {
    long i, ccount, different_words;
    zndxptr *ptrp, base, ptr, current_word;
 
    ptrp = ptr0;
    current_word = *ptrp;
    base = doc0;
 
    for (i = 0; i < ptrcount; ++i)
      {
        if (strncmp ((char *)current_word, (char *)*ptrp, KEYLENGTH))
          {
            /* a new word has been encountered ... write out the formerly-current
             * word and its cumulative count value to the kfile, and then
             * set the current word to its new value
             */
            fwrite_key (current_word, kfile);
            ccount = ptrp - ptr0;
            if (fwrite (&ccount, sizeof (long), 1, kfile) < 1)
                error_exit (115, "error writing ccount in write_index_files()", 0L);
            current_word = *ptrp;
           }
        
        ptr = (zndxptr)(*ptrp - base);
 
        if (fwrite (&ptr, sizeof (ptr), 1, pfile) < 1)
            error_exit (111, "error writing ptr in write_index_files()", 0L);
 
        ++ptrp;
      }
 
    /* write out the final word and final cumulative count value to finish
     * up the key file
     */
    fwrite_key (current_word, kfile);
            ccount = ptrp - ptr0;
            if (fwrite (&ccount, sizeof (long), 1, kfile) < 1)
                error_exit (116, "error writing ccount in write_index_files()", 0L);
    
    different_words = filesize (kfile) / (KEYLENGTH + sizeof (long));
    
    fclose (kfile);
    fclose (pfile);
        
    return (different_words);
  }
  
 
/* write out a key word to the kfile; take the word from the document
 * as mapped into memory, and pad it on the right with blanks until it is
 * KEYLENGTH bytes long; then send it out....
 */
 
void fwrite_key (zndxptr pp, FILE *kfile)
  {
    char buf[KEYLENGTH], *bp, c;
    int j;
 
    bp = buf;
    for (j = 0; j < KEYLENGTH; ++j)
      {
        c = *pp++;
        if (c == '\0')
            break;
        *bp++ = c;
      }
    for ( ; j < KEYLENGTH; ++j)
        *bp++ = ' ';
 
    if (fwrite (buf, KEYLENGTH, 1, kfile) < 1)
        error_exit (114, "error writing key word in fwrite_key()", 0L);
 
    return;
  }
 
 
/* exit with an error message -- some fatal mistake has been made
 */
 
void error_exit (int errnum, char *msg, long errarg)
  {
    fprintf (stderr, "%d zndxr %s %ld\n", errnum, msg, errarg);
    exit (errnum);
    
    return;
  }

