Date: Mon, 2 Jun 1997 17:42:10 -0700 (PDT) From: Mark Zimmermann this seems to merge multiple indices into a single pair of *.k and *.p files for browsing ... apparently I tried doing this without heap data structures and it took FOREVER, so I rewrote it here in a more efficient way ... ^z -----zmrgr46.c----- /* revised zmerger experiments - ^z - (c) 1991 * mods to use heap data structure for merge buffer indices - 1991/01/26 * next: variable buffer sizes, proportional to input file sizes... * * work on zmerger.c - new program to do an N-way merge of raw * index files created by zndxr.c (or by qndxr.c, TEXAS, TEX, FREE TEXT, etc.) * * This is free software and comes with NO WARRANTY WHATSOEVER! * See GNU GPL for licensing information. * * This program takes as input a list of files (a 'filelist' or '.f' file, * specifying which files go into the database and their lengths, etc.) and * produces a merged pair of index files corresponding to the contents * of that filelist -- a unified database to allow multi-file browsing * by (a forthcoming program) zbrwsr.c ... * * On the command line to zmerger, a single thing is required -- the * name of the filelist which holds the names of the documents * (text files, generally) and their associated index files ('.k' and * '.p', also known as 'key' and 'ptr' files). The filelist itself * contains the name of the special alphabetization ('alphafile') and * the names and locations to store the merged database files into. * * Thus, the usage of zmerger is: zmerger filelist * * It makes sense to put all this stuff in the filelist master file * itself, since in real use multi-file browsing will (I hope!) be * applied to large collections of files (possibly thousands or more * different documents) -- too many to put onto a command line! * * The format of the filelist file is specified as follows: * * - a filelist shall consist of lines of printable characters, * no line of which may be longer than 255 bytes. * * - all non-empty lines of the filelist file which do not begin * with a '#' shall consist of a file name (possibly a full path) * of a text 'document' file to be indexed, followed * by a location for the index key file, followed * by the index ptr file's location, followed by the offset to * apply to the byte locations of words in that file. That offset * value begins at 0 for the first file, and is actually the * cumulative length of all previous files in the filelist. * ((note --- want to be sure to allow for segmentation of input * text files later on ... must think about this offset question * then...??)) * * - the customary name of a filelist file shall end with '.f'; other * names are allowed, but may cause conflicts or inconsistencies * unless the default names for key and ptr files are explicitly * overriden. * * - the default for the names of the filelist's associated master key * and ptr files shall be the name of the filelist with 'k' * and 'p' respectively replacing the last letter of the * filelist's own file name, in the same directory as the * filelist itself. * * - to override the defaults for the filelist's key and ptr files, * the directive "#keyfile: fname" may be placed in the filelist * (before the first document file name) to indicate where * the key file is to be found; the directive "#ptrfile: fname" may * similarly be used to locate the ptr file (where the '#' occurs * in the first column and where 'fname' may include a full path, * e.g., /usr/mark/data.k). * * - the default alphabetization order shall be that of the ASCII character * set; the default mapping of characters in a text file is to treat * all non-alphanumeric characters as delimiters, and to turn all * lower-case alphabetic characters to upper-case. * * - to override the default alphabetization order and character map, * the directive "#alphafile: fname" may be placed in the filelist * before the first document file name. (The format of * the alphafile is described in detail elsewhere; it essentially * consists of a list of the characters to be kept in indexed documents, * in the order in which they are to be alphabetized, with all * characters on each line of the alphafile mapped into the first * on that line.) * * - a filelist line beginning with '# ' (a number-sign followed by a space) * shall be a comment and shall be ignored by all programs using the * filelist; in fact, any line beginning with a '#' but not with a * recognized directive shall be ignored. * * - unprintable characters shall be included in a filelist as a * decimal number after a '#'; thus, ^Z is written '#26'. The '#' * symbol itself shall be written '#35' when it is needed. * * - the '#' (number-sign, sharp, pound-sign, tic-tac-toe board) * shall be the only special 'escape' character; all other printable * symbols shall represent themselves. * */ /* if one needs to convert program for compilation by an old, dumb cc, * for generic use, this is what happens: * * - prototypes turned into function head type declarations (via gawk * program to take everything out from within the () in the prototypes) * - function heads turned back to old form (for things that don't * call functions as arguments, single line head, etc.), using gawk * - reference removed, as it offends... * - #define time_t, difftime(), SEEK_SET, and SEEK_END */ #ifdef DUMB_COMPILER #define time_t long #define difftime(t1,t0) (t1 - t0) #define SEEK_SET 0 #define SEEK_END 2 #else #include #endif #include #include #include /* for Mac (Think C) ... need to put this into a project with the ANSI * and the MacTraps libraries ... the header lets main() * take command-line UNIX-style arguments ... very un-Macish, sorry! */ #ifdef THINK_C #include #endif /* define some parameters used by the following routines ... MAXSTRLEN is the * longest string expected to be seen in an alphafile, a filelist, etc. ... * KEYLENGTH is the number of letters to be kept (at most) for an indexed * word (28 is chosen for compatibility with my earlier index-building programs; * it's long enough to avoid truncating most words, adds up to a power of two * when the 4-byte long integer 'cumulative count' is appended to make an index * record in the key file, and is short enough that (for reasonably large input * files) the key file isn't exorbitantly huge ... */ #define MAXSTRLEN 256 #define KEYLENGTH 28 /* for the moment, use fixed-size buffers, but do this right later with * variable-sized (proportional to the file size) */ #define DEFAULTBUFFERSIZE 10240 #define TRUE 1 #define FALSE 0 /* define data type to hold keyfile records */ typedef struct { char key[KEYLENGTH]; long ccount; } ZKEY; /* define data type to hold in-memory information essential to merger ... * for each file in the filelist, must hold: * - offset to apply to the ptr records * - keyfile name * - ptrfile name * - pointer to beginning of the keyfile buffer * - pointer to next record in the keyfile buffer * - pointer to end of the keyfile buffer * - pointer to beginning of the ptrfile buffer * - pointer to next record in the ptrfile buffer * - pointer to end of the ptrfile buffer * - number of the key record to start the next keybuf load * - number of the ptr record to start the next ptrbuf load * - total number of remaining ptrfile records * - previous cumulative count value (which tells where ptrs for * the current key begin) */ typedef struct { long offset; char *kfname; char *pfname; ZKEY *keyrecbufp; ZKEY *current_keyrecp; ZKEY *endkeyrecbufp; long *ptrbufp; long *current_ptrp; long *endptrbufp; long next_keybuf_start_num; long next_ptrbuf_start_num; long remaining_ptrcount; long prev_ccount; } FLST; /* begin prototypes */ int main (int argc, char *argv[]); void print_help (void); void load_alpha (char *fname); void read_filelist (char *fname); void print_switch_error (char *s); long filesize (FILE *file); long count_listed_files (FILE *filelistfile); void *reserve_memory (long len); void error_exit (int errnum, char *msg, long errarg); void add_to_flist (char *filename, char *kfilename, char *pfilename, long offset); void merge_indices (void); void initial_load_inbuffers (long bufnum); void fetch_next_keyrec (long flistnum); void fetch_next_ptr (long *pbuf, long flistnum); void push_index_onto_heap (long bufnum); void open_outfiles (void); int alpha_greater_than (long bufnum1, long bufnum2); void write_out_smallest_remaining_record (void); void close_outfiles (void); /* end prototypes */ /* global variables */ char keyfilename[MAXSTRLEN]; FILE *kfp; char ptrfilename[MAXSTRLEN]; FILE *pfp; long nfiles = 0; long maxfiles = 0; long buffersize = DEFAULTBUFFERSIZE; FLST *flist = NULL; long first_active_file; ZKEY current_keyrec; long *heap; long last_heap_slot; /* alphabetization order: the array alpha_order[] determines * the order of alphabetization of characters during indexing. * The default is simply ASCII order, with numbers preceding * letters. This default behavior may be changed using the * '-f mapfile' command line option. See the documentation * accompanying function load_alpha() for details. * * Here, although many other patterns are possible, I * simply repeat the contents of the alpha_map[] array * (as used in zndxr.c) as the default... * * Note that the value of alpha_order entries should never * be zero for any characters which are actually not mapped * to null -- otherwise, alphabetization errors may result! */ unsigned char alpha_order[] = { '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\0', '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' }; /*********************************************************/ main (int argc, char *argv[]) { /* get the command line arguments for Macintosh users ... */ #ifdef THINK_C argc = ccommand (&argv); #endif printf ("\nWelcome to zmerger... - ^z\n"); if (argc == 1) print_help (); else { read_filelist (argv[1]); merge_indices (); } printf ("\nGood-bye! - ^z\n"); return; } /* print out minimally helpful information for user... */ void print_help (void) { printf ("\nzmerger: merge index files for free-text information retrieval\n"); printf ("\nusage: zmerger filelistname ... \n"); printf ("\nzmerger (c) 1991 Mark Zimmermann - ^z\n"); printf ("this program comes with NO WARRANTY whatsoever\n"); printf ("zmerger is free software, under the GNU General Public License\n"); return; } /* load alpha file to redefine alphabetization order if necessary.... * * The alpha file is optional. It is an ASCII text file consisting of * a succession of lines, giving the characters to appear in the index * file in the order in which they are to be alphabetized. * Alternative alphabetizations and character mappings are most * likely to be useful to those who need to work with non-Latin * alphabets or who wish to handle non-alphanumeric characters * in a special way. * * See the source code of zndxr.c for additional commentary and * explicit examples of alphafile formatting ... most of which is * not required for the index merge operation (e.g., for merging, * the character mapping and embedded-punctuation handling don't * actually matter -- all that should ever be needed is the alpha * order...). * * To include a nonprintable or 'special' character in an alpha file, write * a number-sign ('#') followed by the DECIMAL value of the character. * Thus, #126 = ~ = , #35 = #, #10 = ^J = '\n' = newline, etc. * (The '~' character is special in that it flags embedded punctuation * for the index-builder, so write it as #126 if it's needed...) * * Comments may be included in an alphafile on a line beginning with a '# '; * any line with '# ' as its first two characters is ignored. * * No guarantees if an alphafile line is longer than MAXSTRLEN-1 characters!!! */ void load_alpha (char *fname) { FILE *alphafile; int c, i, len, order = 0; char buf[MAXSTRLEN]; printf ("loading alphafile '%s'\n", fname); alphafile = fopen (fname, "r"); if (alphafile == NULL) error_exit (200, "fopen() error in load_alpha()", 0L); for (i = 0; i < 256; ++i) alpha_order[i] = '\0'; while (fgets (buf, MAXSTRLEN, alphafile) != NULL) { /* skip a comment line (any line which begins with '# ') and * skip a completely empty line (containing only '\n') */ if ((buf[0] == '#' && buf[1] == ' ') || buf[0] == '\n') continue; /* increment the counter that keeps track of where to put this * line's characters alphabetically ... */ if (++order > 255) error_exit (201, "++order error in load_alpha()", (long)order); len = strlen (buf); if (buf[len - 1] == '\n') /* remove any final '\n' from lines that fgets() supplies... */ --len; for (i = 0; i < len; ++i) { c = buf[i]; if (c == '~') /* skip '~' since we don't care about embedded punctuation * for index merging purposes */ continue; /* convert escaped characters after '#' into proper form */ if (c == '#') { c = 0; ++i; while (buf[i] >= '0' && buf[i] <= '9') c = 10 * c + buf[i++] - '0'; --i; } /* safety net -- following *should* never happen! */ if (c < 0 || c > 255) error_exit (202, "character error in load_alpha()", (long)c); alpha_order[c] = order; } } fclose (alphafile); return; } /* read a filelist and process the information in it... reserve memory for * our flist data structures, and for the heap of indices into the flist * items, to keep track of which input file comes next alphabetically... * * The format of the filelist file is designed to be compatible with * the '.f' file used by my other free text information retrieval * ("indexer/browser") programs. A filelist file consists of a * succesion of lines each containing: * the name of each document file * the name of the document's '.k' index key file * the name of the document's '.p' index ptr file * the offset to apply to this document's word locations * * A line beginning with a '#' character is treated as a comment and is * ignored in the filelist, unless it precedes a special directive, such * as in the case of "#alphafile:", "#keyfile:", "#ptrfile:"... * * If you really want to begin a filename with a '#', put a space in front * of it in the filelist entry for that file, maybe.... */ void read_filelist (char *fname) { FILE *filelistfile; int n, len; long fileoffset; char buf[MAXSTRLEN], filename[MAXSTRLEN], kfilename[MAXSTRLEN], pfilename[MAXSTRLEN]; printf ("loading filelist file '%s'\n", fname); filelistfile = fopen (fname, "r"); if (filelistfile == NULL) error_exit (212, "error opening filelist in read_filelist()", 0L); maxfiles = count_listed_files (filelistfile); printf ("reserving %ld bytes for %ld in-memory filelist structures\n", sizeof (FLST) * maxfiles, maxfiles); flist = reserve_memory (sizeof (FLST) * maxfiles); /* for heap of indices, use subscripts beginning with 1, so need an extra slot; * also set count of heap slots in use to zero */ heap = reserve_memory (sizeof (long) * (maxfiles + 1)); last_heap_slot = 0; rewind (filelistfile); printf ("loading filelist...\n"); len = strlen (fname); strcpy (keyfilename, fname); keyfilename[len - 1] = 'k'; strcpy (ptrfilename, fname); ptrfilename[len - 1] = 'p'; while (fgets (buf, MAXSTRLEN, filelistfile) != NULL) { /* skip over a comment line (any line which begins with '#' not * immediately followed by a recognized directive); process a * directive (a line beginning "#alphafile:", "#keyfile:", * or "#ptrfile:", followed by * a filename or numerical parameter) */ if (buf[0] == '#') { n = sscanf (buf, "#alphafile: %s", filename); if (n == 1) { load_alpha (filename); continue; } n = sscanf (buf, "#keyfile: %s", filename); if (n == 1) { strcpy (keyfilename, filename); printf ("merged keyfile name = '%s'\n", keyfilename); continue; } n = sscanf (buf, "#ptrfile: %s", filename); if (n == 1) { strcpy (ptrfilename, filename); printf ("merged ptrfile name = '%s'\n", ptrfilename); continue; } continue; } /* now process 'normal' filelist lines, and add the relevant * information from them to the flist */ n = sscanf (buf, " %s %s %s %ld", filename, kfilename, pfilename, &fileoffset); /* skip over empty or mysteriously-bad lines */ if (n == EOF || n != 4) continue; add_to_flist (filename, kfilename, pfilename, fileoffset); } fclose (filelistfile); printf ("filelist loading completed\n"); return; } /* print out error message in response to unrecognized switch */ void print_switch_error (char *s) { printf ("zmerger: %s: switch not recognized!\n", s); return; } /* determine the size of the file */ long filesize (FILE *file) { long len; if (fseek (file, 0L, SEEK_END) != 0) error_exit (204, "fseek() error in filesize()", 0L); len = ftell (file); if (len == -1L) error_exit (205, "ftell() error in filesize()", 0L); return (len); } /* count the number of files in the filelist ... */ long count_listed_files (FILE *filelistfile) { int n; long numfiles = 0, fileoffset; char buf[MAXSTRLEN], filename[MAXSTRLEN], kfilename[MAXSTRLEN], pfilename[MAXSTRLEN]; while (fgets (buf, MAXSTRLEN, filelistfile) != NULL) { /* skip over a comment line (or other directive) */ if (buf[0] == '#') continue; /* count filelist lines */ n = sscanf (buf, " %s %s %s %ld", filename, kfilename, pfilename, &fileoffset); /* skip over empty or mysteriously-bad lines, as elsewhere */ if (n == EOF || n != 4) continue; /* ok, looks, good, count this as a file */ ++numfiles; } return (numfiles); } /* reserve memory for holding len bytes */ void *reserve_memory (long len) { void *p; p = (void *)malloc (len); if (p == NULL) error_exit (206, "out of memory error in reserve_memory()", len); return (p); } /* exit with an error message -- some fatal mistake has been made */ void error_exit (int errnum, char *msg, long errarg) { fprintf (stderr, "\nFATAL ERROR -- very sorry\n"); fprintf (stderr, "%d zmerger %s %ld\n", errnum, msg, errarg); exit (errnum); return; } /* append new information to the flist, the in-memory data structure which * holds all the essential information for the merge_indices() to find files, * change ptr offsets, etc.; simultaneously, add the index in its proper location * to the heap which will be used to pull out words in alphabetical order */ void add_to_flist (char *filename, char *kfilename, char *pfilename, long offset) { char *fnp; FILE *fp; flist[nfiles].offset = offset; fnp = reserve_memory (strlen (kfilename) + 1); strcpy (fnp, kfilename); flist[nfiles].kfname = fnp; fnp = reserve_memory (strlen (pfilename) + 1); strcpy (fnp, pfilename); flist[nfiles].pfname = fnp; /* reserve memory for and initialize key and ptr buffers for * this file... */ flist[nfiles].keyrecbufp = reserve_memory (buffersize); flist[nfiles].endkeyrecbufp = flist[nfiles].keyrecbufp + buffersize / sizeof (ZKEY); flist[nfiles].current_keyrecp = flist[nfiles].endkeyrecbufp; flist[nfiles].ptrbufp = reserve_memory (buffersize); flist[nfiles].endptrbufp = flist[nfiles].ptrbufp + buffersize / sizeof (long); flist[nfiles].current_ptrp = flist[nfiles].endptrbufp; initial_load_inbuffers (nfiles); /* things look good, so count the file and return ... */ ++nfiles; if (nfiles > maxfiles) error_exit (213, "mysterious error in add_to_filelist(), sorry!\n", nfiles); return; } /* do the job -- merge the index files together and write the results out * to disk as a unified database ... report on elapsed time, etc. * * maybe use buffer for output in next major revision?? ... for now, write out * records one by one and trust to the operating system to handle buffering on * the pair of output streams (.k and .p merged files).... */ void merge_indices (void) { time_t t0, t1; double secs; printf ("\nbeginning to merge index files\n"); time (&t0); printf ("starting time = %s", ctime (&t0)); if (nfiles < 2) { printf ("sorry, %ld is too few files to merge!\n", nfiles); return; } open_outfiles (); /* initialize essential globals for merge process */ strncpy (current_keyrec.key, flist[heap[1]].current_keyrecp->key, KEYLENGTH); current_keyrec.ccount = 0; /* the inner loop: until heap of files to be merged is empty, just keep * writing out the smallest remaining record! */ while (last_heap_slot) write_out_smallest_remaining_record (); close_outfiles (); time (&t1); printf ("\ncongratulations! -- index file merger complete\n"); printf ("ending time = %s", ctime (&t1)); secs = difftime (t1, t0); printf ("elapsed time = %g seconds\n", secs); return; } /* initialize the buffers to hold in memory the key and ptr records * about to be merged for a particular filelist item "i" ... also, put * this item's serial number (index) in its proper place in the heap * alphabetically... * * At the moment, in the FLST structure the offset, kfname, pfname, and * key/ptr buffer pointers have already been filled in ... so here, we * just need to check to make sure the keyfile and ptrfile are both available * (skip over the file & put 0 into remaining_ptrcount if not available) * and if ok, set value of next_keybuf_start_num to 0, fetch current_keyrec, * put 0 into next_ptrbuf_start_num and into prev_ccount, and put * (length of the ptrfile) / sizeof (ZPTR) into remaining_ptrcount.... */ void initial_load_inbuffers (long i) { long pfsize; FILE *fp; if ((fp = fopen (flist[i].kfname, "rb")) == NULL) { printf ("WARNING: zmerger couldn't find key file '%s'!\n", flist[i].kfname); printf (" ... its document will be omitted from the merged database!\n"); flist[i].remaining_ptrcount = 0; return; /* exit in middle of function --- sorry! */ } else fclose (fp); if ((fp = fopen (flist[i].pfname, "rb")) == NULL) { printf ("WARNING: zmerger couldn't find ptr file '%s'!\n", flist[i].pfname); printf (" ... its document will be omitted from the merged database!\n"); flist[i].remaining_ptrcount = 0; return; /* exit in middle of function --- sorry! */ } else { pfsize = filesize (fp); fclose (fp); } flist[i].next_keybuf_start_num = 0; fetch_next_keyrec (i); flist[i].prev_ccount = 0; flist[i].next_ptrbuf_start_num = 0; flist[i].remaining_ptrcount = pfsize / sizeof (long); push_index_onto_heap (i); return; } /* get the next key record for flist[i] ... check for errors * and give up if one is encountered.... do it using buffered approach */ void fetch_next_keyrec (long i) { FILE *fp; long keysread; ++flist[i].current_keyrecp; /* if we're still in the buffer, all done; if not, must fetch from * disk, reset pointers, etc.... */ if (flist[i].current_keyrecp >= flist[i].endkeyrecbufp) { if ((fp = fopen (flist[i].kfname, "rb")) == NULL) error_exit (214, "fetch_current_keyrec couldn't open keyfile #", i); if (fseek(fp, flist[i].next_keybuf_start_num * sizeof (ZKEY), SEEK_SET) != 0) error_exit (215, "fetch_current_keyrec fseek() error for keyfile #", i); keysread = fread (flist[i].keyrecbufp, sizeof (ZKEY), flist[i].endkeyrecbufp - flist[i].keyrecbufp, fp); if (keysread == 0) error_exit (216, "fetch_current_keyrec fread() error for keyfile #", i); flist[i].next_keybuf_start_num += keysread; flist[i].endkeyrecbufp = flist[i].keyrecbufp + keysread; flist[i].current_keyrecp = flist[i].keyrecbufp; fclose (fp); } return; } /* bring the next ptr from filelist ptr file number flistnum into * buffer pbuf which holds it for further processing ... */ void fetch_next_ptr (long *pbuf, long i) { FILE *fp; long ptrsread; ++flist[i].current_ptrp; /* if still in the buffer, all done; if not, must reload */ if (flist[i].current_ptrp >= flist[i].endptrbufp) { if ((fp = fopen (flist[i].pfname, "rb")) == NULL) error_exit (218, "fetch_next_ptr() fopen() error on ptrfile #", i); if (fseek (fp, flist[i].next_ptrbuf_start_num * sizeof (long), SEEK_SET) != 0) error_exit (219, "fetch_next_ptr() fseek() error on ptrfile #", i); ptrsread = fread (flist[i].ptrbufp, sizeof (long), flist[i].endptrbufp - flist[i].ptrbufp, fp); if (ptrsread == 0) error_exit (220, "fetch_next_ptr() fread() error on ptrfile #", i); flist[i].next_ptrbuf_start_num += ptrsread; flist[i].endptrbufp = flist[i].ptrbufp + ptrsread; flist[i].current_ptrp = flist[i].ptrbufp; fclose (fp); } *pbuf = *flist[i].current_ptrp; return; } /* push a new index onto the heap, maintaining alphabetical order and heap * condition; see standard textbooks for description of heap algorithm, whereby * the new index is put into the (newly-incremented) last slot, and then * sifted upwards until the heap condition (parent node precedes both children * alphabetically, with my alpha_map) holds... */ void push_index_onto_heap (long bufnum) { long current_slot, test_slot; current_slot = ++last_heap_slot; test_slot = current_slot / 2; while (test_slot > 0 && alpha_greater_than (heap[test_slot], bufnum)) { /* must move parent index down to make room for new bufnum insertion */ heap[current_slot] = heap[test_slot]; current_slot = test_slot; test_slot /= 2; } /* now have found the right place to put the new bufnum into */ heap[current_slot] = bufnum; return; } /* open the master index key and ptr files for writing out the results * of the index merging operation... */ void open_outfiles (void) { kfp = fopen (keyfilename, "wb"); if (kfp == NULL) error_exit (209, "error opening key file in open_outfiles()", 0L); pfp = fopen (ptrfilename, "wb"); if (pfp == NULL) error_exit (210, "error opening ptr file in open_outfiles()", 0L); return; } /* compare the keys in two buffers and return true if the second one is greater * alphabetically than the first ... ((consider implementing this as a macro * somehow, to avoid overhead of a function call?)) ... break ties in favor of * the smaller buffer number, to maintain stability of the merge (so as to get the * data from earlier files in the filelist to come out first when browsing); thus * when the words are identical (as may happen pretty often), bufnum2 counts as * "greater than" bufnum1. Note that once one hits a delimiter in the inner * loop, mapped to zero by alpha_order, there's no need to compare further.... */ int alpha_greater_than (long bufnum1, long bufnum2) { int n; unsigned char *s1, *s2; s1 = (unsigned char*)flist[bufnum1].current_keyrecp->key; s2 = (unsigned char*)flist[bufnum2].current_keyrecp->key; /* make the next statement efficient, since it's going to be executed * pretty often!! This is the innermost loop :-) */ for ( n = KEYLENGTH; --n && alpha_order[*s1] && alpha_order[*s1] == alpha_order[*s2]; ++s1, ++s2) ; return ( (alpha_order[*s1] == alpha_order[*s2]) ? ( bufnum1 > bufnum2) : (alpha_order[*s1] > alpha_order[*s2]) ); } /* send the smallest remaining records to the output stream ... specifically, * check the key that is on the top of the heap of buffers, and if it * differs from the previously-recorded global current_keyrec.key, write that * previous current_keyrec (key and associated cumulative count) out to the merged * keyfile, and update current_keyrec.key. In any event, continue by writing out * to the merged ptrfile all the ptr records corresponding to the now-current * smallest key (adding in the proper offset to the absolute values stored * in the individual ptrfile), increment the overall cumulative count * current_keyrec.ccount, decrement this flist item's remaining_ptrcount, * and (if there are any keys and ptrs left to be dealt with in this file) * move the current ccount for this flist item to prev_ccount, and * fetch the next key to be current_keyrec for this flist item; then sift * the current buffer down to its proper place in the heap. If the current * buffer is now empty, remove it from the heap and put the formerly-last * heap buffer on top, then sift it down to make a legal heap again. * * it may sound complex, but it's really just dealing the right number of cards off * the top of the smallest remaining record's deck ... */ void write_out_smallest_remaining_record (void) { long i, smallest, ptrcount, current_ptr, smallest_offset, current_slot, test_slot; smallest = heap[1]; /* it's ok to use strncmp here instead of custom alpha_order[], since * all we care about is equality test, and all identical characters * have already been mapped to their equivalents in creating the * keyfiles by zndxr... */ if (strncmp (current_keyrec.key, flist[smallest].current_keyrecp->key, KEYLENGTH) != 0) { if (fwrite (¤t_keyrec, sizeof (ZKEY), 1, kfp) != 1) error_exit (217, "write_out() fwrite() error for key record at keyfile #", smallest); strncpy (current_keyrec.key, flist[smallest].current_keyrecp->key, KEYLENGTH); } ptrcount = flist[smallest].current_keyrecp->ccount - flist[smallest].prev_ccount; smallest_offset = flist[smallest].offset; for (i = 0; i < ptrcount; ++i) { fetch_next_ptr (¤t_ptr, smallest); current_ptr += smallest_offset; if (fwrite (¤t_ptr, sizeof (long), 1, pfp) != 1) error_exit (221, "write_out() fwrite() error for ptr record from ptrfile #", smallest); } current_keyrec.ccount += ptrcount; flist[smallest].remaining_ptrcount -= ptrcount; /* if records remain in this file, set up to process next one; if not, * cut down the size of the heap and put the final heap item on top, * prior to the sifting down operation... */ if (flist[smallest].remaining_ptrcount > 0) { flist[smallest].prev_ccount = flist[smallest].current_keyrecp->ccount; fetch_next_keyrec (smallest); } else { smallest = heap[last_heap_slot--]; heap[1] = smallest; /* must do this in case last_heap_slot == 1 */ } /* now restore heap condition by sifting down the top, formerly smallest, * item to its proper place */ current_slot = 1; test_slot = 2; while (test_slot <= last_heap_slot) { if (test_slot < last_heap_slot && alpha_greater_than (heap[test_slot], heap[test_slot + 1])) ++test_slot; /* now test_slot is smaller of the child nodes */ if (alpha_greater_than (smallest, heap[test_slot])) { /* must move test_slot item up and commence work on next level down */ heap[current_slot] = heap[test_slot]; current_slot = test_slot; test_slot *= 2; } else break; } heap[current_slot] = smallest; return; } /* close the output key and ptr files ... but first, must write out the very * last key record to the merged keyfile ... then the merger is completed... */ void close_outfiles (void) { if (fwrite (¤t_keyrec, sizeof (ZKEY), 1, kfp) != 1) error_exit (222, "oops, close_outfiles() fwrite() error on final key record", 0L); printf ("closing output files...\n"); fclose (kfp); fclose (pfp); return; }