Date: Mon, 2 Jun 1997 17:40:02 -0700 (PDT) From: Mark Zimmermann Subject: zndxr41.c + comments WOW! --- found some files from 1991, a real mother lode of stuff, which I had mostly forgotten about ... seems to be heavily commented with descriptions of alphabetization, multi-file indexing, etc. issues! --- as I recall, these did compile & run pretty well, but were by no means totally debugged ... so in next 4 msgs for you there will be: -rw------- 1 zimm 9618 Jun 2 17:35 zcmpndx04.c -rw------- 1 zimm 39753 Jun 2 17:33 zbrwsr58.c -rw------- 1 zimm 36129 Jun 2 17:26 zmrgr46.c -rw------- 1 zimm 34197 Jun 2 17:19 zndxr41.c For starters, here's "zndxr41.c", an experiment in rewriting the generic indexer that includes some more explicit choices about character mapping and alphabetization during index-building ... I think I used this as the basis for the 1995 Cyrillic version (maybe).... ^z -------zndxr41.c----- /* zndxr.c - new multifile index-builder by ^z * (c) 1991 - Mark Zimmermann * * this is free software and comes with no warranty WHATSOEVER! * See the GNU General Public License for details * * This new indexer simplifies and revises the qndxr.c, etc. programs * from 1987-90 in several ways -- the new indexer is memory-based * and separates out the index-merging functions to another program. * This new indexer allows user control of alphabetization * and character mapping, optional preservation of embedded punctuation * symbols, and the ability to use a filelist file to index * multiple files as a batch((, or to segment large files into parts * which are separately indexed --- yet to be implemented, again!)). * Finally, the new indexer is written with transportability to MS-DOS, * UNIX, and Macintosh in mind ... though I do insist on using the new * ANSI-style function declarations, prototypes, etc. * * ((Note -- after frustrating experiences with the Intel segmented * 8088-style architecture, far pointers, huge memory models, etc., * I have reverted to a very generic approach (i.e., I ignore the * DOS problems entirels!!!) --- thus, in order to make * this program work on files larger than 32kB or so (I speculate) * under MS-DOS may require some changes (e.g., 'large memory * model' compilation) for which I am neither equipped nor expert * enough .... help, somebody!??)) */ /* if one needs to convert program for compilation by an old, dumb cc, * for generic use, this is what happens: * * - prototypes turned into function head type declarations (via gawk * program to take everything out from within the () in the prototypes) * - function heads turned back to old form (for things that don't * call functions as arguments, single line head, etc.), using gawk * - reference removed, as it offends... * - #define time_t, difftime(), SEEK_SET, and SEEK_END */ #ifdef DUMB_COMPILER #define time_t long #define difftime(t1,t0) (t1 - t0) #define SEEK_SET 0 #define SEEK_END 2 #else #include #endif #include #include #include /* for Mac (Think C) ... need to put this into a project with the ANSI * and the MacTraps libraries ... the header lets main() * take command-line UNIX-style arguments ... very un-Macish, sorry, * but it works!! */ #ifdef THINK_C #include #endif #define MAXSTRLEN 256 #define KEYLENGTH 28 #define TRUE 1 #define FALSE 0 /* define data type for my index pointers, which hold the offset * into the document file where each word is found */ typedef unsigned char * zndxptr; /* begin prototypes */ /* keep all prototypes here, for my awk filter program to modify as needed * for dumb old compilers ... keep then each single line long, beginning with * lower-case letter in first column, and ending with ");" .... * converter program eats the contents of the parens.... */ int main (int argc, char *argv[]); void print_help (void); void load_alpha (char *fname); void index_listed_files (char *fname); void build_index (void); long filesize (FILE *file); void *reserve_memory (long len); void load_docfile (FILE* docfile, unsigned char *docp, long doclength); long filter_doc (unsigned char *doc0, long doclength); void make_index_ptrs (unsigned char *doc0, long doclength, zndxptr *ptr0, long ptrcount); void sort_index_ptrs (zndxptr *ptr0, zndxptr *ptr1); int compare_words (zndxptr *p1, zndxptr *p2); void open_index_files (char *kfilename, FILE **kfp, char *pfilename, FILE **pfp); long write_index_files (unsigned char *doc0, zndxptr *ptr0, long ptrcount, FILE *kfile, FILE *pfile); void fwrite_key (zndxptr pp, FILE *kfile); void error_exit (int errnum, char *msg, long errarg); /* end prototypes */ /* global variables */ /* info about file to be indexed: * docfilename[] = name of document file (call it below) * keyfilename[] = name of key file (default .k) * ptrfilename[] = name of ptr file (default .p) */ char docfilename[MAXSTRLEN]; char keyfilename[MAXSTRLEN]; char ptrfilename[MAXSTRLEN]; /* mapping to be applied to characters in the input file: * the array alpha_map[] translates characters by default * into upper-case alphabetic, preserves numbers, and * turns everything else into a delimiter (null). The * default character mapping can be overridden by using * the '-f mapfile' option in the command line. See the * documentation for function load_alpha() for details. * * Note that mapping characters to null allows for use of * strcmp(), strcpy(), etc. throughout the program, since * the words in the file are null-terminated strings ... * very convenient. */ unsigned char alpha_map[] = { '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\0', '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' }; /* alphabetization order: the array alpha_order[] determines * the order of alphabetization of characters during indexing. * The default is simply ASCII order, with numbers preceding * letters. This default behavior may be changed using the * '-f mapfile' command line option. See the documentation * accompanying function load_alpha() for details. * * Here, although many other patterns are possible, I * simply repeat the contents of the alpha_map[] array * above as the default... * * Note that the value of alpha_order entries should never * be zero for any characters which are actually not mapped * to null -- otherwise, alphabetization errors may result! */ unsigned char alpha_order[] = { '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '\0', '\0', '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\0', '\0', '\0', '\0', '\0', '\0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' }; /* embedded punctuation handling: a character found in * embedded_punct[] with a nonzero entry is treated specially: * the characters on either side in the document file (after * mapping by alpha_map[]) are checked, and if they are both * non-delimiters and non-punctuation, then the current character * is an 'embedded punctuation' symbol and is kept; otherwise, it * is set to null (a delimiter). * * Thus, the alpha_map[] array contains all characters which *may* * survive to be included in indexed words; the embedded_punct[] * array specifies characters which are in the alpha_map as * 'second-class' symbols, and which can only be kept if surrounded * on both sides by 'first-class' symbols. * * This is an extremely simple parser, obviously, but it is fast, * easy to understand, and (I hope) provides enough flexibility * to handle many users' requirements for special handling of * symbols used in marked-up texts, linguistic symbols, etc. * This approach also only requires a single character of lookahead, * which getc() and ungetc() can do for us ... though in this * implementation, the document is already in memory and there * is no problem with arbitrary look-ahead.... * * The default behavior, for compatibility with earlier indexer * programs, is NOT to keep any embedded punctuation; hence, this * array embedded_punct[] is full of nulls.... */ unsigned char embedded_punct[] = { '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0', '\0' }; /*********************************************************/ main (int argc, char *argv[]) { int i; /* get the command line arguments for Macintosh users ... */ #ifdef THINK_C argc = ccommand (&argv); #endif printf ("\nWelcome to zndxr version 0.37 - (c) 1991 - ^z\n"); if (argc == 1) print_help (); else index_listed_files (argv[1]); printf ("\nGood-bye! - ^z\n"); return; } /* print out minimally helpful information for user... */ void print_help (void) { printf ("\nzndxr: build index files for free-text information retrieval\n"); printf ("\nusage: zndxr filelist\n"); printf ("\nzndxr (c) 1990 Mark Zimmermann - ^z\n"); printf ("this program comes with NO WARRANTY whatsoever\n"); printf ("zndxr is free software, under terms of the GNU General Public License\n"); printf ("for additional information, read the source code, or if\n"); printf ("absolutely necessary, write to me at P.O.Box 598,\n"); printf ("Kensington, Maryland 20895-0598, USA.\n"); return; } /* load alpha file to redefine character mapping and/or alphabetization * order and/or handling of embedded delimiters within words. * * The mapping defined by the alpha file is applied to all characters * in the document to be indexed as that document is read into memory. * Further filtering, such as detection of embedded punctuation, is * done in a subsequent stage.... * * The alpha file is optional. It is an ASCII text file consisting of * a succession of lines, giving the characters to appear in the index * file in the order in which they are to be alphabetized. If several * characters are listed on one line, then all of them are mapped * (translated) into the first character on that line. Thus * the default alphabetization order (ASCII) and character mapping * (lower-case alphabetic to upper-case) is equivalent to an alpha * file which looks like: 0 1 2 3 4 5 6 7 8 9 Aa Bb <> Yy Zz * Alternative alphabetizations and character mappings are most * likely to be useful to those who need to work with non-Latin * alphabets or who wish to handle non-alphanumeric characters * in a special way. * * There are two 'special' characters in an alphafile: * # * ~ * * To include a nonprintable or 'special' character in an alpha file, write * a number-sign ('#') followed by the DECIMAL value of the character. * Thus, #126 = ~ = , #35 = #, #10 = ^J = '\n' = newline, etc. * * Comments may be included in an alphafile on a line beginning with a '# '; * any line with '# ' as its first two characters is ignored. * * If it is desired to treat some delimiters specially, so that * they are retained in index words only when embedded in between * ordinary non-delimiters, then those special symbols are * listed after a '~' (tilde) character on the line. * Thus, to keep the embedded characters '.' (as in '3.14159'), * '-' (as in 'good-bye'), and ',' (as in '1,234,567'), and to * map all ',' commas into '.' periods for index listing, and to * alphabetize the '.' (and ','), and '-' in between Y and Z, * in that order, the final few lines of the alpha file might look like: Yy ~., ~- Zz * Notwithstanding the great and powerful potential of the alphafile, * it is generally wisest (in my experience) to use the default * practice of omitting all punctuation symbols from the index. * This avoids problems with words occurring in scattered places * in the index lists due to various haphazard hyphenations or * accidents of punctuation. Thus, if the expression 'free-text' * were hyphenated when used as a modifier, it would not be possible * to search for the word 'text' in that occurrence. On the other * hand, it is unaesthetic to see the word 'don't' split into 'don' * and 't' in an index list, I agree.... * * So, the option to keep punctuation characters as second-class * citizens, so that they only are retained when framed on both * sides by first-class indexed characters, is provided as an * alternative index-building strategy for those who don't want * to keep all punctuation in the index, but who dislike omitting * all punctuation as well. * * If you have specially formatted input data files (perhaps using * some symbols to indicate extra linguistic meanings, or fonts, * or data types, or whatever) you may be able to use a customized * alpha file to build particularly useful index structures. Many * linguists and others with unusual needs have asked me for this * capability -- here it is! * * This alpha file structure is used during index building; it is * also needed during index browsing, if non-default mappings or * alphabetization or delimiter-handling features are used, so that * the index browser program can correctly search the index files. * * No guarantees if a line is longer than MAXSTRLEN-1 characters!!! */ void load_alpha (char *fname) { FILE *alphafile; int c, i, len, firstc, flag, order = 0; char buf[MAXSTRLEN]; printf ("opening alphafile '%s'\n", fname); alphafile = fopen (fname, "r"); if (alphafile == NULL) error_exit (100, "fopen() error in load_alpha()", 0L); for (i = 0; i < 256; ++i) { alpha_map[i] = '\0'; alpha_order[i] = '\0'; embedded_punct[i] = '\0'; } while (fgets (buf, MAXSTRLEN, alphafile) != NULL) { /* skip a comment line (any line which begins with '# ') and * skip a completely empty line (containing only '\n') */ if ((buf[0] == '#' && buf[1] == ' ') || buf[0] == '\n') continue; /* increment the counter that keeps track of where to put this * line's characters alphabetically ... */ if (++order > 255) error_exit (101, "++order error in load_alpha()", (long)order); firstc = -1; flag = FALSE; len = strlen (buf); if (buf[len - 1] == '\n') /* remove pesky final '\n' from lines that fgets() provides... */ --len; for (i = 0; i < len; ++i) { c = buf[i]; if (c == '~') { /* set flag to make rest of this line 'special' */ flag = TRUE; continue; } /* convert escaped characters after '#' into proper form */ if (c == '#') { c = 0; ++i; while (buf[i] >= '0' && buf[i] <= '9') c = 10 * c + buf[i++] - '0'; --i; } /* safety net -- following *should* never happen! */ if (c < 0 || c > 255) error_exit (102, "character error in load_alpha()", (long)c); if (firstc == -1) firstc = c; alpha_map[c] = firstc; alpha_order[c] = order; if (flag) embedded_punct[c] = TRUE; } } fclose (alphafile); return; } /* index a list of files. The format of the filelist file * is designed to be compatible with the '.f' file (to be) used by * my free text information retrieval ("browser") programs. A * filelist file consists of a succesion of lines containing: * the name of the document file itself * the name of the '*.k' key file * the name of the '*.p' ptr file * * All parameters after the document file name are optional; * if they are omitted, the entire document file is indexed and * the default .k and .p suffixes are appended to that file name * to get the key and ptr file names. * (Thus, in MS-DOS systems, the original file must have no extension * unless explicit names for key and ptr files are provided.) * This default behavior happens for file names given on the command * line... * * A line beginning with a '#' character is treated as a comment and is * ignored in the filelist. The single exception, as far as zndxr is * concerned, is a line beginning with "#alphafile: ", which is read and * used to load an alphafile for subsequent index-building. Thus, the line: #alphafile: greeble.a * says to use the custom alphabetization in file "greeble.a" hereafter. * * Note that it doesn't make much sense, ordinarily, to use different * alphabetizations within a database, and thus the alphafile indication * should appear early in the filelist, before any files have been indexed. * Note also that there may be other lines beginning with '#' in the * filelist, specifying things which the zndxr program doesn't care about. * * If you really want to begin a filename with a '#', put a space in front * of it in the filelist entry for that file; leading whitespace on a * normal filelist line is ignored.... */ void index_listed_files (char *fname) { FILE *filelistfile; int n; char buf[MAXSTRLEN], alphafilename[MAXSTRLEN]; printf ("opening filelist file '%s'\n", fname); filelistfile = fopen (fname, "r"); if (filelistfile == NULL) error_exit (112, "error opening filelist in index_listed_files()", 0L); while (fgets (buf, MAXSTRLEN, filelistfile) != NULL) { /* skip over a comment line (any line which begins with '#' not * immediately followed by "alphafile:") and process an alphafile * inclusion directive (a line beginning "#alphafile:" followed by * an alphafile filename) */ if (buf[0] == '#') { n = sscanf (buf, "#alphafile: %s", alphafilename); if (n == 1) load_alpha (alphafilename); continue; } n = sscanf (buf, " %s %s %s", docfilename, keyfilename, ptrfilename); /* skip over empty or mysteriously-bad lines */ if (n == EOF || n < 1 || n > 3) continue; if (n < 3) { strcpy (ptrfilename, docfilename); strcat (ptrfilename, ".p"); } if (n < 2) { strcpy (keyfilename, docfilename); strcat (keyfilename, ".k"); } build_index (); } fclose (filelistfile); return; } /* build the index for a file */ void build_index (void) { long doclength, ptrcount, different_words; FILE *docfile, *kfile, *pfile; unsigned char *doc0; zndxptr *ptr0; time_t t0, t1; double secs; printf ("\nbeginning to build index for file '%s'\n", docfilename); time (&t0); printf ("starting time = %s", ctime (&t0)); docfile = fopen (docfilename, "rb"); if (docfile == NULL) error_exit (103, "fopen() error in build_index()", 0L); doclength = filesize (docfile); if (doclength <= 0) error_exit (117, "bad doclength in build_index()", doclength); printf ("reserving %ld bytes for document\n", doclength); /* actually, get two extra bytes, and set the first and last to null, * to avoid end-effect problems in following routines ... make the * doc0 pointer refer to the true document starting byte, however * (and don't forget to fix doc0 again before freeing the block of * of memory!!) */ doc0 = reserve_memory (doclength + 2); *doc0++ = '\0'; *(doc0 + doclength) = '\0'; printf ("loading document & remapping characters\n"); load_docfile (docfile, doc0, doclength); fclose (docfile); printf ("filtering characters & counting words to index\n"); ptrcount = filter_doc (doc0, doclength); printf ("document contains %ld words\n", ptrcount); printf ("reserving %ld bytes for index pointers\n", sizeof (zndxptr) * ptrcount); ptr0 = reserve_memory (sizeof (zndxptr) * ptrcount); printf ("generating unsorted index pointers to every word\n"); make_index_ptrs (doc0, doclength, ptr0, ptrcount); printf ("sorting index pointers\n"); sort_index_ptrs (ptr0, ptr0 + ptrcount); /* check on this --- is last right?? */ printf ("creating index files\n"); open_index_files (keyfilename, &kfile, ptrfilename, &pfile); printf ("writing index files to disk\n"); different_words = write_index_files (doc0, ptr0, ptrcount, kfile, pfile); printf ("document %s contained %ld different words\n", docfilename, different_words); printf ("index files '%s' and '%s' successfully written\n", keyfilename, ptrfilename); printf ("freeing memory allocated for document and pointers\n"); free (--doc0); free (ptr0); printf ("\ncongratulations! -- indexing complete for %s\n", docfilename); time (&t1); printf ("ending time = %s", ctime (&t1)); secs = difftime (t1, t0); printf ("elapsed time = %g seconds\n", secs); if (secs > 0) printf ("net indexing rate = %.2f megabytes/hour\n", doclength * .0036 / secs); return; } /* determine the size of the file */ long filesize (FILE *file) { long len; if (fseek (file, 0L, SEEK_END) != 0) error_exit (104, "fseek() error in filesize()", 0L); len = ftell (file); if (len == -1L) error_exit (105, "ftell() error in filesize()", 0L); return (len); } /* reserve memory for holding len bytes */ void *reserve_memory (long len) { void *p; p = malloc (len); if (p == NULL) error_exit (106, "out of memory error in reserve_memory()", len); return (p); } /* load document file, transforming characters as per alphabetic * mapping table alpha_map[]... */ void load_docfile (FILE* docfile, unsigned char *docp, long doclength) { long ptrcount = 0L, i; int inaword = FALSE; if (fseek (docfile, 0, SEEK_SET) != 0) error_exit (107, "fseek() error in load_docfile()", 0); for (i = 0; i < doclength; ++i) *docp++ = alpha_map[getc (docfile)]; return; } /* filter the document as loaded into memory (after the character * mapping has already been applied!) and count the number of words * to generate pointers to, so that we know how much memory to reserve * for storage of those ptrs.... * * ((This routine may be replaced in order to do fancier parsing * and/or transformation of the text -- recognition of 'stop words', etc., * as suggested by roskos & others ... note that it seems unaesthetic to * do two passes through the document in memory, here in filter_doc() and * below in make_index_ptrs() ... perhaps the two routines could be * combined into one? On the other hand, it is very nice to know exactly * how many pointers are going to be created, so that just the right * amount of memory may be reserved....)) * * In this generic version of the index-building program, however, * the only option during filtering is to preserve "second-class" * (e.g., punctuation) characters which are found embedded between * "first-class" (e.g., alphanumeric) characters. * * I realize that the "embedded_punct[*(doc0 - 1)]" clause in the * embedded-punctuation-testing "if" statement is unnecessary, but I keep * it in place for simplicity and reliability. */ long filter_doc (unsigned char *doc0, long doclength) { int inaword = FALSE; long i, ptrcount = 0L; for (i = 0; i < doclength; ++i) { if (embedded_punct[*doc0] && (! *(doc0 - 1) || ! *(doc0 + 1) || embedded_punct[*(doc0 - 1)] || embedded_punct[*(doc0 + 1)])) *doc0 = '\0'; if (*doc0 && ! inaword) { inaword = TRUE; ++ptrcount; } else if (! *doc0 && inaword) inaword = FALSE; ++doc0; } return (ptrcount); } /* make the pointers to every word in the document by scanning * along, recognizing when each word begins.... * * during the generation of the pointers, check that the number * of them agrees with the count provided by the filter_doc() * operation.... for safety's sake.... */ void make_index_ptrs (unsigned char *doc0, long doclength, zndxptr *ptr0, long ptrcount) { int inaword = FALSE; long i; for (i = 0; i < doclength; ++i) { if (*doc0 && ! inaword) { inaword = TRUE; *ptr0 = doc0; ++ptr0; if (--ptrcount < 0) error_exit (108, "ptr count error in make_index_ptrs()", i); } else if (! *doc0 && inaword) inaword = FALSE; ++doc0; } return; } /* sort the index pointers in memory based on the alphabetization * chosen for the document's words ... * * use my standard old reliable quicksort, simple implementation; no * attempt to apply optimizations yet! Probably can improve indexing * performance by 20% or so with some effort here.... */ void sort_index_ptrs (zndxptr *first, zndxptr *last) { register zndxptr *i, *j, tmp; while (last - first > 1) { i = first; j = last; for (;;) { while (++i < last && compare_words (i, first) < 0) ; while (--j > first && compare_words (j, first) > 0) ; if (i >= j) break; tmp = *i; *i = *j; *j = tmp; } tmp = *first; *first = *j; *j = tmp; if (j - first < last - (j + 1)) { sort_index_ptrs (first, j); first = j + 1; } else { sort_index_ptrs (j + 1, last); last = j; } } return; } /* compare the two words pointed to by pointers p1 and p2, using * the alphabetization order as defined by alpha_order[] array... * much like the strcmp() function, modified to return the right * result to make quicksort "stable" if words pointed to are tied * (in which case, return the sign of the difference of the ptrs, * so that the earlier pointer comes out first in the sort).... */ int compare_words (zndxptr *p1, zndxptr *p2) { int diff, n = KEYLENGTH; unsigned char *s1, *s2; s1 = *p1; s2 = *p2; for ( ; --n && (alpha_order[*s1] == alpha_order[*s2]); ++s1, ++s2) if (! *s1) break; diff = alpha_order[*s1] - alpha_order[*s2]; if (diff == 0) diff = (*p1 - *p2 < 0) ? -1 : 1; return (diff); } /* open the index key and ptr files for writing out the results * of the index building operations... */ void open_index_files (char *kfilename, FILE **kfp, char *pfilename, FILE **pfp) { *kfp = fopen (kfilename, "wb"); if (*kfp == NULL) error_exit (109, "error opening key file in open_index_files()", 0L); *pfp = fopen (pfilename, "wb"); if (*pfp == NULL) error_exit (110, "error opening ptr file in open_index_files()", 0L); return; } /* write out the key and ptr files to disk... return the number of different * words in the document, as written to the keyfile * * note that the zndxptr pointers in memory must have the "base" value doc0 - start * subtracted from them in order to get absolute byte offsets into the document * file, before they are written out to the ".p" ptr file.... */ long write_index_files (unsigned char *doc0, zndxptr *ptr0, long ptrcount, FILE *kfile, FILE *pfile) { long i, ccount, different_words; zndxptr *ptrp, base, ptr, current_word; ptrp = ptr0; current_word = *ptrp; base = doc0; for (i = 0; i < ptrcount; ++i) { if (strncmp ((char *)current_word, (char *)*ptrp, KEYLENGTH)) { /* a new word has been encountered ... write out the formerly-current * word and its cumulative count value to the kfile, and then * set the current word to its new value */ fwrite_key (current_word, kfile); ccount = ptrp - ptr0; if (fwrite (&ccount, sizeof (long), 1, kfile) < 1) error_exit (115, "error writing ccount in write_index_files()", 0L); current_word = *ptrp; } ptr = (zndxptr)(*ptrp - base); if (fwrite (&ptr, sizeof (ptr), 1, pfile) < 1) error_exit (111, "error writing ptr in write_index_files()", 0L); ++ptrp; } /* write out the final word and final cumulative count value to finish * up the key file */ fwrite_key (current_word, kfile); ccount = ptrp - ptr0; if (fwrite (&ccount, sizeof (long), 1, kfile) < 1) error_exit (116, "error writing ccount in write_index_files()", 0L); different_words = filesize (kfile) / (KEYLENGTH + sizeof (long)); fclose (kfile); fclose (pfile); return (different_words); } /* write out a key word to the kfile; take the word from the document * as mapped into memory, and pad it on the right with blanks until it is * KEYLENGTH bytes long; then send it out.... */ void fwrite_key (zndxptr pp, FILE *kfile) { char buf[KEYLENGTH], *bp, c; int j; bp = buf; for (j = 0; j < KEYLENGTH; ++j) { c = *pp++; if (c == '\0') break; *bp++ = c; } for ( ; j < KEYLENGTH; ++j) *bp++ = ' '; if (fwrite (buf, KEYLENGTH, 1, kfile) < 1) error_exit (114, "error writing key word in fwrite_key()", 0L); return; } /* exit with an error message -- some fatal mistake has been made */ void error_exit (int errnum, char *msg, long errarg) { fprintf (stderr, "%d zndxr %s %ld\n", errnum, msg, errarg); exit (errnum); return; }