/* transportable version of the browser program -- 871008 ^z * version 0.11 -- with fixes to allow for non-ASCII characters * in the document/database file.... * * This program lets a user browse through indices created by * the qndxr program.... */ /* commands: * * ? print out helpful info * : quit program * :fnord open document file 'fnord' for browsing * >grok append copy of future output to notes file 'grok' * > end copying to notes file * 'DON.MAC append comment string 'DON.MAC' to notes file * xyzzy jump to INDEX item 'XYZZY' * ".012345 take string '.012345' as target for jump * -17 back up 17 lines from here * - back up one line; same as '-1' command * +23 jump down 23 lines from here * + or move down one line; same as '+1' command * .29 print 29 lines from here * = descend: INDEX -> CONTEXT -> TEXT display * ^ ascend: TEXT -> CONTEXT -> INDEX display * * empty 'working subset' proximity filter (nothing valid) * ** fill 'working subset' (everything valid) * & add word-neighborhood of current index item to subset * && add sentence-neighborhood to subset * &&& add paragraph-neighborhood to subset * ~ invert (logical NOT) entire working subset * ; repeat previous command * */ #include /* for FILE, printf(), etc. */ #include #include /* -----------------header stuff---------------- */ /* some definitions for the brwsr program... * 870805-07-... ^z */ /* tell the compiler that we are using Lightspeed C ... mostly so that * certain sections of non-transportable code will be activated to * compensate for the use of 16-bit int variables in LSC.... */ /* #define LIGHTSPEED */ /* KEY_LENGTH is the number of letters we have in each record.... */ #define KEY_LENGTH 28 /* CONTEXT_LINE_LENGTH is the total length of a key-word-in-context * display line.... */ #define CONTEXT_LINE_LENGTH 75 /* CONTEXT_OFFSET is the distance from the start of the context line * to where the key word itself begins.... */ #define CONTEXT_OFFSET 30 /* STRLEN is the most characters to allow in a line from the * file at one time, and the maximum length of a command line.... */ #define STRLEN 256 /* CMASK masks off characters to avoid negativity problems with funny * non-ASCII symbols... */ #define CMASK 0xFF /* NEIGHBORHOOD is the distance in bytes that defines the proximity * neighborhood of a word in the index ... used when defining a * subset for proximity searching.... NEIGHBORHOOD * 8 is the * compression factor for squeezing the document file down into the * array subset[] ... * Thus, NEIGHBORHOOD = 32, a good choice, defines a chunkiness of 32 * characters in making comparisons for proximity determination purposes.... */ #define NEIGHBORHOOD 32 /* WORD_RANGE, SENTENCE_RANGE, and PARAGRAPH_RANGE define how many chunks * of size NEIGHBORHOOD on each side of an item that the neighborhood of * each type extends. Values 1, 10, and 100 work pretty well.... */ #define WORD_RANGE 1 #define SENTENCE_RANGE 10 #define PARAGRAPH_RANGE 100 /* define a value to help recognize long operations, for which * the user should be warned of a likely delay in response ... */ #define GIVE_WARNING 200 /* structure of the records in the index key file: * a fixed-length character string kkey, filled out with blanks and * containing the unique alphanumeric 'words' in the document file, * changed to all-capital letters; * a cumulative count of how many total occurrences of words, including * the current one, have happened up to this point in the alphabetized * index. * * Obviously, the number of occurrences of the nth word is just the * difference between the nth cumulative count and the (n-1)th * cumulative count. Also, the (n-1)th cumulative count is a * pointer to the first occurrence of the nth word in the ptr_file * (which holds all the index offset pointers for the document file). * * See the index building program "ndxr.c" for further details of the * index structure.... */ typedef struct { char kkey[KEY_LENGTH]; long ccount; } KEY_REC; /* the pointer records are simply long integers, offsets from the start * of the document to the indexed words....*/ #define PTR_REC long /* define the values for the three levels that the user may be browsing * on: INDEX is the display of unique words and their occurrence counts; * CONTEXT is the key-word-in-context (KWIC) display; TEXT is the full * text of the document. */ #define INDEX 0 #define CONTEXT 1 #define TEXT 2 /* symbolic values for yes/no answers... */ #define TRUE 1 #define FALSE 0 /* ---------------prototypes------------------- */ /* prototypes for my brwsr functions ... * 870805... ^z */ #ifdef LIGHTSPEED /* in file_utils.c */ void open_files (char cmd[]); void close_files (void); void init_items (void); /* in brwsr.c */ void main (void); /* in show_items.c */ void show_current_item (void); void show_current_index_item (void); void show_current_context_item (void); void show_current_text_item (void); /* in brws_utils.c */ void get_key_rec (KEY_REC *recp, long n); PTR_REC get_ptr_rec (long n); long start_of_line (long p); long next_line (long p); void beep (void); /* in do_help/files.c */ void do_help (void); void do_open (char cmd[]); void do_redirection (char cmd[]); void do_comments (char cmd[]); /* in do_moves_etc.c */ void do_move (char cmd[]); int make_move (long n); int move_in_text (long move); int make_subindex_move (long move); /* in do_targets.c */ void do_descend (void); void do_ascend (void); void do_target_jump (char cmd[]); int zstrcmp (char *s1, char *s2); void init_target_item (KEY_REC *rp, char cmd[]); void do_multiprint (char cmd[]); /* in do_subset.c */ void do_make_subset (char cmd[]); void create_subset (void); void destroy_subset (void); void do_invert_subset (void); void do_add_neighborhood (char cmd[]); void set_neighborhood_bits (int n); void set_subset_bit (long offset); int get_subset_bit (long offset); long count_subset_recs (KEY_REC *this_rec, KEY_REC *prev_rec); #endif /* ------------------------main program-------------- */ /* First, define a few external variables to hold: * - pointers to the document, key, pointer, and notes files; * - numbers indicating the current and the minimum/maximum values for: * INDEX item (one line for each unique word in the document); * CONTEXT item (one line for each occurrence of chosen INDEX word); * TEXT item (byte offset from start of file to current line). * - the actual KEY_RECs corresponding to the current INDEX item and * the INDEX item just before that one * - the subset array pointer used for proximity searching, the flag * empty_subset, and the count of how many current records are in * the working subset subset_rec_count * - the level of user operations: * 0 means browsing the INDEX * 1 means browsing the CONTEXT (key-word-in-context = KWIC) * 2 means browsing the TEXT * * These items are referred to by various routines at scattered * locations, and it seems easiest to let them reside in external * variables.... */ FILE *doc_file = NULL, *key_file = NULL, *ptr_file = NULL, *notes_file = NULL; long current_item[3] = {0, 0, 0}; long min_item[3] = {0, 0, 0}; long max_item[3] = {0, 0, 0}; KEY_REC this_rec, prev_rec; char *subset = NULL; int empty_subset = TRUE; long subset_rec_count = 0; int level = INDEX; void main() { char cmd[STRLEN], prevcmd[STRLEN], *gets(), *strcpy(); void do_help(), do_open(), do_redirection(), do_comments(), do_move(), do_make_subset(), do_invert_subset(), do_add_neighborhood(), do_descend(), do_ascend(), do_target_jump(), do_multiprint(), close_files(); prevcmd[0] = '\0'; printf ("Greetings, program! ... type '?' for help.\n"); printf ("Browser version 0.11 by ^z - 871007\n"); while (gets (cmd)) { do_cmd: switch (cmd[0]) { case '?': do_help (); break; case '\0': strcpy (cmd, "+"); /* is just + */ case '+': /* + or - commands are both moves */ case '-': do_move (cmd); break; case ':': do_open (cmd); break; case '>': do_redirection (cmd); break; case '\'': do_comments (cmd); break; case '=': do_descend (); break; case '^': do_ascend (); break; case '.': do_multiprint (cmd); break; case '*': do_make_subset (cmd); break; case '&': do_add_neighborhood (cmd); break; case '~': do_invert_subset (); break; case ';': strcpy (cmd, prevcmd); goto do_cmd; break; case '"': do_target_jump (cmd + 1); break; default: do_target_jump (cmd); break; } strcpy (prevcmd, cmd); } } /* ---------------------brws_utils.c----------------- */ /* miscellaneous utility functions for helping the higher-level routines * browse around in indexed files.... * 870805-13-... ^z */ /* function to get the 'n'th KEY_REC from key_file and store it in the * KEY_REC space pointed to by recp ... note that if an illegal value * of 'n' is requested, a 'null' KEY_REC is produced.... */ void get_key_rec (recp, n) KEY_REC *recp; register long n; { extern FILE *key_file; extern long min_item[], max_item[]; char *strncpy(); void beep(); if (n < min_item[INDEX] || n > max_item[INDEX]) { strncpy ((char *)recp->kkey, " ", KEY_LENGTH); recp->ccount = 0; return; } if (fseek (key_file, sizeof (KEY_REC) * n, 0) != 0) { beep (); printf ("Fatal error in fseek() getting key record #%ld!\n", n); exit(); } if (fread ((char *)recp, sizeof (KEY_REC), 1, key_file) == 0) { beep (); printf ("Fatal error in fread() getting key record #%ld!\n", n); exit(); } } /* routine to get the nth pointer record (a long integer) from ptr_file; * the pointer record gives the actual offset in the document file of * the nth word in the expanded index.... */ PTR_REC get_ptr_rec (n) register long n; { PTR_REC p; extern FILE *ptr_file; void beep(); if (fseek (ptr_file, sizeof (PTR_REC) * n, 0) != 0) { beep (); printf ("Fatal error in fseek() getting pointer record #%ld!\n", n); exit(); } if (fread ((char *)&p, sizeof (PTR_REC), 1, ptr_file) == 0) { beep (); printf ("Fatal error in fread() getting pointer record #%ld!\n", n); exit(); } return (p); } /* Move backwards in document file to find the start of the line of text * which has offset p (from start of file) somewhere in that line. * However, don't go back beyond STRLEN characters at a time... that * is, put a ceiling of STRLEN on the maximum 'length of a line', to * avoid pathological input files with no \n's.... * Do the work by grabbing a buffer full and then scanning back in that, * rather than by laboriously lseek() and fgetc() back one character at * a time.... */ long start_of_line (p) register long p; { extern FILE *doc_file; char buf[STRLEN]; register long i; register int n; void beep(); if (p <= 0) return (0L); i = p - STRLEN; i = ((i < 0) ? 0 : i); if (fseek (doc_file, i, 0) != 0) { beep (); printf ("Fatal error in fseek() backing up in document!\n"); exit(); } if ((n = fread (buf, sizeof (char), p - i, doc_file)) == 0) { beep (); printf ("Fatal error in fread() backing up in document!\n"); exit(); } while (--n >= 0 && buf[n] != '\n'); return (i + n + 1); } /* scan forward until reaching the beginning of the next line of text; * if at or beyond the end of the file, return a pointer to the last * character in the file ... * Don't go beyond STRLEN characters, however, in scanning forward.... */ long next_line (p) long p; { register int c, n; extern long max_item[]; void beep(); if (p >= max_item[TEXT]) return (max_item[TEXT]); if (fseek (doc_file, p, 0) != 0) { beep (); printf ("Fatal error in fseek() scanning forward in document!\n"); exit(); } for (n = 0; n < STRLEN; ++n) if ((c = fgetc (doc_file)) == EOF || c == '\n') break; return (ftell (doc_file)); } /* paging James Bond .... */ void beep () { putchar ('\007'); } /* ---------------------do_help/files.c---------------- */ /* function to print out helpful information -- a command summary * for the user.... */ void do_help () { printf (" ? print this help\n"); printf (" : quit the program\n"); printf (" :fnord open document file 'fnord' for browsing\n"); printf (" >grok open notes file 'grok' and append output\n"); printf (" > close notes file\n"); printf (" 'DON.MAC append comment 'DON.MAC' to notes file\n"); printf (" xyzzy jump to 'XYZZY' in INDEX\n"); printf (" \".123 jump to '.123' in INDEX\n"); printf (" -17 back up 17 lines\n"); printf (" - back up one line\n"); printf (" +0 print current line\n"); printf (" move down one line\n"); printf (" +23 move down 23 lines\n"); printf (" = INDEX --> CONTEXT --> TEXT\n"); printf (" ^ INDEX <-- CONTEXT <-- TEXT\n"); printf (" .9 move down and print out 9 lines\n"); printf (" * create an empty working subset\n"); printf (" ** fill the working subset (complete database)\n"); printf (" & add word-neighborhood to subset\n"); printf (" && add sentence-neighborhood to subset\n"); printf (" &&& add paragraph-neighborhood to subset\n"); printf (" ~ invert (logical NOT) entire working subset\n"); printf (" ; repeat previous command\n"); } /* function to handle a ":" command, to either quit the program or to * open a new database for browsing.... display some amusing data * about the file that is opened, if successful (total length in bytes, * total number of words, and number of unique words in the file)... */ void do_open (cmd) char cmd[]; { char qcmd[STRLEN]; extern FILE *doc_file; extern long max_item[]; void destroy_subset (), open_files (), init_items (); if (cmd[1] == '\0') { printf ("Quit?\n"); gets (qcmd); if (qcmd[0] == 'y' || qcmd[0] == 'Y') { printf ("Good-bye ... ^z\n"); exit (); } else { printf ("Cancelled!\n"); return; } } destroy_subset (); open_files (cmd); if (doc_file == NULL) return; init_items (); printf ("File \"%s\" contains:\n", cmd + 1); printf ("%9ld characters\n%9ld total words\n%9ld unique words\n", max_item[TEXT] + 1, max_item[CONTEXT] + 1, max_item[INDEX] + 1); } /* function to open or close a notes file ... copies of browser * program output gets appended to that notes file as long as it * is open, and the user may add comment strings with the '"' command .... */ void do_redirection (cmd) char cmd[]; { extern FILE *notes_file; FILE *fopen (); void beep(); if (notes_file != NULL) fclose (notes_file); if (cmd[1] == '\0') return; if ((notes_file = fopen (cmd + 1, "a")) == NULL) { beep (); printf ("Error opening notes file \"%s\"!\n", cmd + 1); return; } } /* function to append a comment line to the notes_file... does the job * for the '"' command.... */ void do_comments (cmd) char cmd[]; { extern FILE *notes_file; void beep(); if (notes_file == NULL) { beep (); printf ("No notes file open!\n"); return; } else fprintf (notes_file, "%s\n", cmd + 1); } /* -------------------do_moves.c---------------------- */ /* function to interpret a command to move around in the current level * of the browser display ... handles "+", "-", and "" ... calls * routine make_move() to do the real work.... */ void do_move (cmd) char cmd[]; { long move; extern FILE *doc_file; char *strcat(); void beep(), show_current_item(); if (doc_file == NULL) { beep (); printf ("No file open!\n"); return; } if (cmd[1] == '\0') strcat (cmd, "1"); move = atol (cmd); make_move (move); show_current_item (); } /* function to do the actual moving around in the INDEX or CONTEXT or * TEXT displays .... Safety features prevent user from going * off the end of the file in either direction.... * make_move() returns FALSE if it fails to completely execute the move * (i.e., if the move would have run off either end of the file) and * beeps at the user ... it returns TRUE if the move succeeds.... */ int make_move (n) long n; { int r; PTR_REC get_ptr_rec (); extern long current_item[], min_item[], max_item[]; extern int level; extern char *subset; void beep(); r = TRUE; switch (level) { case INDEX: current_item[INDEX] += n; if (current_item[INDEX] < min_item[INDEX]) { current_item[INDEX] = min_item[INDEX]; r = FALSE; } if (current_item[INDEX] > max_item[INDEX]) { current_item[INDEX] = max_item[INDEX]; r = FALSE; } break; case CONTEXT: if (subset == NULL) { current_item[CONTEXT] += n; if (current_item[CONTEXT] < min_item[CONTEXT]) { current_item[CONTEXT] = min_item[CONTEXT]; r = FALSE; } if (current_item[CONTEXT] > max_item[CONTEXT]) { current_item[CONTEXT] = max_item[CONTEXT]; r = FALSE; } } else r = make_subindex_move (n); current_item[TEXT] = get_ptr_rec (current_item[CONTEXT]); break; case TEXT: r = move_in_text (n); break; } if (r == FALSE) beep (); return (r); } /* move up or down the chosen number of lines of text, and put the result * into current_item[TEXT] ... return FALSE if attempted move would have * run off the end of the file, non-zero if the move was completed without * error.... */ int move_in_text (move) register long move; { int result; register long loc; extern long current_item[]; result = TRUE; loc = current_item[TEXT]; if (move < 0) for ( ; move < 0; ++move) { if (loc <= 0) { result = FALSE; break; } loc = start_of_line (loc - 1); } else if (move > 0) for ( ; move > 0; --move) { loc = next_line (loc); if (loc >= max_item[TEXT]) { result = FALSE; loc = start_of_line (max_item[TEXT]); break; } } current_item[TEXT] = loc; return (result); } /* make a move in the working subindex ... must just step along in * whichever direction is desired until either hitting the end (in * which case the last valid item found in the subindex should be * returned as the current one) or finishing the requested move. * Return FALSE if hit a wall, TRUE otherwise.... * * This routine is only called when level == CONTEXT ... since * in TEXT level we are just moving around in the full text of * the document file, and in the INDEX level we count and display * even items with zero occurrences in the subindex... * * This routine is only called when subset != NULL, since if there * is no working subset the move becomes trivial arithmetic. * * This routine is only called when either current_item[CONTEXT] * is a good item in the subset already, or when there is a certainty * that a good item will be found in the subsequent scan (specifically, * when scanning down to find the first good item when called from * do_descend () ....). */ int make_subindex_move (move) long move; { register long i, last_good_item; int result; PTR_REC get_ptr_rec (); extern long current_item[], min_item[], max_item[]; extern char *subset; result = TRUE; last_good_item = current_item[CONTEXT]; if (move >= 0) { for (i = 0; i < move; ++i) { while (++current_item[CONTEXT] <= max_item[CONTEXT] && ! get_subset_bit ((long) get_ptr_rec (current_item[CONTEXT]))); if (current_item[CONTEXT] > max_item[CONTEXT]) break; last_good_item = current_item[CONTEXT]; } if (current_item[CONTEXT] > max_item[CONTEXT]) { result = FALSE; current_item[CONTEXT] = last_good_item; } } else { for (i = 0; i > move; --i) { while (--current_item[CONTEXT] >= min_item[CONTEXT] && ! get_subset_bit ((long) get_ptr_rec (current_item[CONTEXT]))); if (current_item[CONTEXT] < min_item[CONTEXT]) break; last_good_item = current_item[CONTEXT]; } if (current_item[CONTEXT] < min_item[CONTEXT]) { result = FALSE; current_item[CONTEXT] = last_good_item; } } return (result); } /* -------------------do_subset.c------------------- */ /* words to handle subset (proximity filter) tasks.... * ^z -- 870810-13-.... * * Subset (or subindex) restrictions are useful in doing searches for * combinations of terms, phrases, etc., where the terms are themselves * common words. Subsets also become indispensible when working in * very large databases, where even slightly-exotic terms occur too many * times for convenient browsing through their CONTEXT displays. * * When a subset has been defined, then instead of showing an INDEX * item as something like: * 3141 UNIX * that item is displayed with a count of how many occurrences are * 'valid' in addition to the total number of occurrences: * 27/3141 UNIX * Thus, in this case only 27 out of the 3141 appearances of the term * 'UNIX' happened to be in the neighborhood of the set of words chosen * by the user to define the current working subset. * * The 'neighborhood' (a half dozen words or so on each side) of a chosen * index term is added (logical OR) into the working subset by giving * the '&' command when that term's INDEX display is the current item. * If a larger neighborhood (half a dozen sentences or so each way) is * preferable, use the '&&' command; for a still larger neighborhood * (a few paragraphs on each side), use the '&&&' command. * * * IMPLEMENTATION NOTES: * * Subest searching is handled by an array of flag bits -- one bit * for each chunk of NEIGHBORHOOD bytes in the original text. Bits are * held in the array subset[]; they are zero for 'invalid' regions * of the original document, and one for 'valid' regions that have * been defined by proximity to index terms selected by the user. * * A value of 32 for NEIGHBORHOOD seems to work very well for defining * proximity. The compression factor from the original document to the * array subset[] is NEIGHBORHOOD*8 = 256 in that case. So, even for a * document that is tens of megabytes long, the subset[] array only * requires a few hundred kB and it is quite feasible to hold it in * memory. * * The presence of a subset is signalled when the global array pointer * 'subset' is non-NULL ... in which case show_current_index_item, * make_move, and other words modify their behavior in order to * reflect the chosen subset. If the subset is completely * empty, after having just been created or flushed, then the * global variable empty_subset is nonzero (true) and short-cuts * are taken in counting and displaying words.... * * Consider a single occurrence of a word. Suppose we want to add the * immediate neighborhood of that word to our working subset. To avoid * edge-effects due to the chunkiness of the definition of neighborhood, * the routine to set bits in subset[] actually sets 2*WORD_RANGE+1 bits: * the bit corresponding to the chunk wherein the chosen word falls, * and WORD_RANGE extra bits on each side. * * Thus, for the choice WORD_RANGE = 1, any index item which is * within NEIGHBORHOOD (=32) bytes of our chosen word is certain * to be included in the proximity subset; any item which occurs * at least 2*NEIGHBORHOOD (=64) bytes away is certain NOT to be * included, and items in the intermediate range have a * linearly-decreasing probability of false inclusion. The choice * of 32 for NEIGHBORHOOD and 1 for WORD_RANGE therefore gives * essentially all items within half a dozen or so words * (average word length is 5-6 letters) of the term used to * define the subset.... * * In extensive experimentation with the MacForth-based Browser v.244+ * program on the Macintosh, the above definition of proximity seemed * to be by far the most useful one in real life. It is also quite * straightforward to implement with good efficiency. In addition * to the 'word'-neighborhood proximity (within WORD_RANGE chunks), * it is occasionally convenient to add a somewhat larger * neighborhood to the working subset -- corresponding to proximity * within a few sentences or even a few paragraphs. To do that, * the && (sentence-proximity) and &&& (paragraph-proximity) commands * simply set more bits on either side of the word's location in * subset[]. Specifically, && sets SENTENCE_RANGE bits on each side, * and &&& sets PARAGRAPH_RANGE bits on each side. Values of 10 and 100 * respectively for those parameters seem to work quite well. */ /* Handle the * command to empty out the working subset before * beginning to do proximity-filtered searching (that is, to create * the array subset[] in empty form). * Also handle the ** command to destroy subset[] (which makes it * seem as if it is "full" from the user's viewpoint) * * Force the user back to INDEX level when the working subindex * is emptied or filled, to avoid inconsistencies in moving around * the CONTEXT level.... */ void do_make_subset (cmd) char cmd[]; { void beep (), create_subset (), destroy_subset (), show_current_item (); extern int level; if (cmd[1] == '\0') create_subset (); else if (cmd[1] == '*') destroy_subset (); else beep (); level = INDEX; show_current_item (); } /* Do the job of creating the empty array subset[] ... note that if * running on a machine with 16-bit int variables such as Lightspeed C * on the Mac, the calloc() function won't work for files bigger * than 16 MB or so (for NEIGHBORHOOD = 32) ....must modify this * to use clalloc(), the non-ANSI standard function for allocating * and clearing bigger memory chunks. Hence, the #ifdef LIGHTSPEED * lines below.... * * Note that there is no need to provide a "No file open!" error msg * since show_current_item() in calling function will do that for * us.... */ void create_subset () { extern char* subset; extern FILE *doc_file; extern int empty_subset; extern long max_item[]; #ifdef LIGHTSPEED char *clalloc (); #else char *calloc (); #endif void beep (), destroy_subset (); if (doc_file == NULL) return; destroy_subset (); #ifdef LIGHTSPEED if ((subset = clalloc ((unsigned long)(1 + max_item[TEXT]/(NEIGHBORHOOD*8)), (long)sizeof(char))) == NULL) #else if ((subset = calloc ((unsigned int)(1 + max_item[TEXT]/(NEIGHBORHOOD*8)), sizeof(char))) == NULL) #endif { beep (); printf ("Not enough memory for subset!\n"); return; } empty_subset = TRUE; } /* routine to destroy a subset (used in response to a ** command, * or when changing over to a new file with a : command, or when about * to create a new empty subset inside the * command) */ void destroy_subset () { extern char *subset; if (subset != NULL) free (subset); subset = NULL; } /* routine to invert the working subindex (response to a ~ command) -- * perform a logical NOT on the entire set, so that what was once a * member of the set becomes a non-member, and vice versa.... * * Must set empty_subset FALSE since we don't a priori know that the * resulting set is empty or full or what.... * * As with * and ** commands, kick the user back to the INDEX level * when this command is executed, for safety's sake.... */ void do_invert_subset () { register long i, lim; extern char *subset; extern long max_item[]; extern int empty_subset; void beep(); if (doc_file == NULL) { beep (); printf ("No file open!\n"); return; } if (subset == NULL) { beep (); printf ("No subset!\n"); return; } lim = max_item[TEXT]/(NEIGHBORHOOD*8); for (i = 0; i <= lim; ++i) subset[i] = ~subset[i]; empty_subset = FALSE; level = INDEX; show_current_item (); } /* handle the '&' command to add the current word's neighborhood to * the working subset: * & adds a few-words neighborhood * && adds a few-sentences neighborhood * &&& adds a few-paragraphs neighborhood * * Note that instead of calling show_current_item() to finish up * this function, since we know that every occurrence of the * current item will be in the working subset we can save time * and avoid re-counting them all ... hence, the final lines * of this routine */ void do_add_neighborhood (cmd) char cmd[]; { int nsize; extern int level; extern char *subset; extern FILE *doc_file, *notes_file; void beep (), set_neighborhood_bits (), get_key_rec (); extern KEY_REC this_rec, prev_rec; extern long current_item[], subset_rec_count; if (doc_file == NULL) { beep (); printf ("No file open!\n"); return; } if (subset == NULL) { beep (); printf ("No subset!\n"); return; } nsize = WORD_RANGE; if (cmd[1] == '&') { nsize = SENTENCE_RANGE; if (cmd[2] == '&') nsize = PARAGRAPH_RANGE; } level = INDEX; set_neighborhood_bits (nsize); empty_subset = FALSE; get_key_rec (&prev_rec, current_item[INDEX] - 1); get_key_rec (&this_rec, current_item[INDEX]); subset_rec_count = this_rec.ccount - prev_rec.ccount; printf ("%6ld/%-6ld %.28s\n", subset_rec_count, subset_rec_count, this_rec.kkey); if (notes_file != NULL) fprintf (notes_file, "%6ld/%-6ld %.28s\n", subset_rec_count, subset_rec_count, this_rec.kkey); } /* function to set n bits on each side of each current index term in the * subset[] array ... note that it is important to set min_item[CONTEXT] * and max_item[CONTEXT] values before using them, since they are only * set ordinarily when descending ('=' command) into the CONTEXT level... * values of prev_rec and this_rec are set every time an index item * is displayed, so they will be all right for us already.... */ void set_neighborhood_bits (n) register int n; { register long i; register PTR_REC j, j0, j1; extern long min_item[], max_item[]; extern KEY_REC prev_rec, this_rec; PTR_REC get_ptr_rec (); void set_subset_bit (); min_item[CONTEXT] = prev_rec.ccount; max_item[CONTEXT] = this_rec.ccount - 1; if ((this_rec.ccount-prev_rec.ccount) * n > GIVE_WARNING) printf ("Please wait (%ld bits to set)...\n", (this_rec.ccount - prev_rec.ccount) * n * 2 + 1); for (i = min_item[CONTEXT]; i <= max_item[CONTEXT]; ++i) { j = get_ptr_rec (i); j0 = j - n * NEIGHBORHOOD; if (j0 < min_item[TEXT]) j0 = min_item[TEXT]; j1 = j + n * NEIGHBORHOOD; if (j1 > max_item[TEXT]) j1 = max_item[TEXT]; for (j = j0; j <= j1; j += NEIGHBORHOOD) set_subset_bit (j); } } /* function to set a bit in the array subset[] for a chosen character * offset from the start of the file ... just convert the offset into * byte and bit numbers and then "OR" the 1 into that slot.... * * (An attempt to optimize for the specific value NEIGHBORHOOD = 32, * using & and >> in place of % and /, did not yield a significant * improvement in speed... ^z 870813) */ void set_subset_bit (offset) long offset; { int bit_no; long byte_no; extern char *subset; bit_no = (offset % (8 * NEIGHBORHOOD)) / NEIGHBORHOOD; byte_no = offset / (8 * NEIGHBORHOOD); subset[byte_no] |= 1 << bit_no; } /* function to return the value of a bit in the array subset[] for a * chosen character offset from the start of the document file ... * as in set_subset_bit() above, just convert offset into byte and * bit numbers, extract the relevant bit, shift it down, and return * it.... * * (An attempt to optimize for the specific value NEIGHBORHOOD = 32, * using & and >> in place of % and /, did not yield a significant * improvement in speed... ^z 870813) */ int get_subset_bit (offset) long offset; { int bit_no; long byte_no; extern char *subset; bit_no = (offset % (8 * NEIGHBORHOOD)) / NEIGHBORHOOD; byte_no = offset / (8 * NEIGHBORHOOD); return ((subset[byte_no] >> bit_no) & 1); } /* function to count and return the number of occurrences of this_rec * in the working subindex (i.e., the number of times the word comes * up with its subset[] bit turned ON) .... */ long count_subset_recs (this_recp, prev_recp) KEY_REC *this_recp, *prev_recp; { register long i, n; extern int empty_subset; PTR_REC get_ptr_rec (); int get_subset_bit (); if (empty_subset) return (0L); if (this_recp->ccount - prev_recp->ccount > GIVE_WARNING) printf ("Please wait (%ld words to check)...\n", this_recp->ccount - prev_recp->ccount); for (n = 0, i = prev_recp->ccount; i < this_recp->ccount; ++i) n += get_subset_bit ((long)get_ptr_rec (i)); return (n); } /* ----------------------do_targets.c----------------- */ /* some functions to respond to various user inputs... * 870806-13-... ^z */ /* function to move down a level in the browser hierarchy: * INDEX --> CONTEXT --> TEXT * includes various safety features against errors, and takes * care of initializations for the level being entered ... also * displays the current line when entering the new level, for the user's * convenience.... * * Modified for subset operations, ^z - 870813. Note that we must check * here to be sure that we are descending into a non-empty subset when * a subset is active and we are descending from INDEX --> CONTEXT; * we also must make sure in that case that we initialize the CONTEXT * display to start with the first good item, not just min_item[CONTEXT]. */ void do_descend () { extern int level, empty_subset; extern long current_item[], min_item[], max_item[], subset_rec_count; extern KEY_REC this_rec, prev_rec; extern FILE *doc_file; extern char *subset; void beep(); if (doc_file == NULL) { beep (); printf ("No file open!\n"); return; } if (current_item[INDEX] < 0) { beep (); printf ("No current index item!\n"); return; } if (subset != NULL && (empty_subset || subset_rec_count == 0)) { beep (); printf ("No items in subset!\n"); return; } ++level; if (level == CONTEXT) { min_item[CONTEXT] = prev_rec.ccount; max_item[CONTEXT] = this_rec.ccount - 1; if (subset == NULL) current_item[CONTEXT] = min_item[CONTEXT]; else { current_item[CONTEXT] = min_item[CONTEXT] - 1; make_subindex_move (1L); } current_item[TEXT] = get_ptr_rec (current_item[CONTEXT]); } if (level == TEXT) current_item[TEXT] = start_of_line (current_item[TEXT]); if (level > TEXT) { beep (); printf ("Can't descend past full text display!\n"); level = TEXT; } show_current_item (); } /* function to move up a level in the browser hierarchy: * TEXT --> CONTEXT --> INDEX * includes a safety net against going past INDEX, and * displays the current line when entering the new level... * note that when coming up from TEXT, must reset the current_item[TEXT] * before displaying CONTEXT line, as TEXT browsing changes it.... */ void do_ascend () { extern int level; void beep(); --level; if (level < INDEX) { beep (); printf ("Can't ascend past index display!\n"); level = INDEX; } if (level == CONTEXT) current_item[TEXT] = get_ptr_rec (current_item[CONTEXT]); show_current_item (); } /* function to jump the INDEX display to a target entered by the * user ... does a simple binary search (see K&R p.54) to get there * and then shows that line of the INDEX ... and if the * target word is not found, it beeps and shows the preceeding * word available in the index ... if called from another level, it * jumps to the INDEX level ... */ void do_target_jump (cmd) char cmd[]; { register int c, diff; register long low, high, mid; KEY_REC target_item, this_item; extern long current_item[], max_item[]; extern FILE *key_file; extern int level; void beep(), init_target_item(); if (key_file == NULL) { beep (); printf ("No file open!\n"); return; } level = INDEX; init_target_item (&target_item, cmd); low = min_item[INDEX]; high = max_item[INDEX]; while (low <= high) { mid = (low + high) / 2; get_key_rec (&this_item, mid); diff = zstrcmp (target_item.kkey, this_item.kkey); if (diff < 0) high = mid - 1; else if (diff > 0) low = mid + 1; else break; } current_item[INDEX] = mid; if (diff < 0) --current_item[INDEX]; if (current_item[INDEX] < 0) current_item[INDEX] = 0; if (diff != 0) beep (); show_current_item (); } /* my function to compare two strings and give a result as to who is * alphabetically earlier. Note that this is almost the same as strncmp() * with the fixed value of KEY_LENGTH as the maximum comparison distance, * except that I must be sure to mask the characters to make them positive * (since we want to be able to handle the non-ASCII funny letters in * the Apple character set properly/consistently). If the masking isn't * done, then inconsistent results can occur with those non-ASCII chars! */ int zstrcmp (s1, s2) register char *s1, *s2; { register int n = KEY_LENGTH; for (; --n && ((*s1 & 0xFF) == (*s2 & 0xFF)); s1++, s2++) if (!*s1) break; return ((*s1 & 0xFF) - (*s2 & 0xFF)); } /* initialize the target KEY_REC kkey string to the value typed in by * the user ... convert the user's string to all capital letters, and * pad it out to KEY_LENGTH with trailing blanks as needed ... use the * toupper() function warily (assume that it may be nonstandard and * mess up the value of non-lowercase characters, as it seems to on * the VAX and Sun C compilers I've tried -- so check and don't apply * toupper() except to lower-case characters!).... */ void init_target_item (rp, cmd) KEY_REC *rp; char cmd[]; { register int c, n; strncpy (rp->kkey, " ", KEY_LENGTH); for (n = 0; n < KEY_LENGTH && (c = cmd[n]) != 0; ++n) rp->kkey[n] = (islower (c) ? toupper (c) : c); } /* handle the printing out of N lines due to a ".N" user command ... do * the job simply by marching down a step and displaying that line N times * ... precisely equivalent to hitting the key N times. */ void do_multiprint (cmd) char cmd[]; { register long i, n; extern FILE *doc_file; void beep(); if (doc_file == NULL) { beep (); printf ("No file open!\n"); return; } if (cmd[1] == '\0') strcat (cmd, "1"); n = atol (cmd + 1); for (i = 0; i < n; ++i) { if (! make_move (1L)) break; show_current_item (); } } /* ---------------------file_utils.c--------------------- */ /* initialize, open, etc. various things for brwsr program... * 870805-13... ^z */ /* open the document, key, and pointer files ... * for now, demand that if the document file is named * then the * key file must be named *.k and the pointer file must be *.p * ... just the way that the ndxr program builds the index..... */ void open_files (cmd) char cmd[]; { char ocmd[STRLEN], *strcat(), *strcpy(); FILE *fopen(); extern FILE *doc_file, *key_file, *ptr_file; void beep(); close_files (); strcpy (ocmd, cmd + 1); if ((doc_file = fopen (ocmd, "r")) == NULL) { beep (); printf ("Error opening document file \"%s\"!\n", ocmd); close_files (); return; } strcat (ocmd, ".k"); if ((key_file = fopen (ocmd, "rb")) == NULL) { beep (); printf ("Error opening key file \"%s\"!\n", ocmd); close_files (); return; } ocmd[strlen (ocmd) - 2] = '\0'; strcat (ocmd, ".p"); if ((ptr_file = fopen (ocmd, "rb")) == NULL) { beep (); printf ("Error opening ptr file \"%s\"!\n", ocmd); close_files (); return; } } /* close off the document, key, and pointer files */ void close_files () { extern FILE *doc_file, *key_file, *ptr_file; if (doc_file != NULL) { fclose (doc_file); doc_file = NULL; } if (key_file != NULL) { fclose (key_file); key_file = NULL; } if (ptr_file != NULL) { fclose (ptr_file); ptr_file = NULL; } } /* set up initial values for current_item and max_item arrays ... * initially, current_item is set to an illegal value, -1, and * max_item[CONTEXT] and max_item[TEXT] convey information about * the sizes of the pointer and document files respectively... */ void init_items () { extern long current_item[], max_item[]; extern FILE *doc_file, *key_file, *ptr_file; extern int level; level = INDEX; current_item[INDEX] = -1; current_item[CONTEXT] = -1; current_item[TEXT] = -1; fseek (key_file, 0L, 2); max_item[INDEX] = ftell (key_file) / sizeof (KEY_REC) - 1; fseek (ptr_file, 0L, 2); max_item[CONTEXT] = ftell (ptr_file) / sizeof (PTR_REC) - 1; fseek (doc_file, 0L, 2); max_item[TEXT] = ftell (doc_file) - 1; } /* --------------------show_items.c------------------ */ /* functions to show a chosen item from an index key file,etc. * 870805-13... ^z */ /* function to show whatever the current item is, in whatever level * the user is at the moment.... basically, it just routes the * request to whichever is the appropriate agent to take care of * it, for INDEX, CONTEXT, or TEXT levels.... */ void show_current_item () { extern int level; extern FILE *doc_file; void beep(), show_current_index_item(), show_current_context_item(), show_current_text_item(); if (doc_file == NULL) { beep (); printf ("No file open!\n"); return; } if (level == INDEX) show_current_index_item (); else if (level == CONTEXT) show_current_context_item (); else show_current_text_item (); } /* function to show the current INDEX item (count of occurrences followed * by the indexed word itself) ... send copy to notes file, if open.... */ void show_current_index_item () { extern KEY_REC this_rec, prev_rec; extern long current_item[], subset_rec_count; extern FILE *notes_file; extern char *subset; long count_subset_recs (); void get_key_rec (); get_key_rec (&prev_rec, current_item[INDEX] - 1); get_key_rec (&this_rec, current_item[INDEX]); if (subset == NULL) { printf ("%6ld %.28s\n", this_rec.ccount - prev_rec.ccount, this_rec.kkey); if (notes_file != NULL) fprintf (notes_file, "%6ld %.28s\n", this_rec.ccount - prev_rec.ccount, this_rec.kkey); } else { subset_rec_count = count_subset_recs (&this_rec, &prev_rec); printf ("%6ld/%-6ld %.28s\n", subset_rec_count, this_rec.ccount - prev_rec.ccount, this_rec.kkey); if (notes_file != NULL) fprintf (notes_file, "%6ld/%-6ld %.28s\n", subset_rec_count, this_rec.ccount - prev_rec.ccount, this_rec.kkey); } } /* function to show the current CONTEXT item, a line in the form of a * key-word-in-context (KWIC) display with the indexed term in the' * middle ... the line is filtered so that the presence of tabs, 's, * or other nasty control characters won't cause any problems... * a little testing is needed to take care of end-effects, so that * nothing happens for context lines near either end of the document * file ... finally, send a copy to the notes file, if open.... */ void show_current_context_item () { register int n, m; register long p; char buf[CONTEXT_LINE_LENGTH + 1]; extern long current_item[]; extern FILE *doc_file; void beep(); p = current_item[TEXT] - CONTEXT_OFFSET; n = 0; if (p < 0) { for ( ; n < -p; ++n) buf[n] = ' '; p = 0; } if (fseek (doc_file, p, 0) != 0) { beep (); printf ("Error in fseek() getting a context line to show!\n"); return; } if ((m = fread (buf + n, sizeof (char), CONTEXT_LINE_LENGTH - n, doc_file)) == 0) { beep (); printf ("Error in fread() getting a context line to show!\n"); return; } n += m; for ( ; n < CONTEXT_LINE_LENGTH; ++n) buf[n] = ' '; for (n = 0; n < CONTEXT_LINE_LENGTH; ++n) if (! isprint (buf[n] & 0xFF)) buf[n] = ' '; buf[CONTEXT_LINE_LENGTH] = '\0'; printf ("%s\n", buf); if (notes_file != NULL) fprintf (notes_file, "%s\n", buf); } /* function to show the current TEXT item, a line from the document file * itself ... copy to notes file also, if so desired.... */ void show_current_text_item () { char buf[STRLEN + 1]; extern long current_item[]; extern FILE *doc_file; void beep(); if (fseek (doc_file, current_item[TEXT], 0) != 0) { beep (); printf ("Error in fseek() getting a text line to show!\n"); return; } if (fgets (buf, STRLEN, doc_file) == NULL) { beep (); printf ("Error in fgets() getting a text line to show!\n"); return; } printf ("%s", buf); if (notes_file != NULL) fprintf (notes_file, "%s", buf); }