Date: Mon, 2 Jun 1997 17:44:43 -0700 (PDT)
From: Mark Zimmermann

this apparently was meant to browse multi-file databases ... I
recollect vaguely trying it out on the first 5 books of the King
James Bible ... will send some test results and samples in later
msg for your amusement.... ^z

-----zbrwsr58.c-----

/* zbrwsr --- to fetch data from multifile indexed free-text databases
 * (c) 1991 by Mark Zimmermann; distributed under the terms of the GNU
 * General Public License
 *
 * This is free software and comes with NO WARRANTY WHATSOEVER!
 *
 * This program is meant to be a low-level free-text database browser,
 * kinda analogous to "ed", to be wrapped in various shells or guises
 * for actual users, or for weird folks like me to use raw sometimes....
 *
 * Usage:  zbrwsr [db]
 *
 * zbrwsr opens the database "db" and its associated index files.
 * The database consists of:
 *  - one or more document (text, usually) files (the docfile(s))
 *  - one index file containing an alphabetized list of all the distinct
 *		words in the db and their occurrence rates (the ".k" or keyfile)
 *  - one index file containing an alphabetized list of pointers to
 *		every instance of every word in the db (the ".p" or ptrfile)
 *	- if there is more than one docfile in the db, one index file
 *		containing a list of all the docfiles in their order as indexed
 *		(the ".f" or filelist file)
 *	- if a custom alphabetization order is used, a ".a" or alphafile
 *
 * The order of search when opening a db is as follows, given a database
 * name "dbname" on the command line or requested from within zbrwsr:
 *
 * 1. Try to open "dbname.f" as a filelist;
 *		if not found, then
 * 2. Try to open "dbname.k" and "dbname.p" as indices for the document
 *	  dbname, as a single-document db;
 *		if not found, then
 * 3. Try to open "dbname" itself as a filelist;
 *		if not found then
 * 4. Give up....
 *
 * ((think about how to handle errors better -- perhaps filelist files
 *		should always have some characteristic feature, to prevent any
 *		attempt to open the wrong type of file??  Perhaps they should
 *		always begin with a #keyfile and a #ptrfile set of lines??  That
 *		should avoid many mistakes where a nonfilelist file is opened!))
 * 
 */
 
/* if one needs to convert this program for compilation by an old, dumb cc,
 * for generic use, this is what happens:
 *
 * - prototypes turned into function head type declarations (via gawk
 *    program to take everything out from within the () in the prototypes)
 * - function heads turned back to old form (for things that don't
 *    call functions as arguments, single line head, etc.), using gawk
 * - <stdlib.h> reference removed, as it apparently offends...
 * - #define time_t, difftime(), SEEK_SET, and SEEK_END
 */
 
#ifdef DUMB_COMPILER
#define time_t              long
#define difftime(t1,t0)     (t1 - t0)
#define SEEK_SET            0
#define SEEK_END            2
#else
#include <stdlib.h>
#endif
 
 
#include <stdio.h>
#include <string.h>
#include <ctype.h>
 
/* for Mac (Think C) ... need to put this into a project with the ANSI
 * and the MacTraps libraries ... the <console.h> header lets main()
 * take command-line UNIX-style arguments ... very un-Macish, sorry!
 * not needed if no use is made of argc, argv...
 */
#ifdef THINK_C
#include <console.h>
#endif
 
/* define some parameters used by the following routines ... MAXSTRLEN is the
 * longest string expected to be seen in an alphafile, a filelist, etc. ...
 * KEYLENGTH is the number of letters to be kept (at most) for an indexed
 * word (28 is chosen for compatibility with my earlier index-building programs;
 * it's long enough to avoid truncating most words, adds up to a power of two
 * when the 4-byte long integer "cumulative count" is appended to make an index
 * record in the key file, and is short enough that (for reasonably large input
 * files) the key file isn't exorbitantly huge ...
 *
 * for now, allow a maximum of 10 subsets, numbers 0 through 9; change MAX_SUBSETS
 * if more are needed....
 *
 * hard-wire SUBSET_QUANTUM here, to make sure compiler knows it's a power of 2,
 * to get perhaps better efficiency.  ((Or should it be settable later on?  Might
 * want to add a command to set various parameters during execution??))
 */
#define MAXSTRLEN     256
#define SUBSET_QUANTUM 32
#define MAX_SUBSETS	   10
#define KEYLENGTH      28
#define KEY_WIDTH      28
#define NUM_WIDTH	    9
#define TRUE            1
#define FALSE           0
 
/* define data type to hold ptrfile records ... for now, just a long int
 * ((I need to read through and think deeper about the critical things that currently
 * exist in the code simply as "long" ints -- some of them surely would need to be
 * made bigger in the case of multi-gigabyte files.  Should ZPTR be kept as a
 * typedef, or should I just use plain old "long" for the ptrs, and redefine "long"
 * if/when files grow bigger than 2**31 bytes??))
 */
typedef long	ZPTR;
 
/* define data type to hold keyfile records
 */
typedef struct
  {
    char key[KEYLENGTH];
  	ZPTR ccount;
  }		ZKEY;
 
/* define data type to hold in-memory information essential to multifile
 * browsing  ... for each file in the filelist, must hold:
 *  - docfile name
 *  - offset to apply to the ptr records
 */
typedef struct
  {
  	char *docfilename;
  	long ptroffset;
  }		BROWSERFILELIST;
 
/* begin prototypes */
 
int main (int argc, char *argv[]);
void print_help (void);
void init_alpha_order (void);
void load_alpha (char *fname);
void open_database (char *cmd);
void read_filelist (char *fname, FILE *filelistfile);
void initialize_stuff (void);
long filesize (FILE *file);
void *reserve_memory (long len);
void error_exit (int errnum, char *msg, long errarg);
void add_to_flist (char *dfilename, long offset, long maxfiles);
long count_listed_files (FILE *filelistfile);
int  zstrcmp (unsigned char *s1, unsigned char *s2);
ZKEY get_keyrec (long n);
ZPTR get_ptrrec (long n);
void index_fetch (char *cmd);
void context_fetch (char *cmd);
void text_fetch (char *cmd);
long fetch_data (char *buf, ZPTR target, int before, int after);
long which_file (ZPTR ptr);
void find_word (char *cmd);
void do_subset_cmd (char *cmd);
long countInSubset (long start_instance_num, long count, long subset_num);
int inSubset (long ptr, long subset_num);
 
/* end prototypes */
 
/* global variables */
 
char keyfn[MAXSTRLEN];
FILE *kfp = NULL;
long nkeys = 0;
char ptrfn[MAXSTRLEN];
FILE *pfp = NULL;
long nptrs = 0;
long nfiles = 0;
long total_db_size = 0;
long subset_size = 0;
BROWSERFILELIST *flist = NULL;
unsigned char alpha_order[256];
char *subset[MAX_SUBSETS];
long neighborhood_size = 32;
 
/* TO DO:
 *  - let user reset neighborhood_size, etc. during run (add commands)
 *  - allow escape from slow operations
 *  - improve error detection and handling
 *  - etc....
 */
 
 
 
/*********************************************************/
 
int main (int argc, char *argv[])
  {
    int i;
    char cmd[MAXSTRLEN];
    
/* get the command line arguments for Macintosh users ...
 */
#ifdef THINK_C
    argc = ccommand (&argv);
#endif
 
	/* do necessary initializations */
	for (i = 0; i < MAX_SUBSETS; ++i)
		subset[i] = NULL;
 
	initialize_stuff ();
	
    printf ("\nWelcome to zbrwsr version 0.xx --- by ^z\n");
    
    if (argc > 1)
    	open_database (argv[1]);
    
    printf ("\n^z:");
 
	while (gets (cmd))
	  {
		switch (islower (cmd[0]) ? toupper (cmd[0]) : cmd[0])
		  {
		  	case '?':				/* HELP, version info, etc. */
		  	case 'H':
		  		print_help ();
		  		break;
		  	case 'Q':				/* QUIT */
		  		exit (0);
		  		break;
		  	case 'O':				/* OPEN a new db */
		   		open_database (cmd);
		  		break;
		  	case 'F':				/* FIND a word in the index list*/
				find_word (cmd);
		  		break;
		  	case 'I':				/*  INDEX <word_num> <count> [<subset_num>]
		  								look in INDEX for <word_num> n, and return
		  								<count> lines of results; if subset_num
		  								is given, work in that proximity subset */
		  		index_fetch (cmd);
		  		break;
		  	case 'C':	/* CONTEXT <instance_num> <count> <width> <offset> [<subset_num>]
		  								fetch <count> lines of key-word-in-context
		  								display <width> chars each, with key-word at
		  								<offset> bytes into each line, beginning at word
		  								instance number <instance_num>, optionally in
		  								subset <subset_num>; prefix with the
		  								target_byte value and the file name */
				context_fetch (cmd);
		  		break;
		  	case 'T':				/* TEXT <target_byte> <length> <offset>
		  								fetch <length> bytes from db begining at
		  								<target_byte> - <offset> and return that
		  								info (and put document filename on line 0) */
				text_fetch (cmd);
		  		break;
		  	case 'S':				/* SUBSET operation; see do_subset_cmd() for details.
		  								This operation is used to CREATE empty
		  								subsets, DESTROY them, PUT words in them,
		  								AND or OR them together, INVERT them,
		  								or whatever else you please.... */
		  		do_subset_cmd (cmd);
		  		break;
		  	default:				/* beep & complain if unrecognized command */
		  		fprintf (stderr, 
		  		  "\a '%s' is not a recognized zbrwsr command, sorry!\n", cmd);
		  		break;
		  }
		
		printf ("\n^z:");
	  }
  }
 
 
/* print out helpful(?) information for user... */
 
void print_help (void)
  {
    printf ("\zbrwsr:  browse index files for free-text information retrieval\n");
    printf ("\nzbrwsr (c) 1991 Mark Zimmermann - ^z\n");
    printf ("this program comes with NO WARRANTY whatsoever\n");
    printf ("zbrwsr is free software, under the GNU General Public License\n");
    printf ("\nCommand Summary:\n\n");
    printf ("?          - print this help\n");
    printf ("Quit       - quit, exit program\n");
    printf ("Open fname - open database `fname' (`.f' or `filelist' file)\n");
    printf ("Find WORD  - find `WORD' in index; return word_num\n");
    printf ("Index <word_num> <count> [<subset_num>]\n");
    printf ("           - print <count> lines of index starting at <word_num>;\n");
    printf ("             each line in form `<instance_num>:<word_count> <word>', or\n");
    printf ("             `<instance_num>:<in_subset_word_count> / <word_count> <word>'\n");
    printf ("Context <instance_num> <count> <width> <offset> [<subset_num>]\n");
    printf ("           - print <count> lines of context starting at <instance_num>, each\n");
    printf ("             line <width> long with word at column <offset>, prefixed by\n");
    printf ("             line giving instance_num, target_byte, and filename of source\n");
    printf ("Text <target_byte> <length> <offset>\n");
    printf ("           - print <length> bytes of text with <target_byte> located\n");
    printf ("             <offset> bytes into the returned data\n");
    printf ("Subset <subset_num> Create            - make a new, empty subset\n");
	printf ("Subset <subset_num> Destroy           - release memory used by subset\n");
	printf ("Subset <subset_num> Put <word_num>    - put <word_num> into subset\n");
	printf ("Subset <subset_num> &= <subset_num2>  - logical (Boolean) AND of subsets;\n");
	printf ("Subset <subset_num> |= <subset_num2>  - logical (Boolean) OR of subsets\n");
	printf ("Subset <subset_num> Invert            - logical NOT of <subset_num>\n");
    
    return;
  }
 
 
/* function to get the nth ZKEY record from the master key file ... return a "null"
 * ZKEY if a request for an out-of-range record number is made, a blank key word
 * and ccount = 0 (an illegal value); quit with an error if something else fails
 */
 
ZKEY get_keyrec (long n)
  {
  	ZKEY keyrec;
  	int i;
 
	if (n < 0 || n >= nkeys)
	  {
	  	for (i = 0; i < KEYLENGTH; ++i)
 	 		keyrec.key[i] = ' ';
 	 	keyrec.ccount = 0;
	  }
  	else
  	  {
  	  	if (fseek (kfp, sizeof (ZKEY) * n, SEEK_SET) != 0)
  	  		error_exit (316, "sorry, fseek() error in get_keyrec() at record #", n);
  		if (fread (&keyrec, sizeof (ZKEY), 1, kfp) == 0)
  	  		error_exit (317, "sorry, fread() error in get_keyrec() at record #", n);
   	  }
  	
  	return (keyrec);
  }
 
 
/* function to get the nth ZPTR pointer record from the master ptr file ...
 * return -1 if error occurs...
 */
 
ZPTR get_ptrrec (long n)
  {
  	ZPTR p;
  	
  	if (n < 0 || n >= nptrs)
  		p = -1;
  	else
  	  {
  	  	if (fseek (pfp, sizeof (ZPTR) * n, SEEK_SET) != 0)
  			error_exit (318, "sorry, fseek() error in get_ptrrec() at record #", n);
  		if (fread (&p, sizeof (ZPTR), 1, pfp) == 0)
  			error_exit (319, "sorry, fread() error in get_ptrrec() at record #", n);
  	  }
  	
  	return (p);
  }
 
 
/* index_fetch, function which responds to an INDEX command, syntax:
 * INDEX <word_num> <count> [<subset_num>] ---
 * look in the INDEX for word number <word_num>, in subset <subset_num> if
 * given a good subset_num, and return <count> lines of results, in the format
 *
 * <instance_num>:<word_count> <key_word>
 *     (if not working in a subset)       or
 * <instance_num>:<in_subset_word_count> / <word_count> <key_word>
 *     (if working in a subset)
 *
 * ((NOTE:  for now, don't do any of the cleverness about sampling and reporting
 * approximate percentages which I did in FREE TEXT on the Mac ... put that in
 * later, as needed??))
 */
 
void index_fetch (char *cmd)
  {
	ZKEY prev_rec, this_rec;
	long n, word_num, count, subset_num;
 
	subset_num = -1;
	if (sscanf (cmd, "%*s %ld %ld %ld", &word_num, &count, &subset_num) < 2)
	  {
		fprintf (stderr, 
		  	"\a502 zbrwsr:  INDEX <word_num> <count> [<subset_num>]\n");
		return;
	  }
	
	if (subset_num < 0)
	  {
		prev_rec = get_keyrec (word_num - 1);
		for (n = word_num; n < word_num + count; ++n)
   	     {
   			this_rec = get_keyrec (n);
 
    		printf ("%*ld:%*ld %.*s\n", NUM_WIDTH, prev_rec.ccount,
    				NUM_WIDTH, this_rec.ccount - prev_rec.ccount,
    				KEY_WIDTH, this_rec.key);
 
     		prev_rec = this_rec;
     	  }
      }
    else
      {
      	if (subset_num > MAX_SUBSETS || subset[subset_num] == NULL)
      	  {
      	  	fprintf (stderr, "\a600 zbrwsr:  subset %ld no good for index_fetch()\n",
      	  				subset_num);
      	  	return;
      	  }
      	prev_rec = get_keyrec (word_num - 1);
      	for (n = word_num; n < word_num + count; ++n)
      	  {
      	  	this_rec = get_keyrec (n);
      	  	
      	  	printf ("%*ld:%*ld / %-*ld %.*s\n", NUM_WIDTH, prev_rec.ccount,
      	  			NUM_WIDTH, countInSubset (prev_rec.ccount,
      	  						this_rec.ccount - prev_rec.ccount, subset_num),
      	  			NUM_WIDTH, this_rec.ccount - prev_rec.ccount,
      	  			KEY_WIDTH, this_rec.key);
 
			prev_rec = this_rec;
      	  }
      }
 
    return;
  }
 
 
/* context_fetch, function which responds to a CONTEXT command, syntax:
 * CONTEXT <instance_num> <count> <width> <offset> [<subset_num>] ---
 * fetch <count> lines of key-word-in-context, in subset <subset_num> if given a
 * positive value; display <width> chars each, with key-word at
 * <offset> bytes into each line, beginning at word
 * instance number <instance_num>; return the answer on the line after
 * the instance number, the database byte offset of the target byte, and the filename...
 * filter out any control characters in returned text...
 */
 
void context_fetch (char *cmd)
  {
	char *buf, *bp;
	long i, j, target_byte, file_num, instance_num,
		count, width, offset, subset_num;
 
	subset_num = -1;
	if (sscanf (cmd, "%*s %ld %ld %ld %ld %ld", 
		  		  &instance_num, &count, &width, &offset, &subset_num) < 4)
	  {
		fprintf (stderr,
		"\a503 zbrwsr:  CONTEXT <instance_num> <count> <width> <offset> [<subset_num>]\n");
		return;
	  }
	
	buf = reserve_memory (width + 1);
 
	if (subset_num < 0)
	  {			/* regular non-subset retrieval here */
		for (i = instance_num; i < instance_num + count; ++i)
		  {
			if (i >= nptrs)
			  {
			  	fprintf (stderr,
			  		"\a511 zbrwsr:  context_fetch attempt to go beyond end of db\n");
			  	return;
			  }
			target_byte = get_ptrrec (i);
			if (target_byte < 0 || target_byte >= total_db_size)
			  {
		  		fprintf (stderr, 
		  		  "\a508 zbrwsr:  error in context_fetch, sorry, at %ld\n", i);
    	 		return;
		 	  }
 
			file_num = fetch_data (buf, target_byte, offset, width - offset);
			for (bp = buf; bp < buf + width; ++bp)
				if (*bp < ' ')
					*bp = ' ';
			buf[width] = '\0';
			printf ("%*ld %*ld %s\n%s\n", NUM_WIDTH, i, NUM_WIDTH, target_byte,
						flist[file_num].docfilename, buf);
		  }
	  }
	else
	  {			/* working within a subset here; new/separate features possible... */
		if (subset_num > MAX_SUBSETS || subset[subset_num] == NULL)
		  {
		  	fprintf (stderr,
		  		"\a601 zbrwsr:  subset %ld no good for context_fetch()\n", subset_num);
    	 	return;
		  }
	
		for (i = instance_num, j = 0; j < count; ++j)
		  {
		  	for ( ; i < nptrs; ++i)
		  	  {
		  	  	target_byte = get_ptrrec (i);
		  	  	if (target_byte < 0 || target_byte >= total_db_size)
		  	  	  {
		  	  		fprintf (stderr, 
		  		 		 "\a509 zbrwsr:  error in context_fetch, sorry, at %ld\n", i);
    	 			return;
		  	  	  }
 
		  	  	if (inSubset (target_byte, subset_num))
		  	  		break;
		  	  }
		  	
			if (i >= nptrs)
			  {
			  	fprintf (stderr,
			  		"\a512 zbrwsr:  context_fetch attempt to go beyond end of db\n");
			  	return;
			  }
 
		  	/* now we have a good inSubset ptr... */
			file_num = fetch_data (buf, target_byte, offset, width - offset);
			for (bp = buf; bp < buf + width; ++bp)
				if (*bp < ' ')
					*bp = ' ';
			buf[width] = '\0';
			printf ("%*ld %*ld %s\n%s\n", NUM_WIDTH, i, NUM_WIDTH, target_byte,
						flist[file_num].docfilename, buf);
			++i;
		  }
	  }
 
	free (buf);
	return;
  }
 
 
/* text_fetch, function which responds to a TEXT command, syntax:
 *
 * TEXT <target_byte> <length> <offset>
 *	fetch <length> bytes from db begining at
 *	<target_byte> - <offset> and return that
 *	info (and put document filename on line 0)
 */
 
void text_fetch (char *cmd)
  {
  	char *buf;
  	long file_num, target_byte, count, offset;
  	
  	if (sscanf (cmd, "%*s %ld %ld %ld", &target_byte, &count, &offset) != 3)
  	  {
		fprintf (stderr, 
		  		  "\a504 zbrwsr:  TEXT <target_byte> <length> <offset>\n");
		return;
	  }
 
	if (target_byte + count - offset < 0 || target_byte - offset > total_db_size)
	  {
	  	fprintf (stderr,
	  				"\a510 zbrwsr:  target_byte = %ld out of range for db, sorry\n",
	  				target_byte);
	  	return;
	  }
 
  	buf = reserve_memory (count + 1);
  	
  	file_num = fetch_data (buf, target_byte, offset, count - offset);
  	buf[count] = '\0';
  	
  	printf ("%s\n", flist[file_num].docfilename);
  	printf ("%s", buf);
  	
  	free (buf);
  	
  	return;
  }
 
 
 
/* function to fetch data around location offset in the
 * database file(s) ... treating boundaries between multiple files as
 * impenetrable walls ((but should there be a way to let data slide
 * across boundaries??))  ((how about proximity search problems??))
 *
 * fetch_data takes a location to put its results, a target ZPTR byte
 * offset in the virtually-merged database, and integer values for
 * how much before and after the target to fetch.  Explicitly, bytes
 * are returned from offsets target-before through target+after-1,
 * so a total of before+after bytes are returned in buf...
 *
 * any locations that extend beyond a file boundary are filled with
 * blanks...
 *
 * fetch_data returns the number (in the filelist flist) of the file in
 * which the data was found, for convenience of the caller....
 */
 
long fetch_data (char *buf, ZPTR target, int before, int after)
  {
	long i, ftarget, fstart;
	int j, k, length;
	FILE *fp;
	
	i = which_file (target);
	ftarget = target - flist[i].ptroffset;
	fstart = ftarget - before;
	length = before + after;
	j = 0;
	
	/* fill with blanks before file start if necessary
	 */
	if (fstart < 0)
	  {
	  	for ( ; j < -fstart; ++j)
	  		buf[j] = ' ';
	  	fstart = 0;
	  }
	
	/* get the data from the chosen document
	 */
	if (((fp = fopen (flist[i].docfilename, "r")) == NULL) ||
			(fseek (fp, fstart, SEEK_SET) != 0) ||
			((k = fread (buf + j, sizeof (char), length - j, fp)) == 0) ||
			(fclose (fp) != 0))
	  {
	  	printf ("sorry, error getting text from file %s!\n", flist[i].docfilename);
	  	for (j = 0; j < length; ++j)
	  		buf[j] = ' ';
	  	return (i);
	  }
	
	/* fill with blanks beyond EOF if necessary
	 */
	j += k;
	for ( ; j < length; ++j)
		buf[j] = ' ';
	
	return (i);
  }
 
 
/* which_file (ptr) tells which file in the filelist belongs to ptr;
 * it returns the array index into flist ...
 *
 * ((replace linear search for which file to open in the filelist
 * with binary search, eventually, for efficiency/speed on big filelists??
 * On the other hand, how does the operating system figure out what to
 * open -- linear search or better??  Maybe this isn't worth worrying about
 * for a while??))
 */
 
long which_file (ZPTR ptr)
  {
	long i;
	
	for (i = 1; i < nfiles; ++i)
		if (flist[i].ptroffset > ptr)
	  		break;
	--i;
 
	return (i);
  }
 
 
 
/* function find_word, to look up a word in the index, FIND command syntax:
 * FIND <word>
 * 
 * result is to print the number of the word in the index list... (the argument
 * to send to the `INDEX' command, etc.)
 */
 
void find_word (char *cmd)
  {
  	int diff;
  	char name[MAXSTRLEN];
  	long low, high, mid, wordnum;
  	ZKEY this_keyrec;
  	
  	if (sscanf (cmd, "%*s %s", name) != 1)
  	  {
		fprintf (stderr, "\a501 zbrwsr:  FIND <target>\n");
		return;
	  }
 
  	low = 0;
  	high = nkeys - 1;
  	
  	while (low <= high)
  	  {
  	  	mid = (low + high) / 2;
  	  	this_keyrec = get_keyrec (mid);
  	  	if (this_keyrec.ccount == 0)
  	  	  {
  	  	  	printf ("sorry, error in find_word() seeking %s at %ld\a\n",
  	  	  				name, mid);
  	  	    return;
  	  	  }
  	  	diff = zstrcmp ((unsigned char *)name, (unsigned char *)this_keyrec.key);
  	  	if (diff < 0)
  	  		high = mid - 1;
  	  	else if (diff > 0)
  	  		low = mid + 1;
  	  	else
  	  		break;
  	  }
  	
  	if (diff < 0)
  		--mid;
  	if (mid < 0)
  		mid = 0;
 
	printf ("%ld\n", mid);
	
	return;
  }
  
  
/* do a command associated with a proximity-subset... following are legal commands:
 *	SUBSET <subset_num> CREATE  --- make a new, empty subset
 *  SUBSET <subset_num> DESTROY --- release memory used by subset
 *  SUBSET <subset_num> PUT <word_num> --- put word number <word_num> in subset
 *  SUBSET <subset_num> AND <subset_num2> --- logical (Boolean) AND of subsets,
 *				with result left in <subset_num>
 *  SUBSET <subset_num> OR <subset_num2>  --- logical (Boolean) OR of subsets,
 *				with result left in <subset_num>
 *  SUBSET <subset_num> INVERT --- logical NOT on subset <subset_num>
 *  
 */
 
void do_subset_cmd (char *cmd)
  {
  	register char *cp, *cp2, *cp_end;
  	int bit_num;
  	long subset_num, other_num, i, text_ptr, tp, tp_end, byte_num;
  	char subcmd[MAXSTRLEN];
	ZKEY prev_rec, this_rec;
  
	if (sscanf (cmd, "%*s %ld %s %ld", &subset_num, subcmd, &other_num) < 2)
	  {
		fprintf (stderr, "\a505 zbrwsr:  SUBSET <subset_num> <SUB_COMMAND> [<arg>]\n");
		return;
	  }
 
	if (subset_num < 0 || subset_num >= MAX_SUBSETS)
	  {
	  	fprintf (stderr, "\a506 zbrwsr:  illegal subset number %ld\n", subset_num);
	  	return;
	  }
 
	switch (islower (subcmd[0]) ? toupper (subcmd[0]) : subcmd[0])
	  {
	  	case 'C':								/* CREATE a new, empty subset */
	  		if (subset[subset_num] != NULL)
	  		  {
	  		  	fprintf (stderr,
	  		  	  "\a507 zbrwsr:  can't CREATE; subset %ld already in use\n", subset_num);
	  		  	return;
	  		  }
	  		subset[subset_num] = reserve_memory (subset_size);
	  		for (cp = subset[subset_num], cp_end = subset[subset_num] + subset_size;
	  				cp < cp_end; ++cp)
	  			*cp = 0x00;
	  		break;
	  	
	  	case 'D':								/* DESTROY a subset */
	  		if (subset[subset_num] != NULL)
	  		  {
	  		  	free (subset[subset_num]);
	  		  	subset[subset_num] = NULL;
	  		  }
	  		break;
	  	
	  	case 'P':		/* PUT a word into a subset, setting bits for its instances;
	  						note that we do NOT, in this implementation, pay any
	  						attention to the boundaries between files in the db....*/
	  	
			prev_rec = get_keyrec (other_num - 1);
			this_rec = get_keyrec (other_num);
			
			for (i = prev_rec.ccount; i < this_rec.ccount; ++i)
			  {
			  	text_ptr = get_ptrrec (i);
			  	if (text_ptr < 0)
			  	  {
	  		  		fprintf (stderr,
	  		  	 	  "\a510 zbrwsr:  error in SUBSET PUT fetching instance %ld\n", i);
	  		  		return;
			  	  }
				tp = text_ptr - neighborhood_size;
				if (tp < 0)
					tp = 0;
				tp_end = text_ptr + neighborhood_size;
				if (tp_end >= total_db_size)
					tp_end = total_db_size - 1;
				for ( ; tp <= tp_end; tp += SUBSET_QUANTUM)
				  {
					bit_num = (tp % (8 * SUBSET_QUANTUM)) / SUBSET_QUANTUM;
    				byte_num = tp / (8 * SUBSET_QUANTUM);
					*(subset[subset_num] + byte_num) |= 1 << bit_num;
				  }
			  }
	  		break;
	  	
	  	case '&':		/* AND two subsets together, leaving result in first */
	  		if (other_num < 0 || other_num > MAX_SUBSETS || subset[other_num] == NULL)
	  		  {
	  		  	fprintf (stderr,
	  		  	  "\a520 zbrwsr:  can't AND with second subset %ld\n", other_num);
	  		  	return;
	  		  }
	  		if (subset[subset_num] == NULL)
	  		  {
	  		  	fprintf (stderr,
	  		  	  "\a521 zbrwsr:  can't AND into null subset %ld\n", subset_num);
	  		  	return;
	  		  }
	  		for (cp = subset[subset_num], cp_end = subset[subset_num] + subset_size,
	  				cp2 = subset[other_num]; cp < cp_end; ++cp)
	  			*cp &= *cp2;
	  		break;
	  	
	  	case '|':		/* OR two subsets together, leaving result in first */
	  		if (other_num < 0 || other_num > MAX_SUBSETS || subset[other_num] == NULL)
	  		  {
	  		  	fprintf (stderr,
	  		  	  "\a522 zbrwsr:  can't OR with second subset %ld\n", other_num);
	  		  	return;
	  		  }
	  		if (subset[subset_num] == NULL)
	  		  {
	  		  	fprintf (stderr,
	  		  	  "\a523 zbrwsr:  can't OR into null subset %ld\n", subset_num);
	  		  	return;
	  		  }
	  		for (cp = subset[subset_num], cp_end = subset[subset_num] + subset_size,
	  				cp2 = subset[other_num]; cp < cp_end; ++cp)
	  			*cp |= *cp2;
	  		break;
	  	
	  	case 'I':								/* INVERT (logical NOT) a subset */
	  		if (subset[subset_num] == NULL)
	  		  {
	  		  	fprintf (stderr,
	  		  	  "\a524 zbrwsr:  can't INVERT into null subset %ld\n", subset_num);
	  		  	return;
	  		  }
	  		for (cp = subset[subset_num], cp_end = subset[subset_num] + subset_size;
	  				cp < cp_end; ++cp)
	  			*cp = ~*cp;
	  		break;
	  	
	  	default:								/* catch an error here */
	  		fprintf (stderr, 
		  		  "\a '%s' is not a recognized zbrwsr subset command, sorry!\n", cmd);
	  		break;
	  }
	
  	return;
  }
 
 
/* count the number of times that the word instances beginning at <start_instance_num>
 * and occurring <count> times occurs in the subset <subset_num> ... just fetch the
 * ptrs, check corresponding flag bits and count up the number turned 'on'....
 */
 
long countInSubset (long start_instance_num, long count, long subset_num)
  {
	long successes, n;
	
	successes = 0;
	
	for (n = start_instance_num; n < start_instance_num + count; ++n)
		if (inSubset (get_ptrrec (n), subset_num))
			++successes;
	
	return (successes);
  }
 
 
/* return true if ptr is "in" subset subset_num (its flag bit is turned on);
 * return false otherwise....
 */
 
int inSubset (long ptr, long subset_num)
  {
  	int bitNumber;
  	long byteNumber;
  	
  	bitNumber = (ptr % (8 * SUBSET_QUANTUM)) / SUBSET_QUANTUM;
  	byteNumber = ptr / (8 * SUBSET_QUANTUM);
  	
  	return ((subset[subset_num][byteNumber] >> bitNumber) & 1);
  }
 
 
/* initialize alpha_order table to default -- simple ASCII ordering, but with
 * space (blank) set to null (0), to make string comparisons in zstrcmp() come
 * out nicely for null-terminated strings, in the FIND command, etc.
 */
 
void init_alpha_order (void)
  {
  	int i;
  	
  	for (i = 0; i <= 255; ++i)
  		alpha_order[i] = i;
	alpha_order[' '] = '\0';
	
	return;
  }	
 
 
/* load alpha file to redefine alphabetization order if necessary....
 *
 * See the source code of zndxr.c for additional commentary and
 * explicit examples of alphafile formatting ...
 */
 
void load_alpha (char *fname)
  {
    FILE *alphafile;
    int c, i, len, order = 0;
    char buf[MAXSTRLEN];
 
    printf ("loading alphafile '%s'\n", fname);
    alphafile = fopen (fname, "r");
    if (alphafile == NULL)
    	error_exit (300, "fopen() error in load_alpha()", 0L);
	
	for (i = 0; i <= 255; ++i)
		alpha_order[i] = '\0';
	
	while (fgets (buf, MAXSTRLEN, alphafile) != NULL)
	  {
	  	/* skip a comment line (any line which begins with "# ") and
	  	 * skip a completely empty line (containing only '\n')
	  	 */
		if ((buf[0] == '#' && buf[1] == ' ') || buf[0] == '\n')
			continue;
 
		/* increment the counter that keeps track of where to put this
		 * line's characters alphabetically ...
		 */
		if (++order > 255)
    		error_exit (301, "++order error in load_alpha()", (long)order);
 
		len = strlen (buf);
		if (buf[len - 1] == '\n')
			/* remove any final '\n' from lines that fgets() supplies...
			 */
			--len;
		
		for (i = 0; i < len; ++i)
		  {
			c = buf[i];
			
			if (c == '~')
				/* skip '~' since we don't care about embedded punctuation
				 * for browsing purposes
				 */
			  	continue;
			
			/* convert escaped characters after '#' into proper form
			 */
			if (c == '#')
			  {
			 	c = 0;
			 	++i;
				while (buf[i] >= '0' && buf[i] <= '9')
					c = 10 * c + buf[i++] - '0';
				--i;
			  }
			
			/* safety net -- following *should* never happen!
			 */
			if (c < 0 || c > 255)
				error_exit (302, "character error in load_alpha()", (long)c);
			
			alpha_order[c] = order;
		  }
	  }
	fclose (alphafile);
	
	return;
  }
 
 
/* open a database given the file name "dbname":
 * 1. Try to open "dbname.f" as a filelist;
 *		if not found, then
 * 2. Try to open "dbname.k" and "dbname.p" as indices for the document
 *	  fname, as a single-document db;
 *		if not found, then
 * 3. Try to open "dbname" itself as a filelist;
 *		if not found then
 * 4. Give up....
 *
 * see zndxr.c sources for full details of filelist format...
 */
 
void open_database (char *cmd)
  {
	FILE *filelistfile, *lastdocfile;
	int len;
	char fname[MAXSTRLEN], dbname[MAXSTRLEN];
 
	if (sscanf (cmd, "%*s %s", dbname) != 1)
	  {
		fprintf (stderr, "\a500 zbrwsr:  OPEN <db_filename>\n");
		return;
	  }
 
    printf ("opening database '%s'\n", dbname);
    
    initialize_stuff ();
 
	strcpy (fname, dbname);
	strcat (fname, ".f");
	
    filelistfile = fopen (fname, "r");
    if (filelistfile != NULL)
      {
      	/* successful in finding the '.f' filelist... */
    	read_filelist (fname, filelistfile);
    	fclose (filelistfile);
   		printf ("filelist %s loading completed\n", fname);
      }
    else
      {
      	/* no .f filelist; see if it's a docfile with .k and .p
      	 * indices available
      	 */
      	printf ("filelist %s not found!\n", fname);
      	printf ("looking for single-file db %s\n", dbname);
    	
    	len = strlen (fname);
    	strcpy (keyfn, fname);
    	keyfn[len - 1] = 'k';
    	strcpy (ptrfn, fname);
   		ptrfn[len - 1] = 'p';
    	kfp = fopen (keyfn, "rb");
    	if (kfp != NULL)
   	 		fclose (kfp);
     	pfp = fopen (ptrfn, "rb");
     	if (pfp != NULL)
  	 		fclose (pfp);
    	
   	 	if (kfp != NULL && pfp != NULL)
   	 	  {
   	 	  	/* found .k and .p files; now set up fake flist
   	 	  	 * with single docfile, and proceed to open as usual...
   	 	  	 */
   	 	  	flist = reserve_memory (sizeof (BROWSERFILELIST));
   	 	  	add_to_flist (fname, 0, 1);
   	 	  }
   	 	else
   	 	  {
   	 	  	/* no .k and .p files, so make one last try...
   	 	  	 */
    		printf ("index files %s, %s not found!\n", keyfn, ptrfn);
    		printf ("looking for filelist %s\n", dbname);
    		
    		filelistfile = fopen (dbname, "r");
    		if (filelistfile != NULL)
    		  {
    			/* ok, assume that the file itself is a filelist
    			 * without .f to be added, and try to load it
    			 */
    			read_filelist (dbname, filelistfile);
    			fclose (filelistfile);
   				printf ("filelist %s loading completed\n", dbname);
    		  }
    		else
    		  {
    		  	/* ultimate failure here ... sorry!  return to caller...
    		  	 * ((think about how to handle errors better??))
    		  	 */
    		  	printf ("Sorry, couldn't open database %s\n", dbname);
    		  	return;
    		  }
    	  }
  	  }
  	
    printf ("opening master database key and ptr files %s and %s\n",
    			keyfn, ptrfn);
    kfp = fopen (keyfn, "rb");
    if (kfp == NULL)
    	error_exit (314, "error opening keyfile in open_database()", 0L);
    nkeys = filesize (kfp) / sizeof (ZKEY);
    pfp = fopen (ptrfn, "rb");
    if (pfp == NULL)
    	error_exit (315, "error opening ptrfile in open_database()", 0L);
    nptrs = filesize (pfp) / sizeof (ZPTR);
    
    /* get total database size for subset searching convenience; also set
     * subset size here based on number of bytes of database per bit of subset */
    lastdocfile = fopen (flist[nfiles-1].docfilename, "r");
    if (lastdocfile == NULL)
          total_db_size = flist[nfiles-1].ptroffset;
    else
      {
        total_db_size = flist[nfiles-1].ptroffset + filesize (lastdocfile);
        fclose (lastdocfile);
      }
	subset_size = 1 + total_db_size / (SUBSET_QUANTUM * 8);
	
	return;
  } 
    		
    		
    		
/* read in a filelist of database file information ...
 */
 
void read_filelist (char *fname, FILE *filelistfile)
  {
  	int n, len;
	long num, offset, maxfiles;
	char buf[MAXSTRLEN], filename[MAXSTRLEN];
  
    maxfiles = count_listed_files (filelistfile);
  	printf ("reserving %ld bytes for %ld in-memory filelist structures\n",
  	  				sizeof (BROWSERFILELIST) * maxfiles, maxfiles);
  	flist = reserve_memory (sizeof (BROWSERFILELIST) * maxfiles);
    rewind (filelistfile);
    printf ("loading filelist...\n");
    
    /* set up default .k and .p filenames */
    len = strlen (fname);
    strcpy (keyfn, fname);
    keyfn[len - 1] = 'k';
    strcpy (ptrfn, fname);
    ptrfn[len - 1] = 'p';
 
	while (fgets (buf, MAXSTRLEN, filelistfile) != NULL)
	  {
	  	/* skip over a comment line (any line which begins with '#' not
	  	 * immediately followed by a recognized directive); process a
	  	 * directive if relevant here (e.g., #alphafile fname)...
	  	 */
		if (buf[0] == '#')
		  {
		  	n = sscanf (buf, "#alphafile: %s", filename);
		  	if (n == 1)
		  	  {
		  		load_alpha (filename);
		  		continue;
		  	  }
		  	
		  	n = sscanf (buf, "#keyfile: %s", filename);
		  	if (n == 1)
		  	  {
		  		strcpy (keyfn, filename);
		  		printf ("database keyfile name = '%s'\n", keyfn);
		  		continue;
		  	  }
 
		  	n = sscanf (buf, "#ptrfile: %s", filename);
		  	if (n == 1)
		  	  {
		  		strcpy (ptrfn, filename);
		  		printf ("database ptrfile name = '%s'\n", ptrfn);
		  		continue;
		  	  }
 
		  	continue;
		   }
 
		/* now process "normal" filelist lines, and add the relevant
		 * information from them to the flist
		 */
		n = sscanf (buf, " %s %*s %*s %ld", filename, &offset);
		
		/* skip over empty or mysteriously-bad lines
		 */
		if (n == EOF || n != 2)
			continue;
			
		add_to_flist (filename, offset, maxfiles);
      }
    
    return;
  }
 
 
/* free memory used for an earlier filelist, if any,
 * close any open key and ptr files, reset everything, etc.
 */
 
void initialize_stuff (void)
  {
  	long i;
  	
  	for (i = 0; i < nfiles; ++i)
  	  	if (flist[i].docfilename != NULL)
  	  		free (flist[i].docfilename);
  	
  	if (flist != NULL)
  		free (flist);
  		
  	flist = NULL;
  	nfiles = 0;
  	if (kfp != NULL)
  		fclose (kfp);
  	kfp = NULL;
  	nkeys = 0;
  	if (pfp != NULL)
  		fclose (pfp);
  	pfp = NULL;
  	nptrs = 0;
  	total_db_size = 0;
  	subset_size = 0;
  	neighborhood_size = 48;
  	
  	init_alpha_order ();
  	
  	for (i = 0; i < MAX_SUBSETS; ++i)
  		if (subset[i] != NULL)
  		  {
  			free (subset[i]);
  			subset[i] = NULL;
  		  }
  	
  	return;
  }
  
 
/* determine the size of a file
 */
 
long filesize (FILE *file)
  {
    long len;
 
    if (fseek (file, 0L, SEEK_END) != 0)
    	error_exit (304, "fseek() error in filesize()", 0L);
    len = ftell (file);
    if (len == -1L)
    	error_exit (305, "ftell() error in filesize()", 0L);
 
    return (len);
  }
 
 
/* reserve memory for holding len bytes
 */
 
void *reserve_memory (long len)
  {
    void *p;
 
    p = malloc (len);
    if (p == NULL)
    	error_exit (306, "out of memory error in reserve_memory()", len);
 
    return (p);
  }
 
 
/* ugh! --- exit with an error message --- some fatal mistake has been made
 */
 
void error_exit (int errnum, char *msg, long errarg)
  {
	fprintf (stderr, "\nFATAL ERROR -- very sorry\n");
	fprintf (stderr, "%d zbrwsr %s %ld\n", errnum, msg, errarg);
	exit (errnum);
	
	return;
  }
 
 
/* append new information to the flist, the in-memory data structure which
 * holds all the essential information for zbrwsr to find files,
 * fetch data, etc....
 */
 
void add_to_flist (char *dfilename, long offset, long maxfiles)
  {
  	char *fnp;
  	FILE *fp;
  	
	flist[nfiles].ptroffset = offset;
 
	/* now store the doc file name and count the file
	 */
	fnp = reserve_memory (strlen (dfilename) + 1);
	strcpy (fnp, dfilename);
	flist[nfiles].docfilename = fnp;
	
	++nfiles;
	
	/* this test should *never* succeed!
	 */
	if (nfiles > maxfiles)
		error_exit (313, "sorry, filelist load error at nfiles =", nfiles);
	
	return;
  }
 
 
/* count the number of files in the filelist ... do it wastefully in terms of
 * CPU cycles, but safely (since this way our count should agree with that of
 * the routine that loads the information during the second pass through the
 * filelist file) ... and note that since the time of file I/O will probably be
 * far far greater than the time required to parse the line with sscanf(), this
 * way probably isn't all that inefficient anyway; and note that since the
 * filelist will probably be in an I/O buffer for the subsequent pass through,
 * this counting operation may not actually cost much extra time at all....
 * ((or is this just wishful thinking??))
 */
 
long count_listed_files (FILE *filelistfile)
  {
  	int n;
  	long numfiles = 0, offset;
	char buf[MAXSTRLEN], filename[MAXSTRLEN];
  	
  	while (fgets (buf, MAXSTRLEN, filelistfile) != NULL)
	  {
	  	/* skip over a comment line
	  	 */
		if (buf[0] == '#')
			continue;
 
		/* count filelist lines
		 */
		n = sscanf (buf, " %s %*s %*s %ld", filename, &offset);
		
		/* skip over empty or mysteriously-bad lines, as elsewhere
		 */
		if (n == EOF || n != 2)
			continue;
		
		/* ok, looks, good, count this as a file
		 */
		++numfiles;
      }
 
	return (numfiles);
  }
 
 
/* my function to compare two strings and give a result as to who is
 * alphabetically earlier.  Note that this is almost the same as strncmp()
 * with the fixed value of KEY_LENGTH as the maximum comparison distance,
 * except that I must be sure to handle the non-ASCII funny letters in
 * the Apple character set properly/consistently ... hence the need to
 * declare s1 and s2 to be type unsigned char *... also need to map the
 * characters according to any custom alphabetization order mapping, 
 * specified in alpha_order[] (default:  alpha_order[c] == c for nonblank c)
 *
 * note that with the default alpha_order[] mapping of ' ' (blank) to '\0',
 * life becomes rather simple and there is no worry about null-terminated
 * strings that come from the FIND command being mis-compared with the
 * right-padded-with-blanks index file entries....
 */
 
int zstrcmp (unsigned char *s1, unsigned char *s2)
  {
    int n = KEYLENGTH;
    
    for (; --n && alpha_order[*s1] == alpha_order[*s2]; s1++, s2++)
        if (!alpha_order[*s1])
        	break;
        
    return (alpha_order[*s1] - alpha_order[*s2]);
  }
 

