Date: Mon, 2 Jun 1997 17:45:49 -0700 (PDT)
From: Mark Zimmermann

here's a little experiment in doing statistical analyses of
different indices and looking for characteristic words.... ^z

-----zcmpndx04.c-----

/* Program to compare two indices -- "compndx.c" -- 870924-... by ^z
 * revised 910320-... ^z
 *
 * this is FREE SOFTWARE under the GNU GPL and comes with NO WARRANTY
 * WHATSOEVER!!!
 *
 * Concept:
 *
 *	take two files which have been indexed by zndxr.c -- compare them
 *	and report on terms which occur at significantly different rates
 *	in the second file relative to the first.
 *
 *
 * Simpleminded test for statistical significance:
 *
 *	Assume that words are randomly drawn from a pool of possibilities.
 *	Let t1 be the total number of words in file 1, and t2 be the total
 *	number of words in file 2.  Consider a particular word (such as
 *	"simpleminded"); suppose it occurs n1 times in file 1 and n2 times
 *	in file 2.
 *
 *	Suppose file 2 is big and file 1 is small.  Take the chosen word's
 *	average occurrence rate  in file 2, n2/t2, as the expected rate.
 *	Then in file 1 one expects to see the word t1*n2/t2 times.  The
 *	rough expectation of deviation about this value is its square root,
 *	sqrt(t1*n2/t2).  The word in file 1 that occurs n1 times thus is
 *	sigma = (n1 - t1*n2/t2)/sqrt(t1*n2/t2) standard deviations away
 *	from the expectation.  (If a word doesn't occur at all in file 2
 *	but shows up in file 1, be sure not to divide by zero!)
 *
 *	Let the user set a threshhold, or choose a default one, for reporting
 *	on words which are too frequent or too infrequent relative to the
 *	statistical expectation.  Do this for every word in file 1.....
 *
 */
 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
 
/* for Mac (Think C) ... need to put this into a project with the ANSI
 * and the MacTraps libraries ... the <console.h> header lets main()
 * take command-line UNIX-style arguments ... very un-Macish, sorry!
 * not needed if no use is made of argc, argv...
 */
#ifdef THINK_C
#include <console.h>
#endif
 
/* KEY_LENGTH is the number of letters we have in each record.... */
#define KEY_LENGTH      28
 
/* structure of the records in the index key file:
 *      a fixed-length character string kkey, filled out with blanks and
 *              containing the unique alphanumeric 'words' in the document file,
 *              changed to all-capital letters;
 *      a cumulative count of how many total occurrences of words, including
 *              the current one, have happened up to this point in the alphabetized
 *              index.
 *
 *      Obviously, the number of occurrences of the nth word is just the
 *              difference between the nth cumulative count and the (n-1)th
 *              cumulative count.  Also, the (n-1)th cumulative count is a
 *              pointer to the first occurrence of the nth word in the ptr_file
 *              (which holds all the index offset pointers for the document file).
 *
 *      See the index building program "zndxr.c" for further details of the
 *              index structure....
 */
 
typedef struct
  {
        char kkey[KEY_LENGTH];
        long ccount;
  }  KEY_REC;
 
 
#ifndef TRUE
#define TRUE  1
#endif
#ifndef FALSE
#define FALSE  0
#endif
 
 
/* some function prototypes */
 
#ifdef THINK_C
 
void main(int argc, char *argv[]);
void show_words (FILE *kf1, long t1, long d1, FILE *kf2, long t2,
	long d2, double threshhold, int show_new_words, int show_missing_words);
FILE *open_kfile (char *fn);
int fetch_next_word (KEY_REC *recp, FILE *key_file);
void print_help_and_quit (void);
int is_true (char *cmd);
 
#endif 
 
 
/* *********************** main *******************************
 */
 
 
void main (argc, argv)
  int argc;
  char *argv[];
  {
	unsigned long start_time, end_time, time();
	long d1, d2, t1, t2;
	int show_new_words, show_missing_words;
	FILE *kf1, *kf2, *open_kfile();
	char *ctime();
	double threshhold, atof();
	KEY_REC last_word;
	void exit(), show_words(), print_help_and_quit();
 
/* get the command line arguments for Macintosh users ...
 */
#ifdef THINK_C
    argc = ccommand (&argv);
#endif
	
	if (argc < 3)
		print_help_and_quit ();
 
	time (&start_time);
	printf ("Starting work:  %s\n", ctime (&start_time));
 
	kf1 = open_kfile (argv[1]);
	kf2 = open_kfile (argv[2]);
	
	if (argc > 3)
		threshhold = atof (argv[3]);
	else
		threshhold = 5.0;
 
	if (argc > 4)
		show_new_words = is_true (argv[4]);
	else
		show_new_words = TRUE;
	
	if (argc > 5)
		show_missing_words = is_true (argv[5]);
	else
		show_missing_words = TRUE;
	
	fseek (kf1, 0L, 2);
	d1 = ftell (kf1) / sizeof(KEY_REC);
	fseek (kf1, (long)(-sizeof(KEY_REC)), 2);
	fetch_next_word (&last_word, kf1);
	t1 = last_word.ccount;
	fseek (kf2, 0L, 2);
	d2 = ftell (kf2) / sizeof(KEY_REC);
	fseek (kf2, (long)(-sizeof(KEY_REC)), 2);
	fetch_next_word (&last_word, kf2);
	t2 = last_word.ccount;
	
	printf ("file                total words     distinct words\n");
	printf ("----                -----------     --------------\n");
	printf ("%-20s%11ld        %11ld\n", argv[1], t1, d1);
	printf ("%-20s%11ld        %11ld\n", argv[2], t2, d2);
 
	printf ("\nThreshhold surplus sigmas = %5.2f\n", threshhold);
	printf ("\nSurprising words in \"%s\" as compared with \"%s\":\n",
  				argv[1], argv[2]);
 
	show_words (kf1, t1, d1, kf2, t2, d2, threshhold, show_new_words,
		show_missing_words);
	
  	printf ("\nComparison of \"%s\" with \"%s\" completed!\n",
  				argv[1], argv[2]);
  	time (&end_time);
  	printf ("Ending work:  %s\n", ctime (&end_time));
  	printf ("Elapsed time = %ld seconds.\n", end_time - start_time);
  	printf ("Comparison rate = %1.f words/second.\n", (float)(d1 + d2) /
  				( end_time - start_time ));
  }
 
 
/* function to open the *.k keyfile
 */
 
FILE *open_kfile (fn)
  char *fn;
  {
	FILE *fopen(), *fp;
	void exit();
	
	if ((fp = fopen (fn, "rb")) == NULL)
	  {
		printf ("Fatal error in attempt to open file \"%s\"!\n", fn);
		exit (1);
	  }
	
	return (fp);
  }
 
 
/* function to get the next KEY_REC from key_file and store it in the
 * KEY_REC space pointed to by recp ... return TRUE if all is seemingly
 * well, and FALSE if attempt to read past EOF or other error ... in which
 * case be sure to set the kkey field to a high ASCII value, to win any
 * comparisons with other keys alphabetically -- use "~" (tilde) for
 * that purpose.
 */
 
int fetch_next_word (recp, key_file)
  KEY_REC *recp;
  FILE *key_file;
  {
	void exit();
	char *strncpy();
 
	if (! fread ((char *)recp, sizeof (KEY_REC), 1, key_file))
	  {
		strncpy (recp->kkey, "~", 1);
		return (FALSE);
	  }
	return (TRUE);
  }
 
 
/* function to print out a little help for the user ...
 */
 
void print_help_and_quit ()
  {
	void exit();
 
	printf ("zcmpndx:  Not enough command line arguments!\n");
	printf ("\nPlease provide the names of two `.k' index keyfiles:\n");
	printf ("a file to be tested, and a standard (big, fiducial) file to\n");
	printf ("compare it against.  The program will then go through the\n");
	printf ("first file and print out all of the terms in it that occur\n");
	printf ("at a rate significantly above or below their expected\n");
	printf ("rate, as defined by the second file.  Optionally, put a\n");
	printf ("threshhold different than the default (5.0 `sigma') on\n");
	printf ("the command line after the two file names.  To suppress display\n");
	printf ("of new or missing words, put further nonzero parameters on\n");
	printf ("the command line.\n");
	printf ("\nThus:  zcmpndx test_file.k standard_file.k [threshhold [no_new [no_missing]]]\n");
	printf ("\nSend comments, etc. to the author, Mark^Zimmermann;\n");
	printf ("science@oasys.dt.navy.mil; [75066,2044] on CompuServe;\n");
	printf ("This is free software under GNU GPL & comes with NO WARRANTY WHATSOEVER!\n");
	
	exit(0);
  }
 
 
/* function to take a command-line value and make it TRUE or FALSE
 * in a reasonable way....
 */
 
is_true (cmd)
  char *cmd;
  {
	if (cmd[1] == 'y' || cmd[1] == 'Y' || atoi (cmd) != 0)
		return (TRUE);
	else
		return (FALSE);
  }
 
 
/* function to show the results... 
 */
 
void show_words (kf1, t1, d1, kf2, t2, d2, threshhold, show_new_words,
		show_missing_words)
  FILE *kf1, *kf2;
  long t1, d1, t2, d2;
  double threshhold;
  int show_new_words, show_missing_words;
  {
	KEY_REC rec1, rec2;
	long prev_cc1, prev_cc2, n1, n2;
	int new_word2;
	double sigma, t1_over_t2, expect, sqrt(), fabs();
 
	t1_over_t2 = (double)t1 / t2;
	fseek (kf1, 0L, 0);
	fseek (kf2, 0L, 0);
	prev_cc1 = 0;
	prev_cc2 = 0;
	fetch_next_word (&rec1, kf1);
	fetch_next_word (&rec2, kf2);
	new_word2 = TRUE;
	
	printf ("\n    word       sigma    expected  actual  in std\n");
	printf (  "    ----       -----    --------  ------  ------\n");
	
	do
	  {
		while (strncmp (rec1.kkey, rec2.kkey, KEY_LENGTH) > 0)
		  {
			if (show_missing_words && new_word2)
			  {
				n2 = rec2.ccount - prev_cc2;
				expect = t1_over_t2 * n2;
				sigma = -sqrt (expect);
				if (fabs (sigma) > threshhold)
					printf ("%.12s %7.1f     %7.1f       0%8ld\n",
							rec2.kkey, sigma, expect, n2);
			  }
			prev_cc2 = rec2.ccount;
			fetch_next_word (&rec2, kf2);
			new_word2 = TRUE;
		  }
		if (strncmp (rec1.kkey, rec2.kkey, KEY_LENGTH) == 0)
		  {
			n1 = rec1.ccount - prev_cc1;
			n2 = rec2.ccount - prev_cc2;
			expect = t1_over_t2 * n2;
			sigma = (n1 - expect) / sqrt (expect);
			if (fabs (sigma) > threshhold)
				printf ("%.12s %7.1f     %7.1f%8ld%8ld\n",
						rec1.kkey, sigma, expect, n1, n2);
		  }
		else
		  {
			if (show_new_words)
				printf ("%.12s     ---           0%8ld       0\n",
						rec1.kkey, rec1.ccount - prev_cc1);
		  }
		
		new_word2 = FALSE;
		prev_cc1 = rec1.ccount;
	  }
	while (fetch_next_word (&rec1, kf1));
	
	return;
  }
 

