Date: Mon, 2 Jun 1997 17:45:49 -0700 (PDT) From: Mark Zimmermann here's a little experiment in doing statistical analyses of different indices and looking for characteristic words.... ^z -----zcmpndx04.c----- /* Program to compare two indices -- "compndx.c" -- 870924-... by ^z * revised 910320-... ^z * * this is FREE SOFTWARE under the GNU GPL and comes with NO WARRANTY * WHATSOEVER!!! * * Concept: * * take two files which have been indexed by zndxr.c -- compare them * and report on terms which occur at significantly different rates * in the second file relative to the first. * * * Simpleminded test for statistical significance: * * Assume that words are randomly drawn from a pool of possibilities. * Let t1 be the total number of words in file 1, and t2 be the total * number of words in file 2. Consider a particular word (such as * "simpleminded"); suppose it occurs n1 times in file 1 and n2 times * in file 2. * * Suppose file 2 is big and file 1 is small. Take the chosen word's * average occurrence rate in file 2, n2/t2, as the expected rate. * Then in file 1 one expects to see the word t1*n2/t2 times. The * rough expectation of deviation about this value is its square root, * sqrt(t1*n2/t2). The word in file 1 that occurs n1 times thus is * sigma = (n1 - t1*n2/t2)/sqrt(t1*n2/t2) standard deviations away * from the expectation. (If a word doesn't occur at all in file 2 * but shows up in file 1, be sure not to divide by zero!) * * Let the user set a threshhold, or choose a default one, for reporting * on words which are too frequent or too infrequent relative to the * statistical expectation. Do this for every word in file 1..... * */ #include #include #include #include #include /* for Mac (Think C) ... need to put this into a project with the ANSI * and the MacTraps libraries ... the header lets main() * take command-line UNIX-style arguments ... very un-Macish, sorry! * not needed if no use is made of argc, argv... */ #ifdef THINK_C #include #endif /* KEY_LENGTH is the number of letters we have in each record.... */ #define KEY_LENGTH 28 /* structure of the records in the index key file: * a fixed-length character string kkey, filled out with blanks and * containing the unique alphanumeric 'words' in the document file, * changed to all-capital letters; * a cumulative count of how many total occurrences of words, including * the current one, have happened up to this point in the alphabetized * index. * * Obviously, the number of occurrences of the nth word is just the * difference between the nth cumulative count and the (n-1)th * cumulative count. Also, the (n-1)th cumulative count is a * pointer to the first occurrence of the nth word in the ptr_file * (which holds all the index offset pointers for the document file). * * See the index building program "zndxr.c" for further details of the * index structure.... */ typedef struct { char kkey[KEY_LENGTH]; long ccount; } KEY_REC; #ifndef TRUE #define TRUE 1 #endif #ifndef FALSE #define FALSE 0 #endif /* some function prototypes */ #ifdef THINK_C void main(int argc, char *argv[]); void show_words (FILE *kf1, long t1, long d1, FILE *kf2, long t2, long d2, double threshhold, int show_new_words, int show_missing_words); FILE *open_kfile (char *fn); int fetch_next_word (KEY_REC *recp, FILE *key_file); void print_help_and_quit (void); int is_true (char *cmd); #endif /* *********************** main ******************************* */ void main (argc, argv) int argc; char *argv[]; { unsigned long start_time, end_time, time(); long d1, d2, t1, t2; int show_new_words, show_missing_words; FILE *kf1, *kf2, *open_kfile(); char *ctime(); double threshhold, atof(); KEY_REC last_word; void exit(), show_words(), print_help_and_quit(); /* get the command line arguments for Macintosh users ... */ #ifdef THINK_C argc = ccommand (&argv); #endif if (argc < 3) print_help_and_quit (); time (&start_time); printf ("Starting work: %s\n", ctime (&start_time)); kf1 = open_kfile (argv[1]); kf2 = open_kfile (argv[2]); if (argc > 3) threshhold = atof (argv[3]); else threshhold = 5.0; if (argc > 4) show_new_words = is_true (argv[4]); else show_new_words = TRUE; if (argc > 5) show_missing_words = is_true (argv[5]); else show_missing_words = TRUE; fseek (kf1, 0L, 2); d1 = ftell (kf1) / sizeof(KEY_REC); fseek (kf1, (long)(-sizeof(KEY_REC)), 2); fetch_next_word (&last_word, kf1); t1 = last_word.ccount; fseek (kf2, 0L, 2); d2 = ftell (kf2) / sizeof(KEY_REC); fseek (kf2, (long)(-sizeof(KEY_REC)), 2); fetch_next_word (&last_word, kf2); t2 = last_word.ccount; printf ("file total words distinct words\n"); printf ("---- ----------- --------------\n"); printf ("%-20s%11ld %11ld\n", argv[1], t1, d1); printf ("%-20s%11ld %11ld\n", argv[2], t2, d2); printf ("\nThreshhold surplus sigmas = %5.2f\n", threshhold); printf ("\nSurprising words in \"%s\" as compared with \"%s\":\n", argv[1], argv[2]); show_words (kf1, t1, d1, kf2, t2, d2, threshhold, show_new_words, show_missing_words); printf ("\nComparison of \"%s\" with \"%s\" completed!\n", argv[1], argv[2]); time (&end_time); printf ("Ending work: %s\n", ctime (&end_time)); printf ("Elapsed time = %ld seconds.\n", end_time - start_time); printf ("Comparison rate = %1.f words/second.\n", (float)(d1 + d2) / ( end_time - start_time )); } /* function to open the *.k keyfile */ FILE *open_kfile (fn) char *fn; { FILE *fopen(), *fp; void exit(); if ((fp = fopen (fn, "rb")) == NULL) { printf ("Fatal error in attempt to open file \"%s\"!\n", fn); exit (1); } return (fp); } /* function to get the next KEY_REC from key_file and store it in the * KEY_REC space pointed to by recp ... return TRUE if all is seemingly * well, and FALSE if attempt to read past EOF or other error ... in which * case be sure to set the kkey field to a high ASCII value, to win any * comparisons with other keys alphabetically -- use "~" (tilde) for * that purpose. */ int fetch_next_word (recp, key_file) KEY_REC *recp; FILE *key_file; { void exit(); char *strncpy(); if (! fread ((char *)recp, sizeof (KEY_REC), 1, key_file)) { strncpy (recp->kkey, "~", 1); return (FALSE); } return (TRUE); } /* function to print out a little help for the user ... */ void print_help_and_quit () { void exit(); printf ("zcmpndx: Not enough command line arguments!\n"); printf ("\nPlease provide the names of two `.k' index keyfiles:\n"); printf ("a file to be tested, and a standard (big, fiducial) file to\n"); printf ("compare it against. The program will then go through the\n"); printf ("first file and print out all of the terms in it that occur\n"); printf ("at a rate significantly above or below their expected\n"); printf ("rate, as defined by the second file. Optionally, put a\n"); printf ("threshhold different than the default (5.0 `sigma') on\n"); printf ("the command line after the two file names. To suppress display\n"); printf ("of new or missing words, put further nonzero parameters on\n"); printf ("the command line.\n"); printf ("\nThus: zcmpndx test_file.k standard_file.k [threshhold [no_new [no_missing]]]\n"); printf ("\nSend comments, etc. to the author, Mark^Zimmermann;\n"); printf ("science@oasys.dt.navy.mil; [75066,2044] on CompuServe;\n"); printf ("This is free software under GNU GPL & comes with NO WARRANTY WHATSOEVER!\n"); exit(0); } /* function to take a command-line value and make it TRUE or FALSE * in a reasonable way.... */ is_true (cmd) char *cmd; { if (cmd[1] == 'y' || cmd[1] == 'Y' || atoi (cmd) != 0) return (TRUE); else return (FALSE); } /* function to show the results... */ void show_words (kf1, t1, d1, kf2, t2, d2, threshhold, show_new_words, show_missing_words) FILE *kf1, *kf2; long t1, d1, t2, d2; double threshhold; int show_new_words, show_missing_words; { KEY_REC rec1, rec2; long prev_cc1, prev_cc2, n1, n2; int new_word2; double sigma, t1_over_t2, expect, sqrt(), fabs(); t1_over_t2 = (double)t1 / t2; fseek (kf1, 0L, 0); fseek (kf2, 0L, 0); prev_cc1 = 0; prev_cc2 = 0; fetch_next_word (&rec1, kf1); fetch_next_word (&rec2, kf2); new_word2 = TRUE; printf ("\n word sigma expected actual in std\n"); printf ( " ---- ----- -------- ------ ------\n"); do { while (strncmp (rec1.kkey, rec2.kkey, KEY_LENGTH) > 0) { if (show_missing_words && new_word2) { n2 = rec2.ccount - prev_cc2; expect = t1_over_t2 * n2; sigma = -sqrt (expect); if (fabs (sigma) > threshhold) printf ("%.12s %7.1f %7.1f 0%8ld\n", rec2.kkey, sigma, expect, n2); } prev_cc2 = rec2.ccount; fetch_next_word (&rec2, kf2); new_word2 = TRUE; } if (strncmp (rec1.kkey, rec2.kkey, KEY_LENGTH) == 0) { n1 = rec1.ccount - prev_cc1; n2 = rec2.ccount - prev_cc2; expect = t1_over_t2 * n2; sigma = (n1 - expect) / sqrt (expect); if (fabs (sigma) > threshhold) printf ("%.12s %7.1f %7.1f%8ld%8ld\n", rec1.kkey, sigma, expect, n1, n2); } else { if (show_new_words) printf ("%.12s --- 0%8ld 0\n", rec1.kkey, rec1.ccount - prev_cc1); } new_word2 = FALSE; prev_cc1 = rec1.ccount; } while (fetch_next_word (&rec1, kf1)); return; }