CorrelOracle3SourceCode

 

The program that generated cross-links among ZhurnalWiki pages follows; see CorrelOracle3 (and CorrelOracle2 and CorrelOracle) for further discussion.


 #! /usr/bin/perl

 # [[CorrelOracle]] version 0.3 --- ^z = [[MarkZimmermann]] --- 25 Aug - 15 Sep 2001
 # an attempt to auto-magically link Wiki page files based on their correlations
 # Thanks to Bo Leuf for kindness and help!

 # WARNING! --- highly experimental! --- use only on a copy of "pages" files
 # THIS PROGRAM WILL MODIFY THE FILES IN "pages/" BY APPENDING LINES TO THEM!

 # Changes from [[CorrelOracle]] version 0.2:
 #  * add days of the week, months of the year, time zone, "Datetag" to stopwords
 #  * modified "words" to store "-" at the end of terms
 #  * added two-word phrases to "words"
 #  * improved annotations with reasons for links

 # Changes from [[CorrelOracle]] version 0.1:
 #  * words are split on non-alphabetic characters = [^A-Za-z] rather than on
 #       Perl's "nonword characters" = \W = [^A-Za-z_0-9]
 #  * "stopwords" are removed: the and of to in that you for it was is as have not
 #       with be your at we on he by but my this his which from are all me so 
 #       one if they had has been would she or there her his an when a b c d e 
 #       f g h i j k l m n o p q r s t u v w x y z 
 #     ((list adapted from Ayres as cited in NOTESCRIPT --- see [[HandOfOnesOwn]]))
 #  * "stemming" is done using the following rules:
 #       - drop final "ly" "y" "ies" "es" "s" "e" "ied" "ed" "ing"
 #       - then drop final letter of a doubled pair
 #       - accept resulting stem if 3 or more letters long
 #  * [[CorrelationLog]] file is created to store info about the process for later
 #       study, debugging, and idea-generation that may lead to improvements 

 # To experiment with [[CorrelOracle]], invoke:
 #    perl [[CorrelOracle]].perl
 # from right above "pages" directory *(A COPY, NOT THE ORIGINAL!)*
 # [[CorrelOracle]] will analyze the files in "pages" and will append links to
 # perhaps-correlated files for each file in "pages", based upon
 # statistical co-occurrence of words. [[CorrelOracle]] will also create
 # the file [[CorrelationLog]] with details of the correlations.

 # Note that [[CorrelOracle]] takes everything it sees as potential "words";
 # that is, it assumes that the pages it sees are clean,
 # without embedded tags or symbols or other word-like mark-up --- so
 # it may be wise to run a separate clean-up program to strip out
 # such tags from marked-up pages. See [[SnipPattern]] ....

 # Sample result: [[CorrelOracle]] might append to a file lines like
 #   ----
 #   If you liked this page, [[CorrelOracle3]] suggests: [[BooksToConsider]]
 #   //(0.4:/marot-/beau-/dougla-/)//, [[OutOfSync]] //(0.3:/infanc-/feel-/know-/)//,
 #   [[TinyTrainsAndVenetianGlass]] //(0.3:/align-/throughout-/deriv-/)//, ...

 # [[CorrelOracle]] Similarity Measure --- Algorithm Summary:
 # * look through all the files in the "pages" directory
 # * split their contents into alphanumeric "words" and make them lower case
 # * remove stopwords & stem the remaining words
 # * build hashes (associative arrays) of the words and their occurrence rates
 # * compare the words in every file with every other file; compute "similarity"
 # * for each file, find the 3 most similar files and append their names
 # * store more details in [[CorrelationLog]]

 # The "similarity" measure between two files is the sum of the individual
 # word similarities. The individual word similarities are the products of
 # the fraction of the total word occurrences that appear in each of the two
 # files being correlated, scaled by the lengths of the two files (so that
 # little files have about as good a chance to play as do big ones).

 # Example: suppose that in a set of 1000 files, the word "THINK" occurs 10
 # times, and the average file contains 200 words. Consider file X of length
 # 111 words in which "THINK" occurs 2 times, and file Y of length 333 words
 # in which "THINK" occurs 3 times. The contribution of "THINK" to the "similarity"
 # of files X and Y is then (2/10) * (3/10) * (200/111) * (200/333) = 0.065
 # which gets added to the contributions of all the other words in the two files
 # to produce the total "similarity" of file X and file Y

 # NOTE:  I MADE UP THIS "SIMILARITY" MEASURE MYSELF! IT MAY BE BOGUS!
 # It has not been validated, and there is little or no "science" behind it.
 # But it does seem to work, more or less; when pages have a "similarity"
 # of >1, then they do seem to be at least somewhat related, in my tests.

 # Future steps: experiment with other metrics of similarity, perhaps involving
 # better definition of a "word"; analyze co-occurrence of adjacent
 # word pairs rather than singleton words in isolation; explore similarity
 # metrics based on letter patterns ("N-Grams") instead of "words", ....

 # now begin the [[CorrelOracle]] program: start by grabbing pages one at a time

 print "[[CorrelOracle3]] --- EXPERIMENTAL!\n";
 opendir(DIR, "pages") or die "couldn't open 'pages'";
 @pages = grep !/^\./, readdir DIR;
 $pagecount = @pages;
 print "$pagecount pages to analyze\n";
 $i = 0;  # counter for use in loop over pages
 %pagenum = (); # hash to convert page names to numbers
 undef \\; # read entire files at once

 foreach $page (@pages) {
  if ( -e "pages/$page" ) {
   open(F, "pages/$page") or die "$page: $!";
   print "  $page ... ";
   $body = <F>;
   close(F);
  } else {
      die "$page didn't exist: $!";
  }

 # convert to lower case and split apart the words
  @rawwords = split(/[^a-z]+/, lc($body));

 # remove leading null string given by split() if file starts with delimiter
  shift @rawwords if $rawwords[0] eq "";

 # remove stopwords --- note the spaces around each word in the stopword string!
  $stopwords = " the and of to in that you for it was is as have not with be your
 at we on he by but my this his which from are all me so one if they had has been
 would she or there her his an when a b c d e f g h i j k l m n o p q r s t u v w
 x y z datetag est edt sunday monday tuesday wednesday thursday friday saturday
 january february march april may june july august september october november
 december ";
  @words = ();
  foreach $word (@rawwords) {
    if ($stopwords !~ / $word /) {
      push @words, $word;
    }
  }

 # stem the words simplemindedly, just enough to improve matching among files
  $wordcount = scalar(@words);
  for ($j = 0; $j < $wordcount; ++$j) {
    $tmp = $words[$j];
    $tmp =~ s/e$|ly$|y$|ies$|es$|s$|ied$|ed$|ing\\/;  # drop endings
    $tmp =~ s/(.)\1\\$1/;    # drop final character of doubled pair
    if (length($tmp) > 2) {
      $words[$j] = $tmp;     # accept result if 3 or more letters long
    }
    $words[$j] = "$words[$j]-"; # append "-" to indicate possible stemming
  }

 # include all adjacent two-word phrases in @words
  for ($j = 1; $j < $wordcount; ++$j) {
    push @words, "$words[$j-1] $words[$j]";
  }

 # count each word's occurrence rate, globally and in this particular file
  foreach $word (@words) {
    $globalwordcount{$word}++;
    $filewordcount[$i]{$word}++;
  }

  print $wordcount, "  words.\n";
  $filetotalwords[$i] = $wordcount;
  $k += $wordcount;
  $pagenum{$page} = $i;
  ++$i;
}

 $fileavgwords = $k / $pagecount;
 print "    (average $fileavgwords words/file)\n";

 # now for every file, compute a correlation with every other one,
 # and append information on the best 3 matches to the end of the file,
 # keeping a record of interesting data in [[CorrelationLog]]

 open(G, ">[[CorrelationLog]]") or die "[[CorrelationLog]]: $!";
 print G "Details of [[CorrelOracle3]] [[ZhurnalWiki]] page analysis:\n\n";

 for ($i = 0; $i < $pagecount; ++$i) {
  print "$pages[$i] best matches ";
  %pagesim = ();
  for ($j = 0; $j < $pagecount; ++$j) {
    if ($j == $i) {
      next;    # don't correlate a page with itself!
    }
    $similarity = 0;

 # similarity measure is product of the fraction of word occurrences in each file
 # (so words which are widely distributed have little weight) and then
 # normalized by the file lengths (to keep longer files from always winning)

    foreach $word (keys %{$filewordcount[$i]}) {
      if (exists $filewordcount[$j]{$word}) {
        $similarity += $filewordcount[$i]{$word} * $filewordcount[$j]{$word} / 
                        ($globalwordcount{$word} * $globalwordcount{$word});
      }
    }
    $pagesim{$pages[$j]} = $similarity * $fileavgwords * $fileavgwords /
                             ($filetotalwords[$i] * $filetotalwords[$j]);
  }

 # sort the page similarity results so that the best matches come first
  @tmp = sort { $pagesim{$b} <=> $pagesim{$a} } keys %pagesim;

 # for the best three matches, find out the words that contributed most to
 # the correlation, and record the first five (or less) of them in [[CorrelationLog]]
 # and at the end of the Wiki page

  print G "$pages[$i]:\n";
  open(F, ">>pages/$pages[$i]") or die "$pages[$i]: $!";
  print F "\n\n----\n\n";
  print F "\If you liked this page, [[CorrelOracle3]] suggests: ";
  for ($j = 0; $j < 3; ++$j) {
    $k = $pagenum{$tmp[$j]};
    %wordsim = ();
    foreach $word (keys %{$filewordcount[$i]}) {
      if (exists $filewordcount[$k]{$word}) {
        $wordsim{$word} = $filewordcount[$i]{$word} * $filewordcount[$k]{$word} /
                        ($globalwordcount{$word} * $globalwordcount{$word});
      }
    }
    @bestwords = sort { $wordsim{$b} <=> $wordsim{$a} } keys %wordsim;
    printf G "* %s: %4.2f = ", $tmp[$j], $pagesim{$tmp[$j]};
    printf F "%s \'\'(%3.1f/", $tmp[$j], $pagesim{$tmp[$j]};
    printf "%s (%3.1f:/", $tmp[$j], $pagesim{$tmp[$j]};
    for ($n = 0; $n < 5 && $n < @bestwords; ++$n) {
      $x = $fileavgwords * $fileavgwords *
           $wordsim{$bestwords[$n]} /($filetotalwords[$i]*$filetotalwords[$k]);
      last if ($x < 0.01);
      printf G "%4.2f %s, ",  $x, $bestwords[$n];
      printf F "%s/", $bestwords[$n];
      printf "%s/", $bestwords[$n];
    }
    printf G "...\n";
    printf F ")\'\', ";
    printf "), ";
  }
  print F "...\n";
  print "...\n";
  close(F);
  print "...\n";
 }
 close(G);

(correlates: CorrelOracle02SourceCode, CorrelOracle01SourceCode, SnipPattern01SourceCode, ...)