CorrelOracle01SourceCode

 

 #! /usr/bin/perl
 
 # [[CorrelOracle]] version 0.1 --- ^z = [[MarkZimmermann]] --- 25-26 Aug 2001
 # an attempt to auto-magically link Wiki page files based on their correlations
 # Thanks to Bo Leuf for kindness and help! (and code fragments)
 
 # WARNING! --- highly experimental! --- use only on a copy of "pages" files
 # THIS PROGRAM WILL MODIFY THE FILES IN "pages" BY APPENDING LINES TO THEM!
 # DO NOT USE FOR CONTROL OF NUCLEAR REACTORS, AIRCRAFT, OR MEDICAL EQUIPMENT
 # There are no guarantees in life --- your mileage may vary.
 
 # To experiment with this, invoke:
  #    perl [[CorrelOracle]].perl
  # from right above a "pages" directory *(A COPY, NOT THE ORIGINAL!)*
 # [[CorrelOracle]] will analyze the files in "pages" and will append a list of
 # perhaps-correlated files to the end of each file in "pages", based on
 # statistical co-occurrence of words
 
 # Some lines which might be appended to a file:
 #   ----
 #   ''(If you liked this page, the [[CorrelOracle]] (v.0.1) recommends:
 #   [[BuSab]] (1.7), [[LatePhysicists]] (0.6), [[GibbonReExaggeration2]] (0.6), ...)''
 
 # Algorithm:
 # * look through all the files in the "pages" directory
 # * split their contents into alphanumeric "words" and make them ALL CAPS
 # * build hashes (associative arrays) of the words and their occurrence rates
 # * compare the words in every file with every other file; compute "similarity"
 # * for each file, find the 3 most similar files and append their names
 
 # The "similarity" measure between two files is the sum of the individual
 # word similarities. The individual word similarities are the products of
 # the fraction of the total word occurrences that appear in each of the two
 # files being correlated, scaled by the lengths of the two files so that
 # little files have about as good a chance to play as do big ones.
 
 # Example: suppose that in a set of 1000 files, the word "THINK" occurs 10
 # times, and the average file contains 200 words. Consider file X of length
 # 111 words in which "THINK" occurs 2 times, and file Y of length 333 words
 # in which "THINK" occurs 3 times. The contribution of "THINK" to the "similarity"
 # of files X and Y is then (2/10) * (3/10) * (200/111) * (200/333) = 0.065
 # which gets added to the contributions of all the other words in the two files
 # to produce the total "similarity" of file X and file Y
 
 # NOTE:  I MADE UP THIS "SIMILARITY" MEASURE MYSELF! IT MAY BE BOGUS!
 # It has not been validated, and there is little or no "science" behind it.
 # But it does seem to work, more or less; when pages have a "similarity"
 # of >1, then they do seem to be at least somewhat related, in my tests.
 
 # Next steps: experiment with other metrics of similarity, perhaps involving
 # better definition of a "word", perhaps using a little linguistics (e.g.,
 # "stemming" to strip endings), perhaps analyzing co-occurrence of adjacent
 # word pairs rather than singleton words in isolation....
 
 # now the program --- begin by grabbing the pages one at a time
 
 # --- pass 1: read every page and build the word-occurrence tables ---
 # Globals produced here (consumed by the correlation pass below):
 #   @pages            - filenames under "pages" (dotfiles excluded)
 #   %globalwordcount  - occurrences of each word across ALL pages
 #   @filewordcount    - per-page hashref of word => count (parallel to @pages)
 #   @filetotalwords   - per-page total word count
 #   $fileavgwords     - mean words per page
 print "[[CorrelOracle]] version 0.1 --- HIGHLY EXPERIMENTAL!\n";
 print " (do not use without reading & understanding the code first)\n\n";
 opendir(DIR, "pages") or die "couldn't open 'pages'";
 @pages = grep !/^\./, readdir DIR;
 closedir(DIR);
 $pagecount = @pages;
 print "$pagecount pages to analyze\n";
 $i = 0;  # counter for use in loop over pages
 $j = 0;  # running total of all words seen, for the average below

 foreach $page (@pages) {
   if ( -e "pages/$page" ) {
     open(F, "<", "pages/$page") or die "$page: $!";
     print "  $page ... ";
     # slurp the whole file at once: localizing $/ undefines the input
     # record separator for this block only and restores it automatically
     # on exit (replaces the save/undef/restore dance, which had also been
     # mangled in transcription)
     $body = do { local $/; <F> };
     close(F);
   } else {
     die "$page didn't exist: $!";
   }

 # capitalize and split apart the words
   @words = split('\W+', uc($body));

 # remove leading null string produced by split when file begins with delimiter
   shift @words if $words[0] eq "";

 # count each word's occurrence rate, globally and in this particular file
   foreach $word (@words) {
     $globalwordcount{$word}++;
     $filewordcount[$i]{$word}++;
   }
   print scalar(@words), "  words.\n";
   $filetotalwords[$i] = scalar(@words);
   $j += scalar(@words);
   ++$i;
 }
 $fileavgwords = $j / $pagecount;
 print "    (average $fileavgwords words/file)\n";
 
 # now for each file compute a correlation with every other one,
 # and append information on the best 3 matches to the end of the file
 
 # --- pass 2: correlate every page against every other page, then append
 # the top 3 matches to the end of the page file itself ---
 for ($i = 0; $i < $pagecount; ++$i) {
   print "$pages[$i] best matches:\n  ";
   %pagesim = ();
   for ($j = 0; $j < $pagecount; ++$j) {
     next if $j == $i;              # don't correlate a page with itself!
     next if !$filetotalwords[$j];  # skip empty pages (avoids divide-by-zero)
     $similarity = 0;
 # similarity measure is product of the fraction of word occurrences in each file
 # (so words which are widely distributed have little weight) and then
 # normalized by the file lengths (to keep longer files from always winning)
     foreach $word (keys %{$filewordcount[$i]}) {
       $similarity += $filewordcount[$i]{$word} * $filewordcount[$j]{$word} /
                         ($globalwordcount{$word} * $globalwordcount{$word});
     }
     $pagesim{$pages[$j]} = $similarity * $fileavgwords * $fileavgwords /
                              ($filetotalwords[$i] * $filetotalwords[$j]);
   }
   @tmp = sort { $pagesim{$b} <=> $pagesim{$a} }
            keys %pagesim;
   # never report more matches than actually exist (fewer than 4 pages total)
   $nbest = @tmp < 3 ? scalar(@tmp) : 3;
   for ($j = 0; $j < $nbest; ++$j) {
     printf "%s (%3.1f), ", $tmp[$j], $pagesim{$tmp[$j]};
   }
 # HERE COMES THE FILE MODIFICATION! --- COMMENT OUT TO AVOID WRITING CHANGES!
 # (bug fix: original tested -e "pages/$page[$i]" -- the empty array @page,
 #  not @pages -- so the existence check never looked at the right file)
   if ( -e "pages/$pages[$i]" ) {
     open(F, ">>", "pages/$pages[$i]") or die "$pages[$i]: $!";
     print F "\n\n----\n\n";
     print F "\'\'(If you liked this page, the [[CorrelOracle]] (v.0.1) recommends: ";
     for ($j = 0; $j < $nbest; ++$j) {
       printf F "%s (%3.1f), ", $tmp[$j], $pagesim{$tmp[$j]};
     }
     print F "...)\'\'\n";
     close(F);
   }
   print "...\n";
 # done with this file ... on to the next!
 }


(correlates: CorrelOracle02SourceCode, CorrelOracle3SourceCode, SnipPattern01SourceCode, ...)

1 person liked this