#! /usr/bin/perl
# [[CorrelOracle]] version 0.1 --- ^z = [[MarkZimmermann]] --- 25-26 Aug 2001
# an attempt to auto-magically link Wiki page files based on their correlations
# Thanks to Bo Leuf for kindness and help! (and code fragments)
# WARNING! --- highly experimental! --- use only on a copy of "pages" files
# THIS PROGRAM WILL MODIFY THE FILES IN "pages" BY APPENDING LINES TO THEM!
# DO NOT USE FOR CONTROL OF NUCLEAR REACTORS, AIRCRAFT, OR MEDICAL EQUIPMENT
# There are no guarantees in life --- your mileage may vary.
# To experiment with this, invoke:
# perl [[CorrelOracle]].perl
# from right above a "pages" directory *(A COPY, NOT THE ORIGINAL!)*
# [[CorrelOracle]] will analyze the files in "pages" and will append a list of
# perhaps-correlated files to the end of each file in "pages", based on
# statistical co-occurrence of words
# Some lines which might be appended to a file:
# ----
# //If you liked this page, the [[CorrelOracle]] (v.0.1) recommends:
# [[BuSab]] (1.7), [[LatePhysicists]] (0.6), [[GibbonReExaggeration2]] (0.6), ...//
# Algorithm:
# * look through all the files in the "pages" directory
# * split their contents into alphanumeric "words" and make them ALL CAPS
# * build hashes (associative arrays) of the words and their occurrence rates
# * compare the words in every file with every other file; compute "similarity"
# * for each file, find the 3 most similar files and append their names
# The "similarity" measure between two files is the sum of the individual
# word similarities. The individual word similarities are the products of
# the fraction of the total word occurrences that appear in each of the two
# files being correlated, scaled by the lengths of the two files so that
# little files have about as good a chance to play as do big ones.
# Example: suppose that in a set of 1000 files, the word "THINK" occurs 10
# times, and the average file contains 200 words. Consider file X of length
# 111 words in which "THINK" occurs 2 times, and file Y of length 333 words
# in which "THINK" occurs 3 times. The contribution of "THINK" to the "similarity"
# of files X and Y is then (2/10) * (3/10) * (200/111) * (200/333) = 0.065
# which gets added to the contributions of all the other words in the two files
# to produce the total "similarity" of file X and file Y
# NOTE: I MADE UP THIS "SIMILARITY" MEASURE MYSELF! IT MAY BE BOGUS!
# It has not been validated, and there is little or no "science" behind it.
# But it does seem to work, more or less; when pages have a "similarity"
# of >1, then they do seem to be at least somewhat related, in my tests.
# Next steps: experiment with other metrics of similarity, perhaps involving
# better definition of a "word", perhaps using a little linguistics (e.g.,
# "stemming" to strip endings), perhaps analyzing co-occurrence of adjacent
# word pairs rather than singleton words in isolation....
# now the program --- begin by grabbing the pages one at a time
use strict;
use warnings;

print "[[CorrelOracle]] version 0.1 --- HIGHLY EXPERIMENTAL!\n";
print " (do not use without reading & understanding the code first)\n\n";

# Grab the page list (skip dotfiles such as "." and "..").
opendir my $dh, 'pages' or die "couldn't open 'pages': $!";
my @pages = grep { !/^\./ } readdir $dh;
closedir $dh;

my $pagecount = @pages;
print "$pagecount pages to analyze\n";
die "no pages found in 'pages' --- nothing to do\n" unless $pagecount;

my %globalwordcount;    # word => total occurrences across all pages
my @filewordcount;      # per-page hashref: word => occurrences in that page
my @filetotalwords;     # per-page total word count
my $totalwords = 0;     # grand total, for the average-words-per-file figure

# Pass 1: read every page, split into ALL-CAPS alphanumeric "words",
# and build the global and per-file occurrence counts.
my $i = 0;
foreach my $page (@pages) {
    die "$page didn't exist: $!" unless -e "pages/$page";
    # Three-arg open with a lexical handle; slurp the whole file at once
    # by locally undefining the input record separator $/.
    open my $fh, '<', "pages/$page" or die "$page: $!";
    print " $page ... ";
    my $body = do { local $/; <$fh> };
    close $fh;

    # Capitalize and split apart the words.
    my @words = split /\W+/, uc $body;
    # Remove leading null string produced by split when file begins with delimiter.
    shift @words if @words && $words[0] eq '';

    foreach my $word (@words) {
        $globalwordcount{$word}++;
        $filewordcount[$i]{$word}++;
    }
    print scalar(@words), " words.\n";
    $filetotalwords[$i] = scalar @words;
    $totalwords += scalar @words;
    ++$i;
}

my $fileavgwords = $totalwords / $pagecount;
print " (average $fileavgwords words/file)\n";

# Pass 2: for each file compute a correlation with every other one,
# and append information on the best (up to 3) matches to the end of the file.
for my $i (0 .. $pagecount - 1) {
    print "$pages[$i] best matches:\n ";
    my %pagesim;
    for my $j (0 .. $pagecount - 1) {
        next if $j == $i;    # don't correlate a page with itself!
        # Similarity is the sum over shared words of the product of each
        # file's share of that word's global occurrences (so widely
        # distributed words carry little weight), normalized by the file
        # lengths so longer files don't always win.
        my $similarity = 0;
        foreach my $word (keys %{ $filewordcount[$i] }) {
            next unless $filewordcount[$j]{$word};    # only shared words contribute
            $similarity += $filewordcount[$i]{$word} * $filewordcount[$j]{$word}
                         / ($globalwordcount{$word} * $globalwordcount{$word});
        }
        # Guard against empty files (zero word count) in the denominator.
        my $lengths = $filetotalwords[$i] * $filetotalwords[$j];
        $pagesim{ $pages[$j] } = $lengths
            ? $similarity * $fileavgwords * $fileavgwords / $lengths
            : 0;
    }

    # Rank the other pages by similarity, best first; keep at most 3.
    my @best = sort { $pagesim{$b} <=> $pagesim{$a} } keys %pagesim;
    splice @best, 3 if @best > 3;
    printf "%s (%3.1f), ", $_, $pagesim{$_} for @best;

    # HERE COMES THE FILE MODIFICATION! --- COMMENT OUT TO AVOID WRITING CHANGES!
    # (Fixed: the original tested -e "pages/$page[$i]" --- a typo for @pages.)
    if ( -e "pages/$pages[$i]" ) {
        open my $out, '>>', "pages/$pages[$i]" or die "$pages[$i]: $!";
        print {$out} "\n\n----\n\n";
        print {$out} "\'\'(If you liked this page, the [[CorrelOracle]] (v.0.1) recommends: ";
        printf {$out} "%s (%3.1f), ", $_, $pagesim{$_} for @best;
        print {$out} "...)\'\'\n";
        close $out or die "$pages[$i]: $!";
    }
    print "...\n";
    # done with this file ... on to the next!
}
# (correlates: CorrelOracle02SourceCode, CorrelOracle3SourceCode, SnipPattern01SourceCode, ...)