#!/usr/bin/perl
# [[CorrelOracle]] version 0.1 --- ^z = [[MarkZimmermann]] --- 25-26 Aug 2001
# An attempt to auto-magically link Wiki page files based on their correlations.
# Thanks to Bo Leuf for kindness and help! (and code fragments)
#
# WARNING! --- highly experimental! --- use only on a COPY of the "pages" files.
# THIS PROGRAM WILL MODIFY THE FILES IN "pages" BY APPENDING LINES TO THEM!
# DO NOT USE FOR CONTROL OF NUCLEAR REACTORS, AIRCRAFT, OR MEDICAL EQUIPMENT.
#
# Usage:  perl CorrelOracle.perl   (run from the directory ABOVE "pages")
#
# Algorithm:
#   * read every file in the "pages" directory
#   * split contents into alphanumeric "words", uppercased
#   * build hashes of word occurrence counts, globally and per file
#   * compute a pairwise "similarity" between every two files
#   * append the 3 best matches to the end of each file
#
# The "similarity" of files X and Y is the sum over shared words of
#   (count_in_X / global_count) * (count_in_Y / global_count)
# scaled by (avg_file_len / len_X) * (avg_file_len / len_Y) so that short
# files have about as good a chance to play as long ones.
#
# NOTE: this similarity measure is home-grown and unvalidated — but in
# informal tests, pairs scoring >1 do tend to be at least somewhat related.

use strict;
use warnings;

print "[[CorrelOracle]] version 0.1 --- HIGHLY EXPERIMENTAL!\n";
print " (do not use without reading & understanding the code first)\n\n";

opendir(my $dh, "pages") or die "couldn't open 'pages'";
my @pages = grep { !/^\./ } readdir $dh;   # skip dot-files (".", "..", etc.)
closedir $dh;

my $pagecount = @pages;
print "$pagecount pages to analyze\n";

my %globalwordcount;    # word  -> total occurrences across all pages
my @filewordcount;      # index -> { word -> occurrences in that page }
my @filetotalwords;     # index -> word count of that page
my $totalwords = 0;     # running total, for the average

# Pass 1: slurp each page and tally word occurrences.
my $i = 0;
foreach my $page (@pages) {
    # NOTE: readdir just listed these names, so a missing file here means
    # something removed it out from under us — die loudly.
    -e "pages/$page" or die "$page didn't exist: $!";
    open(my $fh, '<', "pages/$page") or die "$page: $!";
    print " $page ... ";
    my $body = do { local $/; <$fh> };   # slurp whole file at once
    close($fh);

    # Capitalize and split apart the words.
    my @words = split /\W+/, uc($body);
    # Remove leading null string produced by split when file begins with a delimiter.
    shift @words if @words and $words[0] eq "";

    # Count each word's occurrence rate, globally and in this particular file.
    foreach my $word (@words) {
        $globalwordcount{$word}++;
        $filewordcount[$i]{$word}++;
    }
    print scalar(@words), " words.\n";
    $filetotalwords[$i] = scalar(@words);
    $totalwords += scalar(@words);
    ++$i;
}

# Guard against an empty "pages" directory (avoids division by zero).
my $fileavgwords = $pagecount ? $totalwords / $pagecount : 0;
print " (average $fileavgwords words/file)\n";

# Pass 2: for each file compute a correlation with every other one,
# and append information on the best 3 matches to the end of the file.
for my $i (0 .. $pagecount - 1) {
    print "$pages[$i] best matches:\n ";
    my %pagesim;
    for my $j (0 .. $pagecount - 1) {
        next if $j == $i;   # don't correlate a page with itself!
        # Similarity is the product of the fraction of word occurrences in
        # each file (so widely-distributed words carry little weight),
        # normalized by file lengths (to keep long files from always winning).
        my $similarity = 0;
        foreach my $word (keys %{ $filewordcount[$i] }) {
            next unless $filewordcount[$j]{$word};   # absent word contributes 0
            $similarity += $filewordcount[$i]{$word} * $filewordcount[$j]{$word}
                         / ($globalwordcount{$word} * $globalwordcount{$word});
        }
        $pagesim{ $pages[$j] } = $similarity * $fileavgwords * $fileavgwords
                               / ($filetotalwords[$i] * $filetotalwords[$j]);
    }

    my @best = sort { $pagesim{$b} <=> $pagesim{$a} } keys %pagesim;
    for my $j (0 .. 2) {
        last if $j > $#best;   # fewer than 3 other pages exist
        printf "%s (%3.1f), ", $best[$j], $pagesim{ $best[$j] };
    }

    # HERE COMES THE FILE MODIFICATION! --- COMMENT OUT TO AVOID WRITING CHANGES!
    if ( -e "pages/$pages[$i]" ) {   # was "$page[$i]" — typo'd array name
        open(my $out, '>>', "pages/$pages[$i]") or die "$pages[$i]: $!";
        print $out "\n\n----\n\n";
        print $out "\'\'(If you liked this page, the [[CorrelOracle]] (v.0.1) recommends: ";
        for my $j (0 .. 2) {
            last if $j > $#best;
            printf $out "%s (%3.1f), ", $best[$j], $pagesim{ $best[$j] };
        }
        print $out "...)\'\'\n";
        close($out) or die "$pages[$i]: $!";
    }
    print "...\n";
    # done with this file ... on to the next!
}
(correlates: CorrelOracle02SourceCode, CorrelOracle3SourceCode, SnipPattern01SourceCode, ...)