CorrelOracle02SourceCode

#! /usr/bin/perl

# CorrelOracle version 0.2 --- ^z = MarkZimmermann --- 25 Aug - 8 Sep 2001
# an attempt to auto-magically link Wiki page files based on their correlations
# Thanks to Bo Leuf for kindness and help!

# WARNING! --- highly experimental! --- use only on a copy of "pages" files
# THIS PROGRAM WILL MODIFY THE FILES IN "pages/" BY APPENDING LINES TO THEM!

# Changes from CorrelOracle version 0.1:
#  * words are split on non-alphabetic characters = [^A-Za-z] rather than on
#       Perl's "nonword characters" = \W = [^A-Za-z_0-9]
#  * "stopwords" are removed: the and of to in that you for it was is as have not 
#       with be your at we on he by but my this his which from are all me so 
#       one if they had has been would she or there her his an when a b c d e 
#       f g h i j k l m n o p q r s t u v w x y z 
#     ((list adapted from Ayres as cited in NOTESCRIPT --- see HandOfOnesOwn))
#  * "stemming" is done using the following rules:
#       - drop final "ly" "y" "ies" "es" "s" "e" "ied" "ed" "ing"
#       - then drop final letter of a doubled pair
#       - accept resulting stem if 3 or more letters long
#  * CorrelationLog file is created to store info about the process for later
#       study, debugging, and idea-generation that may lead to improvements

# To experiment with CorrelOracle, invoke:
#    perl CorrelOracle.perl
# from right above "pages" directory *(A COPY, NOT THE ORIGINAL!)*
# CorrelOracle will analyze the files in "pages" and will append links to
# perhaps-correlated files for each file in "pages", based upon
# statistical co-occurrence of words. CorrelOracle will also create
# the file CorrelationLog with details of the correlations.

# Note that CorrelOracle takes everything it sees as potential "words";
# that is, it assumes that the pages it sees are clean,
# without embedded tags or symbols or other word-like mark-up --- so
# it may be wise to run a separate clean-up program to strip out
# such tags from marked-up pages. See SnipPattern ....

# Sample result: CorrelOracle might append to a file lines like
#   ----
#   If you liked this page, CorrelOracle2 suggests: BooksToConsider
#   //(0.4:/marot-/beau-/dougla-/)//, OutOfSync //(0.3:/infanc-/feel-/know-/)//,
#   TinyTrainsAndVenetianGlass //(0.3:/align-/throughout-/deriv-/)//, ...

# CorrelOracle Similarity Measure --- Algorithm Summary:
# * look through all the files in the "pages" directory
# * split their contents into alphanumeric "words" and make them lower case
# * remove stopwords & stem the remaining words
# * build hashes (associative arrays) of the words and their occurrence rates
# * compare the words in every file with every other file; compute "similarity"
# * for each file, find the 3 most similar files and append their names
# * store more details in CorrelationLog

# The "similarity" measure between two files is the sum of the individual
# word similarities. The individual word similarities are the products of
# the fraction of the total word occurrences that appear in each of the two
# files being correlated, scaled by the lengths of the two files (so that
# little files have about as good a chance to play as do big ones).

# Example: suppose that in a set of 1000 files, the word "THINK" occurs 10
# times, and the average file contains 200 words. Consider file X of length
# 111 words in which "THINK" occurs 2 times, and file Y of length 333 words
# in which "THINK" occurs 3 times. The contribution of "THINK" to the "similarity"
# of files X and Y is then (2/10) * (3/10) * (200/111) * (200/333) = 0.065
# which gets added to the contributions of all the other words in the two files
# to produce the total "similarity" of file X and file Y

# NOTE:  I MADE UP THIS "SIMILARITY" MEASURE MYSELF! IT MAY BE BOGUS!
# It has not been validated, and there is little or no "science" behind it.
# But it does seem to work, more or less; when pages have a "similarity"
# of >1, then they do seem to be at least somewhat related, in my tests.

# Future steps: experiment with other metrics of similarity, perhaps involving
# better definition of a "word"; analyze co-occurrence of adjacent
# word pairs rather than singleton words in isolation; explore similarity
# metrics based on letter patterns ("N-Grams") instead of "words", ....

# now begin the CorrelOracle program: start by grabbing pages one at a time

print "CorrelOracle2 --- EXPERIMENTAL!\n";
opendir(DIR, "pages") or die "couldn't open 'pages'";
@pages = grep !/^\./, readdir DIR;
$pagecount = @pages;
print "$pagecount pages to analyze\n";
$i = 0;  # counter for use in loop over pages
%pagenum = (); # hash to convert page names to numbers
undef \\; # read entire files at once

foreach $page (@pages) {
  if ( -e "pages/$page" ) {
   open(F, "pages/$page") or die "$page: $!";
   print "  $page ... ";
   $body = <F>;
   close(F);
  } else {
      die "$page didn't exist: $!";
  }

# convert to lower case and split apart the words
  @rawwords = split(/[^a-z]+/, lc($body));

# remove leading null string given by split() if file starts with delimiter
  shift @rawwords if $rawwords[0] eq "";

# remove stopwords; note the spaces around each word in the stopword string!
  $stopwords = " the and of to in that you for it was is as have not with be your at we on he by but my this his which from are all me so one if they had has been would she or there her his an when a b c d e f g h i j k l m n o p q r s t u v w x y z ";
  @words = ();
  foreach $word (@rawwords) {
    if ($stopwords !~ / $word /) {
      push @words, $word;
    }
  }

# stem the words
  $wordcount = scalar(@words);
  for ($j = 0; $j < $wordcount; ++$j) {
    $tmp = $words[$j];
    $tmp =~ s/e$|ly$|y$|ies$|es$|s$|ied$|ed$|ing\\/;  # drop endings
    $tmp =~ s/(.)\1\\$1/;    # drop final character of doubled pair
    if (length($tmp) > 2) {
      $words[$j] = $tmp;     # accept result if 3 or more letters long
    }
  }

# count each word's occurrence rate, globally and in this particular file
  foreach $word (@words) {
    $globalwordcount{$word}++;
    $filewordcount[$i]{$word}++;
  }

  print $wordcount, "  words.\n";
  $filetotalwords[$i] = $wordcount;
  $k += $wordcount;
  $pagenum{$page} = $i;
  ++$i;
}

$fileavgwords = $k / $pagecount;
print "    (average $fileavgwords words/file)\n";

# now for every file, compute a correlation with every other one,
# and append information on the best 3 matches to the end of the file,
# keeping a record of interesting data in CorrelationLog

open(G, ">CorrelationLog") or die "CorrelationLog: $!";
print G "Details of CorrelOracle2 ZhurnalWiki page analysis:\n\n";

for ($i = 0; $i < $pagecount; ++$i) {
  print "$pages[$i] best matches ";
  %pagesim = ();
  for ($j = 0; $j < $pagecount; ++$j) {
    if ($j == $i) {
      next;    # don't correlate a page with itself!
    }
    $similarity = 0;

# similarity measure is product of the fraction of word occurrences in each file
# (so words which are widely distributed have little weight) and then
# normalized by the file lengths (to keep longer files from always winning)


    foreach $word (keys %{$filewordcount[$i]}) {
      if (exists $filewordcount[$j]{$word}) {
        $similarity += $filewordcount[$i]{$word} * $filewordcount[$j]{$word} / 
                        ($globalwordcount{$word} * $globalwordcount{$word});
      }
    }
    $pagesim{$pages[$j]} = $similarity * $fileavgwords * $fileavgwords /
                             ($filetotalwords[$i] * $filetotalwords[$j]);
  }

# sort the page similarity results so that the best matches come first
  @tmp = sort { $pagesim{$b} <=> $pagesim{$a} } keys %pagesim;

# for the best three matches, find out the best words that contributed to
# the correlation, and record the first few of them in CorrelationLog and
# at the end of the Wiki page

  print G "$pages[$i]:\n";
  open(F, ">>pages/$pages[$i]") or die "$pages[$i]: $!";
  print F "\n\n----\n\n";
  print F "\If you liked this page, CorrelOracle2 suggests: ";
  for ($j = 0; $j < 3; ++$j) {
    $k = $pagenum{$tmp[$j]};
    %wordsim = ();
    foreach $word (keys %{$filewordcount[$i]}) {
      if (exists $filewordcount[$k]{$word}) {
        $wordsim{$word} = $filewordcount[$i]{$word} * $filewordcount[$k]{$word} /
                        ($globalwordcount{$word} * $globalwordcount{$word});
      }
    }
    @bestwords = sort { $wordsim{$b} <=> $wordsim{$a} } keys %wordsim;
    printf G "* %s: %4.2f = ", $tmp[$j], $pagesim{$tmp[$j]};
    printf F "%s \'\'(%3.1f/", $tmp[$j], $pagesim{$tmp[$j]};
    printf "%s (%3.1f:/", $tmp[$j], $pagesim{$tmp[$j]};
    for ($n = 0; $n < 3 && $n < @bestwords; ++$n) {
      printf G "%4.2f %s-, ",  $fileavgwords * $fileavgwords *
           $wordsim{$bestwords[$n]} /($filetotalwords[$i]*$filetotalwords[$k]),
           $bestwords[$n];
      printf F "%s-/", $bestwords[$n];
      printf "%s-/", $bestwords[$n];
    }
    printf G "...\n";
    printf F ")\'\', ";
    printf "), ";
  }
  print F "...\n";
  print "...\n";
  close(F);
  print "...\n";

}
close(G);

TopicProgramming - Datetag20010909


(correlates: CorrelOracle3SourceCode, CorrelOracle01SourceCode, SnipPattern01SourceCode, ...)