Crazy Talk

This is a program that I wrote about ten years ago, to generate nonsense from a given input text. It builds a statistical model of the input text, which allows it to determine, given any two words, what the most likely third word is. Then it can use the last two words of those three to pick a new word, and so on. Since it doesn’t always pick the most likely word, but selects randomly from all options based on likelihood, it tends to create abrupt switches of concepts around short phrases like “it to” or “in the”.

For bonus points, it posts to a Blogger blog, assuming they haven’t changed their API.

It’s written in Perl, and I assume still works. If not, updating it to Python would be fun, and a good excuse for me to practice my Python.

#!/usr/bin/perl
#
# Generate a bunch of crazy text, and post it to a blog
#
#

use Net::Blogger;
use Getopt::Std;

# From a list of words and their probabilities, select one,
# weighted by probability.
sub select_random
{
   my %choices = %{$_[0]}; #Get a hash from a reference. This is goddamn ugly.
   my $hash_sum = 0;

   # Get the sums of all the probabilites of the possible next word.
   foreach $key (keys(%choices))
   {
      $hash_sum += $choices{$key};
   }

   my $random = (rand() * $hash_sum);

   # Do the actual selection of the next word
   foreach $key (keys(%choices))
   {
      $random -= $choices{$key};
      if($random <= 0)
      {
         return $key;
      }
   }
   #Execution should not reach this point.
   return "SOMETHING HAS GONE HORRIBLY WRONG IN SELECT_RANDOM";
}

# Given a path, load it into a hash of hashes of hashes. This is for trigrams.
# Using recursion, the algorithm could be generalized to N-grams, but it is
# unlikely that this will give any great gain in output quality. The structure
# of the hash of hashes of hashes is such that the third rank of hashes
# contains the probability that the third word (third rank hash key) will
# follow the first two (The first and second rank keys).
sub load_corpus
{
   my $filepath = shift;
   open(CORPUS_FILE, "<", $filepath);
   my @input = ;
   my $corpus;

   foreach $line (@input)
   {
      $corpus .= $line;
   }

   # strip quotes, parens, and other double-ended punctuation
   $corpus =~ s/[*<>()[]"']//g;

   # strip double-dashes, replace with single dash
   $corpus =~ s/--/-/g;

   # remove elipsis
   $corpus =~ s/.../ /g;

   # pad periods, semicolons, etc with space
   $corpus =~ s/[!.;:-,?]/ $& /g;

   # replace all whitespace with one space
   $corpus =~ s/s+/ /g;

   # lowercase everything
   $corpus = lc $corpus;

   # split it into an array
   my @corpus = split(' ', $corpus);

   my $index = 0;
   my $end = @corpus;
   $end -= 3;

   #This ends up being a hash of hashes of hashes.
   %words;

   while($index < $end)    {       $first = @corpus[$index];        $second = @corpus[$index+1];        $third = @corpus[$index+2];       $words{ $first =>
               { $second =>
                  { $third =>
                     $words{$first}{$second}{$third}++
                  }
               }
            };
      $index++;
   }
   return %words;
}

#Post to blogger
sub post{
   my $uname="username";
   my $password="pass";

   my $b = Net::Blogger->new(appkey=>'1234567890ABCDEF');

   $b->Username($uname);
   $b->Password($password);
   $b->BlogId($b->GetBlogId(blogname=>'Daily World Review'));

   my $id = $b->newPost(postbody=>$output);

   print "Posted as $id";
}

sub usage {
   print "usage: $0 [-vp] -f filen";
   print "-v			Verbose mode.n";
   print "-p			Post to blog. By default, does not post.n";
   print "-f filename	File to use as input to the text generation algorithm.n";
   print "-l integer	Maximum number of words to generate. Punctuation counts as words.n";
   exit(0);
}

#Seed the RNG, before any calls to rand
srand(time());

# Get the options and parse them. If no options are given, display usage.
my %options=();
getopts("vpf:l:", %options) or usage();

# Set the file path to read from and read it
# my $corpus_path = "/home/ams/nutter/cutting_edge.txt";
if(defined($options{f})){
   %data = load_corpus($options{f});
}
else{
   print("You must specify an input filen");
   usage();
}

#set the word count
my $wordcnt=300;
if(defined($options{l})){
   $wordcnt=$options{l};
}

my @seed=("in", "the");

#%data = load_corpus($corpus_path);

my $output;

#Add the first words to the output
$output = $output . join(" ", @seed);

for($ii=0; $ii < $wordcnt; $ii++)
{
   %choices = %{$words{$seed[0]}{$seed[1]}};
   $next = select_random(%choices);
   shift(@seed);
   $seed[1] = $next;
   $output = $output . " " . $next;
}

#Tighten spacing around commas, terminal punctuation, and dashes
$output =~ s/ ,/,/g;
$output =~ s/ ././g;
$output =~ s/ !/!/g;
$output =~ s/ ?/?/g;
$output =~ s/ - /-/g;
$output =~ s/ :/:/g;
$output =~ s/ ;/;/g;

# Uppercase the pronoun "I"
$output =~ s/ i / I /g;

#Uppercase letters after terminal punctuation
$output =~ s/[.?!] ([a-z])/".  " . uc($1)/eg;

# End at end of sentence rather than just cutting off.
$output =~ s/[^.]*$//g;

# Capitalize the first letter
$output =~ s/^(.)/uc($1)/eg;

print $output;

#Send it to blogger if desired
post() if defined $options{p};