#!/usr/local/bin/perl
# $Id: kspam,v 1.3 2003/04/28 16:39:19 karl Exp $
# This program is in the public domain.
# Written fall 2002 by Karl Berry.
# 
# See http://www.paulgraham.com/spam.html for the algorithm we are
# implementing (and for many other alternative implementations).
# 
# exit 0 if stdin is spam, 1 if nonspam.
# If multiple messages, 1 if any messages are nonspam.
# 
# No output to stdout unless --verbose.
# 
# todo:
# -purge

use Getopt::Long;

$DIR = "$ENV{HOME}/.kspam";
$BADLIST = "$DIR/badlist";
$GOODLIST = "$DIR/goodlist";
$LOCKFILE = "$DIR/kspam.lock";
#
# Graham constants.
$KEEPERS = 15;          # number of tokens to consider.
$MAX_TOKENLEN = 25;     # ignore tokens longer than this.
$MIN_FREQ = 5;          # ignore tokens that occur fewer times than this.
$ONLY_BAD_PROB = .99;   # spam probability if only in badlist.
$ONLY_GOOD_PROB = .01;  # spam probability if only in goodlist.
$SPAM_CUTOFF = .9;      # call it spam if final probability is more than this.

exit (&main ());

sub main
{
  my $exit_status = 0;
  &read_command_line ();
  
  # serialize access.
  -d $DIR || system ("mkdir -p $DIR");
  &lock ();

  # read existing token hashes.
  my $badlist = &hash_read ($BADLIST);
  my $goodlist = &hash_read ($GOODLIST);
  
  # loop to handle multiple messages.
  while (!eof (*STDIN)) {
    my %interesting = ();
    my @msgtokens = ();
    $MESSAGE_ID = "<nomsgid>";
    
    # tokenize one message.
    while (defined (my $token = &get_token (*STDIN))) {
      push (@msgtokens, $token);  # save all tokens for later hash update.
      
      if (! ($OPT{"spam"} || $OPT{"nonspam"})) {
        # if we're forcing the result, we don't need to compute any
        # probabilities.
        # 
        my $tokenprob = &graham_probability ($token, $goodlist, $badlist);
        next if ! defined $tokenprob;  # ignore this token in the decision.
        
        &add_if_interesting_enough ($token, $tokenprob, \%interesting);
      }
    }
    
    # was it spam?  if the user didn't force it one way or the other,
    # compute the combined probability.  If it was forced, indicate that
    # in the log line with numbers >1 or <0.
    # 
    my $msgprob;
    if ($OPT{"spam"} || $OPT{"SPAM"}) {
      $msgprob = 2 + $OPT{"SPAM"};
    } elsif ($OPT{"nonspam"} ||$OPT{"NONSPAM"}) {
      $msgprob = -1 - $OPT{"NONSPAM"};
    } else {
      $msgprob = &bayes_probability (values %interesting);
    }
    
    my $msgresult;
    if ($msgprob < $SPAM_CUTOFF) {
      $exit_status = 1;
      $msgresult = "nonspam";
      &hash_add ($goodlist, \@msgtokens);
      &hash_subtract ($badlist, \@msgtokens) if $OPT{"NONSPAM"};
    } else {
      $msgresult = "spam";
      &hash_add ($badlist, \@msgtokens);
      &hash_subtract ($goodlist, \@msgtokens) if $OPT{"SPAM"};
    }

    &log (sprintf "%s %s/%.2f %s", 
          $MESSAGE_ID, $msgresult, $msgprob,
          &interesting_string (\%interesting));
  }  
  
  &unlock ();
  return $exit_status;
}



# Return next token from FH.  At eof or a message boundary ("\nFrom "),
# return undef.  Ignore headers which just contain unique
# identification, a la bogofilter.
# 
# Yes, lex would be cleaner, but then the glue gets complex ...
# 
# We keep a global $LASTCHAR that saves the character that terminated
# the last returned token.  (Primarily so that we can recognize `\nFrom '.)
# 
sub get_token
{
  my ($fh) = @_;
  my $ret = "";
  
  while (! $ret) {
    my ($char, $lastchar);
    
    while (defined ($char = getc ($fh))) {
      last if $char =~ /[][)(><"=\s]/;
      last if $char eq "." && $lastchar eq ".";  # ... and the like.
      
      $ret .= $char;
      $lastchar = $char;
    }
    &ddebug ("  potential token: $ret (at `$char', LASTCHAR=`$LASTCHAR')");
    
    if (! defined ($char) && ! $ret) {  # eof
      $ret = undef;
      last;
    }

    if ($LASTCHAR eq "\n"
        && $ret =~ /^(x-vm-.*
                    |message-id
                    |list-
                    |x-mailman-version
                    |in-reply-to
                    |date
                    |references
                   ):/ix) {
      # a mail header we don't care about; ignore the whole line.
      # Not sure if we should ignore Received: headers or not.
      my $val = "";
      while (defined ($char = getc ($fh)) && $char ne "\n") {
        $val .= $char;
      }
      &ddebug ("skipped to end of line for $ret $val");
      #
      # As a kludge, set global for sake of logging.
      $MESSAGE_ID = $val if $ret =~ /message-id:/i;
      #
      $ret = "";
    }
    
    my $original = $ret;
    $ret =~ s/[?!,.;:]+$//;  # remove trailing punctuation.

    # ignore numbers, dates, --boundaries.
    if (length ($ret) <= 2
        || length ($ret) > $MAX_TOKENLEN
        || $ret =~ /^\+?[-\d:]+$/) {
      $ret = "";  # too short, too long, or not wanted, read some more.
      $LASTCHAR = $char;
    } else {
      &ddebug ("last=$LASTCHAR, original=$original, char=$char");
      if ($LASTCHAR eq "\n" && $original eq "From" && $char eq " ") {
        # message separator.
        $ret = undef;
        $LASTCHAR = $char;
        last;
      } else {  # normal token return.
        $ret = lc ($ret);  # ignore case
        $LASTCHAR = $char;
      }
    }
  }

  &debug ("returning token: $ret");
  return $ret;
}



# Return probability according to Graham's formula that TOKEN implies
# the email is spam, using GOODLIST and BADLIST.
# 
sub graham_probability
{
  my ($token,$goodlist,$badlist) = @_;
  my $ret;
  my $info = "";
  
  my $g = $goodlist->{$token} * 2;  # weight good numbers
  my $b = $badlist->{$token};

  if ($g + $b >= $MIN_FREQ) {
    if ($g && $b) {
      # normal case, token appears in both lists.
      my $goodcount = $goodlist->{" msgcount"};
      $goodvalue = min (1, $g / $goodcount);
      #
      my $badcount = $badlist->{" msgcount"};
      $badvalue = min (1, $b / $badcount);
      #
      $ret = $badvalue / ($badvalue + $goodvalue);
      $info = "bad=$badvalue [$b/$badcount], good=$goodvalue [$g/$goodcount]";
    } elsif ($b == 0) {
      $ret = $ONLY_GOOD_PROB;
      $info = "only good";
    } elsif ($g == 0) {
      $ret = $ONLY_BAD_PROB;
      $info = "only bad";
    }
  } else {
    $ret = undef;  # didn't occur enough times, ignore it.
    $info = "$g+$b < $MIN_FREQ occurrences, ignoring";
  }

  &debug (" graham($token)=$ret ($info)");
  return $ret;
}

sub min
{
  my (@list) = @_;
  $min = $list[0];
  for my $e (@list) {
    $min = $e if $e < $min;
  }
  return $min;
}



# Given @LIST with elements being probabilities E1..En, return
# combined probability.  In short, this is:
#   E1*E2*...*En
#   ------------
#   E1*E2*...*En + (1-E1)(1-E2)...(1-En)
#
# See: http://www.paulgraham.com/naivebayes.html
#      http://www.mathpages.com/home/kmath267.htm
# 
sub bayes_probability
{
  my (@list) = @_;
  my $ret;
  
  my $product = 1;
  my $denom2 = 1;
  for my $e (@list) {
    $product *= $e;
    $denom2 *= (1 - $e);
  }
  
  $ret = $product / ($product + $denom2);
  &debug ("combined bayes probability: $ret (product=$product denom2=$denom2 "
          . (join ("*", reverse sort @list)) . ")");
  return $ret;
}



# Add $TOKEN to $INTERESTING if $TOKENPROB is further from .5 from one
# of the existing entries, or if we haven't filled INTERESTING yet.
# Keep $INTERESTING to at most $KEEPERS entries.
# 
sub add_if_interesting_enough
{
  my ($token,$tokenprob,$interesting,$was) = @_;

  return if $interesting->{$token};  # done if already interesting.
    # Graham says nothing about weighting that repeated token, so we don't.

  # find most boring token so far.
  my $least;
  my $least_distance = 1;
  for my $k (keys %$interesting) {
    my $this_distance = abs ($interesting->{$k} - .5);
    if ($this_distance < $least_distance) {
      $least = $k;
      $least_distance = $this_distance;
    }
  }
  &ddebug (" least: $least/$interesting->{$least}, distance $least_distance");
  
  # If our new token is more interesting, or if we haven't filled up all
  # the slots yet, save it.
  # 
  if ($least_distance < abs ($tokenprob - .5)
      || keys %$interesting < $KEEPERS) {
    if (keys %$interesting >= $KEEPERS) {
      &debug (" removed most boring token $least/$interesting->{$least}");
      delete $interesting->{$least};
    }
    $interesting->{$token} = $tokenprob;

    &debug (" new interesting token #" 
            . (keys %$interesting)
            . " $token/$tokenprob"
            . &interesting_string ($interesting));
  }
}


# For debugging, return string of HASH, sorted by values.
# Assume the values are floating point numbers, and return just the
# first two decimal digits.
# 
sub interesting_string
{
  my ($hash) = @_;
  my @ret = ();
  
  for my $k (sort { $hash->{$b} <=> $hash->{$a} } keys %$hash) {
    my $val = sprintf "%.2f", $hash->{$k};
    $val =~ s/^0//;
    push (@ret, "$k/$val");
  }
  
  return " (" . join (" ", @ret) . ")";
}



# Read $FILENAME, consisting of a token and a probability on each line,
# plus a line " msgcount" and the number of messages read so far.
# Return the corresponding hash, plus an element " filename" with value
# $FILENAME.  (If the file doesn't exist yet, " filename" will be the
# only element.)
# 
sub hash_read
{
  my ($filename) = @_;
  my %ret;
  $ret{" filename"} = $filename;

  local *FILE;
  open (FILE, $filename) || return \%ret;  # doesn't exist yet.
  while (<FILE>) {
    my ($token,$value) = split ();
    $token = " $token" if /^ /; # for " msgcount".
    $ret{$token} = $value;
  }
  close (FILE) || warn "close($filename) failed: $!";
  
  return \%ret;
}



# Increment or decrement (according to $STEP) each element of $TOKENS in
# $HASH, and rewrite the file (whose name is in $HASH->{" filename"}).
# Step $HASH{" msgcount"} in the new file, too.
# 
sub hash_update
{
  my ($hash,$step,$tokens) = @_;

  return if $OPT{"test"};  # skip rewriting files?
  
  my $filename = $hash->{" filename"};
  local *FILE;
  open (FILE, ">$filename") || die "open(>$filename) failed: $!";
  
  # update count for the new tokens, and the total message count.
  for my $t (@$tokens, " msgcount") {
    $hash->{$t} += $step;
  }
  
  # write everything.  may as well keep it in order by interest.
  for my $k (sort { $hash->{$b} <=> $hash->{$a} } keys %$hash) {
    next if $k eq " filename";  # don't write the filename
    next unless $k;
    next if $hash->{$k} <= 0;  # might happen from forced decrementing
    print FILE "$k $hash->{$k}\n";
  }
  close (FILE) || warn "close(>$filename) failed: $!";
}

sub hash_add
{
  my ($hash,$tokens) = @_;
  &hash_update ($hash, +1, $tokens);
}

sub hash_subtract
{
  my ($hash,$tokens) = @_;
  &hash_update ($hash, -1, $tokens);
}



# write to log file and (if --verbose) stdout.
# 
sub log
{
  if (! $LOG) {
    $LOG = ">>$DIR/log";
    open (LOG) || die "open($LOG) failed: $!";
  }
  print LOG "@_\n";
  print "@_\n" if $OPT{"verbose"};
}


sub debug
{
  return unless $OPT{"debug"};
  print "debug: @_\n";
}

sub ddebug
{
  return unless $OPT{"debug"} > 1;
  print "ddebug:@_\n";
}

sub debug_hash
{
#  return unless $opt{"debug"};
  my ($label) = shift;
  my (%hash) = (ref $_[0] && $_[0] =~ /.*HASH.*/) ? %{$_[0]} : @_;

  warn "$label\n";
  my $str = "";
  for my $key (sort keys %hash) {
    my $val = $hash{$key};
    $val =~ s/\n/\\n/g;
    $str .= " $key => $val\n";
  }

  warn ($str || "{empty hash}");
}



# use lockfile(1) from procmail.
# 
sub lock
{
  # Although a kspam run might actually take longer than this if pumping
  # lots of mail through, we don't want to hang forever, we want to let
  # the mail through, even if we fail to detect a spam.  Retrying 100
  # times, 8 seconds each time, is about 13 minutes.
  # 
  my $cmd = "lockfile -r 100 $LOCKFILE";
  my $status = system ($cmd);
  die "`$cmd' failed: $!" if $status;
}

sub unlock
{
  unlink ($LOCKFILE)
  || die "unlink($LOCKFILE) failed: $!";
}


sub read_command_line
{
  %OPT = ();
  $OPT{"debug"} = 0;
  
  exit 2 unless Getopt::Long::GetOptions (\%OPT,
    "debug=i",
    "help",
    "nonspam",
    "NONSPAM",
    "spam",
    "SPAM",
    "test",
    "verbose",
    "version",
  );

  if ($OPT{"help"}) {
    print "Usage: $0 [OPTION]... [FILE]...

Read `From '-separated message(s) from standard input.  By default, run
it through Paul Graham's algorithm for probabilistic spam detection
according to word frequency (see http://www.paulgraham.com/spam.html).

The basic idea is to compare the incoming message to pre-existing word
probabilities for spam and nonspam, then use the 15 most interesting
tokens to determine how to classify the new message.  The word lists are
then updated accordingly.

The word lists are kept in the files ~/.kspam/{bad,good}list for spam
and nonspam, respectively (these are simple text files).  In addition,
kspam writes a single line to ~/.kspam/log for each message.

Best used from .procmailrc and assorted scripts, see the distribution
for info.

Options (may be unambiguously abbreviated):
--debug=LEVEL  with LEVEL=1, write voluminous information about what the
                 program is doing; with LEVEL=2, include information
                 about the tokenizing step.
--nonspam      force the incoming message(s) to be treated as nonspam.
--NONSPAM      like --nonspam, but also decrement their tokens in the badlist
                 (for reclassifying a real message that was erroneously
                  considered spam.)
--spam         force the incoming message(s) to be treated as spam.
--SPAM         like --spam, but also decrement their tokens in the goodlist
                 (for reclassifying a spam that was erroneously
                  considered a real message.)
--test         do not update the word lists.
--verbose      write log line to stdout (as well as ~/.kspam/log).

--help         this information.
--version      output version id and exit.

Email questions, suggestions, bug reports to karl\@cs.umb.edu.
";
    exit 0;
  }

  if ($OPT{"version"}) {
    print '$Id: kspam,v 1.3 2003/04/28 16:39:19 karl Exp $' . "\n";
    print "\nThis program is in the public domain.\n";
    exit 0;
  }

  #&debug_hash ("Options:", %OPT);
}
