#!/usr/local/bin/perl
# $Id: kseed,v 1.5 2002/12/15 01:22:58 karl Exp $
# This program is in the public domain.
# Written fall 2002 by Karl Berry.
# 
# Read messages and save each one in either `corpus.{spam,nonspam}' or
# `test.{spam,nonspam}', choosing randomly whether to use it as a
# corpus message to seed the lists or to hold it back as a test message
# for checking that the program is working.  Whether the messages are
# spam or nonspam is specified on the command line.
# 
# (This could be part of kspam, given enough options, I suppose.)

use Getopt::Long;

# Directory in which the corpus.* and test.* archive files are kept.
$DIR = join ("/", $ENV{HOME}, ".kspam");

# Run the program; the value returned by main is the exit status.
exit (main ());


# Program body: parse the command line, then read the input (the files
# left in @ARGV, or standard input) line by line, splitting it into
# messages at each line that starts with `From ' and passing every
# non-empty message to do_msg.  Always returns 0.
# 
sub main
{
  read_command_line ();

  my $buffer = "";
  while (defined (my $line = <>)) {
    # a `From ' line starts the next message, so whatever has been
    # collected up to here is a complete message.
    if ($line =~ /^From /) {
      if ($buffer) {
        do_msg ($buffer);
      }
      $buffer = "";
    }
    $buffer .= $line;
  }

  # the last message has no following `From ' line to flush it.
  if ($buffer) {
    do_msg ($buffer);
  }

  return 0;
}



# Handle MSG, one complete message: randomly choose whether to use it for
# the seed corpus (in which case it is also piped through `kspam --$WHAT')
# or hold it back for testing, then append it to $DIR/CLASS.$WHAT, where
# CLASS is `corpus' or `test'.
# 
# Reads the globals $OPT{"corpus"} (chance of choosing the corpus),
# $WHAT (the message class) and $DIR (the archive directory).  Dies if
# the open of the kspam pipe or of the archive file fails; failures while
# writing or closing are only reported with warn, so the remaining
# messages are still processed.
# 
sub do_msg
{
  my ($msg) = @_;
  
  # will we use this msg as a corpus or test message?
  my $class = rand () < $OPT{"corpus"} ? "corpus" : "test";
  
  # if corpus, run the msg through the filter.
  if ($class eq "corpus") {
    # list-form pipe open: kspam is run directly, no shell is involved.
    my @cmd = ("kspam", "--$WHAT");
    open (my $kspam, "|-", @cmd) || die "open(| @cmd) failed: $!";
    print {$kspam} $msg or warn "write to @cmd failed: $!";

    # close on a pipe waits for the child and returns false if the
    # program exited with nonzero status; in that case $! is 0 and the
    # wait status is in $?.
    close ($kspam)
      || warn ($! ? "close(| @cmd) failed: $!"
                  : "@cmd exited with wait status $?");
  }

  # in either case, append the message to an archive file.
  my $archive = "$DIR/$class.$WHAT";
  open (my $out, ">>", $archive) || die "open(>>$archive) failed: $!";
  print {$out} $msg or warn "write to $archive failed: $!";
  close ($out) || warn "close(>>$archive) failed: $!";
}



# Parse the command line (@ARGV) into the global %OPT and set the global
# $WHAT to the message class, `spam' or `nonspam'.  Exits with status 2
# if option parsing fails, exits with status 0 after printing the --help
# or --version output, and dies if neither --spam nor --nonspam was
# given.  If both are given, --spam takes precedence.
# 
sub read_command_line
{
  # defaults; GetOptions overwrites whatever the user specified.
  %OPT = ("corpus" => 0.9);

  my @spec = ("corpus=f", "help", "nonspam", "spam", "version");
  Getopt::Long::GetOptions (\%OPT, @spec) || exit 2;

  if ($OPT{"help"}) {
    print <<"END_USAGE";
Usage: $0 [OPTION]... [--spam|--nonspam] [FILE]...

Read `From '-separated message(s) from standard input or the given
FILE(s).  Separate them according to class --spam or --nonspam, write
them to either ~/.kspam/corpus.CLASS or test.CLASS, according to a given
probability.  Then, run corpus.CLASS through kspam --CLASS.

The idea here is to take a bunch of preclassified spam or nonspam
messages, and randomly separate them into a main body (then used to seed
the probabilistic word lists for kspam) and a small test group.  The
only purpose of the test messages is to run through kspam --test to see
that they are correctly classified.

Options (may be unambiguously abbreviated):
--corpus=PROB  chance of selecting a message for the corpus [$OPT{corpus}]
--nonspam      write to ~/.kspam/{corpus,test}.nonspam.
--spam         write to ~/.kspam/{corpus,test}.spam.

--help         this information.
--version      output version id and exit.

Email questions, suggestions, bug reports to karl\@cs.umb.edu.
END_USAGE
    exit 0;
  }

  if ($OPT{"version"}) {
    print '$Id: kseed,v 1.5 2002/12/15 01:22:58 karl Exp $', "\n",
          "\nThis program is in the public domain.\n";
    exit 0;
  }

  # the message class is mandatory.
  if (!$OPT{"spam"} && !$OPT{"nonspam"}) {
    die "$0: please specify --spam or --nonspam; try --help if needed.\n";
  }
  $WHAT = $OPT{"spam"} ? "spam" : "nonspam";
}
