#!/usr/local/bin/perl
# $Id: kseed,v 1.5 2002/12/15 01:22:58 karl Exp $
# This program is in the public domain.
# Written fall 2002 by Karl Berry.
#
# Read messages, either save them in `corpus.{spam,nonspam}' or
# `test.{spam,nonspam}', choosing randomly whether we'll use it as a
# corpus message to seed the lists, or save it as a test message to see
# if the program is working.  Whether it's spam or nonspam is specified
# on the command line.
#
# (This could be part of kspam, given enough options, I suppose.)

use strict;
use warnings;

use Getopt::Long;

# Directory holding the corpus.* and test.* archive files.
my $DIR = "$ENV{HOME}/.kspam";

# Parsed command-line options; filled in by read_command_line.
my %OPT;

# Message class for this run, "spam" or "nonspam"; set by read_command_line.
my $WHAT;

exit(main());


# Read the input (standard input or the files named on the command
# line), split it into messages at each line starting with `From ', and
# hand every message to do_msg.  Returns 0, used as the exit status.
#
sub main {
    read_command_line();

    my $msg = "";
    while (my $line = <>) {
        if ($line =~ /^From /) {
            # A new message starts here; flush the one accumulated so far.
            do_msg($msg) if $msg;
            $msg = "";
        }
        $msg .= $line;
    }

    # The last message has no following `From ' line to flush it.
    do_msg($msg) if $msg;

    return 0;
}


# Handle MSG, choosing whether we'll use it for the seed corpus (in
# which case run it through kspam) or hold it back for testing.
#
# Dies if kspam cannot be started or the archive file cannot be opened;
# write and close problems are reported with a warning so that the
# remaining messages are still processed.
#
sub do_msg {
    my ($msg) = @_;

    # will we use this msg as a corpus or test message?
    my $class = rand() < $OPT{"corpus"} ? "corpus" : "test";

    # if corpus, run the msg through the filter.
    if ($class eq "corpus") {
        my $cmd = "| kspam --$WHAT";    # only used in diagnostics

        # List-form pipe open: kspam is exec'd directly, no shell involved.
        open(my $kspam, '|-', 'kspam', "--$WHAT")
            or die "open($cmd) failed: $!";
        print {$kspam} $msg
            or warn "print($cmd) failed: $!";

        # close on a pipe also fails when the child exits non-zero; in
        # that case $! is 0 and the wait status is in $?.
        close($kspam)
            or warn "close($cmd) failed: " . ($! ? "$!" : "exit status $?");
    }

    # in either case, append the message to an archive file.
    my $archive = "$DIR/$class.$WHAT";
    open(my $out, '>>', $archive)
        or die "open(>>$archive) failed: $!";
    print {$out} $msg
        or warn "print(>>$archive) failed: $!";
    close($out)
        or warn "close(>>$archive) failed: $!";

    return;
}


# Parse the command line into %OPT and set $WHAT.  Exits with status 2
# on an option error, exits 0 after --help or --version, and dies when
# neither --spam nor --nonspam is given or when --corpus is not a
# probability.  If both --spam and --nonspam are given, --spam wins.
#
sub read_command_line {
    %OPT = ();
    $OPT{"corpus"} = .9;

    exit 2 unless Getopt::Long::GetOptions(\%OPT,
        "corpus=f",
        "help",
        "nonspam",
        "spam",
        "version",
    );

    if ($OPT{"help"}) {
        print <<"END_USAGE";
Usage: $0 [OPTION]... [--spam|--nonspam] [FILE]...

Read `From '-separated message(s) from standard input or the given FILE(s).
Separate them according to class --spam or --nonspam, write them to
either ~/.kspam/corpus.CLASS or test.CLASS, according to a given
probability. Then, run corpus.CLASS through kspam --CLASS.

The idea here is to take a bunch of preclassified spam or nonspam
messages, and randomly separate them into a main body (then used to seed
the probabilistic word lists for kspam) and a small test group. The only
purpose of the test messages is to run through kspam --test to see that
they are correctly classified.

Options (may be unambiguously abbreviated):
--corpus=PROB  chance of selecting a message for the corpus [$OPT{corpus}]
--nonspam      write to ~/.kspam/{corpus,test}.nonspam.
--spam         write to ~/.kspam/{corpus,test}.spam.
--help         this information.
--version      output version id and exit.

Email questions, suggestions, bug reports to karl\@cs.umb.edu.
END_USAGE
        exit 0;
    }

    if ($OPT{"version"}) {
        print '$Id: kseed,v 1.5 2002/12/15 01:22:58 karl Exp $' . "\n";
        print "\nThis program is in the public domain.\n";
        exit 0;
    }

    # --corpus is compared against rand(), so anything outside 0..1 is
    # almost certainly a typo rather than an intended setting.
    if ($OPT{"corpus"} < 0 || $OPT{"corpus"} > 1) {
        die "$0: --corpus=$OPT{corpus} is not a probability between 0 and 1;"
          . " try --help if needed.\n";
    }

    if ($OPT{"spam"}) {
        $WHAT = "spam";
    }
    elsif ($OPT{"nonspam"}) {
        $WHAT = "nonspam";
    }
    else {
        die "$0: please specify --spam or --nonspam; try --help if needed.\n";
    }

    return;
}