#!/bin/bash
#
# teach-sa_lite.sh
#
# Version : 0.1 (based on teach-sa.sh 0.5)
# Latest version of teach-sa.sh is always available at http://www.ruwenzori.net/code/teach-sa/
#
# Latest version of teach-sa_lite.sh is available at http://www.new-life.org.au/~tin/teach-sa_lite.sh
#
#
# Status:
# Seems to be working OK on my Debian server with home maildirs.
#
# What :
# This script reads mail in designated maildir folders and feeds
# them to spamassassin for bayesian learning.
#
# NOT! :
# This lite version of the script has the spam reporting and ham
# learning functions removed.
#
# It is good for implementing any sort of supervised training in
# addition to Spamassassin's unsupervised training (also known as
# automatic whitelist) while reducing training-related admin workload
# to nearly zero. Fits any setup storing mail as maildir, but could
# trivially be modified to work with mbox based systems.
#
# Dependencies :
# - archivemail
# - spamassassin (duh!)
#
# Configuration : 
# The file CertainSpamFolderList in the working directory must contain a
# list of maildirs containing messages identified with certainty as spam
# (one maildir per line) !
#
# This program should have permissions to read/write users maildirs
#
# WARNING : the spam folders are emptied during execution
# of this program. Be careful what you keep in there!
#
#
# Original Author : Jean-Marc Liotier
# Lite version modifier: Tim Bates
#
#
# License
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.


# ----------------------------------------------------------------------
# ------------ Mandatory user configurable variables -------------------
# ----------------------------------------------------------------------

# Absolute path of the directory where this script and its configuration
# files (such as CertainSpamFolderList) reside.
workdir='/opt/scripts/teach-sa_lite'

# Address of the local postmaster.
# NOTE(review): no visible line of this lite script reads this variable; it
# looks like a leftover from teach-sa.sh - confirm before removing.
ZePostmaster='postmaster@localhost'

# ----------------------------------------------------------------------
# ---------- End of mandatory user configurable variables --------------
# ----------------------------------------------------------------------



# Author : Tim Bates <tin@adventure.hopto.org>
# Based on original non-lite version by Jean-Marc Liotier <jim@liotier.org>
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.


# Compute tomorrow's date in ISO 8601 form (YYYY-MM-DD). The value is handed
# to archivemail's --date option further down.
tomorrow=$(date --iso-8601 --date='+1 day')

# Move the certain spam messages from the maildirs to temporary mbox files.
# This is needed because sa-learn groks mbox but not maildir.
#
# CertainSpamFolderList is read one line at a time (one maildir per line), so
# a path containing spaces or glob characters reaches the commands as a single,
# unmodified argument. `read` with the default IFS still trims leading and
# trailing whitespace. The list is supplied on file descriptor 3 so that
# nothing run inside the loop can consume it through stdin. The `|| [ -n ... ]`
# part handles a last line that has no trailing newline.
while read -r ZeCertainSpamFolder <&3 || [ -n "$ZeCertainSpamFolder" ]
	do
		# Ignore blank (or whitespace-only) lines
		[ -n "$ZeCertainSpamFolder" ] || continue

		# Discard any stale temporary mbox left behind by an earlier run
		rm -f -- "$ZeCertainSpamFolder.sa-learn.tmp"
		# archivemail is not designed to archive current messages hence the --date=$tomorrow hack
		archivemail -q --include-flagged --no-compress --date="$tomorrow" --suffix=.sa-learn.tmp "$ZeCertainSpamFolder"
	done 3< "$workdir/CertainSpamFolderList"


# This script runs as root so that it can reach every user's maildir.
# According to the original author, sa-learn writes into the invoking user's
# home directory and cannot be told to write elsewhere, while the systemwide
# bayesian database lives in /var/mail/.spamassassin. The workaround is to
# replace root's own .spamassassin with a symlink to the systemwide one.
# WARNING: whatever currently sits at /root/.spamassassin is deleted first;
# this is considered acceptable because root is not expected to use
# spamassassin for itself.
teach_sa_root_dir=/root/.spamassassin
teach_sa_system_dir=/var/mail/.spamassassin
rm -rf "$teach_sa_root_dir"
ln -s "$teach_sa_system_dir" "$teach_sa_root_dir"

# Feed the yucky spam in each temporary spam mbox to sa-learn.
#
# CertainSpamFolderList is read one line at a time (one maildir per line), so
# a path containing spaces or glob characters reaches the commands as a single,
# unmodified argument. `read` with the default IFS still trims leading and
# trailing whitespace. The list is supplied on file descriptor 3 so that
# nothing run inside the loop can consume it through stdin. The `|| [ -n ... ]`
# part handles a last line that has no trailing newline.
while read -r ZeCertainSpamFolder <&3 || [ -n "$ZeCertainSpamFolder" ]
	do
		# Ignore blank (or whitespace-only) lines
		[ -n "$ZeCertainSpamFolder" ] || continue

		# A temporary mbox only exists if one was produced for this folder
		if [ -f "$ZeCertainSpamFolder.sa-learn.tmp" ]
		  then
			# Spamassassin bayesian learning (--no-sync: the database
			# sync is done once, after all folders have been fed)
			sa-learn --spam --no-sync --mbox "$ZeCertainSpamFolder.sa-learn.tmp"

			# Remove the temporary file
			rm -f -- "$ZeCertainSpamFolder.sa-learn.tmp"
		fi
	done 3< "$workdir/CertainSpamFolderList"

# Rebuild the bayesian database once, now that every spam mbox has been fed;
# this is faster than syncing after each individual sa-learn run.
sa-learn --sync

# We ran as root but acted on behalf of the mail user (same reason as the
# /root/.spamassassin symlink hack), so hand ownership of the database
# directory back to mail:mail.
teach_sa_bayes_dir=/var/mail/.spamassassin
chown -R mail:mail "$teach_sa_bayes_dir"
