#!/usr/bin/env perl
# The following is the collation script.  Copy this file into your
# working directory under the name 'collate.pl'.  Use a text editor
# to remove any lines before the line starting with #!/usr/bin/env perl.
# Be sure to save this file as a plain text file.
# ==================================================================
# A script to automate the collation of Latin self-study assignments
# Kirk Lougheed, 1/1/1998

# $Id: collate.pl,v 1.28 2015/04/23 23:59:07 kirk Exp kirk $

# use diagnostics;

use Encode;
use FileHandle;

$debug = 0;          # set to 1 to see which lines are being discarded
$linelength = 69;    # default maximum line length
$definitlen = 3;     # default length of initials string
$wheelock = 1;       # 1 = Wheelock format, 0 - translation format
$filedebug = 0;      # what files are being looked at
$allowtranslate = 1; # 0 = never allow translation mode
$fixing = 1;         # 1 = perform fixups
$sortdefault = "5";  # default sortkey value if none specified
$cmdline = 1;        # 1 = running from a command line, 0 = double clicked
$unicode = 0;        # 1 = assume Unicode character set
$macapp = 1;         # 1 = running as a Platypus-generated app on Mac OS X
@lines = ();         # array of lines in current file
if ($unicode) {      # flowed text if we're doing Unicode
    $linelength = 2000;
}
if ($macapp) {       # if a Mac drap & drop app, no command line
    $cmdline = 0;
}


#
# Special strings and their replacements when not handling Unicode.
#
%replace = (
    "=09" => " ",               # ASCII: tab (-> space)
    "=0A" => "\n",              # ASCII: linefeed
    "=0A=0A" => "\n\n",         # ASCII: two linefeeds
    "=0A=0A=0A" => "\n\n\n",    # ASCII: three linefeeds
    "=0D" => "\n",              # ASCII: carriage return (-> linefeed)
    "=20" => " ",               # ASCII: space
    "=2C" => ".",               # ASCII: comma
    "=2E" => ".",               # ASCII: period
    "=3D" => "=",               # ASCII: equals
    "=3D=3D" => "==",           # ASCII: two equals
    "=3D=3D=3D" => "===",       # ASCII: three equals
    "=3A" => ":",               # ASCII: colon
    "=3F" => "?",               # ASCII: question mark
    "=91" => "\'",              # WinLatin1: \221 into an ascii singlequote
    "=92" => "\'",              # WinLatin1: \222 into an ascii singlequote
    "=93" => "\"",              # WinLatin1: \223 into an ascii doublequote
    "=94" => "\"",              # WinLatin1: \224 into an ascii doublequote
    "=96" => "-",               # WinLatin1: \226 into a single hyphen
    "=97" => "--",              # WinLatin1: \227 into a double hyphen
    "=AF" => "delete",          # ISO-8859-1: overbar
    "=C6" => "Ae",              # IOS-8859-1: the "AE" diagraph
    "=A0" => " ",               # ISO-8859-1: non-breaking space
    "=E1" => "A",               # ISO-8859-1: a w/ accent grave
    "=E2" => "A",               # ISO-8859-1: a w/ caret
    "=E6" => "ae",              # ISO-8859-1: the "ae" diagraph
    "=E9" => "E",               # ISO-8859-1: e w/ accent grave
    "=EA" => "E",               # ISO-8859-1: e w/ caret
    "=EB" => "e",               # ISO-8859-1: e w/ diaersis
    "=ED" => "I",               # ISO-8859-1: i w/ accent grave
    "=EE" => "I",               # ISO-8859-1: i w/ caret
    "=F3" => "O",               # ISO-8859-1: o w/ accent grave
    "=F4" => "O",               # ISO-8859-1: o w/ caret
    "=F6" => "o",               # ISO-8859-1: o w/ diaersis
    "=FA" => "U",               # ISO-8859-1: u w/ accent grave
    "=FB" => "U",               # ISO-8859-1: u w/ caret
    
    "=C2=A0" => " ",            # Unicode: U00A0 non-breaking space
    "=C3=86" => "AE",           # Unicode: U00C6 AE diagraph
    "=C3=A6" => "ae",           # Unicode: U00E6 ae diagraph
    "=C3=8B" => "E",            # Unicode: U00CB E w/diaersis
    "=C3=AB" => "e",            # Unicode: U00EB e w/diaersis
    "=C3=96" => "O",            # Unicode: U00D6 O w/diaersis
    "=C3=B6" => "o",            # Unicode: U00F6 o w/diaersis

    "=C3=A1" => "A",            # Unicode: U00E1 a w/ accent grave
    "=C3=A2" => "A",            # Unicode: U00E2 a w/ caret
    "=C3=A9" => "E",            # Unicode: U00E9 e w/ accent grave
    "=C3=AA" => "E",            # Unicode: U00EA e w/ caret
    "=C3=AD" => "I",            # Unicode: U00ED i w/ accent grave
    "=C3=AE" => "I",            # Unicode: U00EE i w/ caret
    "=C3=B3" => "O",            # Unicode: U00F3 o w/ accent grave
    "=C3=B4" => "O",            # Unicode: U00F4 o w/ caret
    "=C3=BA" => "U",            # Unicode: U00FA u w/ accent grave
    "=C3=BB" => "U",            # Unicode: U00FB u w/ caret

    "=C4=81" => "A",            # Unicode: U0101 a w/ macron
    "=C4=93" => "E",            # Unicode: U0113 e w/ macron
    "=C4=AB" => "I",            # Unicode: U012B i w/ macron
    "=C5=8D" => "O",            # Unicode: U014D o w/ macron
    "=C5=AB" => "U",            # Unicode: U016B u w/ macron

    "=E2=80=93" => "--",        # Unicode: U2013 en-dash
    "=E2=80=94" => "--",        # Unicode: U2014 em-dash
    "=E2=80=98" => "\'",        # Unicode: U2018 open single-quote
    "=E2=80=99" => "\'",        # Unicode: U2019 close single-quote
    "=E2=80=9C" => "\"",        # Unicode: U201C open double-quote
    "=E2=80=9D" => "\"",        # Unicode: U201D close double-quote
    "=E2=80=A6" => "\.\.\.",    # Unicode: U2026 ellipsis
    "=EF=BB=BF" => "delete"     # Unicode: byte order marker
);

#
# Operating system specific initialization
# Currently just filename details.
#
$dirseparator = '/';             # Perl on Unix and Windows use slash
$defaultin = 'lesson';           # default input file if being dumb

#
# File handles for STDOUT and STDERR
#
$ERROR = FileHandle->new;
$OUTPUT = FileHandle->new;

#
# Sort out input and configuration files.
# Expand directories into list of files
# Find header file, default to "header.txt" in connected directory.
# Find names file, default to "names.txt" in connected directory.
# Ignore backup and save files.
#
@inputfilelist = ();
$headerfile = "header.txt";       
$namesfile = "names.txt";
if (scalar(@ARGV) == 0) {  # no files given, so look in "lesson" folder
    @ARGV = ($defaultin);
    $cmdline = 0;          # set flag that we're not running from a CLI
}
@ARGV = <@ARGV>;           # expand any wildcards in the filenames
open_output();
print $ERROR "Starting with arguments: @ARGV\n" if $filedebug;
foreach $file (@ARGV) {
    print $ERROR "Examining $file - " if $filedebug;
    #
    # For Windows force backslash to slash to avoid quoting problems.
    # Perl on Windows copes with Unix-style pathnames.
    #
    $file =~ s/\\/\//g;

    #
    # If this is a directory, read in its list of files.
    # Ignore two special directories, upcoming and lastweek
    # Discard . and .. directories, then construct fully qualified
    # pathname and add to end of ARGV in case of nested directories.
    #
    if (-d $file) {
        next if ($file =~ m/upcoming/);
        next if ($file =~ m/lastweek/);
        next if ($file =~ m/\.app/);
        opendir(DIRFILE, $file);
        @tmplist = readdir(DIRFILE);
        closedir(DIRFILE);
        foreach (@tmplist) {
            next if (m/^\./);
            push(@ARGV, $file.$dirseparator.$_);
        }
        print $ERROR "a directory containing @tmplist\n" if $filedebug;
        next;
    }

    #
    # Ignore files ending in ~ (emacs backup) or # (emacs save).
    # Extract any file containing the string "header.txt" or "names.txt".
    # We'll try reading any surviving files.
    #
    $_ = $file;
    if (m/~$/ || m/#$/ || m/.BAK$/ || m/.pl$/) {
        print $ERROR "ignoring\n" if $filedebug;
        next;
    }
    if (m/header\.txt/) {
        print $ERROR "the header.txt file\n" if $filedebug;
        $headerfile = $file;
        next;
    }
    if (m/names\.txt/) {
        print $ERROR "the names.txt file\n" if $filedebug;
        $namesfile = $file;
        next;
    }
    print $ERROR "an input file\n" if $filedebug;
    push(@inputfilelist, $file);
}

#
# Establish our configuration from the "names.txt" file.
#
@endlist = ('END');
%sectionlist = ('TR' => 1, 'PR' => 1, 'SA' => 1, 'GM' => 1);
load_config($namesfile);

#
# Roll through each file extracting the good stuff
#
foreach $file (@inputfilelist) {
    $gathering = 0;
    $foundtext = 0;
    $tr_counter = 1;
    $lastperson = "";
    load_file($file, \&fix_data);
    foreach (@lines) {

        #
        # Skip comment lines starting with a '#'
        # Finish up a paragraph when a comment line is seen and hope that
        # people don't put comment lines in the middle of their text.
        #
        if (m/^#/) {
            print $ERROR "$_\n" if $debug;
            &finish_paragraph;
            next;
        }

        #
        # Check this line with our list of ways to end the file.
        # If we found an eof indicator and we had been gathering
        # text, then we might really done.  Otherwise we probably
        # tripped over a MIME boundary at the start of the file.
        #
        $foundend = 0;
        foreach $token (@endlist) {
            if (m/^\Q$token\E/) {
                print $ERROR "$_\n" if $debug;
                $foundend = 1;
                last;
            }
        }
        if ($foundend && $gathering) {
            &finish_paragraph;
            last;
        }

        #
        # If translation mode is allowed, check the beginning of the
        # line for initials, numbers, and such.  Otherwise stay in
        # Wheelock mode.
        #
        if ($allowtranslate) {
            #
            # Automatically coerce lines starting with initials listed in
            # our configuration files into the standard format "TR 1 S ".
            # The number is supplied by context.  Initials are followed by:
            #  - a (dot or colon or comma or hyphen) and an optional space
            #  - a space
            # A successful coercion clears wheelock formatting mode.        
            #
            if (m/^(([A-Za-z]{1,4})[.:,-]? )/) {
                if ($personlist{$2}) {
                    $person = $2;
                    s/$1//;   # zap whatever leading substring we matched.
                    $_ = "TR ".$tr_counter." ".$person." ".$_;
                    $tr_counter = $tr_counter + 1;
                    $wheelock = 0;
                }
            }
    
            #
            # If line starts with something like 1.14.3 or 51.3, then
            # this is an assignment file and the user is "..".
            # A successful coercion clears wheelock formatting mode.
            #
            if (m/^\d{1,3}\.\d{1,3}/) {
                $_ = "TR ".$tr_counter." .. ".$_;
                $tr_counter = $tr_counter + 1;
                $wheelock = 0;
            }
    
            #
            # If the line starts with two dots and a space, it is our
            # special user "..".  Again, a successful coercion clears 
            # the wheelock formatting mode.
            #
            if (m/^(\.\. )/) {
                s/$1//;          # clear out the ".. " we matched
                $_ = "TR ".$tr_counter." .. ".$_;
                $tr_counter = $tr_counter + 1;
                $wheelock = 0;
            }
        }

        #
        # Check for the start of a new paragraph of good stuff.
        # Paragraph start string must start line and be followed by a space,
        # one to three decimal digits, an optional period, and a final space.
        # If no new paragraph, we're either gathering a paragraph or skipping
        # junk at the beginning of the file.
        #
        $newparagraph = 0;
        if (m/^([A-Z0-9]{1,5}) \d{1,3}\.? /) {
            if ($sectionlist{$1}) {
                $newparagraph = 1;
            }
        }

        #
        # If this is not a new paragraph and we are gathering a paragraph,
        # add the current input line to the $text variable.
        #
        if (!$newparagraph && $gathering) {
            $text = $text." ".$_;
            next;
        }

        #
        # Finish previous paragraph and start a new one.
        # We expect: a two letter section code (e.g. SA, PR, TR),
        # followed by a space, followed by a one or two digit number,
        # followed by a space, followed by initials, followed by a space,
        # finally followed by the first line of text.  The initials will
        # be padded out or truncated to $definitlen bytes.
        # Single digit numbers will be given a leading zero.
        #
        if ($newparagraph) {
            &finish_paragraph;
            ($section, $exercise, $person, $text) = split(/ /, $_, 4);

            #
            # If person doesn't match previous person, someone likely
            # forgot to put in their initials.  Quietly fix this up.
            #
            if ($fixing && ($lastperson ne "") && ($lastperson ne "..") &&
                ($person ne $lastperson)) {
                print $ERROR "Note: $section $exercise in $file, correcting";
                print $ERROR " \"$person\" to \"$lastperson\"\n";
                $text = $person." ".$text;
                $person = $lastperson;
            }           

            #
            # Recall the first person specified for this file.
            # If the contributor gets this one correct, we can quietly
            # fix up later mistakes.
            #
            if ($lastperson eq "") {
                $lastperson = $person;
            }

            #
            # Remove any periods or leading zeros from the exercise number.
            # Pad with leading space for sorting.  Allow three digits.
            #
            $exercise =~ s/\.//g;
            $exercise =~ s/^0/ /g;
            if (length($exercise) == 1) {
                $exercise = "  ".$exercise;
            } else {
		if (length($exercise) == 2) {
		    $exercise = " ".$exercise;
		}
            }

            #
            # Build the canonical key, e.g. "5:PR:01:5:KSL".
            # By sorting on this key everything falls out in the right order.
            # Set flag that we are gathering a paragraph in $text.
            #
            $srtsec = $sortkey{$section};
            $srtsec = $sortdefault if ($srtsec eq "");
            $srtper = $sortkey{$person};
            $srtper = $sortdefault if ($srtper eq "");
            $key = $srtsec.":".$section.":".$exercise.":".$srtper.":".$person;
            $gathering = 1;

            #
            # We are a suspicious sort.  If this key is already present,
            # complain about a duplicate.
            #
            if ($array{$key}) {
                print $ERROR "Duplicate item in $file: $key : $_\n";
                $gathering = 0;
            }
            next;
        }

        #
        # Reach here only if we are ignoring this line.
        #
        print $ERROR "$_\n" if $debug;
    }

    #
    # Finish up at end of file
    # Complain if no text
    #   
    &finish_paragraph;
    if (!$foundtext) {
        print $ERROR "No material found in $file\n";
    }
}

#
# Done parsing, so print out preambles
#
# The header.txt file is a standard preamble for Wheelock and
# translation modes. The contents of header.txt are printed first.
#
if (load_file($headerfile, 0)) {
    foreach (@lines) { print $OUTPUT "$_\n"; }
}

#
# Now write out the list of names and initials
#
foreach $person (sort alpha keys(%contriblist)) {
    next if ($person =~ m/^\.\.$/);
    local($initials) = $person."     ";
    substr($initials, 4) = '';
    print $OUTPUT "# " if ($wheelock);
    print $OUTPUT "$initials $contriblist{$person}\n";
}

#
# Write out sorted associative array of everyone's contributions.
# The master key has form
#  <section sortkey>:<section>:<exercise>:<person sortkey>:<person>
# The section and person sortkeys allow us to sort section names and
# personal names according to arbitary whim.
#
$lastexercise = "";
$lastuser = "";
if (!$wheelock) {
    $definitlen = 4;
}
foreach $key (sort alpha keys(%array)) {

    #
    # Dissect a key of form "5:PR:11:5:KSL" into three strings.
    #
    ($discard1, $section, $exercise, $discard2, $person) = split(/:/, $key, 5);

    #
    # Turn a person's initials into a fixed length string
    #
    if (($person =~ m/^\.\./)  && !$wheelock) {
        $person = "   ";
    }
    $person = $person."     ";
    substr($person, $definitlen) = '';

    #
    # Trim leading spaces from $exercise number
    #
    $exercise =~ s/^\s+//;

    #
    # Calculate the leadin for subsequent lines.
    #
    if ($wheelock) {
	$thisleadin = length($section) + 1;
	$thisleadin += length($exercise) + 1 + $definitlen + 1;
    } else {
	$thisleadin = $definitlen + 1;
    }

    #
    # Construct exercise and name strings.
    #
    $thisexercise = $section." ".$exercise;
    $thisuser = $thisexercise." ".$person." ";

    #
    # Determine the separation of paragraphs    
    # Separate different exercises (e.g. PR 11 vs PR 12) by two blank lines
    # (Wheelock mode) or by dashes and two blank lines (translation mode).
    # Translation mode also separates different users within an exercise.
    #   
    if ($thisexercise ne $lastexercise) {
        if ($wheelock) {
            print $OUTPUT "\n\n";
        } else {
            print $OUTPUT ((" " x 32)."------\n\n");
        }
    } else {
        if (!$wheelock && ($thisuser ne $lastuser)) {
            print $OUTPUT "\n";
        }
    }

    #
    # Figure out the string we will use to start this new paragraph
    #
    if ($wheelock) {
        $startstring = $thisuser;
    } else {
        $startstring = $person." ";
    }

    #
    # Print this paragraph.
    #
    $firstword = 1;
    $_ = $array{$key};
    while ($_) {
        #
        # We have a set of space-separated tokens in $_.
        # Peel off the head of the list, put shortened list back in $_.
        #
        if (m/ /) {
            ($token, $remainder) = split(/ /, $_, 2);
            $_ = $remainder;
        } else {
            $token = $_;
            $_ = "";
        }

        #
        # If the token is an ampersand, force a new line right now.
        # Since the next word will prepend a space, drop a space here.
        #
        if ($token eq "&") {
            print $OUTPUT "\n".(" " x ($thisleadin - 1));
            $currcount = $linelength - ($thisleadin - 1);
            next;
        }

        #
        # If the token is ">>", force an indentation
        # Total indentation is 5 spaces: 4 here and 1 with first word.
        #
        if ($token eq ">>") {
            print $OUTPUT "\n".(" " x (4 + $thisleadin));
            $currcount = $linelength - ($thisleadin + 4);
            next;
        }

        #
        # Pretty print
        #
        $tokenlength = length($token);
        if ($firstword) {
            print $OUTPUT $startstring.$token;
            $currcount = $linelength - length($startstring) - $tokenlength;
            $firstword = 0;
        } else {
            if ($tokenlength > $currcount) {
                print $OUTPUT "\n".(" " x $thisleadin).$token;
                $currcount = $linelength - $thisleadin - $tokenlength;
            } else {
                print $OUTPUT " ".$token;
                $currcount -= ($tokenlength + 1);
            }
        }
    }
    print $OUTPUT "\n";
    $lastexercise = $thisexercise;
    $lastuser = $thisuser;
}
close_output();
exit;

#
# open_output
# Open the default output and error files.
# If we're running from a command line, don't create errors.txt file.
# Instead send directly to STDERR.
#
sub open_output {
    $basedir = "";
    if ($macapp) {   # Platypus sets our working directory to "/"
	chdir();
	use Cwd;
	$basedir = getcwd()."/Desktop/";
    }
    $outputtxt = $basedir."output.txt";
    $errorstxt = $basedir."errors.txt";
    open($OUTPUT, ">$outputtxt") or die("Can't open $outputtxt");
    if ($macapp) {
        open($ERROR, ">$errorstxt") or die("Can't open $errorstxt");
    } else {
	$ERROR = STDERR;
    }
}

#
# close_output
# Close any output files, delete any empty ones
#
sub close_output {
    close($OUTPUT);
    unlink($outputtxt) if (-z $outputtxt);
    if (!$cmdline) {
        close($ERROR);
        unlink($errorstxt) if (-z $errorstxt);
    }
}

#
# alpha
# Sort alphabetically, ignoring case
#
sub alpha { lc($a) cmp lc($b) }

#
# load_config ($file)
# Parse contents of the configuration file
#
sub load_config {
    my ($file) = @_;
    my ($token, $argument, $person);
    return if !load_file($file, 0);
    foreach (@lines) {
        next if m/^#/;
        ($token, $argument) = split(/:/, $_, 2);
        $argument =~ s/^\s+//;
        if ($token eq "section") { $sectionlist{$argument} = 1; next; }
        if ($token eq "end")     { push(@endlist, $argument); next; }
        if ($token eq "wheelock") { $allowtranslate = 0; next; }
	if ($token eq "unicode") { $unicode = 1; $linelength = 2000; next; }
	if ($token eq "longlines") { $linelength = 2000; next; }
        if ($token eq "sort") {
            ($person, $_) = split(/[ \t]/, $argument, 2);
            s/^\s+//;
            if ($sortkey{$person} && ($sortkey{$person} ne $_)) {
                print $ERROR "Sorting key conflict for $person:";
                print $ERROR "\"$_\" and \"$sortkey{$person}\"\n";
                next;
            }
            $sortkey{$person} = $_;
            next;
        }
        if ($token eq "person")  {
            ($person, $_) = split(/[ \t]/, $argument, 2);
            s/^\s+//;
            if ($personlist{$person} && ($personlist{$person} ne $_)) {
                print $ERROR "Conflict for $person:";
                print $ERROR "\"$_\" and \"$personlist{$person}\"\n";
                next;
            }
            $personlist{$person} = $_;
            next;
        }
    }
}

#
# load_file($filename, $function)
# Read a text file, massage it, and load it into the @lines array
# We do this to convert EOL conventions: the CR (Macintosh) or CRLF
# goes to the LF (Unix/Windows) to keep ourselves sane on a MacOS X system.
# Optional function for massaging a block of text before splitting it.
# Return 0 on open failure, 1 otherwise.
#
sub load_file {
    print $ERROR "Reading $file\n" if $filedebug;
    my ($file, $function) = @_;
    if (not open(INPUT, $file)) {
        print $ERROR "Can't open $file for input: $!\n";
        return(0);
    }
    select(INPUT);
    undef $/;                   # no input line terminator
    $data = <INPUT>;            # read everything at once
    close(INPUT);
    $/ = "\n";                  # restore terminator just in case
    $data =~ s/\r\n?/\n/g;      # turn CR (Mac) or CRLF into LF (Unix/Windows)
    if ($function) {
        &$function(\$data);
    }
    @lines = split(/\n/, $data, -1); # block of text to array of lines w/o LF
    return(1);
}

#
# fix_data
# Given a block of data with canonical EOL, do other fixups
#
sub fix_data {
    my ($data) = @_;
    
    #
    # If this data has an email header, chop it out by matching from
    # beginning of string until the first blank line.
    #
    if ($$data =~ m/^From:/m) {
        $$data =~ s/(\A.+?\n\n)//s;
    }

    #
    # If this data has a <HTML> tag, chop out most of the HTML cruft
    # by matching from the tag until the end of string.  Ignore case
    # and allow attributes.
    #
    $$data =~ s/<HTML.*\Z//is;
    $$data =~ s/<!DOCTYPE HTML.*\Z//is;

    #
    # Expand any base64 encoded message.
    #
    base64_decode(\$$data);

    #
    # If we're in Unicode mode, decode anything that looks quoted-printable.
    # Our ultimate string will be UTF-8 encoded Unicode.  Test if the data
    # is already UTF-8. If not, assume ISO-8859-1 and convert to UTF-8.
    # Normalize the end of line convention in case we have created new text.
    #
    # April 23, 2015
    # With Perl 5.18 on the Mac Yosemite Release (OS X 10.10) the call below
    # to decode_utf8 started zeroing the buffer $$data.  Making a copy 
    # into $datacopy eliminates the problem.
    #
    if ($unicode) {
        quoted_decode(\$$data);
	$datacopy = $$data;
	eval {  Encode::decode_utf8($datacopy, Encode::FB_CROAK) };
	if ($@) {
	    Encode::from_to($$data, "iso-8859-1", "utf-8");
	}
	$$data =~ s/\r\n?/\n/g;
    } else {
        $$data =~ s/=\n//g;         # quoted-printable ends lines with =

        #
        # Loop over the list of =XX sequences and their replacements.
        # The =XX=XX=XX and =XX=XX sequences are Unicode characters.
	# Do them first.
        # On average this should be faster than doing a s/// for each
        # possibility.  I assume quoted-printable uses capital letters;
	# this avoids erroneous matching of URL fragments.
        #
        foreach $pattern  (qr/=[0-9A-F]{2}=[0-9A-F]{2}=[0-9A-F]{2}/,
			   qr/=[0-9A-F]{2}=[0-9A-F]{2}/,
                           qr/=[0-9A-F]{2}/) {
            while ($$data =~ m/($pattern)/g) {
                $replacement = $replace{$1};
                if ($replacement) {
		    if ($replacement eq "delete") {
			$replacement = "";
		    }
                    $$data =~ s/$1/$replacement/;
                } else {
		    if ($1 eq "=3D=3D=3D") {
			$$data =~ s/$1/===/;
		    } else {
			print $ERROR "Unknown code $1 found in $file\n";
		    }
                }
            }
        }

        #
        # Convert some plaintext Unicode sequences to US-ASCII
	#
	# This should be rewritten as loop driven off a table.
        #
        $$data =~ s/\304\200/A/g;  # A w/ macron
        $$data =~ s/\304\201/A/g;  # a w/ macron
        $$data =~ s/\303\204/A/g;  # A w/ diaersis
        $$data =~ s/\303\244/a/g;  # a w/ diaersis

        $$data =~ s/\304\222/E/g;
        $$data =~ s/\304\223/E/g;
        $$data =~ s/\303\213/E/g;
        $$data =~ s/\303\253/e/g;

        $$data =~ s/\304\252/I/g;
        $$data =~ s/\304\253/I/g;
        $$data =~ s/\303\217/I/g;
        $$data =~ s/\303\257/i/g;

        $$data =~ s/\305\214/O/g;
        $$data =~ s/\305\215/O/g;
        $$data =~ s/\303\226/O/g;
        $$data =~ s/\303\266/o/g;

        $$data =~ s/\305\252/U/g;
        $$data =~ s/\305\253/U/g;
        $$data =~ s/\303\234/U/g;
        $$data =~ s/\303\274/u/g;

        $$data =~ s/\342\200\223/--/g;
        $$data =~ s/\342\200\224/--/g;
        $$data =~ s/\342\200\231/'/g;
        $$data =~ s/\342\200\234/"/g;
        $$data =~ s/\342\200\235/"/g;

        #
        # Convert some fake Windows-style ISO-8859-1 to US-ASCII.
        # While we're here, turn tabs into spaces.
        # Note: \047 is ascii singlequote, \042 doublequote, \055 hyphen
        #
        #  WinLatin1    ASCII
        #  \221  =91     '     open single quote
        #  \222  =92     '     close single quote
        #  \223  =93     "     open double quote
        #  \224  =94     "     close double quote
        #  \225  =95           bullet (no ascii equiv)  (not in table below)
        #  \226  =96     -     hyphen
        #  \227  =97     -     em-dash
        #
        # The AEIOU is standard iso-8859-1 vowels with carets.
        #
        $$data =~tr/\t\221\222\223\224\226\227\342\352\356\364\373/ \047\047\042\042\055\055AEIOU/;

        #
        # Delete some characters entirely
        #
        $$data =~ tr/\177//d;
    }

    #
    # Remove these characters for both settings of $unicode
    #
    $$data =~ s/\000//g;           # null byte
    $$data =~ s/\357\273\277//g;   # byte-order mark

    #
    # Fix a few oddball things globally
    #
    $$data =~ s/&/ & /g;        # ensure spaces around &
    $$data =~ s/>>/ >> /g;      # ensure spaces around >>
    $$data =~ s/^>([^>])/$1/mg; # Remove a single level of forward comment

    #
    # Handle whitespace
    #
    $$data =~ s/[ \t]+/ /g;     # compress white space
    $$data =~ s/^\s+//mg;       # get rid of leading spaces
    $$data =~ s/\s+$//mg;       # get rid of trailing spaces
}

#
# base64_decode
# If the string "base64" is mentioned in our input string, assume we have
# base64 encoded MIME section and attempt to decode its contents.
#
sub base64_decode {
    my ($data) = @_;
    return if (not $$data =~ m/base64/ig);
    @rawarray = split(/\n/, $$data, -1);
    @encarray = ();
    foreach (@rawarray) {
        next if (m|[^A-Za-z0-9+=/]| or $_ eq "");
        push(@encarray, $_);
    }
    $str = join '', @encarray;
    if (length($str) % 4) {
        print $ERROR "Length of base64 data not a multiple of four";
    }
    $str =~ s/=+$//;                        # remove padding
    $str =~ tr|A-Za-z0-9+/| -_|;            # convert to uuencoded format
    $$data = join'', map(unpack("u", chr(32 + length($_)*3/4) . $_),
                         $str =~ /(.{1,60})/gs);
}

#
# quoted_decode
# If we have quoted-printable characters in our input string, decode them.
#
sub quoted_decode {
    my ($data) = @_;
    $$data =~ s/=\n//g;         # quoted-printable ends lines with =
    while ($$data =~ m/(=([0-9A-F]{2}))/g) {
        $result = chr(hex($2));
        $$data =~ s/$1/$result/;
    }
}

#
# finish_paragraph
# Finish gathering up a paragraph in $text, assign to associative array.
# If no paragraph, substitute an empty bracket, []
#
sub finish_paragraph {
    if ($gathering) {
        if ($text eq "") {
            $text = "[...]"
        }
        $array{$key} = $text;
        $gathering = 0;       # no longer gathering text into a paragraph
        $foundtext = 1;       # note that we have found text in this file
        $contriblist{$person} = $personlist{$person}; # note contributor
    }
}
