#!/usr/bin/perl
use strict;

# preprocess
# Perl-script for text preprocessing 
#  - Cuts text into sentences and sentences into words (tokens).
#  - Reads plain text from STDIN and outputs 
#    a list of words separated by newline.
#
# An abbreviation file may be given as input parameter --abbr, 
# the file has special syntax and the usage is connected 
# to other sámi tools.
# 
# $Id$

# Use the local character class. 
# It means that variable \w contains Latin-1 alphabet.
use locale;

# permit named arguments
use Getopt::Long;

my @words;
my %abbrs;
my $TRAB="TRAB";
my $ITRAB="ITRAB";
my $TRNUMAB="TRNUMAB";
my $IDIOM="IDIOM";

# Max size of a multi-word expression.
my $MULTIWORD_SIZE = 3;

# General punctuation, not much used variable.
my $PUNCT = quotemeta("|{}[]()$.*?!,;:'\.\"%");

# Punctuation marks that are always their own tokens,
# whether word or numeral expression.
# There are characters added to this list depending on if
# the processed string is word or numeral.
my $SEPARATE_PUNCT = quotemeta("|{}[]()Ťť?!;,'`\"");

# Regex for tokens that nevertheless contain punctuation that
# is specified in variable $SEPARATE_PUNCT.
my $CONTAIN_PUNCT = 'ja\/dahje|http|:\/\/|km\/h';

my $abbr_file;
my $help=0;
GetOptions ("abbr=s" => \$abbr_file,
			"help" => \$help) ;

if ($help) {
	&print_usage;
	exit;
}

# If the abbreviation file is given,
# read the abbreviations from the file to a hash.
if ($abbr_file) {
	&read_abbr (\%abbrs);
}

# read one paragraph at the time.
# the paragraph ends with two subsequent newlines.
$/ = "";

while (<>) {
	chomp;

	# join hyphenated words separated by newline. Check this!
	s/\w\-\n/\-/g;

	# split the sentence by space.
    my @words = split (/[\s]+/, $_) ;

	while (@words) {
		my $word = shift (@words) ;
		# An array for storing the tokens, each token in its own slot.
		my @tokens;
		
		# store punctuation from the front of the expression
		# to the tokens array. (todo: check variable $CONTAIN_PUNCT)
		if ($word =~ s/^([$SEPARATE_PUNCT]+)//) {
			push (@tokens, split(/ */, $1));
		}
		# Examine the type of the string. If the expression contains
		# alphabetical characters optionally preceded by one non-alphabetic
		# character, it is a word.
		if ($word =~ /^[\W]?[^\W\d]+.*/) {
			process_word ($word, \@words, \@tokens);
		}
		# Otherwise it is processed like numeral.
		elsif ($word) {
			process_numeral ($word, \@words, \@tokens);
		}
		for my $token (@tokens) {
			print "$token\n" unless (! $token);
		}
	}
}


sub process_word {
	my ($word, $words_aref, $tokens_aref) = @_;

	# variable for storing all the punctuation at the end of the word.
	# except dot (for abbreviations) and some other punct. if needed
	my $end_punct;
	# the variable storing dot and the possible some other punct.
	my $rest_punct;

    # Test for multiword expression.
	test_idiom(\$word, $words_aref, $MULTIWORD_SIZE);

    my $end_num;
    if ($word =~ /\.(\d+\.?)$/){
        $word =~ s/(\d+\.?)$//;
        $end_num = $1;
    }


    # Store the punctuation at the end of the string to a variable
	if ($word =~ s/([+=%:$SEPARATE_PUNCT]+)$//) {
		$end_punct = $1
	}

	# Cut the word into tokens if there is punctuation in the middle.
	# e.g. gielddat/guovllut
	# Check for expressions in $CONTAIN_PUNCT -variable.
	if ($word !~ /$CONTAIN_PUNCT/ && (my @parts = split (/([+=%$SEPARATE_PUNCT\/])/, $word))) {
		my $i;
		for ($i=0; $i < $#parts; $i++) {
			push (@{$tokens_aref}, $parts[$i]); 
		}
		# Process the last part further, it may start with punctuation
		# that is separate token.
		$word = $parts[$i];
		if ($word =~ s/^([$PUNCT])//) {
			push (@{$tokens_aref}, $1);
			}
		if (!$word) {
			push (@{$tokens_aref}, split(/ */, $end_punct));
			return;
		}
	}

	# If the punctuation at the end contains a sentence delimiter
	# ? or !, the word ends the sentence in any case, so the word can
	# be treated as a token.
	if ($end_punct =~ /[\?\!\.]/) {
		push (@{$tokens_aref}, $word);
	}
	# Otherwise
	# the word is checked for abbreviation and sentence boundary.
	elsif (! test_abbr($word, $words_aref->[0], $tokens_aref)) {
		# If the word was not an abbreviation, the rest
		# of the punctuation is removed and stored as separate tokens.
		# This operation is for taking the dot out.
		if ($word =~ s/([$SEPARATE_PUNCT\.:]+)$//) {
			$rest_punct =  $1;
		}
		push (@{$tokens_aref}, $word);
		push (@{$tokens_aref}, split(/ */, $rest_punct));			
	}
	push (@{$tokens_aref}, $end_num);
	push (@{$tokens_aref}, split(/ */, $end_punct));
}


sub test_abbr {
	my ($word, $next_word, $tokens_aref) = @_;

	my $abbr = $word;
	if (! ($abbr =~ s/\.$//)) {
		return 0;
	}
	
	# Transitive abbreviations are never followed
	# by sentence boundary.
	if (exists $abbrs{$TRAB}{$abbr}) {
		push (@{$tokens_aref}, $word);
		return 1;
	}
	# There is a sentence boundary if the abbreviation
	# is followed by other than number or non-alphabetic char.
	# Is this necessary? 
	elsif (exists $abbrs{$TRNUMAB}{$abbr}) {
		push (@{$tokens_aref}, $word);
		if ($next_word !~ /^[0-9]/ ) {
			push (@{$tokens_aref}, ".");
		}
		return 1;
	}
	# There is a sentence boundary if intransitive abbreviation 
	# is NOT followed by a small alphabetic char.
	elsif (exists $abbrs{$ITRAB}{$abbr}) {
		push (@{$tokens_aref}, $word);
		if ($next_word !~ /^[[:lower:]]/o) {
			push (@{$tokens_aref}, ".");
		}
		return 1;
	}
	# Check for abbreviation with 2-3 capital letters
	# If followed by a capital letter or number,
	# There is a sentence boundary. todo: Remove this as irrelevant!!
	elsif ($abbr =~ /^[[:upper:]1]{2,3}$/o ) {
		if ($next_word !~ /^[[:lower:]]/ ) {
			push (@{$tokens_aref}, $abbr);
			push (@{$tokens_aref}, ".");
		}
		else {
			push (@{$tokens_aref}, $word);
		}
		return 1;
	}
	return 0;
}

# If the word starts an multiword expression, it is replaced
# with the expression. The other parts are
# removed from the words array.

sub test_idiom {
	my ($word_ref, $words_aref, $size) = @_;

	my $token = $$word_ref;
	# Test for multiword expressions by growing the token
	# one word at a time
	my $i=0;
	while ($i < $size && $words_aref->[$i]) {
		$token = $token . " " . $words_aref->[$i];

		# Remove the punctuation at the end of the expression.
		(my $token_2 = $token) =~ s/[^\w]*$//;
		# Test if the formed multiword expression exists in the
		# idiom list. Test also lower case version.		 
		if (exists $abbrs{$IDIOM}{$token_2} || exists $abbrs{$IDIOM}{lcfirst($token_2)} ) {
			for (my $j=0; $j <= $i; $j++) {
				# Construct the new multiword processing unit.
				# Remove the parts of the multiword expression from
				# the word array.
				$$word_ref = $$word_ref . " " . $words_aref->[0];
				shift @{$words_aref};
			}
		}
		$i++;
	}
}


sub process_numeral {
	my ($word, $words_aref, $tokens_aref) = @_;

	# Punctuation that connects two or more numerals 
	# into one numeral expressions.
	my $NUM_PUNCT=quotemeta("-+*=/");

	# Combine punctuation with numeral if followed by other numeral.
	# cases like 123- 456 and 123 -456 and 123 456 and 123 - 456
	while (($word =~ /^[\d$NUM_PUNCT ]+$/) && ($words_aref->[0] =~ /^[\d$NUM_PUNCT ]+$/)) {
		$word = $word . " " . $words_aref->[0];
		shift @{$words_aref};
	}

	# Combine percent sign to the numeral when separate.
	# covers cases like: 50 %
	if (($word =~ /\d$/) && ($words_aref->[0] =~ /^\%/)) {
		$word = $word . " " . $words_aref->[0];
		shift @{$words_aref};
	}
	# Clean first the end of the token, where punctuation follows
	# the numeral or an ordinal and does not belong to the expression.
	# cases like: 123! and 123.), 123). 10,-. 
    # problem: 123.? and 123 is an ordinal.
	my $end_punct;
	if ($word =~ s/([$SEPARATE_PUNCT:]+\.?)$//){
		$end_punct = $1;
	}
	if ($word =~ /([$SEPARATE_PUNCT:\-\%]+\.?)$/){
		$word =~ s/(\.?)$//;
		$end_punct = $1;
	}

	# Check if there is an abbreviation or a word attached to
	# numeral.
	my $end_word;
	if ($word =~ /\.(\w+\.?)$/){
		$word =~ s/(\w+\.?)$//;
		$end_word = $1;
	}

	my $rest_punct;
	# Check for ending dot, it is a separate token if 
	# the following word starts with capital letter. Or if there is
	# case ending. If there is an abbreviation with numeral, don't
	# do checking.
	# Otherwise belongs to the expression.
	if (!defined $end_word) {
	if ((($words_aref->[0] !~ /^[[:lower:]\d]/o) && ($word =~ s/(\.)$// )) || ($word =~ s/(:\w+)(\.)/$1/)) {
		$end_punct = "." . $end_punct;

		# Clean the rest of the token:
		# cases like 123). where dot is preceded by punctuation.
		if ($word =~ s/([$SEPARATE_PUNCT:]+)$//) {
			$rest_punct =  $1;
		}
	}
    }

	# Push everything to the tokens array.
	push (@{$tokens_aref}, $word);
	push (@{$tokens_aref}, split(/ */, $rest_punct));
	push (@{$tokens_aref}, split(/ */, $end_punct));

    if (defined ($end_word)){
        process_word ($end_word, $words_aref, $tokens_aref);
    }

}


sub read_abbr {
	my ($abbr_href) = @_;
	
	local $/="\n";
    # The filename may contain a tilde ~
    # Expand the filename manually
	$abbr_file =~ s{^~([^/])*} {
		$1
			? (getpwnam($1))[7]
			: ($ENV{HOME} || (getpwuid($>))[7]
			   )
		}ex;

	open LEX, "< $abbr_file" or die "Cant open the file: $!\n";

	my $current;
	while (<LEX>) {
		chomp;
		if (/^LEXICON (.*)$/) {
			$current = $1;
			next;
		}
		$$abbr_href{$current}{$_} = 1;
	}
	close LEX;	
}

sub print_usage {
	print "Usage: preprocess --abbr=<file_name>\n";
	print "Split text into sentences and words.\n"
}