# Util.pm 
# Utility functions for the using the different langtech tools.

package langTools::Util;

use utf8;

use Encode;
use warnings;
use strict;
use Carp qw(cluck);

use Exporter;
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK);

$VERSION = sprintf "%d.%03d", q$Revision$ =~ /(\d+)/g;
@ISA         = qw(Exporter);

@EXPORT = qw(&init_lookup &call_lookup &read_tags &generate_taglist &win_digr &digr_utf8 &format_compound);
@EXPORT_OK   = qw();

# Initialize expect object for analysis
# Returns a pointer to the object.
sub init_lookup {
	my ($command) =  @_;

	if (! $command) { cluck "No command specified"; }
	my $exp = Expect->spawn($command)
		or cluck "Cannot spawn $command";
	$exp->log_stdout(0);
	
	return $exp;
}


# Call expect object with a string.
# Returns the analysis.
sub call_lookup {
	my ($exp_ref, $string, $decode)  = @_;
	
	if (! $$exp_ref) { cluck "The expect object missing"; }

	if($decode) { $string = Encode::encode_utf8($string); }

	$$exp_ref->send("$string\n");
	$$exp_ref->expect(undef, '-re', '\r?\n\r?\n' );

	my $read_anl = $$exp_ref->before();

	if($decode) { $read_anl = Encode::decode_utf8($read_anl);}

	# Take away the original input.
	$read_anl =~ s/^.*?\n//;
	# Replace extra newlines.
	$read_anl =~ s/\r\n/\n/g;
	$read_anl =~ s/\r//g;

	return "$read_anl\n";

}


# Read the grammar for  paradigm tag list.
# Call the recursive function that generates the tag list.
sub generate_taglist {
	my ($gramfile, $tagfile, $taglist_aref) = @_;

	my @grammar;
    my %tags;

	if ($gramfile) {
		# Read from tag file and store to an array.
		open GRAM, "< $gramfile" or die "Cant open the file $gramfile: $!\n";
		my @tags;
		my $tag_class;
	  GRAM_FILE:
		while (<GRAM>) {
			chomp;
			next if /^\s*$/;
			next if /^%/;
			next if /^$/;
			next if /^#/;
			
			s/\s*$//;
			push (@grammar, $_);		
		}
	}
	read_tags($tagfile, \%tags);
	
	my @taglists;
	# Read each grammar rule and generate the taglist.
	for my $gram (@grammar) {
		my @classes = split (/\+/, $gram);
		my $pos = shift @classes;
		my $tag = $pos;
		my @taglist;

		generate_tag($tag, \%tags, \@classes, \@taglist);
		push(@{$$taglist_aref{$pos}}, @taglist );
	}
#	for my $pos ( keys %$taglist_aref ) {
#		print "\nJEE $pos OK @{$$taglist_aref{'Pron'}} ";
#	}
}

# Ttravel recursively the taglists and generate
# the tagsets for pardigm generation.
# The taglist is stored to the array reference $taglist_aref.
sub generate_tag {
	my ($tag, $tags_href, $classes_aref, $taglist_aref) = @_;

	if (! @$classes_aref) { push (@$taglist_aref, $tag); return;  }
	my $class = shift @$classes_aref;
	if ($class =~ s/\?//) {
		my $new_tag = $tag;
		my @new_class = @$classes_aref;
		generate_tag($new_tag, $tags_href, \@new_class, $taglist_aref);
	}

	if (! $$tags_href{$class}) { 
		my $new_tag = $tag . "+" . $class;
		my @new_class = @$classes_aref;
		generate_tag($new_tag, $tags_href, \@new_class, $taglist_aref);
		return;
	}

	for my $t (@{$$tags_href{$class}}) {
		my $new_tag = $tag . "+" . $t;
		my @new_class = @$classes_aref;
		generate_tag($new_tag, $tags_href, \@new_class, $taglist_aref);
	}
}			

# Read the morphological tags from a file (korpustags.txt)
sub read_tags {
	my ($tagfile, $tags_href) =  @_;

	# Read from tag file and store to an array.
	open TAGS, "< $tagfile" or die "Cant open the file $tagfile: $!\n";
	my @tags;
	my $tag_class;
  TAG_FILE:
	while (<TAGS>) {
		chomp;
		next if /^\s*$/;
		next if /^%/;
		next if /^$/;
		next if /=/;
		
		if (s/^#//) {

			$tag_class = $_;
			push (@{$$tags_href{$tag_class}}, @tags);
			undef @tags;
			pop @tags;
			next TAG_FILE;
		}
		my @tag_a = split (/\s+/, $_);
		push @tags, $tag_a[0];
 	}

	close TAGS;
}

# Form baseforms of analyzed compounds.
sub format_compound {
	my ($refbase, $refline, $refword) = @_;

	my $boundary_count = ($$refbase =~ tr/\#//);

	# Take only the analysis of the last part of the compound
	$$refbase =~ /^.*\#(.*?)$/;
	my $last_word = $1;
	if(! $last_word) { return; }

	my $second_last_word;
	my $third_last_word;
	if ($boundary_count > 1) {
		if ( $$refbase =~ /^.*\#(.*?)\#.*$/ ) {
			$second_last_word = $1;
		}
		if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*$/ ) {
			$third_last_word = $1;
		}
	}
	my $i=4;
	my $substring = substr($last_word,0,$i);

	while (($$refword !~ m/\Q$substring\E/) && $i > 1 ) {
		$i--;
		$substring = substr($last_word,0,$i);
	}

	if ($$refword =~ m/\Q$substring\E/) {
		# If the compound boundary is found, 
		# replace the last word by its base form, and insert a # mark in front of
		# it, in order to mark the result as a compound.
		my $orig = $$refbase;
		$$refbase = $$refword;
		$$refbase =~ s/(^.*)\Q$substring\E.*$/$1\#$last_word/;
		if ($orig =~ m/^\p{isLower}/) {
			$$refbase = lcfirst($$refbase);
		}
	}

	if ($second_last_word) {
		my $i=4;
		my $substring = substr($second_last_word,0,$i);
		while ($$refbase !~ /.*\Q$substring\E.*\#/ && $i>1 ) {
			$i--;
			$substring = substr($second_last_word,0,$i);
		}
		# If the compound boundary is found, mark it with #
		if ($$refbase =~ /\Q$substring\E/) {
			$$refbase =~ s/(\Q$substring\E.*\#)/\#$1/;
		}
	}

	if ($third_last_word) {
		my $i=4;
		my $substring = substr($third_last_word,0,$i);
		while ($$refbase !~ /.*\Q$substring\E.*\#.*\#/ && $i>1 ) {
			$i--;
			$substring = substr($third_last_word,0,$i);
		}
		# If the compound boundary is found, mark it with #
		if ($$refbase =~ /\Q$substring\E/) {
			$$refbase =~ s/(\Q$substring\E.*\#.*\#)/\#$1/;
		}
	}
}


# Some character set conversion routines, rarely used nowadays.
# Convert windows charachters to Sami digraphs
sub win_digr {
	my $ctext  = shift(@_);

	$ctext =~ s/\212/S1/g ;
	$ctext =~ s/\232/s1/g ;
	$ctext =~ s/\216/Z1/g ;
	$ctext =~ s/\236/z1/g ;

	return $ctext;
}


sub digr_utf8 {
	my $ctext = shift(@_);

	$ctext =~ s/A1/Á/g ;
	$ctext =~ s/a1/á/g ;
	$ctext =~ s/C1/Č/g ;
	$ctext =~ s/c1/č/g ;
	$ctext =~ s/D1/Đ/g ;
	$ctext =~ s/d1/đ/g ;
	$ctext =~ s/N1/Ŋ/g ;
	$ctext =~ s/n1/ŋ/g ;
	$ctext =~ s/S1/Š/g ;
	$ctext =~ s/s1/š/g ;
	$ctext =~ s/T1/Ŧ/g ;
	$ctext =~ s/t1/ŧ/g ;
	$ctext =~ s/Z1/Ž/g ;
	$ctext =~ s/z1/ž/g ;
	
	return $ctext;
}


1;

__END__