#!/usr/bin/perl
use strict;

# lookup2cg
# Perl-script for converting lookup-output to CG-2 input.
# - Rates and removes compound analyses according to 
#   the number of word boundaries.
# - Reformats compound analyses and base forms, removes duplicates
#
# Input: 
# Dan     dat+Pron+Dem+Sg+Acc
# Dan     dat+Pron+Dem+Sg+Gen
#
# Output:
# "<Dan>"
#        "dat" Pron Dem Sg Acc
#        "dat" Pron Dem Sg Gen
#
# $Id$

my @Analyses;
my $word;
my $tmp;
my $compound = 0;

# Read while not eol
while(<>) {	
	# While line not empty, store word to a scalar and
	# analyses into an array.
	if (! /^\s*$/) {
		chomp;
		if (/#/) { 
			$compound = 1; 
		} 
		# store the word into a variable
		# and the analysis part into an array.
		$_ =~ /^(.*?)\t(.*)$/;
		$word = $1;
		push @Analyses, $2;
	}
	# if empty line, format the output and print
	else {
		# Rate compounds accoring to word boundaries
		if ($compound) { 
			&rate_compounds(\@Analyses); 
		}
		# Format each analysis
		for my $line (@Analyses) {			
			# Separate the base form from the analysis
			if ( $line =~ /^(.*?)\s*\+(.*)$/o ) {
				my $base = $1;
				$line = $2;
				$line =~ s/\+/ /go;				
				# If line contains a compound
				if ( $line =~ /#/ ) { 
					 &format_compound(\$base, \$line, \$word); 
				 }
				# Format output
				$line =  "\t" . " \"$base\" " . $line . "\n"; 
			}
		} 
		#Remove identical analyses
		&remove_identical(\@Analyses);

		# Mark derivational tags
		&mark_deriv_tags(\@Analyses);
		
		# Print output
		print "\"<$word>\"\n";
		for my $line (@Analyses)
		{ print $line; }
		
		#TODO: problem: this is one element in the array.
		@Analyses = 0;
		pop @Analyses;
		$compound = 0;
	}
} # end of while

sub remove_identical {
	my $aref = shift @_;

	# Remove identical analyses
	for my $i ( 0 .. $#{ $aref } ) {
		for my $j ( $i+1 .. $#{ $aref } ) {
			if ( $aref->[$i] eq $aref->[$j] )
			{ $aref->[$j] = undef; }
		}
	}
}

sub rate_compounds {
	my $aref = shift @_;
	
	# Rate compounds and remove extra readings.
	# First mark the number of words in the compound
	# to the beginning of the string
	my $word_count;
	my $best_word_count = 5;
	for my $line (@$aref) {
		$word_count = split ("#", $line);
		$line = $word_count . $line;
		if ( $word_count < $best_word_count )
		{ $best_word_count = $word_count; }
	}
	# Then sort the table
	# And remove the readings with too many #'s
	@$aref = sort {$a <=> $b} @$aref;
	for my $line (@$aref) {
		if ($line !~ /^$best_word_count/)
		{ $line = undef; }
	}
	# Remove the extra digits
	for my $line (@$aref) 
	{ $line =~ s/^\d(.*)$/$1/o; }
}

sub format_compound {
	my ($refbase, $refline, $refword) = @_;

	# Take only the analysis of the last part of the compound
	$$refline =~ s/^.*#(.*?)\s(.*$)/$2/o;
	my $last_word = $1;

	# Search for the 3 first letters of the last base form
	# from the original compound 
	my $substring = substr($last_word,0,3);
	
	# If some of the 3 letters is 2-letter garpheme
	# take 4 letters instead of 3
	if($substring =~ /c1|s1|z1|d1|t1/o) {
		$substring = substr($last_word,0,4);
	}
	# If the compound boundary is found, 
	# replace the last word by its base form, and insert a # mark in front of
    # it, in order to mark the result as a compound.
	if ($$refword =~ m/$substring/) {
		$$refbase = $$refword;
		$$refbase =~ s/(^.*)$substring.*$/$1#$last_word/;
	}
    # If the compound boundary is not found, 
	# use "last resort": 2 letters instead of 3 or 4.
	else {
		$substring = substr($last_word,0,2);
		if ($$refword =~ m/$substring/) {
			$$refbase = $$refword;
			$$refbase =~ s/(^.*)$substring.*$/$1#$last_word/;

		}		
	}
#	if ($$refline !~ /Prop/) {
#		print "JEE\n";
#		$$refbase =~ s/[A-Z鼴秶傒/[a-z槼欉鱕/;
#	}
}

sub mark_deriv_tags {
	my $aref = shift @_;

	foreach my $line ( @{$aref} ) {
		$line =~ s/(\s(?:V|N|Adv|A))(\s|\s.*?\s)((?:V|N|Adv|A)\s)/$1\*$2$3/;
		$line =~ s/(\s(?:V|N|Adv|A)\*(?:\s|\s.*?\s)(?:V|N|Adv|A))((?:\s|\s.*?\s)(?:V|N|Adv|A)\s)/$1\*$2/;
	}
}