#!/usr/bin/perl
use strict;
use Getopt::Long;
use Data::Dumper;
#use Switch;
use utf8;

# lookup2cg
# Perl-script for converting lookup-output to CG-2 input.
# - Rates and removes compound analyses according to 
#   the number of word boundaries.
# - Reformats compound analyses and base forms, removes duplicates
#
# Input: 
# Dan     dat+Pron+Dem+Sg+Acc
# Dan     dat+Pron+Dem+Sg+Gen
#
# Output:
# "<Dan>"
#        "dat" Pron Dem Sg Acc
#        "dat" Pron Dem Sg Gen
#
# $Id$

my $dict=0;
my $der=0;
my $help;

Getopt::Long::Configure ("bundling");
GetOptions ("der|r" => \$der,
			"dict|d" => \$dict,
			"help" => \$help);

if ($help) {
	&print_help;
	exit 1;
}


$/ = "";

# Read while not eol
while(<>) {
  if(/\t/) {s/ /_/g;} # Shall not be placed here, but in word form making
  my @Analyses;
  my %forms;
  my %rated;
  
  my $word;
  my $comp_rate = 0;
  my $max_comp_rate = 0;
  
  my @lines = split(/\n/, $_);
  for my $line (@lines) {
    next if ($line =~ /^\s*$/);
    # store word to a scalar and
    # analysis (=base form and tags) to an array
    my $analysis;
    ($word, $analysis) = split(/\t/, $line, 2);
    next if ! $analysis;
    $analysis =~ tr/\t//d;
    push @Analyses, $analysis;
    $comp_rate = ($line =~ tr/\#//);
    if ($comp_rate > $max_comp_rate) { 
      $max_comp_rate = $comp_rate; 
    }
  }
  # format the output and print
  if ($max_comp_rate > 0) { 
    while (@Analyses) {
      my $line = pop @Analyses;
      $rated{$line} = 1;
    }
    pop @Analyses;
    rate_compounds(\%rated); 
    # If dictionary use, try to prefer lexicalized compounds
    if ( $dict ) { select_lexicalized(\%rated); }
    for my $key (keys %rated) {
      push @Analyses, $key;
    }
  }
  
  # Format each analysis
  for my $line (@Analyses) {
    # Separate the base form from the analysis
    my $base;
    my $line2;
    my $plus;
    my $comp_rate = ($line =~ tr/\#//);
    if ( $comp_rate > 0) {
      $line =~ s/\#\+Der\d\+Der\//\#/g;
      $line =~ /^(.*\#.*?)\+(.*)$/;
      $base = $1;
      $line2 = $2;
    }
    else { 
      if ($line =~ s/^\+//) { $plus = "+"; }
      ($base, $line2) = split(/\+/, $line, 2);
    }
    $line2 =~ tr/+/ /;
    $base =~ tr/+//;
    $line2 =~ s/=/ = /g; # relevant to kal XXX
    #		$base =~ s/=/ = /g;  # relevant to kal 
    if ($plus) { $base = $plus . $base; }
    
    # If line contains a compound
    if ( $comp_rate > 0 && ! $der && ! $dict) {
      format_compound(\$base, \$line2, \$word); 
    }
    # Mark derivational tags
    # 1. basic tags
    1 while $line2 =~ s/\b(V|N|Adv|A)(\ |\ .*?\ )(V|N|Adv|A)\b/$1\*$2$3/g;
    
    # 2. star IV if last occurence is TV (causative derivation)
    1 while $line2 =~ s/\b(IV)(\ |\ .*?\ )(TV)\b/$1\*$2$3/g;
    
    #(Fixes to bug 954)
    # 3. star TV if last occurence is IV (passive derivation)
    1 while $line2 =~ s/\b(TV)(\ |\ .*?\ )(IV)\b/$1\*$2$3/g;

    # 4. star TV|IV if last occurence is N (nominal derivation)
    1 while $line2 =~ s/\b(TV|IV)(\ |\ .*?\ )(N)\b/$1\*$2$3/g;

    # 5. star IV if last occurence is IV (derivation without transitivity change, e.g. 'gildojuvvon')
    1 while $line2 =~ s/\b(IV)(\ |\ .*?\ )(IV)\b/$1\*$2$3/g;

    #Format output
    $line =  "\t" . " \"$base\" " . $line2 . "\n";
    
    # Store the analysis to a hash
    if ($base) { $forms{$line} = 1; }
  }
  
  # Print output
  if (@Analyses) {
    print "\"<$word>\"\n";
    for my $line (keys %forms) 
      { 
	$line =~ s/("[^"]+?)\s+([^"]+?")/$1_$2/g;
	print "$line"; }
  }
} # end of while


sub rate_compounds {
	my $href = shift @_;
	
	# Rate compounds and remove extra readings.
	my $min_boundary_count=5;
	my $boundary_count=0;
	my $der_count=0;
	for (keys %$href) {
		$boundary_count = tr/\#//;
		$der_count = () = $_ =~ /Der\//g;

		$boundary_count += $der_count;
		# The derivation is not preferred.
		#if (/Der\d\+Der/) { $boundary_count++; }
		if ( $boundary_count < $min_boundary_count )
		{ $min_boundary_count = $boundary_count; }
	}

	for (keys %$href) {
		$boundary_count = 0;
		$der_count = 0;
		if ($der) {
			next if /\b(V|N|Adv|A)(\ |\ .*?\ )(V|N|Adv|A)\b/;
		}
		$boundary_count = tr/\#//;
		$der_count = () = $_ =~ /Der\//g;
		#cip: commented out the next line,
		# which would not delete the Der-lines (see bug 1283)
		#$boundary_count += $der_count;
		if ( $boundary_count > $min_boundary_count )
		{ delete($$href{$_}); }
	}
}

sub select_lexicalized {
	my $href = shift @_;

	my %lexicalized;

	# Select the forms which do not have any compound tags
	for (keys %$href) { 
		next if (m/\+.*\#/);
		$lexicalized{$_} = 1;
	}
	# If there were lexicalized compounds, delete others from output
	if (%lexicalized) {
		for my $k (keys %$href) {
			if (! $lexicalized{$k}) { delete($$href{$k}); }
		}
	}
}

sub format_compound {
	my ($refbase, $refline, $refword) = @_;
	my $boundary_count = ($$refbase =~ tr/\#//);

	# Take only the analysis of the last part of the compound
	$$refbase =~ /^(.*)\#([^\#]+)$/;
	my $last_word = $2;
	if(! $last_word) { return; }

	my $second_last_word;
	my $third_last_word;
	my $fourth_last_word;
	if ($boundary_count > 1) {
		if ( $$refbase =~ /^.*\#(.*?)\#.*$/ ) {
			$second_last_word = $1;
		}
		if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*$/ ) {
			$third_last_word = $1;
		}
		if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*\#.*$/ ) {
			$fourth_last_word = $1;
		}
	}
	my $i=4;
	# this is the problem with Bug 1746: the substring
	my $substring = substr($last_word,0,$i);
	while ($i > 1) {
	    if ($$refword =~ m/.*(\Q$substring\E)/) {
		my $pos = rindex $$refword,$substring,;
		#exit the search only when the substring is not in the beginning of the word.
		if ($pos > 1) { last; }
	    }
	    $i--;
	    $substring = substr($last_word,0,$i);
	}

	#trigger Mázecabaret compounds
	my $substring_trigger;

	if ($$refword =~ m/.*\Q$substring\E/) {
	  # If the compound boundary is found, 
	  # replace the last word by its base form, and insert a # mark in front of
	  # it, in order to mark the result as a compound.
	  my $orig = $$refbase;
	  $$refbase = $$refword;
	  my $substring_counter = () = $last_word =~ /$substring/gi;
	  # bug fix for bug 949: input 'vuertemetjiehtjele'
	  # test: echo 'vuertemetjiehtjele' | usma VS echo 'vuertemetjiehtjele' | usma | lookup2cg
	  $$refbase =~ s/(^.*)(\Q$substring\E.*){$substring_counter}$/$1\#$last_word/;
	  if ($orig =~ m/^\p{isLower}/) {
	    $$refbase = lcfirst($$refbase);
	  }
	  $substring_trigger = 'head_initial';
	}

	if (!$substring_trigger)  {
	  #last_word don't match the refword: Mázecabaret vs. Máze#kabarea issue 
	  
	  (my $lemma_initial, my $lemma_rest) = $substring =~ /^(\w)(.*)$/;
	  my $wordform_initial;
	  
	  if ($lemma_initial eq 'k') {
	    $wordform_initial='c';
	  } elsif ($lemma_initial eq 'b') {
	    $wordform_initial='p';
	  } elsif ($lemma_initial eq 'd') {
	    $wordform_initial='t';
	  } elsif ($lemma_initial eq 'g') {
	    $wordform_initial='k';
	  }
	    # else {
	    #	    print STDERR "Nothing to map\n";
	    #	  }
	    
	  #commented out due to installation problems with Trond's configuration
	  #switch($lemma_initial){
	  #  case "k" { $wordform_initial='c'; }
	  #  case "b" { $wordform_initial='p'; }
	  #  case "d" { $wordform_initial='t'; }
	  #  case "g" { $wordform_initial='k'; }
	  #  #else { print "previous case not true" }
	  #}
	  
	  my $wf_substring = $wordform_initial+$lemma_rest;
	  
	  if ($refword =~ m/.*\Q$wf_substring\E/) {
	    my $orig = $$refbase;
	    $$refbase =~ s/(^[^\+]+).*$/$1\#$last_word/;
	    
	    # coping with geassi vs. geasse issue
	    my $first_l_part = lc($1);
	    my $first_wf_part = lc(substr($$refword, 0, length($first_l_part)));
	    
	    if ($first_l_part ne $first_wf_part) {
	      $$refbase = $first_wf_part.'#'.$last_word;
	    }

	    if ($orig =~ m/^\p{isLower}/) {
	      $$refbase = lcfirst($$refbase);
	    }
	  }
	}

	if ($second_last_word) {
		my $i=4;
		my $substring = substr($second_last_word,0,$i);
		while ($$refbase !~ /.*\Q$substring\E.*\#/ && $i>1 ) {
			$i--;
			$substring = substr($second_last_word,0,$i);
		}
		# If the compound boundary is found, mark it with #
		if ($$refbase =~ /.*\Q$substring\E/) {
		  # bug fix for bug 2011; test input string: guovloguovddášsuohkanat
		  # test: echo 'gieđagieđasuohkan' | usme | lookup2cg
		  # test: echo 'guovloguovddášsuohkanat' | usme | lookup2cg
			$$refbase =~ s/(.+)(\Q$substring\E.*\#)/$1\#$2/;
		}
	}
	if ($third_last_word) {
		my $i=4;
		my $substring = substr($third_last_word,0,$i);
		while ($$refbase !~ /.*\Q$substring\E.*\#.*\#/ && $i>1 ) {
			$i--;
			$substring = substr($third_last_word,0,$i);
		}
		# If the compound boundary is found, mark it with #
		if ($$refbase =~ /.*\Q$substring\E/) {
			$$refbase =~ s/(.*)(\Q$substring\E.*\#.*\#)/$1\#$2/;
		}
	}
	if ($fourth_last_word) {
		my $i=4;
		my $substring = substr($fourth_last_word,0,$i);
		print "$substring\n";
		while ($$refbase !~ /.*\Q$substring\E.*\#.*\#.*\#/ && $i>1 ) {
			$i--;
			$substring = substr($fourth_last_word,0,$i);
		}
		# If the compound boundary is found, mark it with #
		if ($$refbase =~ /.*\Q$substring\E/) {
			$$refbase =~ s/(\Q$substring\E.*\#.*\#.*\#)/\#$1/;
		}
	}
}

sub print_help {
    print "Usage: lookup2cg [OPTIONS] [FILES]\n";
    print "Options:\n";
    print "    --der=<DERIVATION>  Derivation\n";
    print "    --dict=<DICTIONARY> Dictionary\n";
    print "    --help              Print this message and exit.\n";
};