#!/usr/bin/perl use strict; use Getopt::Long; use Data::Dumper; #use Switch; use utf8; # lookup2cg # Perl-script for converting lookup-output to CG-2 input. # - Rates and removes compound analyses according to # the number of word boundaries. # - Reformats compound analyses and base forms, removes duplicates # # Input: # Dan dat+Pron+Dem+Sg+Acc # Dan dat+Pron+Dem+Sg+Gen # # Output: # "" # "dat" Pron Dem Sg Acc # "dat" Pron Dem Sg Gen # # $Id$ my $dict=0; my $der=0; my $help; Getopt::Long::Configure ("bundling"); GetOptions ("der|r" => \$der, "dict|d" => \$dict, "help" => \$help); if ($help) { &print_help; exit 1; } $/ = ""; # Read while not eol while(<>) { if(/\t/) {s/ /_/g;} # Shall not be placed here, but in word form making my @Analyses; my %forms; my %rated; my $word; my $comp_rate = 0; my $max_comp_rate = 0; my @lines = split(/\n/, $_); for my $line (@lines) { next if ($line =~ /^\s*$/); # store word to a scalar and # analysis (=base form and tags) to an array my $analysis; ($word, $analysis) = split(/\t/, $line, 2); next if ! $analysis; $analysis =~ tr/\t//d; push @Analyses, $analysis; $comp_rate = ($line =~ tr/\#//); if ($comp_rate > $max_comp_rate) { $max_comp_rate = $comp_rate; } } # format the output and print if ($max_comp_rate > 0) { while (@Analyses) { my $line = pop @Analyses; $rated{$line} = 1; } pop @Analyses; rate_compounds(\%rated); # If dictionary use, try to prefer lexicalized compounds if ( $dict ) { select_lexicalized(\%rated); } for my $key (keys %rated) { push @Analyses, $key; } } # Format each analysis for my $line (@Analyses) { # Separate the base form from the analysis my $base; my $line2; my $plus; my $comp_rate = ($line =~ tr/\#//); if ( $comp_rate > 0) { $line =~ s/\#\+Der\d\+Der\//\#/g; $line =~ /^(.*\#.*?)\+(.*)$/; $base = $1; $line2 = $2; } else { if ($line =~ s/^\+//) { $plus = "+"; } ($base, $line2) = split(/\+/, $line, 2); } $line2 =~ tr/+/ /; $base =~ tr/+//; $line2 =~ s/=/ = /g; # relevant to kal XXX # $base =~ s/=/ = /g; # relevant to kal if ($plus) { $base = $plus . $base; } # If line contains a compound if ( $comp_rate > 0 && ! $der && ! $dict) { format_compound(\$base, \$line2, \$word); } # Mark derivational tags # 1. basic tags 1 while $line2 =~ s/\b(V|N|Adv|A)(\ |\ .*?\ )(V|N|Adv|A)\b/$1\*$2$3/g; # 2. star IV if last occurence is TV (causative derivation) 1 while $line2 =~ s/\b(IV)(\ |\ .*?\ )(TV)\b/$1\*$2$3/g; #(Fixes to bug 954) # 3. star TV if last occurence is IV (passive derivation) 1 while $line2 =~ s/\b(TV)(\ |\ .*?\ )(IV)\b/$1\*$2$3/g; # 4. star TV|IV if last occurence is N (nominal derivation) 1 while $line2 =~ s/\b(TV|IV)(\ |\ .*?\ )(N)\b/$1\*$2$3/g; # 5. star IV if last occurence is IV (derivation without transitivity change, e.g. 'gildojuvvon') 1 while $line2 =~ s/\b(IV)(\ |\ .*?\ )(IV)\b/$1\*$2$3/g; #Format output $line = "\t" . " \"$base\" " . $line2 . "\n"; # Store the analysis to a hash if ($base) { $forms{$line} = 1; } } # Print output if (@Analyses) { print "\"<$word>\"\n"; for my $line (keys %forms) { $line =~ s/("[^"]+?)\s+([^"]+?")/$1_$2/g; print "$line"; } } } # end of while sub rate_compounds { my $href = shift @_; # Rate compounds and remove extra readings. my $min_boundary_count=5; my $boundary_count=0; my $der_count=0; for (keys %$href) { $boundary_count = tr/\#//; $der_count = () = $_ =~ /Der\//g; $boundary_count += $der_count; # The derivation is not preferred. #if (/Der\d\+Der/) { $boundary_count++; } if ( $boundary_count < $min_boundary_count ) { $min_boundary_count = $boundary_count; } } for (keys %$href) { $boundary_count = 0; $der_count = 0; if ($der) { next if /\b(V|N|Adv|A)(\ |\ .*?\ )(V|N|Adv|A)\b/; } $boundary_count = tr/\#//; $der_count = () = $_ =~ /Der\//g; #cip: commented out the next line, # which would not delete the Der-lines (see bug 1283) #$boundary_count += $der_count; if ( $boundary_count > $min_boundary_count ) { delete($$href{$_}); } } } sub select_lexicalized { my $href = shift @_; my %lexicalized; # Select the forms which do not have any compound tags for (keys %$href) { next if (m/\+.*\#/); $lexicalized{$_} = 1; } # If there were lexicalized compounds, delete others from output if (%lexicalized) { for my $k (keys %$href) { if (! $lexicalized{$k}) { delete($$href{$k}); } } } } sub format_compound { my ($refbase, $refline, $refword) = @_; my $boundary_count = ($$refbase =~ tr/\#//); # Take only the analysis of the last part of the compound $$refbase =~ /^(.*)\#([^\#]+)$/; my $last_word = $2; if(! $last_word) { return; } my $second_last_word; my $third_last_word; my $fourth_last_word; if ($boundary_count > 1) { if ( $$refbase =~ /^.*\#(.*?)\#.*$/ ) { $second_last_word = $1; } if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*$/ ) { $third_last_word = $1; } if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*\#.*$/ ) { $fourth_last_word = $1; } } my $i=4; # this is the problem with Bug 1746: the substring my $substring = substr($last_word,0,$i); while ($i > 1) { if ($$refword =~ m/.*(\Q$substring\E)/) { my $pos = rindex $$refword,$substring,; #exit the search only when the substring is not in the beginning of the word. if ($pos > 1) { last; } } $i--; $substring = substr($last_word,0,$i); } #trigger Mázecabaret compounds my $substring_trigger; if ($$refword =~ m/.*\Q$substring\E/) { # If the compound boundary is found, # replace the last word by its base form, and insert a # mark in front of # it, in order to mark the result as a compound. my $orig = $$refbase; $$refbase = $$refword; my $substring_counter = () = $last_word =~ /$substring/gi; # bug fix for bug 949: input 'vuertemetjiehtjele' # test: echo 'vuertemetjiehtjele' | usma VS echo 'vuertemetjiehtjele' | usma | lookup2cg $$refbase =~ s/(^.*)(\Q$substring\E.*){$substring_counter}$/$1\#$last_word/; if ($orig =~ m/^\p{isLower}/) { $$refbase = lcfirst($$refbase); } $substring_trigger = 'head_initial'; } if (!$substring_trigger) { #last_word don't match the refword: Mázecabaret vs. Máze#kabarea issue (my $lemma_initial, my $lemma_rest) = $substring =~ /^(\w)(.*)$/; my $wordform_initial; if ($lemma_initial eq 'k') { $wordform_initial='c'; } elsif ($lemma_initial eq 'b') { $wordform_initial='p'; } elsif ($lemma_initial eq 'd') { $wordform_initial='t'; } elsif ($lemma_initial eq 'g') { $wordform_initial='k'; } # else { # print STDERR "Nothing to map\n"; # } #commented out due to installation problems with Trond's configuration #switch($lemma_initial){ # case "k" { $wordform_initial='c'; } # case "b" { $wordform_initial='p'; } # case "d" { $wordform_initial='t'; } # case "g" { $wordform_initial='k'; } # #else { print "previous case not true" } #} my $wf_substring = $wordform_initial+$lemma_rest; if ($refword =~ m/.*\Q$wf_substring\E/) { my $orig = $$refbase; $$refbase =~ s/(^[^\+]+).*$/$1\#$last_word/; # coping with geassi vs. geasse issue my $first_l_part = lc($1); my $first_wf_part = lc(substr($$refword, 0, length($first_l_part))); if ($first_l_part ne $first_wf_part) { $$refbase = $first_wf_part.'#'.$last_word; } if ($orig =~ m/^\p{isLower}/) { $$refbase = lcfirst($$refbase); } } } if ($second_last_word) { my $i=4; my $substring = substr($second_last_word,0,$i); while ($$refbase !~ /.*\Q$substring\E.*\#/ && $i>1 ) { $i--; $substring = substr($second_last_word,0,$i); } # If the compound boundary is found, mark it with # if ($$refbase =~ /.*\Q$substring\E/) { # bug fix for bug 2011; test input string: guovloguovddášsuohkanat # test: echo 'gieđagieđasuohkan' | usme | lookup2cg # test: echo 'guovloguovddášsuohkanat' | usme | lookup2cg $$refbase =~ s/(.+)(\Q$substring\E.*\#)/$1\#$2/; } } if ($third_last_word) { my $i=4; my $substring = substr($third_last_word,0,$i); while ($$refbase !~ /.*\Q$substring\E.*\#.*\#/ && $i>1 ) { $i--; $substring = substr($third_last_word,0,$i); } # If the compound boundary is found, mark it with # if ($$refbase =~ /.*\Q$substring\E/) { $$refbase =~ s/(.*)(\Q$substring\E.*\#.*\#)/$1\#$2/; } } if ($fourth_last_word) { my $i=4; my $substring = substr($fourth_last_word,0,$i); print "$substring\n"; while ($$refbase !~ /.*\Q$substring\E.*\#.*\#.*\#/ && $i>1 ) { $i--; $substring = substr($fourth_last_word,0,$i); } # If the compound boundary is found, mark it with # if ($$refbase =~ /.*\Q$substring\E/) { $$refbase =~ s/(\Q$substring\E.*\#.*\#.*\#)/\#$1/; } } } sub print_help { print "Usage: lookup2cg [OPTIONS] [FILES]\n"; print "Options:\n"; print " --der= Derivation\n"; print " --dict= Dictionary\n"; print " --help Print this message and exit.\n"; };