#!/usr/bin/perl use strict; use Getopt::Long; use utf8; # lookup2cg # Perl-script for converting lookup-output to CG-2 input. # - Rates and removes compound analyses according to # the number of word boundaries. # - Reformats compound analyses and base forms, removes duplicates # # Input: # Dan dat+Pron+Dem+Sg+Acc # Dan dat+Pron+Dem+Sg+Gen # # Output: # "" # "dat" Pron Dem Sg Acc # "dat" Pron Dem Sg Gen # # $Id$ my @Analyses; my %forms; my %rated; my $word; my $tmp; my $compound = 0; my $comp_rate = 0; my $max_comp_rate = 0; my $der=0; my $dict=0; my $der=0; my $help; Getopt::Long::Configure ("bundling"); GetOptions ("der|r" => \$der, "dict|d" => \$dict, "help" => \$help); if ($help) { &print_help; exit 1; } $/ = ""; # Read while not eol while(<>) { if(/\t/) {s/ /_/g;} # Shall not be placed here, but in wordformmaking my @Analyses; my %forms; my %rated; my $word; my $tmp; my $compound = 0; my $comp_rate = 0; my $max_comp_rate = 0; my @lines = split(/\n/, $_); for my $line (@lines) { next if ($line =~ /^\s*$/); # store word to a scalar and # analysis (=base form and tags) to an array my $analysis; ($word, $analysis) = split(/\t/, $line, 2); next if ! $analysis; $analysis =~ tr/\t//d; push @Analyses, $analysis; $comp_rate = ($line =~ tr/\#//); if ($comp_rate > $max_comp_rate) { $max_comp_rate = $comp_rate; } } # format the output and print if ($max_comp_rate > 0) { while (@Analyses) { my $line = pop @Analyses; $rated{$line} = 1; } pop @Analyses; rate_compounds(\%rated); # If dictionary use, try to prefer lexicalized compounds if ( $dict ) { select_lexicalized(\%rated); } for my $key (keys %rated) { push @Analyses, $key; } } # Format each analysis for my $line (@Analyses) { # Separate the base form from the analysis my $base; my $line2; my $plus; my $comp_rate = ($line =~ tr/\#//); if ( $comp_rate > 0) { $line =~ s/\#\+Der\d\+Der\//\#/g; $line =~ /^(.*\#.*?)\+(.*)$/; $base = $1; $line2 = $2; } else { if ($line =~ s/^\+//) { $plus = "+"; } ($base, $line2) = split(/\+/, $line, 2); } $line2 =~ tr/+/ /; $base =~ tr/+//; $line2 =~ s/=/ = /g; # relevant to kal XXX # $base =~ s/=/ = /g; # relevant to kal if ($plus) { $base = $plus . $base; } # If line contains a compound if ( $comp_rate > 0 && ! $der && ! $dict) { format_compound(\$base, \$line2, \$word); } # Mark derivational tags 1 while $line2 =~ s/\b(V|N|Adv|A)(\ |\ .*?\ )(V|N|Adv|A)\b/$1\*$2$3/g; # Mark IV with star when there is later TV # (causative derivation) 1 while $line2 =~ s/\b(IV)(\ |\ .*?\ )(TV)\b/$1\*$2$3/g; #Format output $line = "\t" . " \"$base\" " . $line2 . "\n"; # Store the analysis to a hash if ($base) { $forms{$line} = 1; } } # Print output if (@Analyses) { print "\"<$word>\"\n"; for my $line (keys %forms) { $line =~ s/("[^"]+?)\s+([^"]+?")/$1_$2/g; print "$line"; } } } # end of while sub rate_compounds { my $href = shift @_; # Rate compounds and remove extra readings. my $min_boundary_count=5; my $boundary_count=0; my $der_count=0; for (keys %$href) { $boundary_count = tr/\#//; $der_count = () = $_ =~ /Der\//g; $boundary_count += $der_count; # The derivation is not preferred. #if (/Der\d\+Der/) { $boundary_count++; } if ( $boundary_count < $min_boundary_count ) { $min_boundary_count = $boundary_count; } } my $i=0; for (keys %$href) { $boundary_count = 0; $der_count = 0; if ($der) { next if /\b(V|N|Adv|A)(\ |\ .*?\ )(V|N|Adv|A)\b/; } $boundary_count = tr/\#//; $der_count = () = $_ =~ /Der\//g; $boundary_count += $der_count; if ( $boundary_count > $min_boundary_count ) { delete($$href{$_}); } } } sub select_lexicalized { my $href = shift @_; my %lexicalized; # Select the forms which do not have any compound tags for (keys %$href) { next if (m/\+.*\#/); $lexicalized{$_} = 1; } # If there were lexicalized compounds, delete others from output if (%lexicalized) { for my $k (keys %$href) { if (! $lexicalized{$k}) { delete($$href{$k}); } } } } sub format_compound { my ($refbase, $refline, $refword) = @_; my $boundary_count = ($$refbase =~ tr/\#//); # Take only the analysis of the last part of the compound $$refbase =~ /^(.*)\#([^\#]+)$/; my $last_word = $2; if(! $last_word) { return; } my $second_last_word; my $third_last_word; my $fourth_last_word; if ($boundary_count > 1) { if ( $$refbase =~ /^.*\#(.*?)\#.*$/ ) { $second_last_word = $1; } if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*$/ ) { $third_last_word = $1; } if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*\#.*$/ ) { $fourth_last_word = $1; } } my $i=4; my $substring = substr($last_word,0,$i); while ($i > 1) { if ($$refword =~ m/.*(\Q$substring\E)/) { my $pos = rindex $$refword,$substring,; #exit the search only when the substring is not in the beginning of the word. if ($pos > 1) { last; } } $i--; $substring = substr($last_word,0,$i); } if ($$refword =~ m/.*\Q$substring\E/) { # If the compound boundary is found, # replace the last word by its base form, and insert a # mark in front of # it, in order to mark the result as a compound. my $orig = $$refbase; $$refbase = $$refword; $$refbase =~ s/(^.*)\Q$substring\E.*$/$1\#$last_word/; if ($orig =~ m/^\p{isLower}/) { $$refbase = lcfirst($$refbase); } } if ($second_last_word) { my $i=4; my $substring = substr($second_last_word,0,$i); while ($$refbase !~ /.*\Q$substring\E.*\#/ && $i>1 ) { $i--; $substring = substr($second_last_word,0,$i); } # If the compound boundary is found, mark it with # if ($$refbase =~ /.*\Q$substring\E/) { $$refbase =~ s/(\Q$substring\E.*\#)/\#$1/; } } if ($third_last_word) { my $i=4; my $substring = substr($third_last_word,0,$i); while ($$refbase !~ /.*\Q$substring\E.*\#.*\#/ && $i>1 ) { $i--; $substring = substr($third_last_word,0,$i); } # If the compound boundary is found, mark it with # if ($$refbase =~ /.*\Q$substring\E/) { $$refbase =~ s/(.*)(\Q$substring\E.*\#.*\#)/$1\#$2/; } } if ($fourth_last_word) { my $i=4; my $substring = substr($fourth_last_word,0,$i); print "$substring\n"; while ($$refbase !~ /.*\Q$substring\E.*\#.*\#.*\#/ && $i>1 ) { $i--; $substring = substr($fourth_last_word,0,$i); } # If the compound boundary is found, mark it with # if ($$refbase =~ /.*\Q$substring\E/) { $$refbase =~ s/(\Q$substring\E.*\#.*\#.*\#)/\#$1/; } } }