#!/usr/bin/perl use strict; # lookup2cg # Perl-script for converting lookup-output to CG-2 input. # - Rates and removes compound analyses according to # the number of word boundaries. # - Reformats compound analyses and base forms, removes duplicates # # Input: # Dan dat+Pron+Dem+Sg+Acc # Dan dat+Pron+Dem+Sg+Gen # # Output: # "" # "dat" Pron Dem Sg Acc # "dat" Pron Dem Sg Gen # # $Id$ my @Analyses; my $word; my $tmp; my $compound = 0; # Read while not eol while(<>) { # While line not empty, store word to a scalar and # analyses into an array. if (! /^\s*$/) { chomp; if (/#/) { $compound = 1; } # store the word into a variable # and the analysis part into an array. $_ =~ /^(.*?)\t(.*)$/; $word = $1; push @Analyses, $2; } # if empty line, format the output and print else { # Rate compounds accoring to word boundaries if ($compound) { &rate_compounds(\@Analyses); } # Format each analysis for my $line (@Analyses) { # Separate the base form from the analysis if ( $line =~ /^(.*?)\s*\+(.*)$/o ) { my $base = $1; $line = $2; $line =~ s/\+/ /go; # If line contains a compound if ( $line =~ /#/ ) { &format_compound(\$base, \$line, \$word); } # Format output $line = "\t" . " \"$base\" " . $line . "\n"; } } #Remove identical analyses &remove_identical(\@Analyses); # Mark derivational tags &mark_deriv_tags(\@Analyses); # Print output print "\"<$word>\"\n"; for my $line (@Analyses) { print $line; } #TODO: problem: this is one element in the array. @Analyses = 0; pop @Analyses; $compound = 0; } } # end of while sub remove_identical { my $aref = shift @_; # Remove identical analyses for my $i ( 0 .. $#{ $aref } ) { for my $j ( $i+1 .. $#{ $aref } ) { if ( $aref->[$i] eq $aref->[$j] ) { $aref->[$j] = undef; } } } } sub rate_compounds { my $aref = shift @_; # Rate compounds and remove extra readings. # First mark the number of words in the compound # to the beginning of the string my $word_count; my $best_word_count = 5; for my $line (@$aref) { $word_count = split ("#", $line); $line = $word_count . $line; if ( $word_count < $best_word_count ) { $best_word_count = $word_count; } } # Then sort the table # And remove the readings with too many #'s @$aref = sort {$a <=> $b} @$aref; for my $line (@$aref) { if ($line !~ /^$best_word_count/) { $line = undef; } } # Remove the extra digits for my $line (@$aref) { $line =~ s/^\d(.*)$/$1/o; } } sub format_compound { my ($refbase, $refline, $refword) = @_; # Take only the analysis of the last part of the compound $$refline =~ s/^.*#(.*?)\s(.*$)/$2/o; my $last_word = $1; # Search for the 3 first letters of the last base form # from the original compound my $substring = substr($last_word,0,3); # If some of the 3 letters is 2-letter garpheme # take 4 letters instead of 3 if($substring =~ /c1|s1|z1|d1|t1/o) { $substring = substr($last_word,0,4); } # If the compound boundary is found, # replace the last word by its base form, and insert a # mark in front of # it, in order to mark the result as a compound. if ($$refword =~ m/$substring/) { $$refbase = $$refword; $$refbase =~ s/(^.*)$substring.*$/$1#$last_word/; } # If the compound boundary is not found, # use "last resort": 2 letters instead of 3 or 4. else { $substring = substr($last_word,0,2); if ($$refword =~ m/$substring/) { $$refbase = $$refword; $$refbase =~ s/(^.*)$substring.*$/$1#$last_word/; } } # if ($$refline !~ /Prop/) { # print "JEE\n"; # $$refbase =~ s/[A-ZÅÄÖÆØ]/[a-zåäöæø]/; # } } sub mark_deriv_tags { my $aref = shift @_; foreach my $line ( @{$aref} ) { $line =~ s/(\s(?:V|N|Adv|A))(\s|\s.*?\s)((?:V|N|Adv|A)\s)/$1\*$2$3/; $line =~ s/(\s(?:V|N|Adv|A)\*(?:\s|\s.*?\s)(?:V|N|Adv|A))((?:\s|\s.*?\s)(?:V|N|Adv|A)\s)/$1\*$2/; } }