# Util.pm # Utility functions for the using the different langtech tools. package langTools::Util; use utf8; use Encode; use warnings; use strict; use Carp qw(cluck); use Data::Dumper; # for DEBUGging %paradigms use Exporter; our ( $VERSION, @ISA, @EXPORT, @EXPORT_OK ); # $VERSION = sprintf "%d.%03d", q$Revision$ =~ /(\d+)/g; @ISA = qw(Exporter); @EXPORT = qw(&init_lookup &call_lookup &read_tags &generate_taglist &win_digr &digr_utf8 &format_compound); @EXPORT_OK = qw(); # Initialize expect object for analysis # Returns a pointer to the object. sub init_lookup { my ($command) = @_; if ( !$command ) { cluck "No command specified"; } my $exp = Expect->spawn($command) or cluck "Cannot spawn $command"; $exp->log_stdout(0); return $exp; } # Call expect object with a string. # Returns the analysis. sub call_lookup { my ( $exp_ref, $string, $decode ) = @_; if ( !$$exp_ref ) { cluck "The expect object missing"; } if ($decode) { $string = Encode::encode_utf8($string); } $$exp_ref->send("$string\n"); $$exp_ref->expect( undef, '-re', '\r?\n\r?\n' ); my $read_anl = $$exp_ref->before(); if ($decode) { $read_anl = Encode::decode_utf8($read_anl); } # Take away the original input. $read_anl =~ s/^.*?\n//; # Replace extra newlines. $read_anl =~ s/\r\n/\n/g; $read_anl =~ s/\r//g; return "$read_anl\n"; } # Read the grammar for paradigm tag list. # Call the recursive function that generates the tag list. sub generate_taglist { my ( $gramfile, $tagfile, $taglist_aref ) = @_; # gramfile = paradigm.sme.txt # tagfile = korpustags.sme.txt my @grammar; my %tags; if ($gramfile) { # Read from tag file and store to an array. open GRAM, "< $gramfile" or die "Cant open the file $gramfile: $!\n"; #my @tags; my $tag_class; GRAM_FILE: while () { chomp; next if /^\s*$/; next if /^%/; next if /^$/; next if /^#/; s/\s*$//; push( @grammar, $_ ); } } read_tags( $tagfile, \%tags ); # print Dumper(\%tags); my @taglists; # Read each grammar rule and generate the taglist. for my $gram (@grammar) { my @classes = split( /\+/, $gram ); my $pos = shift @classes; my $tag = $pos; my @taglist; generate_tag( $tag, \%tags, \@classes, \@taglist ); push( @{ $$taglist_aref{$pos} }, @taglist ); } # for my $pos ( keys %$taglist_aref ) { # print "\nJEE $pos OK @{$$taglist_aref{'Pron'}} "; # } } # Traverse recursively the taglists and generate # the tagsets for pardigm generation. # The taglist is stored to the array reference $taglist_aref. sub generate_tag { my ( $tag, $tags_href, $classes_aref, $taglist_aref ) = @_; if ( !@$classes_aref ) { push( @$taglist_aref, $tag ); return; } my $class = shift @$classes_aref; if ( $class =~ s/\?// ) { my $new_tag = $tag; my @new_class = @$classes_aref; generate_tag( $new_tag, $tags_href, \@new_class, $taglist_aref ); } if ( !$$tags_href{$class} ) { my $new_tag = $tag . "+" . $class; my @new_class = @$classes_aref; generate_tag( $new_tag, $tags_href, \@new_class, $taglist_aref ); return; } for my $t ( @{ $$tags_href{$class} } ) { my $new_tag = $tag . "+" . $t; my @new_class = @$classes_aref; generate_tag( $new_tag, $tags_href, \@new_class, $taglist_aref ); } } # Read the morphological tags from a file (korpustags.txt) sub read_tags { my ( $tagfile, $tags_href ) = @_; # Read from tag file and store to an array. open TAGS, "< $tagfile" or die "Cant open the file $tagfile: $!\n"; my @tags; my $tag_class; TAG_FILE: while () { chomp; next if /^\s*$/; next if /^%/; next if /^$/; next if /=/; if (s/^#//) { $tag_class = $_; push( @{ $$tags_href{$tag_class} }, @tags ); undef @tags; pop @tags; next TAG_FILE; } my @tag_a = split( /\s+/, $_ ); push @tags, $tag_a[0]; } close TAGS; } # Form baseforms of analyzed compounds. sub format_compound { my ( $refbase, $refline, $refword ) = @_; my $boundary_count = ( $$refbase =~ tr/\#// ); # Take only the analysis of the last part of the compound $$refbase =~ /^.*\#(.*?)$/; my $last_word = $1; if ( !$last_word ) { return; } my $second_last_word; my $third_last_word; if ( $boundary_count > 1 ) { if ( $$refbase =~ /^.*\#(.*?)\#.*$/ ) { $second_last_word = $1; } if ( $$refbase =~ /^.*\#(.*?)\#.*\#.*$/ ) { $third_last_word = $1; } } my $i = 4; my $substring = substr( $last_word, 0, $i ); while ( ( $$refword !~ m/\Q$substring\E/ ) && $i > 1 ) { $i--; $substring = substr( $last_word, 0, $i ); } if ( $$refword =~ m/\Q$substring\E/ ) { # If the compound boundary is found, # replace the last word by its base form, and insert a # mark in front of # it, in order to mark the result as a compound. my $orig = $$refbase; $$refbase = $$refword; $$refbase =~ s/(^.*)\Q$substring\E.*$/$1\#$last_word/; if ( $orig =~ m/^\p{isLower}/ ) { $$refbase = lcfirst($$refbase); } } if ($second_last_word) { my $i = 4; my $substring = substr( $second_last_word, 0, $i ); while ( $$refbase !~ /.*\Q$substring\E.*\#/ && $i > 1 ) { $i--; $substring = substr( $second_last_word, 0, $i ); } # If the compound boundary is found, mark it with # if ( $$refbase =~ /\Q$substring\E/ ) { $$refbase =~ s/(\Q$substring\E.*\#)/\#$1/; } } if ($third_last_word) { my $i = 4; my $substring = substr( $third_last_word, 0, $i ); while ( $$refbase !~ /.*\Q$substring\E.*\#.*\#/ && $i > 1 ) { $i--; $substring = substr( $third_last_word, 0, $i ); } # If the compound boundary is found, mark it with # if ( $$refbase =~ /\Q$substring\E/ ) { $$refbase =~ s/(\Q$substring\E.*\#.*\#)/\#$1/; } } } # Some character set conversion routines, rarely used nowadays. # Convert windows charachters to Sami digraphs sub win_digr { my $ctext = shift(@_); $ctext =~ s/\212/S1/g; $ctext =~ s/\232/s1/g; $ctext =~ s/\216/Z1/g; $ctext =~ s/\236/z1/g; return $ctext; } sub digr_utf8 { my $ctext = shift(@_); $ctext =~ s/A1/Á/g; $ctext =~ s/a1/á/g; $ctext =~ s/C1/Č/g; $ctext =~ s/c1/č/g; $ctext =~ s/D1/Đ/g; $ctext =~ s/d1/đ/g; $ctext =~ s/N1/Ŋ/g; $ctext =~ s/n1/ŋ/g; $ctext =~ s/S1/Š/g; $ctext =~ s/s1/š/g; $ctext =~ s/T1/Ŧ/g; $ctext =~ s/t1/ŧ/g; $ctext =~ s/Z1/Ž/g; $ctext =~ s/z1/ž/g; return $ctext; } 1; __END__