package langTools::XMLStruct; use utf8; use warnings; use strict; use XML::Twig; use Carp qw(cluck); use Exporter; our ($VERSION, @ISA, @EXPORT, @EXPORT_OK); $VERSION = sprintf "%d.%03d", q$Revision$ =~ /(\d+)/g; @ISA = qw(Exporter); @EXPORT = qw(&dis2html &hyph2html &gen2html &preprocess2xml &dis2xml &analyzer2xml &hyph2xml ¶digm2xml &gen2xml &xml2preprocess &xml2words &xml2dis $fst %dis_tools %action %prep_tools $language $xml_in $xml_out $args &process_paras &get_action &dis2corpus_xml); @EXPORT_OK = qw(&process_paras); our ($fst, %dis_tools, %action, %prep_tools, $prep, $language, $args, $xml_in, $xml_out); # Store the vislcg or lookup2cg output # to xml-structure. sub dis2xml { my ($text) = @_; my $w; my $output=XML::Twig::Elt->new('disamb'); if (! $text) { my $string = $output->sprint; $output->delete; return $string; } my @input = split(/\n/, $text); for my $out (@input) { # ignore empty lines next if $out =~ /^\s*$/; chomp $out; # Test the start of the cohort. if ($out =~ /^\"paste('last_child', $output); $w->DESTROY; undef $w; } # Read the word and go to next line. $out =~ s/^\"<(.*)?>\".*$/$1/; $out =~ s/\"/\'/g; $w = XML::Twig::Elt->new('w'); $w->set_att('form', $out); next; } $out =~ s/\s+$//; $out =~ s/^\s+//; $out =~ s/new('reading'); $lemma =~ s/\"//g; # Remove ^ and # from lemma for now. $lemma =~ s/[\^\#]//g; $reading->set_att('lemma', $lemma); if ($analysis) { $analysis =~ tr/ /+/; $reading->set_att('analysis', $analysis); } $reading->paste('last_child', $w); } if ($w) { $w->paste('last_child', $output); } my $string = $output->sprint; $output->delete; return $string; } # Store the vislcg or lookup2cg output # to xml-structure. sub dis2html { my ($text, $structure) = @_; my $output=XML::Twig::Elt->new('pre'); if (! $text) { if ($structure) { return $output; } my $string = $output->sprint; $output->delete; return $string; } $output->set_content($text); if ($structure) { return $output; } else{ my $string = $output->sprint; $output->delete; return $string; } } # Convert the xml-output of analyzator or lookup2cg to # vislcg input. sub xml2dis { my ($xml) = @_; my $string; my $twig = XML::Twig->new(keep_encoding => 1); if (! $twig->safe_parse ($xml)) { cluck("Couldn't parse xml"); return Carp::longmess("Could not parse xml"); } my $root=$twig->root; my @words=$root->children; for my $word (@words) { $string .= "\"<" . $word->{'att'}->{'form'} . ">\""; $string .= "\n"; my @readings = $word->children; for my $r (@readings) { my $analysis = $r->{'att'}->{'analysis'}; $analysis =~ s/\+/ /g; $string .= "\t" . "\"" . $r->{'att'}->{'lemma'} . "\""; $string .= " " . $analysis; $string .= "\n"; } } $twig->delete; return $string; } # Store analyzer output to xml-structure. sub analyzer2xml { my ($text) = shift @_; my $word; my $w; my $output=XML::Twig::Elt->new('analysis'); $output->set_pretty_print('record'); if (! $text) { my $string = $output->sprint; $output->delete; return $string; } my @input=split(/\n/, $text); for my $out (@input) { next if (! $out); if ($out =~ /^\s*$/) { if ($w) { $w->set_att('form', $word); $w->paste('last_child', $output); $w->DESTROY; undef $w; next; } } chomp $out; if (! $w) { $w=XML::Twig::Elt->new('w'); } my $line; ($word, $line) = split(/\t/, $out, 2); next if (! $line); my ($lemma, $analysis) = split(/\+/, $line, 2); $lemma =~ s/\s+$//; my $reading=XML::Twig::Elt->new('reading'); $reading->set_att('lemma', $lemma); if ($analysis) { $reading->set_att('analysis', $analysis); } $reading->paste('last_child', $w); } if ($w) { $w->set_att('form', $word); $w->paste('last_child', $output); } my $string = $output->sprint; return $string; } # Convert xml-input of word list # to analyzer or hyphenator, or possibly generator. sub xml2words { my ($xml) = @_; my $string; my $twig = XML::Twig->new(keep_encoding => 1); if (! $twig->safe_parse ($xml)) { cluck("Couldn't parse xml"); return Carp::longmess("Could not parse xml"); } my $root=$twig->root; my @words=$root->children; for my $word (@words) { if ($word->{'att'}->{'form'}) { $string .= $word->{'att'}->{'form'}; $string .= "\n"; } } $twig->dispose; return $string; } # Move hyphenator output to xml-structure. sub hyph2xml { my ($text) = @_; my $w; my $word; my $output=XML::Twig::Elt->new('hyphenation'); $output->set_pretty_print('record'); if (! $text) { my $string = $output->sprint; $output->delete; return $string; } my @input=split(/\n/, $text); for my $out (@input) { if ($out =~ /^\s*$/) { if ($w) { $w->set_att('form', $word); $w->paste('last_child', $output); $w->DESTROY; undef $w; next; } } chomp $out; my $hyph; ($word, $hyph) = split(/\t/, $out); if (! $w) { $w=XML::Twig::Elt->new('w'); } if ($hyph) { my $reading=XML::Twig::Elt->new('reading'); $reading->set_att('hyph', $hyph); $reading->paste('last_child', $w); } } if ($w) { $w->set_att('form', $word); $w->paste('last_child', $output); } my $string = $output->sprint; $output->delete; return $string; } # Move hyphenator output to xml-structure. sub hyph2html { my ($text,$structure) = @_; my @content; my $output=XML::Twig::Elt->new('p'); $output->set_pretty_print('record'); if (! $text) { if ($structure) { return $output; } my $string = $output->sprint; $output->delete; return $string; } my @input=split(/\n/, $text); for my $out (@input) { my ($word, $hyph) = split(/\t/, $out); if ($hyph) { push(@content,$hyph); push (@content," "); next; } } $output->set_content(@content); if ($structure) { return $output; } else { my $string = $output->sprint; $output->delete; return $string; } } sub paradigm2xml { my ($result, $answer, $candidates, $mode) = @_; my $lemma; my $analysis; my $w=XML::Twig::Elt->new('w'); $w->set_pretty_print('record'); for (my $j=0; $j<$result; $j++) { if ($$answer{$j}{form}) { $w->set_att('input', $$answer{$j}{form}); } my $paradigm=XML::Twig::Elt->new('paradigm'); if ($$answer{$j}{anl}) { my ($l, $anl) = split(/\+/, $$answer{$j}{anl}, 2); $paradigm->set_att('analysis', $anl); $paradigm->set_att('lemma', $l); } my @input=split(/\n/, $$answer{$j}{para}); for my $out (@input) { if ($out =~ /^\s*$/) { next; } chomp $out; my ($line, $form) = split(/\t/, $out, 2); next if (! $form); $form =~ s/^\s+//; ($lemma, $analysis) = split(/§/, $line, 2); if (! $analysis) { ($lemma, $analysis) = split(/\+/, $line, 2); } my $surface=XML::Twig::Elt->new('surface'); $surface->set_att('form', $form); $surface->set_att('analysis', $analysis); $surface->set_att('form', $form); $surface->paste('last_child', $paradigm); $surface->DESTROY; } $paradigm->paste('last_child', $w); $paradigm->DESTROY; # If minimal mode, show only first paradigm last if (! $mode || $mode eq "minimal"); } if (keys %$candidates) { my $cands=XML::Twig::Elt->new('other'); for my $c (keys %$candidates) { my $cand=XML::Twig::Elt->new('anl', $c); $cand->paste('last_child', $cands); } $cands->paste('last_child', $w); } my $string = $w->sprint; $w->delete; return $string; } # End of paradigm output # Move generator output to xml-structure. sub gen2xml { my ($text, $paradigm) = @_; my $w; my $lemma; my $analysis; my $output; if ($paradigm) { $output=XML::Twig::Elt->new('paradigm'); } else { $output=XML::Twig::Elt->new('generation'); } $output->set_pretty_print('record'); if (! $text) { my $string = $output->sprint; $output->delete; return $string; } my @input=split(/\n/, $text); for my $out (@input) { if ($out =~ /^\s*$/) { if ($w) { $w->set_att('lemma', $lemma); if (! $paradigm) { $w->set_att('analysis', $analysis); } $w->paste('last_child', $output); $w->DESTROY; undef $w; next; } } chomp $out; my ($line, $form) = split(/\t/, $out, 2); next if (! $form); $form =~ s/^\s+//; ($lemma, $analysis) = split(/§/, $line, 2); if (! $analysis) { ($lemma, $analysis) = split(/\+/, $line, 2); } if (! $w) { $w=XML::Twig::Elt->new('w'); } my $surface=XML::Twig::Elt->new('surface'); $surface->set_att('form', $form); if ($paradigm) { $surface->set_att('analysis', $analysis); } $surface->set_att('form', $form); $surface->paste('last_child', $w); } if ($w) { $w->set_att('lemma', $lemma); #$w->set_att('analysis', $analysis); $w->paste('last_child', $output); } my $string = $output->sprint; $output->delete; return $string; } # Move generator output to html-table. sub gen2html { my ($text, $paradigm,$structure,$fulllemma) = @_; my $tr; my $td; my $lemma; my $analysis; my $output; $output=XML::Twig::Elt->new('table'); $output->set_pretty_print('record'); if (! $text) { if ($structure) { return $output; } my $string = $output->sprint; $output->delete; return $string; } my $first_part; if ($fulllemma) { ($first_part = $fulllemma ) =~ s/\#[^\#]+$//; } my @input=split(/\n/, $text); my $prev_analysis=""; for my $out (@input) { chomp $out; my ($line, $form) = split(/\t/, $out, 2); next if (! $form); $form =~ s/^\s+//; if ($fulllemma) { $form = $first_part . $form; } $form =~ s/\#//g; ($lemma, $analysis) = split(/\+/, $line, 2); if ($fulllemma) { $lemma = $fulllemma; $lemma =~ s/\#//g; } # There may be more than one form for an analysis # to separate different paradigms, # try to group them. if($analysis && $prev_analysis eq $analysis) { $td=XML::Twig::Elt->new('td'); $td->set_text($form); if ($tr) { $td->paste('last_child', $tr); } next; } if ($tr) { $tr->paste('last_child', $output); undef $tr; } $tr=XML::Twig::Elt->new('tr'); $td=XML::Twig::Elt->new('td'); $td->set_text($lemma); $td->paste('last_child', $tr); if ($analysis) { $td=XML::Twig::Elt->new('td'); $td->set_text($analysis); $td->paste('last_child', $tr); } $td=XML::Twig::Elt->new('td'); $td->set_text($form); $td->paste('last_child', $tr); $prev_analysis = $analysis; } if ($tr) { $tr->paste('last_child', $output); } if ($structure) { return $output; } else { my $string = $output->sprint; $output->delete; return $string; } } # Move preprocessor output to xml-structure sub preprocess2xml { my ($text) = @_; my $output=XML::Twig::Elt->new('preprocess'); $output->set_pretty_print('record'); if (! $text) { my $string = $output->sprint; $output->delete; return $string; } my @input=split(/\n/, $text); for my $out (@input) { chomp $out; my $w=XML::Twig::Elt->new('w'); $w->set_att('form', $out); $w->paste('last_child', $output); } my $string = $output->sprint; $output->delete; return $string; } # preprocessor input read from xml-structure. sub xml2preprocess { my $xml = shift @_; my $twig = XML::Twig->new(keep_encoding => 1); if (! $twig->safe_parse ($xml)) { cluck("Couldn't parse xml."); return Carp::longmess("Could not parse xml"); } my $root=$twig->root; my $text=$root->text; $root->delete; $twig->dispose; return $text; } sub dis2corpus_xml { my ($text, $tags_href, $w_num_ref, $id) = @_; my $xml = dis2xml($text); #print $xml; my $string; my $twig = XML::Twig->new(); #my $twig = XML::Twig->new(keep_encoding => 1); if (! $twig->safe_parse ($xml)) { cluck("Couldn't parse xml"); return Carp::longmess("Could not parse xml"); } $twig->set_pretty_print('record'); my $root=$twig->root; my @words=$root->children; for my $word (@words) { my $id = $id . "_w" . $$w_num_ref++; $word->set_att('id', $id); for my $reading ($word->children) { my $reading_text = $reading->{'att'}->{'analysis'}; # Create a new XML element for each reading. my (@tag_list) = split(/\+/, $reading_text); # Process each tag and store them to XML attributes # for the reading. for my $tag (@tag_list) { for my $class (keys %$tags_href) { if ( exists $$tags_href{$class}{$tag} ) { # Store the tag to XML attribute of the reading $reading->set_att($class, $tag); } } } $reading->del_att('analysis'); } # end while readings } return $root; } sub get_action{ my $line = shift @_; print "$line\n"; my $document = XML::Twig->new; if (! $document->safe_parse ("$line") ) { cluck("Could not parse parameters."); return Carp::longmess("ERROR Could not parse $line"); } my $root = $document->root; my $action = $root->{'att'}->{'tool'}; return $action; } # Processing instructions are parsed # from XML-structure. sub process_paras { my $parameters = shift @_; my $document = XML::Twig->new(keep_encoding => 1); if (! $document->safe_parse ("$parameters") ) { cluck("Could not parse parameters."); return Carp::longmess("Could not parse parameters: $parameters"); } my $root = $document->root; my $lang = $root->first_child('lang'); if (! $lang || ! $lang->text) { cluck("No language specified."); return Carp::longmess("No language specified: $parameters"); } else { $language = $lang->text; } my %default_args = ( "dis" => "--quiet", "anl" => "-flags -mbTT -utf8", "gen" => "-flags -mbTT -utf8 -d", "para" => "-flags -mbTT -utf8 -d", "hyph" => "-flags -mbTT -utf8", "prep" => "", ); my %default_tools = ( "anl_fst" => "/opt/smi/$language/bin/$language.fst", "hyph_fst" => "/opt/smi/$language/bin/hyph-$language.fst", "hyph_filter" => "/opt/smi/common/bin/hyph-filter.pl", "gen_fst" => "/opt/smi/$language/bin/i$language-norm.fst", "para_fst" => "/opt/smi/$language/bin/i$language.fst", "para_grammar" => "/opt/smi/$language/bin/paradigm.$language.txt", "para_tags" => "/opt/smi/$language/bin/korpustags.$language.txt", "prep_fst" => "/opt/smi/$language/bin/$language.fst", "prep_abbr" => "/opt/smi/$language/bin/$language.fst", "prep_corr" => "/opt/smi/$language/bin/$language.fst", ); if (! -f $default_tools{'para_tags'} ) { $default_tools{'para_tags'} = "/opt/smi/common/bin/korpustags.txt"; } if (! -f $default_tools{'para_grammar'} ) { $default_tools{'para_grammar'} = "/opt/smi/common/bin/paradigm.txt"; } $xml_in = $root->first_child('xml_in'); $xml_out = $root->first_child('xml_out'); my @actions = $root->children('action'); my %tools; for my $act (@actions) { my $tool = $act->{'att'}->{'tool'}; my $tmp_fst = $act->{'att'}->{'fst'}; my $tmp_args = $act->{'att'}->{'args'}; my $rle = $act->{'att'}->{'rle'}; my $abbr = $act->{'att'}->{'abbr'}; my $corr = $act->{'att'}->{'corr'}; my $filter = $act->{'att'}->{'filter'}; my $filter_script = $act->{'att'}->{'filter_script'}; my $mode = $act->{'att'}->{'mode'}; my $grammar = $act->{'att'}->{'grammar'}; my $tags = $act->{'att'}->{'tags'}; if ($tool eq 'anl' || $tool eq 'hyph' || $tool eq 'gen' || $tool eq 'para') { if ($tmp_fst) { $action{$tool}{'fst'}=$tmp_fst; } else { $action{$tool}{'fst'} = $default_tools{$tool . "_fst"}; } if ($tmp_args) { $action{$tool}{'args'}=$tmp_args; } else { $action{$tool}{'args'} = $default_args{$tool}; } if ($filter) { if ($filter_script) { $action{$tool}{'filter'} = $filter_script; } else { $action{$tool}{'filter'} = $default_tools{'hyph_filter'}; } } if ($tool eq "para") { if ($grammar) { $action{$tool}{'grammar'} = $grammar; } else { $action{$tool}{'grammar'} = $default_tools{'para_grammar'}; } if ($tags) { $action{$tool}{'tags'} = $tags; } else { $action{$tool}{'tags'} = $default_tools{'para_tags'}; } if ($mode) { $action{$tool}{'mode'} = $mode; } } next; } if ($tool eq 'dis') { $action{'dis'}=1; if ($rle) { $dis_tools{'rle'}=$rle; } else { $dis_tools{'rle'}="/opt/smi/$language/bin/$language-dis.fst"; } # if ($bin) { $dis_tools{'bin'}=$bin; } # else { $dis_tools{'bin'}="/opt/smi/$language/bin/$language-dis.fst"; } if ($tmp_args) { $dis_tools{'args'}=$tmp_args; } else { $dis_tools{'args'}=$default_args{'dis'}; } next; } if ($tool eq 'prep') { $action{'prep'}=1; if ($tmp_fst) { $prep_tools{'fst'} = $tmp_fst; } else { $prep_tools{'fst'}=$default_tools{'prep_fst'}; } if ($abbr) { $prep_tools{'abbr'}=$abbr; } else { $prep_tools{'abbr'}=$default_tools{'prep_abbr'}; } if ($corr) { $prep_tools{'corr'}=$corr; } else { $prep_tools{'corr'}=$default_tools{'prep_corr'}; } if ($tmp_args) { $prep_tools{'args'}=$tmp_args; } else { $prep_tools{'args'}=$default_args{'prep'}; } next; } } if (! %action) { cluck("No action specified."); return Carp::longmess("No action specified $parameters"); } return 0; } 1; __END__