#!/usr/bin/perl -w
#
# Read lexicon lines from the files named on the command line (or STDIN)
# and print them as an XML tree rooted at <dict>.
#
# A line is converted when it has the shape
#
#     LEMMA  CONTLEX  "POS article text" ;
#
# and produces one <entry> holding <lemma>, an empty <stem>, <contlex>,
# <POS> and <article>.  Lines that do not match are silently skipped.
#
# The helper scripts lat2cyr.pl and 7bit-utf8.pl are run from the current
# working directory to convert selected strings.
# NOTE(review): judging by their names they transliterate Latin to
# Cyrillic and 7-bit text to UTF-8 respectively -- confirm.

use strict;
use warnings;

# Core module: feeds text to the converter scripts without going through
# a shell, so no shell quoting of the text is needed.
use IPC::Open2;

# use Perl module XML::Twig for XML-handling
# http://www.xmltwig.com/
use XML::Twig;

# The twig object itself holds no tree here; it is only used further down
# to select the pretty-print style.
my $twig = XML::Twig->new;

# The root element. Name??
my $dict = XML::Twig::Elt->new('dict');

while (my $line = <>) {

    # TODO: Check the regular expression!!
    next unless $line =~ /^([\w\*]+)\s+(\w+)\s+(\".*\")\s?\;$/;

    # $rest (the quoted article part) is processed as a separate block below.
    my ($lemma_text, $contlex_text, $rest) = ($1, $2, $3);

    # Create a new entry and paste it to the XML-tree.
    my $entry = XML::Twig::Elt->new('entry');
    $entry->paste('last_child', $dict);

    # The lemma goes through the character conversion before being stored.
    my $lemma = XML::Twig::Elt->new('lemma');
    $lemma->set_text(convert_text('lat2cyr.pl', $lemma_text));
    $lemma->paste('last_child', $entry);

    # Create empty token for stem
    my $stem = XML::Twig::Elt->new('stem');
    $stem->paste('last_child', $entry);

    my $contlex = XML::Twig::Elt->new('contlex');
    $contlex->set_text($contlex_text);
    $contlex->paste('last_child', $entry);

    # TODO: Check the regular expression!!
    # First word inside the quotes is the POS, the remainder is the article.
    next unless $rest =~ /^\"\s*(\w+?)\s+(.*)\"\s*$/;
    my ($pos_text, $article_text) = ($1, $2);

    my $pos = XML::Twig::Elt->new('POS');
    $pos->set_text($pos_text);
    $pos->paste('last_child', $entry);

    # Create article
    my $article = XML::Twig::Elt->new('article');

    # Split $article_text to parts.
    # Take the EG part first, if there is one: it is cut out of the article
    # text and stored as a nested <EG> element with its own children.
    if ($article_text =~ s/ EG \[(.*)\]\]//g) {
        my $in_eg = $1;    # grab the capture before anything else runs
        my $eg    = XML::Twig::Elt->new('EG');
        store_xml($in_eg, \$eg);
        $eg->paste('last_child', $article);
    }

    # Whatever is left of the article text becomes direct children of
    # <article>, after the <EG> element (if any).
    store_xml($article_text, \$article);

    # Paste article to the tree.
    $article->paste('last_child', $entry);
}

# The xml specifications, name of dtd-file and root node.
# NOTE(review): both literals below are empty/blank as found; the comment
# above suggests an XML declaration and a DOCTYPE were meant to be printed
# here -- confirm against the upstream copy of this script.
print qq||;
print qq| |;

# If pretty print not set, prints everything to its own line
# there are other options too.
$twig->set_pretty_print('record');
$dict->print( );

# convert_text($script, $text)
#
# Run "perl $script", send it $text followed by a newline on standard
# input (the same bytes "echo $text" would have produced) and return
# everything it writes to standard output, minus one trailing newline.
#
#   $script - file name of the converter script, looked up by perl
#             relative to the current directory
#   $text   - string to convert
#
# Returns the converted string; the empty string if the converter
# produced no output.  A non-zero exit status is reported with warn()
# but is not fatal.
sub convert_text {
    my ($script, $text) = @_;

    # If the converter dies before reading its input, the write below
    # must fail quietly instead of killing this script with SIGPIPE.
    local $SIG{PIPE} = 'IGNORE';

    # List form: the arguments are handed to exec directly, no shell.
    my ($from_child, $to_child);
    my $pid = open2($from_child, $to_child, 'perl', $script);

    # The input is a single short line, so writing all of it before
    # reading any output cannot fill the pipe and dead-lock.
    print {$to_child} $text, "\n";
    close $to_child;

    my $converted = do { local $/; <$from_child> };
    close $from_child;

    waitpid $pid, 0;
    warn "$script exited with status ", $? >> 8, "\n" if $?;

    $converted = '' unless defined $converted;

    # The converter output ends with a newline that does not belong in
    # the XML element text.
    chomp $converted;

    return $converted;
}

# store_xml($article_text, $xml_ref)
#
# Break $article_text into "features[value" blocks (blocks are separated
# by "]") and append the resulting elements to the element that $xml_ref
# points at.
#
#   $article_text - string of the form "F1 F2 NAME[value]F3 NAME2[value2]..."
#   $xml_ref      - reference to a scalar holding an XML::Twig::Elt; the
#                   new elements are pasted as its last children
#
# For every block the last space-separated word before "[" becomes an
# element of that name holding the value; each remaining word becomes a
# <DOM> element holding the word.  Values of elements whose name matches
# DER, KOMI, SYNO or SEE are passed through lat2cyr.pl, and those matching
# FIN through 7bit-utf8.pl.
sub store_xml {
    my ($article_text, $xml_ref) = @_;

    # Split the article to blocks which contain one value.
    for my $ent (split /\]/, $article_text) {
        next unless $ent;

        my ($features, $value_text) = split /\[/, $ent;
        $features = '' unless defined $features;

        # The last word names the valued feature, the rest are bare ones.
        my @domains  = split / /, $features;
        my $dom_text = pop @domains;

        # Process the feature with value.
        if ($dom_text) {

            # A block without "[" has no value; store an empty text
            # rather than passing undef around.
            $value_text = '' unless defined $value_text;

            if ($dom_text =~ /DER|KOMI|SYNO|SEE/) {
                $value_text = convert_text('lat2cyr.pl', $value_text);
            }
            if ($dom_text =~ /FIN/) {
                $value_text = convert_text('7bit-utf8.pl', $value_text);
            }

            my $feat = XML::Twig::Elt->new($dom_text);
            $feat->set_text($value_text);
            $feat->paste('last_child', $$xml_ref);
        }

        # Process rest features (those without values)
        for my $d (@domains) {
            next unless $d;
            my $dom = XML::Twig::Elt->new('DOM');
            $dom->set_text($d);
            $dom->paste('last_child', $$xml_ref);
        }
    }

    return;
}