#!/usr/bin/perl -w
use strict;

# convert2xml
# Perl script for converting CG2 output to an XML format.
# The script takes three command line arguments:
#   --tags=FILE
#   --input=FILE
#   --output=FILE
#
# --tags=    specifies the name of the file where the names
#            of the tag classes and tag names are listed.
# --input=   The name of the CG2 output file. If not given, STDIN
#            is assumed.
# --output=  The name of the output file. If not given,
#            the output is STDOUT.
#
# $Id$

# Use the local character class.
# With "use locale", \w follows the alphabet of the current locale
# (for example Latin-1).
use locale;

# permit named arguments
use Getopt::Long;

use XML::Twig;

# Create an XML tree for the text: <corpus> is the root element and
# <text> is pasted into it.  add_sentence() appends sentences to $text
# and print_xml() prints $corpus.
my $twig   = XML::Twig->new;
my $corpus = XML::Twig::Elt->new('corpus');
my $text   = XML::Twig::Elt->new('text');
$text->paste($corpus);

# Command line options.
my $tags_file;
my $input_file;
my $output_file;

GetOptions(
    "tags=s"   => \$tags_file,
    "input=s"  => \$input_file,
    "output=s" => \$output_file,
);

# Tag class => { tag name => 1 }, filled in from the --tags file.
my %tags;
read_tags(\%tags);

# Open the input: the named file, or STDIN when --input is not given.
my $ifh;
if ($input_file) {
    open($ifh, '<', $input_file)
        or die "Can't open $input_file: $!";
}
else {
    $ifh = \*STDIN;
}

# Word forms that end a sentence.
my $SENT_DELIM = qq|.!?|;

# Marker that selects which reading lines are stored.
# NOTE(review): this literal is empty here; it may originally have held
# an angle-bracketed tag that was lost from the file -- confirm.  With an
# empty marker every reading line is stored.
my $CORRECT_TAG = "";

my $cohort_rec;          # the cohort currently being read
my $sentence_end = 0;    # true once a sentence delimiter has been read
my @tokens;              # cohorts collected for the current sentence

COHORTS:
while (<$ifh>) {
    chomp;

    # ignore empty lines
    next COHORTS if /^\s*$/;

    # Test the start of the cohort.
    # NOTE(review): the body of this branch was truncated in the file
    # (only the pattern prefix and the assignment to {WORD} survived).
    # It is reconstructed from the state variables declared above:
    # flush the previous cohort, close the sentence after a delimiter,
    # then start a new cohort record.  Verify against the original.
    if (/^\"</) {
        if ($cohort_rec) {
            push @tokens, $cohort_rec;
            add_sentence(\@tokens, \%tags) if $sentence_end;
        }

        # A cohort whose word form is a sentence delimiter ends the
        # sentence once the cohort itself is complete.
        $sentence_end = 1 if /^\"<[\Q$SENT_DELIM\E]>\"/;

        $cohort_rec = {};
        $cohort_rec->{WORD} = $_;
        next COHORTS;
    }

    # If not at the start of the cohort, read the analysis line and
    # store the line with Correct-tag to an array.  index() is used
    # instead of /$CORRECT_TAG/ because an empty pattern would re-use
    # the last successful regex rather than match every line.
    if (index($_, $CORRECT_TAG) >= 0) {
        push @{ $cohort_rec->{READING} }, $_;
        next COHORTS;
    }
}

# Process the last cohort (there is none when the input was empty).
push @tokens, $cohort_rec if $cohort_rec;
add_sentence(\@tokens, \%tags) if @tokens;

# Close the input file handle
close $ifh;

# Print output: to the named file, or to STDOUT when --output is not
# given.
if ($output_file) {
    open(my $ofh, '>', $output_file)
        or die "Can't open $output_file: $!";
    print_xml($ofh, \%tags);
    close $ofh
        or die "Can't close $output_file: $!";
}
else {
    print_xml(\*STDOUT, \%tags);
    close STDOUT;
}

# Subroutine to add sentences to the XML-tree.
# add_sentence(\@tokens, \%tags)
#
# Appends a new <sentence> element to the file-level $text element and
# adds one <token> child per cohort record taken from @tokens.  Each
# record is a hash with the keys WORD (the cohort line) and READING (an
# array of analysis lines).  Every reading becomes a <reading> child of
# the token, with the base form in the 'lemma' attribute and one
# attribute per recognised tag, named after the tag's class in %tags.
#
# Side effects: empties @tokens and the READING arrays, and resets the
# file-level $sentence_end flag to 0.
sub add_sentence {
    my ($tokens_aref, $tags_href) = @_;

    # create an XML-element for a new sentence.
    my $sentence = XML::Twig::Elt->new('sentence');
    $sentence->paste('last_child', $text);
    $sentence_end = 0;

    while (my $token_rec = shift @$tokens_aref) {

        # Create a new XML-element for the token
        my $token = XML::Twig::Elt->new('token');
        $token->paste('last_child', $sentence);

        # Reduce the cohort line to the text between "< and >".
        $token_rec->{WORD} =~ s/^\"<(.*)?>\".*$/$1/;
        $token->set_att('form', $token_rec->{WORD});

        # Test definedness so that a reading that happens to be a false
        # string does not end the loop early.
        while (defined(my $correct = shift @{ $token_rec->{READING} })) {

            # Create a new XML element for each reading.
            my $reading = XML::Twig::Elt->new('reading');

            # Strip the indentation and the trailing marker; the marker
            # is quoted so it is always taken literally.
            $correct =~ s/^\s+//;
            $correct =~ s/\s*\Q$CORRECT_TAG\E\s*$//;

            # First field is the quoted base form, the rest are tags.
            my ($base, @tag_list) = split ' ', $correct;
            $base = '' unless defined $base;
            $base =~ s/\"//g;

            # Store the base form to XML attributes of the reading.
            $reading->set_att('lemma', $base);

            # Process each tag and store them to XML attributes
            # for the reading.
            for my $tag (@tag_list) {
                for my $class (keys %$tags_href) {
                    if (exists $$tags_href{$class}{$tag}) {

                        # Store the tag to XML attribute of the reading
                        $reading->set_att($class, $tag);
                    }
                }
            }

            # Store the reading to child of the token in XML tree
            $reading->paste('last_child', $token);
        }    # end while readings
    }    # end while tokens
}

# print_xml($filehandle, \%tags)
#
# Subroutine to print out the XML-tree: writes the document prologue and
# then the file-level $corpus element to $filehandle.  The second
# argument is accepted but not used.
sub print_xml {
    my ($prtout, $tags_href) = @_;

    # NOTE(review): these two literals are empty / a single space here.
    # They look like placeholders for the XML declaration and a document
    # type line that were lost from the file -- confirm and restore.
    print {$prtout} qq||;
    print {$prtout} qq| |;

    $twig->set_pretty_print('record');
    $corpus->print($prtout);
}

# read_tags(\%tags)
#
# Subroutine to read the morphological tags from the file named by the
# file-level $tags_file and store them as $tags{CLASS}{TAG} = 1.
#
# File format, as implemented below: all whitespace is removed from each
# line; lines starting with '%' and empty lines are skipped; a line
# containing '#' starts a new tag class (the line with its first '#'
# removed is the class name); every other line is a tag name belonging
# to the most recent class.
#
# Dies when no tag file was given or it cannot be opened.
sub read_tags {
    my ($tags_href) = shift @_;

    defined $tags_file
        or die "No tag file given (use --tags=FILE)\n";

    # Read from tag file and store to the hash.
    open(my $tags_fh, '<', $tags_file)
        or die "Cant open the file: $!\n";

    my $tag_class;

    TAG_FILE:
    while (my $line = <$tags_fh>) {
        chomp $line;
        $line =~ s/\s+//g;
        next TAG_FILE if $line =~ /^%/;
        next TAG_FILE if $line =~ /^$/;

        if ($line =~ s/#//) {
            $tag_class = $line;
            next TAG_FILE;
        }

        # A tag listed before the first class line has no class to
        # belong to; skip it instead of filing it under an undefined key.
        if (!defined $tag_class) {
            warn "Tag '$line' listed before any '#' class line; ignored\n";
            next TAG_FILE;
        }

        $$tags_href{$tag_class}{$line} = 1;
    }

    close $tags_fh;
}