#!/usr/bin/perl -w use strict; # convert2cwb --tags= # Perl script for converting CG2 output to a format # which is required for creating an IMS Corpus Workbench binary corpus. # In the output, each word is in its own line followed by base form, # and the list of tags, separated by tab. # The script reads STDIN and takes one command line argument --tags # which is the name of the file where the names of the tag classes # and tag names are specified. # # $Id$ # Use the local character class. # It means that variable \w contains Latin-1 alphabet. use locale; # permit named arguments use Getopt::Long; my %tags; my @classes; my $tags_file; GetOptions ("tags=s" => \$tags_file); # Read from lex-file and write to abbr file. open TAGS, "< $tags_file" or die "Cant open the file: $!\n"; #read from the beginning of the file. my $current; my $tag_class; while () { chomp; s/\s+//g; if (/^%/) { next; } if (/^$/) { next; } if (s/#//) { $tag_class = $_; push @classes, $tag_class; next; } $tags{$tag_class}{$_} = 1; } close TAGS; #foreach my $key (keys %tags) { # print "JOO $key JES\n"; # for my $tag ( keys %{ $tags{$key} } ) { # print "$tag\n"; # } #} #print @classes; my $word; my $correct=""; my $base; my @tag_list; while (<>) { chomp; #If at the start of a cohort, read the word and go to next line. if (/^\"/ || $correct ) { next; } $correct = $_; $word =~ s/^\"<(.*)?>\".*$/$1/; $correct =~ s/^\s+//; $correct =~ s/\s*//; ($base, @tag_list) = split(/\s/, $correct); $base =~ s/\"//g; my %curr_tag; for my $tag (@tag_list) { foreach my $class (keys %tags) { if ( exists $tags{$class}{$tag} ) { $curr_tag{$class} = $tag; } } } my @list; for my $class (@classes) { if ( exists $curr_tag{$class} ) { push @list, $curr_tag{$class}; } else { push @list, ""; } } my $line = join "\t", $word, $base, @list, "\n"; print $line; }