#!/usr/bin/perl -w use strict; use utf8; # comments2lexctags.pl # Perl script for parsing lexicon files for tags. # Usage: comments2lexctags.pl lexicon_file > output_file # # Transform tags in comments to the entry: # abskis0sa:abskis'sa GOAHTI "absciss N" ; !+SgNomCmp +SgGenCmp +PlGenCmp # +SgNomCmp+SgGenCmp+PlGenCmpabskis0sa:abskis'sa GOAHTI "absciss N" ; !+SgNomCmp +SgGenCmp +PlGenCmp # # $Id$ my $file_name = $ARGV[$#ARGV]; my $inroot = 0; my $inacro = 0; if ($file_name =~ /\/sm[aej]-lex./) { $inroot = 1; } my $root_tags = ""; if ( $inroot ) { # Skip the definitions in the beginning of the file # Start processing the first lexicon while(<>) { print; last if (/LEXICON/); } } while (<>) { #Pitäisi hypätä tyhjien rivien yli if (/^\s*$/) { print; next; } #Pitäisi hypätä rivien yli, jotka alkaa '!' -merkillä # but allow whitespace chars in front of the ! if (/^\s*\!/) { print; next; } if (/LEXICON/ && $inroot) { if (! /\!.*\+/) { $root_tags = ""; print; next; } else { my ($entry, $comments) = split (/\!/, $_); my $tags = &process_comments($comments); $root_tags = $tags; print; next; } } if (/LEXICON/ && $inacro) { $inacro = 0; } if ((/LEXICON FIRSTLETTER/ || /LEXICON ARABIC\s/ || /LEXICON SCND/ || /LEXICON THRD/ || /LEXICON FRTH/) || $inacro) { $inacro = 1; print; next; } if (/LEXICON/ && ! $inroot) { print; next; } if ((! /\!.*/) && ($root_tags eq "")) { print; next; } chomp; my ($entry, $comments) = split (/\;/, $_); my $tags = &process_comments($comments); # my $new_tags = join ("",@tagset); $entry =~ s/^\s+//; # $entry = " " . $entry; if ($entry !~ /:/ && $tags =~ /\S+/) { if ($entry =~ /^\S+\s+$/) { $entry = ": " . $entry; } else { # print $entry; print "TOMI"; my ($lemma, $cont) = split (/(?