#!/usr/bin/perl
use warnings;
use strict;
use utf8;

# These definitions ensure that the script works
# also in environments where PERL_UNICODE is not set.
binmode( STDIN,  ':utf8' );
binmode( STDOUT, ':utf8' );
binmode( STDERR, ':utf8' );
use open 'utf8';

# preprocess
# Perl script for text preprocessing
# - Cuts text into sentences and sentences into words (tokens).
# - Reads plain text from STDIN and outputs
#   a list of words separated by newlines.
#
# An abbreviation file may be given with the input parameter --abbr;
# the file has a special syntax and its usage is connected
# to other Sámi tools. See the documentation in
# http://giellatekno.uit.no/doc/ling/preprocessor.html
#
# $Id$

# Permit named arguments.
use Getopt::Long;

my %abbrs;
my %corrections;
my %corr_short;

my $TRAB="TRAB";
my $ITRAB="ITRAB";
my $TRNUMAB="TRNUMAB";
my $IDIOM="IDIOM";
my $NOAB="NOAB";
my $NUMNOAB="NUMNOAB";

# Max size of a multi-word expression.
my $MULTIWORD_SIZE = 3;

my $abbr_file;
my $help=0;
my $verbose=0;
my $hyph=0;
my $use_hyph_tag=0;
my $corr;
my $sentence_break=".";
my $xml;
my $no_xml_out;
my $line;
my $next_line;
my $connect;
my $space=0;
my $ltag = "<";
my $rtag = ">";
my $sentences=0;
my $no_utf8=0;
my $preserve_newline=0;
my $split_strings=1;

GetOptions ("abbr=s"       => \$abbr_file,
            "corr=s"       => \$corr,
            "break=s"      => \$sentence_break,
            "xml"          => \$xml,
            "no-xml-out"   => \$no_xml_out,
            "connect=s"    => \$connect,
            "help"         => \$help,
            "space"        => \$space,
            "ltag=s"       => \$ltag,
            "rtag=s"       => \$rtag,
            "v"            => \$verbose,
            "hyph"         => \$hyph,
            "use-hyph-tag" => \$use_hyph_tag);

if ($help) {
    &print_usage;
    exit;
}

my %connecting;
if ($connect) {
    for my $w ( split(",", $connect)) {
        $connecting{$w} = 1;
    }
}

my $quotations = quotemeta("«»‹›“”„‘’‚´`'\"");
my $other = quotemeta("…•¶½¾¼");    # dash to be added
my $parentheses = quotemeta("|{}[]()<>");
if ($xml) { $parentheses = quotemeta("|{}[]()"); }
my $general_punct = quotemeta("\$.*?!,;:.%");

# Punctuation marks that are always their own tokens,
# whether in a word or in a numeral expression.
# Characters are added to this list depending on whether
# the processed string is a word or a numeral.
my $SINGLE_PUNCT = $quotations . $parentheses . quotemeta("?!,;/\\") . $other;

# Regex for tokens that are kept intact even though they contain
# punctuation listed in $SINGLE_PUNCT.
my $CONTAIN_PUNCT = 'ja\/dahje|http|:\/\/|km\/h|www|@|\.jpg|\.doc|\.pdf|\.html|\.com|\.txt|\.no|¹|²|³|™';
if ($xml) { $CONTAIN_PUNCT .= "|<.*>"; }

# Punctuation that connects two or more numerals
# into one numeral expression.
my $NUM_PUNCT=quotemeta("-+*=/≈·");

my %idioms;
my %idioms_short;
my %num;
my %regex;

# If the abbreviation file is given,
# read the abbreviations from the file into a hash.
if ($abbr_file) {
    &read_abbr (\%abbrs, \%idioms, \%idioms_short, \%num, \%regex);
}

# Read the corrections file (typos.txt).
if ($corr) {
    &read_corr (\%corrections, \%corr_short);
}

# Read one line at a time but keep track of the next line.
my @next_words;
my @words;
while (<>) {
    chomp;
    if (/^\s*$/ && !eof) { next; }
    s/^\s*//;

    # Process the <hyph> tag.
    if (! $use_hyph_tag) {
        if ($hyph) { s/<hyph>/-/g; }
        else       { s/<hyph>//g; }
    }
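    # Illustrative example (assumed input): with --hyph a string like
    # "mearra<hyph>gáddi" becomes "mearra-gáddi" above, without the option
    # it becomes "mearragáddi"; with --use-hyph-tag the tag is left as it is.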
    # Process always the previous line, unless this is the first line.
    if(!$line && ! @words) {
        $line = $_;
        if (!eof) { next; }
        else { $next_line = undef; }
    }
    elsif(eof && ! $_) { $next_line = undef; }
    else { $next_line = $_; }

    # If no XML-processing, split the line into tokens.
    if(! $xml) {
        if (! @words) {
            while($line =~ s/^(\s*)([^\s]+)//) {
                my $word = $2;
                my $spacechar = $1;
                # Assume that a word containing a dot followed by a hyphen or
                # a parenthesis is an abbreviation followed by another word.
                # Split this word into two words, inserting a space after each of them.
                if ($word =~ /(.*\D)\.([-\)])(.*)/) {
                    push_word($1 . '.', " ");
                    push_word($2 . $3, " ");
                }
                else {
                    push_word($word, $spacechar);
                }
            }
            $line = undef;
        }
        if ($next_line) {
            while($next_line =~ s/^(\s*)([^\s]+)//) {
                my $word = $2;
                my $spacechar = $1;
                push_word($word, $spacechar);
            }
        }
        $next_line = undef;
    }
    else {
        # If the xml tags are included in the preprocessor output, they are
        # left untouched and just added to the tokens array.
        # XML markup that is not separated from the word by a space
        # is taken into account.
        if (!@words) { xml_tags($line, \@words); }
        xml_tags($next_line, \@next_words);
    }

    if ($preserve_newline) {
        $words[-1]{space} .= "\n";
    }

    process_words();
    @words = @next_words;
}
process_words();

sub push_word {
    my ($word, $spacechar) = @_;
    my %w = ('word'  => $word,
             'space' => $space ? $spacechar : " ");
    push (@words, \%w);
}

sub xml_tags {
    my ($line, $aref) = @_;
    @$aref = ();
    chomp $line;
    while ($line) {
        verbose("processing xml-tag", $line , __LINE__);
        if ($line =~ /^\s*$/) { last; }
        if ($line =~ s/^(\s*)([^<\s]+?)(?=(?:\s|$))//) {
            verbose("read", $2 , __LINE__);
            my %w = ('word' => $2, 'space' => $1);
            push (@$aref, \%w);
            next;
        }
        if ($line =~ s/^(\s*)(<[^<]*?>)//) {
            if ($2) {
                verbose("read", $2 , __LINE__);
                my %w = ('xml' => $2, 'space' => $1 );
                push (@$aref, \%w);
            }
            next;
        }
        while($line =~ s/^(\s*)([^<\s]+)//) {
            verbose("read", $2 , __LINE__);
            my %w = ('word' => $2, 'space' => $1);
            push (@$aref, \%w);
        }
    }
}

sub process_words {
    while (@words) {
        my $w_token = shift @words;

        # Leave xml tags untouched.
        if ($xml && $w_token->{xml}) {
            if ($space && $w_token->{space}) {
                print $ltag, $w_token->{space}, $rtag, "\n";
            }
            if (! $no_xml_out) { print $w_token->{xml}, "\n"; }
            next;
        }

        my $word = $w_token->{word};
        if (! $word) { $word = ""; }
        verbose("NEW", $word , __LINE__);

        # An array for storing the tokens, each token in its own slot.
        my @tokens;

        # Move forward if the word is nothing special.
        # This is for making preprocessing faster.
        if ($word =~ /^[^\W\d\s\n]*$/) {
            if((! $corr || ! $corr_short{$word}) && ! $idioms_short{$word}) {
                verbose("no correction, no idiom", $word , __LINE__);
                add_token(\@tokens, $w_token, $word);
                if ($sentences) { print_sentence(\@tokens); }
                else { print_tokens(\@tokens); }
                next;
            }
        }

        # Process "word1- og word2" strings.
        if ($word =~ /\w+\p{Pd}$/) {
            if (@words) {
                my $next_w = $words[0]{word};
                if ($connecting{$next_w}) {
                    $word .= $words[0]{space} . $next_w;
                    shift @words;
                    verbose("connecting", $word , __LINE__);
                    # Append also the word following the connecting word,
                    # unless it is an xml tag.
                    if ($words[0] && (! $xml || $words[0]{word} !~ /^</)) {
                        $word .= $words[0]{space} . $words[0]{word};
                        shift @words;
                    }
                }
            }
        }

        PROCESS_WORD: {

            # A word with a dot in the middle may be an abbreviation or a
            # numeral directly attached to another word. Unless the whole
            # string is a listed abbreviation or matches $CONTAIN_PUNCT,
            # split it and process the two parts separately.
            (my $abbr_test = $word) =~ s/\.$//;
            if ($split_strings
                && $word !~ /$CONTAIN_PUNCT/o
                && ! $abbrs{$abbr_test} && ! $abbrs{lc($abbr_test)}
                && $word =~ /^(\pL+\.)(\S+)$/) {
                my $second_word = $2;
                process_word ($1, $w_token, \@words, \@next_words, \@tokens);
                my %w = ('word' => $second_word);
                if ($second_word =~ /^[\W]?[^\W\d]+/) {
                    verbose("if word with dots", $second_word , __LINE__);
                    process_word ($second_word, \%w, \@words, \@next_words, \@tokens);
                }
                else {
                    verbose("if numeral with dots", $second_word , __LINE__);
                    process_numeral ($second_word, \%w, \@words, \@next_words, \@tokens);
                }
                last PROCESS_WORD;
            }
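            # Illustrative example (assumed behaviour): for a token like «Soria
            # the leading quotation mark is emitted below as a token of its own
            # and the remaining string Soria is examined further.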
            # Store punctuation from the front of the expression
            # in the tokens array.
            # TODO: check variable $CONTAIN_PUNCT.
            if ($word =~ s/^([$SINGLE_PUNCT\.]+)//o) {
                verbose("if front punct ", $word , __LINE__);
                my @punct = split(//, $1);
                add_array(\@tokens, \@punct);
                last PROCESS_WORD if (!$word);
            }

            if ($word =~ /^[\W]?\pL+/) {
                # Examine the type of the string. If the expression contains
                # alphabetical characters optionally preceded by one punctuation
                # character, it is a word.
                verbose("if word", $word , __LINE__);
                process_word ($word, $w_token, \@words, \@next_words, \@tokens);
            }
            else {
                # Otherwise it is processed as a numeral.
                process_numeral ($word, $w_token, \@words, \@next_words, \@tokens);
            }
        } # end of PROCESS_WORD

        if ($sentences) { print_sentence(\@tokens); }
        else { print_tokens(\@tokens); }
    }
}

sub process_word {
    my ($word, $w_token, $words_aref, $next_aref, $tokens_aref) = @_;
    verbose ("entering process_word", $word, __LINE__);

    # Variable for storing all the punctuation at the end of the word,
    # except the dot (kept for abbreviations) and some other punctuation
    # if needed.
    my $end_punct = "";
    # Variable storing the dot and possibly some other punctuation.
    my $rest_punct;

    # Correct the word if a corrections file is given.
    if ($corr ) {
        if ($corr_short{$word}) {
            test_corr(\$word, $words_aref, $next_aref, $tokens_aref);
        }
    }

    # Test for a multiword expression.
    if ( $idioms_short{$word}) {
        test_idiom(\$word, $words_aref, $next_aref, $MULTIWORD_SIZE);
    }

    my $end_num;
    if ($split_strings) {
        if ($word =~ /\.?(\d+\.?)$/){
            $word =~ s/(\d+\.?)$//;
            $end_num = $1;
            verbose ("process_word/end_num", $end_num, __LINE__);
        }
    }

    # Store the punctuation at the front of the string in the tokens array.
    if ($word =~ s/^([*+=%:\p{Pd}$other]+)//o) {
        my @punct = split(//, $1);
        add_array($tokens_aref, \@punct);
        verbose ("process_word/front_punct", $1, __LINE__);
    }

    # Store the punctuation at the end of the string in a variable.
    if ($word =~ s/([+=%:$SINGLE_PUNCT\§]+)$//o) {
        $end_punct = $1;
        if($corr) {
            if ($corr_short{$word}) {
                test_corr(\$word, $words_aref, $next_aref, $tokens_aref);
            }
        }
        verbose ("process_word/end_punct", $end_punct, __LINE__);
        verbose ("process_word/end_punct/word", $word, __LINE__);
    }

    TEST: {
        # If the punctuation at the end contains a sentence delimiter
        # such as ? or !, the word ends the sentence in any case, so the
        # word can be treated as a token.
        if ($end_punct =~ /[¶\?\!\.]/) {
            if($corr) {
                if ($corr_short{$word}) {
                    test_corr(\$word, $words_aref, $next_aref, $tokens_aref);
                }
            }
            add_token($tokens_aref, $w_token, $word);
            last TEST;
        }

        # The word is checked for abbreviation and sentence boundary.
        my $abbr = $word;
        if ($abbr) {
            $abbr =~ s/\.$//;
            if ($abbrs{$abbr} || $abbrs{lc($abbr)}) {
                verbose ("process_word/go_to_test_abbr", $abbr, __LINE__);
                test_abbr($word, $w_token, $words_aref->[0]{word}, $tokens_aref);
                last TEST;
            }
            elsif ($abbr =~ /^[\p{Lu}]{2,3}$/o ) {
                # Check for an abbreviation with 2-3 capital letters.
                # If it is followed by a capital letter or a number,
                # there is a sentence boundary.
                # TODO: Remove this as irrelevant!!
                verbose ("process_word/test_two_three_cap_letters", $abbr, __LINE__);
                if ($words_aref->[0]{word} && $words_aref->[0]{word} !~ /^[\p{Ll}]/o ) {
                    if($corr) {
                        if ($corrections{$word}) {
                            test_corr(\$word, $words_aref, $next_aref, $tokens_aref);
                        }
                    }
                    my @tmp = ($abbr, ".");
                    add_array($tokens_aref, \@tmp);
                    last TEST;
                }
                else {
                    add_token($tokens_aref, $w_token, $word);
                    last TEST;
                }
            }
            elsif ($abbr =~ /^\p{Ll}$/){
                # An abbreviation consisting of a single lowercase letter, like "b."
                add_token($tokens_aref, $w_token, $word);
                if ($words_aref->[0]{word} && $words_aref->[0]{word}!~ /^[\p{Ll}]/o ) {
                    my @tmp = (".");
                    add_array($tokens_aref, \@tmp);
                }
                last TEST;
            }
        }
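        # Illustrative example (assumed behaviour) of the capital-letter branch
        # above: "NSR. Dál" yields the tokens NSR and . (a sentence-break
        # candidate), while "NSR. lea" keeps "NSR." as one token.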
        # Cut the word into tokens if there is punctuation in the middle,
        # e.g. gielddat/guovllut.
        # Check for expressions in the $CONTAIN_PUNCT variable.
        verbose ("process_word/not_abbr", "$word", __LINE__);
        if (($split_strings && $word =~ /^(\pL+\.)([\p{Pd}\pL]+.*)$/)
            || $word =~ /([\pL\p{Pd}]+)([^\pL0-9\. \:\-\`]+[\p{Pd}\pL]+.*)$/) {
            if ($word !~ /$CONTAIN_PUNCT/o) {
                $word = $1;
                my %w = ('word' => $2);
                unshift (@{$words_aref}, \%w);
                process_word($1, $w_token, $words_aref, $next_aref, $tokens_aref);
                verbose ("process_word/contain_punct", "$word", __LINE__);
                return 1;
            }
        }

        # If the word was not an abbreviation, the rest of the punctuation
        # is removed and stored as separate tokens.
        # This operation takes the dot out.
        if ($word =~ s/([$SINGLE_PUNCT\.:]+)$//o) {
            $rest_punct = $1;
            verbose ("process_word/rest_punct", "$word $rest_punct", __LINE__);
            if($corr) {
                if ($corrections{$word}) {
                    test_corr(\$word, $words_aref, $next_aref, $tokens_aref);
                }
            }
            add_token($tokens_aref, $w_token, $word);
            my @rest = split ("", $rest_punct);
            add_array($tokens_aref, \@rest);
            last TEST;
        }

        add_token($tokens_aref, $w_token, $word);
    } # end of block TEST

    my @ep = split(//, $end_punct);
    unshift(@ep, $end_num) if defined $end_num;
    add_array($tokens_aref, \@ep);
}
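# The abbreviation classes used in test_abbr() below come from the --abbr
# file, which read_abbr() parses section by section. A minimal sketch of the
# assumed file layout (the entries are only illustrative; the real file is
# generated by the Sámi language tools):
#
#   LEXICON ITRAB
#   jna.
#   LEXICON TRAB
#   dr.
#   LEXICON IDIOM
#   Norgga Sámiid Riikkasearvi
#   LEXICON NUM
#   s
#
# TRAB marks transitive abbreviations (never followed by a sentence break),
# ITRAB intransitive ones (may end a sentence); TRNUMAB, NOAB and NUMNOAB
# are handled in the corresponding branches below.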
sub test_abbr {
    my ($word, $w_token, $next_word, $tokens_aref) = @_;

    my $abbr = $word;
    verbose("test_abbr", $abbr, __LINE__);
    $abbr =~ s/\.$//;

    # Transitive abbreviations are never followed
    # by a sentence boundary.
    if (($abbrs{$abbr} && $abbrs{$abbr} eq $TRAB)
        || ($abbrs{lc($abbr)} && $abbrs{lc($abbr)} eq $TRAB)) {
        add_token($tokens_aref, $w_token, $word);
        return 1;
    }
    elsif (($abbrs{$abbr} && $abbrs{$abbr} eq $TRNUMAB)
           || ($abbrs{lc($abbr)} && $abbrs{lc($abbr)} eq $TRNUMAB)) {
        # There is CLB after TRNUMAB only for the capital+small combinations
        # and all small-initial strings that consist of more than one letter.
        # For all other strings (one small letter, one or several capital
        # letters, number) we want no CLB after TRNUMAB.
        add_token ($tokens_aref, $w_token, $word);
        if (!$next_word
            or ($next_word && $next_word =~ /^\p{Lu}/
                && $next_word !~ /^(\p{Lu}|[IVXCDLM]+)$/o)) {
            add_new_token ($tokens_aref, $sentence_break);
        }
        return 1;
    }
    elsif (($abbrs{$abbr} && $abbrs{$abbr} eq $ITRAB)
           || ($abbrs{lc($abbr)} && $abbrs{lc($abbr)} eq $ITRAB)) {
        # There is a sentence boundary if the intransitive abbreviation
        # is NOT followed by a small alphabetic char or punctuation,
        # or if it is the last word.
        add_token ($tokens_aref, $w_token, $word);
        if (!$next_word or ($next_word && $next_word !~ /^[\p{Ll}\pP]/o)) {
            verbose("ITRAB", $next_word, __LINE__);
            add_new_token ($tokens_aref, $sentence_break);
        }
        return 1;
    }
    elsif (($abbrs{$abbr} && $abbrs{$abbr} eq $NUMNOAB)
           || ($abbrs{lc($abbr)} && $abbrs{lc($abbr)} eq $NUMNOAB)) {
        # The expected behaviour of $NUMNOAB is shown in
        # $GTHOME/tools/abbrtester/abbrtester.py
        if ($next_word) {
            if ($next_word =~ /^\p{Lu}/) {
                $word =~ s/\.$//;
                add_token ($tokens_aref, $w_token, $word);
                add_new_token ($tokens_aref, $sentence_break);
            }
            elsif ($next_word =~ /^\d/) {
                add_token ($tokens_aref, $w_token, $word);
            }
            elsif ($next_word =~ /^\p{Ll}/) {
                add_token ($tokens_aref, $w_token, $word);
            }
        }
        else {
            $word =~ s/\.//;
            add_token ($tokens_aref, $w_token, $word);
            add_new_token ($tokens_aref, $sentence_break);
        }
        return 1;
    }
    if (($abbrs{$abbr} && $abbrs{$abbr} eq $NOAB)
        || ($abbrs{lc($abbr)} && $abbrs{lc($abbr)} eq $NOAB)) {
        if ($next_word && $next_word !~ /^[\p{Ll}\pP]/o) {
            $word =~ s/\.$//;
            add_token($tokens_aref, $w_token, $word);
            add_new_token ($tokens_aref, $sentence_break);
        }
        else {
            add_token($tokens_aref, $w_token, $word);
        }
        return 1;
    }
    return 0;
}
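# The --corr file (e.g. typos.txt) read by read_corr() is assumed to contain
# tab-separated "misspelling<TAB>correction" pairs, one per line, for example
# (illustrative entry only):
#
#   sámedigi<TAB>sámediggi
#
# test_corr() below replaces a recognised misspelling with its correction and
# splits a multi-word correction into separate tokens.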
# If a word is in the typos list, it is replaced by the correct reading.
sub test_corr {
    my ($word_ref, $words_aref, $next_aref, $tokens_aref) = @_;
    my $size=2;
    my $i=1;

    # If the word itself has a correction (a single-word key).
    if (my $correct = $corrections{$$word_ref}) {
        my @parts = split (/ /o, $correct);
        if (scalar @parts > 1) {
            if ( $idioms{$correct}) {
                $$word_ref = $correct;
                verbose ("test_corr/split_words", "$correct", __LINE__);
            }
            else {
                $$word_ref = pop @parts;
                add_array($tokens_aref, \@parts);
                verbose ("test_corr/split_words", "$correct", __LINE__);
            }
        }
        else { $$word_ref = $correct; }
        return 0;
    }

    my $next;
    my $idiom;
    my $ucidiom;
    if( $words_aref->[0]{word} ) {
        $idiom = $$word_ref . $words_aref->[0]{space} . $words_aref->[0]{word};
        $ucidiom = ucfirst($$word_ref) . $words_aref->[0]{space} . ucfirst($words_aref->[0]{word});
    }
    elsif ($next_aref->[0]{word}) {
        $idiom = $$word_ref . $next_aref->[0]{space} . $next_aref->[0]{word};
        $ucidiom = ucfirst($$word_ref) . $next_aref->[0]{space} . ucfirst($next_aref->[0]{word});
        $next=1;
    }
    else { return 0; }

    while ($i <= $size) {
        # Remove the punctuation at the end of the expression.
        (my $idiom2 = $idiom) =~ s/([^\w]*)$//;
        (my $ucidiom2 = $ucidiom) =~ s/([^\w]*)$//;
        my $end_punct = $1;

        # If the expression contains punctuation in the middle, return.
        return if ($idiom2 =~ /[^\w\s\d\.\-]/);

        # Test whether the formed multiword expression exists in the
        # corrections list. Test also the lower case version.
        verbose ("test_corr", $idiom, __LINE__);
        if ($corrections{$idiom2} || $corrections{lc($idiom2)} || $corrections{$ucidiom2}) {
            my $correct = $corrections{$idiom2} || $corrections{lc($idiom2)} || $corrections{$ucidiom2};
            verbose ("test_corr/correction", "$correct", __LINE__);
            CHECK_IDIOM : {
                if ( $idioms{$correct}) {
                    $$word_ref = $correct;
                    $$word_ref .= $end_punct;
                    verbose ("test_corr/idiom", "$correct", __LINE__);
                    last CHECK_IDIOM;
                }
                my @parts = split (/ /o, $correct);
                if (scalar @parts > 1) {
                    $$word_ref = pop @parts;
                    $$word_ref .= $end_punct;
                    add_array($tokens_aref, \@parts);
                    verbose ("test_corr/split_words", "$correct", __LINE__);
                }
            } # end of block CHECK_IDIOM

            # Remove the parts of the multiword expression from
            # the word array.
            if($next) { splice (@{$next_aref}, 0, $i+1); }
            else { splice (@{$words_aref}, 0, $i+1); }
            return 1;
        }
        $i++;
        if($words_aref->[$i]) {
            $idiom = $idiom . $words_aref->[$i]{space} . $words_aref->[$i]{word};
        }
        elsif(! $next && $next_aref->[0]) {
            $idiom = $idiom . $next_aref->[0]{space} . $next_aref->[0]{word};
            $next=1;
        }
        else { return 0; }
    }
}
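# Illustrative example (assumed data): if the idiom list contains the
# multiword expression "Norgga Sámiid Riikkasearvi", the three consecutive
# tokens Norgga / Sámiid / Riikkasearvi are joined by test_idiom() below
# into a single token.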
# If the word starts a multiword expression, it is replaced
# with the whole expression. The other parts are
# removed from the words array.
sub test_idiom {
    my ($word_ref, $words_aref, $next_aref, $size) = @_;

    # Test for multiword expressions by growing the token
    # one word at a time.
    my $next=0;
    my $i=0;
    my $last_part;
    my $idiom;
    my $ucidiom;

    if ($words_aref->[0]{word} or $next_aref->[0]{word}) {
        if( $words_aref->[0]{word} ) {
            if ($words_aref->[0]{space}) {
                $idiom = $$word_ref . $words_aref->[0]{space} . $words_aref->[0]{word};
                $ucidiom = ucfirst($$word_ref) . $words_aref->[0]{space} . ucfirst($words_aref->[0]{word});
            }
            else {
                $idiom = $$word_ref . $words_aref->[0]{word};
                $ucidiom = ucfirst($$word_ref) . ucfirst($words_aref->[0]{word});
            }
        }
        elsif ($next_aref->[0]{word}) {
            $idiom = $$word_ref . $next_aref->[0]{space} . $next_aref->[0]{word};
            $ucidiom = ucfirst($$word_ref) . $next_aref->[0]{space} . ucfirst($next_aref->[0]{word});
            $next=1;
        }

        while ($i <= $size) {
            # Remove the punctuation at the end of the expression.
            (my $idiom2 = $idiom) =~ s/[^\w]*$//;
            (my $ucidiom2 = $ucidiom) =~ s/[^\w]*$//;

            # If the expression contains punctuation in the middle, return.
            return if ($idiom2 =~ /[^\w\s\d\.\-]/);

            # Test whether the formed multiword expression exists in the
            # idiom list. Test also the lower case version.
            verbose ("test_idiom", $idiom, __LINE__);
            if ($idioms{$idiom2} || $idioms{lc($idiom2)} || $idioms{$ucidiom2}) {
                for (my $j=0; $j <= $i; $j++) {
                    # Construct the new multiword processing unit and
                    # remove the parts of the multiword expression from
                    # the word array.
                    if($next) {
                        $$word_ref = $$word_ref . $next_aref->[0]{space} . $next_aref->[0]{word};
                        shift @{$next_aref};
                    }
                    else {
                        if ($words_aref->[0]{space}) {
                            $$word_ref = $$word_ref . $words_aref->[0]{space};
                        }
                        if ($words_aref->[0]{word}) {
                            $$word_ref = $$word_ref . $words_aref->[0]{word};
                        }
                        shift @{$words_aref};
                    }
                }
                return 1;
            }
            $i++;
            if($words_aref->[$i]) {
                $idiom = $idiom . $words_aref->[$i]{space} . $words_aref->[$i]{word};
                $ucidiom = $ucidiom . $words_aref->[$i]{space} . ucfirst($words_aref->[$i]{word});
            }
            elsif(! $next && $next_aref->[0]{word} && $next_aref->[0]{space}) {
                $idiom = $idiom . $next_aref->[0]{space} . $next_aref->[0]{word};
                $ucidiom = $ucidiom . $next_aref->[0]{space} . ucfirst($next_aref->[0]{word});
                $next=1;
            }
            else { return 0; }
        }
    }
    else { return 0; }
}

sub process_numeral {
    my ($word, $w_token, $words_aref, $next_aref, $tokens_aref) = @_;
    verbose ("process_numeral", $word, __LINE__);

    if ($corr ) {
        if ($corr_short{$word}) {
            test_corr(\$word, $words_aref, $next_aref, $tokens_aref);
        }
    }

    # Search for numeral expressions with spaces.
    COMBINE: {
        # Consider date expressions first.
        # At the moment this covers years: 1984-2000, 1984- 2000 etc.
        if ($word =~ /^\d{4}\p{Pd}?$/o) {
            while ($words_aref->[0]
                   && $words_aref->[0]{word} =~ /^(?:\p{Pd}|\p{Pd}?(?:\d{4}|\d{2}))\.?$/o) {
                if ($word =~ /\-/ || $words_aref->[0]{word} =~ /\-/) {
                    $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
                    verbose ("process_numeral/combine date", $word, __LINE__);
                    shift @{$words_aref};
                }
                else { last COMBINE; }
            }
            last COMBINE;
        }

        # Look for date ranges, e.g. 2. - 3. mars
        # First check that all variables that are accessed are initialized.
        if ($word && $words_aref->[0]{word} && $words_aref->[1]{word}) {
            # Then do the real work.
            if ( ($word =~ /^\d{1,2}\.$/              # $word is a number with at most two digits ending with a dot
                  || $word =~ /^\d{1,2}\.\d{1,2}\.$/ )   # or a number of the format dd.dd.
                 && $words_aref->[0]{word} =~ /-/        # the first word following $word is a hyphen
                 && ($words_aref->[1]{word} =~ /^\d{1,2}\.$/    # the second word following $word is a number with at most two digits ending with a dot
                     || $words_aref->[1]{word} =~ /^\d{1,2}\.\d{1,2}\.\d{1,2}/)  # or has the format dd.dd.dd
               ) {
                $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
                shift @{$words_aref};
                $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
                shift @{$words_aref};
                verbose ("process_numeral/combine", $word, __LINE__);
                last COMBINE;
            }
        }

        # Combine punctuation with the numeral if it is followed by another
        # numeral: cases like 123- 456, 123 -456, 123 456, 123 - 456,
        # 10 000,- and math expressions like 2 * 4 = 8.
        # This often connects long sequences of numbers into one token.
        while (($word =~ /^[\d$NUM_PUNCT\- ]+$/o)
               && ($words_aref->[0]{word}
                   && $words_aref->[0]{word} =~ /^[\d$NUM_PUNCT\- ]+(\,\-)?\.?$/o)) {
            $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
            verbose ("process_numeral/combine", $word, __LINE__);
            shift @{$words_aref};
        }
    } # end of COMBINE

    # Combine the percent sign with the numeral when they are separate,
    # covering cases like: 50 %
    if (($word =~ /\d$/)
        && ($words_aref->[0]{word} && $words_aref->[0]{word} =~ /^\%/)) {
        $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
        verbose ("process_numeral/combine", $word, __LINE__);
        shift @{$words_aref};
    }

    # First clean the end of the token, where punctuation follows
    # the numeral or an ordinal and does not belong to the expression:
    # cases like 123!, 123.), 123). and 10,-.
    # Problem: 123.? and 123 as an ordinal.
    my $end_punct;
    if ($word =~ s/([$SINGLE_PUNCT:]+\.?)$//o){ $end_punct = $1; }
    if ($word =~ /([$SINGLE_PUNCT:\p{Pd}\%]+\.?)$/o){
        $word =~ s/(\.?)$//;
        $end_punct = $1;
    }
    verbose ("process_numeral", $word, __LINE__);
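    # Illustrative examples (assumed behaviour) of the splitting below:
    #   "10s"  is kept as one token if "s" is listed in the NUM lexicon,
    #          otherwise it is split into "10" and "s";
    #   "6.b." has its trailing word/abbreviation part split off from the
    #          numeral and processed separately.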
    my $end_word;
    if ($split_strings) {
        SPLIT_NUMERAL : {
            # Check if the word is an inflected numeral: 12s, etc.
            if ( $word =~ /^(\d+)([\:\'\-]?)([\pL]+)(\.)?$/o ) {
                verbose ("process_numeral/inflected", $word, __LINE__);
                if ( $num{$3} ) { last SPLIT_NUMERAL; }
                if (! $2 ) {
                    verbose ("add_token", "$1", __LINE__);
                    add_token($tokens_aref, $w_token, $1);
                    my %w = ('word' => $3);
                    process_word ($3, \%w, $words_aref, $next_aref, $tokens_aref);
                    if ($end_punct) {
                        if ($4) { $end_punct = $4 . $end_punct; }
                    }
                    else { $end_punct = $4; }
                    if ($end_punct) {
                        my @ep = split(//, $end_punct);
                        verbose ("add_array", "@ep", __LINE__);
                        add_array($tokens_aref, \@ep);
                    }
                    return;
                }
                last SPLIT_NUMERAL;
            }
        }

        # Check if there is an abbreviation or a word attached to the
        # numeral, e.g. 6.b. or ...ovdal. This introduces problems with some
        # misspellings in headings, like 1.6.Vuonain
        if ($word =~ /\.([^\W\d]+\.?)$/ && $word !~ /$CONTAIN_PUNCT/o) {
            $word =~ s/([^\W\d]+\.?)$//;
            verbose ("process_numeral/end_word", $1, __LINE__);
            process_numeral($word, $w_token, $words_aref, $next_aref, $tokens_aref);
            my $new_word = "";
            if ($end_punct) { $new_word = $1 . $end_punct; }
            else { $new_word = $1; }
            my %w = ('word' => $new_word);
            unshift (@$words_aref, \%w);
            return;
        }
    } # end of if ($split_strings)

    verbose ("process_numeral", $word, __LINE__);

    my $rest_punct;
    NUM_TEST: {
        # Check for an ending dot; it is a separate token if
        # the following word starts with a capital letter.
        # If there is an abbreviation attached to the numeral, skip the check.
        # Otherwise the dot belongs to the expression.

        # Process first the abbreviation or word
        # that is attached to the numeral.
        if ($end_word) {
            verbose ("process_numeral/test_abbr", $end_word, __LINE__);
            my $abbr = $end_word;
            if (( $abbr =~ s/\.$//) && ($abbrs{$abbr} || $abbrs{lc($abbr)})) {
                verbose ("process_numeral/test_abbr", $abbr, __LINE__);
                test_abbr($end_word, $w_token, $words_aref->[0]{word}, $tokens_aref);
                last NUM_TEST;
            }
            # Case with 3.b.
            #elsif ( $end_word =~ /^\p{Lc}\.$/) {
            #    verbose ("process_numeral/test_abbr", $abbr, __LINE__);
            #    push (@{$tokens_aref}, $end_word);
            #    if ($words_aref->[0] =~ /^[\p{Lu}]/o) {
            #        push (@{$tokens_aref}, ".");
            #    }
            #    last NUM_TEST;
            #}
            else {
                verbose ("process_numeral/test_abbr", $abbr, __LINE__);
                process_word ($end_word, $w_token, $words_aref, $next_aref, $tokens_aref);
                last NUM_TEST;
            }
        }

        DOT_TEST: {
            last DOT_TEST if ($word !~ /\.$/);
            (my $nopunct = $word) =~ s/\.$//;

            # Try to spot dates and other expressions like 31.10.:
            # preserve the ending dot and find the sentence boundary.
            #if ($word =~ /^(\d{1,2}\.){2,}$/) {
            #    verbose ("process_numeral/test_date", $word, __LINE__);
            #    if ($words_aref->[0] =~ /^[\p{Lu}]/o) {
            #        $end_punct = "." . $end_punct;
            #    }
            #    last DOT_TEST;
            #}

            # If the next word does not exist or starts with an uppercase
            # letter (e.g. after a year like "1998."), the ending dot is
            # detached from the numeral.
            if($nopunct =~ /[§\d\pP\pL]/
               && ((!($words_aref->[0]{word}) || $words_aref->[0]{word} =~ /^[\p{Lu}]/o))){
                $word = $nopunct;
            }
            elsif ($word =~ /^\d{1,3}\.$/ ) {
                # Otherwise the dot is considered to mark an ordinal.
                if ($words_aref->[0]{word} !~ /^[\p{Lu}]/o) {
                    last DOT_TEST;
                }
            }
            elsif ($nopunct =~ /[§\d\pP\pL]/ && $words_aref->[0]{word} =~ /^[$parentheses°]/o) {
                $word = $nopunct;
            }
            else { last DOT_TEST; }

            if ($end_punct) { $end_punct = "." . $end_punct; }
            else { $end_punct = "."; }
            verbose ("process_numeral/ending dot", $word, __LINE__);

            # Clean the rest of the token:
            # cases like 123). where the dot is preceded by punctuation.
            if ($word =~ s/([$SINGLE_PUNCT:]+)$//o) {
                $rest_punct = $1;
            }
        } # end of DOT_TEST

        verbose ("process_numeral", $word, __LINE__);
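        # Illustrative example (assumed behaviour) of DOT_TEST above:
        # in "... 1998. Dál ..." the dot after the year is split off as a
        # token of its own, while in "... 3. luohkká ..." the dot stays
        # attached to the ordinal "3.".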
        # Push everything to the tokens array.
        verbose ("add_token", "$word", __LINE__);
        add_token($tokens_aref, $w_token, $word);
        if ($end_word) {
            verbose ("add_token", "$end_word", __LINE__);
            add_token($tokens_aref, $w_token, $end_word);
        }
    } # end of NUM_TEST

    if ($rest_punct) {
        my @rp = split(//, $rest_punct);
        verbose ("add_array", "$word", __LINE__);
        add_array($tokens_aref, \@rp);
    }
    if ($end_punct) {
        my @ep = split(//, $end_punct);
        verbose ("add_array", "$word", __LINE__);
        add_array($tokens_aref, \@ep);
    }
}

sub verbose {
    my ($from, $word, $linenumber) = @_;
    if ($verbose) {
        print STDERR "[$from:$linenumber] $word\n";
    }
}

# Add text to an existing token.
sub add_token{
    my ($tokens_aref, $w_token, $content) = @_;
    $$w_token{word} = $content;
    push @{$tokens_aref}, $w_token;
}

# Create a new token and add text to it.
sub add_new_token{
    my ($tokens_aref, $content) = @_;
    my %w = ('word' => $content);
    push @{$tokens_aref}, \%w;
}

# Create several new tokens from an array.
sub add_array{
    my ($tokens_aref, $content_aref) = @_;
    for my $p (@{$content_aref}) {
        my %w = ('word' => $p);
        push @{$tokens_aref}, \%w;
    }
}

sub print_token{
    my ($w_token) = @_;
    if ($space && $w_token->{space}) {
        print $ltag, $w_token->{space}, $rtag, "\n";
    }
    print $w_token->{word}, "\n";
}

# Combine tokens with spaces and add a newline after each sentence break.
sub print_sentence {
    my $tokens_aref = shift @_;
    my $sentend=0;
    my $break = quotemeta($sentence_break);
    for my $t (@$tokens_aref) {
        if (! $sentend && $t->{space}) { print $t->{space}; }
        next if ($t->{word} =~ /^$/);
        if ($t->{word} =~ /^$break$/) {
            my $n_word = $t->{word};
            if ($sentence_break ne ".") {
                $n_word =~ s/$sentence_break//g;
            }
            print "$n_word\n\n";
        }
        else { print $t->{word}; }
    }
}

# Print the list of tokens.
sub print_tokens{
    my ($tokens_aref) = @_;
    for my $t (@$tokens_aref) {
        if ($space && $t->{space}) {
            print $ltag, $t->{space}, $rtag, "\n";
        }
        if ($t->{word}) {
            if ($t->{word} !~ /^$/) {
                my $w = $t->{word};
                $w =~ s/ $//;
                print "$w\n";
            }
        }
    }
}

# Read the typos list.
sub read_corr {
    my ($corr_href, $corr_short_href) = @_;

    local $/="\n";
    open CORR, "< $corr" or die "preprocess: Can't open the file $corr: $!\n";
    while (<CORR>) {
        chomp;
        next if (/^!/);
        next if (/^\s*$/);
        next if (/^\#/);
        s/\s$//;
        my ($error, $correct) = split(/\t+/);
        if ($error && $correct) {
            $error =~ s/\s$//;
            $$corr_href{$error} = $correct;
            my ($first, $last) = split(/ /, $error, 2);
            $$corr_short_href{$first} = 1;
        }
        else {
            print STDERR "preprocess warning: Line not included in the typos list: $_\n";
        }
    }
    close CORR;
    # for my $key (keys %$corr_href) {
    #     print "$key $$corr_href{$key}\n";
    # }
}

sub read_abbr {
    my ($abbr_href, $idiom_href, $idiom_short_href, $num_href, $regex_href) = @_;

    open LEX, "< $abbr_file" or die "preprocess: Can't open the file $abbr_file: $!\n";

    # Abbreviation classes, up to the IDIOM section.
    my $current;
    while (<LEX>) {
        chomp;
        last if (/^LEXICON IDIOM/);
        if (/^LEXICON\s+(.*?)\s*$/) { $current = $1; next; }
        $$abbr_href{$_} = $current;
    }
    # Multiword expressions, up to the NUM section.
    while (<LEX>) {
        chomp;
        last if (/^LEXICON NUM/);
        my ($first, $last) = split(/ /, $_, 2);
        $$idiom_href{$_} = 1;
        if ($first) { $$idiom_short_href{$first} = 1; }
    }
    # Numeral endings.
    while (<LEX>) {
        next if /^\s*$/;
        chomp;
        $$num_href{$_} = 1;
    }
    close LEX;
}
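# Typical invocations (illustrative only; the file names are examples and the
# script is assumed to be called as "preprocess"):
#
#   preprocess --abbr=abbr.txt < article.txt
#   preprocess --abbr=abbr.txt --corr=typos.txt --xml < article.xml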
sub print_usage {
    print <<"END";
Usage: preprocess [OPTIONS] < FILE
Split text in FILE into sentences and words.

Options:
  --abbr=<file>    Abbreviation file, see the documentation referred to above.
  --hyph|h         Show the hyphenation points, i.e. change the <hyph> tags
                   to hyphens. The default is to just remove the tags.
  --use-hyph-tag   Leave the <hyph> tags untouched.
  --break=<char>   Use <char> instead of . as the sentence delimiter.
  --connect=<list> Comma-separated list of words which connect expressions
                   like "fisk- og vilthandelen".
  --space|s        Preserve space.
  --ltag=<tag>     Left tag for space, default <
  --rtag=<tag>     Right tag for space, default >
  --help           Print this help text and exit.
  --v              Print information about the execution of the script.
  --corr=<file>    List of common typos and their corrections (e.g. typos.txt).
  --xml|x          Accept xml-formatted input, print each tag on its own line.
  --no-xml-out|n   Used together with --xml, do not print the xml tags.
END
}