#!/usr/bin/perl

use warnings;
use strict;
use utf8;
use open ':utf8';

# permit named arguments
use Getopt::Long;

# These definitions ensure that the script works
# also in environments where PERL_UNICODE is not set.
binmode( STDIN,  ':utf8' );
binmode( STDOUT, ':utf8' );
binmode( STDERR, ':utf8' );

# preprocess
# Perl script for text preprocessing
# - Cuts text into sentences and sentences into words (tokens).
# - Reads plain text from STDIN and outputs a list of words
#   separated by newlines.
#
# An abbreviation file may be given with the parameter --abbr;
# the file has a special syntax and its usage is connected
# to other Sámi tools. See the documentation at
# https://giellalt.uit.no/ling/preprocessor.html
#
# $Id$

my %abbrs;
my %corrections;
my %corr_short;

my $TRAB    = "TRAB";
my $ITRAB   = "ITRAB";
my $TRNUMAB = "TRNUMAB";
my $IDIOM   = "IDIOM";
my $NOAB    = "NOAB";
my $NUMNOAB = "NUMNOAB";

# Max size of a multi-word expression.
my $MULTIWORD_SIZE = 3;

my $abbr_file;
my $help             = 0;
my $verbose          = 0;
my $hyph             = 0;
my $use_hyph_tag     = 0;
my $corr;
my $sentence_break   = ".";
my $xml;
my $no_xml_out;
my $line;
my $next_line;
my $connect;
my $space            = 0;
my $ltag             = "<";
my $rtag             = ">";
my $no_utf8          = 0;
my $preserve_newline = 0;

GetOptions(
    "abbr=s"       => \$abbr_file,
    "corr=s"       => \$corr,
    "break=s"      => \$sentence_break,
    "xml"          => \$xml,
    "no-xml-out"   => \$no_xml_out,
    "connect=s"    => \$connect,
    "help"         => \$help,
    "space"        => \$space,
    "ltag=s"       => \$ltag,
    "rtag=s"       => \$rtag,
    "v"            => \$verbose,
    "hyph"         => \$hyph,
    "use-hyph-tag" => \$use_hyph_tag,
);

if ($help) {
    &print_usage;
    exit;
}

my %connecting;
if ($connect) {
    for my $w ( split( ",", $connect ) ) {
        $connecting{$w} = 1;
    }
}

# my $quotations = quotemeta("«»‹›“”„‘’‚´`'\"");   # original
my $quotations = quotemeta("«»‹›“”„‘’‚`'\"");    # Skolt Sami fix without ACUTE ACCENT as delimiter
my $other = quotemeta("…•¶½¾¼°◊√∆");             # dash to be added
my $parentheses = quotemeta("|{}[]()<>");
if ($xml) {
    $parentheses = quotemeta("|{}[]()");
}
my $general_punct = quotemeta("\$.*?!,;:.%");

# Punctuation marks that are always their own tokens,
# whether in a word or in a numeral expression.
# Characters are added to this list depending on whether
# the processed string is a word or a numeral.
my $SINGLE_PUNCT = $quotations . $parentheses . quotemeta("?!,;/\\") . $other;

# Regex for tokens that nevertheless contain punctuation that
# is specified in the variable $SINGLE_PUNCT.
my $CONTAIN_PUNCT = 'ja\/dahje|http|:\/\/|km\/h|www|@|\.jpg|\.doc|\.pdf|\.html|\.com|\.txt|\.no|¹|²|³|⁴|⁵|⁶|⁷|⁸|⁹|™';
if ($xml) {
    $CONTAIN_PUNCT .= "|<.*>";
}

# Punctuation that connects two or more numerals
# into one numeral expression.
my $NUM_PUNCT = quotemeta("-+*=/≈·");

my %idioms;
my %idioms_short;
my %num;
my %regex;
my @next_words;
my @words;

preprocess_lines();

###
# Subroutines from here on
###

sub preprocess_lines {
    read_abbr( \%abbrs, \%idioms, \%idioms_short, \%num, \%regex );
    read_corr( \%corrections, \%corr_short );
    loop_on_input();
}

# Read one line at a time but keep track of the next line.
sub loop_on_input {
    while (<>) {
        chomp;
        if (/^$/) {
            $_ = "¶";
        }
        if ( !( /^\s*$/ && !eof ) ) {
            s/^\s*//;

            # Process the <hyph/> tag.
            if ( !$use_hyph_tag ) {
                if ($hyph) {
                    s/<hyph\/>/-/g;
                }
                else {
                    s/<hyph\/>//g;
                }
            }

            # Always process the previous line, unless this is the first line.
            if ( !$line && !@words && !eof ) {
                $line = $_;
            }
            else {
                process_lines($_);
                process_words();
                @words = @next_words;
            }
        }
    }
    process_words();
}
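# Each entry in @words and @next_words is a hash reference describing one
# token. A minimal sketch of that structure (the sample values are made up;
# only the keys 'word', 'space' and 'xml' come from this script):
#
#   my %w = (
#       'word'  => 'boahtit',   # the token text
#       'space' => ' ',         # whitespace that preceded the token in the input
#       # 'xml'  => '<p>',      # used instead of 'word' for tags in --xml mode
#   );
#   push @words, \%w;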
sub process_lines {
    my ($this_line) = @_;
    verbose( "process_lines", $this_line, __LINE__ );

    if ( !$line && !@words && eof ) {
        $line      = $this_line;
        $next_line = undef;
    }
    elsif ( eof && !$this_line ) {
        $next_line = undef;
    }
    else {
        $next_line = $this_line;
    }
    process_line();
    if ($preserve_newline) {
        $words[-1]{space} .= "\n";
    }
}

sub process_line {

    # If there is no XML processing, split the line into tokens.
    if ( !$xml ) {
        if ( !@words ) {
            line2words($line);
            $line = undef;
        }
        if ($next_line) {
            line2words($next_line);
            $next_line = undef;
        }
    }
    else {
        # If the xml-tags are included in the preprocessor output, they are
        # left untouched and just added to the tokens array.
        # XML markup attached to a word without a space is also taken into account.
        if ( !@words ) {
            xml_tags( $line, \@words );
        }
        xml_tags( $next_line, \@next_words );
    }
}

sub line2words {
    my ($this_line) = @_;
    verbose( "line2words", $this_line, __LINE__ );

    while ( $this_line =~ s/^(\s*)([^\s]+)// ) {
        my $word      = $2;
        my $spacechar = $1;

        # Assume that a word containing a dot followed by a hyphen or
        # parenthesis is an abbreviation followed by another word. Split it
        # into two words, inserting a space after each of them.
        if ( $word =~ /(.*\D)\.([-\)])(.*)/ ) {
            push_word( $1 . '.', " " );
            push_word( $2 . $3,  " " );
        }
        else {
            push_word( $word, $spacechar );
        }
    }
}

sub push_word {
    my ( $word, $spacechar ) = @_;

    my %w;
    if ($space) {
        %w = ( 'word' => $word, 'space' => $spacechar );
    }
    else {
        %w = ( 'word' => $word, 'space' => " " );
    }
    push( @words, \%w );
}

sub xml_tags {
    my ( $line, $aref ) = @_;
    @$aref = ();
    chomp $line;

    while ($line) {
        verbose( "processing xml-tag", $line, __LINE__ );
        if ( $line =~ /^\s*$/ ) {
            last;
        }
        if ( $line =~ s/^(\s*)([^<\s]+?)(?=(?:\s|$))// ) {
            verbose( "read", $2, __LINE__ );
            my %w = ( 'word' => $2, 'space' => $1 );
            push( @$aref, \%w );
            next;
        }
        if ( $line =~ s/^(\s*)(<[^<]*?>)// ) {
            if ($2) {
                verbose( "read", $2, __LINE__ );
                my %w = ( 'xml' => $2, 'space' => $1 );
                push( @$aref, \%w );
            }
            next;
        }
        while ( $line =~ s/^(\s*)([^<\s]+)// ) {
            verbose( "read", $2, __LINE__ );
            my %w = ( 'word' => $2, 'space' => $1 );
            push( @$aref, \%w );
        }
    }
}

sub process_words {
    while (@words) {
        my $w_token = shift @words;

        # Leave xml-tags untouched.
        if ( $xml && $w_token->{xml} ) {
            if ( $space && $w_token->{space} ) {
                print $ltag, $w_token->{space}, $rtag, "\n";
            }
            if ( !$no_xml_out ) {
                print $w_token->{xml}, "\n";
            }
        }
        else {
            my $word = $w_token->{word};
            if ( defined($word) && $word ne "" ) {
                verbose( "process_words", $word, __LINE__ );

                # An array for storing the tokens, each token in its own slot.
                my @tokens;

                # Move on directly if the word is nothing special.
                # This makes preprocessing faster.
                if (   $word =~ /^[^\W\d\s\n]*$/
                    && ( ( !$corr || !$corr_short{$word} )
                        && !( $idioms_short{$word} || $idioms_short{ lc($word) } ) ) )
                {
                    verbose( "no correction, no idiom", $word, __LINE__ );
                    add_token( \@tokens, $w_token, $word );
                }
                else {
                    $word   = process_word1_hyphen_and_word2_string($word);
                    @tokens = handle_leading_punctuation( $word, $w_token );
                }
                print_tokens( \@tokens );
            }
        }
    }
}

# Process "word1- og word2" strings.
sub process_word1_hyphen_and_word2_string {
    my ($word) = @_;
    verbose( "process_word1_hyphen_and_word2_string", $word, __LINE__ );

    if ( $word =~ /\w+\p{Pd}$/ ) {
        if ( @words && $words[0]{word} ) {
            my $next_w = $words[0]{word};
            if ( $connecting{$next_w} ) {
                $word .= $words[0]{space} . $next_w;
                shift @words;
                verbose( "connecting", $word, __LINE__ );
                if ( $words[0] && ( !$xml || $words[0]{word} !~ /^[0]{word}, $tokens_aref);
        # This really is an abbr
        $is_abbr = 1;
    }
    elsif ( $abbr =~ /^[\p{Lu}]{2,3}$/o ) {

        # Check for an abbreviation with 2-3 capital letters.
        # If it is followed by a capital letter or a number,
        # there is a sentence boundary. todo: Remove this as irrelevant!!
        verbose( "process_word/test_two_three_cap_letters", $abbr, __LINE__ );
        if ( $words_aref->[0]{word} && $words_aref->[0]{word} !~ /^\p{Ll}/o ) {
            if ($corr) {
                if ( $corrections{$word} ) {
                    test_corr( \$word, $words_aref, $tokens_aref );
                }
            }
            my @tmp = ( $abbr, "." );
            add_array( $tokens_aref, \@tmp );

            # This really is an abbr
            $is_abbr = 1;
        }
        else {
            add_token( $tokens_aref, $w_token, $word );

            # This really is an abbr
            $is_abbr = 1;
        }
    }
    elsif ( $abbr =~ /^\p{Ll}$/ ) {

        # If an abbreviation like b.
        if (
            # Is our $abbr truly an abbr
            ( $abbrs{$abbr} || $abbrs{ lc($abbr) } )

            # Is there a word after our abbr
            && $words_aref->[0]{word}

            ## Does this word start with an uppercase letter
            && $words_aref->[0]{word} !~ /^[\p{Ll}]/o
            )
        {
            add_token( $tokens_aref, $w_token, $word );
            my @tmp = (".");
            add_array( $tokens_aref, \@tmp );
        }
        else {
            # This is not an abbr
            $is_abbr = 0;
        }
    }
    return $is_abbr;
}

sub test_punctuation {
    my ( $word, $end_punct, $words_aref, $tokens_aref, $w_token, $next_aref ) = @_;
    verbose( "test_punctuation", $word, __LINE__ );

    # If the punctuation at the end contains a sentence delimiter
    # (? or !), the word ends the sentence in any case, so the word can
    # be treated as a token.
    if ( $end_punct =~ /[¶\?\!\.]/ ) {
        verbose( "end_punct", $word, __LINE__ );
        if ($corr) {
            if ( $corr_short{$word} ) {
                test_corr( \$word, $words_aref, $tokens_aref );
            }
        }
        add_token( $tokens_aref, $w_token, $word );
    }
    elsif ($word) {

        # The word is checked for abbreviation and sentence boundary.
        if ( !check_if_word_is_abbr( $word, $w_token, $words_aref, $tokens_aref ) ) {

            # Cut the word into tokens if there is punctuation in the middle,
            # e.g. gielddat/guovllut.
            # Check for expressions in the $CONTAIN_PUNCT variable.
            verbose( "process_word/not_abbr", "$word", __LINE__ );
            if (
                (      $word =~ /^(\pL+\.)([\p{Pd}\pL]+.*)$/
                    || $word =~ /([\pL\p{Pd}]+)([^\pL0-9\. \:\-\`]+[\p{Pd}\pL]+.*)$/
                )
                && $word !~ /$CONTAIN_PUNCT/o
                )
            {
                $word = $1;
                my %w = ( 'word' => $2 . $end_punct );
                $end_punct = "";
                unshift( @{$words_aref}, \%w );
                verbose( "process_word/contain_punct", "$word", __LINE__ );
                process_word( $1, $w_token, $words_aref, $next_aref, $tokens_aref );
            }
            else {
                remove_dot( $word, $corr, $words_aref, $tokens_aref, $w_token );
            }
        }
    }
    return $end_punct;
}
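# A sketch of what remove_dot below produces (the token is made up for
# illustration): for an incoming token such as
#
#   mánáid».
#
# the word "mánáid" is emitted first, and the trailing characters "»" and "."
# are then pushed as separate one-character tokens.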
# If the word was not an abbreviation, the rest
# of the punctuation is removed and stored as separate tokens.
# This operation is for taking the dot out.
sub remove_dot {
    my ( $word, $corr, $words_aref, $tokens_aref, $w_token ) = @_;

    if ( $word =~ s/([$SINGLE_PUNCT\.:]+)$//o ) {
        my $rest_punct = $1;
        verbose( "process_word/rest_punct", "$word $rest_punct", __LINE__ );
        if ($corr) {
            if ( $corrections{$word} ) {
                test_corr( \$word, $words_aref, $tokens_aref );
            }
        }
        add_token( $tokens_aref, $w_token, $word );
        my @rest = split( "", $rest_punct );
        add_array( $tokens_aref, \@rest );
    }
    else {
        add_token( $tokens_aref, $w_token, $word );
    }
}

sub concatenate_three_or_more_leading_dots {
    my ( $first_word, $middle_punct, $second_word, $word, $w_token, $words_aref, $next_words_aref, $tokens_aref ) = @_;
    verbose( "process_word, punctuation", $word, __LINE__ );

    if ($first_word) {
        if ( $first_word =~ /^[\W]?[^\W\d]+/ ) {
            verbose( "if word with dots", $first_word, __LINE__ );
            process_word( $first_word, $w_token, $words_aref, $next_words_aref, $tokens_aref );
        }
        else {
            verbose( "if numeral with dots", $first_word, __LINE__ );
            process_numeral( $first_word, $w_token, $words_aref, $next_words_aref, $tokens_aref );
        }
    }
    my $subpunct = substr( $middle_punct, 0, 3 );
    add_new_token( $tokens_aref, $subpunct );
    if ($second_word) {
        my %w = ( 'word' => $second_word );
        if ( $second_word =~ /^[\W]?[^\W\d]+/ ) {
            verbose( "if word with dots", $second_word, __LINE__ );
            process_word( $second_word, \%w, $words_aref, $next_words_aref, $tokens_aref );
        }
        else {
            verbose( "if numeral with dots", $second_word, __LINE__ );
            process_numeral( $second_word, \%w, $words_aref, $next_words_aref, $tokens_aref );
        }
    }
}

sub handle_two_or_less_leading_dots {
    my ( $word, $tokens_aref ) = @_;

    # Store punctuation from the front of the expression
    # in the tokens array. (todo: check variable $CONTAIN_PUNCT)
    if ( $word =~ s/^([$SINGLE_PUNCT\.]+)//o ) {
        verbose( "if front punct ", $word, __LINE__ );
        my @punct = split( //, $1 );
        add_array( $tokens_aref, \@punct );
    }
    return $word;
}

sub handle_leading_punctuation {
    my ( $word, $w_token ) = @_;
    my @tokens;
    verbose( "handle_leading_punctuation", $word, __LINE__ );

    # Fix some punctuation, like ...voxende and ----whatelse
    if ( $word =~ /^(.*?)((:?\.){3,}|(:?\-){3,})(.*)$/ ) {
        concatenate_three_or_more_leading_dots( $1, $2, $5, $word, $w_token, \@words, \@next_words, \@tokens );
    }
    else {
        $word = handle_two_or_less_leading_dots( $word, \@tokens );
        if ( defined($word) && $word ne "" ) {
            if ( $word =~ /^[\W]?\pL+/ ) {

                # Examine the type of the string. If the expression contains
                # alphabetical characters optionally preceded by one punctuation
                # character, it is a word.
                verbose( "if word", $word, __LINE__ );
                process_word( $word, $w_token, \@words, \@next_words, \@tokens );
            }
            else {
                # Otherwise it is processed like a numeral.
                verbose( "else numerical", $word, __LINE__ );
                process_numeral( $word, $w_token, \@words, \@next_words, \@tokens );
            }
        }
    }
    return @tokens;
}

sub is_abbreviation {
    my ( $abbr, $class ) = @_;
    return ( ( $abbrs{$abbr} && $abbrs{$abbr} eq $class )
        || ( $abbrs{ lc($abbr) } && $abbrs{ lc($abbr) } eq $class ) );
}

sub add_TRAB {
    my ( $tokens_aref, $w_token, $word ) = @_;

    # Transitive abbreviations are never followed
    # by a sentence boundary.
    verbose( "TRAB", $word, __LINE__ );
    add_token( $tokens_aref, $w_token, $word );
}
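# Rough behaviour of the abbreviation classes handled by the add_* subroutines
# around this point (the example abbreviations are made up; the real entries
# come from the --abbr file):
#
#   TRAB     e.g. "nr"   never followed by a sentence break
#   TRNUMAB  e.g. "s"    sentence break only before a word that starts with a
#                        capital letter and is not a single capital or a Roman numeral
#   ITRAB    e.g. "jna"  sentence break unless the next word starts with a
#                        small letter or punctuation
#   NOAB / NUMNOAB       not treated as real abbreviations; the final dot may be
#                        split off and a sentence break inserted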
sub add_TRNUMAB {
    my ( $tokens_aref, $w_token, $word, $next_word ) = @_;

    # There is a CLB after TRNUMAB only for the capital+small combinations
    # and all small-initial strings that consist of more than one letter.
    # For all other strings (one small letter, one or several capital
    # letters, a number) we want no CLB after TRNUMAB.
    verbose( "TRNUMAB", $word, __LINE__ );
    add_token( $tokens_aref, $w_token, $word );
    if ( !$next_word
        or ( $next_word && $next_word =~ /^\p{Lu}/ && $next_word !~ /^(\p{Lu}|[IVXCDLM]+)$/o ) )
    {
        add_new_token( $tokens_aref, $sentence_break );
    }
}

sub add_ITRAB {
    my ( $tokens_aref, $w_token, $word, $next_word ) = @_;
    verbose( "ITRAB", $word, __LINE__ );

    # There is a sentence boundary if the intransitive abbreviation
    # is NOT followed by a small alphabetic character or punctuation,
    # or if it is the last word.
    add_token( $tokens_aref, $w_token, $word );
    if ( !$next_word or ( $next_word && $next_word !~ /^[\p{Ll}\pP]/o ) ) {
        verbose( "ITRAB", $next_word, __LINE__ );
        add_new_token( $tokens_aref, $sentence_break );
    }
}

sub add_NUMNOAB {
    my ( $tokens_aref, $w_token, $word, $next_word ) = @_;
    verbose( "NUMNOAB", $word, __LINE__ );

    # The expected behaviour of $NUMNOAB is shown in $GTHOME/tools/abbrtester/abbrtester.py
    if ($next_word) {
        if ( $next_word =~ /^\p{Lu}/ ) {
            $word =~ s/\.$//;
            add_token( $tokens_aref, $w_token, $word );
            add_new_token( $tokens_aref, $sentence_break );
        }
        elsif ( $next_word =~ /^\d/ ) {
            add_token( $tokens_aref, $w_token, $word );
        }
        elsif ( $next_word =~ /^\p{Ll}/ ) {
            add_token( $tokens_aref, $w_token, $word );
        }
    }
    else {
        $word =~ s/\.//;
        add_token( $tokens_aref, $w_token, $word );
        add_new_token( $tokens_aref, $sentence_break );
    }
}

sub add_NOAB {
    my ( $tokens_aref, $w_token, $word, $next_word ) = @_;

    # If next_word isn't defined, then we are at the end of a sentence
    # -> this is not an abbr.
    # If next word is defined, and not uppercase
    # -> this is not an abbr.
    verbose( "NOAB", $word, __LINE__ );
    if ( !$next_word || ( $next_word && $next_word !~ /^[\p{Ll}\pP]/o ) ) {
        $word =~ s/\.$//;
        add_token( $tokens_aref, $w_token, $word );
        add_new_token( $tokens_aref, $sentence_break );
    }
    else {
        add_token( $tokens_aref, $w_token, $word );
    }
}

sub is_word_abbreviation {
    my ( $word, $w_token, $next_word, $tokens_aref ) = @_;

    my $abbr = $word;
    $abbr =~ s/\.$//;
    verbose( "is_word_abbreviation", $abbr, __LINE__ );
    if ( is_abbreviation( $abbr, $TRAB ) ) {
        add_TRAB( $tokens_aref, $w_token, $word );
        return 1;
    }
    elsif ( is_abbreviation( $abbr, $TRNUMAB ) ) {
        add_TRNUMAB( $tokens_aref, $w_token, $word, $next_word );
        return 1;
    }
    elsif ( is_abbreviation( $abbr, $ITRAB ) ) {
        add_ITRAB( $tokens_aref, $w_token, $word, $next_word );
        return 1;
    }
    elsif ( is_abbreviation( $abbr, $NUMNOAB ) ) {
        add_NUMNOAB( $tokens_aref, $w_token, $word, $next_word );
        return 1;
    }
    elsif ( is_abbreviation( $abbr, $NOAB ) ) {
        add_NOAB( $tokens_aref, $w_token, $word, $next_word );
        return 1;
    }
    return 0;
}
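# Sketch of how a --corr (typos.txt) entry is applied by test_corr below
# (the entry is made up for illustration):
#
#   typos.txt line :  dakkar<TAB>dakkár
#   effect         :  the token "dakkar" is replaced by "dakkár" before printing.
#
# A correction that contains spaces is split back into several tokens unless
# the whole corrected string is also listed as an idiom.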
# If a word is in the typos-list, it is replaced by the correct reading.
sub test_corr {
    my ( $word, $words_aref, $next_aref, $tokens_aref ) = @_;

    # Correct the word, if corrections is defined.
    if ($corr) {
        if ( $corr_short{$word} ) {
            verbose( "test_corr", $word, __LINE__ );

            # If the word is a single part.
            if ( my $correct = $corrections{$word} ) {
                $word = single_part( $correct, $word, $tokens_aref );
            }
            else {
                $word = define_word( $word, $tokens_aref, $words_aref, $next_aref );
            }
        }
    }
    return $word;
}

sub single_part {
    my ( $correct, $word, $tokens_aref ) = @_;

    my @parts = split( / /o, $correct );
    if ( scalar @parts > 1 ) {
        if ( $idioms{$correct} ) {
            $word = $correct;
            verbose( "test_corr/split_words", "$correct", __LINE__ );
        }
        else {
            $word = pop @parts;
            add_array( $tokens_aref, \@parts );
            verbose( "test_corr/split_words", "$correct", __LINE__ );
        }
    }
    else {
        $word = $correct;
    }
    return $word;
}

sub define_word {
    my ( $word, $tokens_aref, $words_aref, $next_aref ) = @_;
    verbose( "define_word", $word, __LINE__ );
    my $next;
    my $idiom;
    my $ucidiom;

    if ( $words_aref->[0]{word} ) {
        $idiom   = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
        $ucidiom = ucfirst($word) . $words_aref->[0]{space} . ucfirst( $words_aref->[0]{word} );
    }
    elsif ( $next_aref->[0]{word} ) {
        $idiom   = $word . $next_aref->[0]{space} . $next_aref->[0]{word};
        $ucidiom = ucfirst($word) . $next_aref->[0]{space} . ucfirst( $next_aref->[0]{word} );
        $next = 1;
    }
    return loop_correction( $word, $idiom, $ucidiom, $tokens_aref, $next, $words_aref, $next_aref );
}

sub loop_correction {
    my ( $word, $idiom, $ucidiom, $tokens_aref, $next, $words_aref, $next_aref ) = @_;
    verbose( "loop_correction", $word, __LINE__ );
    my $size = 2;
    my $i    = 1;

    while ( $i <= $size ) {

        # Remove the punctuation at the end of the expression.
        ( my $idiom2   = $idiom )   =~ s/([^\w]*)$//;
        ( my $ucidiom2 = $ucidiom ) =~ s/([^\w]*)$//;
        my $end_punct = $1;

        # If the expression contains punctuation in the middle, return.
        if ( $idiom2 =~ /[^\w\s\d\.\-]/ ) {
            last;
        }
        elsif ( $corrections{$idiom2} || $corrections{ lc($idiom2) } || $corrections{($ucidiom2)} ) {

            # Test whether the formed multiword expression exists in the
            # idiom list. Test also the lower case version.
            my $correct = $corrections{$idiom};
            verbose( "test_corr/correction", "$correct", __LINE__ );
            $word = check_idiom( $correct, $word, $end_punct, $tokens_aref );

            # Remove the parts of the multiword expression from
            # the word array.
            if ($next) {
                splice( @{$next_aref}, 0, $i + 1 );
            }
            else {
                splice( @{$words_aref}, 0, $i + 1 );
            }
            last;
        }
        else {
            $i++;
            if ( $words_aref->[$i] ) {
                $idiom = $idiom . $words_aref->[$i]{space} . $words_aref->[$i]{word};
            }
            elsif ( !$next && $next_aref->[0] ) {
                $idiom = $idiom . $next_aref->[0]{space} . $next_aref->[0]{word};
                $next = 1;
            }
            else {
                last;
            }
        }
    }
    return $word;
}

sub check_idiom {
    my ( $correct, $word, $end_punct, $tokens_aref ) = @_;

    if ( $idioms{$correct} ) {
        $word = $correct;
        $word .= $end_punct;
        verbose( "test_corr/idiom", "$correct", __LINE__ );
    }
    else {
        my @parts = split( / /o, $correct );
        if ( scalar @parts > 1 ) {
            $word = pop @parts;
            $word .= $end_punct;
            add_array( $tokens_aref, \@parts );
            verbose( "test_corr/split_words", "$correct", __LINE__ );
        }
    }
    return $word;
}
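# Sketch of the multiword lookup done by test_idiom below (the phrase is made
# up for illustration): if the IDIOM lexicon contains "buorre beaivvi", the
# two incoming tokens "buorre" + "beaivvi" are merged into one token, and the
# consumed word is removed from the pending word array.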
# If the word starts a multiword expression, it is replaced
# with the expression. The other parts are
# removed from the words array.
sub test_idiom {
    my ( $word_ref, $words_aref, $next_aref, $size, $end_punct ) = @_;

    # Test for multiword expressions by growing the token
    # one word at a time.
    my $next = 0;
    my $i    = 0;
    my $last_part;
    my $idiom;
    my $ucidiom;
    verbose( "test_idiom", "", __LINE__ );

    if ( $words_aref->[0]{word} or $next_aref->[0]{word} ) {
        if ( $words_aref->[0]{word} ) {
            if ( $words_aref->[0]{space} ) {
                $idiom   = $$word_ref . $words_aref->[0]{space} . $words_aref->[0]{word};
                $ucidiom = ucfirst($$word_ref) . $words_aref->[0]{space} . ucfirst( $words_aref->[0]{word} );
            }
            else {
                $idiom   = $$word_ref . $words_aref->[0]{word};
                $ucidiom = ucfirst($$word_ref) . ucfirst( $words_aref->[0]{word} );
            }
        }
        elsif ( $next_aref->[0]{word} ) {
            $idiom   = $$word_ref . $next_aref->[0]{space} . $next_aref->[0]{word};
            $ucidiom = ucfirst($$word_ref) . $next_aref->[0]{space} . ucfirst( $next_aref->[0]{word} );
            $next = 1;
        }

        # Only test for a multiword expression if end_punct is empty.
        if ( $end_punct eq "" ) {
            while ( $i <= $size ) {

                # Remove the punctuation at the end of the expression.
                ( my $idiom2   = $idiom )   =~ s/[^\w]*$//;
                ( my $ucidiom2 = $ucidiom ) =~ s/[^\w]*$//;

                # If the expression contains punctuation in the middle, return.
                return if ( $idiom2 =~ /[^\w\s\d\.\-]/ );

                # Test whether the formed multiword expression exists in the
                # idiom list. Test also the lower case version.
                verbose( "test_idiom", $idiom, __LINE__ );
                if ( $idioms{$idiom2} || $idioms{ lc($idiom2) } ) {
                    for ( my $j = 0 ; $j <= $i ; $j++ ) {

                        # Construct the new multiword processing unit.
                        # Remove the parts of the multiword expression from
                        # the word array.
                        if ($next) {
                            $$word_ref = $$word_ref . $next_aref->[0]{space} . $next_aref->[0]{word};
                            shift @{$next_aref};
                        }
                        else {
                            if ( $words_aref->[0]{space} ) {
                                $$word_ref = $$word_ref . $words_aref->[0]{space};
                            }
                            if ( $words_aref->[0]{word} ) {
                                $$word_ref = $$word_ref . $words_aref->[0]{word};
                            }
                            shift @{$words_aref};
                        }
                    }
                    return 1;
                }
                $i++;
                if ( $words_aref->[$i] ) {
                    if ( $words_aref->[$i]{space} ) {
                        $idiom   = $idiom . $words_aref->[$i]{space};
                        $ucidiom = $ucidiom . $words_aref->[$i]{space};
                    }
                    if ( $words_aref->[$i]{word} ) {
                        $idiom   = $idiom . $words_aref->[$i]{word};
                        $ucidiom = $ucidiom . ucfirst( $words_aref->[$i]{word} );
                    }
                }
                elsif ( !$next && $next_aref->[0]{word} && $next_aref->[0]{space} ) {
                    $idiom   = $idiom . $next_aref->[0]{space} . $next_aref->[0]{word};
                    $ucidiom = $ucidiom . $next_aref->[0]{space} . ucfirst( $next_aref->[0]{word} );
                    $next = 1;
                }
                else {
                    return 0;
                }
            }
        }
    }
    else {
        return 0;
    }
}

sub process_numeral {
    my ( $word, $w_token, $words_aref, $next_aref, $tokens_aref ) = @_;
    verbose( "process_numeral", $word, __LINE__ );

    if ($corr) {
        if ( $corr_short{$word} ) {
            test_corr( \$word, $words_aref, $next_aref, $tokens_aref );
        }
    }
    ( $word, $words_aref ) = fix_numeral_expressions_containing_spaces( $word, $words_aref );

    my $end_punct;
    ( $end_punct, $word ) = find_endpunct( $end_punct, $word );
    verbose( "process_numeral", $word, __LINE__ );

    my $end_word;
    if ( $word =~ /^(\d+)([\:\'\-]?)([\pL]+)(\.)?$/o && !$2 && !$num{$3} ) {
        word_as_inflected_numeral( $tokens_aref, $word, $w_token, $words_aref, $next_aref, $end_punct );
    }
    elsif ( $word =~ /\.([^\W\d]+\.?)$/ && $word !~ /$CONTAIN_PUNCT/o ) {
        word_or_abbr_attached_to_numeral( $word, $w_token, $words_aref, $next_aref, $tokens_aref, $end_punct );
    }
    else {
        my $rest_punct;
        compute_end_and_restpunct( $end_punct, $rest_punct, $word, $end_word, $w_token, $words_aref, $next_aref, $tokens_aref );
    }
}

# Check if there is an abbreviation or a word attached to the
# numeral, e.g. 6.b. ...ovdal. This introduces problems with some
# misspellings in headings, like 1.6.Vuonain
sub word_or_abbr_attached_to_numeral {
    my ( $word, $w_token, $words_aref, $next_aref, $tokens_aref, $end_punct ) = @_;
    verbose( "word_or_abbr_attached_to_numeral", $word, __LINE__ );

    $word =~ s/([^\W\d]+\.?)$//;

    # Save the attached word before further matching can overwrite $1.
    my $attached = $1;
    process_numeral( $word, $w_token, $words_aref, $next_aref, $tokens_aref );
    my $new_word = "";
    if ($end_punct) {
        $new_word = $attached . $end_punct;
    }
    else {
        $new_word = $attached;
    }
    my %w = ( 'word' => $new_word );
    unshift( @$words_aref, \%w );
}
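# Illustrative walk-through of word_or_abbr_attached_to_numeral above, using
# the "6.b." example from its comment: the attached part "b." is stripped off
# and saved, the remaining numeral "6." is processed first, and "b." (plus any
# trailing punctuation) is pushed back onto the word queue to be handled as a
# word of its own.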
# Check if the word is an inflected numeral: 12s, etc.
sub word_as_inflected_numeral {
    my ( $tokens_aref, $word, $w_token, $words_aref, $next_aref, $end_punct ) = @_;
    verbose( "word_as_inflected_numeral", $word, __LINE__ );

    # $1, $3 and $4 come from the numeral regex in process_numeral;
    # save them before any further matching can overwrite them.
    my ( $numeral, $suffix, $dot ) = ( $1, $3, $4 );
    add_token( $tokens_aref, $w_token, $numeral );
    my %w = ( 'word' => $suffix );
    process_word( $suffix, \%w, $words_aref, $next_aref, $tokens_aref );
    if ($end_punct) {
        if ($dot) {
            $end_punct = $dot . $end_punct;
        }
    }
    else {
        $end_punct = $dot;
    }
    if ($end_punct) {
        my @ep = split( //, $end_punct );
        add_array( $tokens_aref, \@ep );
    }
}

sub compute_end_and_restpunct {
    my ( $end_punct, $rest_punct, $word, $end_word, $w_token, $words_aref, $next_aref, $tokens_aref ) = @_;

    # Check for an ending dot; it is a separate token if
    # the following word starts with a capital letter.
    # If there is an abbreviation with the numeral, don't do the checking.
    # Otherwise the dot belongs to the expression.
    # Process first the abbreviation or word
    # that is attached to the numeral.
    if ( defined($end_word) && $end_word ne "" ) {
        verbose( "process_numeral/is_word_abbreviation", $end_word, __LINE__ );
        my $abbr = $end_word;
        if ( ( $abbr =~ s/\.$// ) && ( $abbrs{$abbr} || $abbrs{ lc($abbr) } ) ) {
            verbose( "process_numeral/is_word_abbreviation", $abbr, __LINE__ );
            is_word_abbreviation( $end_word, $w_token, $words_aref->[0], $tokens_aref );
        }
        else {
            verbose( "process_numeral/is_word_abbreviation", $abbr, __LINE__ );
            process_word( $end_word, $w_token, $words_aref, $next_aref, $tokens_aref );
        }
    }
    else {
        ( $word, $end_punct, $rest_punct ) = dot_test( $word, $end_punct, $rest_punct, $words_aref );
        verbose( "process_numeral", $word, __LINE__ );

        # Push everything to the tokens array.
        verbose( "add_token", "$word", __LINE__ );
        add_token( $tokens_aref, $w_token, $word );
        if ( defined($end_word) && $end_word ne "" ) {
            verbose( "add_token", "$end_word", __LINE__ );
            add_token( $tokens_aref, $w_token, $end_word );
        }
    }
    if ( defined($rest_punct) && $rest_punct ne "" ) {
        my @rp = split( //, $rest_punct );
        verbose( "add_array", "$word", __LINE__ );
        add_array( $tokens_aref, \@rp );
    }
    if ( defined($end_punct) && $end_punct ne "" ) {
        my @ep = split( //, $end_punct );
        verbose( "add_array", "$word", __LINE__ );
        add_array( $tokens_aref, \@ep );
    }
}

sub dot_test {
    my ( $word, $end_punct, $rest_punct, $words_aref ) = @_;
    verbose( "dot_test", $word, __LINE__ );

    if ( $word =~ /\.$/ ) {
        ( my $nopunct = $word ) =~ s/\.$//;

        # If the number contains other than digits or is a year,
        # and the next word does not exist or starts with an uppercase
        # letter, the ending dot is split off.
        if ( $nopunct =~ /[§\d\pP\pL]/
            && ( ( !( $words_aref->[0]{word} ) || $words_aref->[0]{word} =~ /^[\p{Lu}]/o ) ) )
        {
            $word      = $nopunct;
            $end_punct = compute_endpunct($end_punct);
            $rest_punct = compute_restpunct( $word, $rest_punct );
            verbose( "dot_test", $words_aref->[0]{word}, __LINE__ );
        }
        elsif ( $nopunct =~ /[§\d\pP\pL]/
            && ( ( !( $words_aref->[0]{word} ) || $words_aref->[0]{word} =~ /¶/ ) ) )
        {
            $word      = $nopunct;
            $end_punct = compute_endpunct($end_punct);
            $rest_punct = compute_restpunct( $word, $rest_punct );
            verbose( "dot_test", $words_aref->[0]{word}, __LINE__ );
        }
        elsif ( $word =~ /^\d{1,3}\.$/ ) {
            if ( $words_aref->[0]{word} =~ /^[\p{Lu}]/o ) {
                $end_punct = compute_endpunct($end_punct);
                $rest_punct = compute_restpunct( $word, $rest_punct );
                verbose( "dot_test", $words_aref->[0]{word}, __LINE__ );
            }
        }
        elsif ( $nopunct =~ /[§\d\pP\pL]/ && $words_aref->[0]{word} =~ /^[$parentheses°]/o ) {
            $word      = $nopunct;
            $end_punct = compute_endpunct($end_punct);
            $rest_punct = compute_restpunct( $word, $rest_punct );
            verbose( "dot_test", $words_aref->[0]{word}, __LINE__ );
        }
    }
    return ( $word, $end_punct, $rest_punct );
}

sub compute_endpunct {
    my ($end_punct) = @_;

    if ($end_punct) {
        $end_punct = "." . $end_punct;
    }
    else {
        $end_punct = ".";
    }
    return $end_punct;
}

sub compute_restpunct {
    my ( $word, $rest_punct ) = @_;

    # Clean the rest of the token:
    # cases like 123). where the dot is preceded by punctuation.
    if ( $word =~ s/([$SINGLE_PUNCT:]+)$//o ) {
        $rest_punct = $1;
    }
    return $rest_punct;
}

sub find_endpunct {
    my ( $end_punct, $word ) = @_;

    # Clean first the end of the token, where punctuation follows
    # the numeral or an ordinal and does not belong to the expression.
    # Cases like: 123! and 123.), 123). 10,-.
    # Problem: 123.? and 123 is an ordinal.
    if ( $word =~ s/([$SINGLE_PUNCT:]+\.?)$//o ) {
        $end_punct = $1;
    }
    if ( $word =~ /([$SINGLE_PUNCT:\p{Pd}\%]+\.?)$/o ) {
        $word =~ s/(\.?)$//;
        $end_punct = $1;
    }
    return ( $end_punct, $word );
}

# Combine the percent sign with the numeral when separate.
# Covers cases like: 50 %
sub combine_numeral_and_percent {
    my ( $word, $words_aref ) = @_;

    if ( ( $word =~ /\d$/ )
        && ( $words_aref->[0]{word} && $words_aref->[0]{word} =~ /^\%/ ) )
    {
        $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
        verbose( "process_numeral/combine", $word, __LINE__ );
        shift @{$words_aref};
    }
    return ( $word, $words_aref );
}

sub combine_date_expression {
    my ( $word, $words_aref ) = @_;
    verbose( "combine_date_expression", $word, __LINE__ );

    # Consider date expressions first.
    # At the moment covers years: 1984-2000, 1984- 2000 etc.
    if ( $word =~ /^\d{4}\p{Pd}?$/o ) {
        while ( $words_aref->[0]
            && $words_aref->[0]{word}
            && $words_aref->[0]{word} =~ /^(?:\p{Pd}|\p{Pd}?(?:\d{4}|\d{2}))\.?$/o )
        {
            if ( $word =~ /\-/ || $words_aref->[0]{word} =~ /\-/ ) {
                $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
                verbose( "process_numeral/combine date", $word, __LINE__ );
                shift @{$words_aref};
            }
            else {
                return ( $word, $words_aref );
            }
        }
        return ( $word, $words_aref );
    }
    return ( $word, $words_aref );
}
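# Illustrative: combine_date_expression above joins year ranges that arrive as
# separate tokens, e.g. "1984-" + "2000" or "1984" + "-2000" (the examples from
# its comment), into a single numeral token, while combine_date_ranges below
# does the same for day/month ranges such as "2." "-" "3.".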
sub combine_date_ranges {
    my ( $word, $words_aref ) = @_;
    verbose( "combine_date_ranges", $word, __LINE__ );

    # Look for date ranges, e.g. 2. - 3. mars
    # First check that all variables that are accessed are initialized.
    if ( $word && $words_aref->[0]{word} && $words_aref->[1]{word} ) {

        # Then do the real work.
        if (
            (
                   $word =~ /^\d{1,2}\.$/           # $word is a number of at most two digits ending with a dot
                || $word =~ /^\d{1,2}\.\d{1,2}\.$/  # or a number of the format dd.dd.
            )
            && $words_aref->[0]{word} =~ /-/        # the first word following $word is a hyphen
            && (
                   $words_aref->[1]{word} =~ /^\d{1,2}\.$/    # the second word following $word is a number of at
                                                              # most two digits ending with a dot
                || $words_aref->[1]{word} =~ /^\d{1,2}\.\d{1,2}\.\d{1,2}/    # or has the format dd.dd.dd
            )
            )
        {
            $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
            shift @{$words_aref};
            $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
            shift @{$words_aref};
            verbose( "process_numeral/combine", $word, __LINE__ );
            return ( $word, $words_aref );
        }
    }
    return ( $word, $words_aref );
}

sub combine_punctuation_with_numeral {
    my ( $word, $words_aref ) = @_;
    verbose( "combine_punctuation_with_numeral", $word, __LINE__ );

    # Combine punctuation with the numeral if it is followed by another numeral.
    # Cases like 123- 456 and 123 -456 and 123 456 and 123 - 456,
    # 10 000,- and math expressions like 2 * 4 = 8.
    # Often connects long sequences of numbers into one token.
    while ( ( $word =~ /^[\d$NUM_PUNCT\- ]+$/o )
        && ( $words_aref->[0]{word} && $words_aref->[0]{word} =~ /^[\d$NUM_PUNCT\- ]+(\,\-)?\.?$/o ) )
    {
        $word = $word . $words_aref->[0]{space} . $words_aref->[0]{word};
        verbose( "process_numeral/combine", $word, __LINE__ );
        shift @{$words_aref};
    }
    return ( $word, $words_aref );
}

# Search for numeral expressions with spaces.
sub fix_numeral_expressions_containing_spaces {
    my ( $word, $words_aref ) = @_;

    ( $word, $words_aref ) = combine_date_expression( $word, $words_aref );
    ( $word, $words_aref ) = combine_date_ranges( $word, $words_aref );
    ( $word, $words_aref ) = combine_punctuation_with_numeral( $word, $words_aref );
    ( $word, $words_aref ) = combine_numeral_and_percent( $word, $words_aref );
    return ( $word, $words_aref );
}

sub verbose {
    my ( $from, $word, $linenumber ) = @_;
    if ($verbose) {
        print STDERR "[$from:$linenumber] $word\n";
    }
}

# Add text to an existing token.
sub add_token {
    my ( $tokens_aref, $w_token, $content ) = @_;
    $$w_token{word} = $content;
    push @{$tokens_aref}, $w_token;
}

# Create a new token and add text to it.
sub add_new_token {
    my ( $tokens_aref, $content ) = @_;
    my %w = ( 'word' => $content );
    push @{$tokens_aref}, \%w;
}

# Create several new tokens from an array.
sub add_array {
    my ( $tokens_aref, $content_aref ) = @_;
    for my $p ( @{$content_aref} ) {
        my %w = ( 'word' => $p );
        push @{$tokens_aref}, \%w;
    }
}

sub print_token {
    my ($w_token) = @_;
    if ( $space && $w_token->{space} ) {
        print $ltag, $w_token->{space}, $rtag, "\n";
    }
    print $w_token->{word}, "\n";
}

# Print the list of tokens.
sub print_tokens {
    my ($tokens_aref) = @_;
    verbose( "print_tokens", "", __LINE__ );

    for my $t (@$tokens_aref) {
        if ( $space && $t->{space} ) {
            print $ltag, $t->{space}, $rtag, "\n";
        }
        if ( defined( $t->{word} ) && $t->{word} ne "" ) {
            if ( $t->{word} !~ /^$/ ) {
                my $w = $t->{word};
                $w =~ s/ $//;
                print "$w\n";
            }
        }
    }
}

# Read the corrections file (typos.txt).
sub read_corr {
    my ( $corr_href, $corr_short_href ) = @_;

    if ($corr) {
        local $/ = "\n";
        open CORR, "< $corr" or die "preprocess: Can't open the corr file $corr: $!\n";
        while (<CORR>) {
            chomp;
            next if (/^!/);
            next if (/^\s*$/);
            next if (/^\#/);
            s/\s$//;
            my ( $error, $correct ) = split(/\t+/);
            if ( $error && $correct ) {
                $error =~ s/\s$//;
                $$corr_href{$error} = $correct;
                my ( $first, $last ) = split( / /, $error, 2 );
                $$corr_short_href{$first} = 1;
            }
            else {
                print STDERR "preprocess warning: Line not included to typos: $_\n";
            }
        }
        close CORR;
    }
}
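# Sketch of the --abbr file layout expected by read_abbr below (the LEXICON
# section names come from this script; the entries are made up):
#
#   LEXICON TRAB
#   nr
#   LEXICON ITRAB
#   jna
#   LEXICON IDIOM
#   buorre beaivvi
#   LEXICON NUM
#   12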
# If the abbreviation file is given,
# read the abbreviations from the file into a hash.
sub read_abbr {
    my ( $abbr_href, $idiom_href, $idiom_short_href, $num_href, $regex_href ) = @_;

    if ($abbr_file) {
        open LEX, "< $abbr_file" or die "preprocess: Can't open the abbr file: $!\n";
        my $current;
        while (<LEX>) {
            chomp;
            last if (/^LEXICON IDIOM/);
            if (/^LEXICON\s+(.*?)\s*$/) {
                $current = $1;
                next;
            }
            $$abbr_href{$_} = $current;
        }
        while (<LEX>) {
            chomp;
            last if (/^LEXICON NUM/);
            my ( $first, $last ) = split( / /, $_, 2 );
            $$idiom_href{$_} = 1;
            if ($first) {
                $$idiom_short_href{$first} = 1;
            }
        }
        while (<LEX>) {
            next if /^\s*$/;
            chomp;
            $$num_href{$_} = 1;
        }
        close LEX;
    }
}

sub print_usage {
    print <<END;
Usage: preprocess [OPTIONS] < FILE
Split text in FILE into sentences and words.
Options:
  --hyph|h          show the hyphenation points, i.e. change the <hyph/> tags
                    to hyphens. The default is to just remove the tags.
  --use-hyph-tags   leave the <hyph/> tags untouched
  --break=<char>    use <char> instead of . as sentence delimiter.
  --connect=<list>  comma-separated list of words which connect expressions
                    like fisk- og vilthandelen
  --space|s         Preserve space.
  --ltag=<tag>      Left tag for space, default <
  --rtag=<tag>      Right tag for space, default >
  --help            Prints the help text and exits.
  --v               Prints information about the execution of the script.
  --corr=<file>     List of common typos and their corrections (e.g. typos.txt)
  --xml|x           Accept xml-formatted input, print each tag on its own line.
  --no-xml-out|n    Used together with --xml, does not print the xml-tags.
END
}
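# Example invocation (the file names below are illustrative):
#
#   ./preprocess --abbr=abbr.txt --corr=typos.txt < input.txt > tokens.txt
#
# This reads plain text from input.txt and writes one token per line to
# tokens.txt, with the sentence delimiter (default ".") emitted as a token of
# its own at every detected sentence boundary.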