#!/usr/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord@let.rug.nl

use strict;
use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_q $opt_s
            $opt_t $opt_v $opt_u $opt_a $opt_x $opt_w);
use Getopt::Std;
use XML::Twig;
use utf8;

# Paragraphs shorter than $MINCHAR characters are not classified.
my $MINCHAR = 0;

# Trigram and short word models, keyed by language.
my %lmodel;
my %wmodel;

# Opening/closing pairs of quotation marks recognized by mark_span().
my %pairs = (
    "\"" => "\"",
    "«"  => "»",
    "“"  => "”",
);

# OPTIONS
getopts('a:d:f:i:nqst:u:xhvwl');

# defaults: set $opt_X unless already defined (Perl Cookbook p. 6):
$opt_a ||= 10;
$opt_d ||= './LM';
$opt_f ||= 0;
$opt_w ||= 0;
$opt_t ||= 400;
$opt_u ||= 1.1;
$opt_q ||= 0;

sub help {
    print <<"HELP";
Text Categorization: guess the language of the input text.

Usage: $0 [options] [file ...]

    -a N    give up when more than N languages remain plausible (default: 10)
    -d DIR  directory containing the language models (default: ./LM)
    -f N    drop n-grams occurring N times or fewer before sorting (default: 0)
    -i N    only read the first N lines of the input
    -l      command-line mode: print the guessed language(s) to stdout
    -n      create a new language model from the input and print it
    -q      also guess the language of quoted text ("...", «...», “...”)
    -s      classify each input line separately
    -t N    use at most N n-grams per model (default: 400)
    -u N    keep candidates scoring below N times the best score (default: 1.1)
    -w      do not use the short word models (*.wm)
    -x      the arguments are XML files; tag their paragraphs with xml:lang
    -v      verbose output
    -h      print this help message
HELP
}

# MAIN: choose the mode of operation from the options.
if ($opt_h) {
    help();
    exit 0;
}

if ($opt_n) {
    # Build a new language model from the input and print it.
    my %ngram  = ();
    my @result = create_lm( input(), \%ngram );
    print join( "\n", map {"$_\t $ngram{$_}"} @result ), "\n";
} elsif ($opt_x) {
    # Tag the paragraphs of the given XML files with xml:lang.
    foreach my $file (@ARGV) {
        read_xml($file);
    }
} elsif ($opt_s) {
    # Classify each input line separately.
    while (<>) {
        chomp;
        classify($_);
    }
} else {
    # Classify the input as a whole.
    classify( input() );
}

# prepare the xml-file for language detection
sub read_xml {
    my $file = shift @_;
    my @languages;
    my $test = 0;

    # Find out the languages specified for the document.
    # Take only the header of the document.
    my $twig = XML::Twig->new( twig_roots => { header => 1 } );
    if ( !$twig->safe_parsefile("$file") ) {
        print STDERR "Encoding: $file: ERROR parsing the XML-file failed. $@\n";
        return;
    }
    my $docu        = $twig->root;
    my $mainlang    = $docu->{'att'}->{'xml:lang'};
    my $mainlang_ok = 0;
    my $header      = $docu->first_child('header');
    if ( !$header ) {
        return;
    }
    my $multi = $header->first_child('multilingual');
    if ( !$multi ) {
        if ($test) { print STDERR "text_cat: Document is monolingual.\n"; }
        return;
    }

    # open directory to find which languages are supported
    opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
    my @lm = sort( grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR) );
    closedir DIR;
    if ( !@lm ) { die "text_cat: no language models found in $opt_d"; }

  LANG_TEST: {
        my @langs = $multi->children;
        if ( !@langs ) {
            if ($test) {
                print STDERR
                  "text_cat: Document is categorized in languages: all.\n";
            }
            @languages = @lm;
            last LANG_TEST;
        }
        for my $lang (@langs) {
            my $code = $lang->{'att'}->{'xml:lang'};
            push @languages, $code;
            if ( $code eq $mainlang ) { $mainlang_ok = 1; }
        }
        if ( !$mainlang_ok ) { push @languages, $mainlang; }
        if ($test) {
            if (@languages) {
                print STDERR
                  "Document is categorized in languages: @languages\n";
            }
            else {
                print STDERR
                  "text_cat: Document is categorized in languages: all.\n";
            }
        }
    }

    # Test if the main language is included in the language models.
    my %languages_h;
    for my $lang (@lm) { $languages_h{$lang} = 1; }
    if ( !$languages_h{$mainlang} ) {
        print STDERR
          "text_cat: Document language \"$mainlang\" was not recognized.\n";
        return;
    }

    # Load language and word models for the tested languages.
    foreach my $lang (@languages) { $lmodel{$lang} = load_lm($lang); }
    if ( !$opt_w ) {
        foreach my $lang (@languages) { $wmodel{$lang} = load_wm($lang); }
    }

    # Process one paragraph at a time.
    # Function langdetect handles the classification of the paragraph.
    my $document = XML::Twig->new(
        twig_handlers => { p => sub { langdetect( @_, $mainlang ); } } );
    if ( !$document->safe_parsefile("$file") ) {
        print STDERR "Langdetect: ERROR parsing the XML-file failed: $@\n";
        return;
    }
    open( FH, ">$file" ) or die "ERROR cannot open file $!";
    $document->set_pretty_print('indented');
    $document->print( \*FH );
    $document->purge;
    close(FH);
}

# Process each xml-element (paragraph)
sub langdetect {
    my ( $twig, $para, $mainlang ) = @_;
    my $text = $para->text;

    # The language of paragraphs shorter than MINCHAR characters is not
    # guessed.
    my $count = length($text);
    if ( $count < $MINCHAR ) { $para->del_att('xml:lang'); return; }

    # If the text contains only non-word characters, it is classified
    # as the default language.
    if ( $text !~ /[\pL]/o ) { $para->del_att('xml:lang'); return; }

    # Classification returns the choices for the text.
    my @choices = classify($text);

    # If the language was not recognized, the document language is selected.
    if ( !@choices || !$choices[0] ) {
        # print "Language was not recognized.\n";
        $para->del_att('xml:lang');
        return;
    }

    # The best language is selected.
    my $bestlang = $choices[0];

    # Test whether there are quoted strings in a language other than the
    # main language.
    if ( $opt_q && $text =~ /[\«\"“]/ ) {
        my @text_array;
        my $return_val = mark_span( $text, \@text_array, $mainlang );
        if ($return_val) {
            $para->set_content(@text_array);
        }
    }
    if ( $bestlang ne $mainlang ) {
        $para->set_att( "xml:lang" => $bestlang );
    }
    else {
        $para->del_att('xml:lang');
    }
}

# Function to pick quotations into "span" elements.
# The language of the quotation is checked and marked
# if it differs from the main language.
sub mark_span {
    my ( $text, $text_aref, $mainlang ) = @_;
    my $first;
    if ( $text =~ /[\"\«“]/ ) {
        if ( $text =~ /([\«\"\“]).*?[\»\"”]/s ) {
            $first = $1;
        }
        else { return 0; }
    }
    else { return 0; }
    my $second = $pairs{$first};

    if ( $text =~ /^(.*?)($first.*?$second)(.*)$/s ) {
        my $before = $1;
        my $quote  = $2;
        my $rest   = $3;

        # print "1: $before\n 2: $quote\n 3: $rest\n";
        # Classification returns the choices for the text.
        my @choices = classify($quote);

        # If the language was not recognized, the mainlang is selected.
        my $bestlang;
        if ( !@choices || !$choices[0] ) { $bestlang = $mainlang; }
        else                             { $bestlang = $choices[0]; }

        # A new span element is created for the quotation.
        push( @$text_aref, $before );
        my $span = XML::Twig::Elt->new('span');
        $span->set_att( "type" => 'quote' );
        if ( $bestlang ne $mainlang ) {
            $span->set_att( "xml:lang" => $bestlang );
        }
        $span->set_text($quote);
        push( @$text_aref, $span );

        my $return_val;
        # The rest of the paragraph is processed further.
        if ($rest) { $return_val = mark_span( $rest, $text_aref, $mainlang ); }
        if ( !$return_val ) { push( @$text_aref, $rest ); }
        return 1;
    }
    return 0;
}

# CLASSIFICATION
sub classify {
    my $input = shift @_;

    # Create the n-gram profile of the input. The raw counts are only
    # needed under -n, when a new language model is created.
    my @unknown = create_lm($input);

    # If text_cat was invoked from the command line, the language models
    # still have to be loaded.
    if ( !%lmodel && $opt_l ) {
        # open directory to find which languages are supported
        opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
        my @lm = sort( grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR) );
        closedir DIR;
        if ( !@lm ) { die "text_cat: no language models found in $opt_d"; }
        for my $lang (@lm) { $lmodel{$lang} = load_lm($lang); }
        if ( !$opt_w ) {
            for my $lang (@lm) { $wmodel{$lang} = load_wm($lang); }
        }
    }

    my %results = ();
    my $maxp    = $opt_t;
    if ( !%lmodel ) {
        print "Loading the language models failed, exiting..\n";
        return;
    }

    foreach my $language ( keys %lmodel ) {
        # Compare the language model with the input n-gram list
        # ("out of place" distance).
        my ( $i, $p ) = ( 0, 0 );
        while ( $i < @unknown ) {
            my $unk = $unknown[$i];
            if ( $lmodel{$language}{$unk} ) {
                $p = $p + abs( $lmodel{$language}{$unk} - $i );
            }
            else {
                $p = $p + $maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    if ($opt_v) {
        print "results\n";
        for my $r (@results) { print "$r\t$results{$r}\n"; }
    }
    my $ans  = $results{ $results[0] };
    my $ans2 = @results > 1 ? $results{ $results[1] } : undef;

    # Drop the candidates which are too weak.
    my @answers = ( shift(@results) );
    while ( @results && $results{ $results[0] } < ( $opt_u * $ans ) ) {
        @answers = ( @answers, shift(@results) );
    }

    # print "Best is $ans, Answers are: @answers\n";
    # If there was no language that could fit the input,
    # return undef.
    if ( $ans == 0 || ( defined $ans2 && $ans2 == $ans ) ) {
        if ($opt_l) {
            print "Could not decide the language\n";
            exit;
        }
        else { return undef; }
    }
    splice( @answers, 3 ) if ( @answers > 3 );

    # SHORT WORD MODEL
    # Test the remaining candidate languages with the short word lists.
    my @wanswers;
    my %wresults;
  TEST_SHORT: {
        if ( @answers == 1 || $opt_w || !%wmodel ) {
            @wanswers = @answers;
            last TEST_SHORT;
        }

        # Create the short word model of the input.
        my %wunknown = create_wm($input);
        foreach my $language (@answers) {
            # Compare the short word model with the input word list.
            my ( $i, $p ) = ( 0, 0 );
            for my $key ( keys %wunknown ) {
                if ( $wmodel{$language}{$key} ) {
                    my $bonus = sprintf( "%.0f",
                        ( $wmodel{$language}{$key}**2 *
                              $wunknown{$key} * 100 / $results{$language} ) );
                    $p = $p - $bonus;
                    # if ($opt_l) { print "$language vah. $bonus, p: $p $wmodel{$language}{$key} {$key $wunknown{$key}}\n"; }
                }
                ++$i;
            }
            # print STDERR "short word: $language: $p\n" if $opt_v;

            # The result of the word test is added to the results of
            # the trigram test.
            $wresults{$language} = $results{$language} + $p;
        }
        my @wresults_a =
          sort { $wresults{$a} <=> $wresults{$b} } keys %wresults;
        if ($opt_v) {
            print "word results\n";
            for my $r (@wresults_a) { print "$r\t$wresults{$r}\n"; }
        }
        my $ans = $wresults{ $wresults_a[0] };
        @wanswers = ( shift(@wresults_a) );
        while ( @wresults_a
            && $wresults{ $wresults_a[0] } < ( $opt_u * $ans ) )
        {
            @wanswers = ( @wanswers, shift(@wresults_a) );
        }

        # If there was no language that could fit the input,
        # return undef.
        if ( $ans == 0 ) {
            if ($opt_l) { print "Could not decide the language\n"; exit; }
            else        { return undef; }
        }
        # print "Best is $ans, WAnswers are: @wanswers\n";
    }    # TEST_SHORT

    if ( @wanswers > $opt_a ) {
        if ($opt_l) {
            print "I don't know; "
              . "Perhaps this is a language I haven't seen before?\n";
        }
        else { return undef; }
    }
    else {
        if ($opt_l) { print join( " or ", @wanswers ), "\n"; }
        else        { return @wanswers; }
    }
}

# Read the input to be classified from the argument files or stdin;
# with -i N only the first N lines are read.
sub input {
    my $read = "";
    if ($opt_i) {
        while (<>) {
            if ( $. == $opt_i ) {
                return $read . $_;
            }
            $read = $read . $_;
        }
        return $read;
    }
    else {
        local $/;    # so it doesn't affect $/ elsewhere
        undef $/;
        $read = <>;    # swallow input.
        $read || die "determining the language of an empty file is hard...\n";
        return $read;
    }
}

# Build the short word model of a text: a hash of lower-cased words
# and their frequencies.
sub create_wm {
    my %wgram;
    my $text = shift @_;
    $text =~ s/[^\pL_\- ]//og;
    # print "$text\n";
    foreach my $word ( split( /\s+/, $text ) ) {
        $wgram{ lc($word) }++;
    }
    return %wgram;
}

sub create_lm {
    my ( $string, $ngram ) = @_;  # $ngram is a reference to the hash we build
    # then add the ngrams found in each word in the hash
    my @array;
    # if ($infile) {
    #     open(FH, "<:utf8", "$infile");
    #     while (<FH>) { chomp; push(@array, $_); }
    # }
    if ($string) { @array = split( "\n", $string ); }
    foreach (@array) {
        my $line = $_;
        # remove non-word characters from input --sh
        $line =~ s/[^\pL_\- ]//og;
        foreach my $word ( split( /\s+/, $line ) ) {
            # lower case
            $word = lc($word);
            $word = "_" . $word . "_";
            my $len  = length($word);
            my $flen = $len;
            my $i;
            for ( $i = 0 ; $i < $flen ; $i++ ) {
                $$ngram{ substr( $word, $i, 5 ) }++ if $len > 4;
                $$ngram{ substr( $word, $i, 4 ) }++ if $len > 3;
                $$ngram{ substr( $word, $i, 3 ) }++ if $len > 2;
                $$ngram{ substr( $word, $i, 2 ) }++ if $len > 1;
                $$ngram{ substr( $word, $i, 1 ) }++;
                $len--;
            }
        }
    }
    ### print STDERR "@{[%$ngram]}";
    # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
    # sorting by removing singletons
    foreach my $key ( keys %$ngram ) {
        if ( $$ngram{$key} <= $opt_f ) { delete $$ngram{$key}; }
    }
    # however I have very bad results for short inputs, this way

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice( @sorted, $opt_t ) if ( @sorted > $opt_t );
    return @sorted;
}

sub load_lm {
    my $language = shift @_;

    # loads the language model into a hash
    my %ngram = ();
    my $rang  = 1;
    open( LM, "$opt_d/$language.lm" ) || die "cannot open $language.lm: $!\n";
    print STDERR "creating language model $language\n" if $opt_v;
    while (<LM>) {
        chomp;
        # only use lines starting with an appropriate character; others are
        # ignored.
        # remove non-word characters from ngram --sh
        s/[^\pL]//og;
        s/\s+//g;
        next if (/^_+$/);
        next if (/^\s+$/);
        if (/^[\pL]+/o) {
            # The n-grams are listed by decreasing frequency, so less
            # frequent n-grams get a higher rank.
            $ngram{$_} = $rang++;
        }
    }
    close(LM);
    print STDERR "loaded language model $language\n" if $opt_v;
    return \%ngram;
}

sub load_wm {
    my $language = shift @_;

    # loads the short word model of the language into a hash
    my %wgram = ();

    # The word lists contain at most 1000 words.
    my $maxw = 1000;
    if ( !open( LM, "$opt_d/$language.wm" ) ) {
        # print STDERR "cannot open $opt_d/$language.wm: $!\n";
        return;
    }
    print STDERR "creating short word model for $language\n" if $opt_v;

    # Normalize the length of the list
    my $maxw_lang = `wc -l "$opt_d/$language.wm"`;
    $maxw_lang =~ s/^\s*(\d+)\s.*$/$1/s;
    if ( !$maxw_lang ) { close(LM); return; }
    my $wrang     = 0;
    my $increment = $maxw / $maxw_lang;
    while (<LM>) {
        chomp;
        s/\s+//og;
        s/[^\pL]//og;
        if (/^[\pL]+/o) {
            $wrang += $increment;
            $wgram{$_} = $maxw - $wrang;
        }
    }
    close(LM);
    print STDERR "loaded short word model $language\n" if $opt_v;
    return \%wgram;
}
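
# Example invocations, for reference only. They assume the script is saved
# as text_cat.pl, that language models (*.lm, optionally *.wm) live under
# ./LM, and that the XML input follows the structure read_xml() expects
# (a <header> element and <p> paragraphs); file names below are hypothetical.
#
#   perl text_cat.pl -l article.txt                 # guess the language of a file
#   perl text_cat.pl -l -s < lines.txt              # guess each line separately
#   perl text_cat.pl -n < training.txt > new.lm     # build a new n-gram model
#   perl text_cat.pl -x -d ./LM corpus.xml          # tag paragraphs with xml:lang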