#!/usr/bin/perl -w
# © Gertjan van Noord, 1997.
# mailto:vannoord@let.rug.nl

use strict;
use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_q $opt_s
            $opt_t $opt_v $opt_u $opt_a $opt_x $opt_w);
use Getopt::Std;
use XML::Twig;
use utf8;

# Paragraphs shorter than $MINCHAR characters are not classified.
my $MINCHAR = 0;

# Trigram and short word models, keyed by language.
my %lmodel;
my %wmodel;

# Opening/closing pairs of quotation marks recognized by mark_span().
my %pairs = (
    "\"" => "\"",
    "«"  => "»",
    "“"  => "”",
);

# OPTIONS
getopts('a:d:f:i:nqst:u:xhvwl');

# defaults: set $opt_X unless already defined (Perl Cookbook p. 6):
$opt_a ||= 10;
$opt_d ||= './LM';
$opt_f ||= 0;
$opt_w ||= 0;
$opt_t ||= 400;
$opt_u ||= 1.1;
$opt_q ||= 0;

sub help {
    print <<"HELP";
Text Categorization: guess the language of the input text.

Usage: $0 [options] [file ...]

    -a N    give up when more than N languages remain plausible (default: 10)
    -d DIR  directory containing the language models (default: ./LM)
    -f N    drop n-grams occurring N times or fewer before sorting (default: 0)
    -i N    only read the first N lines of the input
    -l      command-line mode: print the guessed language(s) to stdout
    -n      create a new language model from the input and print it
    -q      also guess the language of quoted text ("...", «...», “...”)
    -s      classify each input line separately
    -t N    use at most N n-grams per model (default: 400)
    -u N    keep candidates scoring below N times the best score (default: 1.1)
    -w      do not use the short word models (*.wm)
    -x      the arguments are XML files; tag their paragraphs with xml:lang
    -v      verbose output
    -h      print this help message
HELP
}

# MAIN: choose the mode of operation from the options.
if ($opt_h) {
    help();
    exit 0;
}

if ($opt_n) {
    # Build a new language model from the input and print it.
    my %ngram  = ();
    my @result = create_lm( input(), \%ngram );
    print join( "\n", map {"$_\t $ngram{$_}"} @result ), "\n";
} elsif ($opt_x) {
    # Tag the paragraphs of the given XML files with xml:lang.
    foreach my $file (@ARGV) {
        read_xml($file);
    }
} elsif ($opt_s) {
    # Classify each input line separately.
    while (<>) {
        chomp;
        classify($_);
    }
} else {
    # Classify the input as a whole.
    classify( input() );
}

# prepare the xml-file for language detection
sub read_xml {
    my $file = shift @_;
    my @languages;
    my $test = 0;

    # Find out the languages specified for the document.
    # Take only the header of the document.
    my $twig = XML::Twig->new( twig_roots => { header => 1 } );
    if ( !$twig->safe_parsefile("$file") ) {
        print STDERR "Encoding: $file: ERROR parsing the XML-file failed. $@\n";
        return;
    }
    my $docu        = $twig->root;
    my $mainlang    = $docu->{'att'}->{'xml:lang'};
    my $mainlang_ok = 0;
    my $header      = $docu->first_child('header');
    if ( !$header ) {
        return;
    }
    my $multi = $header->first_child('multilingual');
    if ( !$multi ) {
        if ($test) { print STDERR "text_cat: Document is monolingual.\n"; }
        return;
    }

    # open directory to find which languages are supported
    opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
    my @lm = sort( grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR) );
    closedir DIR;
    if ( !@lm ) { die "text_cat: no language models found in $opt_d"; }

  LANG_TEST: {
        my @langs = $multi->children;
        if ( !@langs ) {
            if ($test) {
                print STDERR
                  "text_cat: Document is categorized in languages: all.\n";
            }
            @languages = @lm;
            last LANG_TEST;
        }
        for my $lang (@langs) {
            my $code = $lang->{'att'}->{'xml:lang'};
            push @languages, $code;
            if ( $code eq $mainlang ) { $mainlang_ok = 1; }
        }
        if ( !$mainlang_ok ) { push @languages, $mainlang; }
        if ($test) {
            if (@languages) {
                print STDERR
                  "Document is categorized in languages: @languages\n";
            }
            else {
                print STDERR
                  "text_cat: Document is categorized in languages: all.\n";
            }
        }
    }

    # Test if the main language is included in the language models.
    my %languages_h;
    for my $lang (@lm) { $languages_h{$lang} = 1; }
    if ( !$languages_h{$mainlang} ) {
        print STDERR
          "text_cat: Document language \"$mainlang\" was not recognized.\n";
        return;
    }

    # Load language and word models for the tested languages.
    foreach my $lang (@languages) { $lmodel{$lang} = load_lm($lang); }
    if ( !$opt_w ) {
        foreach my $lang (@languages) { $wmodel{$lang} = load_wm($lang); }
    }

    # Process one paragraph at a time.
    # Function langdetect handles the classification of the paragraph.
    my $document = XML::Twig->new(
        twig_handlers => { p => sub { langdetect( @_, $mainlang ); } } );
    if ( !$document->safe_parsefile("$file") ) {
        print STDERR "Langdetect: ERROR parsing the XML-file failed: $@\n";
        return;
    }
    open( FH, ">$file" ) or die "ERROR cannot open file $!";
    $document->set_pretty_print('indented');
    $document->print( \*FH );
    $document->purge;
    close(FH);
}

# Process each xml-element (paragraph)
sub langdetect {
    my ( $twig, $para, $mainlang ) = @_;
    my $text = $para->text;

    # The language of paragraphs shorter than MINCHAR characters is not
    # guessed.
    my $count = length($text);
    if ( $count < $MINCHAR ) { $para->del_att('xml:lang'); return; }

    # If the text contains only non-word characters, it is classified
    # as the default language.
    if ( $text !~ /[\pL]/o ) { $para->del_att('xml:lang'); return; }

    # Classification returns the choices for the text.
    my @choices = classify($text);

    # If the language was not recognized, the document language is selected.
    if ( !@choices || !$choices[0] ) {
        # print "Language was not recognized.\n";
        $para->del_att('xml:lang');
        return;
    }

    # The best language is selected.
    my $bestlang = $choices[0];

    # Test whether there are quoted strings in a language other than the
    # main language.
    if ( $opt_q && $text =~ /[\«\"“]/ ) {
        my @text_array;
        my $return_val = mark_span( $text, \@text_array, $mainlang );
        if ($return_val) {
            $para->set_content(@text_array);
        }
    }
    if ( $bestlang ne $mainlang ) {
        $para->set_att( "xml:lang" => $bestlang );
    }
    else {
        $para->del_att('xml:lang');
    }
}

# Function to pick quotations into "span" elements.
# The language of the quotation is checked and marked
# if it differs from the main language.
sub mark_span {
    my ( $text, $text_aref, $mainlang ) = @_;
    my $first;
    if ( $text =~ /[\"\«“]/ ) {
        if ( $text =~ /([\«\"\“]).*?[\»\"”]/s ) {
            $first = $1;
        }
        else { return 0; }
    }
    else { return 0; }
    my $second = $pairs{$first};

    if ( $text =~ /^(.*?)($first.*?$second)(.*)$/s ) {
        my $before = $1;
        my $quote  = $2;
        my $rest   = $3;

        # print "1: $before\n 2: $quote\n 3: $rest\n";
        # Classification returns the choices for the text.
        my @choices = classify($quote);

        # If the language was not recognized, the mainlang is selected.
        my $bestlang;
        if ( !@choices || !$choices[0] ) { $bestlang = $mainlang; }
        else                             { $bestlang = $choices[0]; }

        # A new span element is created for the quotation.
        push( @$text_aref, $before );
        my $span = XML::Twig::Elt->new('span');
        $span->set_att( "type" => 'quote' );
        if ( $bestlang ne $mainlang ) {
            $span->set_att( "xml:lang" => $bestlang );
        }
        $span->set_text($quote);
        push( @$text_aref, $span );

        my $return_val;
        # The rest of the paragraph is processed further.
        if ($rest) { $return_val = mark_span( $rest, $text_aref, $mainlang ); }
        if ( !$return_val ) { push( @$text_aref, $rest ); }
        return 1;
    }
    return 0;
}

# CLASSIFICATION
sub classify {
    my $input = shift @_;

    # Create the n-gram profile of the input. The raw counts are only
    # needed under -n, when a new language model is created.
    my @unknown = create_lm($input);

    # If text_cat was invoked from the command line, the language models
    # still have to be loaded.
    if ( !%lmodel && $opt_l ) {
        # open directory to find which languages are supported
        opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
        my @lm = sort( grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR) );
        closedir DIR;
        if ( !@lm ) { die "text_cat: no language models found in $opt_d"; }
        for my $lang (@lm) { $lmodel{$lang} = load_lm($lang); }
        if ( !$opt_w ) {
            for my $lang (@lm) { $wmodel{$lang} = load_wm($lang); }
        }
    }

    my %results = ();
    my $maxp    = $opt_t;
    if ( !%lmodel ) {
        print "Loading the language models failed, exiting..\n";
        return;
    }

    foreach my $language ( keys %lmodel ) {
        # Compare the language model with the input n-gram list
        # ("out of place" distance).
        my ( $i, $p ) = ( 0, 0 );
        while ( $i < @unknown ) {
            my $unk = $unknown[$i];
            if ( $lmodel{$language}{$unk} ) {
                $p = $p + abs( $lmodel{$language}{$unk} - $i );
            }
            else {
                $p = $p + $maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    if ($opt_v) {
        print "results\n";
        for my $r (@results) { print "$r\t$results{$r}\n"; }
    }
    my $ans  = $results{ $results[0] };
    my $ans2 = @results > 1 ? $results{ $results[1] } : undef;

    # Drop the candidates which are too weak.
    my @answers = ( shift(@results) );
    while ( @results && $results{ $results[0] } < ( $opt_u * $ans ) ) {
        @answers = ( @answers, shift(@results) );
    }

    # print "Best is $ans, Answers are: @answers\n";
    # If there was no language that could fit the input,
    # return undef.
    if ( $ans == 0 || ( defined $ans2 && $ans2 == $ans ) ) {
        if ($opt_l) {
            print "Could not decide the language\n";
            exit;
        }
        else { return undef; }
    }
    splice( @answers, 3 ) if ( @answers > 3 );

    # SHORT WORD MODEL
    # Test the remaining candidate languages with the short word lists.
    my @wanswers;
    my %wresults;
  TEST_SHORT: {
        if ( @answers == 1 || $opt_w || !%wmodel ) {
            @wanswers = @answers;
            last TEST_SHORT;
        }

        # Create the short word model of the input.
        my %wunknown = create_wm($input);
        foreach my $language (@answers) {
            # Compare the short word model with the input word list.
            my ( $i, $p ) = ( 0, 0 );
            for my $key ( keys %wunknown ) {
                if ( $wmodel{$language}{$key} ) {
                    my $bonus = sprintf( "%.0f",
                        ( $wmodel{$language}{$key}**2 *
                              $wunknown{$key} * 100 / $results{$language} ) );
                    $p = $p - $bonus;
                    # if ($opt_l) { print "$language vah. $bonus, p: $p $wmodel{$language}{$key} {$key $wunknown{$key}}\n"; }
                }
                ++$i;
            }
            # print STDERR "short word: $language: $p\n" if $opt_v;

            # The result of the word test is added to the results of
            # the trigram test.
            $wresults{$language} = $results{$language} + $p;
        }
        my @wresults_a =
          sort { $wresults{$a} <=> $wresults{$b} } keys %wresults;
        if ($opt_v) {
            print "word results\n";
            for my $r (@wresults_a) { print "$r\t$wresults{$r}\n"; }
        }
        my $ans = $wresults{ $wresults_a[0] };
        @wanswers = ( shift(@wresults_a) );
        while ( @wresults_a
            && $wresults{ $wresults_a[0] } < ( $opt_u * $ans ) )
        {
            @wanswers = ( @wanswers, shift(@wresults_a) );
        }

        # If there was no language that could fit the input,
        # return undef.
        if ( $ans == 0 ) {
            if ($opt_l) { print "Could not decide the language\n"; exit; }
            else        { return undef; }
        }
        # print "Best is $ans, WAnswers are: @wanswers\n";
    }    # TEST_SHORT

    if ( @wanswers > $opt_a ) {
        if ($opt_l) {
            print "I don't know; "
              . "Perhaps this is a language I haven't seen before?\n";
        }
        else { return undef; }
    }
    else {
        if ($opt_l) { print join( " or ", @wanswers ), "\n"; }
        else        { return @wanswers; }
    }
}

# Read the input to be classified from the argument files or stdin;
# with -i N only the first N lines are read.
sub input {
    my $read = "";
    if ($opt_i) {
        while (<>) {
            if ( $. == $opt_i ) {
                return $read . $_;
            }
            $read = $read . $_;
        }
        return $read;
    }
    else {
        local $/;    # so it doesn't affect $/ elsewhere
        undef $/;
        $read = <>;    # swallow input.
        $read || die "determining the language of an empty file is hard...\n";
        return $read;
    }
}

# Build the short word model of a text: a hash of lower-cased words
# and their frequencies.
sub create_wm {
    my %wgram;
    my $text = shift @_;
    $text =~ s/[^\pL_\- ]//og;
    # print "$text\n";
    foreach my $word ( split( /\s+/, $text ) ) {
        $wgram{ lc($word) }++;
    }
    return %wgram;
}

sub create_lm {
    my ( $string, $ngram ) = @_;  # $ngram is a reference to the hash we build
    # then add the ngrams found in each word in the hash
    my @array;
    # if ($infile) {
    #     open(FH, "<:utf8", "$infile");
    #     while (<FH>) { chomp; push(@array, $_); }
    # }
    if ($string) { @array = split( "\n", $string ); }
    foreach (@array) {
        my $line = $_;
        # remove non-word characters from input --sh
        $line =~ s/[^\pL_\- ]//og;
        foreach my $word ( split( /\s+/, $line ) ) {
            # lower case
            $word = lc($word);
            $word = "_" . $word . "_";
            my $len  = length($word);
            my $flen = $len;
            my $i;
            for ( $i = 0 ; $i < $flen ; $i++ ) {
                $$ngram{ substr( $word, $i, 5 ) }++ if $len > 4;
                $$ngram{ substr( $word, $i, 4 ) }++ if $len > 3;
                $$ngram{ substr( $word, $i, 3 ) }++ if $len > 2;
                $$ngram{ substr( $word, $i, 2 ) }++ if $len > 1;
                $$ngram{ substr( $word, $i, 1 ) }++;
                $len--;
            }
        }
    }
    ### print STDERR "@{[%$ngram]}";
    # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
    # sorting by removing singletons
    foreach my $key ( keys %$ngram ) {
        if ( $$ngram{$key} <= $opt_f ) { delete $$ngram{$key}; }
    }
    # however I have very bad results for short inputs, this way

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice( @sorted, $opt_t ) if ( @sorted > $opt_t );
    return @sorted;
}

sub load_lm {
    my $language = shift @_;

    # loads the language model into a hash
    my %ngram = ();
    my $rang  = 1;
    open( LM, "$opt_d/$language.lm" ) || die "cannot open $language.lm: $!\n";
    print STDERR "creating language model $language\n" if $opt_v;
    while (<LM>) {
        chomp;
        # only use lines starting with an appropriate character; others are
        # ignored.
        # remove non-word characters from ngram --sh
        s/[^\pL]//og;
        s/\s+//g;
        next if (/^_+$/);
        next if (/^\s+$/);
        if (/^[\pL]+/o) {
            # The n-grams are listed by decreasing frequency, so less
            # frequent n-grams get a higher rank.
            $ngram{$_} = $rang++;
        }
    }
    close(LM);
    print STDERR "loaded language model $language\n" if $opt_v;
    return \%ngram;
}

sub load_wm {
    my $language = shift @_;

    # loads the short word model of the language into a hash
    my %wgram = ();

    # The word lists contain at most 1000 words.
    my $maxw = 1000;
    if ( !open( LM, "$opt_d/$language.wm" ) ) {
        # print STDERR "cannot open $opt_d/$language.wm: $!\n";
        return;
    }
    print STDERR "creating short word model for $language\n" if $opt_v;

    # Normalize the length of the list
    my $maxw_lang = `wc -l "$opt_d/$language.wm"`;
    $maxw_lang =~ s/^\s*(\d+)\s.*$/$1/s;
    if ( !$maxw_lang ) { close(LM); return; }
    my $wrang     = 0;
    my $increment = $maxw / $maxw_lang;
    while (<LM>) {
        chomp;
        s/\s+//og;
        s/[^\pL]//og;
        if (/^[\pL]+/o) {
            $wrang += $increment;
            $wgram{$_} = $maxw - $wrang;
        }
    }
    close(LM);
    print STDERR "loaded short word model $language\n" if $opt_v;
    return \%wgram;
}
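
# Example invocations, for reference only. They assume the script is saved
# as text_cat.pl, that language models (*.lm, optionally *.wm) live under
# ./LM, and that the XML input follows the structure read_xml() expects
# (a <header> element and <p> paragraphs); file names below are hypothetical.
#
#   perl text_cat.pl -l article.txt                 # guess the language of a file
#   perl text_cat.pl -l -s < lines.txt              # guess each line separately
#   perl text_cat.pl -n < training.txt > new.lm     # build a new n-gram model
#   perl text_cat.pl -x -d ./LM corpus.xml          # tag paragraphs with xml:lang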