Moses Output

#!/usr/bin/perl -w
use warnings;
use strict;
$|++;

#
# Connection script between the Autshumato ITE and Autshumato MT
#
# Reads the text to translate from the CGI parameter 'url', tokenizes it,
# sends it sentence by sentence to a Moses server, detokenizes the result and
# prints it back as a UTF-8 encoded text/html response.
#

#-- IMPORTS --
use CGI;
use CGI::Carp qw/fatalsToBrowser/;
use Encode;
use lib 'lib';
use RemoteProcess;
use Subprocess;

# Moses Server address(es)
my @MOSES_ADDRESSES = ("localhost:1025");

# The tokenizer tries to adapt its rules depending on the language it's dealing
# with, so we indicate that here.
my $INPUT_LANG  = 'en';
my $OUTPUT_LANG = 'en';

# In order to tokenize and detokenize strings in a way that stays consistent
# with how it is done in the rest of the Moses system, we use the scripts that
# come with Moses as external processes. These are the commands we must run to
# start them.
my @TOKENIZER_CMD   = ('./bin/tokenizer.perl',   '-l', $INPUT_LANG);
my @DETOKENIZER_CMD = ('./bin/detokenizer.perl', '-l', $OUTPUT_LANG);

# clear the path
$ENV{PATH} = '';

# get the String to be translated
# (change the variable in html)
# A missing parameter is treated as an empty string so that undef is never
# handed to the tokenizer.
my $raw_sentence = CGI->new->param('url');
my $sentence = defined $raw_sentence ? decode('UTF-8', $raw_sentence) : '';

# run the tokenisers
my $tokenizer   = Subprocess->new(@TOKENIZER_CMD);
my $detokenizer = Subprocess->new(@DETOKENIZER_CMD);
$tokenizer->start;
$detokenizer->start;

# connect to the moses server
my ($host, $port) = split /:/, $MOSES_ADDRESSES[0];
my $moses = RemoteProcess->new($host, $port)
    or die "Can't connect to '$host:$port'";
$moses->start;

# translate the string
my $output = translate_text($sentence, $moses, $tokenizer, $detokenizer);

# return the string
# The response is declared as text/html and the text originates from the
# request, so it is HTML-escaped before being embedded in the page.
# NOTE(review): any markup that originally surrounded this text is not visible
# in this copy of the script -- confirm the page template against the
# deployed version.
my $safe_output = CGI::escapeHTML(defined $output ? $output : '');
print "Content-Type: text/html; charset=UTF-8\n\n";
print encode('UTF-8', " Moses Output\n\n$safe_output\n\n");

# Translate a block of text.
#
# Parameters:
#   $input_text  - decoded text to translate
#   $moses       - object whose do_line() sends one sentence to Moses
#   $tokenizer   - object whose do_line() tokenizes a line
#   $detokenizer - object whose do_line() detokenizes a line
#
# Returns the detokenized translation of the whole block.
sub translate_text {
    my ($input_text, $moses, $tokenizer, $detokenizer) = @_;

    $input_text = $tokenizer->do_line($input_text);

    # Sentence splitting within a paragraph or block of text is done after
    # tokenizing. Tokens matched by this regex will be considered to end a
    # sentence, and hence be used in splitting the text into sentences.
    # The separating whitespace is captured so it is kept in @lines.
    my @lines = split(/(?<=\s[\.\?!:;])(\s)/, $input_text);

    my $output_text = '';
    foreach my $line (@lines) {

        # Separator whitespace produced by the split is copied through as is.
        if ($line =~ /^\s$/) {
            $output_text .= $line;
            next;
        }

        # This is now ready to be shipped to Moses as a single sentence. We
        # lowercase as needed, and make a note of whether we did, so the
        # casing can be restored on the translated text.
        my $was_ucfirst = ($line =~ s/^(\p{IsUpper})(?=\p{IsLower})/lc $1/e);
        my $was_allcaps = ($line =~ s/^([\p{IsUpper}\P{IsAlpha}]+)$/lc $1/e);

        my $t_output_text = _translate_text_moses($line, $moses);

        # Early post-translation formatting fixes
        $t_output_text = ucfirst $t_output_text if $was_ucfirst;
        $t_output_text = uc $t_output_text      if $was_allcaps;

        $output_text .= $t_output_text;
    }

    # Try to remove spaces inserted by the tokenizer
    $output_text = $detokenizer->do_line($output_text);

    return $output_text;
}

# This one, given a handle to a Moses process, will use that to translate
# the text.
#
# Parameters:
#   $text  - a single tokenized sentence
#   $moses - object whose do_line() returns the Moses output for that sentence
#
# Returns the translated phrases, each followed by a single space, with the
# |start-end| trace markers removed. Returns '' if Moses returned nothing.
sub _translate_text_moses {
    my ($text, $moses) = @_;

    my $traced_text = $moses->do_line($text);
    return '' unless defined $traced_text;

    # Remove the trace outputs: each phrase is followed by a |n-m| marker.
    my $out_text = '';
    while ($traced_text =~ s/^(.+?)\s*\|\d+-+\d+\|\s*//) {
        $out_text .= $1 . ' ';
    }

    # Whatever is left carries no trace marker (for instance when the server
    # does not emit traces at all). Keep it rather than silently dropping the
    # translation.
    $traced_text =~ s/^\s+|\s+$//g;
    $out_text .= $traced_text . ' ' if length $traced_text;

    return $out_text;
}