#!/usr/bin/perl -w # Debugging #use CGI::Debug; #use lib '/home/trond/gt/main/gt/script'; use strict; use utf8; use HTML::Entities; #use Unicode::String qw(utf8 latin1); use XML::Twig; use CGI::Minimal; use CGI; #use CGI qw/:standard :html3 *table *dl/; #$CGI::DISABLE_UPLOADS = 0; # limit posts to 1 meg max #$CGI::POST_MAX = 1_024 * 1_024; use CGI::Alert ('trond.trosterud@uit.no', 'http_die'); #use Encode qw( encode_utf8 ); #use JSON::MaybeXS (); #use JSON::MaybeXS qw(encode_json decode_json); # To read variable "hostname" use Sys::Hostname; # Use project's utility functions. use langTools::Util; use langTools::XMLStruct; # Configuration: variable definitions etc. require "/var/www/cgi-bin/smi/conf.pl"; # on server # require "/usr/lib/cgi-bin/smi/conf.pl"; # local testing ######################################################################## # #smi-cg.cgi # # resides: Web Folder:cgi-bin:smi:smi.cgi # # called from HTML; output HTML # # Original written by Ken Beesley, Xerox, for Aymara. # reviewed and modified 12 april 2002, Trond Trosterud # reviewed and modified 2006,2007 Saara Huhmarniemi # modified 2012 Heli Uibo, Sjur Moshagen # modified 2013, 2014 Ciprian Gerstenberger # # $Id$ ######################################################################## # this CGI script is called whenever a user submits an analysis request # from the FORM on the different Sami HTML pages # The script uses Perl module CGI.pm to retrieve and handle # information from HTML form and generating new HTML pages. # Variables retrieved from the query. our ($text,$pos,$lang,$plang,$xml_in,$xml_out,$action,$mode,$tr_lang,$json); # Variable definitions, included in smi.cgi our ($wordlimit,$utilitydir,$bindir,$paradigmfile,%paradigmfiles,$tmpfile,$tagfile,$langfile,$logfile,$div_file); our ($preprocess,$analyze,$disamb,$dependency,$gen_lookup,$gen_norm_lookup,$generate,$generate_norm,$hyphenate,$transcribe,$convert,$lat2syll,$syll2lat,%avail_pos, %lang_actions, $translate,$placenames,$remove_weight); our ($uit_href,$giellatekno_href,$projectlogo,$unilogo); ##### DEVELOPMENT OR PRODUCTION ##### # Set $devel to 1 when testing locally, 0 when on production server my $devel = 0 ; ##### GET THE INPUT ##### $text=""; #The text to be analysed my $query = CGI::Minimal->new; my $query2 = new CGI; $text = $query->param('text'); if (! $text ) { $text = '';} $pos = $query->param('pos'); if (! $pos ) { $pos = '';} # Remove the unsecure characters from the input. $pos =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; $lang = $query->param('lang'); if (! $lang ) { $lang = '';} # Remove the unsecure characters from the input. $lang =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; $plang = $query->param('plang'); if (! $plang ) { $plang = '';} # Remove the unsecure characters from the input. $plang =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; # Action is either "generate" or "analyze" or "paradigm" or "placenames" $action = $query->param('action'); if (! $action) { $action= '';} # Remove the unsecure characters from the input. $action =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; # Paradigm mode: minimal, standard, full, full with dialectal variation $mode = $query->param('mode'); if (! $mode) { $mode= '';} # Remove the unsecure characters from the input. $mode =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; # The language for lemma translation in disambiguation. $tr_lang = $query->param('translate'); if (! $tr_lang) { $tr_lang = "none"; } # Remove the unsecure characters from the input. $tr_lang =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; # Input and output can be xml. $xml_in = $query->param('xml_in'); if (! $xml_in) { $xml_in = '';} $xml_out = $query->param('xml_out'); if (! $xml_out) { $xml_out = '';} $json = $query->param('json'); if (! $json) { $json= '';} # Remove the unsecure characters from the input. $json =~ s/[\/;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; if ($pos =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid pos.\n" }; if ($lang =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid lang.\n" }; if ($plang =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid plang.\n" }; if ($action =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid actoin.\n" }; if ($mode =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid mode.\n" }; if ($tr_lang =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid tr_lang.\n" }; if ($json =~ /\s/) { http_die '--no-alert','400 Bad Request',"Invalid json.\n" }; if (! $lang && $action ne "placenames" ) { http_die '--no-alert','400 Bad Request',"lang parameter missing.\n" }; if (! $text) { http_die '--no-alert','400 Bad Request',"No text given.\n" }; if (! $action) { http_die '--no-alert','400 Bad Request',"No action given.\n" }; ##### INITIALIZE #### &init_variables; # temporary files #open (FH, ">$tmpfile"); #open (LFH, ">>$logfile"); my @candidates; my $document; my $page; my $form_action; my $server_name = $ENV{SERVER_NAME}; my $script_name = $ENV{SCRIPT_NAME}; # If developing, use local url. On production server, use gtweb url. if ($devel && $server_name && $script_name) { $form_action = "http://" . $server_name . $script_name ; } else { $form_action="https://gtweb.uit.no/cgi-bin/smi/smi.cgi"; } my $body; my $giellatekno_logo; # Initialize HTML-page if (! $xml_out) { if ($json eq 'true') { $document = XML::Twig->new(keep_encoding => 1); if (! $document->safe_parsefile ("$langfile")) { print "parsing the XML-file failed: $@\n"; exit; } $page = $document->root; $body = XML::Twig::Elt->new("body"); $body->set_pretty_print('record'); $body->set_empty_tag_style ('expand'); } else { # Parse language file. $document = XML::Twig->new(keep_encoding => 1); if (! $document->safe_parsefile ("$langfile")) { print "parsing the XML-file failed: $@\n"; exit; } $page = $document->root; $body = XML::Twig::Elt->new("body"); $body->set_pretty_print('record'); $body->set_empty_tag_style ('expand'); my $a = XML::Twig::Elt->new(a=>{href=>$uit_href},'The University of Tromsø >'); $a->paste('last_child',$body); $a = XML::Twig::Elt->new(a=>{href=>$giellatekno_href},'Giellatekno >'); $a->paste('last_child',$body); my $br = XML::Twig::Elt->new('br'); $br->paste('last_child', $body); $giellatekno_logo = XML::Twig::Elt->new(a=>{href=>$giellatekno_href}); my $img= XML::Twig::Elt->new(img=>{src=>$projectlogo, style=>'border: none;', title=>'Giellatekno'}); $img->paste('last_child', $giellatekno_logo); &printinitialhtmlcodes($action, $page, $body); } } # Process input XML if ($xml_in) { if ($action eq "analyze" || $action eq "disamb" || $action eq "dependency" || $action eq "hyphenate" || $action eq "transcribe" || $action eq "convert" || $action eq "lat2syll" || $action eq "syll2lat") { $text = xml2preprocess($text); } if ($action eq "generate" || $action eq "paradigm") { $text = xml2words($text); } } # no charset radio buttons #if ($charset eq "latin1") { # $text = Unicode::String::latin1( $text); #} # Convert html-entity to unicode decode_entities( $text ); #print LFH "PARAM $action, $lang, $plang"; #if ($action eq "paradigm") { print LFH "$pos"; } #print LFH "\n$text\n"; # Special characters in the text (e.g. literal ampersands, plus signs # and equal signs # typed by the user) must be encoded for transmission, to prevent confusion with # the delimiters used by CGI); here is the magic formula to undo the CGI encodings $text =~ s/%(..)/pack("c",hex($1))/ge ; # Convert digraphs to utf-8 $text = digr_utf8($text); # Remove the unsecure characters from the input. $text =~ s/[;<>\*\|`&\$!\#\(\)\[\]\{\}'"]/ /g; # ` This stupid dummy line is here just to restore emacs syntax colouring. # Change linebreaks to space and check the word limit my @words = split(/[\s]+/, $text); $text = join(' ', splice(@words,0,$wordlimit)); if (@words && ! $xml_out) { http_die '--no-alert','400 Bad Request',"Too many words in input. The limit is $wordlimit.\n" }; # And here is where the actual lookup gets done: # ############################################### # 1. echo the input string to preprocessor, # 2. pipe the now tokenized text (one word per line) to the lookup application # (which has some flags set, and which accesses sme.fst) # 3. The output of lookup is assigned as the value of $result my $result; my %answer; my %candits; my $coloring = "./color_d.pl sme"; my $coloring_a = "./color_a.pl"; if ($action eq "generate") { $result = `echo $text | $generate_norm`; } elsif ($action eq "paradigm") { $result = generate_paradigm($text, $pos, \%answer, \%candits); } elsif ($action eq "disamb") { if ($translate) { $result = `echo $text | $disamb | $translate | $coloring`; } else { $result = `echo "$text" | $disamb | $coloring`; } } elsif ($action eq "dependency") { if ($translate) { $result = `echo $text | $dependency | $translate | $coloring`; } else { $result = `echo "$text" | $dependency | $coloring`; } } elsif ($action eq "analyze") { $result = `echo $text | $analyze | $remove_weight | $coloring_a`; } elsif ($action eq "hyphenate") { $result = `echo $text | $hyphenate`; } elsif ($action eq "transcribe") { $result = `echo $text | $transcribe`; } elsif ($action eq "convert") { $result = `echo $text | $convert`; } elsif ($action eq "lat2syll") { $result = `echo $text | $lat2syll`; } elsif ($action eq "syll2lat") { $result = `echo $text | $syll2lat`; } elsif ($action eq "placenames") { $result = `echo $text | $placenames | $remove_weight`; } else { if (!$xml_out) { print "
No action given
"; } else { print "