#!/usr/bin/perl -w

# Debugging
#use CGI::Debug;
#use lib '/home/saara/gt/script';

use strict;
use utf8;

use HTML::Entities;
use Unicode::String qw(utf8 latin1);
use XML::Twig;
use CGI::Minimal;
#use CGI qw/:standard :html3 *table *dl/;
#$CGI::DISABLE_UPLOADS = 0;
# limit posts to 1 meg max
#$CGI::POST_MAX = 1_024 * 1_024;
use CGI::Alert ('saara', 'http_die');

# Use project's utility functions.
use langTools::Util;
use langTools::XMLStruct;

# Configuration: variable definitions etc.
require "conf.pl";

########################################################################
#
#smi-cg.cgi
#
# resides: Web Folder:cgi-bin:smi:smi.cgi
#
# called from HTML; output HTML
#
# Original written by Ken Beesley, Xerox, for Aymara.
# reviewed and modified 12 april 2002, Trond Trosterud
# reviewed and modified 2006,2007 Saara Huhmarniemi
#
# $Id$
########################################################################

# This CGI script is called whenever a user submits an analysis request
# from the FORM on the different Sami HTML pages.
# The request parameters are read with CGI::Minimal; unless XML output
# was requested, the HTML page is built with XML::Twig.

# Variables retrieved from the query.
our ($text,$pos,$charset,$lang,$plang,$xml_in,$xml_out,$action,$mode,$tr_lang);

# Variable definitions, included in smi.cgi
our ($wordlimit,$utilitydir,$bindir,$paradigmfile,%paradigmfiles,$tmpfile,$tagfile,$langfile,$logfile,$div_file);
our ($preprocess,$analyze,$disamb,$gen_lookup,$gen_norm_lookup,$generate,$generate_norm,$hyphenate,$transcribe,$convert,%avail_pos, %lang_actions, $translate);
our ($uit_href,$giellatekno_href,$projectlogo,$unilogo);

# Feed $input, followed by a newline, to a shell pipeline built from
# @stages (each stage is a shell command string) and return everything
# the pipeline prints.
#
# The input is single-quoted so the shell never globs, splits or
# otherwise interprets it, and it is emitted with printf '%s\n' instead
# of echo so that a leading "-" or a backslash sequence is passed
# through literally.
sub _pipe_text {
    my ($input, @stages) = @_;

    # Close the quote, emit an escaped quote, reopen the quote.
    (my $quoted = $input) =~ s/'/'\\''/g;

    my $command = join ' | ', "printf '%s\\n' '$quoted'", @stages;
    my $output  = `$command`;
    return $output;
}

##### GET THE INPUT #####

my $query = CGI::Minimal->new;

$text    = $query->param('text');       # The text to be analysed
$pos     = $query->param('pos');
$charset = $query->param('charset');
$lang    = $query->param('lang');
$plang   = $query->param('plang');

# Action selects the processing below: generate, paradigm, disamb,
# analyze, hyphenate, transcribe or convert.
$action = $query->param('action');

# Paradigm mode: minimal, standard, full, full with dialectal variation
$mode = $query->param('mode');

# The language for lemma translation in disambiguation.
$tr_lang = $query->param('translate');
$tr_lang ||= "none";

# Input and output can be xml.
$xml_in  = $query->param('xml_in');
$xml_out = $query->param('xml_out');

if (!$lang) {
    http_die('--no-alert', '400 Bad Request', "lang parameter missing.\n");
}
# Test for "defined and non-empty" rather than truth, so that the
# single word "0" is still accepted as input.
if (!defined $text || $text eq '') {
    http_die('--no-alert', '400 Bad Request', "No text given.\n");
}
if (!$action) {
    http_die('--no-alert', '400 Bad Request', "No action given.\n");
}

##### INITIALIZE ####
init_variables();

# temporary files
#open (FH, ">$tmpfile");
#open (LFH, ">>$logfile");

my @candidates;
my $document;
my $page;
my $form_action="http://sami-cgi-bin.uit.no/cgi-bin/smi/smi.cgi";
my $body;
my $giellatekno_logo;

# Initialize HTML-page
if (!$xml_out) {

    # Parse language file.
    $document = XML::Twig->new(keep_encoding => 1);
    if (!$document->safe_parsefile("$langfile")) {
        # Nothing has been sent to the client yet, so the failure has to
        # go out as a complete HTTP response, not as a bare print.
        http_die('--no-alert', '500 Internal Server Error',
                 "parsing the XML-file failed: $@\n");
    }
    $page = $document->root;

    $body = XML::Twig::Elt->new("body");
    $body->set_pretty_print('record');
    $body->set_empty_tag_style ('expand');

    # Breadcrumb links at the top of the page. The variable is not
    # called $a, which would shadow the sort variable in this scope.
    my $link = XML::Twig::Elt->new(a=>{href=>$uit_href},'The University of Tromsø >');
    $link->paste('last_child',$body);

    $link = XML::Twig::Elt->new(a=>{href=>$giellatekno_href},'Giellatekno >');
    $link->paste('last_child',$body);

    my $br = XML::Twig::Elt->new('br');
    $br->paste('last_child', $body);

    # Project logo wrapped in a link to the project page.
    $giellatekno_logo = XML::Twig::Elt->new(a=>{href=>$giellatekno_href});
    my $img= XML::Twig::Elt->new(img=>{src=>$projectlogo, style=>'border: none;', title=>'Giellatekno'});
    $img->paste('last_child',$giellatekno_logo);

    printinitialhtmlcodes($action, $page, $body);
}

# Process input XML
if ($xml_in) {
    if ($action eq "analyze" || $action eq "disamb" || $action eq "hyphenate" || $action eq "transcribe" || $action eq "convert" ) {
        $text = xml2preprocess($text);
    }
    if ($action eq "generate" || $action eq "paradigm") {
        $text = xml2words($text);
    }
}

# The charset parameter is optional; only compare it when it was sent.
if (defined $charset && $charset eq "latin1") {
    $text = Unicode::String::latin1( $text);
}

# Convert html-entity to unicode
decode_entities( $text );

#print LFH "PARAM $action, $lang, $plang";
#if ($action eq "paradigm") { print LFH "$pos"; }
#print LFH "\n$text\n";

# Special characters in the text (e.g. literal ampersands, plus signs
# and equal signs typed by the user) must be encoded for transmission,
# to prevent confusion with the delimiters used by CGI; undo that
# encoding here. Only real %XX hex escapes are decoded; anything else
# after a percent sign is left as typed.
$text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;

# Convert digraphs to utf-8
$text = digr_utf8($text);

# Remove the unsecure characters from the input.
$text =~ s/[;<>\*\|`&\$!\#\(\)\[\]\{\}:'"]/ /g;

# Control bytes that the whitespace split below does not consume
# (NUL in particular) must not reach the shell command line either.
$text =~ tr/\x00-\x08\x0e-\x1f\x7f/ /;

# Change linebreaks to space and check the word limit.
# split ' ' skips leading whitespace, so no empty first word is counted.
my @words = split(' ', $text);
$text = join(' ', splice(@words,0,$wordlimit));
if (@words && !$xml_out) {
    printwordlimit();
}

# And here is where the actual lookup gets done:
# ###############################################
# 1. send the input string to the preprocessor,
# 2. pipe the now tokenized text (one word per line) to the lookup application
#    (which has some flags set, and which accesses sme.fst)
# 3. The output of lookup is assigned as the value of $result

my $result;
my %answer;
my %candits;

if ($action eq "generate") {
    $result = _pipe_text($text, $generate_norm);
}
elsif ($action eq "paradigm") {
    $result = generate_paradigm($text, $pos, \%answer, \%candits);
}
elsif ($action eq "disamb") {
    if ($translate) {
        $result = _pipe_text($text, $disamb, $translate);
    }
    else {
        $result = _pipe_text($text, $disamb);
    }
}
elsif ($action eq "analyze") {
    $result = _pipe_text($text, $analyze);
}
elsif ($action eq "hyphenate") {
    $result = _pipe_text($text, $hyphenate);
}
elsif ($action eq "transcribe") {
    $result = _pipe_text($text, $transcribe);
}
elsif ($action eq "convert") {
    $result = _pipe_text($text, $convert);
}
else {
    # None of the known actions matched.
    if (!$xml_out) {
        print "
No action given
"; } else { print "