#!/usr/bin/perl
########################################################################
#
#sme-lookup.pl
#
# resides: Web Folder:cgi-bin:smi:sme-lookup.cgi
#
# called from FORM on skjema.html page; output HTML
#
# Original written by Ken Beesley, Xerox, for Aymara.
# reviewed and modified 12 april 2002, Trond Trosterud
#
########################################################################
# this CGI script is called whenever a user submits an analysis request
# from the FORM on the Northern Sami HTML page
# For manual testing, see comments at the bottom of the file
# the output of this script (output using the normal Perl 'print' commands)
# is an HTML file, sent back to the user's browser for display. (How it
# gets back to the user's browser is a mystery to me--the web server must
# take care of this.)
# System-Specific directories
# The directory where utilities like 'lookup' are stored
$utilitydir = "/opt/xerox/bin" ;
# The directory where iclock-sme.fst is stored
$smefstdir = "/opt/smi/sme/bin" ;
&printinitialhtmlcodes ; # see the subroutine below
# prints out the usual HTML header info
$wordlimit = 50 ; # adjust as appropriate; prevent large-scale (ab)use
# GET THE INPUT
# The data arrives by GET method (from the HTML FORM
# in the HTML input page being viewed by the user)
# The format of the data sent to the CGI script in GET mode is a string
# separated into "fields". Each field represents the data from one GUI
# widget in the HTML FORM. (Compare the FORM to the fields listed
# below.) Fields are separated with an ampersand.
# In the present example, the FORM contain one GUI widgets, transmitted
# to the CGI script in a field:
# A string of characters, typed into an HTML TEXTAREA widget. The FORM
# specifies that these characters are to be labeled as "text". So
# if the user types "utanaka utamankapxarakiwa" in the TEXTAREA, it
# will be transmitted to the CGI script as the field
# text=utanaka+utamankapxarakiwa
# Note that the space typed by the user will be replaced by a plus sign
# for transmission.
# Compare these descriptions against the FORM in the HTML page to see
# where these fields (and their labels and values) are coming from.
# the data is encoded for transmission
# 1. any spaces in the original user input are replaced with plus signs
# 2. other special characters are encoded (see below for decoding steps)
# with the GET method, the input is available in the environment variable
# QUERY_STRING, with fields separated by ampersands, e.g.
# text=word+word+word
@query = $ENV{'QUERY_STRING'} ;
# the input field holds the text itself (word or words)
# in the format
# text=word1+word2+word3+word4...
# (literal spaces typed by the user were replaced with plus signs for
# transmission)
($name, $text) = split(/\=/, shift(@query)) ; # try to get only one field...
#($name, $text) = split(/\=/, shift(@queryfield)) ; # original...
if ($name ne "text") {
print "Error: Expected text in QUERY_STRING\n" ;
}
# special characters in the text (e.g. literal ampersands, plus signs and equal signs
# typed by the user) must be encoded for transmission, to prevent confusion with
# the delimiters used by CGI); here is the magic formula to undo the CGI encodings
$text =~ s/%(..)/pack("c",hex($1))/ge ;
# change the plus signs back to the original spaces typed by the user
$text =~ s/\+/ /g ;
#Removing the unsecure characters from the input.
$text =~ s/[;<>\*\|`&\$!#\(\)\[\]\{\}'"]/ /g;
# make space before punctuation
$text =~ s/\?/ \?/g ;
$text =~ s/\./ \./g ;
$text =~ s/\,/ \,/g ;
$text =~ s/^\s+// ; # chop any whitespace off the front
$text =~ s/\s+$// ; # chop any whitespace off the back
$text =~ s/\s+/\ /g ; # squeeze any multiple whitespaces into one
# split the text into words crudely on spaces
@words = split(/\s+/, $text) ;
# Limit the input to a certain number of words (specified in variable $wordlimit
# set above)
if (@words > $wordlimit) {
$upperindex = $wordlimit - 1 ;
@words = @words[0..$upperindex] ;
}
# make a check to see if there are any words at all
if (@words == 0) {
print "\n
\nNo words received.\n" ;
&printfinalhtmlcodes ;
return "No Words Received" ;
}
# if we reach here, then the user did indeed one or more words;
# join the words back into a single string
# each remaining word now separated by spaces
$allwords = join(" ", @words) ;
# The morphological analysis will be done using the 'lookup' utility,
# which takes a tokenized "file" as input (i.e. one word to a line)
# In Perl, backquoted expressions are sent to be performed by the native
# operating system, here UNIX, and the text result is returned, e.g.
# $date = `date` ;
# would call the Unix utility 'data' and assign the answer, e.g. a string like
# Thu Mar 21 16:37:10 MET 2002
# as the value of the Perl variable $data
# the same backquoting trick will be used to lookup the input words in
# using the 'lookup' utility, which will access the aymara.fst transducer
# we will take the string of space-separated input words in the Perl variable
# $allwords (computed above), pipe them to a very simple tokenizer that puts
# one word on each line (i.e. inserts a newline character between words), and
# then pipe that tokenized "file" to the 'lookup' utility
# And here is where the actual lookup gets done:
# ###############################################
# 1. echo the string $allwords via a pipe to tr, which replaces spaces with newlines
# 2. pipe the now tokenized text (one word per line) to the lookup application
# (which has some flags set, and which accesses iclock-sme.fst)
# 3. The output of lookup is assigned as the value of $result
$result = `echo $allwords | tr " " "\n" | \
$utilitydir/lookup -flags mbL\" => \"LTT -d -utf8 $smefstdir/iclock-sme.fst` ;
# $utilitydir/lookup -flags mbL" => "LTT -d $smefstdir/iclock-sme.fst` ;
# testing line two here, lauri's advice.
#back with line one
# ***** Now we need to parse the $result string to output the information as HTML ***
# This information will be directed automatically back to the user's browser for display
# first split the $result into solutiongroups (one solutiongroup for each input word)
# given the way that 'lookup' formats its results, solutiongroups are separated by
# two newline characters
@solutiongroups = split(/\n\n/, $result) ;
# the following is basically a loop over the original input words, now
# associated with their solutions
foreach $solutiongroup (@solutiongroups) {
print "\n
\n\n" ; # Include some Copyright notices print "