#!/usr/bin/perl -w use strict; # Importing CGI libraries use CGI qw/:standard/; #use CGI::Upload; $CGI::DISABLE_UPLOADS = 0; $CGI::POST_MAX = 1_024 * 1_024; # limit posts to 1 meg max # Forwarding warnings and fatal errors to browser window use CGI::Alert 'saara'; # Show custom text to remote viewer CGI::Alert::custom_browser_text << '-END-';

Error in uploading the file.

[MSG]

Our maintainers have been informed.

Send feedback and questions to corpus@giellatekno.uit.no

Upload more files

Divvun main page

-END- # File copying and xml-processing use XML::Twig; # Allowed mime-headers. # http://www.iana.org/assignments/media-types/ my %mime_types = ( "text/html" => "html", "application/msword" => "doc", "application/pdf" => "pdf", "text/plain" => "txt" ); # The first thing is to print some kind of html-code print "Content-TYPE: text/html; charset=utf-8\n\n" ; my $convert = "/usr/local/share/corp/bin/convert2xml.pl"; my $tmpdir = "/usr/local/share/corp/upload" ; # Define upload directory and mkdir it, if necessary my $upload_dir = "/usr/local/share/corp/upload"; mkdir ($upload_dir, 0755) unless -d $upload_dir; # Some securing operations. -sh $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin'; delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'}; # print new page for multiple file upload my $file_count = param("file_count"); if (! $file_count) { $file_count = 1; } if(param("print_multiple") && $file_count) { print_multiple_upload($file_count); exit; } # Getting the filename and contet type check. my @filehandle = upload('document'); #array of file handles if (! @filehandle ) { die "No file was selected for upload.\n" } for my $file (@filehandle) { my $filetype = uploadInfo($file)->{'Content-Type'}; if (! $mime_types{$filetype}) { die "Upload only msword, pdf and html-files.\n" } else { if ($file !~ m/\.(doc|pdf|html|txt|ptx)$/) { $file = $file . "." . $mime_types{$file}; } } } my $real_count = @filehandle; if ($real_count > $file_count || $real_count > 9) { die "Upload only up to 9 files at the time.\n" } # Message sent to the maintainer after corpus upload. my $message = "Files uploaded to $upload_dir.\n"; my $author1_ln=""; my $publisher=""; my $year=""; my $isbn=""; my $issn=""; my $title=""; my @fnames; my @mainlangs; my @license_types; # Calculate md5sums of files in orig/ dir my @md5sums = (`find /usr/local/share/corp/orig -type f -print0 | xargs -0 -n1 md5sum | sort --key=1,32 -u | cut -c 1-32`); my $i; for($i=0;$i<$file_count;$i++) { # Parsing the path away my $filename = $filehandle[$i]; # Parsing the path away $filename =~ s/.*[\/\\](.*)/$1/; # Replace space with underscore - c = complement the search list my $fname = $filename; $fname =~ tr/\.A-Za-z0-9ÁČĐŊŠŦŽÅÆØÄÖáčđŋšŧžåæøäö/_/c; # Generate a new file name # if there exists a file with the same name. $fname =~ s/.*[\/\\](.*)/$1/; my $j = 1; while (-e "$upload_dir/$fname") { my $tmp = $fname; $tmp =~ s/(\.(.*))$//; $fname = $tmp . "_" . $j . $1; $j++; } # Open file on the server and print uploaded file into it. open UPLOADFILE, ">$upload_dir/$fname" or die "Can't open file $upload_dir/$fname!"; while (<$filename>) { print UPLOADFILE; } close UPLOADFILE; my $md5 = (`md5sum $upload_dir/$fname | cut -c 1-32`); # Check that file doesn't already exist for my $j (@md5sums) { if ($j eq $md5) { # rm $upload_dir/$fname; # TODO: remove file, how to do it in Perl? if ($j>0) { mailit($message); } die "$filename: File already exists in our corpus base!"; } } my $mlang = "mainlang_" . $i; my $ltype = "license_type_" . $i; my $mainlang = param($mlang); my $license_type = param($ltype); push @mainlangs, $mainlang; push @license_types, $license_type; # Calling convert2xml -script with hardcoded execution path # The 'or die' part doesn't work, it dies everytime... my @args = ("$convert", "--lang=$mainlang", "--tmpdir=$tmpdir", "--noxsl", "--upload", "$upload_dir/$fname"); system (@args); # or die "$filename: Error in conversion"; # Only the first document is parsed and checked for xml-structure # to speed up processing. if($i==0) { my $doc = XML::Twig->new(twig_handlers => {'header/author/person' => sub { $author1_ln = $_->{'att'}->{'lastname'} }, 'header/publChannel/publisher' => sub { $publisher = $_->text }, 'header/publChannel/isbn' => sub { $isbn = $_->text }, 'header/publChannel/issn' => sub { $issn = $_->text }, 'header/year' => sub { $year = $_->text }, 'header/title' => sub { $title = $_->text }, }); if (! $doc->safe_parsefile ("$upload_dir/$fname.xml")) { print STDERR "$fname: ERROR parsing the XML-file failed.\n"; } } push @fnames, $fname; $message.= "Name: $fname\nlanguage: $mainlang\nlicense: $license_type\n" } # Print metainformation form. &print_metaform; ########## Subroutines from here on ###### # Print metainfomation form sub print_metaform { print < END_HEADER &write_js; if($real_count > 1) { print <File uploaded

File uploaded

Thank you for uploading the file @filehandle .

Please fill in the following form with the available information of the document. The fields submitter name and email are mandatory.

END_FIRST_PART } else { print <Files uploadeded

File uploaded

Thank you for uploading the files @filehandle .

Please fill in the following form with the available information of the documents. Notice that all the documents will recieve the same metainformation. The fields submitter name and email are mandatory.

END_FIRST_PART } print <The document information
Document title:
Author(s):
1. Author:
Firstname:
Lastname:
Gender: Male Female
Born:
Nationality:
3. Author:
Firstname:
Lastname:
Gender: Male Female
Born:
Nationality:
2. Author:
Firstname:
Lastname:
Gender: Male Female
Born:
Nationality:
4. Author:
Firstname:
Lastname:
Gender: Male Female
Born:
Nationality:
Publishing year:
Publishing place:
Publisher:
ISBN:
ISSN:
Collection:
Genre:
If the document is multilingual,
select the other languages:
North Sámi Julev Sámi South Sámi
Nynorsk Bokmål
Finnish Swedish English German other
Translated from:
Translator: Firstname:
Lastname:
Submitter: Name: star
Email: star
END_HTML } # send an email notification. sub mailit { my $message = shift @_; my $recipient="corpus\@giellatekno.uit.no"; my $sender="upload.cgi"; my $subject="File uploaded"; $message .= "\n---\nThis is automatic notification email from web upload script upload.cgi."; return; open(MAIL, "|/usr/lib/sendmail -t"); print MAIL "To: $recipient\n"; print MAIL "From: $sender\n"; print MAIL "Subject: $subject\n\n"; print MAIL "$message"; close (MAIL); } sub print_multiple_upload { my $file_c = shift @_; print < File uploaded

Fill in the filenames and languages

END_H my $i; for($i=0;$i<$file_c;$i++) { my $j=$i+1; print < File $j: Main language: License: END_FIELDS } print < END } sub write_js { print < // check form function checkWholeForm(theForm) { var why = ""; why += checkEmail(theForm.sub_email.value); why += isEmpty(theForm.sub_name.value); if (why != "") { alert(why); return false; } return true; } // email function checkEmail (strng) { var error=""; if (strng == "") { error = "You didn't enter an email address.\\n"; } var emailFilter=/^.+@.+\\..{2,3}\$\/; if (!(emailFilter.test(strng))) { error = "Please enter a valid email address.\\n"; } else { //test email for illegal characters var illegalChars= /[\\(\\)\\<\\>\\,\\;\\:\\"\\[\\]]/ if (strng.match(illegalChars)) { error = "The email address contains illegal characters.\\n"; } } return error; } // non-empty submitter function isEmpty(strng) { var error = ""; if (strng.length == 0) { error = "Please fill in the submitter name field.\\n" } return error; } END_OF_JS }