#!/usr/bin/perl -w use strict; # Importing CGI libraries use CGI qw/:standard/; #use CGI::Upload; $CGI::DISABLE_UPLOADS = 0; $CGI::POST_MAX = 1_024 * 1_024; # limit posts to 1 meg max # Forwarding warnings and fatal errors to browser window use CGI::Alert 'saara'; # Show custom text to remote viewer CGI::Alert::custom_browser_text << '-END-';
[MSG]
Our maintainers have been informed.
Send feedback and questions to corpus@giellatekno.uit.no
-END- # File copying and xml-processing use XML::Twig; # Allowed mime-headers. # http://www.iana.org/assignments/media-types/ my %mime_types = ( "text/html" => "html", "application/msword" => "doc", "application/pdf" => "pdf", "text/plain" => "txt" ); # The first thing is to print some kind of html-code print "Content-TYPE: text/html; charset=utf-8\n\n" ; my $convert = "/usr/local/share/corp/bin/convert2xml.pl"; my $tmpdir = "/usr/local/share/corp/upload" ; # Define upload directory and mkdir it, if necessary my $upload_dir = "/usr/local/share/corp/upload"; mkdir ($upload_dir, 0755) unless -d $upload_dir; # Some securing operations. -sh $ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin'; delete @ENV{'IFS', 'CDPATH', 'ENV', 'BASH_ENV'}; # print new page for multiple file upload my $file_count = param("file_count"); if (! $file_count) { $file_count = 1; } if(param("print_multiple") && $file_count) { print_multiple_upload($file_count); exit; } # Getting the filename and contet type check. my @filehandle = upload('document'); #array of file handles if (! @filehandle ) { die "No file was selected for upload.\n" } for my $file (@filehandle) { my $filetype = uploadInfo($file)->{'Content-Type'}; if (! $mime_types{$filetype}) { die "Upload only msword, pdf and html-files.\n" } else { if ($file !~ m/\.(doc|pdf|html|txt|ptx)$/) { $file = $file . "." . $mime_types{$file}; } } } my $real_count = @filehandle; if ($real_count > $file_count || $real_count > 9) { die "Upload only up to 9 files at the time.\n" } # Message sent to the maintainer after corpus upload. my $message = "Files uploaded to $upload_dir.\n"; my $author1_ln=""; my $publisher=""; my $year=""; my $isbn=""; my $issn=""; my $title=""; my @fnames; my @mainlangs; my @license_types; # Calculate md5sums of files in orig/ dir my @md5sums = (`find /usr/local/share/corp/orig -type f -print0 | xargs -0 -n1 md5sum | sort --key=1,32 -u | cut -c 1-32`); my $i; for($i=0;$i<$file_count;$i++) { # Parsing the path away my $filename = $filehandle[$i]; # Parsing the path away $filename =~ s/.*[\/\\](.*)/$1/; # Replace space with underscore - c = complement the search list my $fname = $filename; $fname =~ tr/\.A-Za-z0-9ÁČĐŊŠŦŽÅÆØÄÖáčđŋšŧžåæøäö/_/c; # Generate a new file name # if there exists a file with the same name. $fname =~ s/.*[\/\\](.*)/$1/; my $j = 1; while (-e "$upload_dir/$fname") { my $tmp = $fname; $tmp =~ s/(\.(.*))$//; $fname = $tmp . "_" . $j . $1; $j++; } # Open file on the server and print uploaded file into it. open UPLOADFILE, ">$upload_dir/$fname" or die "Can't open file $upload_dir/$fname!"; while (<$filename>) { print UPLOADFILE; } close UPLOADFILE; my $md5 = (`md5sum $upload_dir/$fname | cut -c 1-32`); # Check that file doesn't already exist for my $j (@md5sums) { if ($j eq $md5) { # rm $upload_dir/$fname; # TODO: remove file, how to do it in Perl? if ($j>0) { mailit($message); } die "$filename: File already exists in our corpus base!"; } } my $mlang = "mainlang_" . $i; my $ltype = "license_type_" . $i; my $mainlang = param($mlang); my $license_type = param($ltype); push @mainlangs, $mainlang; push @license_types, $license_type; # Calling convert2xml -script with hardcoded execution path # The 'or die' part doesn't work, it dies everytime... my @args = ("$convert", "--lang=$mainlang", "--tmpdir=$tmpdir", "--noxsl", "--upload", "$upload_dir/$fname"); system (@args); # or die "$filename: Error in conversion"; # Only the first document is parsed and checked for xml-structure # to speed up processing. if($i==0) { my $doc = XML::Twig->new(twig_handlers => {'header/author/person' => sub { $author1_ln = $_->{'att'}->{'lastname'} }, 'header/publChannel/publisher' => sub { $publisher = $_->text }, 'header/publChannel/isbn' => sub { $isbn = $_->text }, 'header/publChannel/issn' => sub { $issn = $_->text }, 'header/year' => sub { $year = $_->text }, 'header/title' => sub { $title = $_->text }, }); if (! $doc->safe_parsefile ("$upload_dir/$fname.xml")) { print STDERR "$fname: ERROR parsing the XML-file failed.\n"; } } push @fnames, $fname; $message.= "Name: $fname\nlanguage: $mainlang\nlicense: $license_type\n" } # Print metainformation form. &print_metaform; ########## Subroutines from here on ###### # Print metainfomation form sub print_metaform { print <Thank you for uploading the file @filehandle .
Please fill in the following form with the available information of the document. The fields submitter name and email are mandatory.
END_FIRST_PART } else { print <Thank you for uploading the files @filehandle .
Please fill in the following form with the available information of the documents. Notice that all the documents will recieve the same metainformation. The fields submitter name and email are mandatory.
END_FIRST_PART } print <