package langTools::Converter; use langTools::AvvirXMLConverter; use langTools::BibleXMLConverter; use langTools::HTMLConverter; use langTools::ParatextConverter; use langTools::RTFConverter; use langTools::DOCConverter; use langTools::PlaintextConverter; use langTools::PDFConverter; use langTools::SVGConverter; use langTools::CorrectXMLConverter; use langTools::CantHandle; use langTools::Decode; use strict; use utf8; use warnings; use Carp qw(cluck carp); use XML::Twig; sub new { my ( $class, $filename, $test ) = @_; my $self = {}; $self->{_test} = $test; $self->{_bindir} = "$ENV{'GTHOME'}/gt/script"; $self->{_corpus_script} = $self->{_bindir} . "/corpus"; $self->{_common_xsl} = $self->{_corpus_script} . "/common.xsl"; $self->{_preproc_xsl} = $self->{_corpus_script} . "/preprocxsl.xsl"; $self->{_xsltemplate} = $self->{_corpus_script} . "/XSL-template.xsl"; bless $self, $class; unless ( $self->makePreconverter( $filename, $test ) ) { $self->getPreconverter(); $self->{_tmpfile_base} = $self->getPreconverter()->getTmpFilebase(); $self->{_orig_file} = $self->getPreconverter()->getOrig(); $self->{_tmpdir} = $self->getPreconverter()->getTmpDir(); $self->{_intermediate_xml} = $self->getPreconverter()->gettmp1(); } return $self; } sub makePreconverter { my ( $self, $filename, $test ) = @_; my $error = 0; my $abs_path = Cwd::abs_path($filename); $self->{_preconverter} = undef; if ( $abs_path =~ /Avvir/ && $abs_path =~ /\.xml$/ ) { $self->{_preconverter} = langTools::AvvirXMLConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.bible\.xml$/ ) { $self->{_preconverter} = langTools::BibleXMLConverter->new( $filename, $test ); } elsif ( $abs_path =~ /(\.html_id=\d*|\.html$|\.htm$|\.php\?id=\d*|\.php$|\.aspx$)/ ) { $self->{_preconverter} = langTools::HTMLConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.ptx$/ ) { $self->{_preconverter} = langTools::ParatextConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.rtf$/ ) { $self->{_preconverter} = langTools::RTFConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.doc$/i ) { $self->{_preconverter} = langTools::DOCConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.txt$/ ) { $self->{_preconverter} = langTools::PlaintextConverter->new( $filename, $test ); } elsif ( $abs_path =~ /(\.pdf$|\.ai$)/ ) { $self->{_preconverter} = langTools::PDFConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.svg$/ ) { $self->{_preconverter} = langTools::SVGConverter->new( $filename, $test ); } elsif ( $abs_path =~ /\.correct.xml$/ ) { $self->{_preconverter} = langTools::CorrectXMLConverter->new( $filename, $test ); } else { $self->{_preconverter} = langTools::CantHandle->new( $filename, $test ); } return $error; } sub getOrig { my ($self) = @_; return $self->{_orig_file}; } sub getInt { my ($self) = @_; # my $int = $self->getTmpDir() . "/" . $self->getTmpFilebase(); return $int . ".xml"; } sub getTmpFilebase { my ($self) = @_; return $self->{_tmpfile_base}; } sub getCommonXsl { my ($self) = @_; return $self->{_common_xsl}; } sub getPreprocXsl { my ($self) = @_; return $self->{_preproc_xsl}; } sub getTmpDir { my ($self) = @_; return $self->{_tmpdir}; } sub getMetadataXsl { my ($self) = @_; return $self->{_metadata_xsl}; } sub getIntermediateXml { my ($self) = @_; return $self->{_intermediate_xml}; } sub getXslTemplate { my ($self) = @_; return $self->{_xsltemplate}; } sub getPreconverter { my ($self) = @_; return $self->{_preconverter}; } sub getFinalName { my ($self) = @_; ( my $int = $self->getOrig() ) =~ s/\/orig\//\/converted\//; return $int . ".xml"; } sub get_debug { my ($self) = @_; return $self->{_test}; } sub get_logfile { my ($self) = @_; return $self->{_logfile}; } sub makeXslFile { my ($self) = @_; $self->{_metadata_xsl} = $self->getTmpDir() . "/" . $self->getTmpFilebase() . ".xsl"; my $command = undef; if ( !-f $self->getOrig() . ".xsl" ) { File::Copy::copy( $self->getXslTemplate(), $self->getOrig() . ".xsl" ); } my $protected = $self->getOrig(); $protected =~ s/\(/\\(/g; $protected =~ s/\)/\\)/g; $protected =~ s/\&/\\&/g; $command = "xsltproc --novalid --stringparam commonxsl " . $self->getCommonXsl() . " " . $self->getPreprocXsl() . " " . $protected . ".xsl" . " > " . $self->getMetadataXsl(); return $self->exec_com($command); } sub makeFinalDir { my ($self) = @_; if ( !-e File::Basename::dirname( $self->getFinalName() ) ) { File::Path::mkpath( File::Basename::dirname( $self->getFinalName() ) ); } return; } sub exec_com { my ( $self, $com ) = @_; if ( $self->get_debug() ) { print STDERR $self->getOrig() . " exec_com: $com\n"; } my $result = system($com); if ( $result != 0 ) { print STDERR "Errnr: $result. Errmsg: $com: $!\n"; } return $result; } sub convert2intermediatexml { my ($self) = @_; return $self->getPreconverter()->convert2intermediate(); } sub convert2xml { my ($self) = @_; my $command = "xsltproc --novalid \"" . $self->getMetadataXsl() . "\" \"" . $self->getIntermediateXml() . "\" > \"" . $self->getInt() . "\""; return $self->exec_com($command); } sub checkxml { my ($self) = @_; my $command = "xmllint --noout --dtdvalid file:///$ENV{'GTHOME'}/giella-core/schemas/corpus.dtd --postvalid --encode UTF-8 " . $self->getInt(); return $self->exec_com($command); } sub checklang { my ($self) = @_; # Check the main language, add whether it is missing. my $tmp = $self->getInt(); my $document = XML::Twig->new; if ( !$document->safe_parsefile("$tmp") ) { carp "ERROR parsing the XML-file «$tmp» failed "; return "1"; } my $root = $document->root; my $mainlang = $root->{'att'}->{'xml:lang'}; my $id = $root->{'att'}->{'id'}; if ( !$mainlang || $mainlang eq "unknown" ) { # Setting language by using the directory path is a better 'guess' for # documents lacking this piece of information $root->set_att( 'xml:lang', $self->getPreconverter()->getDoclang() ); } my $fh; open( $fh, ">:encoding(utf8)", $tmp ) or die "Cannot open $tmp $!"; $document->set_pretty_print('indented'); $document->print($fh); close $fh; return 0; } sub getOCRFromXsl { my ($self) = @_; my $retval = "ERROR"; my $document = XML::Twig->new; if ( $document->safe_parsefile( $self->getMetadataXsl() ) ) { my $root = $document->root; my $ocr_elt = $root->first_child('xsl:variable[@name="ocr"]'); if ($ocr_elt) { $retval = $ocr_elt->{'att'}{'select'}; } else { $retval = "''"; } } return $retval; } sub getEncodingFromXsl { my ($self) = @_; my $document = XML::Twig->new; if ( $document->safe_parsefile( $self->getMetadataXsl() ) ) { my $root = $document->root; my $coding_elt = $root->first_child('xsl:variable[@name="text_encoding"]'); if ($coding_elt) { return $coding_elt->{'att'}{'select'}; } else { return ""; } } else { return "ERROR"; } } sub get_doc_text { my ($self) = @_; my $int = $self->getInt(); my $twig = XML::Twig->new(); if ( $twig->safe_parsefile($int) ) { my $root = $twig->root; my $body = $root->first_child('body'); my $text = $body->text(); return $text; } else { return 0; } } sub get_person_lastname { my ($self) = @_; my $int = $self->getInt(); my $twig = XML::Twig->new(); if ( $twig->safe_parsefile($int) ) { my $root = $twig->root; my $person = $root->first_child('header')->first_child('author')->first_child('person'); my $text = $person->{'att'}{'lastname'}; return $text; } else { return 0; } } sub get_title_text { my ($self) = @_; my $int = $self->getInt(); my $twig = XML::Twig->new(); if ( $twig->safe_parsefile($int) ) { my $root = $twig->root; my $title = $root->first_child('header')->first_child('title'); my $text = $title->text; return $text; } else { return 0; } } sub character_encoding { my ($self) = @_; #my ($file, $int, $no_decode_this_time) = @_; my $file = $self->getOrig(); my $int = $self->getInt(); my $test = $self->getPreconverter()->getTest(); my $language = $self->getPreconverter()->getDoclang(); my $error = 0; # Check if the file contains characters that are wrongly # utf-8 encoded and decode them. my $text = $self->get_doc_text(); my $encoding = $self->getEncodingFromXsl(); $encoding =~ s/'//g; my $fh; if ( $encoding eq "ERROR" ) { $error = 1; } else { if ( !$encoding ) { $encoding = &guess_body_encoding( $text ); } if ($test) { print "(character_encoding) Encoding is: $encoding\n"; } my $d; if ( $encoding ne $langTools::Decode::NO_ENCODING ) { $d = XML::Twig->new( twig_handlers => { 'p' => sub { call_decode_para( $self, @_, $encoding ); }, 'title' => sub { call_decode_title( $self, @_ ); }, 'person' => sub { call_decode_person( $self, @_ ); }, 'header' => sub { set_word_count($self, @_); }, } ); } else { $d = XML::Twig->new( twig_handlers => { 'title' => sub { call_decode_title( $self, @_ ); }, 'person' => sub { call_decode_person( $self, @_ ); }, 'header' => sub { set_word_count($self, @_); }, } ); } if ( !$d->safe_parsefile("$int") ) { carp "ERROR parsing the XML-file failed.\n"; $error = 1; } elsif ( !open( $fh, ">:encoding(utf8)", $int ) ) { carp "ERROR cannot open file"; $error = 2; } else { $d->set_pretty_print('indented'); $d->print($fh); close($fh); } } return $error; } # Decode false utf8-encoding for text paragraph. sub call_decode_para { my ( $self, $twig, $para, $coding ) = @_; # print "(call_decode_para) encoding $coding\n"; my $language = $self->getPreconverter()->getDoclang(); my $error = 0; $para = recursively_decode_text( $para, $language, $coding ); return $error; } sub recursively_decode_text { my ( $element, $language, $coding ) = @_; my @new_content; for my $child ( $element->children ) { if ( $child->tag eq "#PCDATA" ) { my $text = $child->text; &decode_para( \$text, $coding ); push( @new_content, $text ); } else { push( @new_content, recursively_decode_text( $child, $language, $coding ) ); } } $element->set_content(@new_content); return $element; } sub call_decode_person { my ( $self, $twig, $person, $coding ) = @_; my $text = $person->{'att'}{'lastname'}; $coding = &guess_person_encoding($text); &decode_para( \$text, $coding ); $person->atts->{'lastname'} = $text; } sub call_decode_title { my ( $self, $twig, $title, $coding ) = @_; my $text = $title->text; $coding = &guess_person_encoding($text); &decode_para( \$text, $coding ); my @new_content; push( @new_content, $text ); $title->set_content( @new_content ); } sub set_word_count { my ($self, $twig, $header) = @_; my $words = $self->get_total_wordcount(); my $wordcount = $header->first_child('wordcount'); if (!$wordcount) { $wordcount = XML::Twig::Elt->new('wordcount'); $wordcount->set_text($words); my @tags = ('collection', 'publChannel', 'place', 'year', 'translated_from', 'translator', 'author'); foreach my $tag (@tags) { my $found = $header->first_child($tag); if ($found) { $wordcount->paste('after', $found); last; } } } else { $wordcount->set_text($words); } } sub error_markup { my ($self) = @_; my $error = 0; my $int = $self->getInt(); my $document = XML::Twig->new( twig_handlers => { 'p' => sub { langTools::Corpus::add_error_markup(@_); } } ); if ( !$document->safe_parsefile("$int") ) { carp "ERROR parsing the XML-file failed. STOP\n"; return 1; } my $fh; if ( !open( $fh, ">:encoding(utf8)", $int ) ) { carp "ERROR cannot open file STOP"; return 1; } $document->set_pretty_print('indented'); $document->print($fh); close($fh); return $error; } sub move_int_to_converted { my ($self) = @_; $self->makeFinalDir(); if ( -f $self->getInt() ) { File::Copy::copy( $self->getInt(), $self->getFinalName() ); return $self->getFinalName(); } else { return; } } sub search_for_faulty_characters { my ( $self, $filename ) = @_; my $error = 0; my $lineno = 0; # The theory is that only the sami languages can be erroneously encoded ... if ( $self->getPreconverter->getDoclang() =~ /(sma|sme|smj)/ ) { my $fh; if ( !open( $fh, "<:encoding(utf8)", $filename ) ) { print "(Search for faulty) Cannot open: " . $filename . "\n"; $error = 1; } else { while (<$fh>) { $lineno++; if ( $_ =~ /(¤|Ä\?|Ď|ª|Ω|π|∏|Ã|Œ|α|ρ|λ|ν|χ|υ|τ|Δ|Λ|þ|±|¢|¿|˜)/ ) { print STDERR "In file " . $filename . " (base is " . $self->getOrig() . " )\n"; print STDERR "Faulty character at line: $lineno with line\n$_\n"; $error = 1; } } } close $fh; } return $error; } sub remove_temp_files { my ($self) = @_; unlink( $self->getInt() ); unlink( $self->getIntermediateXml() ); unlink( $self->getPreconverter()->gettmp2() ); unlink( $self->getMetadataXsl() ); return; } # Redirect STDERR to log files. sub redirect_stderr_to_log { my ($self) = @_; if ( !$self->get_debug() ) { $self->{_log_file} = $self->getPreconverter()->getTmpDir() . "/" . File::Basename::basename( $self->getOrig() ) . ".log"; open STDERR, '>', $self->{_log_file} or die "Can't redirect STDERR: $!"; } return; } sub text_categorization { my ($self) = @_; my $command = "$ENV{'GTHOME'}/tools/lang-guesser/text_cat.pl -q -x -d $ENV{'GTHOME'}/tools/lang-guesser/LM \"" . $self->getInt() . "\""; return $self->exec_com($command); } sub analyze_content { my ($self) = @_; if ( $self->getPreconverter()->getDoclang() =~ m/sm[aej]/ ) { my $wordcount = $self->get_mainlang_wordcount(); my $unknown_wordcount = $self->find_unknown_words(); # Avoid division by zero ... if ( $wordcount > 0 ) { # if wordcount is less than 100 words, improve the ratio ... if ( $wordcount < 100 ) { $wordcount *= 2; } my $ratio = $unknown_wordcount / $wordcount * 100; return $unknown_wordcount / $wordcount * 100; } else { return 0; } } else { return 0; } } sub get_mainlang_ratio { my ($self) = @_; my $total_wordcount = $self->get_total_wordcount(); my $mainlang_wordcount = $self->get_mainlang_wordcount(); if ( $total_wordcount > 0 ) { my $ratio = $mainlang_wordcount / $total_wordcount * 100; return $ratio; } else { return 0; } } sub find_unknown_words { my ($self) = @_; my $mainlang = $self->getPreconverter()->getDoclang(); my $preprocess; my $gthome = $ENV{'GTHOME'}; my $int = $self->getInt(); # Macs need ssed for the -r flag below my $sed; if ( `uname` eq "Darwin\n" ) { $sed = "ssed"; } else { $sed = "sed"; } if ( $mainlang eq "sme" || $mainlang eq "smj" ) { $preprocess = "preprocess --corr=$gthome/gt/$mainlang/bin/corr.txt --abbr=$gthome/gt/$mainlang/bin/abbr.txt"; } else { $preprocess = "preprocess --corr=$gthome/gt/$mainlang/bin/corr.txt"; } my $ukw = `ccat -l $mainlang -a $int | $preprocess | $sed -r -e 's_http:.*__' -e 's_\\w+\\.\\w+__' | sort -u | lookup -flags mbTT $gthome/gt/$mainlang/bin/$mainlang.fst | fgrep '+?' | wc -l`; if ( $self->{_test} ) { print "this is unknown words: " . $ukw . "\n"; print `ccat -l $mainlang -a $int | $preprocess | $sed -r -e 's_http:.*__' -e 's_\\w+\\.\\w+__' | sort -u | lookup -flags mbTT $gthome/gt/$mainlang/bin/$mainlang.fst | fgrep '+?'`; } return $ukw; } sub get_mainlang_wordcount { my ($self) = @_; my $mainlang = $self->getPreconverter()->getDoclang(); my $int = $self->getInt(); my $wc = `ccat -l $mainlang -a -S $int | wc -l`; if ( $self->{_test} ) { print "this is mainlang word count: " . $wc . "\n"; } return $wc; } sub get_total_wordcount { my ($self) = @_; my $int = $self->getInt(); my $wc = trim(`ccat -a -S $int | wc -l`); if ( $self->{_test} ) { print "this is total word count: " . $wc . "\n"; } return $wc; } sub do_parallels_exist { my ($self) = @_; my $document = XML::Twig->new; if ( $document->safe_parsefile( $self->getMetadataXsl() ) ) { my $root = $document->root; my $parallel_texts_elt = $root->first_child('xsl:variable[@name="parallel_texts"]'); if ( $parallel_texts_elt && $parallel_texts_elt->{'att'}{'select'} eq "'1'" ) { return $self->which_parallels_do_exist($root); } } return 0; } sub which_parallels_do_exist { my ( $self, $root ) = @_; my @langs = ( "dan", "eng", "fin", "fkv", "ger", "isl", "kal", "nno", "nob", "sma", "sme", "smj", "swe" ); for my $lang (@langs) { my $key = 'xsl:variable[@name="para_' . $lang . '"]'; my $para_elt = $root->first_child($key); my $para_doc = $para_elt->{'att'}{'select'}; $para_doc =~ s/'//g; if ( $para_doc ne "" ) { my $para = $self->getOrig(); my $mainlang = $self->getPreconverter()->getDoclang(); $para =~ s/\/$mainlang\//\/$lang\//; my $x = rindex( $para, "/" ); my $para_file = substr( $para, 0, rindex( $para, "/" ) + 1 ) . $para_doc; unless ( -e $para_file ) { print STDERR "Parallel file $para_file doesn't exist\n"; return 1; } } } return 0; } # remove whitespace from the start and end of the string sub trim($) { my $string = shift; $string =~ s/^\s+//; $string =~ s/\s+$//; return $string; } 1;