package langTools::HTMLConverter;

# Converter for HTML documents: tidies the original HTML into XHTML, runs it
# through the xhtml2corpus.xsl stylesheet and finally normalises a few
# quotation marks in the result.
#
# NOTE: this file deliberately does NOT "use utf8". The quote characters in
# clean_doc() are kept as raw UTF-8 byte strings and decoded explicitly with
# Encode::decode_utf8(); enabling "use utf8" would make that a double decode.

use strict;
use warnings;

use Encode ();
use langTools::Preconverter;

our @ISA = ("langTools::Preconverter");

# Constructor.
#
# Parameters:
#   $class    - class name to bless into
#   $filename - passed through unchanged to the parent constructor
#   $test     - passed through unchanged to the parent constructor
#
# Returns the new object, with _converter_xsl set to the xhtml2corpus.xsl
# stylesheet located under the object's _corpus_script directory.
sub new {
    my ( $class, $filename, $test ) = @_;

    my $self = $class->SUPER::new( $filename, $test );
    $self->{_converter_xsl} = $self->{_corpus_script} . "/xhtml2corpus.xsl";

    bless $self, $class;
    return $self;
}

# Returns the path of the XSL stylesheet used by convert2intermediate().
sub getXsl {
    my ($self) = @_;
    return $self->{_converter_xsl};
}

# Pre-process the original document with tidy.py (original -> tmp1), then run
# the result through tidy to produce XHTML (tmp1 -> tmp2).
#
# Returns whatever exec_com() returns for the tidy command.
sub tidyHTML {
    my ($self) = @_;

    # File names are quoted so that paths containing spaces survive the
    # shell, matching what convert2intermediate() does for xsltproc.
    my $preclean_command =
        "tidy.py \"" . $self->getOrig() . "\" \"" . $self->gettmp1() . "\"";

    # The result of this first step is intentionally not checked here (the
    # original code ignored it as well); only the tidy result is returned.
    $self->exec_com($preclean_command);

    my $tidy_command =
          "tidy -config \""
        . $self->{_bindir}
        . "/tidy-config.txt\" -utf8 -asxml -quiet \""
        . $self->gettmp1()
        . "\" > \""
        . $self->gettmp2() . "\"";

    return $self->exec_com($tidy_command);
}

# Clean the html.
# Convert the html to xml using xhtml2corpus.xsl.
# Clean the result a little, as well.
#
# Returns 0 on success and 1 if either the tidy step or the xsltproc step
# failed.
sub convert2intermediate {
    my ($self) = @_;

    # NOTE(review): 512 looks like a raw wait status for exit code 2
    # (2 << 8), i.e. tidy reporting errors rather than mere warnings. This
    # assumes exec_com() returns the unshifted status - confirm in the parent
    # class.
    return 1 if $self->tidyHTML() == 512;

    my $xslt_command =
          "xsltproc --novalid \""
        . $self->getXsl()
        . "\" \""
        . $self->gettmp2()
        . "\" > \""
        . $self->gettmp1() . "\"";

    # A true value from exec_com() is treated as failure.
    return 1 if $self->exec_com($xslt_command);

    $self->clean_doc();
    return 0;
}

# Rewrite the tmp1 file in place, replacing the quotation marks listed in
# %replacements on every line.
#
# Dies if the file cannot be opened for reading or writing, or if closing the
# written file fails.
sub clean_doc {
    my ($self) = @_;

    my %replacements = (
        "„" => "«",
        "“" => "»"
    );

    # The literals above are UTF-8 byte strings (no "use utf8" in this file),
    # while the file is read through an :encoding(utf8) layer and therefore
    # holds characters. Decode each pair once, up front, instead of once per
    # line.
    my @decoded_pairs =
        map {
            [
                Encode::decode_utf8($_),
                Encode::decode_utf8( $replacements{$_} )
            ]
        }
        keys %replacements;

    my $path = $self->gettmp1();

    open( my $in_fh, "<:encoding(utf8)", $path )
        or die "Cannot open $path: $!";
    my @lines = <$in_fh>;
    close($in_fh);

    open( my $out_fh, ">:encoding(utf8)", $path )
        or die "Cannot open $path: $!";

    foreach my $line (@lines) {
        foreach my $pair (@decoded_pairs) {
            my ( $from, $to ) = @{$pair};

            # \Q..\E: the search text is a literal string, not a pattern.
            $line =~ s/\Q$from\E/$to/g;
        }
        print {$out_fh} $line;
    }

    # Buffered write errors only surface when the handle is closed.
    close($out_fh) or die "Cannot close $path: $!";

    return;
}

1;