package langTools::DOCConverter;

use langTools::Preconverter;
our @ISA = ("langTools::Preconverter");

# Preconverter subclass that runs a document through `antiword -x db`
# and then through the docbook2corpus2.xsl stylesheet with xsltproc.

# Constructor.
#   $class    - class to bless the new object into
#   $filename - handed unchanged to the parent constructor
#   $test     - handed unchanged to the parent constructor
# Stores the stylesheet path in $self->{_converter_xsl}; it is built from
# $self->{_corpus_script}, which is presumably set by the parent
# constructor (not visible here).
sub new {
    my ( $class, $filename, $test ) = @_;

    my $self = $class->SUPER::new( $filename, $test );
    $self->{_converter_xsl} = $self->{_corpus_script} . "/docbook2corpus2.xsl";

    bless $self, $class;
    return $self;
}

# Returns the path of the XSL stylesheet recorded by new().
sub getXsl {
    my ($self) = @_;
    return $self->{_converter_xsl};
}

# Runs the two-step conversion:
#   1. antiword -x db <getOrig()>            > <gettmp2()>
#   2. xsltproc <getXsl()> <gettmp2()>       > <gettmp1()>
# and, when both commands succeed, calls clean_doc().
#
# Returns 0 on success and 1 as soon as exec_com() reports a failure
# (exec_com() is treated as "true means the command failed").
sub convert2intermediate {
    my ($self) = @_;

    # Quote a path for the shell with single quotes. Inside double quotes the
    # shell still expands '$', backticks and backslashes, and a '"' in a file
    # name would end the quoting; single quotes pass everything literally.
    # An embedded single quote is written as '\'' (close, escaped quote,
    # reopen).
    my $shell_quote = sub {
        my ($path) = @_;
        $path =~ s/'/'\\''/g;
        return "'" . $path . "'";
    };

    # Step 1: original document -> second temporary file.
    my $antiword_cmd =
        "antiword -x db "
      . $shell_quote->( $self->getOrig() ) . " > "
      . $shell_quote->( $self->gettmp2() );
    return 1 if $self->exec_com($antiword_cmd);

    # Step 2: apply the stylesheet, writing the first temporary file.
    my $xsltproc_cmd =
        "xsltproc "
      . $shell_quote->( $self->getXsl() ) . " "
      . $shell_quote->( $self->gettmp2() ) . " > "
      . $shell_quote->( $self->gettmp1() );
    return 1 if $self->exec_com($xsltproc_cmd);

    # Both commands succeeded; post-process the result.
    $self->clean_doc();
    return 0;
}

sub clean_doc { my ($self) = @_; my %replacements = ( "ö" => "ö", # this has to be here because ¶ otherwise is converted to <\/p>
"ð" => "đ", "¶" => "<\/p>
",
"fi" => "fi",
"fl" => "fl",
"ff" => "ff",
"ffi" => "ffi",
"ffl" => "ffl",
"ſt" => "ft",
);
open( FH, "<:encoding(utf8)", $self->gettmp1() )
or die "Cannot open " . $self->gettmp1() . "$!";
my @file =