package langTools::Parallelize;

use strict;
use warnings;
use utf8;

use Carp qw(cluck croak);
use File::Path ();
use File::Spec ();
use XML::Twig;

# Sentence-aligns an XML corpus document with its parallel counterpart in
# another language and writes the aligned sentence pairs as a TMX file.
#
# Pipeline (see parallelize_text):
#   1. locate the parallel document named in the <header> of the input file,
#   2. let the external corpus analyser split both documents into sentences,
#   3. let the external tca2.sh aligner align the two sentence files,
#   4. read the aligner output and print it as TMX.

# Constructor.
#
# Optional named arguments (calling new() without arguments is still valid):
#   lang1           - language code of the source document
#   lang2           - language code of the wanted parallel document
#   anchor_filename - anchor file handed to tca2.sh
#   outdir          - directory for intermediate files
#   corpus_analyze  - path to the sentence-splitting executable
#
# NOTE(review): the original code referred to $lang1, $lang2, $outdir,
# $corpus_analyze and $anchor_file as undeclared variables; they are now
# object state. The temp-dir default for outdir follows the original
# comment "The output goes to tmp" - confirm the intended directory.
sub new {
    my ( $class, %args ) = @_;

    my $self = {
        _lang1           => $args{lang1},
        _lang2           => $args{lang2},
        _lang1_filename  => undef,
        _lang2_filename  => undef,
        _anchor_filename => $args{anchor_filename},
        _outdir          => defined $args{outdir}
        ? $args{outdir}
        : File::Spec->tmpdir,
        _corpus_analyze => $args{corpus_analyze},
    };

    bless( $self, $class );
    return $self;
}

# The file and its parallel counterpart are split into sentences,
# aligned and written out as TMX.
#
# Parameters:
#   $file - path to the XML document; when omitted, $_ is used instead.
# Returns nothing. Dies when more than one parallel file is listed or when
# an external tool fails; croaks when the languages are not configured.
sub parallelize_text {
    my ( $self, $file ) = @_;

    # Keep the original fallback to the topic variable, but only when no
    # explicit argument was given.
    $file = $_ if !defined $file || $file eq '';
    croak "parallelize_text: no file name given"
        if !defined $file || $file eq '';

    my ( $lang1, $lang2 ) = @{$self}{qw(_lang1 _lang2)};
    croak "parallelize_text: lang1 and lang2 must be set"
        if !defined $lang1 || !defined $lang2;

    my $document = XML::Twig->new;
    if ( !$document->safe_parsefile($file) ) {
        cluck "parsing the XML-file failed.\n";
        return;
    }

    # Find the parallel files for the document.
    my $location = $self->find_parallel_location($document);
    if ( !$location ) {
        print "No parallel texts found for language $lang2.\n";
        return;
    }

    $file = File::Spec->rel2abs($file);
    my @full_paths =
        $self->find_parallel_paths( $file, $lang1, $lang2, $location );

    if ( @full_paths > 1 ) {
        die "Cannot process more than one parallel file\n";
    }

    my $pfile = $full_paths[0];
    $self->{_lang1_filename} = $file;
    $self->{_lang2_filename} = $pfile;

    # Prepare files for further processing by adding sentence tags and
    # sentence ids. The output goes to the configured output directory.
    my $lang1_infile = $self->divide_p_into_sentences( $file,  $lang1 );
    my $lang2_infile = $self->divide_p_into_sentences( $pfile, $lang2 );

    $self->parallelize_files( $self->{_anchor_filename},
        $lang1_infile, $lang2_infile );
    $self->make_tmx( $file, $pfile, $lang1, $lang2 );

    return;
}

# Looks up the 'location' attribute of the first header/parallel_text
# element whose xml:lang equals the configured second language.
#
# Parameters:
#   $twig - a parsed XML::Twig document.
# Returns the location string, or undef when there is no header, no
# matching element, or no second language configured.
sub find_parallel_location {
    my ( $self, $twig ) = @_;

    my $lang2 = $self->{_lang2};
    return if !defined $lang2;

    my $root = $twig->root;
    return if !$root;

    my $header = $root->first_child('header');
    return if !$header;

    for my $p ( $header->children('parallel_text') ) {
        my $plang = $p->att('xml:lang');
        next if !defined $plang || $plang ne $lang2;
        return $p->att('location');
    }

    return;
}

# Builds the paths of the parallel files.
#
# Parameters:
#   $file     - path of the original document
#   $lang1    - language code occurring in the original path
#   $lang2    - language code to substitute for it
#   $location - comma separated list of parallel file names
# Returns a list with one path per name in $location; '.xml' is appended
# to names that do not already end in it.
sub find_parallel_paths {
    my ( $self, $file, $lang1, $lang2, $location ) = @_;

    # Directory of the original ...
    ( my $path = $file ) =~ s{(.*)[/\\].*}{$1};

    # ... and the corresponding directory of the parallel files. The
    # language code is quoted so that it is matched literally, and the
    # pattern is recompiled on every call (the former /o froze the first
    # language pair for the lifetime of the process).
    ( my $para_path = $path ) =~ s/\Q$lang1\E/$lang2/;

    my @full_paths;
    for my $name ( split( ",", $location ) ) {
        my $p = $para_path . "/" . $name;
        $p .= ".xml" if $p !~ /\.xml\z/;
        push( @full_paths, $p );
    }

    return @full_paths;
}

# Runs the external corpus analyser to add sentence markup to a document.
#
# Parameters:
#   $file - path of the document to process
#   $lang - language code of the document
# Returns the path of the generated '<base><lang>.sent.xml' file in the
# output directory. Dies when the analyser cannot be run or fails.
sub divide_p_into_sentences {
    my ( $self, $file, $lang ) = @_;

    my $corpus_analyze = $self->{_corpus_analyze};
    croak "divide_p_into_sentences: corpus_analyze is not configured"
        if !defined $corpus_analyze;

    my $outfile =
          $self->{_outdir} . "/"
        . $self->calculate_base($file)
        . $lang
        . ".sent.xml";

    # List form of system(): no shell is involved, so file names with
    # spaces or shell metacharacters are passed through unharmed.
    my @command = (
        $corpus_analyze,        "--all",
        "--output=$outfile",    "--only_add_sentences",
        "--lang=$lang",         $file,
    );
    print STDERR "$0: @command\n";

    if ( system(@command) != 0 ) {
        die "errors in @command: " . _system_failure( $?, $! ) . "\n";
    }

    return $outfile;
}

# Runs the external aligner 'tca2.sh' on two sentence-split files.
#
# Parameters:
#   $anchor_file - anchor file for the aligner
#   $infile1     - sentence file of the first language
#   $infile2     - sentence file of the second language
# Returns nothing. Dies when the aligner cannot be run or fails.
sub parallelize_files {
    my ( $self, $anchor_file, $infile1, $infile2 ) = @_;

    croak "parallelize_files: no anchor file given"
        if !defined $anchor_file;

    my @command = ( "tca2.sh", $anchor_file, $infile1, $infile2 );
    print STDERR "$0: @command\n";

    if ( system(@command) != 0 ) {
        die "errors in @command: " . _system_failure( $?, $! ) . "\n";
    }

    return;
}

# Turns the child status of a failed system() call into a readable text.
sub _system_failure {
    my ( $status, $os_error ) = @_;

    return "failed to execute: $os_error" if $status == -1;
    return sprintf( "died with signal %d", $status & 127 )
        if $status & 127;
    return sprintf( "exited with status %d", $status >> 8 );
}

# Reads the aligner output of both documents and writes it as one TMX
# file, pairing line N of the first file with line N of the second.
#
# Parameters:
#   $file, $pfile  - paths of the original and the parallel document
#   $lang1, $lang2 - their language codes
# Returns nothing.
sub make_tmx {
    my ( $self, $file, $pfile, $lang1, $lang2 ) = @_;

    my @f1_data =
        $self->read_tca2_output( $self->calculate_base($file), $lang1 );
    my @f2_data =
        $self->read_tca2_output( $self->calculate_base($pfile), $lang2 );

    my $body = XML::Twig::Elt->new("body");
    $body->set_pretty_print('indented');

    for my $i ( 0 .. $#f1_data ) {
        my $tu_elt = XML::Twig::Elt->new("tu");

        $self->make_tuv( $f1_data[$i], $lang1 )
            ->paste( 'last_child', $tu_elt );
        $self->make_tuv( $f2_data[$i], $lang2 )
            ->paste( 'last_child', $tu_elt );

        $tu_elt->paste( 'last_child', $body );
    }

    $self->print_tmx_file( $body, $self->calculate_base($file),
        $lang1, $lang2 );

    return;
}

# Returns the file name without directory part and without a trailing
# '.xml' suffix, e.g. '/a/b/doc.xml' gives 'doc'.
sub calculate_base {
    my ( $self, $file ) = @_;

    ( my $base = $file ) =~ s{.*[/\\]}{};
    $base =~ s{\.xml\z}{};

    return $base;
}

# Reads '<outdir>/<base><lang>.sent_new.txt' as UTF-8.
#
# Parameters:
#   $base - base name as returned by calculate_base
#   $lang - language code
# Returns the lines of the file (line endings included). Dies when the
# file cannot be opened.
sub read_tca2_output {
    my ( $self, $base, $lang ) = @_;

    my $path = $self->{_outdir} . "/" . $base . $lang . ".sent_new.txt";

    open( my $fh1, "<:encoding(utf8)", $path )
        or die "Could not open file $path: $!\n";
    my @data = <$fh1>;
    close($fh1);

    return @data;
}

# Builds a <tuv xml:lang="..."> element holding one <seg> with the
# sentence text, after the sentence markup has been stripped.
#
# Parameters:
#   $sentence - one line of aligner output (undef is treated as empty)
#   $lang     - language code for the xml:lang attribute
# Returns the new XML::Twig::Elt.
sub make_tuv {
    my ( $self, $sentence, $lang ) = @_;

    # The second file may have fewer lines than the first one.
    $sentence = '' if !defined $sentence;

    my $tuv_elt = XML::Twig::Elt->new("tuv");
    $tuv_elt->set_att( 'xml:lang', $lang );

    # NOTE(review): the opening-tag pattern was empty in the source
    # (s///g), presumably lost in transit; it is reconstructed here as
    # "any opening <s ...> tag" to mirror the </s> removal - confirm.
    $sentence =~ s/<s\b[^>]*>//g;
    $sentence =~ s/<\/s>//g;

    my $seg_elt = XML::Twig::Elt->new( "seg", $sentence );
    $seg_elt->paste( 'last_child', $tuv_elt );

    return $tuv_elt;
}

# Writes header, body and closing tag to
# $GTFREE/prestable/tmx/<lang1><lang2>/<base>.tmx, creating the directory
# when needed.
#
# Parameters:
#   $body          - the <body> element to print
#   $base          - base name of the output file
#   $lang1, $lang2 - language codes, used for directory and header
# Returns nothing. Dies when GTFREE is unset or the file cannot be written.
sub print_tmx_file {
    my ( $self, $body, $base, $lang1, $lang2 ) = @_;

    croak "print_tmx_file: environment variable GTFREE is not set"
        if !defined $ENV{'GTFREE'};

    my $dir = $ENV{'GTFREE'} . "/prestable/tmx/" . $lang1 . $lang2;
    if ( !-e $dir ) {
        File::Path::mkpath($dir);
    }

    my $path = $dir . "/" . $base . ".tmx";
    open( my $FH1, ">:encoding(utf8)", $path )
        or die "Could not open file $path for writing: $!\n";

    $self->print_tmx_header( $FH1, $lang1 );
    $body->print($FH1);

    # NOTE(review): this literal was empty in the source; the closing tag
    # is reconstructed to match the reconstructed header - confirm.
    print $FH1 qq|</tmx>|, "\n";

    # Buffered write errors only surface on close.
    close($FH1) or die "Could not close file $path: $!\n";

    return;
}

# Prints the XML declaration, the opening <tmx> tag and the TMX <header>.
#
# Parameters:
#   $FH1  - output file handle
#   $lang - source language, written to the srclang attribute
# Returns nothing.
#
# NOTE(review): all four literals were empty in the source, presumably
# because the markup was stripped in transit. They are reconstructed as a
# minimal header; the attribute values are assumptions to be checked
# against the project's real TMX files.
sub print_tmx_header {
    my ( $self, $FH1, $lang ) = @_;

    print $FH1 qq|<?xml version="1.0" encoding="utf-8"?>|, "\n";
    print $FH1 qq|<tmx version="1.4">|, "\n";
    print $FH1
        qq|<header creationtool="langTools::Parallelize" creationtoolversion="1.0" segtype="sentence" o-tmf="tca2" adminlang="en-US" srclang="$lang" datatype="plaintext">|,
        "\n";
    print $FH1 qq|</header>|, "\n";

    return;
}

1;