package langTools::Parallelize;

use strict;
use warnings;
use utf8;

use Carp qw(cluck);
use File::Path ();
use File::Spec;
use XML::Twig;

# Construct a new parallelizer object.
#
# Arguments:
#   $class - the class name (supplied by the method-call syntax).
#   %args  - optional name => value pairs. Each pair is stored in the
#            object under the key "_<name>", e.g. lang1 => 'sme' sets
#            $self->{_lang1}.
#
# The fields _lang1, _lang2, _lang1_filename, _lang2_filename and
# _anchor_filename always exist and are undef unless given in %args.
#
# Returns the blessed hash reference.
sub new {
    my ( $class, %args ) = @_;

    my $self = {
        _lang1           => undef,
        _lang2           => undef,
        _lang1_filename  => undef,
        _lang2_filename  => undef,
        _anchor_filename => undef,
    };

    # Named arguments override the undef defaults.
    for my $name ( keys %args ) {
        $self->{"_$name"} = $args{$name};
    }

    bless( $self, $class );

    return $self;
}


# The file and its parallel counterpart are split into sentences,
# aligned, and the aligned sentences are written to a TMX file.
#
# Arguments:
#   $file - path to the XML document in the first language. When it is
#           omitted, the current value of $_ is used instead.
#
# Reads $self->{_lang1}, $self->{_lang2} and $self->{_anchor_filename}.
#
# Returns nothing. Dies if the document lists more than one parallel
# file, or if one of the processing steps it calls dies.
sub parallelize_text {
    my ( $self, $file ) = @_;

    # An explicit argument wins; $_ is only a fallback.
    if ( !defined $file || $file eq q{} ) {
        $file = $_;
    }
    if ( !defined $file || $file eq q{} ) {
        cluck("No file to parallelize was given.\n");
        return;
    }

    my $lang1 = $self->{_lang1};
    my $lang2 = $self->{_lang2};
    if ( !defined $lang1 || !defined $lang2 ) {
        cluck("Both languages must be set before parallelizing.\n");
        return;
    }

    my $document = XML::Twig->new;
    if ( !$document->safe_parsefile($file) ) {
        cluck("parsing the XML-file failed.\n");
        return;
    }

    # Find the parallel files for the document.
    my $location = $self->find_parallel_location($document);
    if ( !$location ) {
        print "No parallel texts found for language $lang2.\n";
        return;
    }

    $file = File::Spec->rel2abs($file);
    my @full_paths =
      $self->find_parallel_paths( $file, $lang1, $lang2, $location );

    if ( @full_paths > 1 ) {
        die "Cannot process more than one parallel file\n";
    }
    if ( !@full_paths ) {
        die "No parallel file name found in location '$location'\n";
    }
    my $pfile = $full_paths[0];

    # Prepare files for further processing by
    # adding <s> tags and sentence ids.
    my $lang1_infile = $self->divide_p_into_sentences( $file,  $lang1 );
    my $lang2_infile = $self->divide_p_into_sentences( $pfile, $lang2 );

    $self->parallelize_files( $self->{_anchor_filename},
        $lang1_infile, $lang2_infile );
    $self->make_tmx( $file, $pfile, $lang1, $lang2 );

    return;
}

# Look up the location of the parallel text in the second language.
#
# Arguments:
#   $twig - a parsed document; its root element is searched for a
#           <header> child holding <parallel_text> elements.
#
# Reads $self->{_lang2} as the language to look for.
#
# Returns the 'location' attribute of the first <parallel_text> whose
# 'xml:lang' attribute equals that language, or undef when there is no
# such element, no <header>, or no language set.
sub find_parallel_location {
    my ( $self, $twig ) = @_;

    my $location;
    my $wanted_lang = $self->{_lang2};

    my $root   = $twig->root;
    my $header = $root ? $root->first_child('header') : undef;

    if ( defined $wanted_lang && $header ) {
        for my $p ( $header->children('parallel_text') ) {
            my $plang = $p->att('xml:lang');
            next if ( !defined $plang || $plang ne $wanted_lang );
            $location = $p->att('location');
            last;
        }
    }

    return $location;
}

# Build the paths of the parallel files of a document.
#
# Arguments:
#   $file     - path of the original document.
#   $lang1    - language of the original; its first occurrence in the
#               directory part of $file is replaced by $lang2.
#   $lang2    - language of the parallel document.
#   $location - comma separated list of parallel file names.
#
# Returns a list with one path per name in $location. Names that do not
# end in ".xml" get that suffix appended.
sub find_parallel_paths {
    my ( $self, $file, $lang1, $lang2, $location ) = @_;

    # The path to the original (everything before the last separator).
    ( my $path = $file ) =~ s{(.*)[/\\].*}{$1};

    # And the path to the parallel files. The language is matched
    # literally and without /o, so every call uses its own $lang1.
    # An empty pattern would re-run the last successful match, so skip it.
    my $para_path = $path;
    if ( defined $lang1 && length $lang1 ) {
        $para_path =~ s/\Q$lang1\E/$lang2/;
    }

    my @full_paths;
    for my $name ( split( ",", $location ) ) {
        $name =~ s/\A\s+//;
        $name =~ s/\s+\z//;
        next if ( $name eq q{} );

        my $full_path = $para_path . "/" . $name;
        if ( $full_path !~ /\.xml\z/ ) {
            $full_path .= ".xml";
        }
        push( @full_paths, $full_path );
    }

    return @full_paths;
}

# Run the sentence splitter on a file.
#
# Arguments:
#   $file - path of the XML file to process.
#   $lang - language code passed to the splitter and used in the name
#           of the output file.
#
# Reads $self->{_corpus_analyze} (path of the program to run; it is
# executed directly, without a shell) and $self->{_outdir} (directory
# for the output; the system temporary directory is used when unset).
#
# Returns the name of the output file. Dies when an argument or the
# program path is missing, or when the program does not exit with 0.
sub divide_p_into_sentences {
    my ( $self, $file, $lang ) = @_;

    if ( !defined $file || !defined $lang ) {
        die "divide_p_into_sentences needs both a file and a lang\n";
    }

    my $corpus_analyze = $self->{_corpus_analyze};
    if ( !defined $corpus_analyze ) {
        die "divide_p_into_sentences: _corpus_analyze is not set\n";
    }

    my $outdir =
      defined $self->{_outdir} ? $self->{_outdir} : File::Spec->tmpdir;
    my $outfile =
      $outdir . "/" . $self->calculate_base($file) . $lang . ".sent.xml";

    # List form: file names reach the program verbatim, whatever
    # characters they contain.
    my @command = (
        $corpus_analyze,        '--all',
        "--output=$outfile",    '--only_add_sentences',
        "--lang=$lang",         $file,
    );
    print STDERR "$0: @command\n";

    if ( system(@command) != 0 ) {
        # $! is only meaningful when the program could not be started.
        my $reason;
        if ( $? == -1 ) {
            $reason = "could not be started: $!";
        }
        elsif ( $? & 127 ) {
            $reason = "killed by signal " . ( $? & 127 );
        }
        else {
            $reason = "exit status " . ( $? >> 8 );
        }
        die "errors in @command: $reason\n";
    }

    return $outfile;
}

# Run tca2.sh on the anchor file and the two sentence-split files.
#
# Arguments:
#   $anchor_file - first argument given to tca2.sh.
#   $infile1     - second argument given to tca2.sh.
#   $infile2     - third argument given to tca2.sh.
#
# Returns nothing. Dies when an argument is missing or when tca2.sh does
# not exit with 0.
sub parallelize_files {
    my ( $self, $anchor_file, $infile1, $infile2 ) = @_;

    if ( !defined $anchor_file || !defined $infile1 || !defined $infile2 ) {
        die "parallelize_files needs an anchor file and two input files\n";
    }

    # List form: the file names are passed verbatim, without a shell
    # splitting them on spaces or interpreting special characters.
    my @command = ( 'tca2.sh', $anchor_file, $infile1, $infile2 );

    print STDERR "$0: @command\n";
    if ( system(@command) != 0 ) {
        # $! is only meaningful when the program could not be started.
        my $reason;
        if ( $? == -1 ) {
            $reason = "could not be started: $!";
        }
        elsif ( $? & 127 ) {
            $reason = "killed by signal " . ( $? & 127 );
        }
        else {
            $reason = "exit status " . ( $? >> 8 );
        }
        die "errors in @command: $reason\n";
    }

    return;
}

# Combine the aligned sentences of two files into a TMX file.
#
# Arguments:
#   $file  - path of the document in the first language.
#   $pfile - path of the parallel document.
#   $lang1 - language of $file.
#   $lang2 - language of $pfile.
#
# Line i of the first aligned file is paired with line i of the second
# one; each pair becomes one <tu> element holding two <tuv> elements.
#
# Returns nothing.
sub make_tmx {
    my ( $self, $file, $pfile, $lang1, $lang2 ) = @_;

    my $base    = $self->calculate_base($file);
    my @f1_data = $self->read_tca2_output( $base, $lang1 );
    my @f2_data =
      $self->read_tca2_output( $self->calculate_base($pfile), $lang2 );

    if ( @f1_data != @f2_data ) {
        warn "make_tmx: "
          . scalar(@f1_data)
          . " lines for $lang1 but "
          . scalar(@f2_data)
          . " lines for $lang2\n";
    }

    my $body = XML::Twig::Elt->new("body");
    $body->set_pretty_print('indented');

    for my $i ( 0 .. $#f1_data ) {
        my $tu_elt = XML::Twig::Elt->new("tu");

        # A missing counterpart line becomes an empty segment.
        my $f2_sentence = defined $f2_data[$i] ? $f2_data[$i] : q{};

        $self->make_tuv( $f1_data[$i], $lang1 )
          ->paste( 'last_child', $tu_elt );
        $self->make_tuv( $f2_sentence, $lang2 )
          ->paste( 'last_child', $tu_elt );

        $tu_elt->paste( 'last_child', $body );
    }

    $self->print_tmx_file( $body, $base, $lang1, $lang2 );

    return;
}

# Reduce a path to the bare document name.
#
# Arguments:
#   $file - a file path using "/" or "\" as directory separator.
#
# Returns $file without its directory part and without a trailing
# ".xml" suffix.
sub calculate_base {
    my ( $self, $file ) = @_;

    my $base = $file;
    $base =~ s{\A.*[/\\]}{};    # drop everything up to the last separator
    $base =~ s{\.xml\z}{};      # drop the literal suffix only
    return $base;
}

# Read an aligned sentence file.
#
# Arguments:
#   $base - document name without directory and suffix.
#   $lang - language code that is part of the file name.
#
# The file read is "<outdir>/<base><lang>.sent_new.txt", where <outdir>
# is $self->{_outdir} or, when that is unset, the system temporary
# directory.
#
# Returns the lines of the file as a list, line endings included.
# Dies when the file cannot be opened.
sub read_tca2_output {
    my ( $self, $base, $lang ) = @_;

    my $outdir =
      defined $self->{_outdir} ? $self->{_outdir} : File::Spec->tmpdir;
    my $filename = $outdir . "/" . $base . $lang . ".sent_new.txt";

    open( my $fh1, "<:encoding(utf8)", $filename )
      or die("Could not open file $filename: $!\n");
    my @data = <$fh1>;
    close($fh1);

    return @data;
}

# Wrap one sentence in a <tuv> element.
#
# Arguments:
#   $sentence - text of the sentence; <s id="..."> and </s> tags are
#               removed from it. undef is treated as an empty string.
#   $lang     - value for the xml:lang attribute of the <tuv>.
#
# Returns the <tuv> element, which has a single <seg> child holding the
# cleaned text.
sub make_tuv {
    my ( $self, $sentence, $lang ) = @_;

    $sentence = q{} if ( !defined $sentence );

    my $tuv_elt = XML::Twig::Elt->new("tuv");
    $tuv_elt->set_att( 'xml:lang', $lang );

    # [^"]* stops at the closing quote of the id, so text between
    # several <s> tags on the same line is kept.
    $sentence =~ s/<s id="[^"]*">//g;
    $sentence =~ s/<\/s>//g;

    my $seg_elt = XML::Twig::Elt->new( "seg", $sentence );
    $seg_elt->paste( 'last_child', $tuv_elt );

    return $tuv_elt;
}

# Write a TMX file.
#
# Arguments:
#   $body  - element whose print method writes the <body> of the TMX.
#   $base  - document name; the file is called "<base>.tmx".
#   $lang1 - first language; also given to print_tmx_header.
#   $lang2 - second language.
#
# The file is written to "$ENV{GTFREE}/prestable/tmx/<lang1><lang2>/",
# which is created when it does not exist.
#
# Returns nothing. Dies when GTFREE is not set or when the file cannot
# be opened or closed.
sub print_tmx_file {
    my ( $self, $body, $base, $lang1, $lang2 ) = @_;

    if ( !defined $ENV{'GTFREE'} ) {
        die "The environment variable GTFREE is not set\n";
    }

    my $tmx_dir = $ENV{'GTFREE'} . "/prestable/tmx/" . $lang1 . $lang2;
    if ( !-e $tmx_dir ) {
        File::Path::mkpath($tmx_dir);
    }

    my $tmx_file = $tmx_dir . "/" . $base . ".tmx";
    open( my $FH1, ">:encoding(utf8)", $tmx_file )
      or die "Could not open $tmx_file for writing: $!\n";

    $self->print_tmx_header( $FH1, $lang1 );
    $body->print($FH1);
    print $FH1 qq|</tmx>|, "\n";

    # Buffered write errors only show up when the handle is closed.
    close($FH1) or die "Could not close $tmx_file: $!\n";

    return;
}

# Write the opening part of a TMX document: the XML declaration, the
# <tmx> start tag and the complete <header> element.
#
# Arguments:
#   $FH1  - filehandle to print to.
#   $lang - language code; written as srclang="<lang>-NO".
#
# Returns nothing.
sub print_tmx_header {
    my ( $self, $FH1, $lang ) = @_;

    my @header_lines = (
        qq|<?xml version='1.0'  encoding="UTF-8"?>|,
        qq|<tmx>|,
        qq|<header|,
        qq|    segtype="sentence"|,
        qq|    o-tmf="OmegaT TMX"|,
        qq|    adminlang="EN-US"|,
        qq|    srclang="$lang-NO"|,
        qq|    datatype="plaintext"|,
        qq|    >|,
        qq|</header>|,
    );

    # One print per line, each followed by a newline.
    for my $header_line (@header_lines) {
        print {$FH1} $header_line, "\n";
    }

    return;
}

1;