#!/usr/bin/perl

# catxml
# Perl-script for processing corpus xml-files as text.
#  - Reads xml-file(s) or directory containing xml-file(s)
#	 as input and outputs plain text.
#
# To get help, run:
# catxml --help 
#
# $Id: catxml,v Exp $

use strict;

# Use utf-8 encoding
use encoding 'utf-8';
use open ':utf8';

# using XML
use XML::Twig;

use File::Find;
use File::Spec;
use Getopt::Long;



############################
# Variables
#
############################

# Root directory under which the sme/bin data files used below are found.
my $gt = "/Users/tomi/Documents/eclipse/workspace/gt";

# Shell pipeline that receives paragraph text on its standard input (see
# analyze()).  The stages are joined with " | "; error output of the last
# stage is discarded, and the command string ends with a newline.
my $ANALYZE = join(" | ",
    "preprocess --abbr=$gt/sme/bin/abbr.txt",
    "lookup -d -flags mbTT -utf8 $gt/sme/bin/sme.fst",
    "lookup2cg",
    "vislcg3 --grammar=$gt/sme/bin/sme-dis.bin 2>/dev/null\n");

# Set by --all: select title, list and table paragraphs as well.
my $all;

# One flag per paragraph type.  Plain text paragraphs are printed by
# default (switch off with --nopara); the others are opt-in.
my $para = 'true';
my $title;
my $list;
my $table;

# Language of the paragraphs we want.  Unless --lang overrides it, the
# language is guessed from the absolute path of the current working
# directory: every known code is tried as a substring, and when several
# occur in the path the one listed last wins.
my $path = File::Spec->rel2abs(File::Spec->curdir);
my $lang;
foreach my $code (qw(sme sma smj sjd smn sms)) {
    $lang = $code if $path =~ /$code/;
}

# Fall back to the default language promised by the usage text, so that
# $lang is never undefined when it is used as a pattern later on.
$lang = 'sme' unless defined $lang;

# Set by --analyze: pipe the selected paragraphs through the analyzer.
my $analyze;

# Set by --help.
my $help;

# Input file(s) and/or directories given with --input.
my @input;

# Language of the document currently being processed, taken from the
# xml:lang attribute of its <document> element.
my $doclang;


#############################
# Get options
#
#############################
# Parse the command line.  GetOptions reports unknown or malformed options
# itself and returns false; in that case show the usage text and stop
# instead of running with half-understood settings.
my $options_ok = GetOptions ('para!' => \$para,
                             'all' => \$all,
                             'title' => \$title,
                             'list' => \$list,
                             'table' => \$table,
                             'lang=s' => \$lang,
                             'analyze' => \$analyze,
                             'help' => \$help,
                             'input=s' => \@input);

unless ($options_ok) {
    print_usage();
    exit 1;
}

if ($help) {
    print_usage();
    exit;
}

# Without any input there is nothing to do; say so rather than exiting
# silently.
unless (@input) {
    print STDERR "catxml: no input given\n";
    print_usage();
    exit 1;
}

# --all switches on every optional paragraph type.
if ($all) {
    $title = 'true';
    $list = 'true';
    $table = 'true';
}

foreach my $in (@input) {
    if (-d $in) {
        # Directory: visit every file below it.
        find (\&process_file, $in);
    }
    elsif (-f $in) {
        # Single file.
        process_file ($in);
    }
    else {
        print STDERR "catxml: input '$in' is not a file or directory, skipping\n";
    }
}


# Print the selected paragraphs of one corpus file.
#
# Called either directly with a file name as its argument, or as the
# File::Find callback, in which case no argument is passed and the file
# name is taken from $_.
#
# Files whose name starts with a dot or does not end in ".xml" are
# ignored.  The file is parsed twice: first to record the document
# language in $doclang, then to hand every <p> element to process_xml.
# Parse errors are reported on STDERR and the file is skipped.
sub process_file {
    # An explicit argument wins over whatever $_ happens to hold.
    my $file = @_ ? shift (@_) : $_;
    return unless defined $file;

    # Check that
    # a) the file is not a hidden file.  Only the last path component is
    #    tested, so that inputs such as "./doc.xml" or "../dir/doc.xml"
    #    are not mistaken for hidden files.
    my $basename = (File::Spec->splitpath ($file))[2];
    return if $basename =~ /^\./;

    # b) the file name ends with .xml
    return unless $file =~ /\.xml$/;

    # Forget the language of the previously processed file, so that a
    # document without xml:lang does not inherit it.
    $doclang = undef;

    # Find out the language of the file
    my $langprobe = XML::Twig->new(twig_handlers =>
        {'document' => sub { $doclang = $_->{'att'}->{'xml:lang'} }} );

    unless ($langprobe->safe_parsefile ($file)) {
        print STDERR "Couldn't parse file $file: $@";
        return;
    }

    # Process all <p> elements
    my $extractor = XML::Twig->new(twig_handlers =>
        {p => \&process_xml} );

    unless ($extractor->safe_parsefile ($file)) {
        print STDERR "Couldn't parse file $file: $@";
    }

    return;
}

# Twig handler for <p> elements: prints the paragraph if its language and
# its type have been selected.
#
# Parameters:
#   $twig - the XML::Twig object running the parse
#   $elt  - the <p> element to look at
#
# Reads the file-level settings $lang, $doclang, $para, $title, $list,
# $table and $analyze, and writes to STDOUT.  The twig is purged before
# returning, whether or not anything was printed.
sub process_xml {
    my ($twig, $elt) = @_;

    my $eltlang = $elt->att("xml:lang");
    my $type    = $elt->att("type");

    # We want to process only paragraphs within a document that has the
    # wanted language OR paragraphs that are themselves in the wanted
    # language.  A paragraph with its own xml:lang is judged by that
    # attribute alone; one without it inherits the document language.
    my $wanted = $eltlang ? ($eltlang =~ $lang) : ($doclang =~ $lang);

    if ($wanted) {
        # print out paragraphs with text content
        if ($para && (!$type || $type =~ "text")) {
            if ($analyze) {
                # NOTE(review): the closing marker is written with a
                # backslash ("<\p>"), not a slash -- confirm this is what
                # the consumers of the output expect.
                print "<" . $elt->tag . ">¶\n";   # testing, TT.
                analyze ($elt);
                print "<\\" . $elt->tag . ">¶\n"; # testing, TT.
            }
            else {
                print $elt->text;
                print "¶\n";                      # Testing, TT.
            }
        }

        # print out paragraphs with title content
        if ($title && $type && $type =~ "title") {
            if ($analyze) {
                print "<" . $elt->tag . " type=" . $type . ">\n";
                analyze ($elt);
                print "<\\" . $elt->tag . ">\n";
            }
            else {
                print $elt->text;
                print "¶\n";                      # Special sign for titles
            }
        }

        # print out paragraphs with list content
        if ($list && $type && $type =~ "listitem") {
            print $elt->text;
            print "¶";                            # Using the title sign also here
        }

        # print out paragraphs with table content
        if ($table && $type && $type =~ "tablecell") {
            print $elt->text;
            print "¶";                            # Using the title sign also here
        }
    }

    $twig->purge;
}

# Send the text of an element through the $ANALYZE shell pipeline; the
# pipeline writes its result to our standard output.
#
# Parameters:
#   $elt - element whose text is analysed.  Falls back to $_ when no
#          argument is given, because this sub used to read $_ instead of
#          its argument.
#
# The text is written to the pipeline's standard input rather than being
# pasted into the command line, so quotes, dollar signs and backticks in
# the corpus text cannot break or alter the shell command.
sub analyze {
    my ($elt) = @_;
    $elt = $_ unless defined $elt;

    my $text = $elt->text;

    # Do not let a pipeline that exits early kill us with SIGPIPE.
    local $SIG{PIPE} = 'IGNORE';

    my $pipe;
    unless (open ($pipe, '|-', $ANALYZE)) {
        print STDERR "Couldn't start analyzer: $!\n";
        return;
    }
    binmode ($pipe, ':utf8');

    # The trailing newline stands in for the one "echo" used to append.
    print $pipe $text, "\n";

    # close waits for the pipeline to finish; its exit status is ignored,
    # as it was before.
    close ($pipe);
    return;
}

# Print the command line help to the currently selected output handle
# (STDOUT unless the caller has selected another one).
#
# Option names are padded with tabs so that all descriptions start in the
# same column when tab stops are eight characters wide.
sub print_usage {
    print "\nUsage: catxml <options> [-i | --input] [FileName | Directory]\n",
          "\nwhere possible options include:\n",
          "[-al | --all]\t\t\tPrint all text elements\n",
          "[-p | --para]\t\t\tPrint paragraphs with text content (default)\n",
          "[-nop | --nopara]\t\tDon't print text paragraphs\n",
          "[-ti | --title]\t\t\tPrint paragraphs with title type\n",
          "[-li | --list]\t\t\tPrint paragraphs with list type\n",
          "[-ta | --table]\t\t\tPrint paragraphs with table type\n",
          "[-la='lang' | --lang='lang']\tPrint only paragraphs in language 'lang'\n",
          "\t\t\t\t(default language is sme)\n",
          "[-an | --analyze]\t\tAnalyze the text (currently only plain text and titles)\n",
          "[-h | --help]\t\t\tPrint this help text and exit\n",
          "\n";
}
