# -*- coding:utf-8 -*-
"""Morphologically analyse the segments of .tmx translation-memory files.

Walks IN_DIR for ``*.tmx`` files, pushes every ``<tuv lang="LANG">`` segment
through the Giellatekno pipeline (preprocess | lookup | lookup2cg | vislcg3),
inserts the cleaned-up analysis as an ``<analysis>`` element next to each
segment, writes the result to ``out_LANG_IN_DIR/`` and moves the processed
input file to ``done_GENRE/``.

Usage: script.py LANG IN_DIR [GENRE]
"""
import re, os, errno, cgi, json, xml
import sys, codecs, locale, getopt
import shutil
import xml.etree.ElementTree as ET
from subprocess import Popen, PIPE
from operator import itemgetter
from xml.dom.minidom import parse, parseString
from importlib import reload
from collections import defaultdict


def _clean_analysis(analysis, lang):
    """Strip error/semantic/homonymy tags from one reading and normalise it.

    Returns the analysis string with tags joined by '.' (e.g. ``N.Sg.Nom``),
    or '___' when the analyser produced an unknown ('?') reading.
    NOTE(review): the source text of this replace chain had its whitespace
    collapsed; the double-space patterns below are a reconstruction — verify
    against the original script.
    """
    analysis = analysis.partition('@')[0]
    analysis = analysis.replace('Err/Orth', '')
    analysis = analysis.replace(' <' + lang + '>', '')
    analysis = analysis.replace('  ', '')
    for tag in (' Sem/Date', ' Sem/Org', ' Sem/Sur', ' Sem/Fem', ' Sem/Mal',
                ' Sem/Plc', ' Sem/Obj', ' Sem/Adr'):
        analysis = analysis.replace(tag, '')
    analysis = analysis.replace('Sem/Adr ', '')
    analysis = analysis.replace(' Sem/Year', '')
    analysis = analysis.replace(' IV', '')
    analysis = analysis.replace(' TV', '')
    for tag in ('v1 ', 'v2 ', 'Hom1 ', 'Hom2 '):
        analysis = analysis.replace(tag, '')
    analysis = analysis.replace('/', '_')
    if analysis.startswith('Arab Num'):
        analysis = analysis.replace('Arab Num', 'Num Arab')
    analysis = analysis.strip()
    if '?' in analysis:
        # Unknown word: the analyser marks failed lookups with '?'.
        analysis = '___'
    analysis = analysis.strip()
    analysis = analysis.replace('  ', ' ')
    analysis = analysis.replace(' ', '.')
    return analysis


def _format_cohort(cohort, lang):
    """Turn one vislcg3 cohort into a 'wform<TAB>lemma<TAB>pos<TAB>analysis' line."""
    cc_list = cohort.split('\n\t')
    wform = cc_list[0].strip()
    # The word form arrives as '"<form>"'; peel off the cohort delimiters.
    if wform.startswith('"<'):
        wform = wform[2:]
    if wform.endswith('>"'):
        wform = wform[:-2]
    wform = wform.replace(' ', '_')
    # Take the alphabetically first reading (matches the original behaviour).
    l_a = sorted(cc_list[1:])[0]
    lemma = l_a.partition('" ')[0].strip()
    lemma = lemma.replace('#', '').replace(' ', '_')
    if lemma.startswith('"'):
        lemma = lemma[1:]
    analysis = _clean_analysis(l_a.partition('" ')[2], lang)
    pos = analysis.partition('.')[0]
    return wform + '\t' + lemma + '\t' + pos + '\t' + analysis


def _analyse_segment(seg_txt, pipeline, lang):
    """Run one segment through the shell pipeline; return the formatted lines.

    The text is written to the pipeline's stdin instead of being spliced into
    an ``echo '...'`` shell string — this fixes the breakage/injection that an
    apostrophe in the segment used to cause.
    """
    p = Popen(pipeline, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate((seg_txt + '\n').encode())
    c_analysis = ''
    for cohort in filter(None, out.decode().split('\n"<')):
        c_analysis = c_analysis + '\n' + _format_cohort(cohort, lang)
    return c_analysis


def main():
    # Expects 2 (and 1 optional) parameters:
    # language to analyse, input directory (and genre).
    lang = sys.argv[1]
    in_dir = sys.argv[2]
    genre_str = sys.argv[3] if len(sys.argv) == 4 else ''

    out_dir = 'out_' + lang + '_' + in_dir
    # BUG FIX: was "'done_' + genre" — 'genre' was undefined here (NameError).
    done_dir = 'done_' + genre_str
    cwd = os.getcwd()
    out_dir_path = os.path.join(cwd, out_dir)
    done_dir_path = os.path.join(cwd, done_dir)
    os.makedirs(out_dir_path, exist_ok=True)
    os.makedirs(done_dir_path, exist_ok=True)

    # Bail out early if the 'lookup' transducer tool is not installed.
    plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE)
    olup, elup = plup.communicate()
    if not olup.decode():
        sys.exit()

    # $GTHOME is expanded by the shell when the pipeline runs (shell=True).
    langs_dir = '$GTHOME/langs/'
    abbr_file = langs_dir + lang + '/tools/tokenisers/abbr.txt'
    abs_xfst_file = langs_dir + lang + '/src/analyser-disamb-gt-desc.xfst'
    disamb_file = langs_dir + lang + '/src/syntax/disambiguator.cg3'
    pipeline = ('preprocess --abbr ' + abbr_file
                + ' | lookup -q -flags mbTT ' + abs_xfst_file
                + ' | lookup2cg | vislcg3 -g ' + disamb_file)

    for root, dirs, files in os.walk(in_dir):  # Walk the input directory tree
        for f in files:
            if not f.endswith('tmx'):
                continue
            print('... processing ', str(f))
            tree = ET.parse(os.path.join(root, f))
            f_root = tree.getroot()
            header = f_root.find('.//header')
            genre = ET.Element('genre')
            if genre_str:
                genre.text = genre_str
            header.insert(1, genre)
            for tuv in f_root.findall('.//tuv[@lang="' + lang + '"]'):
                seg = tuv.findall('seg')
                seg_txt = seg[0].text
                print('... seg ', str(seg_txt))
                analysis = ET.Element('analysis')
                analysis.text = _analyse_segment(seg_txt, pipeline, lang) + '\n'
                tuv.insert(1, analysis)
            tree.write(os.path.join(out_dir_path, str(f)),
                       xml_declaration=True, encoding='utf-8', method='xml')
            print('DONE ', f, '\n\n')
            # shutil.move replaces the previous "mv ..." shell command.
            shutil.move(os.path.join(root, f), done_dir_path)
            print('MOVED file ', f, ' in done folder \n\n')


if __name__ == "__main__":
    # The old "reload(sys)" call was a Python-2 setdefaultencoding relic and
    # is a no-op under Python 3, so it has been dropped.
    main()