"""
Parse the input XML file and look up each lemma in ilmentyma.xml,
looking for lemmas that have entries with two (or more) different "lekseemi_id"s
and thus different etymologies.

An example is sme sávdnji which can both mean seam and sauna.

Print the found lemmas to stdout. These should be checked manually 
after having run add_algu_ids.py to make sure each lemma points to the
correct etymology.
"""
import os, sys

import lxml.etree as etree

# Maps ISO 639-3 language codes to the numeric language IDs (as strings) that
# may be given as the second command line argument. The selected ID is later
# compared against the "kieli_id" field of rows in sanue.xml.
iso_to_id = {
    "mns": "14",
    "hun": "16",
    "kpv": "20",
    "udm": "21",
    "fin": "29",
    "krl": "31",
    "vep": "33",
    "vot": "34",
    "est": "35",
    "liv": "36",
    "sma": "41",
    "sju": "42",
    "sje": "43",
    "smj": "44",
    "sme": "45",
    "smn": "46",
    "sms": "47",
    "sjd": "49",
    "myv": "60",
    "mhr": "62",
    "mrj": "63"
}

# Validate the command line: exactly two arguments are required.
# sys.exit() with a string argument writes the message to stderr and exits
# with status 1, so callers (shell scripts, make) can detect the failure.
if len(sys.argv) != 3:
    sys.exit("Usage: python3 {} <XML_FILE_PATH> <SANUE_LANGUAGE_ID>/<ISO 639-3>".format(sys.argv[0]))

# The language may be given either as a numeric ID, which is used verbatim,
# or as an ISO 639-3 code that is translated through iso_to_id.
kieli_id = sys.argv[2]
try:
    int(kieli_id)
except ValueError:
    try:
        kieli_id = iso_to_id[kieli_id]
    except KeyError:
        sys.exit("Provided language ID not a number, and string is not mapped. Aborting")

# Path of the dictionary XML file whose lemmas are to be checked
xml_file_name = sys.argv[1]

# Locate the repository root: prefer GTHOME, then GIELLA_HOME, and fall back
# to a path relative to the current working directory when neither is set.
for env_name in ("GTHOME", "GIELLA_HOME"):
    if env_name in os.environ:
        gthome = os.environ[env_name]
        print("Found {}: {}".format(env_name, gthome))
        break
else:
    gthome = "../.."
    print("Did not find GTHOME or GIELLA_HOME, using relative path")

# Both Álgu database dumps live in the same directory below the root.
algu_dir = gthome + "/words/dicts/algu/"
ilmentyma_path = algu_dir + "ilmentyma.xml"
sanue_path = algu_dir + "sanue.xml"

def _load_xml(path, label, hint="", parser=None):
    """Parse the XML file at *path* and return the resulting tree.

    Prints "Loaded <label>" on success. If the file cannot be read
    (OSError), the failure message, followed by *hint*, is written to
    stderr and the script exits with status 1. XML syntax errors are not
    caught here and propagate to the caller.
    """
    try:
        tree = etree.parse(path, parser=parser)
    except OSError:
        sys.exit("Failed to load {}.{}".format(path, hint))
    print("Loaded {}".format(label))
    return tree


# Read ilmentyma.xml
ilmentyma_tree = _load_xml(ilmentyma_path, "ilmentyma.xml")

# Read sanue.xml
sanue_tree = _load_xml(sanue_path, "sanue.xml")

# Read input file
# Removing blank text in parser to make sure pretty printing works as intended
dict_tree = _load_xml(
    xml_file_name,
    xml_file_name,
    hint=" Is the file name correct?",
    parser=etree.XMLParser(remove_blank_text=True),
)

def _field_text(row, name):
    """Return the text of the <field name="NAME"> child of *row*.

    Returns None when *row* is None or has no such child, so that
    incomplete rows are skipped instead of raising AttributeError.
    """
    if row is None:
        return None
    field = row.find('field[@name="{}"]'.format(name))
    return None if field is None else field.text


def _index_fields(tree, name):
    """Group the <field> elements of *tree* whose "name" attribute is *name*.

    Returns a dict mapping each field's full text content to the list of
    matching fields in document order. Keying on the joined text mirrors
    how the ElementPath predicate [.="..."] compares, but needs only one
    pass over the tree and no quoting of the searched value.
    """
    index = {}
    for field in tree.iter("field"):
        if field.get("name") == name:
            index.setdefault("".join(field.itertext()), []).append(field)
    return index


# Build both lookup tables once instead of scanning the whole trees for
# every lemma.
ilmentyma_fields = _index_fields(ilmentyma_tree, "ilmentyma_p")
sanue_fields = _index_fields(sanue_tree, "eduslekseemi")

i = 0

# Look up words from dict xml in ilmentyma.xml
for elem in dict_tree.iter("l"):
    lemma = elem.text
    # An <l> element without text yields None, which is never an index key.
    candidates = ilmentyma_fields.get(lemma, ())
    if len(candidates) > 1:
        # Distinct sanue ids for this lemma, in the order they were found
        sanue_ids = []
        for field in candidates:
            lekseemi_id = _field_text(field.getparent(), "lekseemi_id")
            matches = sanue_fields.get(lekseemi_id)
            if not matches:
                continue
            # Only the first sanue row referring to this lekseemi_id is
            # considered, as before.
            # NOTE(review): if several rows can share an "eduslekseemi",
            # later ones in the chosen language are ignored - confirm.
            sanue_row = matches[0].getparent()
            # Check if the lekseemi id is in the chosen language
            if _field_text(sanue_row, "kieli_id") != kieli_id:
                continue
            sanue_id = _field_text(sanue_row, "id")
            if sanue_id is not None and sanue_id not in sanue_ids:
                sanue_ids.append(sanue_id)
        if len(sanue_ids) > 1:
            print("Lemma {} corresponds to 'sanue_id's {}".format(lemma, sanue_ids))
    i += 1
    # Progress indicator; the carriage return lets the next print overwrite it
    print("Checked {} lemmas".format(i), end="\r")


