"""Add Álgu ``<algu>`` reference nodes to the lemmas of an XML dictionary.

For every ``<l>`` (lemma) element of the dictionary file given on the
command line, the script

1. looks the lemma text up in ``words/dicts/algu/ilmentyma.xml`` (field
   ``ilmentyma_p``) and reads the ``lekseemi_id`` stored in the same row;
2. looks that ``lekseemi_id`` up in ``words/dicts/algu/sanue.xml`` (where it
   is stored in the field ``eduslekseemi``) and reads the ``id`` of that row,
   which corresponds to ``sanue_id`` in Álgu;
3. appends ``<algu lekseemi_id="..." sanue_id="..."/>`` to the parent of the
   ``<l>`` element.

Both Álgu files are located relative to ``$GTHOME``, then ``$GIELLA_HOME``,
then ``../..``.  The dictionary file is rewritten in place, but only if at
least one node was added.

Usage::

    python3 <this script> <xml_file> <language_id_or_iso_code>

This script is made for sme, and may need to be adjusted for other langs.
The language argument is validated (it must be an integer or a key of
``iso_to_id``) but is not otherwise used by the lookups below.

NOTE: homonyms are not distinguished (the first matching row in document
order wins), and running the script twice on the same file appends a second
set of ``<algu>`` nodes, because existing nodes are not checked.
"""
import os
import sys

import lxml.etree as etree

# Language codes accepted on the command line, mapped to numeric ID strings.
iso_to_id = {
    "mns": "14",
    "hun": "16",
    "kpv": "20",
    "udm": "21",
    "fin": "29",
    "krl": "31",
    "vep": "33",
    "vot": "34",
    "est": "35",
    "liv": "36",
    "sma": "41",
    "sju": "42",
    "sje": "43",
    "smj": "44",
    "sme": "45",
    "smn": "46",
    "sms": "47",
    "sjd": "49",
    "myv": "60",
    "mhr": "62",
    "mrj": "63",
}


def _resolve_language_id(language):
    """Return *language* as an ID string, or ``None`` if it is unknown.

    A value that parses as an integer is returned unchanged; anything else
    is looked up in ``iso_to_id``.
    """
    try:
        int(language)
    except ValueError:
        return iso_to_id.get(language)
    return language


def _find_gthome():
    """Return the repository root from GTHOME/GIELLA_HOME, else ``"../.."``."""
    for variable in ("GTHOME", "GIELLA_HOME"):
        value = os.environ.get(variable)
        if value is not None:
            print("Found {}: {}".format(variable, value))
            return value
    print("Did not find GTHOME or GIELLA_HOME, using relative path")
    return "../.."


def _load_tree(path, loaded_message, failed_message, parser=None):
    """Parse *path* and return the tree, or print a message and return None."""
    try:
        tree = etree.parse(path, parser=parser)
    except OSError:
        print(failed_message)
        return None
    print(loaded_message)
    return tree


def _build_lookup(tree, key_field, value_field):
    """Map the text of each *key_field* to the *value_field* of its row.

    One pass over the tree replaces a full ``//field[...]`` scan per lemma.
    Building a dict also avoids interpolating lemma text into a path
    expression, which broke on lemmas containing a double quote.

    The first key in document order wins, mirroring what ``find()`` returned.
    The value is ``None`` when the row has no *value_field* or it is empty.
    """
    value_path = 'field[@name="{}"]'.format(value_field)
    lookup = {}
    for field in tree.iter("field"):
        if field.get("name") != key_field:
            continue
        row = field.getparent()
        if row is None:
            # A root element named "field" has no row to read from.
            continue
        key = "".join(field.itertext())
        if key in lookup:
            continue
        value_node = row.find(value_path)
        lookup[key] = None if value_node is None else value_node.text
    return lookup


def _annotate_lemmas(dict_tree, lemma_to_lekseemi, lekseemi_to_sanue):
    """Append ``<algu>`` nodes next to every resolvable ``<l>`` element.

    Returns the number of ``<algu>`` nodes added.
    """
    added = 0
    seen = 0
    for lemma_elem in dict_tree.iter("l"):
        seen += 1
        parent = lemma_elem.getparent()
        lekseemi_id = lemma_to_lekseemi.get(lemma_elem.text)
        sanue_id = None
        if lekseemi_id is not None:
            sanue_id = lekseemi_to_sanue.get(lekseemi_id)
        # Both IDs are required; lemmas that cannot be fully resolved (or
        # that have no parent to attach to) are left untouched.
        if sanue_id is not None and parent is not None:
            algu = etree.SubElement(parent, "algu")
            algu.set("lekseemi_id", lekseemi_id)
            algu.set("sanue_id", sanue_id)
            added += 1
        print("Added algu node to {}/{} lemma(s)".format(added, seen), end="\r")
    return added


def main(argv):
    """Run the script with *argv* (``sys.argv`` layout); return exit status.

    Returns 0 on success (including "nothing to add") and 1 on a usage
    error, an unknown language, or a file that cannot be loaded.
    """
    # Initial checks before running.
    if len(argv) != 3:
        print(
            "Usage: python3 {} <xml_file> <language_id_or_iso_code>".format(
                argv[0]
            )
        )
        return 1

    if _resolve_language_id(argv[2]) is None:
        print(
            "Provided language ID not a number, and string is not mapped. "
            "Aborting"
        )
        return 1

    # Set file paths
    xml_file_name = argv[1]
    algu_dir = os.path.join(_find_gthome(), "words", "dicts", "algu")
    ilmentyma_path = os.path.join(algu_dir, "ilmentyma.xml")
    sanue_path = os.path.join(algu_dir, "sanue.xml")

    ilmentyma_tree = _load_tree(
        ilmentyma_path,
        "Loaded ilmentyma.xml",
        "Failed to load {}.".format(ilmentyma_path),
    )
    if ilmentyma_tree is None:
        return 1

    sanue_tree = _load_tree(
        sanue_path,
        "Loaded sanue.xml",
        "Failed to load {}.".format(sanue_path),
    )
    if sanue_tree is None:
        return 1

    # Removing blank text in parser to make sure pretty printing works as
    # intended when the file is written back.
    dict_tree = _load_tree(
        xml_file_name,
        "Loaded {}".format(xml_file_name),
        "Failed to load {}. Is the file name correct?".format(xml_file_name),
        parser=etree.XMLParser(remove_blank_text=True),
    )
    if dict_tree is None:
        return 1

    print("Parsing file and adding algu nodes...")
    print("Note that homonymes are not distinguished, so you need to run find_homonymous_lemmas.py afterwards and manually revise the file")

    lemma_to_lekseemi = _build_lookup(
        ilmentyma_tree, "ilmentyma_p", "lekseemi_id"
    )
    lekseemi_to_sanue = _build_lookup(sanue_tree, "eduslekseemi", "id")
    added = _annotate_lemmas(dict_tree, lemma_to_lekseemi, lekseemi_to_sanue)

    # Save whenever anything was added; a single added node counts too.
    if added > 0:
        print("\nFinished adding algu information. Writing file... ", end="")
        dict_tree.write(xml_file_name, encoding="utf-8", pretty_print=True)
        print("Done")
    else:
        print("No changes made. Quitting without saving file.")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))