""" Parse input xml file and look up each lemma in ilmentyma.xml looking for lemmas which has entries with two (or more) different "lekseemi_id"s, thus having different etymologies. An example is sme sávdnji which can both mean seam and sauna. Print the found lemmas to stdout. These should be checked manually after having run add_algu_ids.py to make sure each lemma points to the correct etymology. """ import os, sys import lxml.etree as etree iso_to_id = { "mns": "14", "hun": "16", "kpv": "20", "udm": "21", "fin": "29", "krl": "31", "vep": "33", "vot": "34", "est": "35", "liv": "36", "sma": "41", "sju": "42", "sje": "43", "smj": "44", "sme": "45", "smn": "46", "sms": "47", "sjd": "49", "myv": "60", "mhr": "62", "mrj": "63" } # Initial checks before running. if (len(sys.argv) != 3): print("Usage: python3 {} /".format(sys.argv[0])) sys.exit() kieli_id = sys.argv[2] try: int(kieli_id) except ValueError: try: kieli_id = iso_to_id[kieli_id] except KeyError: print("Provided language ID not a number, and string is not mapped. Aborting") sys.exit() # Set file paths xml_file_name = sys.argv[1] try: gthome = os.environ["GTHOME"] print("Found GTHOME: {}".format(gthome)) except KeyError: try: gthome = os.environ["GIELLA_HOME"] print("Found GIELLA_HOME: {}".format(gthome)) except KeyError: gthome = "../.." print("Did not find GTHOME or GIELLA_HOME, using relative path") ilmentyma_path = gthome + "/words/dicts/algu/ilmentyma.xml" sanue_path = gthome + "/words/dicts/algu/sanue.xml" # Read ilmentyma.xml try: ilmentyma_tree = etree.parse(ilmentyma_path) print("Loaded ilmentyma.xml") except OSError: print("Failed to load {}.".format(ilmentyma_path)) sys.exit() # Read sanue.xml try: sanue_tree = etree.parse(sanue_path) print("Loaded sanue.xml") except OSError: print("Failed to load {}.".format(sanue_path)) sys.exit() # Read input file try: # Removing blank text in parser to make sure pretty printing works as intended dict_tree = etree.parse(xml_file_name, parser=etree.XMLParser(remove_blank_text=True)) print("Loaded {}".format(xml_file_name)) except OSError: print("Failed to load {}. Is the file name correct?".format(xml_file_name)) sys.exit() i=0 # Look up words from dict xml in ilmentyma.xml for elem in dict_tree.getiterator(): # Go though all lemmas if elem.tag == "l": lemma = elem.text xpath = '//field[@name="ilmentyma_p"][.="{}"]'.format(lemma) # Search for lemma in ilmentyma.xml ilmentyma_field_list = ilmentyma_tree.findall(xpath) if len(ilmentyma_field_list) > 1: lekseemi_ids = {} # Go through all fields, find the lekseemi_id and add it to the dict # If the dict contains more than one lekseemi_id after this, then print info for field in ilmentyma_field_list: row = field.getparent() lekseemi_id = row.find('field[@name="lekseemi_id"]').text # Check if the lekseemi id is in the chosen language xpath = '//field[@name="eduslekseemi"][.="{}"]'.format(lekseemi_id) sanue_field = sanue_tree.find(xpath) if sanue_field is not None: row = sanue_field.getparent() if kieli_id == row.find('field[@name="kieli_id"]').text: sanue_id = row.find('field[@name="id"]').text lekseemi_ids[sanue_id] = lemma if len(lekseemi_ids) > 1: print("Lemma {} corresponds to 'sanue_id's {}".format(lemma, [x for x in lekseemi_ids.keys()])) i+=1 print("Checked {} lemmas".format(i), end="\r")