# -*- coding:utf-8 -*- import re, os, errno, cgi, json, xml import sys, codecs, locale, getopt import xml.etree.ElementTree as ET from subprocess import Popen, PIPE from operator import itemgetter from xml.dom.minidom import parse, parseString from imp import reload from collections import defaultdict def main(): # to be adjusted as needed pos='Prop' in_dir = '3_uxml' out_dir = '4_axml' cwd = os.getcwd() out_dir_path = os.path.join(cwd,out_dir) if not os.path.exists(out_dir_path): os.mkdir(out_dir_path) debug_fst = False namespaces = {'xml': 'http://www.w3.org/1999/xml'} # parameters to be adjusted as needed plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE) olup, elup = plup.communicate() print("___ lookup is ",olup.decode()) if not olup.decode(): print('No lookup found, please install it!') sys.exit() lookup = olup.decode().strip() langs_dir = '$GTHOME/langs/' xfst_file = '/src/analyser-gt-' for root, dirs, files in os.walk(in_dir): for f in files: if f.endswith('xml'): print('... processing ', str(f)) tree = ET.parse(os.path.join(in_dir,f)) f_root = tree.getroot() lgs = f_root.findall('.//lang') for lg in lgs: lang_code=lg.get('code') names = lg.findall('./name') for name in names: # analyser-gt-norm.xfst # analyser-gt-desc.xfst for fst_type in ['norm', 'desc']: c_fst = langs_dir + lang_code + xfst_file + fst_type + '.xfst' name = checkAnalysis(fst_type, c_fst, name, lang_code) tree.write(os.path.join(out_dir_path,str(f)), xml_declaration=True,encoding='utf-8', method="xml") print('DONE ', f, '\n\n') def checkAnalysis(fst_type, fst, name, lang_code): _fst_type = fst_type _fst = fst _name = name _lang_code = lang_code spelling=_name.find('spelling').text print('... lemma ', str(spelling)) cmd = " | lookup -q -flags mbTT " + _fst p = Popen('echo "'+spelling+'"'+cmd, shell=True, stdout=PIPE, stderr=PIPE) out, err = p.communicate() c_analysis = '' filtered_analysis = '' print("|", out.decode().split('\n', 1 ),"|") current_analysis = filter(None,out.decode().split('\n\n')) for current_cohort in current_analysis: cc_list = current_cohort.split('\n') # set default analysis value to 'no' _name.find('spelling').set(_fst_type+'_fst', 'no') for analysis in cc_list: analysis = analysis.partition('\t')[2] if '+Prop+' in analysis: # due to tags in nob output if _lang_code is 'nob': _name.find('spelling').set(_fst_type+'_fst', 'yes') break # due to tags in non-nob output if analysis.endswith('+Nom') and not _lang_code is 'nob': _name.find('spelling').set(_fst_type+'_fst', 'yes') break # refine analysis: check with NÃ¥ebrie, Storfjellet and kommunenavn Lierne # this is easier done via xsl: both files are xml-files return _name if __name__ == "__main__": reload(sys) main()