# -*- coding:utf-8 -*-
"""Morphologically analyse the segments of .tmx translation-memory files.

Walks IN_DIR for ``*.tmx`` files, pushes every ``<tuv lang="LANG">`` segment
through the Giellatekno pipeline (preprocess | lookup | lookup2cg | vislcg3),
inserts the cleaned-up analysis as an ``<analysis>`` element next to each
segment, writes the result to ``out_LANG_IN_DIR/`` and moves the processed
input file to ``done_GENRE/``.

Usage: script.py LANG IN_DIR [GENRE]
"""
import re, os, errno, cgi, json, xml
import sys, codecs, locale, getopt
import shutil
import xml.etree.ElementTree as ET
from subprocess import Popen, PIPE
from operator import itemgetter
from xml.dom.minidom import parse, parseString
from importlib import reload
from collections import defaultdict


def _clean_analysis(analysis, lang):
    """Strip error/semantic/homonymy tags from one reading and normalise it.

    Returns the analysis string with tags joined by '.' (e.g. ``N.Sg.Nom``),
    or '___' when the analyser produced an unknown ('?') reading.
    NOTE(review): the source text of this replace chain had its whitespace
    collapsed; the double-space patterns below are a reconstruction — verify
    against the original script.
    """
    analysis = analysis.partition('@')[0]
    analysis = analysis.replace('Err/Orth', '')
    analysis = analysis.replace(' <' + lang + '>', '')
    analysis = analysis.replace('  ', '')
    for tag in (' Sem/Date', ' Sem/Org', ' Sem/Sur', ' Sem/Fem', ' Sem/Mal',
                ' Sem/Plc', ' Sem/Obj', ' Sem/Adr'):
        analysis = analysis.replace(tag, '')
    analysis = analysis.replace('Sem/Adr ', '')
    analysis = analysis.replace(' Sem/Year', '')
    analysis = analysis.replace(' IV', '')
    analysis = analysis.replace(' TV', '')
    for tag in ('v1 ', 'v2 ', 'Hom1 ', 'Hom2 '):
        analysis = analysis.replace(tag, '')
    analysis = analysis.replace('/', '_')
    if analysis.startswith('Arab Num'):
        analysis = analysis.replace('Arab Num', 'Num Arab')
    analysis = analysis.strip()
    if '?' in analysis:
        # Unknown word: the analyser marks failed lookups with '?'.
        analysis = '___'
    analysis = analysis.strip()
    analysis = analysis.replace('  ', ' ')
    analysis = analysis.replace(' ', '.')
    return analysis


def _format_cohort(cohort, lang):
    """Turn one vislcg3 cohort into a 'wform<TAB>lemma<TAB>pos<TAB>analysis' line."""
    cc_list = cohort.split('\n\t')
    wform = cc_list[0].strip()
    # The word form arrives as '"<form>"'; peel off the cohort delimiters.
    if wform.startswith('"<'):
        wform = wform[2:]
    if wform.endswith('>"'):
        wform = wform[:-2]
    wform = wform.replace(' ', '_')
    # Take the alphabetically first reading (matches the original behaviour).
    l_a = sorted(cc_list[1:])[0]
    lemma = l_a.partition('" ')[0].strip()
    lemma = lemma.replace('#', '').replace(' ', '_')
    if lemma.startswith('"'):
        lemma = lemma[1:]
    analysis = _clean_analysis(l_a.partition('" ')[2], lang)
    pos = analysis.partition('.')[0]
    return wform + '\t' + lemma + '\t' + pos + '\t' + analysis


def _analyse_segment(seg_txt, pipeline, lang):
    """Run one segment through the shell pipeline; return the formatted lines.

    The text is written to the pipeline's stdin instead of being spliced into
    an ``echo '...'`` shell string — this fixes the breakage/injection that an
    apostrophe in the segment used to cause.
    """
    p = Popen(pipeline, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate((seg_txt + '\n').encode())
    c_analysis = ''
    for cohort in filter(None, out.decode().split('\n"<')):
        c_analysis = c_analysis + '\n' + _format_cohort(cohort, lang)
    return c_analysis


def main():
    # Expects 2 (and 1 optional) parameters:
    # language to analyse, input directory (and genre).
    lang = sys.argv[1]
    in_dir = sys.argv[2]
    genre_str = sys.argv[3] if len(sys.argv) == 4 else ''

    out_dir = 'out_' + lang + '_' + in_dir
    # BUG FIX: was "'done_' + genre" — 'genre' was undefined here (NameError).
    done_dir = 'done_' + genre_str
    cwd = os.getcwd()
    out_dir_path = os.path.join(cwd, out_dir)
    done_dir_path = os.path.join(cwd, done_dir)
    os.makedirs(out_dir_path, exist_ok=True)
    os.makedirs(done_dir_path, exist_ok=True)

    # Bail out early if the 'lookup' transducer tool is not installed.
    plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE)
    olup, elup = plup.communicate()
    if not olup.decode():
        sys.exit()

    # $GTHOME is expanded by the shell when the pipeline runs (shell=True).
    langs_dir = '$GTHOME/langs/'
    abbr_file = langs_dir + lang + '/tools/tokenisers/abbr.txt'
    abs_xfst_file = langs_dir + lang + '/src/analyser-disamb-gt-desc.xfst'
    disamb_file = langs_dir + lang + '/src/syntax/disambiguator.cg3'
    pipeline = ('preprocess --abbr ' + abbr_file
                + ' | lookup -q -flags mbTT ' + abs_xfst_file
                + ' | lookup2cg | vislcg3 -g ' + disamb_file)

    for root, dirs, files in os.walk(in_dir):  # Walk the input directory tree
        for f in files:
            if not f.endswith('tmx'):
                continue
            print('... processing ', str(f))
            tree = ET.parse(os.path.join(root, f))
            f_root = tree.getroot()
            header = f_root.find('.//header')
            genre = ET.Element('genre')
            if genre_str:
                genre.text = genre_str
            header.insert(1, genre)
            for tuv in f_root.findall('.//tuv[@lang="' + lang + '"]'):
                seg = tuv.findall('seg')
                seg_txt = seg[0].text
                print('... seg ', str(seg_txt))
                analysis = ET.Element('analysis')
                analysis.text = _analyse_segment(seg_txt, pipeline, lang) + '\n'
                tuv.insert(1, analysis)
            tree.write(os.path.join(out_dir_path, str(f)),
                       xml_declaration=True, encoding='utf-8', method='xml')
            print('DONE ', f, '\n\n')
            # shutil.move replaces the previous "mv ..." shell command.
            shutil.move(os.path.join(root, f), done_dir_path)
            print('MOVED file ', f, ' in done folder \n\n')


if __name__ == "__main__":
    # The old "reload(sys)" call was a Python-2 setdefaultencoding relic and
    # is a no-op under Python 3, so it has been dropped.
    main()