#!/usr/bin/env python3 # -*- coding:utf-8 -*- import re, os, errno, cgi, json, xml, logging, numpy, copy #print('numpy version: '+numpy.version.version) import inspect import sys, codecs, locale, getopt import xml.etree.ElementTree as ET from subprocess import Popen, PIPE from operator import itemgetter from xml.dom.minidom import parse, parseString from imp import reload from collections import defaultdict def indent(elem, level=0): i = "\n" + level*" " if len(elem): if not elem.text or not elem.text.strip(): elem.text = i + " " if not elem.tail or not elem.tail.strip(): elem.tail = i for elem in elem: indent(elem, level+1) if not elem.tail or not elem.tail.strip(): elem.tail = i else: if level and (not elem.tail or not elem.tail.strip()): elem.tail = i def vrt_format(elem, level=0): i = "\n" if len(elem): if not elem.text or not elem.text.strip(): elem.text = i if not elem.tail or not elem.tail.strip(): elem.tail = i for elem in elem: vrt_format(elem, level+1) if not elem.tail or not elem.tail.strip(): elem.tail = i else: if not elem.tail or not elem.tail.strip(): elem.tail = i def main(): # for debugging purposes #Try printing inspect.stack() you can see current stack and pick whatever you want #file_name = __FILE__ #current_line_no = inspect.stack()[0][2] #current_function_name = inspect.stack()[0][3] # to be adjusted as needed if len(sys.argv) != 2: print('wrong number of arguments') sys.exit('Error') # check if the arg is a dir or a file and adjust the script according to this info in_dir = sys.argv[1] debug_index = '' out_dir = '_od_'+in_dir+'_'+debug_index logging.basicConfig(filename='proc_'+in_dir+'_'+debug_index+'.log',level=logging.DEBUG) cwd = os.getcwd() out_dir_path = os.path.join(cwd,out_dir) if not os.path.exists(out_dir_path): print('_od_ ::: ' + out_dir_path) os.mkdir(out_dir_path) # parameters to be adjusted as needed lang = 'sme' fst_type = 'hfstol' debug_fst = False rel_fst_file = '/src/analyser-disamb-gt-desc.'+fst_type langs_dir = '$GTHOME/langs/' lookup = '' lookup2cg = '' vislcg3 = '' if fst_type == 'xfst': plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE) olup, elup = plup.communicate() ###print("___ lookup is ",olup.decode()) if fst_type == 'hfstol': plup = Popen('which hfst-optimised-lookup', shell=True, stdout=PIPE, stderr=PIPE) olup, elup = plup.communicate() if not olup.decode(): print('No lookup found, please install it!') sys.exit('Error') lookup = olup.decode().strip() plup2cg = Popen('which lookup2cg', shell=True, stdout=PIPE, stderr=PIPE) olup2cg, elup2cg = plup2cg.communicate() if not olup2cg.decode(): print('No lookup2cg found, please install it!') sys.exit('Error') lookup2cg = olup2cg.decode().strip() pvislcg3 = Popen('which vislcg3', shell=True, stdout=PIPE, stderr=PIPE) ovislcg3, evislcg3 = pvislcg3.communicate() if not ovislcg3.decode(): print('No vislcg3 found, please install it!') sys.exit('Error') vislcg3 = ovislcg3.decode().strip() for root, dirs, files in os.walk(in_dir): # Walk directory tree print("Input dir {0} with {1} files ...".format(root, len(files))) for current_file in files: if len(files) == 0 : continue if current_file.endswith('.xml'): print('... processing ', str(root)) print('... processing ', str(current_file)) logging.warning(str(os.path.join(root,current_file))+'\n') current_out_dir_path = os.path.join(out_dir_path,root) print('... processing ', str(current_out_dir_path)) if not os.path.exists(current_out_dir_path): os.makedirs(current_out_dir_path, exist_ok=True) ### print('___ processed ', str(current_out_dir_path)) xml_tree = ET.parse(os.path.join(root,current_file), ET.XMLParser(encoding='utf-8')) f_root = xml_tree.getroot() content_el = f_root.find('.//body/dependency') content = content_el.text #attributes the a text element # title="Sámi_oskkuoahpahusplána" #lang="sme" #orig_lang="___" #gt_domain="bible" #first_name="___" #last_name="___" #nationality="___" #date="2011-01-01" #datefrom="20110101" #dateto="20110101" #timefrom="000000" #timeto="235959" f_title = '' f_genre = '' f_lang = '' f_orig_lang = '' f_first_name_author = '' f_last_name_author = '' f_nationality = '' year_value = '' f_date = '0000-00-00' f_datefrom = '00000000' f_dateto = '00000000' f_timefrom = '000000' f_timeto = '235959' f_title = f_root.find('.//header/title').text.strip() if f_root.find('.//header/title').text else '' f_genre = f_root.find('.//header/genre').attrib.get('code') if f_root.find('.//header/genre').attrib.get('code') else '' if f_root.find('.//header/author/person') is not None: f_first_name_author = f_root.find('.//header/author/person').attrib.get('firstname') f_last_name_author = f_root.find('.//header/author/person').attrib.get('lastname') f_nationality = f_root.find('.//header/author/person').attrib.get('nationality') f_lang = f_root.get('{http://www.w3.org/XML/1998/namespace}lang') if f_root.find('.//header/translated_from') is not None: f_orig_lang = f_root.find('.//header/translated_from').attrib.get('{http://www.w3.org/XML/1998/namespace}lang') # no element year in the header if f_root.find('.//header/year') is None: f_date = '0000-00-00' f_datefrom='00000000' f_dateto='00000000' else: year_value = str(f_root.find('.//header/year').text) #unknown if year_value == 'unknown': f_date = '0000-00-00' f_datefrom='00000000' f_dateto='00000000' #2018 elif re.match(r'^[0-9]{4,4}$', year_value): f_date = year_value + '-01-01' f_datefrom = year_value + '0101' f_dateto = year_value + '0101' #2011-2012 elif re.match(r'^([0-9]{4,4})\-([0-9]{4,4})$', year_value): first,last = re.split('\-', year_value) f_date = first + '-01-01' f_datefrom = first + '0101' f_dateto = last + '0101' #05.10.2004 elif re.match(r'^[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4,4}$', year_value): day,month,year = re.split('\.', year_value) f_date = year + '-' + month + '-' + day f_datefrom = year + month + day f_dateto = year + month + day else: f_date = '0000-00-00' f_datefrom='00000000' f_dateto='00000000' # logging.info('... title|' + f_title +'|_') # logging.info('... genre|' + f_genre + '|and domain|' + get_domain_string(f_genre) +'|_') # logging.info('... lang|' + f_lang +'|_') # logging.info('... orig_lang|' + f_orig_lang +'|_') # logging.info('... first name|' + f_first_name_author +'|_') # logging.info('... last name|' + f_last_name_author +'|_') # logging.info('... nationality|' + f_nationality +'|_') # logging.info('... date|' + f_date +'|_') # logging.info('... datefrom|' + f_datefrom +'|_') # logging.info('... dateto|' + f_dateto +'|_') # logging.info('... timefrom|' + f_timefrom +'|_') # logging.info('... timeto|' + f_timeto +'|_') f_root.clear() f_root.tag = 'text' f_root.set('title', f_title) f_root.set('lang', f_lang) f_root.set('orig_lang', f_orig_lang) f_root.set('first_name', f_first_name_author) f_root.set('last_name', f_last_name_author) f_root.set('nationality', f_nationality) f_root.set('gt_domain', get_domain_string(f_genre)) f_root.set('date', f_date) f_root.set('datefrom', f_datefrom) f_root.set('dateto', f_dateto) f_root.set('timefrom', f_timefrom) f_root.set('timeto', f_timeto) sentences = split_cohort(content, lang) # converting the analysis output into a suitable xml format for vrt transformation (vrt is the cwb input format) #for s_id, sentence in reversed(list(enumerate(sentences))): for s_id, sentence in enumerate(sentences): current_sentence = ET.SubElement(f_root, 'sentence') current_sentence.set('id', str(s_id+1)) positional_attributes = '\n' output_type = 'vrt' for token in sentence: ### logging.info('_current_token_|'+str(token)+'|_') if output_type == 'xml': current_word = ET.SubElement(current_sentence, 'word') # NB: the names of the xml attributes of the word element are sorted alphabetically, e.g., 'dcs' comes first! for i, positional_feature in enumerate(token): if i == 0: current_word.set('form', positional_feature) elif i == 1: current_word.set('lemma', positional_feature) elif i == 2: current_word.set('pos', positional_feature) elif i == 3: current_word.set('msd', positional_feature) elif i == 4: current_word.set('sID', positional_feature) elif i == 5: current_word.set('depRel', positional_feature) elif i == 6: current_word.set('pID', positional_feature) elif i == 7: current_word.set('dcs', positional_feature) if output_type == 'vrt': for i, positional_feature in enumerate(token): if i == 0: positional_attributes += positional_feature else: if i == 5: ###print('_posfit_|' + positional_feature + '|_posfit_') positional_feature = get_deprel_string(positional_feature) positional_attributes += '\t' + positional_feature positional_attributes += '\n' current_sentence.text = positional_attributes # delete the original dependency node dep_nodes = f_root.findall('.//body/dependency') while len(dep_nodes): parent = f_root.findall('.//body/dependency'+'/..')[0] parent.remove(dep_nodes[0]) dep_nodes = f_root.findall('.//body/dependency') vrt_format(f_root) xml_tree.write(os.path.join(current_out_dir_path,str(current_file)), xml_declaration=False,encoding='utf-8', method="xml") print('DONE ', current_file, '\n\n') def split_cohort(analysis, current_lang): _current_lang = current_lang debug_output = False #generate_der_comp_lemma = False _analysis = analysis # ambiguity hack: mask '<' as lemma, i.e., in the context of '\n\t\"<' _analysis = re.sub('\n\t\"<','\n\t\"\\<',_analysis) _analysis = re.sub('\n\t\">','\n\t\"\\>',_analysis) _analysis = re.sub(""":\s* \s* \s*""",':\n',_analysis) # another hack while waiting for the fix: delete all initial line of a file starting with a colon if _analysis.startswith(':'): _analysis = re.sub('^:[^\n]*\n','',_analysis) # - waiting for specifications on how these pieces of information will be deployed in the corpus # and presented in Korp: as substrings of the msd-string or as extra attribute-value pairs choosable # via the Korp interface? # - for now they ar just filtered away for extra_tag in ['','','','','','','']: _analysis = re.sub(' '+extra_tag,'',_analysis) wordform_filter = get_wordform_filter() for wordform in wordform_filter: _analysis = re.sub(' '+wordform,'',_analysis) ###logging.info('ANALYSIS_sentence|'+ _analysis + '|_') _sentences = [] for current_sentence in [x for x in re.split('\n\n', _analysis) if x != '']: sentence = [] ###print('...1_sentence|'+ current_sentence + '|_') # split the tokens+analyses based on ('"<' for current_cohort in [x for x in re.split('"<', current_sentence) if x != '']: # discard all lines starting with ':' (= giella format of hfst) cohort = re.split('\n:', current_cohort)[0] ### print('...2_cohort|'+ cohort + '|_') # split word form from analysis word_form = re.split('>"\n', cohort)[0] rest_cohort = re.split('>"\n', cohort) ### print('...3_wf|'+ word_form + '|_') ### print('...3_rc|'+ str(rest_cohort) + '|_') # further split non-disambiguated analyses based on '\n\t"' cohort_lines = re.split('\n\t"', rest_cohort[1]) ### print('...4_cohort_lines|'+ str(cohort_lines) + '|_') split_analysis = [] # explicit marking of boundaries between: # lemma, derivation strings, analysis of parts of compounds for line in cohort_lines: # delete '\t"' at the beginning of the analysis ###print('...5_ln_x1x|'+ line + '|_') line = line.lstrip('\t') ###print('...6_ln_x2x|'+ line + '|_') if line.startswith('"'): line = line[1:] ###print('...7_ln_x3x|'+ line + '|_') # delete '\n' at the end of the analysis line = line.rstrip('\n') # delimiter between lemma and msd (morpho-syntactic description) line = re.sub('\"\s','_∞_',line) # delimiter between the compound parts line = re.sub('\n\t','_™_',line) # keep track of the embedding of the different parts for compounds split into more than two parts line = re.sub('\t\"','_™_',line) line = re.sub('\t','_™_',line) split_analysis.append(line) ###print('_unsorted_cohort_|'+str(split_analysis)+'|__') # sort cohort sorted_analysis_lines = [] # if there are mixed analyses with and without Error tags # filter away all instances containing Error tags # however, if there are only analyses containing Error tags # sort the cohort and choose the first version filtered_analysis = [i for i in split_analysis if not ('Err/' in i)] if len(filtered_analysis) > 0: ### logging.info('_filtered_cohort_|'+str(filtered_analysis)+'|__') sorted_analysis_lines = sorted(filtered_analysis, key=lambda name:name.lower()) ### logging.info('_filtered_sorted_cohort_|'+str(sorted_analysis_lines)+'|__') else: ### logging.info('_unfiltered_unsorted_cohort_|'+str(split_analysis)+'|__') sorted_analysis_lines = sorted(split_analysis, key=lambda name:name.lower()) ### logging.info('_unfiltered_sorted_cohort_|'+str(sorted_analysis_lines)+'|__') ### if len(split_analysis) > 1: ### logging.info('_unsorted_cohort_|'+str(split_analysis)+'|__') ### if len(split_analysis) > 1: ### logging.info('_sorted_cohort_|'+str(sorted_analysis_lines)+'|__') # take the first analysis in case there are more than one non-disambiguated analyses used_analysis = sorted_analysis_lines[0] # filter all Err- and Sem-tags from the string used_analysis = re.sub('Err/[^\s]+\s','',used_analysis) used_analysis = re.sub('Sem/[^\s]+\s','',used_analysis) used_analysis = re.sub('Use/[^\s]+\s','',used_analysis) used_analysis = re.sub('Gram/[^\s]+\s','',used_analysis) used_analysis = re.sub('OLang/[^\s]+\s','',used_analysis) used_analysis = re.sub('Dial/[^\s]+\s','',used_analysis) used_analysis = re.sub('CmpN/[^\s]+\s','',used_analysis) used_analysis = re.sub('CmpNP/[^\s]+\s','',used_analysis) used_analysis = re.sub('G3+\s','',used_analysis) used_analysis = re.sub('v9+\s','',used_analysis) if debug_output: print('8_used_analysis_|'+str(used_analysis)+'|_') logging.info('8_used_analysis_|'+str(used_analysis)+'|_') # keep this strig for lemma generation original_analysis = used_analysis ex_index = used_analysis.find('Ex/') tm_index = used_analysis.find('_™_') current_line_no = inspect.stack()[0][2] ### print('_ex-tm_|'+str(ex_index)+'|'+str(tm_index)+'|__|'+str(current_line_no)+'|__') ### print('_|'+ word_form + '|_|' + str(used_analysis) + '|_') if 'Ex/' in used_analysis and not '_™_' in used_analysis: lemma = used_analysis.split('_∞_', 1)[0] msd = used_analysis.split('_∞_', 1)[1] swapped_msd = get_correct_pos(msd) used_analysis = lemma+'_∞_'+swapped_msd ###print('_LMSU__|'+ lemma + '|_|' + msd + '|_|' + swapped_msd+ '|_|' + used_analysis+ '|__LMSU_') # extra handling of the combination of derivation of the head # and compounding if 'Ex/' in used_analysis and '_™_' in used_analysis and ex_index < tm_index: #logging.info('_XXX_|'+used_analysis+'|_') lemma = used_analysis.split('_∞_', 1)[0] msd = used_analysis.split('_∞_', 1)[1] derivation = msd.split('_™_', 1)[0] rest = msd.split('_™_', 1)[1] swapped_msd = get_correct_pos(derivation) used_analysis = lemma+'_∞_'+swapped_msd+'_™_'+rest #logging.info('_YYY_|'+used_analysis+'|_') # put a clear delimiter between the (first) pos value and the rest of msd # in order to disambiguate from the rest of whitespaces parts = re.compile("(_∞_\w+\s?|_∞_\?\s?)").split(used_analysis, 1) ###logging.info('_parts_|'+str(parts)+'|_') parts[1] = parts[1].replace('_∞_', '').strip() lemma = parts[0] pos = parts[1] rest = parts[2] #logging.info('_LEN_the-parts_|'+str(len(parts))+'|_') #logging.info('_1_the-parts_|'+str(parts)+'|_') ex_in_r = rest.find('_©_') tm_in_r = rest.find('_™_') #current_line_no = inspect.stack()[0][2] #logging.info('_exr-tmr_|'+str(ex_in_r)+'|'+str(tm_in_r)+'|_|'+str(current_line_no)+'|_') #derivation-composition string dcs = '' #morpho-syntactic description msd = '' # split derivation/composition string from the rest of MSD # and put it in and extra tuple at the end of the tuple list, # otherwise add a default tuple '___' # no derivation, no composition if ex_in_r == -1 and tm_in_r == -1: msd = rest dcs = '___' ###logging.info('_msd_cds_1_|'+str(msd)+'|_|'+str(dcs)+'|_') # no derivation, but composition elif ((ex_in_r == -1 and not tm_in_r == -1) or (not ex_in_r == -1 and not tm_in_r == -1 and tm_in_r < ex_in_r)): msd, dcs = re.compile('_™_').split(rest, 1) dcs = '_™_'+dcs ###logging.info('_msd_cds_2_|'+str(msd)+'|_|'+str(dcs)+'|_') # derivation, but no composition elif ((not ex_in_r == -1 and tm_in_r == -1) or (not ex_in_r == -1 and not tm_in_r == -1 and ex_in_r < tm_in_r)): msd, dcs = re.compile('_©_').split(rest, 1) dcs = '_©_'+dcs ###logging.info('_msd_cds_3_|'+str(msd)+'|_|'+str(dcs)+'|_') # covered all relevant combinations? else: logging.info('_msd_cds_4_|'+str(rest)+'|_') # processing msd: splitting function label, selfID and parentID from the msd string msd_drel = re.compile(' #').split(msd) head = '' tail = '' ###print('_XXX_|'+str(msd_drel)+'|_') ###print('_YYY_|'+str(len(msd_drel))+'|_') if len(msd_drel) == 1: head = '___' ###print('IF ... head ', head) tail = msd_drel[0].lstrip('#') ###print('IF ... tail ', tail) else: ### here to debug head = msd_drel[0] ###print('ELSE ... head ', head) tail = msd_drel[1] ###print('ELSE ... tail ', tail) current_msd = '' fct_label = '' ### here to debug ###print('_the-tail_|'+str(tail)+'|_') self_id, parent_id = re.compile('->').split(tail) ###print('_ID_|'+str(self_id)+'|_|'+str(parent_id)+'|_') # splitting the fuction label if not head == '___': if not '@' in head: current_msd = head fct_label = 'X' #logging.info('_head_|'+str(head)+'|_') else: msd_fct = re.compile(' @').split(head) if len(msd_fct) == 1: current_msd = '___' fct_label = msd_fct[0].lstrip('@') #logging.info('_msd_fct_1_|'+str(msd_fct)+'|_') else: current_msd = msd_fct[0] fct_label = msd_fct[1] #logging.info('_msd_fct_2_|'+str(msd_fct)+'|_') else: current_msd = '___' fct_label = 'X' # TODO: update the description below # MDS can be complex and partitioned with specific separators: # 1. _™_ for each TAB for parts of the compounds # 2. _∞_ as separator between lemma and POS+MSD in a part of a compound # Ex.: ('juovlavuonasildi', 'sildi', # 'N', 'Sem/Ani Sg Nom @HNOUN_™__™_vuotna_∞_N Sem/Plc Cmp/SgGen Cmp_™__™__™_juovllat_∞_N Sem/Time Cmp/SgNom Cmp') # 3. _©_ as separator between the MSD and the derivation tags # Ex.: ('stellejuvvot', 'stellet', 'V', 'IV Inf @-FMAINV_©_Ex/V Ex/TV Der/PassL') # TODO: split Sem-tags and put them into as a separate position attribute for an updated corpus format for Korp # so that semantic attributes can be searchable via Korp interface if pos == '?': pos = '___' # analysed data as an 8-tuple: (WORD_FORM, LEMMA, POS, MSD, SELF_ID, FUNCTION_LABEL, PARENT_ID, DERIVATION-COMPOUNDING-STRING) # ambiguity hack: unmask '<' and '>' as lemma lemma = lemma.replace('\\','') ### DONE ### replace here lemma with the generated lemma; ### delete derivation/composition tags ### CAVEAT: the tuple will not be a 8-tuple but a 7-tuple #lemma generation string lemma_generation_string = '' generated_lemma = '' #if generate_der_comp_lemma: if 'Ex/' in original_analysis or '_™_' in original_analysis: lemma_generation_string = get_generation_string(original_analysis, pos, current_msd, _current_lang) if lemma_generation_string: ### logging.info('xxx_lem-gen-str_|'+lemma_generation_string+'|_') generated_lemma = generate_lemma(lemma_generation_string, _current_lang) # msd clean up ### logging.info('_1_msd_|' + current_msd + '|_') current_msd = re.sub('IV\s','',current_msd) current_msd = re.sub('TV\s','',current_msd) current_msd = re.sub('Relc','Rel',current_msd) current_msd = re.sub('Dyn','',current_msd) current_msd = re.sub('Known','',current_msd) current_msd = current_msd.strip() current_msd = re.sub('/','_',current_msd) current_msd = re.sub('\s','.',current_msd) # add the pos as fist element of the msd string if current_msd == '___': current_msd = pos else: current_msd = pos + '.' + current_msd ### logging.info('_2_msd_|' + current_msd + '|_') analysis_tuple = () ### logging.info('_generated_lemma_|' + generated_lemma + '|_') if generated_lemma == '': analysis_tuple = (word_form, lemma, pos, current_msd, self_id, fct_label, parent_id) else: analysis_tuple = (word_form, generated_lemma, pos, current_msd, self_id, fct_label, parent_id) ### logging.info("_current_tuple_|"+str(analysis_tuple)+"|_") #filter away '¶', which is used only to have some clause boundary if there is no one there # TODO: as well as other strings that are most likely noise if not word_form == '¶': sentence.append(analysis_tuple) # filter empty "sentences" due to filtering of '¶' if sentence: ###logging.info("_analysed_token_tuples_|"+str(sentence)+"|_") _sentences.append(sentence) return _sentences; def get_correct_pos(input_string): _input_string = input_string ###print('_instr_|' + _input_string + '|_') der_pos_msd = re.split('( V | N | A | Adv | Po )', input_string) ###print('_der_pos_msd_|' + str(der_pos_msd) + '|_') swapped_string = der_pos_msd[1].strip()+' '+der_pos_msd[2].strip()+'_©_'+der_pos_msd[0] return swapped_string; def get_domain_string(domain): domain_mapping = { "admin": "administration", "bible": "bible", "facta": "facts", "ficti": "fiction", "literature": "fiction", "law": "law", "laws": "law", "news": "news", "science": "science", "blogs": "blog", "": "" } _domain = domain_mapping[domain] return _domain; def get_deprel_string(deprel): ### print('___deprep___'+deprel) deprel_mapping = { ">A": "→A", ">ADVL": "→ADVL", ">CC": "→CC", ">N": "→N", ">Num": "→Num", ">P": "→P", ">Pron": "→Pron", "": "-FADVL→", "-FMAINV": "-FMAINV", "-FOBJ>": "-FOBJ→", "-FSUBJ>": "-FSUBJ→", "A<": "A←", "ADVL": "ADVL", "ADVL>": "ADVL→", "ADVL>CS": "ADVL→CS", "ADVL<": "ADVL←", "APP-ADVL<": "APP-ADVL←", "APP-N<": "APP-N←", "APP-Pron<": "APP-Pron←", "CNP": "CNP", "COMP-CS<": "COMP-CS←", "CVP": "CVP", "FAUX": "FAUX", "-FAUX": "-FAUX", "-FAUXV": "-FAUXV", "FMV": "FMV", "FMVdic": "FMVdic", "FS-": "FS-ADVL→", "FS-IAUX": "FS-IAUX", "FS-IMV": "FS-IMV", "FS-N<": "FS-N←", "FS-N": "OBJ→", "OPRED>": "OPRED→", "P<": "P←", "PCLE": "PCLE", "Pron<": "Pron←", "S<": "S←", "SPRED>": "SPRED→", "SPRED": "SUBJ→", "VOC": "VOC", "SPRED": "SPRED", "SUBJ": "SUBJ", "X": "X" } _deprel = deprel_mapping[deprel] return _deprel; def get_wordform_filter(): _wordform_filter = [ '"< suohkanbargi>"', '"< suohkanbargiide>"', '"< suohkanbargiin>"', '"< suohkanbargit>"', '"< suohkanbáhpa>"', '"< suohkanbáhpain>"', '"< suohkanbáhpas>"', '"< suohkanbáhppa>"', '"< suohkanbáhppan>"', '"< suohkanbáhppavirgi>"', '"< suohkanbáhppavirgái>"', '"< suohkandoaktáris>"', '"< suohkandoavtterbálvalusas>"', '"< suohkandoavttir>"', '"< suohkanekonomiija>"', '"< suohkangirji>"', '"< suohkangirjji>"', '"< suohkanhálddahusas>"', '"< suohkanluottain>"', '"< suohkanmearka>"', '"< suohkanpolitihkar>"', '"< suohkanpolitihkarin>"', '"< suohkanpolitihkka>"', '"< suohkanpolitihkkarat>"', '"< suohkanpolitihkkariid>"', '"< suohkanpolitihkkariiguin>"', '"< suohkanpolitihkkariin>"', '"< suohkanpolitihkkár>"', '"< suohkanpolitihkkárat>"', '"< suohkanpolitihkkáriid>"', '"< suohkanpsykologa>"', '"< suohkanrádjái>"', '"< suohkanráji>"', '"< suohkanrájiid>"', '"< suohkanrájit>"', '"< suohkanstivra>"', '"< suohkanstivracoahkin>"', '"< suohkanstivralahttu>"', '"< suohkanstivralahtut>"', '"< suohkanstivraáirasat>"', '"< suohkanstivraáirasiid>"', '"< suohkanstivraáirras>"', '"< suohkanstivraášši>"', '"< suohkanstivračoahkkimis>"', '"< suohkanstivračoahkkin>"', '"< suohkanstivrii>"', '"< suohkanstivrra>"', '"< suohkanstivrraid>"', '"< suohkanstivrraláhttu>"', '"< suohkanstivrras>"', '"< suohkanstivrrat>"', '"< suohkanstivrraválga>"', '"< suohkanstivrraáirras>"', '"< suohkanstivrračoahkkima>"', '"< suohkanviesu>"', '"< suohkanviesus>"', '"< suohkanvissui>"', '"< suohkanvisteseaidnái>"', '"< suohkanvistti>"', '"< suohkanváldodoavttir>"', '"< suohkanválga>"', '"< suohkanválggaid>"', '"< suohkanválggaide>"', '"< suohkanválggain>"', '"< suohkanválggas>"', '"< suohkanválggat>"', '"< suohkanválgii>"', '"< suohkanássit>"', '"< suohkanšibitdoavttir>"', '"<.>"', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '""', '"<Čakčam>"', '"<čakčam>"', ] return _wordform_filter; def get_generation_string(in_analysis, in_pos, in_msd, in_lang): _used_analysis = in_analysis _pos = in_pos _msd = in_msd _lang = in_lang _string2generate = '' _lemma = _used_analysis.split('_∞_', 1)[0] _tail = _used_analysis.split('_∞_', 1)[1] # ignore function and dependence relation here _tail = re.sub('\s@[^\s]+','',_tail) _tail = re.sub('\s#\d+->\d+','',_tail) ex_index = _tail.find('Ex/') tm_index = _tail.find('_™_') current_line_no = inspect.stack()[0][2] ### print('_ex-tm_|'+str(ex_index)+'|'+str(tm_index)+'|__|'+str(current_line_no)+'|__') if 'Ex/' in _tail: if (not '_™_' in _tail) or ('_™_' in _tail and ex_index < tm_index): _string2generate = _lemma+'_∞1EX∞_'+_tail if '_™_' in _tail: if (not 'Ex/' in _tail) or ('Ex/' in _tail and tm_index < ex_index): _string2generate = _lemma+'_∞1CO∞_'+_tail ### replace all delimiter by '+' and '_™_' by '#' _string2generate = re.sub('\s+','+',_string2generate) _string2generate = re.sub('_∞1EX∞_','+',_string2generate) _string2generate = re.sub('Ex/','',_string2generate) _string2generate = re.sub('_∞1CO∞_','+',_string2generate) _string2generate = re.sub('_∞_','+',_string2generate) _string2generate = re.sub('(_™_)+','_™_',_string2generate) ### construct the correct order of generation for compund parts parts = _string2generate.split('_™_') swapped_string = '' if len(parts) > 1: ###print('_the_parts_|'+str(parts)+'|_') for i, p in reversed(list(enumerate(parts))): swapped_string += p if i > 0: swapped_string += '#' _string2generate = swapped_string ### logging.info('_bfr_str2gen_|'+_string2generate+'|_') # replace inflection tags of the analysed string with the corresponding baseform tags str_first = _string2generate.rpartition('+'+_pos+'+')[0] str_last = _string2generate.rpartition('+'+_pos+'+')[2] ### logging.info('_mid_str2gen_|'+str(_string2generate.rpartition('+'+_pos+'+'))+'|_') if _pos == 'V': _string2generate = str_first + '+' + _pos + '+' + 'Inf' if _pos == 'N': _string2generate = str_first + '+' + _pos + '+' + 'Sg+Nom' if _pos == 'A': if _lang == 'sma': if 'Comp' in str_last: _string2generate = str_first + '+' + _pos + '+' + 'Comp+Attr' elif 'Superl' in str_last: _string2generate = str_first + '+' + _pos + '+' + 'Superl+Attr' else: _string2generate = str_first + '+' + _pos + '+' + 'Attr' else: _string2generate = str_first + '+' + _pos + '+' + 'Sg+Nom' ### logging.info('_afr_str2gen_|'+_string2generate+'|_') return _string2generate; def generate_lemma(in_string, c_lang): _in_string = in_string _current_lang = c_lang _analysis_lemma = re.split('\+', _in_string, 1)[0] _generated_lemma = 'TODO_'+_in_string generation_cmd = ' | hfst-lookup -q $GTHOME/langs/'+ _current_lang + '/src/generator-gt-norm.hfstol' pFST = Popen('echo \''+_in_string+'\''+generation_cmd, shell=True, stdout=PIPE, stderr=PIPE) outFST, errFST = pFST.communicate() outFST = outFST.decode() outFST = re.split('\n', outFST, 1)[0] _generated_lemma = re.split('\t', outFST)[1] if _generated_lemma.endswith('+?'): _generated_lemma = _analysis_lemma ### logging.info('___gen-out___ ' + outFST + '______') return _generated_lemma; if __name__ == "__main__": reload(sys) main()