#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import re
import os
import logging
import inspect
import sys
import xml.etree.ElementTree as ET
from subprocess import Popen, PIPE
def indent(elem, level=0):
i = "\n" + level*" "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for elem in elem:
indent(elem, level+1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = i
def vrt_format(elem, level=0):
i = "\n"
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i
if not elem.tail or not elem.tail.strip():
elem.tail = i
for elem in elem:
vrt_format(elem, level+1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
else:
if not elem.tail or not elem.tail.strip():
elem.tail = i
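# A minimal usage sketch (assumed input) of the whitespace handling above:
# vrt_format() puts every element on its own line, which is the layout the
# CWB/Korp VRT importer expects, while indent() pretty-prints with nesting.
#
#   el = ET.fromstring('<text><sentence>tok1\ttok2</sentence></text>')
#   vrt_format(el)
#   # ET.tostring(el, encoding='unicode') now yields:
#   # <text>
#   # <sentence>tok1	tok2</sentence>
#   # </text>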
def main():
# for debugging purposes
    #Try printing inspect.stack(); you can see the current stack and pick whatever you want
    #file_name = __file__
#current_line_no = inspect.stack()[0][2]
#current_function_name = inspect.stack()[0][3]
# to be adjusted as needed
    if len(sys.argv) != 2:
        sys.exit('Usage: ' + sys.argv[0] + ' <input_directory>')
# check if the arg is a dir or a file and adjust the script according to this info
    in_dir = sys.argv[1].rstrip(os.sep)
debug_index = ''
out_dir = '_od_'+in_dir+'_'+debug_index
logging.basicConfig(filename='proc_'+in_dir+'_'+debug_index+'.log',level=logging.DEBUG)
cwd = os.getcwd()
out_dir_path = os.path.join(cwd,out_dir)
if not os.path.exists(out_dir_path):
print('_od_ ::: ' + out_dir_path)
os.mkdir(out_dir_path)
# parameters to be adjusted as needed
lang = 'sme'
fst_type = 'hfstol'
debug_fst = False
rel_fst_file = '/src/analyser-disamb-gt-desc.'+fst_type
langs_dir = '$GTHOME/langs/'
lookup = ''
lookup2cg = ''
vislcg3 = ''
    if fst_type == 'xfst':
        plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE)
    elif fst_type == 'hfstol':
        plup = Popen('which hfst-optimized-lookup', shell=True, stdout=PIPE, stderr=PIPE)
    else:
        sys.exit('Error: unknown fst_type ' + fst_type)
    olup, elup = plup.communicate()
    ###print("___ lookup is ",olup.decode())
    if not olup.decode():
        print('No lookup found, please install it!')
        sys.exit('Error')
    lookup = olup.decode().strip()
plup2cg = Popen('which lookup2cg', shell=True, stdout=PIPE, stderr=PIPE)
olup2cg, elup2cg = plup2cg.communicate()
if not olup2cg.decode():
print('No lookup2cg found, please install it!')
sys.exit('Error')
lookup2cg = olup2cg.decode().strip()
pvislcg3 = Popen('which vislcg3', shell=True, stdout=PIPE, stderr=PIPE)
ovislcg3, evislcg3 = pvislcg3.communicate()
if not ovislcg3.decode():
print('No vislcg3 found, please install it!')
sys.exit('Error')
vislcg3 = ovislcg3.decode().strip()
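    # NB: the three `which` probes above could also use the standard library;
    # a minimal equivalent sketch (shutil.which() returns the path or None):
    #
    #   import shutil
    #   lookup = shutil.which('hfst-optimized-lookup')
    #   if lookup is None:
    #       sys.exit('No lookup found, please install it!')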
for root, dirs, files in os.walk(in_dir): # Walk directory tree
print("Input dir {0} with {1} files ...".format(root, len(files)))
for current_file in files:
if current_file.endswith('.xml'):
print('... processing ', str(root))
print('... processing ', str(current_file))
logging.warning(str(os.path.join(root,current_file))+'\n')
current_out_dir_path = os.path.join(out_dir_path,root)
print('... processing ', str(current_out_dir_path))
                os.makedirs(current_out_dir_path, exist_ok=True)
### print('___ processed ', str(current_out_dir_path))
xml_tree = ET.parse(os.path.join(root,current_file), ET.XMLParser(encoding='utf-8'))
f_root = xml_tree.getroot()
                content_el = f_root.find('.//body/dependency')
                if content_el is None or not content_el.text:
                    continue  # defensive: skip files without a dependency analysis
                content = content_el.text
                # attributes of the text element
# title="Sámi_oskkuoahpahusplána"
#lang="sme"
#orig_lang="___"
#gt_domain="bible"
#first_name="___"
#last_name="___"
#nationality="___"
#date="2011-01-01"
#datefrom="20110101"
#dateto="20110101"
#timefrom="000000"
#timeto="235959"
f_title = ''
f_genre = ''
f_lang = ''
f_orig_lang = ''
f_first_name_author = ''
f_last_name_author = ''
f_nationality = ''
year_value = ''
f_date = '0000-00-00'
f_datefrom = '00000000'
f_dateto = '00000000'
f_timefrom = '000000'
f_timeto = '235959'
                f_title = (f_root.findtext('.//header/title') or '').strip()
                genre_el = f_root.find('.//header/genre')
                f_genre = genre_el.attrib.get('code', '') if genre_el is not None else ''
                person_el = f_root.find('.//header/author/person')
                if person_el is not None:
                    # default to '' so that Element.set() below never receives None
                    f_first_name_author = person_el.attrib.get('firstname') or ''
                    f_last_name_author = person_el.attrib.get('lastname') or ''
                    f_nationality = person_el.attrib.get('nationality') or ''
                f_lang = f_root.get('{http://www.w3.org/XML/1998/namespace}lang') or ''
                translated_el = f_root.find('.//header/translated_from')
                if translated_el is not None:
                    f_orig_lang = translated_el.attrib.get('{http://www.w3.org/XML/1998/namespace}lang') or ''
# no element year in the header
if f_root.find('.//header/year') is None:
f_date = '0000-00-00'
f_datefrom='00000000'
f_dateto='00000000'
else:
year_value = str(f_root.find('.//header/year').text)
#unknown
if year_value == 'unknown':
f_date = '0000-00-00'
f_datefrom='00000000'
f_dateto='00000000'
#2018
                    elif re.match(r'^[0-9]{4}$', year_value):
                        f_date = year_value + '-01-01'
                        f_datefrom = year_value + '0101'
                        f_dateto = year_value + '0101'
                    #2011-2012
                    elif re.match(r'^[0-9]{4}-[0-9]{4}$', year_value):
                        first, last = re.split('-', year_value)
                        f_date = first + '-01-01'
                        f_datefrom = first + '0101'
                        f_dateto = last + '0101'
                    #05.10.2004
                    elif re.match(r'^[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{4}$', year_value):
                        day, month, year = re.split(r'\.', year_value)
                        # zero-pad single-digit days/months for a well-formed date
                        day, month = day.zfill(2), month.zfill(2)
                        f_date = year + '-' + month + '-' + day
                        f_datefrom = year + month + day
                        f_dateto = year + month + day
                    else:
                        f_date = '0000-00-00'
                        f_datefrom = '00000000'
                        f_dateto = '00000000'
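                    # e.g. year_value '05.10.2004' yields f_date '2004-10-05'
                    # and f_datefrom = f_dateto = '20041005'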
# logging.info('... title|' + f_title +'|_')
# logging.info('... genre|' + f_genre + '|and domain|' + get_domain_string(f_genre) +'|_')
# logging.info('... lang|' + f_lang +'|_')
# logging.info('... orig_lang|' + f_orig_lang +'|_')
# logging.info('... first name|' + f_first_name_author +'|_')
# logging.info('... last name|' + f_last_name_author +'|_')
# logging.info('... nationality|' + f_nationality +'|_')
# logging.info('... date|' + f_date +'|_')
# logging.info('... datefrom|' + f_datefrom +'|_')
# logging.info('... dateto|' + f_dateto +'|_')
# logging.info('... timefrom|' + f_timefrom +'|_')
# logging.info('... timeto|' + f_timeto +'|_')
f_root.clear()
f_root.tag = 'text'
f_root.set('title', f_title)
f_root.set('lang', f_lang)
f_root.set('orig_lang', f_orig_lang)
f_root.set('first_name', f_first_name_author)
f_root.set('last_name', f_last_name_author)
f_root.set('nationality', f_nationality)
f_root.set('gt_domain', get_domain_string(f_genre))
f_root.set('date', f_date)
f_root.set('datefrom', f_datefrom)
f_root.set('dateto', f_dateto)
f_root.set('timefrom', f_timefrom)
f_root.set('timeto', f_timeto)
sentences = split_cohort(content, lang)
# converting the analysis output into a suitable xml format for vrt transformation (vrt is the cwb input format)
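                # sketch of the VRT shape produced below (one token per line,
                # tab-separated positional attributes; values are assumed):
                #
                #   <sentence id="1">
                #   Mun	mun	Pron	Pron.Pers.Sg1.Nom	1	SUBJ→	2
                #   ...
                #   </sentence>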
#for s_id, sentence in reversed(list(enumerate(sentences))):
for s_id, sentence in enumerate(sentences):
current_sentence = ET.SubElement(f_root, 'sentence')
current_sentence.set('id', str(s_id+1))
positional_attributes = '\n'
output_type = 'vrt'
for token in sentence:
### logging.info('_current_token_|'+str(token)+'|_')
if output_type == 'xml':
current_word = ET.SubElement(current_sentence, 'word')
# NB: the names of the xml attributes of the word element are sorted alphabetically, e.g., 'dcs' comes first!
for i, positional_feature in enumerate(token):
if i == 0:
current_word.set('form', positional_feature)
elif i == 1:
current_word.set('lemma', positional_feature)
elif i == 2:
current_word.set('pos', positional_feature)
elif i == 3:
current_word.set('msd', positional_feature)
elif i == 4:
current_word.set('sID', positional_feature)
elif i == 5:
current_word.set('depRel', positional_feature)
elif i == 6:
current_word.set('pID', positional_feature)
elif i == 7:
current_word.set('dcs', positional_feature)
if output_type == 'vrt':
for i, positional_feature in enumerate(token):
if i == 0:
positional_attributes += positional_feature
else:
if i == 5:
###print('_posfit_|' + positional_feature + '|_posfit_')
positional_feature = get_deprel_string(positional_feature)
positional_attributes += '\t' + positional_feature
positional_attributes += '\n'
current_sentence.text = positional_attributes
                # delete the original dependency node(s)
                for parent in f_root.findall('.//body/dependency/..'):
                    for dep_node in parent.findall('dependency'):
                        parent.remove(dep_node)
vrt_format(f_root)
xml_tree.write(os.path.join(current_out_dir_path,str(current_file)),
xml_declaration=False,encoding='utf-8',
method="xml")
print('DONE ', current_file, '\n\n')
def split_cohort(analysis, current_lang):
_current_lang = current_lang
debug_output = False
#generate_der_comp_lemma = False
_analysis = analysis
    # ambiguity hack: mask '<' and '>' when they occur as the lemma, i.e., in the context of '\n\t"<'
    _analysis = re.sub('\n\t\"<', '\n\t\"\\<', _analysis)
    _analysis = re.sub('\n\t\">', '\n\t\"\\>', _analysis)
    # collapse a colon followed by blank lines into ':\n'
    _analysis = re.sub(r':\s*\n\s*\n\s*', ':\n', _analysis)
# another hack while waiting for the fix: delete all initial line of a file starting with a colon
if _analysis.startswith(':'):
_analysis = re.sub('^:[^\n]*\n','',_analysis)
# - waiting for specifications on how these pieces of information will be deployed in the corpus
# and presented in Korp: as substrings of the msd-string or as extra attribute-value pairs choosable
# via the Korp interface?
    # - for now they are just filtered away
    # NB: the concrete tag list is empty in this copy of the script; guard against
    #     empty entries, since re.sub(' ' + '', '', ...) would delete every space
    for extra_tag in ['', '', '', '', '', '', '']:
        if extra_tag:
            _analysis = re.sub(' ' + re.escape(extra_tag), '', _analysis)
wordform_filter = get_wordform_filter()
    for wordform in wordform_filter:
        _analysis = re.sub(' ' + re.escape(wordform), '', _analysis)
###logging.info('ANALYSIS_sentence|'+ _analysis + '|_')
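    # shape of the disambiguated CG stream parsed below (assumed example):
    #
    #   "<Mun>"
    #   	"mun" Pron Pers Sg1 Nom @SUBJ> #1->2
    #   "<boađán>"
    #   	"boahtit" V IV Ind Prs Sg1 @FMAINV #2->0
    #
    # cohorts start at '"<', sentences are separated by blank lines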
_sentences = []
for current_sentence in [x for x in re.split('\n\n', _analysis) if x != '']:
sentence = []
###print('...1_sentence|'+ current_sentence + '|_')
        # split into cohorts (token + analyses) at '"<'
for current_cohort in [x for x in re.split('"<', current_sentence) if x != '']:
# discard all lines starting with ':' (= giella format of hfst)
cohort = re.split('\n:', current_cohort)[0]
### print('...2_cohort|'+ cohort + '|_')
            # split the word form from its analysis
            rest_cohort = re.split('>"\n', cohort)
            word_form = rest_cohort[0]
### print('...3_wf|'+ word_form + '|_')
### print('...3_rc|'+ str(rest_cohort) + '|_')
# further split non-disambiguated analyses based on '\n\t"'
cohort_lines = re.split('\n\t"', rest_cohort[1])
### print('...4_cohort_lines|'+ str(cohort_lines) + '|_')
split_analysis = []
# explicit marking of boundaries between:
# lemma, derivation strings, analysis of parts of compounds
for line in cohort_lines:
# delete '\t"' at the beginning of the analysis
###print('...5_ln_x1x|'+ line + '|_')
line = line.lstrip('\t')
###print('...6_ln_x2x|'+ line + '|_')
if line.startswith('"'):
line = line[1:]
###print('...7_ln_x3x|'+ line + '|_')
# delete '\n' at the end of the analysis
line = line.rstrip('\n')
# delimiter between lemma and msd (morpho-syntactic description)
line = re.sub('\"\s','_∞_',line)
# delimiter between the compound parts
line = re.sub('\n\t','_™_',line)
# keep track of the embedding of the different parts for compounds split into more than two parts
line = re.sub('\t\"','_™_',line)
line = re.sub('\t','_™_',line)
split_analysis.append(line)
###print('_unsorted_cohort_|'+str(split_analysis)+'|__')
# sort cohort
sorted_analysis_lines = []
# if there are mixed analyses with and without Error tags
# filter away all instances containing Error tags
# however, if there are only analyses containing Error tags
# sort the cohort and choose the first version
            filtered_analysis = [i for i in split_analysis if 'Err/' not in i]
if len(filtered_analysis) > 0:
### logging.info('_filtered_cohort_|'+str(filtered_analysis)+'|__')
sorted_analysis_lines = sorted(filtered_analysis, key=lambda name:name.lower())
### logging.info('_filtered_sorted_cohort_|'+str(sorted_analysis_lines)+'|__')
else:
### logging.info('_unfiltered_unsorted_cohort_|'+str(split_analysis)+'|__')
sorted_analysis_lines = sorted(split_analysis, key=lambda name:name.lower())
### logging.info('_unfiltered_sorted_cohort_|'+str(sorted_analysis_lines)+'|__')
### if len(split_analysis) > 1:
### logging.info('_unsorted_cohort_|'+str(split_analysis)+'|__')
### if len(split_analysis) > 1:
### logging.info('_sorted_cohort_|'+str(sorted_analysis_lines)+'|__')
# take the first analysis in case there are more than one non-disambiguated analyses
used_analysis = sorted_analysis_lines[0]
# filter all Err- and Sem-tags from the string
            used_analysis = re.sub(r'Err/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'Sem/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'Use/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'Gram/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'OLang/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'Dial/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'CmpN/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'CmpNP/[^\s]+\s', '', used_analysis)
            used_analysis = re.sub(r'G3\s', '', used_analysis)
            used_analysis = re.sub(r'v9\s', '', used_analysis)
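            # e.g. (assumed shape):
            #   'beana_∞_N Sem/Ani Sg Nom @SUBJ> #1->2'
            # becomes, after the substitutions above:
            #   'beana_∞_N Sg Nom @SUBJ> #1->2'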
if debug_output:
print('8_used_analysis_|'+str(used_analysis)+'|_')
logging.info('8_used_analysis_|'+str(used_analysis)+'|_')
            # keep this string for lemma generation
original_analysis = used_analysis
ex_index = used_analysis.find('Ex/')
tm_index = used_analysis.find('_™_')
current_line_no = inspect.stack()[0][2]
### print('_ex-tm_|'+str(ex_index)+'|'+str(tm_index)+'|__|'+str(current_line_no)+'|__')
### print('_|'+ word_form + '|_|' + str(used_analysis) + '|_')
if 'Ex/' in used_analysis and not '_™_' in used_analysis:
lemma = used_analysis.split('_∞_', 1)[0]
msd = used_analysis.split('_∞_', 1)[1]
swapped_msd = get_correct_pos(msd)
used_analysis = lemma+'_∞_'+swapped_msd
###print('_LMSU__|'+ lemma + '|_|' + msd + '|_|' + swapped_msd+ '|_|' + used_analysis+ '|__LMSU_')
# extra handling of the combination of derivation of the head
# and compounding
if 'Ex/' in used_analysis and '_™_' in used_analysis and ex_index < tm_index:
#logging.info('_XXX_|'+used_analysis+'|_')
lemma = used_analysis.split('_∞_', 1)[0]
msd = used_analysis.split('_∞_', 1)[1]
derivation = msd.split('_™_', 1)[0]
rest = msd.split('_™_', 1)[1]
swapped_msd = get_correct_pos(derivation)
used_analysis = lemma+'_∞_'+swapped_msd+'_™_'+rest
#logging.info('_YYY_|'+used_analysis+'|_')
# put a clear delimiter between the (first) pos value and the rest of msd
# in order to disambiguate from the rest of whitespaces
            parts = re.compile(r"(_∞_\w+\s?|_∞_\?\s?)").split(used_analysis, 1)
            ###logging.info('_parts_|'+str(parts)+'|_')
            if len(parts) < 3:
                # defensive: no recognisable POS slot was found in the analysis
                parts = [used_analysis, '?', '']
            parts[1] = parts[1].replace('_∞_', '').strip()
            lemma = parts[0]
            pos = parts[1]
            rest = parts[2]
#logging.info('_LEN_the-parts_|'+str(len(parts))+'|_')
#logging.info('_1_the-parts_|'+str(parts)+'|_')
ex_in_r = rest.find('_©_')
tm_in_r = rest.find('_™_')
#current_line_no = inspect.stack()[0][2]
#logging.info('_exr-tmr_|'+str(ex_in_r)+'|'+str(tm_in_r)+'|_|'+str(current_line_no)+'|_')
#derivation-composition string
dcs = ''
#morpho-syntactic description
msd = ''
            # split the derivation/composition string from the rest of the MSD
            # and put it in an extra tuple at the end of the tuple list,
            # otherwise add a default tuple '___'
# no derivation, no composition
if ex_in_r == -1 and tm_in_r == -1:
msd = rest
dcs = '___'
###logging.info('_msd_cds_1_|'+str(msd)+'|_|'+str(dcs)+'|_')
# no derivation, but composition
elif ((ex_in_r == -1 and not tm_in_r == -1) or
(not ex_in_r == -1 and not tm_in_r == -1 and tm_in_r < ex_in_r)):
msd, dcs = re.compile('_™_').split(rest, 1)
dcs = '_™_'+dcs
###logging.info('_msd_cds_2_|'+str(msd)+'|_|'+str(dcs)+'|_')
# derivation, but no composition
elif ((not ex_in_r == -1 and tm_in_r == -1) or
(not ex_in_r == -1 and not tm_in_r == -1 and ex_in_r < tm_in_r)):
msd, dcs = re.compile('_©_').split(rest, 1)
dcs = '_©_'+dcs
###logging.info('_msd_cds_3_|'+str(msd)+'|_|'+str(dcs)+'|_')
# covered all relevant combinations?
else:
logging.info('_msd_cds_4_|'+str(rest)+'|_')
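            # e.g. (assumed shapes):
            #   rest = 'Sg Nom @SUBJ> #1->2'
            #       -> msd = rest, dcs = '___'
            #   rest = 'Sg Nom @HNOUN #4->5_™_vuotna_∞_N ...'
            #       -> msd = 'Sg Nom @HNOUN #4->5', dcs = '_™_vuotna_∞_N ...'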
# processing msd: splitting function label, selfID and parentID from the msd string
msd_drel = re.compile(' #').split(msd)
head = ''
tail = ''
###print('_XXX_|'+str(msd_drel)+'|_')
###print('_YYY_|'+str(len(msd_drel))+'|_')
if len(msd_drel) == 1:
head = '___'
###print('IF ... head ', head)
tail = msd_drel[0].lstrip('#')
###print('IF ... tail ', tail)
else:
### here to debug
head = msd_drel[0]
###print('ELSE ... head ', head)
tail = msd_drel[1]
###print('ELSE ... tail ', tail)
current_msd = ''
fct_label = ''
### here to debug
###print('_the-tail_|'+str(tail)+'|_')
self_id, parent_id = re.compile('->').split(tail)
###print('_ID_|'+str(self_id)+'|_|'+str(parent_id)+'|_')
            # splitting the function label
if not head == '___':
if not '@' in head:
current_msd = head
fct_label = 'X'
#logging.info('_head_|'+str(head)+'|_')
else:
msd_fct = re.compile(' @').split(head)
if len(msd_fct) == 1:
current_msd = '___'
fct_label = msd_fct[0].lstrip('@')
#logging.info('_msd_fct_1_|'+str(msd_fct)+'|_')
else:
current_msd = msd_fct[0]
fct_label = msd_fct[1]
#logging.info('_msd_fct_2_|'+str(msd_fct)+'|_')
else:
current_msd = '___'
fct_label = 'X'
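            # e.g. head 'Sg Nom @SUBJ>' -> current_msd 'Sg Nom', fct_label 'SUBJ>';
            # a bare '@SUBJ>' head -> current_msd '___', fct_label 'SUBJ>';
            # a head without '@' -> current_msd = head, fct_label 'X'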
            # TODO: update the description below
            # MSD can be complex and partitioned with specific separators:
# 1. _™_ for each TAB for parts of the compounds
# 2. _∞_ as separator between lemma and POS+MSD in a part of a compound
# Ex.: ('juovlavuonasildi', 'sildi',
# 'N', 'Sem/Ani Sg Nom @HNOUN_™__™_vuotna_∞_N Sem/Plc Cmp/SgGen Cmp_™__™__™_juovllat_∞_N Sem/Time Cmp/SgNom Cmp')
# 3. _©_ as separator between the MSD and the derivation tags
# Ex.: ('stellejuvvot', 'stellet', 'V', 'IV Inf @-FMAINV_©_Ex/V Ex/TV Der/PassL')
            # TODO: split Sem-tags and put them into a separate positional attribute in an updated corpus format for Korp,
            # so that semantic attributes can be searched via the Korp interface
if pos == '?':
pos = '___'
# analysed data as an 8-tuple: (WORD_FORM, LEMMA, POS, MSD, SELF_ID, FUNCTION_LABEL, PARENT_ID, DERIVATION-COMPOUNDING-STRING)
# ambiguity hack: unmask '<' and '>' as lemma
lemma = lemma.replace('\\','')
### DONE
### replace here lemma with the generated lemma;
### delete derivation/composition tags
### CAVEAT: the tuple will not be a 8-tuple but a 7-tuple
#lemma generation string
lemma_generation_string = ''
generated_lemma = ''
#if generate_der_comp_lemma:
if 'Ex/' in original_analysis or '_™_' in original_analysis:
lemma_generation_string = get_generation_string(original_analysis, pos, current_msd, _current_lang)
if lemma_generation_string:
### logging.info('xxx_lem-gen-str_|'+lemma_generation_string+'|_')
generated_lemma = generate_lemma(lemma_generation_string, _current_lang)
# msd clean up
### logging.info('_1_msd_|' + current_msd + '|_')
current_msd = re.sub('IV\s','',current_msd)
current_msd = re.sub('TV\s','',current_msd)
current_msd = re.sub('Relc','Rel',current_msd)
current_msd = re.sub('Dyn','',current_msd)
current_msd = re.sub('Known','',current_msd)
current_msd = current_msd.strip()
current_msd = re.sub('/','_',current_msd)
current_msd = re.sub('\s','.',current_msd)
            # add the pos as the first element of the msd string
if current_msd == '___':
current_msd = pos
else:
current_msd = pos + '.' + current_msd
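            # e.g. pos 'N' and current_msd 'Sg.Nom' -> 'N.Sg.Nom';
            # pos 'Adv' with an empty ('___') msd -> just 'Adv'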
### logging.info('_2_msd_|' + current_msd + '|_')
analysis_tuple = ()
### logging.info('_generated_lemma_|' + generated_lemma + '|_')
if generated_lemma == '':
analysis_tuple = (word_form, lemma, pos, current_msd, self_id, fct_label, parent_id)
else:
analysis_tuple = (word_form, generated_lemma, pos, current_msd, self_id, fct_label, parent_id)
### logging.info("_current_tuple_|"+str(analysis_tuple)+"|_")
            # filter away '¶', which is used only to provide a clause boundary where none exists
            # TODO: filter other strings that are most likely noise as well
if not word_form == '¶':
sentence.append(analysis_tuple)
# filter empty "sentences" due to filtering of '¶'
if sentence:
###logging.info("_analysed_token_tuples_|"+str(sentence)+"|_")
_sentences.append(sentence)
    return _sentences
def get_correct_pos(input_string):
    ###print('_instr_|' + input_string + '|_')
    der_pos_msd = re.split('( V | N | A | Adv | Po )', input_string)
    ###print('_der_pos_msd_|' + str(der_pos_msd) + '|_')
    if len(der_pos_msd) < 3:
        # defensive: no recognisable POS separator found; return the input unchanged
        return input_string
    swapped_string = der_pos_msd[1].strip() + ' ' + der_pos_msd[2].strip() + '_©_' + der_pos_msd[0]
    return swapped_string
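# e.g. (assumed input) get_correct_pos('Ex/V Ex/TV Der/PassL V IV Inf')
# splits on ' V ' and returns 'V IV Inf_©_Ex/V Ex/TV Der/PassL', i.e. the
# POS+MSD first and the derivation string after the '_©_' delimiter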
def get_domain_string(domain):
domain_mapping = {
"admin": "administration",
"bible": "bible",
"facta": "facts",
"ficti": "fiction",
"literature": "fiction",
"law": "law",
"laws": "law",
"news": "news",
"science": "science",
"blogs": "blog",
"": ""
}
    # fall back to '' for unmapped genre codes instead of raising KeyError
    _domain = domain_mapping.get(domain, '')
    return _domain
def get_deprel_string(deprel):
### print('___deprep___'+deprel)
deprel_mapping = {
">A": "→A",
">ADVL": "→ADVL",
">CC": "→CC",
">N": "→N",
">Num": "→Num",
">P": "→P",
">Pron": "→Pron",
"": "-FADVL→",
"-FMAINV": "-FMAINV",
"-FOBJ>": "-FOBJ→",
"-FSUBJ>": "-FSUBJ→",
"A<": "A←",
"ADVL": "ADVL",
"ADVL>": "ADVL→",
"ADVL>CS": "ADVL→CS",
"ADVL<": "ADVL←",
"APP-ADVL<": "APP-ADVL←",
"APP-N<": "APP-N←",
"APP-Pron<": "APP-Pron←",
"CNP": "CNP",
"COMP-CS<": "COMP-CS←",
"CVP": "CVP",
"FAUX": "FAUX",
"-FAUX": "-FAUX",
"-FAUXV": "-FAUXV",
"FMV": "FMV",
"FMVdic": "FMVdic",
"FS-": "FS-ADVL→",
"FS-IAUX": "FS-IAUX",
"FS-IMV": "FS-IMV",
"FS-N<": "FS-N←",
"FS-N": "OBJ→",
"OPRED>": "OPRED→",
"P<": "P←",
"PCLE": "PCLE",
"Pron<": "Pron←",
"S<": "S←",
"SPRED>": "SPRED→",
"SPRED": "SUBJ→",
"VOC": "VOC",
"SPRED": "SPRED",
"SUBJ": "SUBJ",
"X": "X"
}
    # pass unmapped labels through unchanged instead of raising KeyError
    _deprel = deprel_mapping.get(deprel, deprel)
    return _deprel
def get_wordform_filter():
_wordform_filter = [
'"< suohkanbargi>"',
'"< suohkanbargiide>"',
'"< suohkanbargiin>"',
'"< suohkanbargit>"',
'"< suohkanbáhpa>"',
'"< suohkanbáhpain>"',
'"< suohkanbáhpas>"',
'"< suohkanbáhppa>"',
'"< suohkanbáhppan>"',
'"< suohkanbáhppavirgi>"',
'"< suohkanbáhppavirgái>"',
'"< suohkandoaktáris>"',
'"< suohkandoavtterbálvalusas>"',
'"< suohkandoavttir>"',
'"< suohkanekonomiija>"',
'"< suohkangirji>"',
'"< suohkangirjji>"',
'"< suohkanhálddahusas>"',
'"< suohkanluottain>"',
'"< suohkanmearka>"',
'"< suohkanpolitihkar>"',
'"< suohkanpolitihkarin>"',
'"< suohkanpolitihkka>"',
'"< suohkanpolitihkkarat>"',
'"< suohkanpolitihkkariid>"',
'"< suohkanpolitihkkariiguin>"',
'"< suohkanpolitihkkariin>"',
'"< suohkanpolitihkkár>"',
'"< suohkanpolitihkkárat>"',
'"< suohkanpolitihkkáriid>"',
'"< suohkanpsykologa>"',
'"< suohkanrádjái>"',
'"< suohkanráji>"',
'"< suohkanrájiid>"',
'"< suohkanrájit>"',
'"< suohkanstivra>"',
'"< suohkanstivracoahkin>"',
'"< suohkanstivralahttu>"',
'"< suohkanstivralahtut>"',
'"< suohkanstivraáirasat>"',
'"< suohkanstivraáirasiid>"',
'"< suohkanstivraáirras>"',
'"< suohkanstivraášši>"',
'"< suohkanstivračoahkkimis>"',
'"< suohkanstivračoahkkin>"',
'"< suohkanstivrii>"',
'"< suohkanstivrra>"',
'"< suohkanstivrraid>"',
'"< suohkanstivrraláhttu>"',
'"< suohkanstivrras>"',
'"< suohkanstivrrat>"',
'"< suohkanstivrraválga>"',
'"< suohkanstivrraáirras>"',
'"< suohkanstivrračoahkkima>"',
'"< suohkanviesu>"',
'"< suohkanviesus>"',
'"< suohkanvissui>"',
'"< suohkanvisteseaidnái>"',
'"< suohkanvistti>"',
'"< suohkanváldodoavttir>"',
'"< suohkanválga>"',
'"< suohkanválggaid>"',
'"< suohkanválggaide>"',
'"< suohkanválggain>"',
'"< suohkanválggas>"',
'"< suohkanválggat>"',
'"< suohkanválgii>"',
'"< suohkanássit>"',
'"< suohkanšibitdoavttir>"',
'"<.>"',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'""',
'"<Čakčam>"',
'"<čakčam>"',
]
    return _wordform_filter
def get_generation_string(in_analysis, in_pos, in_msd, in_lang):
_used_analysis = in_analysis
_pos = in_pos
_msd = in_msd
_lang = in_lang
_string2generate = ''
_lemma = _used_analysis.split('_∞_', 1)[0]
_tail = _used_analysis.split('_∞_', 1)[1]
# ignore function and dependence relation here
_tail = re.sub('\s@[^\s]+','',_tail)
_tail = re.sub('\s#\d+->\d+','',_tail)
ex_index = _tail.find('Ex/')
tm_index = _tail.find('_™_')
current_line_no = inspect.stack()[0][2]
### print('_ex-tm_|'+str(ex_index)+'|'+str(tm_index)+'|__|'+str(current_line_no)+'|__')
if 'Ex/' in _tail:
if (not '_™_' in _tail) or ('_™_' in _tail and ex_index < tm_index):
_string2generate = _lemma+'_∞1EX∞_'+_tail
if '_™_' in _tail:
if (not 'Ex/' in _tail) or ('Ex/' in _tail and tm_index < ex_index):
_string2generate = _lemma+'_∞1CO∞_'+_tail
    ### replace all delimiters with '+'; collapse runs of '_™_' ('#' is inserted below)
_string2generate = re.sub('\s+','+',_string2generate)
_string2generate = re.sub('_∞1EX∞_','+',_string2generate)
_string2generate = re.sub('Ex/','',_string2generate)
_string2generate = re.sub('_∞1CO∞_','+',_string2generate)
_string2generate = re.sub('_∞_','+',_string2generate)
_string2generate = re.sub('(_™_)+','_™_',_string2generate)
    ### construct the correct order of generation for compound parts
parts = _string2generate.split('_™_')
swapped_string = ''
if len(parts) > 1:
###print('_the_parts_|'+str(parts)+'|_')
for i, p in reversed(list(enumerate(parts))):
swapped_string += p
if i > 0:
swapped_string += '#'
_string2generate = swapped_string
### logging.info('_bfr_str2gen_|'+_string2generate+'|_')
# replace inflection tags of the analysed string with the corresponding baseform tags
str_first = _string2generate.rpartition('+'+_pos+'+')[0]
str_last = _string2generate.rpartition('+'+_pos+'+')[2]
### logging.info('_mid_str2gen_|'+str(_string2generate.rpartition('+'+_pos+'+'))+'|_')
if _pos == 'V':
_string2generate = str_first + '+' + _pos + '+' + 'Inf'
if _pos == 'N':
_string2generate = str_first + '+' + _pos + '+' + 'Sg+Nom'
if _pos == 'A':
if _lang == 'sma':
if 'Comp' in str_last:
_string2generate = str_first + '+' + _pos + '+' + 'Comp+Attr'
elif 'Superl' in str_last:
_string2generate = str_first + '+' + _pos + '+' + 'Superl+Attr'
else:
_string2generate = str_first + '+' + _pos + '+' + 'Attr'
else:
_string2generate = str_first + '+' + _pos + '+' + 'Sg+Nom'
### logging.info('_afr_str2gen_|'+_string2generate+'|_')
    return _string2generate
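# e.g. for the compound example further above, the resulting generation string
# has roughly this shape (tags assumed): the parts are reversed and joined
# with '#', and the head's inflection tags are replaced by baseform tags:
#   juovllat+N+Cmp/SgNom+Cmp#vuotna+N+Cmp/SgGen+Cmp#juovlavuonasildi+N+Sg+Nom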
def generate_lemma(in_string, c_lang):
_in_string = in_string
_current_lang = c_lang
_analysis_lemma = re.split('\+', _in_string, 1)[0]
_generated_lemma = 'TODO_'+_in_string
    # feed the string to the normative generator via stdin rather than echo+shell
    # (avoids quoting problems with apostrophes in the input)
    fst_file = os.path.expandvars('$GTHOME/langs/' + _current_lang + '/src/generator-gt-norm.hfstol')
    pFST = Popen(['hfst-lookup', '-q', fst_file], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    outFST, errFST = pFST.communicate(_in_string.encode('utf-8'))
    outFST = outFST.decode()
    outFST = re.split('\n', outFST, 1)[0]
    out_fields = re.split('\t', outFST)
    # defensive: fall back to the lemma from the analysis if no generation came back
    _generated_lemma = out_fields[1] if len(out_fields) > 1 else _analysis_lemma + '+?'
if _generated_lemma.endswith('+?'):
_generated_lemma = _analysis_lemma
### logging.info('___gen-out___ ' + outFST + '______')
    return _generated_lemma
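# hfst-lookup prints one TAB-separated 'input	output	weight' line per
# analysis; a sketch of the expected first line (weight formatting may vary):
#   giella+N+Sg+Nom	giella	0.000000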
if __name__ == "__main__":
reload(sys)
main()