from lexicon import lexicon_overrides from morphology.utils import tagfilter from utils.data import flatten from flask import current_app from .lexicon import hash_node class FormattingError(Exception): pass class EntryNodeIterator(object): """ A class for iterating through the result of an LXML XPath query, while cleaning the nodes into a more usable format. .clean() is where most of the magic happens, so if new formats are needed, just override this. """ def l_node(self, entry): l = entry.find('lg/l') try: lemma = l.text except: lemma = '' pos = l.get('pos') context = l.get('context') type = l.get('type') hid = l.get('hid') if context == None: context = False if type == None: type = False if hid == None: hid = False return lemma, pos, context, type, hid def tg_nodes(self, entry): """ Select nodes. If an entry has nodes marked with xml:lang attributes, then return only the entries matching the target_lang, otherwise, if there are no xml:lang attributes on any of the nodes, then return all entries unfiltered. """ # how to detect multi-format? problem is that behavior between # pair format may be disrupted by disallowing fallback? target_lang = self.query_kwargs.get('target_lang', False) if not target_lang: multi = False else: multi = len(entry.xpath("mg/tg/@xml:lang")) > 0 and True or False if multi: ts = entry.xpath("mg/tg[@xml:lang='%s']/t" % target_lang) tgs = entry.xpath("mg/tg[@xml:lang='%s']" % target_lang) else: ts = entry.findall('mg/tg/t') tgs = entry.findall('mg/tg') return tgs, ts def examples(self, tg): _ex = [] possible_errors = False for xg in tg.findall('xg'): _x = xg.find('x') _xt = xg.find('xt') if _x is not None and hasattr(_x, 'text'): _x_tx = _x.text else: _x_tx = '' possible_errors = True if _xt is not None and hasattr(_xt, 'text'): _xt_tx = _xt.text else: _xt_tx = '' possible_errors = True _ex.append((_x_tx, _xt_tx)) if possible_errors: from lxml import etree error_xml = etree.tostring(tg, pretty_print=True, encoding="utf-8") current_app.logger.error( "Potential XML formatting problem on node\n\n%s" % error_xml.strip() ) if len(_ex) == 0: return False else: return _ex def find_translation_text(self, tg): """ This parses a node and returns text, annotations, xml:lang. Annotations means here: tg/re, tg/te, or tg/tf. If there is no node, we try to fall back to using one of these, otherwise, pick the node and use the annotations as definition. """ tCtn = tg.find('tCtn') if tCtn is not None: return self.find_translation_text(tCtn) def orFalse(l): if len(l) > 0: return l[0] else: return False text = False re = tg.find('re') te = tg.find('te') tf = tg.find('tf') te_text = '' re_text = '' tf_text = '' if te is not None: te_text = te.text if re is not None: re_text = re.text if tf is not None: tf_text = tf.text tx = tg.findall('t') link = True if not tx: if te_text: text, te_text = [te_text], '' elif re_text: text, re_text = [re_text], '' elif tf_text: text, tf_text = [tf_text], '' else: text = [_tx.text for _tx in tx if _tx.text is not None] lang = tg.xpath('@xml:lang') annotations = [] for a in [te_text, re_text, tf_text]: if a is not None: if a.strip(): annotations.append(a) return text, annotations, lang def __init__(self, nodes, *query_args, **query_kwargs): if not nodes or len(nodes) == 0: self.nodes = [] else: self.nodes = [a for a in nodes if a is not None] self.query_args = query_args self.query_kwargs = query_kwargs self.additional_template_kwargs = {} if 'additional_template_kwargs' in query_kwargs: self.additional_template_kwargs = query_kwargs.get('additional_template_kwargs') query_kwargs.pop('additional_template_kwargs') def __iter__(self): from lxml import etree for node in self.nodes: try: yield self.clean(node) except Exception, e: import traceback import sys exc_type, exc_value, exc_traceback = sys.exc_info() tb_str = traceback.format_exception(exc_type, exc_value, exc_traceback) if node is not None: error_xml = etree.tostring(node, pretty_print=True, encoding="utf-8") else: error_xml = 'No entry for lookup' msg_args = (error_xml.strip(), ''.join(tb_str), repr(self.query_args), repr(self.query_kwargs)) current_app.logger.error( "Potential XML formatting problem somewhere in... \n\n%s\n\n%s\n\n%s\n\n%s" % msg_args ) continue class SimpleJSON(EntryNodeIterator): """ A simple JSON-ready format for /lookups/ """ def sorted_by_pos(self): _from = self.query_kwargs.get('source_lang') _to = self.query_kwargs.get('target_lang') def filterPOS(r): def fixTag(t): t_pos = t.get('pos', False) if not t_pos: return t t['pos'] = tagfilter(t_pos, _from, _to) return t return fixTag(r) return map(filterPOS, list(self)) def clean(self, e): lemma, lemma_pos, lemma_context, _, lemma_hid = self.l_node(e) tgs, ts = self.tg_nodes(e) # TODO: format source in JSON view target_formatteds = [] right_langs = [] translations = [] for tg in tgs: default_text, default_annotations, default_lang = self.find_translation_text(tg) tf_text = lexicon_overrides.format_target( self.query_kwargs.get('source_lang'), self.query_kwargs.get('target_lang'), self.query_kwargs.get('ui_lang'), e, tg, False ) if tf_text: default_lang.append(default_lang) target_formatteds.append(tf_text) else: translations.append((default_text, default_annotations, default_lang)) if len(target_formatteds) > 0: right_text = target_formatteds else: right_text = flatten([a for a, b, c in translations]) right_langs = flatten([c for a, b, c in translations]) return { 'left': lemma , 'context': lemma_context , 'pos': lemma_pos , 'right': right_text , 'lang': right_langs , 'hid': lemma_hid , 'input': self.query_kwargs.get('user_input', '') } class FrontPageFormat(EntryNodeIterator): def clean_tg_node(self, e, tg): from functools import partial ui_lang = self.query_kwargs.get('ui_lang') # TODO: detect if there are texts vs. annotations only, # still need to run those through texts, annotations, lang = self.find_translation_text(tg) link = True if texts: if not isinstance(texts, list): texts = [texts] elif annotations: if not isinstance(annotations, list): annotations = [annotations] else: from lxml import etree error_xml = etree.tostring(e, pretty_print=True, encoding="utf-8") current_app.logger.error( "Potential XML formatting problem while processing nodes.\n\n" + \ repr(self.query_kwargs) + "\n\n" + \ repr(self.query_args) + "\n\n" + \ error_xml.strip() ) texts = [] # e node, tg node, default text for when formatter doesn't # exist for current iso # Apply to each translation text separately target_formatter = partial( lexicon_overrides.format_target , self.query_kwargs.get('source_lang') , self.query_kwargs.get('target_lang') , ui_lang , e , tg ) def add_link(_p): if '' in _p: return _p src_lang = self.query_kwargs.get('source_lang') _from_l = self.query_kwargs.get('target_lang') _to_l = src_lang # Does the reversed pair exist as a variant? If so we need # to get the original pair and re-reverse it if (_to_l, _from_l) in current_app.config.variant_dictionaries: _var = current_app.config.variant_dictionaries.get((_to_l, _from_l)) (_to_l, _from_l) = _var.get('orig_pair') if (_from_l, _to_l) not in current_app.config.dictionaries and \ (_from_l, _to_l) in current_app.config.variant_dictionaries: var = current_app.config.variant_dictionaries.get((_from_l, _to_l)) (_from_l, _to_l) = var.get('orig_pair') pair = ( _from_l , _to_l ) if pair not in current_app.config.dictionaries: return _p _url = [ 'detail' , self.query_kwargs.get('target_lang') , src_lang , '%s.html?no_compounds=true&lemma_match=true' % _p ] _url = '/' + '/'.join(_url) link = "%s" % (_url, _p) return link # problem: no nodes available here for til_/fra_ref words target_formatted = [] if len(texts) > 0: # TODO: does this not actually pass texts ? target_formatted = map(target_formatter, texts) elif len(annotations) > 0: # target_formatter expects some default text to be passed in # the event that no formatting is able to be made target_formatted = map(target_formatter, annotations) # If there were changes, then we want to give absolute control # on this string to the formatter. target_reformatted = [] if set(target_formatted) != set(texts): target_reformatted = True target_formatted_unlinked = target_formatted target_formatted = map(add_link, target_formatted) right_node = { 'tx': ', '.join(texts) , 're': annotations , 'target_reformatted': target_reformatted , 'target_formatted_unlinked': target_formatted_unlinked , 'examples': self.examples(tg) , 'target_formatted': ', '.join(target_formatted) } return right_node, lang def clean(self, e): lemma, lemma_pos, lemma_context, lemma_type, lemma_hid = self.l_node(e) tgs, ts = self.tg_nodes(e) ui_lang = self.query_kwargs.get('ui_lang') _right = map( lambda tg: self.clean_tg_node(e, tg) , tgs ) right_langs = [lang for _, lang in _right] right_nodes = [fmt_node for fmt_node, _ in _right] # # Make our own hash, 'cause lxml won't # entry_hash = [ unicode(lemma) # , unicode(lemma_context) # , unicode(lemma_pos) # , ','.join(sorted([t['tx'] for t in right_nodes])) # ] # entry_hash = str('-'.join(entry_hash).__hash__()) entry_hash = hash_node(e) # node, and default format for if a formatter doesn't exist for # iso source_lang = self.query_kwargs.get('source_lang') target_lang = self.query_kwargs.get('target_lang') lemma_attrs = self.query_kwargs.get('lemma_attrs', False) if lemma and lemma_pos: default_format = "%s (%s)" % ( lemma , tagfilter( lemma_pos , source_lang , target_lang ) ) elif lemma and not lemma_pos: default_format = lemma elif lemma_attrs: default_format = '' def add_link(_p): """ If there's a link already, then don't add one, otherwise... """ if '' in _p: return _p # TODO: will need a more lasting solution... src_lang = self.query_kwargs.get('source_lang') if src_lang == 'SoMe': src_lang = 'sme' _url = [ 'detail' , src_lang , self.query_kwargs.get('target_lang') , '%s.html?e_node=%s' % (lemma, entry_hash) ] _url = '/' + '/'.join(_url) link = "%s" % (_url, _p) return link source_formatted_unlinked = lexicon_overrides.format_source( source_lang, ui_lang, e, target_lang, default_format ) source_formatted = add_link(source_formatted_unlinked) formatted_dict = { 'left': lemma , 'source_formatted': source_formatted , 'source_unlinked': source_formatted_unlinked , 'context': lemma_context , 'pos': lemma_pos , 'right': right_nodes , 'lang': right_langs , 'hid': lemma_hid , 'entry_hash': entry_hash } formatted_dict.update(self.additional_template_kwargs) return formatted_dict # TODO: adding hverandre functionality requires some additional # attributes to be available, but this formatter class is annoying, # and a good argument for how this should all just be handled by xslt or # some template thing instead. class DetailedFormat(FrontPageFormat): def clean(self, e): lemma, lemma_pos, lemma_context, lemma_type, lemma_hid = self.l_node(e) tgs, ts = self.tg_nodes(e) ui_lang = self.query_kwargs.get('ui_lang') _right = map( lambda tg: self.clean_tg_node(e, tg) , tgs ) right_langs = [lang for _, lang in _right] right_nodes = [fmt_node for fmt_node, _ in _right] entry_hash = hash_node(e) # node, and default format for if a formatter doesn't exist for # iso source_lang = self.query_kwargs.get('source_lang') target_lang = self.query_kwargs.get('target_lang') lemma_attrs = self.query_kwargs.get('lemma_attrs', False) if lemma and lemma_pos: default_format = "%s (%s)" % ( lemma , tagfilter( lemma_pos , source_lang , target_lang ) ) elif lemma and not lemma_pos: default_format = lemma elif lemma_attrs: default_format = '' def add_link(_p): """ If there's a link already, then don't add one, otherwise... """ return _p source_formatted_unlinked = lexicon_overrides.format_source( source_lang, ui_lang, e, target_lang, default_format ) source_formatted = add_link(source_formatted_unlinked) formatted_dict = { 'left': lemma , 'source_formatted': source_formatted , 'source_unlinked': source_formatted_unlinked , 'context': lemma_context , 'pos': lemma_pos , 'right': right_nodes , 'lang': right_langs , 'hid': lemma_hid , 'entry_hash': entry_hash , 'input': (lemma, lemma_pos, '', lemma_type) , 'node': e } formatted_dict.update(self.additional_template_kwargs) return formatted_dict