# -*- encoding: utf-8 -*-
""" Various rules for displaying ``sme`` entries properly, and for
connecting the FST to the Lexicon.
"""

# NOTE: if copying this for a new language, remember to make sure that
# it's being imported in __init__.py

# * paradigm documentation here:
#   http://giellatekno.uit.no/doc/dicts/dictionarywork.html

from logging import getLogger

from morphology import generation_overrides as morphology
from lexicon import lexicon_overrides as lexicon
from lexicon import autocomplete_filters as autocomplete_filters
from morpholex import morpholex_overrides as morpholex

# Mapping from lexicon PoS values to the tags the FST expects.
LEX_TO_FST = {
    'a': 'A',
    'adj': 'A',
    'adp': 'Adp',
    'adv': 'Adv',
    'aktor': 'NomAg',
    'egenn': 'Prop',
    'interj': 'Interj',
    'konj': 'CC',
    'n': 'N',
    'npl': 'N',
    'num': 'Num',
    'part': 'Pcle',
    'postp': 'Po',
    'prep': 'Pr',
    'pron': 'Pron',
    'prop': 'Prop',
    'subj': 'CS',
    'subst': 'N',
    'v': 'V',
    'verb': 'V',
    '': '',
}

morph_log = getLogger('morphology')

# This is called before any lookup is done, regardless of whether it
# came from analysis or not.

# TODO: til_ref / fra_ref
#  * need to allow (blank)
#  * need to render with fra_ref, with links which generate a
#    query to til_ref
#  * maybe include these in GET parameters or something.

# NOTE: some multiword expressions (MWEs) will mess things up here a
# bit, in that the PoS is passed in with part of the MWE. Thus, if
# there is no PoS, do nothing.


@lexicon.pre_lookup_tag_rewrite_for_iso(*['sme', 'SoMe'])
def pos_to_fst(*args, **kwargs):
    """ Synchronize PoS between the lexicon and the FST. Should be less
    necessary now.

    TODO: generalize to a setting in .yaml or somewhere.
    """
    if 'lemma' in kwargs and 'pos' in kwargs:
        _k = kwargs.get('pos', '')
        if _k is not None:
            _k = _k.replace('.', '').replace('+', '')
            new_pos = LEX_TO_FST.get(_k, False)
        else:
            _k = False
            new_pos = False
        if new_pos:
            kwargs['pos'] = new_pos
        elif _k:
            morph_log.error("sme.py: missing LEX_TO_FST pair for %s"
                            % _k.encode('utf-8'))
            morph_log.error("sme.py: in morphology.morphological_definitions.sme")
    return args, kwargs


@autocomplete_filters.autocomplete_filter_for_lang(('nob', 'sme'))
def remove_orig_entry(entries):
    """ Drop autocomplete entries that carry an @orig_entry attribute. """
    return [e for e in entries if 'orig_entry' not in e.attrib]


@morphology.pregenerated_form_selector(*['sme', 'SoMe'])
def pregenerate_sme(form, tags, node, **kwargs):
    """ **pregenerated form selector**: mini_paradigm / lemma_ref

    If mini_paradigm and lemma_ref exist for this node, then grab
    analyses and tags from the node, instead of from the FST.
    """
    _has_mini_paradigm = node.xpath('.//mini_paradigm[1]')
    _has_lemma_ref = node.xpath('.//lemma_ref')

    # entries with a lemma_ref are not generated here at all
    if len(_has_lemma_ref) > 0:
        return form, [], node, []

    # no mini_paradigm: let the FST handle generation as usual
    if len(_has_mini_paradigm) == 0:
        return form, tags, node
    else:
        mp = _has_mini_paradigm[0]

    def analysis_node(node):
        """ Node ->
            ("lemma", ["Pron", "Sg", "Tag"], ["wordform", "wordform"])
        """
        tag = node.xpath('.//@ms')
        if len(tag) > 0:
            tag = tag[0].split('_')
        else:
            tag = []

        wfs = node.xpath('.//wordform/text()')

        return (form, tag, wfs)

    analyses = map(analysis_node, mp.xpath('.//analysis'))

    return form, tags, node, analyses


@morphology.tag_filter_for_iso(*['sme', 'SoMe'])
def lexicon_pos_to_fst(form, tags, node=None, **kwargs):
    """ **tag filter**: Lexicon -> FST changes.

    Change the PoS to be compatible with the FST where it is not.
    """
    new_tags = []
    for t in tags:
        _t = []
        for p in t:
            _t.append(LEX_TO_FST.get(p, p))
        new_tags.append(_t)

    return form, new_tags, node


_str_norm = 'string(normalize-space(%s))'
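
# A minimal sketch of the PoS rewrite that pos_to_fst and
# lexicon_pos_to_fst perform above, as an illustrative doctest (the
# example PoS value ``subst.`` is hypothetical):
#
#     >>> LEX_TO_FST.get(u'subst.'.replace('.', '').replace('+', ''), False)
#     'N'
#
# Unknown PoS values fall through unchanged: the pre-lookup rewrite logs
# them via morph_log, and the tag filter leaves them as-is via
# ``LEX_TO_FST.get(p, p)``.
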
# commented out; Bug 1719
### NB: if commenting back in, the argument structure for the decorator
### function has changed.
### @morpholex.post_morpho_lexicon_override(*['sme', 'SoMe'])
### def remove_analyses_for_analyzed_forms_with_lemma_ref(xml, fst):
###     """ **Post morpho-lexicon override**
###
###     If there is an entry that is an analysis, and the set of XML
###     entries resulting from the lookup contains another entry with its
###     matching lemma, then discard the analyses.
###     """
###
###     if xml is None or fst is None:
###         return None
###
###     from collections import defaultdict
###     nodes_by_lemma = defaultdict(list)
###
###     for e in xml:
###         lemma = e.xpath(_str_norm % 'lg/l/text()')
###         lemma_ref_lemma = e.xpath(_str_norm % 'lg/lemma_ref/text()')
###
###         if lemma_ref_lemma:
###             nodes_by_lemma[lemma_ref_lemma].append(
###                 (e, True)
###             )
###         elif lemma:
###             nodes_by_lemma[lemma].append(
###                 (e, False)
###             )
###
###     def lg_l_matches_str(n, s):
###         return n.xpath(_str_norm % 'lg/l/text()') == s
###
###     for lemma, nodes in nodes_by_lemma.iteritems():
###         # get the lemma_ref node
###         lemma_ref_node = filter( lambda (n, is_lemma_ref): is_lemma_ref
###                                , nodes
###                                )
###
###         if len(lemma_ref_node) > 0:
###             _l_node, _is_l_ref = lemma_ref_node[0]
###             lemma_ref_lemma = _l_node.xpath(
###                 _str_norm % 'lg/lemma_ref/text()'
###             )
###
###             # Match nodes by lg_l vs. lemma_ref_string
###             _match = lambda (m_n, _): \
###                 lg_l_matches_str(m_n, lemma_ref_lemma)
###             lemmas_matching = filter( _match, nodes )
###
###             # If there is a lemma for the lemma_ref string ...
###             if len(lemmas_matching) > 0:
###                 def analysis_lemma_is_not(analysis):
###                     return lemma_ref_lemma != analysis.lemma
###
###                 # wipe out analyses in fst for a lemma if there is a
###                 # lemma_ref
###                 fst = filter( analysis_lemma_is_not
###                             , fst
###                             )
###
###     return xml, fst

# commented out; Bug 1719
### NB: if commenting back in, the argument structure for the decorator
### function has changed.
### @morpholex.post_morpho_lexicon_override(*['sme', 'SoMe'])
### def remove_analyses_for_specific_closed_classes(xml, fst):
###     """ **Post morpho-lexicon override**
###
###     Remove analyses from the list when the XML entry contains a
###     specific PoS type.
###
###     This has to be done in two steps:
###       * check for XML entries containing the types
###       * filter out the matching lemma from those entries, *or*, remove
###         analyses that have a member of the hideanalysis tagset
###
###     NB: this must be registered after
###     ``remove_analyses_for_analyzed_forms_with_lemma_ref``, because
###     that function depends on analyses still existing for some of
###     these types.
###     """
###
###     if xml is None or fst is None:
###         return None
###
###     restrict_xml_type = [ 'Pers'
###                         , 'Dem'
###                         , 'Rel'
###                         , 'Refl'
###                         , 'Recipr'
###                         , 'Neg'
###                         ]
###
###     restrict_lemmas = [ 'leat'
###                       ]
###
###     for e in xml:
###         _pos_type = e.xpath(_str_norm % 'lg/l/@type')
###         _lemma = e.xpath(_str_norm % 'lg/l/text()')
###
###         if _pos_type in restrict_xml_type:
###             restrict_lemmas.append(_lemma)
###
###     def lemma_not_in_list(lemma):
###         return lemma.lemma not in restrict_lemmas
###
###     def hideanalysis_tagset(lemma):
###         # keep the analysis only if the lemma's tag has no member of
###         # the ``hideanalysis`` tagset
###         _hide = lemma.tag['hideanalysis']
###         _hide_analysis = True
###         if _hide:
###             if len(_hide) > 0:
###                 _hide_analysis = False
###
###         return _hide_analysis
###
###     fst = filter( hideanalysis_tagset
###                 , filter( lemma_not_in_list
###                         , fst
###                         )
###                 )
###
###     return xml, fst


# (source, target) language pairs for the dictionary registrations below.
SME_NOB_DICTS = [
    ('sme', 'nob'),
    ('SoMe', 'nob'),
]

NOB_SME = [
    ('nob', 'sme'),
]
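
# A note on how these pairs are consumed (illustrative): the registration
# decorators below each take one or more (source, target) pairs, so
# ``@lexicon.entry_target_formatter(*SME_NOB_DICTS)`` would be equivalent
# to ``@lexicon.entry_target_formatter(('sme', 'nob'), ('SoMe', 'nob'))``,
# the form written out explicitly further down.
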
# NB: this filter is what would be needed, but for now we trust that the
# lexicon is pre-prepared instead.

# @lexicon.postlookup_filters_for_lexicon(*SME_NOB_DICTS)
# def usage_vd_only_for_entry(lexicon, nodelist, lookup_kwargs):
#     def filter_node(n):
#         usages = n.get('usage', '').split(',')
#         return 'vd' in usages or 'nds' in usages
#
#     if nodelist:
#         return filter(filter_node, nodelist)
#     else:
#         return nodelist


# @lexicon.postlookup_filters_for_lexicon(*NOB_SME)
# def clean_tgs_with_no_usage_vd(lexicon, nodelist, lookup_kwargs):
#     """ A little node manipulation to remove nodes without
#     usage=vd.
#
#     Basically: go through all mg/tg, iterate the t elements, and if one
#     is not usage=vd, remove it; then if this results in the tg having
#     no t entries, clear the tg.
#     """
#     def clean_tgs(n):
#         for tg in n.xpath('./mg/tg'):
#             _ts = tg.xpath('./t')
#             for t in _ts:
#                 if not t.get('usage', '') == 'vd':
#                     tg.remove(t)
#             _ts = tg.xpath('./t')
#             if len(_ts) == 0:
#                 tg.clear()
#         return n
#
#     if nodelist:
#         return map(clean_tgs, nodelist)
#
#     return nodelist


@lexicon.entry_source_formatter(*['sme', 'SoMe'])
def format_source_sme(ui_lang, e, target_lang):
    """ **Entry source formatter**

    Format the source for a variety of parameters. Here:

      * include the @pos and @class attributes
      * if there is a lemma_ref, then we provide the link to that entry
        too (e.g., munnje)

    # TODO: new-style templates
    """
    from morphology.utils import tagfilter_conf
    from flask import current_app

    paren_args = []

    _str_norm = 'string(normalize-space(%s))'

    _lemma = e.xpath(_str_norm % 'lg/l/text()')
    _class = e.xpath(_str_norm % 'lg/l/@class')
    _pos = e.xpath(_str_norm % 'lg/l/@pos')
    _lemma_ref = e.xpath(_str_norm % 'lg/lemma_ref/text()')
    _til_ref = e.xpath(_str_norm % 'lg/l/@til_ref')

    if _lemma_ref:
        _link_targ = u'/detail/%s/%s/%s.html' % ('sme', target_lang,
                                                 _lemma_ref)
        _lemma_ref_link = u'<a href="%s">%s</a>' % (_link_targ, _lemma_ref)
        _lemma_ref_link = u'<span> → ' + _lemma_ref_link
        _lemma_ref_link += u'</span>'
    else:
        _lemma_ref_link = ''

    if _pos:
        filters = current_app.config.tag_filters.get(('sme', ui_lang))
        if filters:
            paren_args.append(tagfilter_conf(filters, _pos))
        else:
            paren_args.append(_pos)

    if _class:
        paren_args.append(_class.lower())

    if len(paren_args) > 0:
        return '%s (%s)' % (_lemma, ', '.join(paren_args)) + _lemma_ref_link
    else:
        return _lemma
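
# A minimal sketch of what format_source_sme above returns, assuming a
# hypothetical entry ``munnje`` whose lg/lemma_ref points to ``mun``,
# whose @pos renders as ``pron``, and target_lang is nob (all values
# illustrative only):
#
#     munnje (pron) <span> → <a href="/detail/sme/nob/mun.html">mun</a></span>
#
# Note that when neither @pos nor @class is present, only the bare lemma
# is returned and the lemma_ref link is omitted.
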
@lexicon.entry_source_formatter('nob')
def format_source_nob(ui_lang, e, target_lang):
    """ **Entry source formatter**

    Format the source for a variety of parameters. Here:

      * include the @pos attribute
      * if the entry has @til_ref and @orig_entry, then we provide a link
        back to the original entry

    # TODO: new-style templates
    """
    from flask import current_app

    _str_norm = 'string(normalize-space(%s))'

    _lemma = e.xpath(_str_norm % 'lg/l/text()')
    _class = e.xpath(_str_norm % 'lg/l/@class')
    _pos = e.xpath(_str_norm % 'lg/l/@pos')
    _lemma_ref = e.xpath(_str_norm % 'lg/lemma_ref/text()')
    _til_ref = e.xpath(_str_norm % 'lg/l/@til_ref')
    _orig_entry = e.xpath(_str_norm % 'lg/l/@orig_entry')

    tag_filter = current_app.config.tag_filters.get(('sme', 'nob'), {})

    if _til_ref and _orig_entry:
        _link_return = "/nob/sme/ref/?l_til_ref=%s" % _orig_entry
        _link = u'<a href="%s">%s</a>' % (_link_return, _orig_entry)
        _lemma_ref_link = u'<span> → ' + _link
        _lemma_ref_link += u'</span>'

        _transl_pos = "(%s)" % tag_filter.get(_pos, _pos)

        return ' '.join([_lemma, _transl_pos, _lemma_ref_link])

    return None


@lexicon.entry_target_formatter(('sme', 'nob'), ('SoMe', 'nob'))
def format_target_sme(ui_lang, e, tg):
    """ **Entry target translation formatter**

    Display the @reg (region) attribute in translations, but only for
    ``N Prop``.

    # TODO: new-style templates
    """
    _str_norm = 'string(normalize-space(%s))'

    _type = e.xpath(_str_norm % 'lg/l/@type')
    _pos = e.xpath(_str_norm % 'lg/l/@pos')

    if _pos == 'N' and _type == 'Prop':
        _t_lemma = tg.xpath(_str_norm % 't/text()')
        _reg = tg.xpath(_str_norm % 't/@reg')
        if _reg:
            return "%s (%s)" % (_t_lemma, _reg)

    return None


@lexicon.entry_target_formatter(('nob', 'sme'))
def format_fra_ref_links(ui_lang, e, tg):
    """ **Entry target translation formatter**

    Display fra_ref cross-references as links, and the @reg (region)
    attribute in translations for ``N Prop``.
    """
    _str_norm = 'string(normalize-space(%s))'

    _fra_ref = tg.xpath(_str_norm % 're/@fra_ref')
    _fra_text = tg.xpath(_str_norm % 're/text()')

    if _fra_ref is not None:
        if len(_fra_ref) > 0:
            return u'<a href="/nob/sme/ref/?l_til_ref=%s">%s</a> →' % (
                _fra_ref, _fra_text)

    _type = e.xpath(_str_norm % 'lg/l/@type')
    _pos = e.xpath(_str_norm % 'lg/l/@pos')

    if _pos == 'N' and _type == 'Prop':
        _t_lemma = tg.xpath(_str_norm % 't/text()')
        _reg = tg.xpath(_str_norm % 't/@reg')
        if _reg:
            return "%s (%s)" % (_t_lemma, _reg)

    return None


from common import remove_blank

# Remove blank analyses from generation results.
morphology.postgeneration_filter_for_iso(
    'sme', 'SoMe'
)(remove_blank)
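
# A minimal sketch of the effect of the postgeneration filter registered
# above, assuming remove_blank simply drops empty generated forms (the
# real implementation lives in ``common`` and may differ):
#
#     >>> filter(None, [u'guolit', u'', u'guliid'])
#     [u'guolit', u'guliid']
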