# -*- encoding: utf-8 -*-
"""
sma-specific overrides, and pregenerated paradigm selection.

A set of lexicon-related language specific rules, provided by
`lexicon.LexiconOverrides`. There is probably a better location
for this documentation, but for now ...

Example source formatting function:

    @lexicon.entry_source_formatter('sme')
    def format_source_sme(ui_lang, entry_node):
        # do some processing on the entry node ...
        if successful:
            return some_formatted_string
        return None

Example target string formatting function:

    @lexicon.entry_target_formatter('sme', 'nob')
    def format_target_sme(ui_lang, entry_node, tg_node):
        # do some processing on the entry and tg node ...
        if successful:
            return some_formatted_string
        return None

"""

# NOTE: if copying this for a new language, remember to make sure that
# it's being imported in __init__.py

from morphology import generation_overrides as morphology
from lexicon import lexicon_overrides as lexicon
from flask import current_app
from morpholex import morpholex_overrides as morpholex


@lexicon.entry_source_formatter('sma')
def format_source_sma(ui_lang, e, target_lang):
    """Build the display string for the source side of an sma entry.

    The result is ``lemma (pos, class)`` -- with whichever of ``pos`` and
    ``class`` are present -- followed by a "see also" link when the entry
    carries a ``lemma_ref``. If neither ``pos`` nor ``class`` is present,
    the bare lemma is returned.

    :param ui_lang: interface language; not used by this formatter.
    :param e: entry node exposing an ``xpath()`` method (presumably an
        lxml element -- confirm against the caller).
    :param target_lang: target language code, used to build the detail
        URL of the referenced lemma.
    :return: the formatted entry string.
    """
    from morphology.utils import tagfilter_conf

    # string(normalize-space(...)) makes every query evaluate to a single
    # whitespace-normalized string, which is empty when nothing matches.
    _str_norm = 'string(normalize-space(%s))'
    _lemma = e.xpath(_str_norm % 'lg/l/text()')
    _class = e.xpath(_str_norm % 'lg/l/@class')
    _pos = e.xpath(_str_norm % 'lg/l/@pos')
    _lemma_ref = e.xpath(_str_norm % 'lg/lemma_ref/text()')

    if _lemma_ref:
        _link_targ = u'/detail/%s/%s/%s.html' % ('sma', target_lang, _lemma_ref)
        # The format string needs one placeholder per argument; with a
        # single '%s' and two arguments this line raised TypeError for
        # every entry that has a lemma_ref.
        # NOTE(review): the anchor markup is reconstructed -- confirm it
        # matches what the templates expect.
        _anchor = u'<a href="%s">%s</a>' % (_link_targ, _lemma_ref)
        _lemma_ref_link = u' → ' + _anchor
    else:
        _lemma_ref_link = ''

    paren_args = []

    if _pos:
        # NOTE(review): the filter set is looked up for ('sma', 'nob')
        # regardless of target_lang -- confirm this is intended.
        filters = current_app.config.tag_filters.get(('sma', 'nob'))
        paren_args.append(tagfilter_conf(filters, _pos))

    if _class:
        paren_args.append(_class)

    if paren_args:
        entry_string = '%s (%s)' % (_lemma, ', '.join(paren_args))
        return entry_string + _lemma_ref_link

    # NOTE(review): the lemma_ref link is not appended on this path;
    # kept as in the original -- confirm whether that is deliberate.
    return _lemma


# LEX_TO_FST = {
#     'a': 'A',
#     'adv': 'Adv',
#     'n': 'N',
#     'npl': 'N',
#     'num': 'Num',
#     'prop': 'Prop',
#     'v': 'V',
# }


@morphology.pregenerated_form_selector('sma')
def pregenerate_sma(form, tags, node, **kwargs):
    """Select pregenerated forms for an sma entry from its mini paradigm.

    Return shapes (unchanged from the original implementation):

    * entry contains a ``lemma_ref``: ``(form, [], node, [])``
    * entry has no ``mini_paradigm``: ``(form, tags, node)`` -- note that
      this is a three-element tuple
    * otherwise: ``(form, tags, node, analyses)``, where each analysis is
      ``(form, [tag, parts, ...], [wordform, ...])`` built from the
      ``analysis`` elements of the first ``mini_paradigm``.

    :param form: the lemma / form being generated for.
    :param tags: the tags requested for generation.
    :param node: entry node exposing an ``xpath()`` method.
    :param kwargs: accepted for interface compatibility; unused here.
    """
    mini_paradigms = node.xpath('.//mini_paradigm[1]')
    lemma_refs = node.xpath('.//lemma_ref')

    # Entries that only point at another lemma get no tags and no analyses.
    if lemma_refs:
        return form, [], node, []

    # Nothing pregenerated for this entry: hand the inputs back.
    if not mini_paradigms:
        return form, tags, node

    def _to_analysis(analysis):
        """ Node -> ("lemma", ["Pron", "Sg", "Tag"], ["wordform", "wordform"])
        """
        ms_values = analysis.xpath('.//@ms')
        # The @ms attribute holds the tag as an underscore-joined string.
        tag_parts = ms_values[0].split('_') if ms_values else []
        wordforms = analysis.xpath('.//wordform/text()')
        return (form, tag_parts, wordforms)

    paradigm = mini_paradigms[0]
    analyses = map(_to_analysis, paradigm.xpath('.//analysis'))
    return form, tags, node, analyses


# from common import remove_blank, match_homonymy_entries, external_korp_url
from common import remove_blank, match_homonymy_entries

morphology.postgeneration_filter_for_iso(
    'sma',
)(remove_blank)

morpholex.post_morpho_lexicon_override(
    'sma'
)(match_homonymy_entries)

# KORP_SEARCHES = [
#     ('korp_wordform', 'sma', 'nob'),
#     # ('korp_wordform', 'nob', 'sma'),
# ]
#
# lexicon.external_search(*KORP_SEARCHES)(
#     external_korp_url
# )