from morphology import generation_overrides as morphology from morpholex import morpholex_overrides as morpholex from lexicon import lexicon_overrides from lexicon import search_types, CustomLookupType from lxml import etree from views.custom_rendering import template_rendering_overrides from flask import current_app, g @template_rendering_overrides.register_custom_sort(('crk', 'eng'), ('crkMacr', 'eng'), ('crkS', 'eng')) def sort_by_analyses(search_result_obj, unsorted_entries_and_tags_and_paradigms): """ This is where we sort analyses first, and then everything else. Copying the original sorting to modify that. Original sort is: * entries where lemma matches (==) the user input first * otherwise alphabetical sort by lemma. TODO: * show analyses first, perhaps with absolute match first of these; * then show sorted non-morphological matches below """ def sort_key((lex, morph, p, l)): _str_norm = 'string(normalize-space(%s))' lemma = lex.xpath(_str_norm % './lg/l/text()') return (lemma, morph) def sort_with_user_input_first((a_lemma, a_morph), (b_lemma, b_morph)): a_has_morph = len(a_morph) > 0 b_has_morph = len(b_morph) > 0 a_lemma_matches_input = a_lemma == search_result_obj.user_input b_lemma_matches_input = b_lemma == search_result_obj.user_input move_up = -1 move_down = 1 no_diff = 0 def sort_lemma_alpha(): if a_lemma < b_lemma: return move_up elif a_lemma > b_lemma: return move_down else: return no_diff # sort alphabetically within main groups split by presence of # lemmas if (a_has_morph and b_has_morph) or (not a_has_morph and not b_has_morph): return sort_lemma_alpha() # otherwise sort by presence of morphology if a_has_morph and not b_has_morph: return move_up if not a_has_morph and b_has_morph: return move_down return no_diff return sorted( unsorted_entries_and_tags_and_paradigms , key=sort_key , cmp=sort_with_user_input_first ) # @lexicon_overrides.postlookup_filters_for_lexicon(('eng', 'crk')) # def sort_by_rank(lex, nodelist, *args, **kwargs): # # _str_norm = 'string(normalize-space(%s))' # # def get_rank(n): # try: # rank = int( n.xpath(_str_norm % './/rank/@rank') ) # except: # rank = False # if rank: # return rank # else: # return n.xpath(_str_norm % './/l/text()') # # return sorted(nodelist, key=get_rank) # NB: general search type, so crk->eng, and everything else that isn't # eng->crk substring type class CustomCrkSearch(CustomLookupType): """ This is the custom lookup type class from which all custom lookup types should be subclassed. """ lemma_match_query = './/e[re:test(lg/l/text(), $lemma_fuzz, \'i\')]' # we will use this to match things less than 3 chars. lemma_strict_match = './/e[lg/l/text() = $lemma]' def lookupLemma(self, lemma): if len(lemma) <= 3 or g._from == 'eng': match_fx = self.prepare_xpath(self.lemma_strict_match) else: match_fx = self.lemma # Can only have one character on the left side, because # iterating through input string one character by one character # to make sure replacements don't overlap. # TODO: can we use a generalized spell relax function for this? fuzzings = { u'a': u'[aâā]', u'â': u'[aâā]', u'i': u'[iîī]', u'î': u'[iîī]', u'e': u'[eêē]', u'ê': u'[eêē]', u'u': u'[uûū]', u'û': u'[uûū]', } lemma_fuzz = '' for c in lemma: lemma_fuzz += fuzzings.get(c, c) return self.XPath( match_fx , lemma=lemma , lemma_fuzz=lemma_fuzz ) search_types.add_custom_lookup_type('regular')(CustomCrkSearch) # NB: eng->crk only class EngToCrkSubstringLookups(CustomLookupType): """ NB: for the moment this is eng-crk specific, this is defined in itwewina.config.yaml.in # TODO: document this """ lemma = etree.XPath('.//e[contains(mg/tg/key/text(), $lemma)]') def filterNodes(self, nodes, lemma): """ This is our own custom modification within this search type will pop off definition nodes that do not match, by operating on clones and returning the clones. Here we select the children of the and run a test on them, if they succeed, then don't pop the node. Then return the trimmed elements. This is probably the best option for compatibility with the rest of NDS, but need to have a way of generalizing this, because at the moment, this is lexicon-specific. """ import copy def duplicate_node(node): return copy.deepcopy(node) def test_node(node): tg_node_expr = " and ".join([ '(key/text() = "%s")' % l_part for l_part in lemma.split(',') ]) _xp = 'tg[%s]' % tg_node_expr return len(node.xpath(_xp)) == 0 def process_node(node): mgs = node.findall('mg') c = len(node.findall('mg')) # Remove nodes not passing the test, these shall diminish # and go into the west, and remain . for mg in mgs: if test_node(mg): c -= 1 node.remove(mg) # If trimming results in no actual translations, we # don't display the node. if c == 0: return None else: return node new_nodes = [] for node in map(duplicate_node, nodes): new_nodes.append(process_node(node)) return [n for n in new_nodes if n != None] def lookupLemma(self, lemma): keys = ' and '.join([ '(mg/tg/key/text() = "%s")' % l for l in lemma.split(',') ]) key_expr = './/e[%s]' % keys xp = etree.XPath(key_expr) nodes = self.XPath( xp, lemma=lemma) return self.filterNodes(nodes, lemma=lemma) search_types.add_custom_lookup_type('substring_match')(EngToCrkSubstringLookups) # NB: this search type has not been registered, just copying here so it # will not get lost. # # search_types.add_custom_lookup_type('keyword')(SubstringLookups) class KeywordLookups(CustomLookupType): """ NB: for the moment this is eng-crk specific. 1. search by //e/mg/tg/t/text() instead of //e/lg/l/text() 2. after the search, we duplicate and re-test the matched nodes to remove any that do not apply to the query. 3. Duplicated nodes are returned to the rest of the query, and no one knows the difference TODO: how to provide an entry hash for these? Linkability to search results would be great. TODO: think about how to generalize this. Since this is code beyond a sort of 'base functionality', it may need to stand somewhere other than in `lexicon.lexicon`. Providing an easy API for extending search types would be great, because down the line there will be more search types. """ def __init__(self, filename=False, tree=False): if not tree: if filename not in PARSED_TREES: print "parsing %s" % filename try: self.tree = etree.parse(filename) PARSED_TREES[filename] = self.tree except Exception, e: print print " *** ** ** ** ** ** * ***" print " *** ERROR parsing %s" % filename print " *** ** ** ** ** ** * ***" print print " Check the compilation process... " print " Is the file empty?" print " Saxon errors?" print sys.exit(2) else: self.tree = PARSED_TREES[filename] else: self.tree = tree self.xpath_evaluator = etree.XPathDocumentEvaluator(self.tree) # Initialize XPath queries self.lemma = etree.XPath('.//e[mg/tg/key/text() = $lemma]') def cleanEntry(self, e): ts = e.findall('mg/tg/t') ts_text = [t.text for t in ts] ts_pos = [t.get('pos') for t in ts] l = e.find('lg/l') right_text = [l.text] return {'left': ts_text, 'pos': ts_pos, 'right': right_text} def filterNodes(self, nodes, lemma): """ # TODO: update this so it's not operating on keywords, instead # definitions Modify the nodes in some way, but by duplicating them first. Here we select the children of the and run a test on them, if they succeed, then don't pop the node. Then return the trimmed elements. This is probably the best option for compatibility with the rest of NDS, but need to have a way of generalizing this, because at the moment, this is lexicon-specific. """ import copy def duplicate_node(node): # previously: etree.XML(etree.tostring(node)) return copy.deepcopy(node) def test_node(node): tg_node_expr = " and ".join([ '(key/text() = "%s")' % l_part for l_part in lemma.split(',') ]) _xp = 'tg[%s]' % tg_node_expr return len(node.xpath(_xp)) == 0 def process_node(node): mgs = node.findall('mg') c = len(node.findall('mg')) # Remove nodes not passing the test, these shall diminish # and go into the west, and remain . for mg in mgs: if test_node(mg): c -= 1 node.remove(mg) # If trimming results in no actual translations, we # don't display the node. if c == 0: return None else: return node new_nodes = [] for node in map(duplicate_node, nodes): new_nodes.append(process_node(node)) return [n for n in new_nodes if n != None] def lookupLemma(self, lemma): keys = ' and '.join([ '(mg/tg/key/text() = "%s")' % l for l in lemma.split(',') ]) key_expr = './/e[%s]' % keys xp = etree.XPath(key_expr) nodes = self.XPath( xp, lemma=lemma) return self.filterNodes(nodes, lemma=lemma) @morphology.postgeneration_filter_for_iso('crk', 'crkMacr', 'crkS') def force_hyphen(generated_forms, *input_args, **input_kwargs): """ For any +Cnj forms that are generated, filter out those without ê- """ def matches_hyphen(f): return u'ê-' in f or u'ē-' in f def form_fx((tag, forms)): if forms: _hyph = [f for f in forms if '-' in f] if len(_hyph) > 0: unhyphs = [h.replace('-', '') for h in _hyph] # throw out all forms that have a hyphenated equivalent _filt = lambda x: x not in unhyphs and '%' not in x fs = filter(_filt, forms) return (tag, fs) return (tag, forms) return map(form_fx, generated_forms) @morphology.tag_filter_for_iso('crk', 'crkMacr', 'crkS') def adjust_tags_for_gen(lemma, tags, node=None, **kwargs): """ **tag filter**: Lexicon -> FST changes. Change POS to be compatible with FST for when they are not. """ if 'template_tag' not in kwargs: return lemma, tags, node from flask import current_app, g import re # get tagset for pre-lemma stuff morph = current_app.config.morphologies.get(g._from, False) tagsets = morph.tagsets.sets prelemmas = tagsets.get('prelemma_tags') # TODO: where is the lemma # print g._from # print lemma # print list(prelemmas.members) cleaned_tags = [] for t in tags: # print t cleaned_tag = [] for pl in prelemmas.members: before = [] rest = [] pl = unicode(pl) try: _pl = re.compile(pl) except Exception, e: _pl = False for part in t: if _pl: if _pl.match(part) or pl == part: before.append(part) continue else: if pl == part: before.append(part) continue rest.append(part) cleaned_tag.extend(before) cleaned_tag.append(lemma) cleaned_tag.extend(rest) # print cleaned_tag cleaned_tags.append(cleaned_tag) if len(cleaned_tags) == 0 and len(tags) > 0: tags = cleaned_tags # print cleaned_tags return lemma, cleaned_tags, node