### Morpho-Lexical interface ###

# TODO: do not display analyzed lexical entries for words with mini-paradigms,
# e.g., lemma_ref contents should be stripped.
# Will need to operate on the output of lookup(), and this is language
# specific, so decorator registry thing is probably good here.

from flask import current_app

from lexicon.lexicon import hash_node

from itertools import groupby
from operator import itemgetter
import time


def _collect_same_lemma(array):
    """Reorder (entry, analysis) pairs so pairs sharing a lexicon entry are
    adjacent, preserving original order of first appearance.

    At most one analysis-less pair is kept; when the same entry occurs both
    with and without an analysis, the analysis-less duplicate is dropped.
    (Previously duplicated inside lookup() and variant_lookup(), and leaked
    its working buffer through a ``global`` declaration.)

    :param array: list of (entry, analysis) tuples; either element may be None
    :return: new, reordered and de-duplicated list of tuples
    """
    collected = []        # result, grouped in original order
    seen_pairs = []       # pairs already emitted
    seen_entries = []     # entries emitted from the with-analysis branch
    none_not_added = True  # only one analysis-less pair may be emitted
    for k in range(len(array)):
        for i in range(len(array)):
            if array[i][1] is not None and array[k][1] is not None:
                if array[i][0] == array[k][0] and array[i] not in seen_pairs:
                    collected.append(array[i])
                    seen_pairs.append(array[i])
                    seen_entries.append(array[i][0])
            elif none_not_added and array[i][0] not in seen_entries:
                collected.append(array[i])
                none_not_added = False
    # Remove an entry's analysis-less duplicate when it also appears with
    # analyses.  A deletion restarts the inner scan via `break`; the outer
    # index then advances, matching the original scan pattern exactly.
    j = 0
    while j < len(collected):
        for i in range(len(collected)):
            if collected[i][0] == collected[j][0]:
                if collected[i][1] is not None and collected[j][1] is None:
                    del collected[j]
                    break
                elif collected[j][1] is not None and collected[i][1] is None:
                    del collected[i]
                    break
        j += 1
    return collected


class MorphoLexiconOverrides(object):
    """Registry of per-language functions that post-process the output of
    MorphoLexicon.lookup()."""

    def override_results(self, function):
        """Wrap `function` (the morpholex lookup) so that its result is run
        through every override registered for the request's source language.

        :param function: the lookup callable to wrap
        :return: the wrapping callable
        """
        def decorate(wordform, **input_kwargs):
            _from = input_kwargs.get('source_lang')
            raw_return = input_kwargs.get('return_raw_data', False)
            entries_and_tags = function(wordform, **input_kwargs)
            stdout = stderr = ''
            if raw_return:
                entries_and_tags, stdout, stderr = entries_and_tags
            for f in self.override_functions[_from]:
                new_res = f(entries_and_tags)
                if new_res is not None:
                    entries_and_tags = new_res
            if raw_return:
                return MorphoLexiconResult(entries_and_tags), stdout, stderr
            return MorphoLexiconResult(entries_and_tags)
        return decorate

    def post_morpho_lexicon_override(self, *language_isos):
        """Decorator: register the decorated function as an override for
        each of the given language ISO codes.

        :param language_isos: 3-character language ISOs the override serves
        :return: a decorator that registers and returns the function
        """
        def wrapper(override_function):
            for language_iso in language_isos:
                self.override_functions[language_iso].append(override_function)
                print('%s morpholex overrides: registered - %s' % (
                    language_iso, override_function.__name__))
            # Bug fix: the decorated function must be returned; previously
            # the decorated name was rebound to None at the call site.
            return override_function
        return wrapper

    def __init__(self):
        from collections import defaultdict
        # language iso -> list of override functions, in registration order
        self.override_functions = defaultdict(list)


morpholex_overrides = MorphoLexiconOverrides()


class MorphoLexiconResult(list):
    """A list of (entry, analyses) pairs with convenience accessors, meant
    to make sorting through results more readable."""

    @property
    def analyses(self):
        """All Lemma objects from every pair, concatenated into one list."""
        return sum(map(itemgetter(1), self), [])

    @property
    def entries(self):
        """A list of the entry objects from each pair."""
        # List comprehension so a real list is returned on both py2 and py3
        # (map() is a lazy iterator on py3).
        return [pair[0] for pair in self]


class MorphoLexicon(object):
    """Performs lookups with the morphological analyser and the lexicon
    working together."""

    morphology_kwarg_names = [
        'split_compounds',
        'non_compound_only',
        'no_derivations',
        'return_raw_data',
    ]

    lexicon_kwarg_names = [
        'source_lang',
        'target_lang',
        'lemma',
        'pos',
        'pos_type',
        'entry_hash',
    ]

    def _sort_kwargs(self, kwargs):
        """Split caller kwargs into morphology and lexicon kwargs and pop
        out `lemma_attrs` / `entry_hash`.

        NB: mutates `kwargs` (pops 'lemma_attrs'), as the original code did,
        so a later kwargs.copy() for recursion excludes lemma_attrs.

        :return: (morph_kwargs, lex_kwargs, lemma_attrs, entry_hash_filter)
        """
        lemma_attrs = kwargs.pop('lemma_attrs', {})
        entry_hash_filter = lemma_attrs.pop('entry_hash', False)
        morph_kwargs = dict((k, v) for k, v in kwargs.items()
                            if k in self.morphology_kwarg_names)
        lex_kwargs = dict((k, v) for k, v in kwargs.items()
                          if k in self.lexicon_kwarg_names)
        return morph_kwargs, lex_kwargs, lemma_attrs, entry_hash_filter

    @staticmethod
    def _analysis_lex_kwargs(analysis, wordform):
        """Build lexicon-lookup kwargs from one analysis (or a compound
        list of analyses).

        :return: kwargs dict, or None when the analysis should be skipped
            (no lemma on a compound's first part / empty analysis).
        """
        if isinstance(analysis, list):
            if not analysis[0].lemma:
                return None
            first = analysis[0]
        else:
            if not analysis:
                return None
            first = analysis
        return {
            'lemma': first.lemma,
            'pos': first.pos,
            'pos_type': False,
            'user_input': wordform,
        }

    @staticmethod
    def _group_by_entry(entries_and_tags):
        """Group adjacent (entry, analysis) pairs into (entry, [analyses])."""
        results = []
        ordered = _collect_same_lemma(entries_and_tags)
        # groupby only merges *consecutive* equal keys, hence the ordering
        # pass above.
        for entry, grouped in groupby(ordered, itemgetter(0)):
            group_analyses = [an for _, an in grouped if an is not None]
            results.append((entry, group_analyses))
        return results

    def lookup(self, wordform, **kwargs):
        """ Performs a lookup with morphology and lexicon working
        together.

            morpholexicon.lookup(wordform, keyword_arg_1=asdf,
                                 keyword_arg_2=bbq)

        Required keyword arguments:
          - `source_lang` - the 3-character ISO for the source language
          - `target_lang` - the 3-character ISO for the target language

        Optional lexicon keyword arguments:
          - `lemma` - A lemma, if a lemma is known
          - `pos` - Part of speech filter
          - `pos_type` - POS type filter
          - `entry_hash` - an entry hash to return a specific entry

        Optional morphology keyword arguments:
          - `split_compounds` - Split compounds in the morphology, and
            return lemmas for each part of the compound.
          - `non_compound_only` - Filter out compounds by removing
            analyses with the compound tag (see `compoundBoundary` in the
            language analyser's configuration).
          - `no_derivations` - Filter out derivations by removing
            analyses with the derivation tag (see `derivationMarker`).
          - `return_raw_data` - Include the raw stdout/stderr data from
            the analyzer.
        """
        source_lang = kwargs.get('source_lang')
        target_lang = kwargs.get('target_lang')

        morph_kwargs, lex_kwargs, lemma_attrs, entry_hash_filter = \
            self._sort_kwargs(kwargs)

        # TODO: if analyses dropping compounds results in lexicalized
        # form that does not exist in lexicon, then fall back to
        # compounds?

        # TODO: to hide more_info link properly, we still need to know
        # what information has been stripped away in morph_kwargs and
        # lex_kwargs, so a count of discarded results for at least one
        # of these would be good.  -- alternative is to run the lookup
        # twice, which might take too much time if someone's hitting
        # detail frequently.

        analyzer = self.analyzers.get(source_lang)
        try:
            analyses = analyzer.lemmatize(wordform, **morph_kwargs)
        except AttributeError:
            # No analyser configured for this language.
            analyses = []

        return_raw_data = morph_kwargs.get('return_raw_data', False)
        raw_output = ''
        raw_errors = ''
        if return_raw_data and analyses:
            analyses, raw_output, raw_errors = analyses

        entries_and_tags = []
        if analyses:
            for analysis in list(analyses):
                lex_kwargs = self._analysis_lex_kwargs(analysis, wordform)
                if lex_kwargs is None:
                    continue
                xml_result = self.lexicon.lookup(
                    source_lang, target_lang, **lex_kwargs)
                if xml_result:
                    for e in xml_result:
                        entries_and_tags.append((e, analysis))
                else:
                    entries_and_tags.append((None, analysis))

        # Plain wordform lookup, in case the lexicon has entries with no
        # corresponding analysis.
        no_analysis_xml = self.lexicon.lookup(
            source_lang, target_lang, wordform,
            lemma_attrs=lemma_attrs, user_input=wordform)
        if no_analysis_xml:
            for e in no_analysis_xml:
                entries_and_tags.append((e, None))

        if entry_hash_filter:
            # Entries lacking an XML node are kept, as before.
            entries_and_tags = [
                (e, t) for e, t in entries_and_tags
                if e is None or hash_node(e) == entry_hash_filter
            ]

        entries_and_tags = self._group_by_entry(entries_and_tags)

        # TODO: may need to do the same for derivation?
        # NOTE: test with things that will never return results just to
        # make sure recursion doesn't get carried away.
        if not entries_and_tags and 'non_compound_only' in kwargs:
            if kwargs['non_compound_only']:
                # Retry without the compound filter; self.lookup is the
                # override-wrapped version (see __init__).
                new_kwargs = kwargs.copy()
                new_kwargs.pop('non_compound_only')
                _ret = self.lookup(wordform, **new_kwargs)
            else:
                # NOTE(review): a plain list, not MorphoLexiconResult —
                # preserved from the original; callers may rely on it.
                _ret = []
        elif not entries_and_tags and not analyses:
            _ret = MorphoLexiconResult([])
        else:
            _ret = MorphoLexiconResult(entries_and_tags)

        if return_raw_data:
            return _ret, raw_output, raw_errors
        return _ret

    def __init__(self, config):
        self.analyzers = config.morphologies
        self.lexicon = config.lexicon
        # Wrap lookup so registered per-language overrides post-process
        # its results.
        self.lookup = morpholex_overrides.override_results(self.lookup)

    def variant_lookup(self, search_type, wordform, **kwargs):
        """ Performs a lookup with morphology and lexicon working
        together, for a specific lexicon variant (`search_type`), and also
        returns the "right-hand" result set.

        Takes the same keyword arguments as lookup(); see its docstring.
        Returns (result, right_result), or
        (result, raw_output, raw_errors, right_result) when
        `return_raw_data` is set.
        """
        source_lang = kwargs.get('source_lang')
        target_lang = kwargs.get('target_lang')

        morph_kwargs, lex_kwargs, lemma_attrs, entry_hash_filter = \
            self._sort_kwargs(kwargs)

        analyzer = self.analyzers.get(source_lang)
        analyses_right = []
        try:
            analyses = analyzer.lemmatize(wordform, **morph_kwargs)
        except AttributeError:
            # No analyser configured for this language.
            analyses = []

        return_raw_data = morph_kwargs.get('return_raw_data', False)
        raw_output = ''
        raw_errors = ''
        if return_raw_data and analyses:
            analyses, raw_output, raw_errors, analyses_right = analyses
        elif analyses:
            # Bug fix: only unpack when the analyser actually returned a
            # (left, right) pair; the original unpacked unconditionally and
            # raised ValueError when `analyses` was the empty fallback list.
            analyses, analyses_right = analyses

        entries_and_tags = []
        entries_and_tags_right = []
        if analyses:
            for analysis in list(analyses):
                lex_kwargs = self._analysis_lex_kwargs(analysis, wordform)
                if lex_kwargs is None:
                    continue
                xml_result = self.lexicon.variant_lookup(
                    source_lang, target_lang, search_type, **lex_kwargs)
                if xml_result:
                    for e in xml_result:
                        entries_and_tags.append((e, analysis))
                else:
                    entries_and_tags.append((None, analysis))

        if analyses_right:
            for analysis_r in list(analyses_right):
                lex_kwargs_right = self._analysis_lex_kwargs(
                    analysis_r, wordform)
                if lex_kwargs_right is None:
                    continue
                # NOTE(review): the right-hand side uses the plain lookup,
                # not variant_lookup, in the original — preserved as-is;
                # confirm whether this is intentional.
                xml_result_right = self.lexicon.lookup(
                    source_lang, target_lang, **lex_kwargs_right)
                if xml_result_right:
                    for e in xml_result_right:
                        entries_and_tags_right.append((e, analysis_r))
                else:
                    entries_and_tags_right.append((None, analysis_r))

        no_analysis_xml = self.lexicon.variant_lookup(
            source_lang, target_lang, search_type, wordform,
            lemma_attrs=lemma_attrs, user_input=wordform)
        if no_analysis_xml:
            for e in no_analysis_xml:
                entries_and_tags.append((e, None))
                entries_and_tags_right.append((e, None))

        if entry_hash_filter:
            # NOTE(review): as in the original, the hash filter is applied
            # to the left-hand results only.
            entries_and_tags = [
                (e, t) for e, t in entries_and_tags
                if e is None or hash_node(e) == entry_hash_filter
            ]

        entries_and_tags = self._group_by_entry(entries_and_tags)
        entries_and_tags_right = self._group_by_entry(entries_and_tags_right)

        # TODO: may need to do the same for derivation?
        # NOTE: test with things that will never return results just to
        # make sure recursion doesn't get carried away.
        if not entries_and_tags and 'non_compound_only' in kwargs:
            if kwargs['non_compound_only']:
                new_kwargs = kwargs.copy()
                new_kwargs.pop('non_compound_only')
                _ret = self.lookup(wordform, **new_kwargs)
            else:
                _ret = []
        elif not entries_and_tags and not analyses:
            _ret = MorphoLexiconResult([])
        else:
            _ret = MorphoLexiconResult(entries_and_tags)

        if not entries_and_tags_right and 'non_compound_only' in kwargs:
            if kwargs['non_compound_only']:
                # NOTE(review): falls back to self.lookup for the right-hand
                # result as well, as in the original.
                new_kwargs = kwargs.copy()
                new_kwargs.pop('non_compound_only')
                ret_right = self.lookup(wordform, **new_kwargs)
            else:
                ret_right = []
        elif not entries_and_tags_right and not analyses_right:
            ret_right = MorphoLexiconResult([])
        else:
            ret_right = MorphoLexiconResult(entries_and_tags_right)

        if return_raw_data:
            return _ret, raw_output, raw_errors, ret_right
        return _ret, ret_right