#!/usr/bin/env python # -*- encoding: utf-8 -*- """ Morphological tools """ from cache import cache import re import os import imp from itertools import groupby from operator import itemgetter # TODO: get from global path configs_path = os.path.join( os.path.dirname(__file__) , '../') class TagPart(object): """ This is a part of a tag, which should behave mostly like a string: >>> v = TagPart('V') >>> v == 'V' True >>> repr(v) 'V' >>> str(v) 'V' >>> unicode(v) u'V' Except when some additional attribuets are defined to allow for regular expression matching >>> v = TagPart({'match': '^PV', 'regex': True}) >>> v == 'bbq' False >>> v == 'PV/e' True If the tagset fails to compile: >>> v = TagPart({'match': '(asdf', 'regex': True}) >>> v == 'bbq' False >>> v == 'PV/e' True """ def __init__(self, _t): self._t = _t self.regex = False if type(_t) == dict: self.val = _t.get('match') self.regex = _t.get('regex', False) else: self.val = _t if self.regex: try: self._re = re.compile(self.val) except Exception, e: print self._t raise e def __unicode__(self): return self.val def __repr__(self): return self.val def __eq__(self, other): if self.regex: m = self._re.match(other) return m is not None else: return self.val == other class Tagset(object): def __init__(self, name, members): self.name = name self.members = map(TagPart, members) def __str__(self): return '' % self.name def __contains__(self, item): return item in self.members class Tagsets(object): def __init__(self, set_definitions): self.sets = {} self.set_definitions = set_definitions self.createTagSets() def createTagSets(self): for name, tags in self.set_definitions.iteritems(): tagset = Tagset(name, tags) self.set(name, tagset) def get(self, name): return self.sets.get(name, False) def __getitem__(self, key): return self.get(key) def set(self, name, tagset): self.sets[name] = tagset def all_tags(self): _all = list(set( sum( [v.members for k, v in self.sets.iteritems()], [] ) )) return _all class Tag(object): """ A model for tags. Can be used as an iterator, as well. #>> for part in Tag('N+G3+Sg+Ill', '+'): #>> print part Also, indexing is the same as Tag.getTagByTagset() >>> _type = Tagset('type', ['G3', 'NomAg']) >>> _case = Tagset('case', ['Nom', 'Ill', 'Loc']) >>> _ng3illsg = Tag('N+G3+Sg+Ill', '+') >>> _ng3illsg[_type] 'G3' >>> _ng3illsg[_case] 'Ill' >>> _pv = Tagset('preverb', ['1', '2', {'match': '^PV', 'regex': True}]) >>> pv_tag = Tag('PV/e+V+Sg', '+') >>> 'PV/e' in _pv True >>> pv_tag[_pv] 'PV/e' >>> pv_tag = Tag('PV/omgbbq+V+Sg', '+') >>> 'PV/omgbbq' in _pv True >>> pv_tag[_pv] != 'PV/e' True TODO: maybe also contains for tag parts and tagsets TODO: begin integrating Tag and Tagsets into morphology code below, will help when generalizing the lexicon-morphology 'type' and 'pos' stuff. E.g., in `sme`, we look up words by 'pos' and 'type' when it exists, but in other languages this will be different. As such, we will need `Tag`, and `Tagset` and `Tagsets` to mitigate this. Also, will need some sort of lexicon lookup definition in configs, to describe how to bring these items together. """ def __init__(self, string, sep, tagsets={}): self.tag_string = string self.sep = sep self.parts = self.tag_string.split(sep) if isinstance(tagsets, Tagsets): self.sets = tagsets.sets elif isinstance(tagsets, dict): self.sets = tagsets else: self.sets = tagsets def __contains__(self, b): if isinstance(b, str) or isinstance(b, unicode): return self.sets.get(b, False) return False def __getitem__(self, b): """ Overloading the xor operator to produce the tag piece that belongs to a given tagset. """ _input = b if isinstance(b, int): return self.parts[b] if isinstance(b, str) or isinstance(b, unicode): b = self.sets.get(b, False) if not b: _s = ', '.join(self.sets.keys()) raise IndexError("Invalid tagset <%s>. Choose one of: %s" % (_input, _s)) elif isinstance(b, Tagset): pass return self.getTagByTagset(b) def __iter__(self): for x in self.parts: yield x def __str__(self): return '' % self.sep.join(self.parts) def __repr__(self): return '' % self.sep.join(self.parts) def matching_tagsets(self): ms = {} for key in self.sets.keys(): if self[key]: ms[key] = self[key] return ms def getTagByTagset(self, tagset): for p in self.parts: if p in tagset.members: return p def splitByTagset(self, tagset): """ #>> tagset = Tagset('compound', ['Cmp#']) [Cmp#] #>> tag = Tag('N+Cmp#+N+Sg+Nom') #>> tag.splitByTagset(tagset) [, ] """ raise NotImplementedError class Lemma(object): """ Lemma class that is bound to the morphology """ def __key(self): return ( self.lemma , self.pos , self.tool.formatTag(self.tag_raw) ) def __eq__(x, y): return x.__key() == y.__key() def __hash__(self): return hash(self.__key()) def __unicode__(self): return self.lemma def __repr__(self): _lem, _pos, _tag = self.__key() _lem = unicode(_lem).encode('utf-8') _pos = unicode(_pos).encode('utf-8') _tag = unicode(_tag).encode('utf-8') cls = self.__class__.__name__ return '<%s: %s, %s, %s>' % (cls, _lem, _pos, _tag) def prepare_tag(self, tag, tagsets): """ Clean up the tag, lemma, and POS, make adjustments depending on whether the langauge has tags before the lemma. NB: if there's a problem here, make sure any possible tags before the lemma are defined as some member of any tagset. """ all_tags = tagsets.all_tags() self.tag = self.tool.tagStringToTag( tag , tagsets=tagsets ) # Best guess is the first item, otherwise... lemma = tag[0] # Best guess is the first item, otherwise... #self.lemma = tag[0] #del tag[0] if lemma in all_tags: # Separate out items that are not values in a tagset, these # are probably the lemma. not_tags = [t for t in tag if t not in all_tags] if len(not_tags) > 0: self.lemma = not_tags[0] else: self.lemma = tag[0] else: self.lemma = lemma self.pos = self.tag['pos'] self.tag_raw = tag def __init__(self, tag=[''], _input=False, tool=False, tagsets={}): self.tagsets = tagsets self.tool = tool self.prepare_tag(tag, tagsets) if 'pos' in self.tag: self.pos = self.tag['pos'] else: self.pos = self.tag.parts[0] self.input = _input self.form = _input class GeneratedForm(Lemma): """ Helper class for generated forms, adds attribute `self.form`, alters repr format. """ def __key(self): return ( self.lemma , self.pos , self.tool.formatTag(self.tag_raw) ) def __repr__(self): _lem, _pos, _tag = self.__key() _lem = unicode(_lem).encode('utf-8') _pos = unicode(_pos).encode('utf-8') _tag = unicode(_tag).encode('utf-8') f = unicode(self.form).encode('utf-8') cls = self.__class__.__name__ return '<%s: %s, %s, %s, %s>' % (cls, f, _lem, _pos, _tag) def __init__(self, *args, **kwargs ): super(GeneratedForm, self).__init__(*args, **kwargs) self.form = self.input def word_generation_context(generated_result, *generation_input_args, **generation_kwargs): """ **Post-generation filter*** Include context for verbs in the text displayed in paradigm generation. The rule in this case is rather complex, and looks at the tag used in generation. Possible contexts: * (mun) dieđán """ language = generation_kwargs.get('language') from jinja2 import Template from flask import current_app context_for_tags = current_app.config.paradigm_contexts.get(language, {}) node = generation_input_args[2] if len(node) == 0: return generated_result context = node.xpath('.//l/@context') if len(context) > 0: context = context[0] else: context = None def apply_context(form): # tag, forms = form # trigger different tuple lengths and adjust the entities #([u'viessat', u'V', u'Ind', u'Prt', u'Pl1'], [u'viesaimet']) # ==> (u'viessat', [u'V', u'Ind', u'Prt', u'Pl1'], [u'viesaimet']) # fix for the bug 2406 if len(form) == 2: tmp_tag, tmp_forms = form tmp_lemma = tmp_tag[0] tmp_tag = tmp_tag[1:len(tmp_tag)] form = (tmp_lemma, tmp_tag, tmp_forms) lemma, tag, forms = form tag = '+'.join(tag) # Get the context, but also fall back to the None option. context_formatter = context_for_tags.get( (context, tag), context_for_tags.get( (None, tag), False ), ) if context_formatter: formatted = [] if forms: for f in forms: _kwargs = {'word_form': f, 'context': context} if isinstance(context_formatter, Template): f = context_formatter.render(**_kwargs) else: f = context_formatter % _kwargs formatted.append(f) formatted_forms = formatted else: formatted_forms = forms tag = tag.split('+') return (tag, formatted_forms) return map(apply_context, generated_result) class GenerationOverrides(object): """ Class for collecting functions marked with decorators that provide special handling of tags. One class instantiated in morphology module: `generation_overrides`. #>> @generation_overrides.tag_filter_for_iso('sme') #>> def someFunction(form, tags, xml_node): #>> ... some processing on tags, may be conditional, etc. #>> return form, tags, xml_node Each time morphology.generation is run, the args will be passed through all of these functions in the order that they were registered, allowing for language-specific conditional rules for filtering. There is also a post-generation tag rewrite decorator registry function """ ## ### Here are the functions that apply all the rules ## def restrict_tagsets(self, lang_code, function): """ This runs through each function in the tagset restriction registry, and applies it to the input arguments of the decorated function. """ def decorate(*args, **kwargs): newargs = args newkwargs = kwargs for f in self.registry[lang_code]: newargs = f(*newargs, **newkwargs) return function(*newargs, **newkwargs) return decorate def process_generation_output(self, lang_code, function): """ This runs the generator function, and applies all of the function contexts to the output. Or in other words, this decorator works on the output of the decorated function, but also captures the input arguments, making them available to each function in the registry. """ def decorate(*input_args, **input_kwargs): raw = input_kwargs.get('return_raw_data', False) if raw: generated_forms, stdout, stderr = function(*input_args, **input_kwargs) else: generated_forms = function(*input_args, **input_kwargs) for f in self.postgeneration_processors[lang_code]: generated_forms = f(generated_forms, *input_args, **input_kwargs) for f in self.postgeneration_processors['all']: input_kwargs['language'] = lang_code if f not in self.postgeneration_processors[lang_code]: generated_forms = f(generated_forms, *input_args, **input_kwargs) if raw: return generated_forms, stdout, stderr else: return generated_forms return decorate def process_analysis_output(self, lang_code, function): """ This runs the analysis function, and applies all of the function contexts to the output. Or in other words, this decorator works on the output of the decorated function, but also captures the input arguments, making them available to each function in the registry. """ def decorate(*input_args, **input_kwargs): generated_forms = function(*input_args, **input_kwargs) for f in self.postanalyzers[lang_code]: generated_forms = f( generated_forms , *input_args , **input_kwargs ) return generated_forms return decorate def apply_pregenerated_forms(self, lang_code, function): def decorate(*args, **kwargs): newargs = args newkwargs = kwargs f = self.pregenerators.get(lang_code, False) if f: newargs = f(*newargs, **newkwargs) return function(*newargs, **newkwargs) return decorate ## ### Here are the decorators ## def post_analysis_processor_for_iso(self, *language_isos): """ For language specific processing after analysis is completed, for example, stripping tags before presentation to users. """ def wrapper(postanalysis_function): for language_iso in language_isos: self.postanalyzers[language_iso].append(postanalysis_function) self.postanalyzers_doc[language_iso].append((postanalysis_function.__name__, postanalysis_function.__doc__)) print '%s overrides: registered post-analysis processor - %s' % \ ( language_iso , postanalysis_function.__name__ ) return wrapper def pregenerated_form_selector(self, *language_isos): """ The function that this decorates is used to select and construct a pregenerated paradigm for a given word and XML node. Only one may be defined. """ def wrapper(pregenerated_selector_function): for language_iso in language_isos: self.pregenerators[language_iso] = pregenerated_selector_function self.pregenerators_doc[language_iso] = [(pregenerated_selector_function.__name__, pregenerated_selector_function.__doc__)] print '%s overrides: registered static paradigm selector - %s' % \ ( language_iso , pregenerated_selector_function.__name__ ) return wrapper def tag_filter_for_iso(self, *language_isos): """ Register a function for a language ISO """ def wrapper(restrictor_function): for language_iso in language_isos: self.registry[language_iso].append(restrictor_function) self.tag_filter_doc[language_iso].append( ( restrictor_function.__name__ , restrictor_function.__doc__ ) ) print '%s overrides: registered pregeneration tag filterer - %s' %\ ( language_iso , restrictor_function.__name__ ) return wrapper def postgeneration_filter_for_iso(self, *language_isos): """ Register a function for a language ISO """ def wrapper(restrictor_function): for language_iso in language_isos: self.postgeneration_processors[language_iso]\ .append(restrictor_function) self.postgeneration_processors_doc[language_iso]\ .append((restrictor_function.__name__, restrictor_function.__doc__)) print '%s overrides: registered entry context formatter - %s' %\ ( language_iso , restrictor_function.__name__ ) return wrapper def __init__(self): from collections import defaultdict self.registry = defaultdict(list) self.tag_filter_doc = defaultdict(list) self.pregenerators = defaultdict(list) self.pregenerators_doc = defaultdict(list) self.postanalyzers = defaultdict(list) self.postanalyzers_doc = defaultdict(list) self.postgeneration_processors = defaultdict(list) self.postgeneration_processors['all'] = [ word_generation_context ] self.postgeneration_processors_doc = defaultdict(list) generation_overrides = GenerationOverrides() class XFST(object): def splitTagByCompound(self, analysis): _cmp = self.options.get('compoundBoundary', False) if _cmp: #analysis_split = analysis.split(_cmp) #if 'Cmp' in analysis: # last_analysis = analysis_split[len(analysis_split)-1] # analysis_split[len(analysis_split)-1] = last_analysis+'+DCmp' return analysis.split(_cmp) #return analysis_split else: return [analysis] def splitTagByString(self, analysis, tag_input): def splitTag(item, tag_string): if tag_string in item: res = [] while tag_string in item: fa = re.findall(tag_string, item) if len(fa) == 1: res.append(item[0:item.find("+"+tag_string)]) res.append(item[item.find("+"+tag_string)+1:len(item)]) break else: result = item[0:item.find("+"+tag_string)] result2 = item[item.find("+"+tag_string)+1:len(item)] res.append(result) item = result2 myres_array.append(res) else: myres_array.append(item) return global myres_array myres_array = [] if isinstance(analysis, list): for var in analysis: splitTag(var, tag_input) else: splitTag(analysis, tag_input) fin_res = [] for item in myres_array: if isinstance(item, list): for var in item: fin_res.append(var) else: fin_res.append(item) return fin_res def tag_processor(self, analysis_line): """ This is a default tag processor which just returns the wordform separated from the tag for a given line of analysis. You can write a function to replace this for an individual morphology by adding it to a file somewhere in the PYTHONPATH, and then setting the Morphology option `tagProcessor` to this path. Ex.) Morphology: crk: options: tagProcessor: "configs/language_specific_rules/file.py:function_name" Note the colon. It may also be a good idea to write some tests in the docstring for that function. If these are present they will be quickly tested on launch of the service, and failures will prevent launch. A tag processor must accept a string as input, and return a tuple of the wordform and processed tag. You may do this to for example, re-order tags, or relabel them, but whateve the output is, it must be a string. Ex.) 'wordform\tlemma+Tag+Tag+Tag' -> ('wordform', 'lemma+Tag+Tag+Tag') """ wordform, _, lemma_tags = analysis_line.partition('\t') return (wordform, lemma_tags) def clean(self, _output): """ Clean XFST lookup text into [('keenaa', ['keen+V+1Sg+Ind+Pres', 'keen+V+3SgM+Ind+Pres']), ('keentaa', ['keen+V+2Sg+Ind+Pres', 'keen+V+3SgF+Ind+Pres'])] """ analysis_chunks = [a for a in _output.split('\n\n') if a.strip()] cleaned = [] for chunk in analysis_chunks: lemmas = [] analyses = [] for part in chunk.split('\n'): (lemma, analysis) = self.tag_processor(part) lemmas.append(lemma) analyses.append(analysis) lemma = list(set(lemmas))[0] append_ = (lemma, analyses) cleaned.append(append_) return cleaned @cache.memoize(60 * 5) def _exec(self, _input, cmd, timeout=5): """ Execute a process, but kill it after 5 seconds. Generally we expect small things here, not big things. """ import subprocess from threading import Timer try: _input = _input.encode('utf-8') except: pass try: lookup_proc = subprocess.Popen(cmd.split(' '), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError: raise Exception("Error executing lookup command for this request, confirm that lookup utilities and analyzer files are present.") except Exception, e: raise Exception("Unhandled exception <%s> in lookup request" % e) def kill_proc(proc=lookup_proc): try: proc.kill() raise Exception("Process for %s took too long." % cmd) except OSError: pass return if not timeout: t = Timer(5, kill_proc) t.start() output, err = lookup_proc.communicate(_input) if output: try: output = output.decode('utf-8') except: pass if err: try: err = err.decode('utf-8') except: pass return (output, err) def load_tag_processor(self): import sys # import doctest print >> sys.stdout, "Loading the tag processor." _path = self.options.get('tagProcessor') module_path, _, from_list = _path.partition(':') try: mod = imp.load_source('.', os.path.join(configs_path, module_path)) except: sys.exit("Unable to import <%s>" % module_path) try: func = mod.__getattribute__(from_list) except: sys.exit("Unable to load <%s> from <%s>" % (from_list, module_path)) self.tag_processor = func def __init__(self, lookup_tool, fst_file, ifst_file=False, options={}): self.cmd = "%s -flags mbTT %s" % (lookup_tool, fst_file) self.options = options if ifst_file: self.icmd = "%s -flags mbTT %s" % (lookup_tool, ifst_file) else: self.icmd = False if 'tagProcessor' in self.options: self.load_tag_processor() def applyMorph(self, morph): morph.tool = self self.logger = morph.logger return morph def lookup(self, lookups_list, raw=False): lookup_string = '\n'.join(lookups_list) output, err = self._exec(lookup_string, cmd=self.cmd) if len(output) == 0 and len(err) > 0: name = self.__class__.__name__ msg = """%s - %s: %s""" % (self.langcode, name, err) self.logger.error(msg.strip()) if raw: return self.clean(output), output, err return self.clean(output) def inverselookup_by_string(self, lookup_string): import sys if not self.icmd: print >> sys.stderr, " * Inverse lookups not available." return False output, err = self._exec(lookup_string, cmd=self.icmd) return self.clean(output) def inverselookup(self, lemma, tags, raw=False, no_preprocess_paradigm=False): import sys if not self.icmd: print >> sys.stderr, " * Inverse lookups not available." return False lookups_list = [] # Some templates (namely those where there are tags before # the lemma), will cause problems. Thus if the lemma is # already in the tag, we consider this to be a completed tag # string for generation. Otherwise, prefix the lemma then # send to generation. # if not no_preprocess_paradigm: for tag in tags: if lemma in tag: combine = tag else: combine = [lemma] + tag lookups_list.append(self.formatTag(combine, inverse=True)) lookup_string = '\n'.join(lookups_list) else: lookup_string = tags + '\n' output, err = self._exec(lookup_string, cmd=self.icmd) if raw: return self.clean(output), output, err else: return self.clean(output) def tagUnknown(self, analysis): if '+?' in analysis: return True else: return False def tagStringToTag(self, parts, tagsets={}, inverse=False): if inverse: delim = self.options.get('inverse_tagsep', self.options.get('tagsep', '+')) else: delim = self.options.get('tagsep', '+') tag = delim.join(parts) return Tag(tag, delim, tagsets=tagsets) def formatTag(self, parts, inverse=False): if inverse: delim = self.options.get('inverse_tagsep', self.options.get('tagsep', '+')) else: delim = self.options.get('tagsep', '+') return delim.join(parts) def splitAnalysis(self, analysis, inverse=False): """ u'lemma+Tag+Tag+Tag' -> [u'lemma', u'Tag', u'Tag', u'Tag'] """ if inverse: delim = self.options.get('inverse_tagsep', self.options.get('tagsep', '+')) else: delim = self.options.get('tagsep', '+') return analysis.split(delim) class HFST(XFST): def __init__(self, lookup_tool, fst_file, ifst_file=False, options={}): self.cmd = "%s %s" % (lookup_tool, fst_file) self.options = options if ifst_file: self.icmd = "%s %s" % (lookup_tool, ifst_file) else: self.icmd = False if 'tagProcessor' in self.options: self.load_tag_processor() class OBT(XFST): """ TODO: this is almost like CG, so separate out those things if necessary. """ def clean(self, _output): """ Clean CG lookup text into [('keenaa', ['keen+V+1Sg+Ind+Pres', 'keen+V+3SgM+Ind+Pres']), ('keentaa', ['keen+V+2Sg+Ind+Pres', 'keen+V+3SgF+Ind+Pres'])] """ analysis_chunks = [] chunk = [] for line in _output.splitlines(): if line.startswith('"<'): if len(chunk) > 0: analysis_chunks.append(chunk) chunk = [line] continue elif line.startswith("\t\""): chunk.append(line.strip()) else: analysis_chunks.append(chunk) cleaned = [] for chunk in analysis_chunks: form, analyses = chunk[0], chunk[1::] lemmas = [] tags = [] for part in analyses: tagparts = part.split(' ') lemma = tagparts[0] lemma = lemma.replace('"', '') lemmas.append(lemma) tags.append(' '.join([lemma] + tagparts[1::])) lemma = list(set(lemmas))[0] form = form[2:len(form) - 2] append_ = (form, tags) cleaned.append(append_) return cleaned def splitAnalysis(self, analysis): return analysis.split(' ') def __init__(self, lookup_tool, options={}): self.cmd = lookup_tool self.options = options class Morphology(object): def generate_to_objs(self, *args, **kwargs): # TODO: occasionally lemma is not lemma, but first part of a # tag, need to fix with the tagsets def make_lemma(r): lems = [] tag, forms = r if isinstance(forms, list): for f in forms: lem = GeneratedForm(tag, _input=f, tool=self.tool, tagsets=self.tagsets) lems.append(lem) else: lems = [] return lems generate_out, stdin, stderr = self.generate(*args, **kwargs) generated = sum(map(make_lemma, generate_out), []) return_raw_data = kwargs.get('return_raw_data', False) if return_raw_data: return generated, stdin, stderr else: return generated def generate(self, lemma, tagsets, node=None, pregenerated=None, **kwargs): """ Run the lookup command, parse output into [(lemma, ['Verb', 'Inf'], ['form1', 'form2'])] If pregenerated, we pass the forms in using the same structure as the analyzed output. The purpose here is that pregenerated forms in lexicon may differ from language to language, and we want to allow processing for that to occur elsewhere. TODO: cache pregenerated forms, return them. """ return_raw_data = kwargs.get('return_raw_data', False) no_preprocess_paradigm = kwargs.get('no_preprocess_paradigm', False) # tagsets as passed in include the lemma and do not require # preprocessing to add it in # if no_preprocess_paradigm: if len(node) > 0: key = self.generate_cache_key(lemma, tagsets, node) else: key = self.generate_cache_key(lemma, tagsets) stdout_key = key + 'stdout' stderr_key = key + 'stderr' _is_cached = self.cache.get(key) if _is_cached: if return_raw_data: cache_stdout = self.cache.get(stdout_key) cache_stderr = self.cache.get(stdout_key) if cache_stdout is None: cache_stdout = 'no cache data' if cache_stderr is None: cache_stderr = 'no cache data' return _is_cached, 'stdout cached: ' + cache_stdout, 'stderr cached: ' + cache_stderr else: return _is_cached # TODO: cache if pregenerated: _is_cached = self.cache.set(key, pregenerated) if return_raw_data: return pregenerated, 'pregenerated', '' else: return pregenerated if return_raw_data: res, raw_output, raw_errors = self.tool.inverselookup(lemma, tagsets, raw=True, no_preprocess_paradigm=no_preprocess_paradigm) else: res = self.tool.inverselookup(lemma, tagsets, no_preprocess_paradigm=no_preprocess_paradigm) raw_output = '' raw_errors = '' reformatted = [] tag = False for tag, forms in res: unknown = False for f in forms: # TODO: how does OBT handle unknown? if '+?' in f: unknown = True msg = self.tool.__class__.__name__ + ': ' + \ tag + '\t' + '|'.join(forms) self.tool.logger.error(msg) if not unknown: reformatted.append((self.tool.splitAnalysis(tag, inverse=True), forms)) else: parts = self.tool.splitAnalysis(tag, inverse=True) forms = False reformatted.append((parts, forms)) # Log generation error: if len(reformatted) == 0: logg_args = [ 'GENERATE', self.langcode, tag or '', ] if len(tagsets) > 0: _tagsets = ','.join(['+'.join(t) for t in tagsets]) else: _tagsets = '' logg_args.append(_tagsets) if 'extra_log_info' in kwargs: _extra_log_info = kwargs.pop('extra_log_info') extra_log_info = ', '.join(["%s: %s" % (k, v) for (k, v) in _extra_log_info.iteritems()]) extra_log_info = extra_log_info.encode('utf-8') logg_args.append(extra_log_info) logg = "\t".join([a for a in logg_args if a]) self.logger.error(logg.strip()) _is_cached = self.cache.set(key, reformatted) _is_cached_out = self.cache.set(stdout_key, raw_output) _is_cached_ert = self.cache.set(stderr_key, raw_errors) if return_raw_data: return reformatted, raw_output, raw_errors else: return reformatted # TODO: option, or separate function to also return discarded to # find out what's been removed to hide more_info link def lemmatize(self, form, split_compounds=False, non_compound_only=False, no_derivations=False, return_raw_data=False): """ For a wordform, return a list of lemmas """ def remove_compound_analyses(_a): _cmp = self.tool.options.get('compoundBoundary', False) if not _cmp: return True if _cmp in _a: return False else: return True def remove_derivations(_a): _der = self.tool.options.get('derivationMarker', False) if not _der: return True if _der in _a: return False else: return True def maybe_filter(function, iterable): result = filter(function, iterable) if len(result) > 0: return result else: return iterable #If the user input is lexicalized then put it as the first element in analyses def check_if_lexicalized(array): for i in range(0, len(array)): if form in array[i]: array.insert(0,array[i]) del array[i+1] break #If the user input is not in the base form, the for above doesn't find the analyses #so find the longest analyses and put it/them in the first/s element/s #in analyses if it is not one of the single parts mystr = [] indmax = [] for i in range(0, len(array)): mystr.append(len(array[i][0:array[i].find("+")])) indmax = [i for i, j in enumerate(mystr) if j == max(mystr)] if not (max(mystr) < len(form)): k = 0 for i in range(0, len(indmax)): array.insert(k, array.pop(indmax[i])) k += 1 return array if return_raw_data: lookups, raw_output, raw_errors = self.tool.lookup([form], raw=True) else: lookups = self.tool.lookup([form]) # Check for unknown unknown = False for k, v in lookups: for a in v: if '?' in a: unknown = True if unknown: if return_raw_data: return False, raw_output, raw_errors else: return False #lemmas = set() #Use list() instead of set() to keep original order lemmas = list() ##lemmas_r = list() for _form, analyses in lookups: if non_compound_only: analyses = maybe_filter(remove_compound_analyses, analyses) if no_derivations: analyses = maybe_filter(remove_derivations, analyses) #Introduce the variable 'analyses_right' because in some cases when Der/ tags # we want to show only specific analyses and not all ##analyses_right = analyses analyses_der = analyses #In case of multiple analyses with different types of Der we need to keep them all #so in each case we append the results #(probably no need for all these variables, so maybe TODO: clean) ##analyses_right_fin = [] analyses_der_fin = [] analyses = check_if_lexicalized(analyses) cnt = [] for item in analyses: cnt.append(item.count('Der')) cnt_orth = [] for item in analyses: cnt_orth.append(item.count('Err/Orth')) import heapq if (min(cnt_orth) == 0 and max(cnt_orth) == 1) or (min(cnt_orth) == 0 and max(cnt_orth) == 0): if len(cnt)>1 and min(cnt)==0 and heapq.nsmallest(2, cnt)[-1] != 0: analyses = [analyses[cnt.index(min(cnt))],analyses[cnt.index(heapq.nsmallest(2, cnt)[-1])]] else: if min(cnt) != 0: analyses = [analyses[cnt.index(min(cnt))]] else: if (min(cnt_orth) == 1 and max(cnt_orth) == 1): analyses = analyses if split_compounds: analyses = sum( map(self.tool.splitTagByCompound, analyses) , [] ) tags = ('Der', 'VAbess', 'VGen', 'Ger', 'Comp', 'Superl') an_split = [] for item in analyses: an_split.append(item.split('+')) k = 0 for item in an_split: index = [] if_tags = False for i in range(0, len(item)): if item[i].startswith(tags): index.append(i) if_tags = True s = '+' b = [] if not if_tags: b.append(analyses[k]) else: for i in range(len(index)): if i == 0: b.append(s.join(item[0:index[i]])) else: b.append(s.join(item[index[i-1]:index[i]])) if i==len(index)-1: b.append(s.join(item[index[i]:len(item)])) k += 1 analyses_der_fin.append(b) def fix_nested_array(nested_array): not_nested_array = [] if len(nested_array) != 0: if isinstance(nested_array[0],list): for item in nested_array: if len(item)>1: for var in item: not_nested_array.append(var) else: not_nested_array.append(item[0]) else: not_nested_array = analyses return not_nested_array #Fix in case analyses_der_fin and analyses_right_fin are nested arrays array_not_nested = fix_nested_array(analyses_der_fin) def remove_duplicates(array_var): newlist = [] for item in array_var: if item not in newlist: newlist.append(item) return newlist #Remove duplicates due to append in different der types analyses_der_fin = remove_duplicates(array_not_nested) ##analyses_right_fin = analyses_der_fin for analysis in analyses_der_fin: # TODO: here's where to begin solving finding a lemma # from: # PV/maci+PV/pwana+nipâw+V+AI+Ind+Prs+1Sg _an_parts = self.tool.splitAnalysis(analysis) # If a word doesn't have a PoS in an analysis, we try to # handle it as best as possible. if len(_an_parts) == 1: _lem = _an_parts[0] lem = Lemma(_an_parts, _input=_lem, tool=self.tool, tagsets=self.tagsets) else: lem = Lemma( _an_parts , _input=form , tool=self.tool, tagsets=self.tagsets ) #lemmas.add(lem) lemmas.append(lem) ## '''for analysis_r in analyses_right_fin: # TODO: here's where to begin solving finding a lemma # from: # PV/maci+PV/pwana+nipâw+V+AI+Ind+Prs+1Sg _an_parts = self.tool.splitAnalysis(analysis_r) # If a word doesn't have a PoS in an analysis, we try to # handle it as best as possible. if len(_an_parts) == 1: _lem = _an_parts[0] lem = Lemma(_an_parts, _input=_lem, tool=self.tool, tagsets=self.tagsets) else: lem = Lemma( _an_parts , _input=form , tool=self.tool, tagsets=self.tagsets ) #lemmas.add(lem) lemmas_r.append(lem)'''## if return_raw_data: ##return list(lemmas), raw_output, raw_errors, list(lemmas_r) return list(lemmas), raw_output, raw_errors else: ##return list(lemmas), list(lemmas_r) return list(lemmas) def de_pickle_lemma(self, lem, tag): _tag = self.tool.splitAnalysis(tag) lem = Lemma( lem, '', _tag, fulltag=_tag , tool=self.tool, tagsets=self.tagsets ) return lem def generate_cache_key(self, lemma, generation_tags, node=False): """ key is something like generation-LANG-nodehash-TAG|TAG|TAG """ import hashlib if type(generation_tags) == list: _cache_tags = '|'.join(['+'.join(a) for a in generation_tags]) else: _cache_tags = generation_tags _cache_key = hashlib.md5() _cache_key.update('generation-%s-' % self.langcode) _cache_key.update(lemma.encode('utf-8')) if node is not None: if len(node) > 0: node_hash = node.__hash__() _cache_key.update(str(node_hash)) _cache_key.update(_cache_tags.encode('utf-8')) return _cache_key.hexdigest() def __init__(self, languagecode, tagsets={}, cache=False): self.langcode = languagecode self.generate = generation_overrides.apply_pregenerated_forms( languagecode, self.generate ) self.generate = generation_overrides.restrict_tagsets( languagecode, self.generate ) self.generate = generation_overrides.process_generation_output( languagecode, self.generate ) self.lemmatize = generation_overrides.process_analysis_output( languagecode, self.lemmatize ) if cache: self.cache = cache else: self.cache = False import logging logfile = logging.FileHandler('morph_log.txt') self.logger = logging.getLogger('morphology') self.logger.setLevel(logging.ERROR) self.logger.addHandler(logfile) self.tagsets = Tagsets(tagsets)