# -*- encoding: utf-8 -*-
from django.core.management.base import BaseCommand, CommandError
from optparse import make_option

try:
    _read_line = raw_input  # Python 2
except NameError:  # Python 3 renamed raw_input() to input()
    _read_line = input

# Word fields whose distinct values are combined by getUniques() when it is
# called from Command.handle(). Commented-out names are not used.
WORD_ATTRS = [
    # 'wordid',
    'language',
    # 'lemma',
    # 'presentationform',
    'pos',
    'stem',
    'wordclass',
    # 'valency',
    # 'hid',
    # 'diphthong',
    # 'gradation',
    # 'rime',
    # 'attrsuffix',
    # 'soggi',
    # TODO: limit value options to say, 3, + None
    'compare',
    'frequency',
    'geography',
    # 'tcomm',
]

# Not referenced by the code visible in this file.
WORD_MANYTOMANIES = [
    'semtype',
    'source',
    'dialects',
]

# Only referenced from a commented-out call in Command.handle().
WORD_TRANSLATION_ATTRS = [
    'language',
    # 'wordid',
    # 'lemma',
    # 'phrase',
    # 'explanation',
    'pos',
    'frequency',
    'geography',
    'tcomm',
    'tcomm_pref',
]

# Not referenced by the code visible in this file.
WORD_TRANSLATION_MANYTOMANIES = [
    'semtype',
    'source',
]


def _product_kwargs(values_dict):
    """Yield one filter-kwargs dict per combination of candidate values.

    ``values_dict`` maps a field name to a list of candidate values. Each
    element of the cartesian product of those lists becomes a dict; falsy
    values (such as the ``None`` placeholder added by ``_unique_values``)
    are omitted, leaving that field unconstrained in that combination.
    """
    from itertools import product

    keys = list(values_dict)
    for combo in product(*[values_dict[key] for key in keys]):
        yield dict((key, value) for key, value in zip(keys, combo) if value)
    # Non-generator version
    # return map(zipper, value_product)


def _unique_values(obj, key_list):
    """Return ``{field: [distinct truthy values..., None]}`` for ``obj``.

    Runs one ``values_list`` query per field in ``key_list`` and prints the
    values that were found.
    """
    print("Getting unique attributes for <%s>..." % obj.__name__)
    attr_values = {}
    # TODO: limit unique values
    # ('lemma', 5) = limits to 5, excluding None
    # ('lemma', ['2syll', '3syll'] = excludes other values not
    # these, then includes None
    for item in key_list:
        distinct = set(obj.objects.all().values_list(item, flat=True))
        vals = [v for v in distinct if v]
        # None stands for "do not filter on this field".
        vals.append(None)
        attr_values[item] = vals
        print(' - %s' % item)
        print(' ' + ', '.join([repr(a) for a in vals]))
    return attr_values


def _get_unique_words(model, kwarg_list, limit=3):
    """Return primary keys of up to ``limit`` random rows per kwargs dict.

    A file named ``pickle_unique_<model name>`` (relative to the current
    working directory) caches the kwargs dicts that matched at least one
    row. When that file can be loaded it replaces ``kwarg_list``; otherwise
    the given ``kwarg_list`` is used and the cache file is written at the
    end. The returned list may contain the same key more than once.
    """
    import pickle

    cache_path = 'pickle_unique_' + model.__name__
    cache_loaded = False
    try:
        with open(cache_path, 'rb') as f:
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only acceptable while the cache file is written by this
            # command alone.
            kwarg_list = pickle.load(f)
    except Exception as err:
        # Best effort: a missing or unreadable cache only means that all
        # combinations are tried again and the cache is rebuilt.
        print(err)
    else:
        cache_loaded = True
        print(kwarg_list)

    uniq_words = []
    kwargs_with_things = []
    count = 0
    for item in kwarg_list:
        # Evaluate the random sample exactly once; every evaluation of an
        # order_by('?') queryset may return different rows.
        rows = list(
            model.objects.filter(**item)
            .order_by('?')
            .values_list('pk', 'lemma', 'pos')[:limit]
        )
        if rows:
            uniq_words.extend(pk for pk, _lemma, _pos in rows)
            kwargs_with_things.append(item)
            print('Selecting for kwargs: %s' % repr(item))
            print('\n -'.join(
                '%s: %s' % (lemma, pos) for _pk, lemma, pos in rows))
        else:
            print(count)
        count += 1

    print('Total fetched:  %s' % len(uniq_words))
    print('Product count:  %s' % count)

    if not cache_loaded:
        with open(cache_path, 'wb') as f:
            pickle.dump(kwargs_with_things, f)
    return uniq_words


def getUniques(model, model_attributes):
    """Return primary keys of random ``model`` rows per attribute combination.

    Collects the distinct values of every field in ``model_attributes``,
    prints how many value combinations exist, waits for the user to press
    Enter, and then samples up to three random rows for each combination.

    Returns a list of primary keys (duplicates are possible).
    """
    model_vals = _unique_values(model, model_attributes)

    # Size of the cartesian product, without generating it a second time.
    iterations = 1
    for vals in model_vals.values():
        iterations *= len(vals)
    print('Iterations: %d' % iterations)
    print('Ok? [Enter]')
    _read_line()

    return _get_unique_words(model, _product_kwargs(model_vals))


class Command(BaseCommand):
    """Management command that selects a reduced set of Word ids."""

    # NOTE(review): args, help and the options below describe question
    # testing ("testquestions"); handle() does not read any of these
    # options. Confirm before relying on the help text.
    args = '--grammarfile FILE --questionfile FILE --qid QID'

    help = (
        " Runs through a question XML file and produces test sentences. "
        "Errors are printed to stderr, so that the rest can be filtered out. "
        "Example command: "
        "./manage.py testquestions --grammarfile grammar_defaults.xml \\ "
        "--questionfile noun_questions.xml \\ "
        "--logfile accusative_errors.log \\ "
        "--iterations 3 \\ "
        "--qid acc# "
    )

    option_list = BaseCommand.option_list + (
        make_option("-g", "--grammarfile", dest="grammarfile", default=False,
                    help="XML-file for grammar defaults for questions"),
        make_option("-q", "--questionfile", dest="questionfile", default=False,
                    help="XML-file that contains questions"),
        make_option("--qid", dest="qid", default=False,
                    help="Specify a list of IDs to test with commas and no spaces, or specify a partial part of an id to filter questions by, e.g. ill1,ill2 OR ill#; note the wildcard symbol."),
        make_option("--iterations", dest="itercount", default=5,
                    help="The count of iterations for each question"),
        make_option("--logfile", dest="logfile", default=False,
                    help="Store all output to a file in addition to stdout."),
        # TODO: question iterations count
    )

    def handle(self, *args, **options):
        """ Minimizes data in database to a set of test data for later import.

        Collects Word ids via getUniques(), adds up to three random words
        for every Semtype, and prints the number of distinct ids. Nothing
        is returned or written to the database by the visible code.
        """
        from smadrill.models import Semtype, Word, WordTranslation

        # Words by unique attributes
        word_unique_ids = getUniques(Word, WORD_ATTRS)
        # word_translation_unique_ids = getUniques(WordTranslation, WORD_TRANSLATION_ATTRS)

        # Filter on a copy so that extending word_unique_ids below cannot
        # change which words count as already selected.
        existing_objects = Word.objects.filter(id__in=list(word_unique_ids))

        # 3 words from each semtypes
        for semtype in Semtype.objects.all():
            # Prefer words that were already selected; fall back to any
            # word of this semtype.
            words = existing_objects.filter(semtype=semtype)
            if not words.exists():
                words = semtype.word_set.all()
            # One query, so the printed lemmas belong to the ids collected.
            rows = list(
                words.order_by('?').values_list('pk', 'lemma', 'pos')[:3])
            word_unique_ids.extend(pk for pk, _lemma, _pos in rows)
            print('Selecting for semtype: %s' % semtype)
            print('\n -'.join(
                '%s: %s' % (lemma, pos) for _pk, lemma, pos in rows))

        print('Total uniques:  %s' % len(set(word_unique_ids)))

        # 5 words with forms in all tags