# -*- coding: utf-8 -*-
from local_conf import LLL1
import importlib
settings = importlib.import_module(LLL1+'_oahpa.settings')
sdm = importlib.import_module(LLL1+'_oahpa.drill.models')
from xml.dom import minidom as _dom
from optparse import OptionParser
from django import db
import sys
import re
import string
import codecs
def monitor(function):
    """Debugging decorator: print a trace of every call to ``function``.

    The positional and keyword arguments are printed before the call and
    again after it, followed by the return value.  The wrapped function's
    return value is passed through unchanged.

    The trace headers are labelled with the wrapped function's name; the
    previous version printed the ``%s`` placeholder literally because the
    format string was never filled in.
    """
    from functools import wraps

    # Fall back to repr() for callables without a __name__.
    label = getattr(function, '__name__', repr(function))

    @wraps(function)
    def wrapper(*args, **kwargs):
        # Single-argument print(...) behaves the same as the print statement
        # used elsewhere in this (Python 2) file.
        print('--\n')
        print(' %s args' % label)
        print(' ' + repr(args))
        print(' %s kwargs' % label)
        print(' ' + repr(kwargs))

        result = function(*args, **kwargs)

        # Arguments are dumped a second time after the call
        # (presumably to expose in-place mutation by ``function``).
        print(' %s args' % label)
        print(' ' + repr(args))
        print(' %s kwargs' % label)
        print(' ' + repr(kwargs))
        print(' %s result' % label)
        print(' ' + repr(result))
        print('--\n')
        return result

    return wrapper
class TagError(Exception):
    """Raised when an element defines grammars but no inflections match.

    ``additional_messages`` is an optional mapping of heading -> iterable
    of lines.  Every entry is appended to the rendered message as a heading
    followed by its lines.
    """

    def __init__(self, additional_messages=False):
        self.additional_messages = additional_messages

    def __str__(self):
        # The element names in this text had been lost (the message read
        # "ensure that is a valid tag"); they are spelled out again here.
        msg = ("\n ** Grammars defined in element, but no inflections were found.\n"
               " Check that tags.txt and paradigms.txt include all tags.\n"
               "\n"
               " Alternatively, ensure that <grammar tag=\"...\" /> is a valid tag,\n"
               " or that <grammar pos=\"...\" /> is a valid PoS.\n"
               "\n"
               " If the element specification includes an <id />, ensure that\n"
               " the <id /> refers to a word in the database that has forms \n"
               " with the tags specified.\n")

        if self.additional_messages:
            # .items() rather than .iteritems(): same result here, and it
            # does not tie the class to Python 2.
            for heading, lines in self.additional_messages.items():
                msg += "\n %s:\n" % heading
                msg += "\n".join([" %s" % line for line in lines])

        return msg
class Questions:
def read_element(self,qaelement,el,el_id,qtype):
semclass = False
print
print "\tCreating element %s (%s)" % (el_id, qaelement.qatype)
# Syntactic function of the element
if self.grammar_defaults.has_key(el_id) and self.grammar_defaults[el_id].syntax:
syntax = self.grammar_defaults[el_id].syntax
else:
syntax = el_id
if not el:
print '\t', syntax, "No element given."
# Some of the answer elements share content of question elements.
content_id = ""
if el:
content_id = el.getAttribute("content")
if not content_id: content_id=el_id
# Search for the same element in question side
# If there is no element given in the answer, the element
# is a copy of the question.
question_qelements = None
qelems = sdm.QElement.objects.filter(question__id=qaelement.question_id,
identifier=content_id)
if (not el or el.getAttribute("content")) and \
sdm.QElement.objects.filter(question__id=qaelement.question_id,
identifier=content_id).count() > 0:
question_qelements = sdm.QElement.objects.filter(question__id=qaelement.question_id,
identifier=content_id)
else:
if el and el.getAttribute("content"):
if sdm.QElement.objects.filter(question__id=qaelement.id,
identifier=content_id).count() > 0:
question_qelements = sdm.QElement.objects.filter(question__id=qaelement.id,
identifier=content_id)
# Hmm, maybe not detecting copy correctly
if not el and question_qelements:
for q in question_qelements:
qe = sdm.QElement.objects.create(question=qaelement,
identifier=el_id,
syntax=q.syntax,
gametype=qaelement.gametype) # added by Heli
# copy = sdm.QElement.objects.get(question=qaelement.question,
# identifier=el_id,
# syntax=q.syntax)
# mark as a copy
q.copy_set.add(qe)
qe.save()
q.save()
return
############### AGREEMENT
# Search for elementes that agree
agr_elements=None
if syntax=="MAINV":
agr_id="SUBJ"
print "\tTRYING verb agreement " + agr_id + " " + qaelement.qatype
if sdm.QElement.objects.filter(question=qaelement, syntax=agr_id,
question__qatype=qaelement.qatype).count() > 0:
agr_elements = sdm.QElement.objects.filter(question=qaelement,
syntax=agr_id,
question__qatype=qaelement.qatype)
agreement = ""
if el: agreement = el.getElementsByTagName("agreement")
if agreement: print "\tAgreement:", agreement[0].getAttribute("id")
# Agreement from xml-files
# Try first inside question or answer
# Then in answer-question level
if agreement:
agr_id=agreement[0].getAttribute("id")
if sdm.QElement.objects.filter(question=qaelement, syntax=agr_id,
question__qatype=qaelement.qatype).count() > 0:
agr_elements = sdm.QElement.objects.filter(question=qaelement,
syntax=agr_id,
question__qatype=qaelement.qatype)
else:
if sdm.Question.objects.filter(id=qaelement.question_id).count() > 0:
q = sdm.Question.objects.filter(id=qaelement.question_id)[0]
if sdm.QElement.objects.filter(question__id=qaelement.question_id,
syntax=agr_id).count() > 0:
agr_elements = sdm.QElement.objects.filter(question__id=qaelement.question_id,
syntax=agr_id)
if not agr_elements:
print "* ERROR: no agreement elements found"
############ WORDS
# Search for existing word in the database.
if el:
ids = el.getElementsByTagName("id")
else:
ids = list()
words = {}
word_elements = None
for i in ids:
word_id = i.firstChild.data
word_id_hid = i.getAttribute("hid").strip()
if word_id:
if word_id_hid:
print "\tfound word %s/%s" % (word_id, word_id_hid)
word_elements = sdm.Word.objects.filter(wordid=word_id, hid=int(word_id_hid))
else:
print "\tfound word %s" % word_id
word_elements = sdm.Word.objects.filter(wordid=word_id)
# Add pos information here!
if not word_elements:
print "\tWord not found! " + word_id
# Search for existing semtype
# Semtype overrides the word id selection
if not word_elements:
semclasses = []
if el:
semclasses = el.getElementsByTagName("sem")
if semclasses:
semclass = semclasses[0].getAttribute("class")
word_elements = sdm.Word.objects.filter(semtype__semtype=semclass)
elif qaelement.question:
# check question for copy, grab semclasses
has_copies = sdm.QElement.objects.filter(question=qaelement.question,
identifier=el_id)
if has_copies:
semclasses = has_copies.values_list('semtype__semtype', flat=True)
semclass = semclasses[0]
word_elements = sdm.Word.objects.filter(semtype__semtype=semclass)
if el:
valclasses = el.getElementsByTagName("val")
if valclasses:
valclass = valclasses[0].getAttribute("class")
word_elements = sdm.Word.objects.filter(valency=valclass)
# If still no words, get the default words for this element:
if not word_elements:
grammar_def = self.grammar_defaults.get(el_id, False)
if grammar_def:
if grammar_def.words:
word_elements = self.grammar_defaults[el_id].words
if word_elements:
for w in word_elements:
if not words.has_key(w.pos): words[w.pos] = []
words[w.pos].append(w)
############# GRAMMAR
tagelements = None
grammars = list()
not_found = []
if el:
grammars = el.getElementsByTagName("grammar")
if not el or not grammars:
# If there is no grammatical specification, the element is created
# solely on the basis of grammar.
# However, if the element is already defined previously in the
# sentence, there is no need to create another element. In fact,
# this could result in weirdness if the element is also defined in
# the grammar, because otherwise the install process would recreate
# it with the wrong default tags.
# If the element is declared in the question, and we are now
# processing the answer, tags need to be grabbed from the question
# elements so that the normal copy process can procede, otherwise
# they are copied from the grammar, which is not what should
# happen.
preceding = sdm.QElement.objects.filter(question=qaelement,
identifier=el_id,)
if qaelement.question:
has_copies = sdm.QElement.objects.filter(question=qaelement.question,
identifier=el_id,)
else:
has_copies = False
if preceding:
print " * Element already declared in the question"
return
if has_copies:
tagelements = sum([list(p.tags.all()) for p in has_copies], [])
elif self.grammar_defaults.has_key(el_id):
if self.grammar_defaults[el_id].tags:
tagelements = self.grammar_defaults[el_id].tags
if tagelements:
tagelements = list(set(tagelements))
# An element for each different grammatical specification.
else:
poses = []
tags = []
for gr in grammars:
tags.append(gr.getAttribute("tag"))
poses.append(gr.getAttribute("pos"))
tagstrings = []
if poses:
if self.grammar_defaults.has_key(el_id):
if self.grammar_defaults[el_id].tags:
tagelements = self.grammar_defaults[el_id].tags.filter(pos__in=poses)
if tags:
tagstrings = self.get_tagvalues(tags)
if tagelements:
tagelements = tagelements or sdm.Tag.objects.filter(string__in=tagstrings)
else:
tagelements = sdm.Tag.objects.filter(string__in=tagstrings)
# print tagelements
# raw_input()
# Extra check for pronouns
# If pronoun id is given, only the tags related to that pronoun are preserved.
for t in tagelements:
if t.pos == 'Pron':
if not words.has_key('Pron'): break
found = False
for w in words['Pron'][:]:
corresponding_forms = sdm.Form.objects.filter(tag__in=tagelements,
word=w)
if corresponding_forms.count() > 0:
found = True
else:
# Should pop those that don't match, or else
# problems may arise
# TODO: this for other POS
not_found.append(
(list(set([w.lemma + '+' + form.tag.string
for form in w.form_set.all()])),
t.string)
)
words['Pron'].pop(words['Pron'].index(w))
if not found:
tagelements = tagelements.exclude(id=t.id)
# Remove those words which do not have any forms with the tags.
if words.has_key('N'):
for w in words['N']:
found = False
for t in tagelements:
if t.pos == 'N':
if sdm.Form.objects.filter(tag=t, word=w).count()>0:
found = True
if not found:
words['N'].remove(w)
# Find different pos-values in tagelements
posvalues = {}
task = ""
# Elements that do not inflection information are not created.
if not tagelements and not agr_elements:
print "\tno inflection for", el_id
if len(grammars) > 0:
additional_messages = {
'Grammar tags available for word id':
sum([a[0] for a in not_found], []),
' specified':
[a[1] for a in not_found],
}
raise TagError(additional_messages)
return
if not tagelements:
posvalues[""] = 1
else:
for t in tagelements:
posvalues[t.pos] = 1
attempt = False
if el:
task = el.getAttribute("task")
if task:
print "\tsetting", el_id, "as task"
qaelement.task = syntax
qaelement.save()
else:
if el_id == qtype:
qaelement.task = syntax
qaelement.save()
# if el:
# task = el.getAttribute("task")
# if task:
# # print task
# # print syntax
# # print 'TEST'
# # raw_input()
# print "setting", el_id, "as task"
# qaelement.task = syntax
# qaelement.save()
# attempt = True
# if qaelement.task != syntax:
# print 'Task not saved!'
# sys.exit(2)
# # print qaelement.task
# # raw_input()
# else:
# if el_id == qtype:
# qaelement.task = syntax
# qaelement.save()
# attempt = True
# if task:
# if qaelement.task != syntax:
# print 'TASK NOT SAVED'
# print qaelement.task
# print syntax
# print 'attempt: '
# print attempt
# sys.exit(2)
############# CREATE ELEMENTS
print '\tCREATING ELEMENTS'
print '\tElements for the following keys...'
print '\t' + repr(posvalues.keys())
# Add an element for each pos:
for p in posvalues.keys():
qe = sdm.QElement.objects.create(question=qaelement,\
identifier=el_id,\
syntax=syntax)
if semclass:
semty, _ = sdm.Semtype.objects.get_or_create(semtype=semclass)
qe.semtype = semty
qe.save()
if task:
qe.task=task
qe.save()
print '\t\tsemtype: ', semclass
# Add links to corresponding question elements.
if question_qelements:
for q in question_qelements:
q.copy_set.add(qe)
qe.save()
q.save()
if tagelements:
for t in tagelements:
print '\t\ttag: ', t.string
if t.pos == p:
qe.tags.add(t)
# Create links to words.
if not words.has_key(p):
word_pks = None
print "\tlooking for words..", el_id, p
# word_elements = sdm.Word.objects.filter(form__tag__in=qe.tags.all()) # pos=p)
# Just filtering isn't enough; .filter() doesn't return a list of unique items with this kind of query.
if semclass:
word_pks = sdm.Word.objects.filter(form__tag__in=qe.tags.all()).filter(semtype=qe.semtype).values_list('pk', flat=True)
else:
word_pks = sdm.Word.objects.filter(form__tag__in=qe.tags.all()).values_list('pk', flat=True)
word_pks = list(set(word_pks))
if len(word_pks) == 0:
print 'Error: Elements with zero possibilities not permitted.'
print ' > ', qe.question
print ' > Word tags: %s' % repr(qe.tags.all())
print ' > semtypes: %s' % repr(qe.semtype)
sys.exit(2)
print '\t%d elements available. ' % len(word_pks)
word_elements_gen = (sdm.Word.objects.get(pk=int(b)) for b in word_pks)
if not word_elements:
word_elements = []
else:
word_elements = list(word_elements)
if word_elements_gen:
for w in word_elements_gen:
if not words.has_key(p):
words[w.pos] = []
if not words.has_key(w.pos):
words[w.pos] = []
words[w.pos].append(w)
word_elements.append(w)
# print 'Creating elements for %d words' % word_elements.count()
for w in word_elements:
qe.wordqelement_set.create(word=w)
# we = sdm.WordQElement.objects.create(qelement=qe,\
# word=w)
# add agreement info.
if agr_elements:
for a in agr_elements:
a.agreement_set.add(qe)
a.save()
qe.save()
def read_elements(self, head, qaelement, qtype):
    """Read the elements attached to a particular question or answer.

    head      -- XML node whose "element" descendants describe the elements.
    qaelement -- object whose ``string`` attribute is the whitespace
                 separated list of element identifiers.
    qtype     -- passed through unchanged to read_element.

    The first "SUBJ" token is handled before everything else so that it is
    available for agreement; the remaining tokens are processed in order.
    """
    xml_elements = head.getElementsByTagName("element")
    tokens = qaelement.string.split()

    # Subject first, for agreement.
    if "SUBJ" in tokens:
        subject_node = None
        for node in xml_elements:
            if node.getAttribute("id") == "SUBJ":
                subject_node = node
                break
        self.read_element(qaelement, subject_node, "SUBJ", qtype)

    # Everything else, skipping the first SUBJ handled above.
    first_subject_skipped = False
    for token in tokens:
        if token == "SUBJ" and not first_subject_skipped:
            first_subject_skipped = True
            continue

        identifier = token.lstrip("(").rstrip(")")

        matched = False
        if token != "SUBJ":
            # XML ids are compared with the raw token (parentheses included).
            for node in xml_elements:
                if node.getAttribute("id") == token:
                    self.read_element(qaelement, node, identifier, qtype)
                    matched = True

        if not matched:
            self.read_element(qaelement, None, identifier, qtype)
def read_questions(self, infile, grammarfile):
xmlfile=file(infile)
tree = _dom.parse(infile)
self.read_grammar_defaults(grammarfile)
qs = tree.getElementsByTagName("questions")[0]
gametype = qs.getAttribute("game")
if not gametype: gametype="morfa"
print "Created questions:"
for q in tree.getElementsByTagName("q"):
qid = q.getAttribute('id')
if not qid:
print "ERROR Missing question id, stopping."
exit()
print "\n##"
print "### INSTALLING QUESTION: %s" % qid.encode('utf-8')
print "##\n"
level = q.getAttribute('level')
if not level: level="1"
lemmacount = q.getAttribute('lemmacount') # added by Heli
if not lemmacount: lemmacount="0"
# Store question
qtype=""
qtype_els = q.getElementsByTagName("qtype")
# MIX
if qtype_els:
qtype = ','.join([qtype.firstChild.data for qtype in qtype_els])
# qtype = q.getElementsByTagName("qtype")[0].firstChild.data
question=q.getElementsByTagName("question")[0]
text=question.getElementsByTagName("text")[0].firstChild.data
#If there exists already a question with that name, delete all the references to it.
if qid:
questions = sdm.Question.objects.filter(qid=qid)
if questions:
questions[0].delete()
question_element,created = sdm.Question.objects.get_or_create(qid=qid, \
level=int(level),lemmacount=int(lemmacount), \
string=text, \
qtype=qtype, \
gametype=gametype,\
qatype="question")
# Add source information if present
if q.getElementsByTagName("sources"):
sources = q.getElementsByTagName("sources")[0]
elements=sources.getElementsByTagName("book")
for el in elements:
book=el.getAttribute("name")
if book:
# Add book to the database
# Leave this if DTD is used
book_entry, created = sdm.Source.objects.get_or_create(name=book)
if created:
print "\tCreated book entry with name ", book
question_element.source.add(book_entry)
question_element.save()
else:
book = "all"
# Add book to the database
book_entry, created = sdm.Source.objects.get_or_create(name=book)
if created:
print "\tCreated book entry with name ", book
question_element.source.add(book_entry)
question_element.save()
# Read the elements
self.read_elements(question, question_element,qtype)
# There can be more than one answer for each question,
# Store them separately.
answers=q.getElementsByTagName("answer")
for ans in answers:
text=ans.getElementsByTagName("text")[0].firstChild.data
answer_element = sdm.Question.objects.create(string=text,qatype="answer",question=question_element,level=1,lemmacount=0)
answer_element.save()
self.read_elements(ans, answer_element, qtype)
db.reset_queries()
def read_grammar_defaults(self, infile):
    """Read a grammar file and store the result in self.grammar_defaults.

    ``infile`` is the path of an XML file whose first "tags" node contains
    "element" nodes.  self.grammar_defaults maps each element's "id"
    attribute to a GrammarDefault object with these attributes:

        poses  -- list of "pos" attribute values of the element's grammars
        tags   -- sdm.Tag query matching the expanded tag strings
                  (empty list when no tag strings were produced)
        words  -- sdm.Word query for the element's <id>, filtered by its
                  "hid" attribute when present (empty list otherwise)
        syntax -- text of the element's first <syntax> node
                  (empty list otherwise)

    For example:

        {
            'SUBJ': <GrammarDefault: N|Pron - ...>,
            'N-LOC': <GrammarDefault: N - ...>,
        }

    Raises GrammarDefault.Error when tag strings were produced for an
    element but none of them exists in the database.
    """

    class GrammarDefaultError(Exception):

        def __init__(self, element=False, tagstrings=False):
            self.element = element
            self.tagstrings = tagstrings

        def __str__(self):
            msg = (
                "\n ** No tags were present in the database matching\n"
            )
            if self.element:
                msg += " grammar element: %s\n" % self.element
            else:
                msg += " an unknown grammar element\n"

            if self.tagstrings:
                msg += " with the following expanded tag strings:\n"
                msg += " " + " ".join(self.tagstrings)
                msg += "\n Check that these words/forms are installed"
            return msg

    class GrammarDefault(object):
        Error = GrammarDefaultError

        def __init__(self,
                     poses=False,
                     tags=False,
                     words=False,
                     syntax=False):
            # "x or list()" gives every instance its own fresh list.
            self.tags = tags or list()
            self.poses = poses or list()
            self.words = words or list()
            self.syntax = syntax or list()

        def __str__(self):
            returns = []
            if self.poses:
                returns.append('|'.join(self.poses) + ' - ')
            if self.tags:
                returns.append(', '.join([t.string for t in self.tags]))
            elif self.poses:
                returns.append('None')
            if self.words:
                returns.append(', '.join([w.lemma for w in self.words]))
            if self.syntax:
                # syntax is normally a single string; joining it directly
                # would put ", " between its characters.
                if isinstance(self.syntax, (list, tuple)):
                    returns.append(', '.join(self.syntax))
                else:
                    returns.append(self.syntax)
            return ' '.join(returns)

        def __repr__(self):
            # The format string used to be empty, so repr() raised
            # "not all arguments converted during string formatting".
            return '<GrammarDefault: %s>' % str(self)

    # The file used to be opened with file(infile) as well; that handle
    # was never read or closed, minidom opens the path itself.
    tree = _dom.parse(infile)

    self.grammar_defaults = {}

    tags_node = tree.getElementsByTagName("tags")[0]
    for el in tags_node.getElementsByTagName("element"):
        identifier = el.getAttribute("id")
        grammar_default = GrammarDefault()

        syntaxes = el.getElementsByTagName("syntax")
        if syntaxes:
            grammar_default.syntax = syntaxes[0].firstChild.data

        word_ids = el.getElementsByTagName("id")
        if word_ids:
            word_id = word_ids[0].firstChild.data
            word_id_hid = word_ids[0].getAttribute("hid").strip()
            if word_id:
                words = sdm.Word.objects.filter(wordid=word_id)
                if word_id_hid:
                    words = words.filter(hid=int(word_id_hid))
                grammar_default.words = words

        tagstrings = []
        grammars = el.getElementsByTagName("grammar")
        for gr in grammars:
            pos = gr.getAttribute("pos")
            if pos:
                grammar_default.poses.append(pos)
            tag = gr.getAttribute("tag")
            tagstrings.extend(self.get_tagvalues([tag]))

        if len(tagstrings) > 0:
            matching_tags = sdm.Tag.objects.filter(string__in=tagstrings)
            if matching_tags.count() == 0:
                tag_elements = ', '.join([e.toprettyxml() for e in grammars])
                raise GrammarDefault.Error(element=tag_elements,
                                           tagstrings=tagstrings)
            grammar_default.tags = matching_tags

        self.grammar_defaults[identifier] = grammar_default
def get_tagvalues(self, tags):
    """Expand tag specifications into concrete '+'-joined tag strings.

    ``tags`` is a list of '+'-separated strings; empty entries are ignored.
    Each piece of a string is looked up first as a tag name (sdm.Tagname)
    and then as a tag set (sdm.Tagset).  A tag set is replaced by all tag
    names belonging to it, and every combination of the pieces is
    generated.  Pieces that are neither a tag name nor a tag set are
    dropped.

    Returns the list of expanded tag strings, or False when ``tags`` is
    not a list.
    """
    from itertools import product

    def parse_tag(tag):
        """ Iterate through a tag string by chunks, and check for tag sets
        and tag names. Return the list of reassembled tags. """
        alternatives = []
        for item in tag.split('+'):
            if sdm.Tagname.objects.filter(tagname=item).count() > 0:
                alternatives.append([item])
            elif sdm.Tagset.objects.filter(tagset=item).count() > 0:
                tagnames = sdm.Tagname.objects.filter(tagset__tagset=item)
                alternatives.append([t.tagname for t in tagnames])

        # An empty list (not False) when nothing was recognised: the results
        # are concatenated by the caller and list + False raises TypeError.
        if not alternatives:
            return []
        return ['+'.join(combo) for combo in product(*alternatives)]

    if not isinstance(tags, list):
        return False

    parsed = []
    for tag in tags:
        if tag:
            parsed.extend(parse_tag(tag))
    return parsed
def delete_question(self, qid=None):
    """Delete every stored question whose qid, or whose string, is ``qid``.

    Does nothing when ``qid`` is empty or None.
    """
    if not qid:
        return

    # First the rows matched by question id, then those matched by text.
    for lookup in ({'qid': qid}, {'string': qid}):
        for stale in sdm.Question.objects.filter(**lookup):
            stale.delete()