# -*- coding: utf-8 -*-
from settings import *
from est_drill.models import *
from xml.dom import minidom as _dom
from optparse import OptionParser
from django import db
import sys
import re
import string
import codecs
def monitor(function):
from functools import wraps
@wraps(function)
def wrapper(*args, **kwargs):
print '--\n'
print ' %s args'
print ' ' + repr(args)
print ' %s kwargs'
print ' ' + repr(kwargs)
result = function(*args, **kwargs)
print ' %s args'
print ' ' + repr(args)
print ' %s kwargs'
print ' ' + repr(kwargs)
print ' %s result'
print ' ' + repr(result)
print '--\n'
return result
return wrapper
class TagError(Exception):
    """Raised when an element declares grammars but no inflections match.

    additional_messages -- optional dict mapping a heading string to a
    list of detail strings; each pair is appended to the error text.
    Defaults to False (no extra details).
    """

    def __init__(self, additional_messages=False):
        self.additional_messages = additional_messages

    def __str__(self):
        # The base message text is kept exactly as in the original.
        msg = ("\n ** Grammars defined in element, but no inflections were found.\n"
               " Check that tags.txt and paradigms.txt include all tags.\n"
               "\n"
               " Alternatively, ensure that is a valid tag,\n"
               " or that is a valid PoS.\n"
               "\n"
               " If the element specification includes an , ensure that\n"
               " the refers to a word in the database that has forms \n"
               " with the tags specified.\n")
        if self.additional_messages:
            # .items() instead of the Python-2-only .iteritems(); it
            # behaves identically here on both Python 2 and 3.
            for k, v in self.additional_messages.items():
                values = "\n".join([" %s" % i for i in v])
                append = ("\n"
                          " %s:\n" % k)
                append += values
                msg += append
        return msg
class Questions:
def read_element(self,qaelement,el,el_id,qtype):
    """Create the QElement rows for one element slot of a question/answer/facit.

    Parameters:
        qaelement -- the Question row (qatype "question", "answer" or
                     "facit") this element belongs to
        el        -- the <element> DOM node from the XML file, or None when
                     the slot has no explicit specification (the element is
                     then a copy of the corresponding question element)
        el_id     -- identifier of the element in the sentence string
                     (e.g. "SUBJ", "MAINV")
        qtype     -- question-type string of the enclosing question

    Raises:
        TagError -- when grammar specifications are present but no
                    matching inflection tags were found in the database.

    NOTE(review): the indentation of this file was reconstructed; the
    nesting of a few branches (marked below) is inferred from context and
    should be confirmed against the original revision.
    """
    semclass = False
    print
    print "\tCreating element %s (%s)" % (el_id, qaelement.qatype)
    # Syntactic function of the element; grammar defaults may override
    # the bare identifier with an explicit syntax value.
    if self.grammar_defaults.has_key(el_id) and self.grammar_defaults[el_id].syntax:
        syntax = self.grammar_defaults[el_id].syntax
    else:
        syntax = el_id
    if not el:
        print '\t', syntax, "No element given."
    # Some of the answer elements share content of question elements.
    content_id = ""
    if el:
        content_id = el.getAttribute("content")
    if not content_id: content_id=el_id
    # Search for the same element in question side
    # If there is no element given in the answer, the element
    # is a copy of the question.
    question_qelements = None
    qelems = QElement.objects.filter(question__id=qaelement.question_id,
                                     identifier=content_id)
    print "qelems: ", qelems
    if (not el or el.getAttribute("content")) and \
       QElement.objects.filter(question__id=qaelement.question_id,
                               identifier=content_id).count() > 0:
        question_qelements = QElement.objects.filter(question__id=qaelement.question_id,
                                                     identifier=content_id)
    else:
        if el and el.getAttribute("content"):
            if QElement.objects.filter(question__id=qaelement.id,
                                       identifier=content_id).count() > 0:
                question_qelements = QElement.objects.filter(question__id=qaelement.id,
                                                             identifier=content_id)
    # Some of the facit elements have to copy their lexical content from the
    # corresponding answer elements. It is indicated by the attribute word,
    # e.g. word="VERB" in the XML-file.
    word_id = ""
    answer_qelements = None
    tagelements = None
    grammars = list()
    if el:
        word_id = el.getAttribute("word")
        grammars = el.getElementsByTagName("grammar")
        print word_id
    #if not word_id: word_id=el_id
    if word_id:
        # Search for the same element in the answer.
        # NOTE(review): qaelement.id-1 assumes the answer row was created
        # immediately before this row (sequential primary keys) -- fragile;
        # verify against how read_questions() creates answers/facits.
        print qaelement.id
        aelems = QElement.objects.filter(question__id=qaelement.id-1,identifier=word_id)
        print aelems
        qe = QElement.objects.create(question=qaelement, syntax=word_id, identifier=word_id, gametype=qaelement.gametype)
        # The tags will also be added to the facit element. For other
        # question / answer elements it is done below but because there is
        # "return" in the end of this section the tags to the copy-words
        # must be added here.
        print "facit element before adding tags: ", qe.tags.all()
        tags = []
        for gr in grammars:
            tags.append(gr.getAttribute("tag"))
        tagstrings = []
        if tags:
            tagstrings = self.get_tagvalues(tags)
            tagelements = Tag.objects.filter(string__in=tagstrings)
        if tagelements:
            for t in tagelements:
                print '\t\ttag: ', t.string
                qe.tags.add(t) # was: aelems[0].tags.add(t)
        print "tags added to the facit element: ", qe.tags.all()
        aelems[0].word_set.add(qe)
        aelems[0].save()
        return
    if (not el or el.getAttribute("word")) and \
       QElement.objects.filter(question__id=qaelement.question_id,
                               identifier=word_id).count() > 0:
        answer_qelements = QElement.objects.filter(question__id=qaelement.question_id, identifier=word_id)
    else:
        if el and el.getAttribute("word"):
            if QElement.objects.filter(question__id=qaelement.id, identifier=word_id).count() > 0:
                answer_qelements = QElement.objects.filter(question__id=qaelement.id, identifier=word_id)
    # Here we are trying to create the copies of the answer elements in the
    # facit that have the attribute word=""
    if not el and answer_qelements:
        for q in answer_qelements:
            qe = QElement.objects.create(question=qaelement,
                                         identifier=word_id,
                                         gametype=qaelement.gametype) # added by Heli
            q.word_set.add(qe)
            qe.save()
            q.save()
        return
    # Hmm, maybe not detecting copy correctly
    if not el and question_qelements:
        for q in question_qelements:
            qe = QElement.objects.create(question=qaelement,
                                         identifier=el_id,
                                         syntax=q.syntax,
                                         gametype=qaelement.gametype) # added by Heli
            # copy = QElement.objects.get(question=qaelement.question,
            #                             identifier=el_id,
            #                             syntax=q.syntax)
            # mark as a copy
            q.copy_set.add(qe)
            qe.save()
            q.save()
        return
    ############### AGREEMENT
    # Search for elementes that agree
    agr_elements=None
    if syntax=="MAINV":
        # Main verbs agree with the subject by default.
        agr_id="SUBJ"
        print "\tTRYING verb agreement " + agr_id + " " + qaelement.qatype
        if QElement.objects.filter(question=qaelement, syntax=agr_id,
                                   question__qatype=qaelement.qatype).count() > 0:
            agr_elements = QElement.objects.filter(question=qaelement,
                                                   syntax=agr_id,
                                                   question__qatype=qaelement.qatype)
    agreement = ""
    if el: agreement = el.getElementsByTagName("agreement")
    if agreement: print "\tAgreement:", agreement[0].getAttribute("id")
    # Agreement from xml-files
    # Try first inside question or answer
    # Then in answer-question level
    if agreement:
        agr_id=agreement[0].getAttribute("id")
        if QElement.objects.filter(question=qaelement, syntax=agr_id,
                                   question__qatype=qaelement.qatype).count() > 0:
            agr_elements = QElement.objects.filter(question=qaelement,
                                                   syntax=agr_id,
                                                   question__qatype=qaelement.qatype)
        else:
            if Question.objects.filter(id=qaelement.question_id).count() > 0:
                q=Question.objects.filter(id=qaelement.question_id)[0]
                if QElement.objects.filter(question__id=qaelement.question_id,
                                           syntax=agr_id).count() > 0:
                    agr_elements = QElement.objects.filter(question__id=qaelement.question_id,
                                                           syntax=agr_id)
        # NOTE(review): this warning is assumed to apply only when an
        # explicit <agreement> was requested -- confirm nesting.
        if not agr_elements:
            print "* ERROR: no agreement elements found"
    ############ WORDS
    # Search for existing word in the database.
    if el:
        ids = el.getElementsByTagName("id")
    else:
        ids = list()
    words = {}
    word_elements = None
    for i in ids:
        word_id = i.firstChild.data
        word_id_hid = i.getAttribute("hid").strip()
        if word_id:
            if word_id_hid:
                # A homonym id (hid) narrows the lookup further.
                print "\tfound word %s/%s" % (word_id, word_id_hid)
                word_elements = Word.objects.filter(wordid=word_id, hid=int(word_id_hid))
            else:
                print "\tfound word %s" % word_id
                word_elements = Word.objects.filter(wordid=word_id)
            # Add pos information here!
            if not word_elements:
                print "\tWord not found! " + word_id
    # Search for existing semtype
    # Semtype overrides the word id selection
    if not word_elements:
        semclasses = []
        if el:
            semclasses = el.getElementsByTagName("sem")
        if semclasses:
            semclass = semclasses[0].getAttribute("class")
            word_elements = Word.objects.filter(semtype__semtype=semclass)
        elif qaelement.question:
            # check question for copy, grab semclasses
            has_copies = QElement.objects.filter(question=qaelement.question,
                                                 identifier=el_id)
            if has_copies:
                semclasses = has_copies.values_list('semtype__semtype', flat=True)
                semclass = semclasses[0]
                word_elements = Word.objects.filter(semtype__semtype=semclass)
        # A valency class, when present, overrides the candidates again.
        if el:
            valclasses = el.getElementsByTagName("val")
            if valclasses:
                valclass = valclasses[0].getAttribute("class")
                word_elements = Word.objects.filter(valency=valclass)
    # If still no words, get the default words for this element:
    if not word_elements:
        grammar_def = self.grammar_defaults.get(el_id, False)
        if grammar_def:
            if grammar_def.words:
                word_elements = self.grammar_defaults[el_id].words
    # Bucket the candidate words by part of speech for the checks below.
    if word_elements:
        for w in word_elements:
            if not words.has_key(w.pos): words[w.pos] = []
            words[w.pos].append(w)
    ############# GRAMMAR
    tagelements = None
    grammars = list()
    not_found = []
    if el:
        grammars = el.getElementsByTagName("grammar")
    if not el or not grammars:
        # If there is no grammatical specification, the element is created
        # solely on the basis of grammar.
        # However, if the element is already defined previously in the
        # sentence, there is no need to create another element. In fact,
        # this could result in weirdness if the element is also defined in
        # the grammar, because otherwise the install process would recreate
        # it with the wrong default tags.
        # If the element is declared in the question, and we are now
        # processing the answer, tags need to be grabbed from the question
        # elements so that the normal copy process can procede, otherwise
        # they are copied from the grammar, which is not what should
        # happen.
        preceding = QElement.objects.filter(question=qaelement,
                                            identifier=el_id,)
        if qaelement.question:
            has_copies = QElement.objects.filter(question=qaelement.question,
                                                 identifier=el_id,)
        else:
            has_copies = False
        if preceding:
            print " * Element already declared in the question"
            return
        if has_copies:
            tagelements = sum([list(p.tags.all()) for p in has_copies], [])
        elif self.grammar_defaults.has_key(el_id):
            if self.grammar_defaults[el_id].tags:
                tagelements = self.grammar_defaults[el_id].tags
        if tagelements:
            # De-duplicate tags gathered from several copies.
            tagelements = list(set(tagelements))
    # An element for each different grammatical specification.
    else:
        poses = []
        tags = []
        for gr in grammars:
            tags.append(gr.getAttribute("tag"))
            poses.append(gr.getAttribute("pos"))
        tagstrings = []
        if poses:
            if self.grammar_defaults.has_key(el_id):
                if self.grammar_defaults[el_id].tags:
                    tagelements = self.grammar_defaults[el_id].tags.filter(pos__in=poses)
        if tags:
            tagstrings = self.get_tagvalues(tags)
            if tagelements:
                tagelements = tagelements or Tag.objects.filter(string__in=tagstrings)
            else:
                tagelements = Tag.objects.filter(string__in=tagstrings)
        # print tagelements
        # raw_input()
        # Extra check for pronouns
        # If pronoun id is given, only the tags related to that pronoun are preserved.
        for t in tagelements:
            if t.pos == 'Pron':
                if not words.has_key('Pron'): break
                found = False
                # Iterate over a copy since entries may be popped below.
                for w in words['Pron'][:]:
                    corresponding_forms = Form.objects.filter(tag__in=tagelements,
                                                              word=w)
                    if corresponding_forms.count() > 0:
                        found = True
                    else:
                        # Should pop those that don't match, or else
                        # problems may arise
                        # TODO: this for other POS
                        not_found.append(
                            (list(set([w.lemma + '+' + form.tag.string
                                       for form in w.form_set.all()])),
                             t.string)
                        )
                        words['Pron'].pop(words['Pron'].index(w))
                if not found:
                    tagelements = tagelements.exclude(id=t.id)
        # Remove those words which do not have any forms with the tags.
        if words.has_key('N'):
            for w in words['N']:
                found = False
                for t in tagelements:
                    if t.pos == 'N':
                        if Form.objects.filter(tag=t, word=w).count()>0:
                            found = True
                if not found:
                    words['N'].remove(w)
    # Find different pos-values in tagelements
    posvalues = {}
    task = ""
    # Elements that do not have inflection information are not created.
    if not tagelements and not agr_elements:
        print "\tno inflection for", el_id
        if len(grammars) > 0:
            additional_messages = {
                'Grammar tags available for word id':
                    sum([a[0] for a in not_found], []),
                ' specified':
                    [a[1] for a in not_found],
            }
            raise TagError(additional_messages)
        return
    if not tagelements:
        posvalues[""] = 1
    else:
        for t in tagelements:
            posvalues[t.pos] = 1
    attempt = False
    # Mark the element the user is asked to supply (the "task").
    if el:
        task = el.getAttribute("task")
        if task:
            print "\tsetting", el_id, "as task"
            qaelement.task = syntax
            qaelement.save()
    else:
        if el_id == qtype:
            qaelement.task = syntax
            qaelement.save()
    # if el:
    #     task = el.getAttribute("task")
    #     if task:
    #         # print task
    #         # print syntax
    #         # print 'TEST'
    #         # raw_input()
    #         print "setting", el_id, "as task"
    #         qaelement.task = syntax
    #         qaelement.save()
    #         attempt = True
    #         if qaelement.task != syntax:
    #             print 'Task not saved!'
    #             sys.exit(2)
    #         # print qaelement.task
    #         # raw_input()
    #     else:
    #         if el_id == qtype:
    #             qaelement.task = syntax
    #             qaelement.save()
    #             attempt = True
    # if task:
    #     if qaelement.task != syntax:
    #         print 'TASK NOT SAVED'
    #         print qaelement.task
    #         print syntax
    #         print 'attempt: '
    #         print attempt
    #         sys.exit(2)
    ############# CREATE ELEMENTS
    print '\tCREATING ELEMENTS'
    print '\tElements for the following keys...'
    print '\t' + repr(posvalues.keys())
    # Add an element for each pos:
    for p in posvalues.keys():
        qe = QElement.objects.create(question=qaelement,\
                                     identifier=el_id,\
                                     syntax=syntax)
        if semclass:
            semty, _ = Semtype.objects.get_or_create(semtype=semclass)
            qe.semtype = semty
            qe.save()
        if task:
            qe.task=task
            qe.save()
        print '\t\tsemtype: ', semclass
        # Add links to corresponding question elements.
        if question_qelements:
            for q in question_qelements:
                q.copy_set.add(qe)
                qe.save()
                q.save()
        if tagelements:
            for t in tagelements:
                print '\t\ttag: ', t.string
                if t.pos == p:
                    qe.tags.add(t)
        # Create links to words.
        if not words.has_key(p):
            word_pks = None
            print "\tlooking for words..", el_id, p
            # word_elements = Word.objects.filter(form__tag__in=qe.tags.all()) # pos=p)
            # Just filtering isn't enough; .filter() doesn't return a list of
            # unique items with this kind of query, so de-duplicate the pks.
            if semclass:
                word_pks = Word.objects.filter(form__tag__in=qe.tags.all()).filter(semtype=qe.semtype).values_list('pk', flat=True)
            else:
                word_pks = Word.objects.filter(form__tag__in=qe.tags.all()).values_list('pk', flat=True)
            word_pks = list(set(word_pks))
            if len(word_pks) == 0:
                print 'Error: Elements with zero possibilities not permitted.'
                print ' > ', qe.question
                print ' > Word tags: %s' % repr(qe.tags.all())
                print ' > semtypes: %s' % repr(qe.semtype)
                sys.exit(2)
            print '\t%d elements available. ' % len(word_pks)
            # Generator keeps memory usage down for large word sets.
            word_elements_gen = (Word.objects.get(pk=int(b)) for b in word_pks)
            if not word_elements:
                word_elements = []
            else:
                word_elements = list(word_elements)
            if word_elements_gen:
                for w in word_elements_gen:
                    if not words.has_key(p):
                        words[w.pos] = []
                    if not words.has_key(w.pos):
                        words[w.pos] = []
                    words[w.pos].append(w)
                    word_elements.append(w)
        # print 'Creating elements for %d words' % word_elements.count()
        for w in word_elements:
            qe.wordqelement_set.create(word=w)
            # we = WordQElement.objects.create(qelement=qe,\
            #                                  word=w)
        # add agreement info.
        if agr_elements:
            for a in agr_elements:
                a.agreement_set.add(qe)
                a.save()
                qe.save()
# Read elements attached to particular question or answer.
def read_elements(self, head, qaelement, qtype):
    """Create the elements named in *qaelement*'s sentence string.

    The subject (SUBJ) is installed first so that later elements can
    attach agreement to it; afterwards every remaining token of the
    sentence string is processed in order.  Tokens without a matching
    <element> node are installed with no XML specification (None).
    """
    dom_elements = head.getElementsByTagName("element")
    tokens = qaelement.string.split()

    # Read first subject for agreement.
    if "SUBJ" in set(tokens):
        subj_node = None
        for node in dom_elements:
            if node.getAttribute("id") == "SUBJ":
                subj_node = node
                break
        self.read_element(qaelement, subj_node, "SUBJ", qtype)

    # Process the rest of the tokens; the first SUBJ occurrence was
    # already handled above and is skipped here.
    seen_subj = False
    for token in tokens:
        if token == "SUBJ" and not seen_subj:
            seen_subj = True
            continue
        # Strip optionality parentheses, e.g. "(OBJ)" -> "OBJ".
        syntax = token.lstrip("(").rstrip(")")
        matched = False
        for node in dom_elements:
            if node.getAttribute("id") == token and token != "SUBJ":
                self.read_element(qaelement, node, syntax, qtype)
                matched = True
        if not matched:
            self.read_element(qaelement, None, syntax, qtype)
def read_questions(self, infile, grammarfile):
xmlfile=file(infile)
tree = _dom.parse(infile)
self.read_grammar_defaults(grammarfile)
qs = tree.getElementsByTagName("questions")[0]
gametype = qs.getAttribute("game")
if not gametype: gametype="morfa"
print "Created questions:"
for q in tree.getElementsByTagName("q"):
qid = q.getAttribute('id')
if not qid:
print "ERROR Missing question id, stopping."
exit()
print "\n##"
print "### INSTALLING QUESTION: %s" % qid.encode('utf-8')
print "##\n"
level = q.getAttribute('level')
if not level: level="1"
lemmacount = q.getAttribute('lemmacount') # added by Heli
if not lemmacount: lemmacount="0"
# Store question
qtype=""
qtype_els = q.getElementsByTagName("qtype")
# MIX
if qtype_els:
qtype = ','.join([qtype.firstChild.data for qtype in qtype_els])
# qtype = q.getElementsByTagName("qtype")[0].firstChild.data
question=q.getElementsByTagName("question")[0]
text=question.getElementsByTagName("text")[0].firstChild.data
#If there exists already a question with that name, delete all the references to it.
if qid:
questions = Question.objects.filter(qid=qid)
if questions:
questions[0].delete()
question_element,created = Question.objects.get_or_create(qid=qid, \
level=int(level),lemmacount=int(lemmacount), \
string=text, \
qtype=qtype, \
gametype=gametype,\
qatype="question")
# Add source information if present
if q.getElementsByTagName("sources"):
sources = q.getElementsByTagName("sources")[0]
elements=sources.getElementsByTagName("book")
for el in elements:
book=el.getAttribute("name")
if book:
# Add book to the database
# Leave this if DTD is used
book_entry, created = Source.objects.get_or_create(name=book)
if created:
print "\tCreated book entry with name ", book
question_element.source.add(book_entry)
question_element.save()
else:
book = "all"
# Add book to the database
book_entry, created = Source.objects.get_or_create(name=book)
if created:
print "\tCreated book entry with name ", book
question_element.source.add(book_entry)
question_element.save()
# Read the elements
self.read_elements(question, question_element,qtype)
# There can be more than one answer for each question,
# Store them separately.
answers=q.getElementsByTagName("answer")
for ans in answers:
text=ans.getElementsByTagName("text")[0].firstChild.data
answer_element = Question.objects.create(string=text,qatype="answer",question=question_element,level=1,lemmacount=0)
answer_element.save()
self.read_elements(ans, answer_element, qtype)
# Facits are the possible correct answers. This is new for Estonian Oahpa that facit is also saved in the database and sent to the linguistic analysis together with the question and the user's answer.
facits=q.getElementsByTagName("facit")
for fac in facits:
text=fac.getElementsByTagName("text")[0].firstChild.data
facit_element = Question.objects.create(string=text,qatype="facit",question=question_element,level=1,lemmacount=0)
facit_element.save()
self.read_elements(fac, facit_element, qtype)
db.reset_queries()
def read_grammar_defaults(self, infile):
    """Read a grammar-defaults XML file into self.grammar_defaults.

    The result maps an element identifier to a GrammarDefault object
    holding its default parts of speech, Tag rows, Word rows and syntax,
    e.g.:

        {
            'SUBJ':  GrammarDefault(poses=[u'N', u'Pron'], tags=[...]),
            'N-LOC': GrammarDefault(poses=[u'N'], tags=[...]),
        }

    Raises GrammarDefault.Error when an element expands to tag strings
    that match no Tag row in the database.
    """
    class GrammarDefaultError(Exception):
        """Raised when an element's tag strings match nothing in the DB."""
        def __init__(self, element=False, tagstrings=False):
            self.element = element
            self.tagstrings = tagstrings
        def __str__(self):
            msg = (
                "\n ** No tags were present in the database matching\n"
            )
            if self.element:
                msg += " grammar element: %s\n" % self.element
            else:
                msg += " an unknown grammar element\n"
            if self.tagstrings:
                msg += " with the following expanded tag strings:\n"
                msg += " " + " ".join(self.tagstrings)
            msg += "\n Check that these words/forms are installed"
            return msg

    class GrammarDefault(object):
        """Container for one element's default poses/tags/words/syntax."""
        Error = GrammarDefaultError
        def __init__(self,
                     poses=False,
                     tags=False,
                     words=False,
                     syntax=False):
            self.tags = tags or list()
            self.poses = poses or list()
            self.words = words or list()
            self.syntax = syntax or list()
        def __str__(self):
            returns = []
            if self.poses:
                returns.append('|'.join(self.poses) + ' - ')
            if self.tags:
                returns.append(', '.join([t.string for t in self.tags]))
            else:
                if self.poses:
                    returns.append('None')
            if self.words:
                returns.append(', '.join([w.lemma for w in self.words]))
            if self.syntax:
                returns.append(', '.join(self.syntax))
            return ' '.join(returns)
        def __repr__(self):
            # Was `'' % str(self)`, which raised TypeError ("not all
            # arguments converted") whenever a repr was taken.
            return '<GrammarDefault: %s>' % str(self)

    # (The original opened `file(infile)` and never used or closed it;
    # _dom.parse() takes the path directly.)
    tree = _dom.parse(infile)
    self.grammar_defaults = {}
    tags = tree.getElementsByTagName("tags")[0]
    elements = tags.getElementsByTagName("element")
    for el in elements:
        identifier = el.getAttribute("id")
        grammar_default = GrammarDefault()
        word_id = None
        syntax = ""
        syntaxes = el.getElementsByTagName("syntax")
        if syntaxes:
            syntax = syntaxes[0].firstChild.data
            grammar_default.syntax = syntax
        word_ids = el.getElementsByTagName("id")
        if word_ids:
            word_id = word_ids[0].firstChild.data
            word_id_hid = word_ids[0].getAttribute("hid").strip()
            if word_id:
                words = Word.objects.filter(wordid=word_id)
                if word_id_hid:
                    # A homonym id (hid) narrows the lookup further.
                    words = words.filter(hid=int(word_id_hid))
                grammar_default.words = words
        tagstrings = []
        grammars = el.getElementsByTagName("grammar")
        for gr in grammars:
            pos = gr.getAttribute("pos")
            if pos:
                grammar_default.poses.append(pos)
            tag = gr.getAttribute("tag")
            # Expand tag sets into all concrete tag strings.
            tagstrings.extend(self.get_tagvalues([tag]))
        if len(tagstrings) > 0:
            # Renamed from `tags` to avoid clobbering the <tags> root node.
            tag_rows = Tag.objects.filter(string__in=tagstrings)
            if tag_rows.count() == 0:
                tag_elements = ', '.join([e.toprettyxml() for e in grammars])
                raise GrammarDefault.Error(element=tag_elements,
                                           tagstrings=tagstrings)
            else:
                grammar_default.tags = tag_rows
        self.grammar_defaults[identifier] = grammar_default
def get_tagvalues(self, tags):
    """Expand tag specifications into concrete tag strings.

    Each entry of *tags* is a '+'-separated tag specification whose
    components may be plain tag names or tag-set names (as installed
    from tags.txt and paradigms.txt).  Tag sets are expanded to every
    member and the cartesian product of all components is returned as a
    flat list of '+'-joined tag strings.

    Returns that list, or False when *tags* is not a list.  Components
    recognized as neither a tagname nor a tagset are dropped; a tag with
    no recognized components contributes nothing.  (The original
    docstring claimed this "alters state without returning objects" --
    it is in fact a pure function.)
    """
    def fill_out(parts):
        # Cartesian product over components; scalar components are
        # wrapped in a one-element list first.
        from itertools import product
        def make_list(item):
            if isinstance(item, list):
                return item
            return [item]
        return list(product(*map(make_list, parts)))

    def parse_tag(tag):
        # Collect each '+'-component as either itself (a tagname) or the
        # list of member tagnames (a tagset).
        tag_string = []
        for item in tag.split('+'):
            if Tagname.objects.filter(tagname=item).count() > 0:
                tag_string.append(item)
            elif Tagset.objects.filter(tagset=item).count() > 0:
                tagnames = Tagname.objects.filter(tagset__tagset=item)
                tag_string.append([t.tagname for t in tagnames])
        if tag_string:
            return ['+'.join(item) for item in fill_out(tag_string)]
        # Was `return False`, which made the sum() below raise TypeError
        # whenever a tag had no recognized components; an empty list
        # simply skips the unrecognized tag.
        return []

    if isinstance(tags, list):
        tags = [a for a in tags if a]
        return sum(map(parse_tag, tags), [])
    return False
def delete_question(self, qid=None):
    """Delete every Question whose qid, or whose display string, equals *qid*.

    Does nothing when *qid* is falsy.
    """
    if not qid:
        return
    # First match on the question id field, then on the sentence string.
    for field in ('qid', 'string'):
        matches = Question.objects.filter(**{field: qid})
        if matches:
            for question in matches:
                question.delete()