# -*- coding: utf-8 -*-
"""Install question and answer definitions from XML files into the database.

The ``Questions`` class parses a grammar-defaults XML file and a questions
XML file with ``xml.dom.minidom`` and creates the corresponding ``Question``,
``QElement``, ``Source`` and word-link rows through the Django ORM.

All ``print`` calls use the single-argument parenthesised form so that the
module parses identically under Python 2 and Python 3.
"""
from settings import *
from drill.models import *
from morfac.models import *
from xml.dom import minidom as _dom
from optparse import OptionParser
from django import db
import sys
import re
import string
import codecs


class Questions:
    """Reader that turns question/answer XML into database rows.

    ``self.values`` is filled by ``read_grammar`` and maps an element id to a
    dict of defaults with the keys ``'pos'`` (always present) and, when the
    grammar file provides them, ``'syntax'``, ``'words'`` and ``'tags'``.
    ``read_grammar`` must therefore run before ``read_element`` is used;
    ``read_questions`` takes care of that.
    """

    def read_element(self, qaelement, el, el_id, qtype):
        """Create the QElement rows for one element of a question or answer.

        qaelement -- the Question row (question or answer) owning the element.
        el        -- the <element> DOM node describing the element, or None
                     when the XML gives no description for it.
        el_id     -- identifier of the element inside the question string.
        qtype     -- question type string; when ``el`` is None and ``el_id``
                     equals it, the element is stored as the task.

        Returns None.  Calls ``sys.exit(2)`` when grammars are given but no
        tags match, or when a created element has no candidate words.
        """
        semclass = False
        defaults = self.values.get(el_id, {})

        print("")
        print("Creating element %s" % el_id)

        # Syntactic function of the element
        syntax = defaults.get('syntax', el_id)

        if not el:
            print("%s No element given." % syntax)

        # Some of the answer elements share content of question elements.
        content_id = ""
        if el:
            content_id = el.getAttribute("content")
        if not content_id:
            content_id = el_id

        # Search for the same element on the question side.
        # If there is no element given in the answer, the element
        # is a copy of the question.
        question_qelements = None
        if not el or el.getAttribute("content"):
            in_question = QElement.objects.filter(
                question__id=qaelement.question_id, identifier=content_id)
            if in_question.count() > 0:
                question_qelements = in_question
            elif el:
                # An explicit content reference may also point at an element
                # of this very question/answer.
                in_self = QElement.objects.filter(
                    question__id=qaelement.id, identifier=content_id)
                if in_self.count() > 0:
                    question_qelements = in_self

        if not el and question_qelements:
            for q in question_qelements:
                qe = QElement.objects.create(question=qaelement,
                                             identifier=el_id,
                                             syntax=q.syntax)
                # mark as a copy
                q.copy_set.add(qe)
                qe.save()
                q.save()
            # NOTE(review): the return is placed after the loop so that every
            # matching question element gets its copy -- confirm.
            return

        ############### AGREEMENT
        # Search for elements that agree
        agr_elements = None
        if syntax == "MAINV":
            agr_id = "SUBJ"
            print("TRYING verb agreement " + agr_id + " " + qaelement.qatype)
            subjects = QElement.objects.filter(
                question=qaelement, syntax=agr_id,
                question__qatype=qaelement.qatype)
            if subjects.count() > 0:
                agr_elements = subjects

        agreement = ""
        if el:
            agreement = el.getElementsByTagName("agreement")

        # Agreement from xml-files
        # Try first inside question or answer
        # Then in answer-question level
        if agreement:
            agr_id = agreement[0].getAttribute("id")
            print("Agreement: %s" % agr_id)
            same_level = QElement.objects.filter(
                question=qaelement, syntax=agr_id,
                question__qatype=qaelement.qatype)
            if same_level.count() > 0:
                agr_elements = same_level
            elif Question.objects.filter(id=qaelement.question_id).count() > 0:
                parent_level = QElement.objects.filter(
                    question__id=qaelement.question_id, syntax=agr_id)
                if parent_level.count() > 0:
                    agr_elements = parent_level

            if not agr_elements:
                print("ERROR: no agreement elements found")

        ############ WORDS
        # Search for existing word in the database.
        ids = []
        if el:
            ids = el.getElementsByTagName("id")
        words = {}
        word_elements = None
        for i in ids:
            word_id = i.firstChild.data
            word_id_hid = i.getAttribute("hid").strip()
            if word_id:
                if word_id_hid:
                    print("found word %s/%s" % (word_id, word_id_hid))
                    word_elements = Word.objects.filter(wordid=word_id,
                                                        hid=int(word_id_hid))
                else:
                    print("found word %s" % word_id)
                    word_elements = Word.objects.filter(wordid=word_id)
                # Add pos information here!
                if not word_elements:
                    print("Word not found! " + word_id)

        # Search for existing semtype / valency.
        # These are consulted only when no word was found by id.
        if not word_elements:
            semclasses = []
            if el:
                semclasses = el.getElementsByTagName("sem")
            if semclasses:
                semclass = semclasses[0].getAttribute("class")
                word_elements = Word.objects.filter(semtype__semtype=semclass)
            valclasses = []
            if el:
                valclasses = el.getElementsByTagName("val")
            if valclasses:
                valclass = valclasses[0].getAttribute("class")
                word_elements = Word.objects.filter(valency=valclass)

        # If still no words, get the default words for this element:
        if not word_elements:
            if 'words' in defaults:
                word_elements = defaults['words']

        if word_elements:
            for w in word_elements:
                if w.pos not in words:
                    words[w.pos] = []
                words[w.pos].append(w)

        ############# GRAMMAR
        tagelements = None
        grammars = []
        if el:
            grammars = el.getElementsByTagName("grammar")
        if not el or not grammars:
            # Without a grammatical specification the element is created
            # solely on the basis of the default tags.
            if 'tags' in defaults:
                tagelements = defaults['tags']
        else:
            # An element for each different grammatical specification.
            poses = []
            tags = []
            for gr in grammars:
                tags.append(gr.getAttribute("tag"))
                poses.append(gr.getAttribute("pos"))
            tagstrings = []
            if poses:
                if 'tags' in defaults:
                    tagelements = defaults['tags'].filter(pos__in=poses)
            if tags:
                for tag in tags:
                    tagvalues = []
                    self.get_tagvalues(tag, "", tagvalues)
                    tagstrings.extend(tagvalues)
                # The tag strings are only used when the pos-filtered
                # defaults are empty (same result as the former
                # "tagelements or Tag.objects.filter(...)").
                if not tagelements:
                    tagelements = Tag.objects.filter(string__in=tagstrings)

            # NOTE(review): the two checks below are kept inside the grammar
            # branch, where tagelements is always a queryset; outside it
            # tagelements may be None -- confirm against the original layout.

            # Extra check for pronouns
            # If pronoun id is given, only the tags related to that pronoun
            # are preserved.
            for t in tagelements:
                if t.pos == 'Pron':
                    if 'Pron' not in words:
                        break
                    found = False
                    for w in words['Pron']:
                        if Form.objects.filter(tag=t, word=w).count() > 0:
                            found = True
                            break
                    if not found:
                        tagelements = tagelements.exclude(id=t.id)

            # Remove those words which do not have any forms with the tags.
            if 'N' in words:
                # Iterate over a copy: removing from the list that is being
                # iterated would skip the word following each removed one.
                for w in list(words['N']):
                    found = False
                    for t in tagelements:
                        if t.pos == 'N':
                            if Form.objects.filter(tag=t, word=w).count() > 0:
                                found = True
                                break
                    if not found:
                        words['N'].remove(w)

        # Find different pos-values in tagelements
        posvalues = {}

        # Elements that have no inflection information are not created.
        if not tagelements and not agr_elements:
            print("no inflection for %s" % el_id)
            if len(grammars) > 0:
                sys.stderr.write(" ** Grammars defined in element, but no inflections were found.\n")
                sys.stderr.write(" Check that tags.txt and paradigms.txt include all tags.\n")
                sys.stderr.write("\n")
                sys.stderr.write(" Alternatively, ensure that is a valid tag,\n")
                sys.stderr.write(" or that is a valid PoS.\n")
                sys.exit(2)
            return

        if not tagelements:
            posvalues[""] = 1
        else:
            for t in tagelements:
                posvalues[t.pos] = 1

        # Mark the element as the task of the question when requested.
        if el:
            task = el.getAttribute("task")
            if task:
                print("setting %s as task" % el_id)
                qaelement.task = syntax
                qaelement.save()
        else:
            if el_id == qtype:
                qaelement.task = syntax
                qaelement.save()

        ############# CREATE ELEMENTS
        print('CREATING ELEMENTS')
        print('Elements for the following keys...')
        print(list(posvalues.keys()))

        # Add an element for each pos:
        for p in list(posvalues.keys()):
            qe = QElement.objects.create(question=qaelement,
                                         identifier=el_id,
                                         syntax=syntax)

            if semclass:
                semty, _ = Semtype.objects.get_or_create(semtype=semclass)
                qe.semtype = semty
                qe.save()
                print('\tsemtype:  %s' % semclass)

            # Add links to corresponding question elements.
            if question_qelements:
                for q in question_qelements:
                    q.copy_set.add(qe)
                    qe.save()
                    q.save()

            if tagelements:
                for t in tagelements:
                    print('\ttag:  %s' % t.string)
                    if t.pos == p:
                        qe.tags.add(t)

            # Create links to words.
            if p not in words:
                print("looking for words.. %s %s" % (el_id, p))
                # Just filtering isn't enough; .filter() doesn't return a
                # list of unique items with this kind of query, so the
                # primary keys are de-duplicated through a set.
                candidates = Word.objects.filter(form__tag__in=qe.tags.all())
                if semclass:
                    candidates = candidates.filter(semtype=qe.semtype)
                word_pks = list(set(candidates.values_list('pk', flat=True)))

                if len(word_pks) == 0:
                    print('Error: Elements with zero possibilities not permitted.')
                    print(' >  %s' % (qe.question,))
                    print(' > Word tags: %s' % repr(qe.tags.all()))
                    print(' > semtypes: %s' % repr(qe.semtype))
                    sys.exit(2)

                print('%d elements available. ' % len(word_pks))

                if not word_elements:
                    word_elements = []
                else:
                    word_elements = list(word_elements)

                for pk in word_pks:
                    w = Word.objects.get(pk=int(pk))
                    # Bookkeeping kept exactly as before: while ``p`` is
                    # still missing from ``words`` the bucket is reset.
                    if p not in words:
                        words[w.pos] = []
                    if w.pos not in words:
                        words[w.pos] = []
                    words[w.pos].append(w)
                    word_elements.append(w)

            for w in word_elements:
                qe.wordqelement_set.create(word=w)

            # add agreement info.
            if agr_elements:
                for a in agr_elements:
                    a.agreement_set.add(qe)
                    a.save()
            qe.save()

    def read_elements(self, head, qaelement, qtype):
        """Read the elements attached to a particular question or answer.

        head      -- DOM node (<question> or <answer>) holding <element> nodes.
        qaelement -- the Question row whose ``string`` lists the element ids.
        qtype     -- question type string, passed through to read_element.

        The subject is read first so that later elements can agree with it.
        """
        els = head.getElementsByTagName("element")
        qastrings = qaelement.string.split()

        # Read first subject for agreement
        if "SUBJ" in qastrings:
            subject_node = None
            for e in els:
                if e.getAttribute("id") == "SUBJ":
                    subject_node = e
                    break
            self.read_element(qaelement, subject_node, "SUBJ", qtype)

        # Process rest of the elements in the string.
        subj_skipped = False
        for s in qastrings:
            # Only the first SUBJ token was handled above.
            if s == "SUBJ" and not subj_skipped:
                subj_skipped = True
                continue

            syntax = s.lstrip("(").rstrip(")")

            found = False
            if s != "SUBJ":
                for e in els:
                    if e.getAttribute("id") == s:
                        self.read_element(qaelement, e, syntax, qtype)
                        found = True
            if not found:
                self.read_element(qaelement, None, syntax, qtype)

    def _attach_source(self, question_element, book):
        """Link ``question_element`` to the Source named ``book``, creating it if needed."""
        book_entry, created = Source.objects.get_or_create(name=book)
        if created:
            print("Created book entry with name  %s" % book)
        question_element.source.add(book_entry)
        question_element.save()

    def read_questions(self, infile, grammarfile):
        """Install every <q> entry of ``infile`` into the database.

        infile      -- path of the questions XML file.
        grammarfile -- path of the grammar-defaults XML file, loaded first
                       through read_grammar.

        Exits the process when a <q> node has no ``id`` attribute.
        """
        # minidom.parse() opens and closes the file itself; the former extra
        # file(infile) handle was never used nor closed.
        tree = _dom.parse(infile)

        self.read_grammar(grammarfile)

        qs = tree.getElementsByTagName("questions")[0]
        gametype = qs.getAttribute("game")
        if not gametype:
            gametype = "morfa"

        print("Created questions:")
        for q in tree.getElementsByTagName("q"):

            qid = q.getAttribute('id')
            if not qid:
                print("ERROR Missing question id, stopping.")
                sys.exit()
            print(qid.encode('utf-8'))

            level = q.getAttribute('level')
            if not level:
                level = "1"

            # Store question; several <qtype> nodes are joined with commas.
            qtype = ""
            qtype_els = q.getElementsByTagName("qtype")
            if qtype_els:
                qtype = ','.join([node.firstChild.data for node in qtype_els])

            question = q.getElementsByTagName("question")[0]
            text = question.getElementsByTagName("text")[0].firstChild.data

            # If there exists already a question with that name,
            # delete all the references to it.
            questions = Question.objects.filter(qid=qid)
            if questions:
                questions[0].delete()

            question_element, _ = Question.objects.get_or_create(
                qid=qid,
                level=int(level),
                string=text,
                qtype=qtype,
                gametype=gametype,
                qatype="question")

            # Add source information if present
            # NOTE(review): the "all" fallback is applied when the question
            # has no <sources> node -- confirm against the original layout.
            source_nodes = q.getElementsByTagName("sources")
            if source_nodes:
                for book_el in source_nodes[0].getElementsByTagName("book"):
                    book = book_el.getAttribute("name")
                    if book:
                        # Leave this if DTD is used
                        self._attach_source(question_element, book)
            else:
                self._attach_source(question_element, "all")

            # Read the elements
            self.read_elements(question, question_element, qtype)

            # There can be more than one answer for each question,
            # Store them separately.
            for ans in q.getElementsByTagName("answer"):
                text = ans.getElementsByTagName("text")[0].firstChild.data
                answer_element = Question.objects.create(
                    string=text,
                    qatype="answer",
                    question=question_element,
                    level=1)
                answer_element.save()
                self.read_elements(ans, answer_element, qtype)

            db.reset_queries()

    def read_grammar(self, infile):
        """Load the per-element defaults from the grammar XML file.

        infile -- path of the grammar-defaults XML file.

        Fills ``self.values`` with one dict per <element> found under the
        first <tags> node: ``'pos'`` always, plus ``'syntax'``, ``'words'``
        and ``'tags'`` when the element provides them.
        """
        # minidom.parse() opens and closes the file itself.
        tree = _dom.parse(infile)

        self.values = {}

        tags_root = tree.getElementsByTagName("tags")[0]
        for el in tags_root.getElementsByTagName("element"):

            identifier = el.getAttribute("id")
            info2 = {}

            syntaxes = el.getElementsByTagName("syntax")
            if syntaxes:
                info2['syntax'] = syntaxes[0].firstChild.data

            word_ids = el.getElementsByTagName("id")
            if word_ids:
                word_id = word_ids[0].firstChild.data
                word_id_hid = word_ids[0].getAttribute("hid").strip()
                if word_id:
                    words = Word.objects.filter(wordid=word_id)
                    if word_id_hid:
                        words = words.filter(hid=int(word_id_hid))
                    info2['words'] = words

            info2['pos'] = []
            tagstrings = []
            for gr in el.getElementsByTagName("grammar"):
                pos = gr.getAttribute("pos")
                if pos:
                    info2['pos'].append(pos)
                tagvalues = []
                self.get_tagvalues(gr.getAttribute("tag"), "", tagvalues)
                tagstrings.extend(tagvalues)

            if len(tagstrings) > 0:
                info2['tags'] = Tag.objects.filter(string__in=tagstrings)

            self.values[identifier] = info2

    def get_tagvalues(self, rest, tagstring, tagvalues):
        """Expand a '+'-separated tag pattern into concrete tag strings.

        rest      -- the part of the pattern not yet processed.
        tagstring -- the tag string built so far.
        tagvalues -- list that receives every completed tag string
                     (modified in place).

        Each '+'-separated piece is kept as is when it names a Tagname;
        when it names a Tagset it is replaced by each Tagname of that set.
        A piece that is neither ends that expansion without output.
        """
        if not rest:
            tagvalues.append(tagstring)
            return

        # Split off the first piece; ``rest`` becomes "" when no '+' is left.
        piece, _sep, rest = rest.partition('+')

        if Tagname.objects.filter(tagname=piece).count() > 0:
            if tagstring:
                tagstring = tagstring + "+" + piece
            else:
                tagstring = piece
            self.get_tagvalues(rest, tagstring, tagvalues)
        elif Tagset.objects.filter(tagset=piece).count() > 0:
            for member in Tagname.objects.filter(tagset__tagset=piece):
                if tagstring:
                    expanded = tagstring + "+" + member.tagname
                else:
                    expanded = member.tagname
                self.get_tagvalues(rest, expanded, tagvalues)

    def delete_question(self, qid=None):
        """Delete every Question whose ``qid`` or ``string`` equals ``qid``.

        Does nothing when ``qid`` is empty or None.
        """
        if not qid:
            return
        # NOTE(review): the lookup by ``string`` is kept under the same
        # non-empty-qid guard as the lookup by ``qid`` -- confirm.
        for q in Question.objects.filter(qid=qid):
            q.delete()
        for q in Question.objects.filter(string=qid):
            q.delete()