# -*- coding: utf-8 -*-
import settings
from django.db.models import Q
from xml.dom import minidom as _dom
from django.utils.encoding import force_unicode
import sys

# Word, Form, Tag, Source, Semtype
from drills.models import (Word, Wordnob, Wordswe, Wordsme, Wordeng, Worddeu,
                           Form, Tag, Source, Semtype, Dialect)

# import re
# import string
# import codecs

# For easier debugging.
# _D = open('/dev/ttys005', 'w')
_D = open('/dev/null', 'w')

# Lexicons: words

# TODO: 1. How many queries is this running?
# @cip: The answer to this question (which is one of very many I would have
#       asked Saara) should be in the maintenance document on the web.
# TODO: 2. Doing all of this in one single transaction would be good.
# @cip: I would like to do so, but after the delete-entry bug I am not so sure.
# @ryan: Musing while waiting for a database to be populated... What was the bug?
#        I tested with one big transaction just to see, and it sped things up a
#        bit... But I think the major slowdown is from words being generated
#        incrementally. It would be much faster to scan for all the words,
#        generate them in one go, and store them in a data structure for easy
#        access later, so that `lookup` only has to be called once. `lookup`
#        seems to take a good deal of time to start up, but then processes
#        much faster, of course. Thus, transactions probably don't matter as
#        much as eliminating the incremental generation would.
#        ... Maybe I'll try this out when there's some free time.
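# A minimal sketch of the "scan everything once, keep it in a data structure"
# idea from the note above. `collect_entry_pos` is a hypothetical helper and is
# not wired into the pipeline; it assumes the <e>/<lg>/<l> layout used below.
def collect_entry_pos(tree):
    """Map each entry's word id (or lemma) to its part of speech."""
    entries = {}
    for e in tree.getElementsByTagName("e"):
        l = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0]
        wid = e.getAttribute("id") or (l.firstChild and l.firstChild.data)
        entries[wid] = l.getAttribute("pos")
    return entries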
# def hasApp(elem):
#     apps = elem.getElementsByTagName("app")
#
#     for a in apps:
#         name = a.getAttribute("name")
#         if name and name == "oahpa":
#             return True
#
#     return False
#
# def getPOS(elem):
#     return elem.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].getAttribute("pos")

from django.db import transaction

class Words:

    @transaction.commit_manually
    def install_lexicon(self, infile, linginfo, delete=None, paradigmfile=None):
        """Parse the lexicon XML in `infile` and store its entries."""
        # xmlfile = file(infile)  # never used
        tree = _dom.parse(infile)
        lex = tree.getElementsByTagName("r")[0]
        mainlang = lex.getAttribute("xml:lang")
        print >> sys.stdout, "Mainlang defined ", mainlang
        if not mainlang:
            print >> sys.stderr, "Attribute mainlang not defined in", infile, "stop."
            sys.exit()

        self.all_wordids = []
        es = tree.getElementsByTagName("e")
        total = len(es)
        count = 0
        for e in es:
            hasApp = False
            apps = e.getElementsByTagName("app")
            for a in apps:
                name = a.getAttribute("name")
                if name and name == "oahpa":
                    hasApp = True

            pos = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].getAttribute("pos")
            # Uppercase POS.
            pos = pos.upper()
            if pos.startswith('PHRASE_'):
                pos = pos.replace('PHRASE_', '')  # Just in case we have a longer POS.

            # this should be checked, too
            # semantics = e.getElementsByTagName("semantics")[0]
            # elements = semantics.getElementsByTagName("sem")

            if hasApp:
                if pos:
                    print >> sys.stdout, "pos defined ", pos
                    self.store_word(e=e, linginfo=linginfo, mainlang=mainlang,
                                    delete=delete, paradigmfile=paradigmfile)
                else:
                    try:
                        __data = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].firstChild.data
                    except AttributeError:
                        __data = 'None'
                    print >> sys.stdout, "undefined pos for ", __data
            else:
                try:
                    __data = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].firstChild.data
                except AttributeError:
                    __data = 'None'
                print >> sys.stdout, "NON-oahpa; empty lemma and no pos defined", __data

            count += 1
            print >> sys.stdout, '--- %d/%d entries processed' % (count, total)

        if delete and pos:
            # Remove database words of this POS that no longer occur in the XML.
            allids = Word.objects.filter(
                Q(pos=pos) & ~Q(semtype__semtype="PLACE-NAME-LEKSA")
            ).values_list('wordid', flat=True)
            existing = set(self.all_wordids)
            for a in allids:
                if force_unicode(a) not in existing:
                    print >> sys.stdout, "Word id not found in xml. Deleting:", a
                    word = Word.objects.get(pos=pos, wordid=a)
                    word.delete()

        transaction.commit()

    def add_translation(self, el, w, pos):
        """Store one translation element `el` and link it to word `w`."""
        if el.firstChild:
            lemma = phrase = explanation = False
            if el.tagName == 't':
                translation = lemma = el.firstChild.data
            if el.tagName == 'tf':
                translation = phrase = el.firstChild.data
            if el.tagName == 'te':
                translation = explanation = el.firstChild.data

            # TODO: For the moment the translations and explanations
            # are only in effect for nob.
            lang = el.getAttribute("xml:lang")
            pos = pos.upper()
            if lang == "sma":
                if Word.objects.filter(wordid=translation, pos__iexact=pos).count() > 0:
                    transl = Word.objects.filter(wordid=translation, pos__iexact=pos)[0]
                else:
                    # get_or_create() cannot create through an __iexact lookup,
                    # so use the plain field here.
                    transl, created = Word.objects.get_or_create(wordid=translation, pos=pos)
                    if created:
                        transl.lemma = translation
                        transl.save()
                # Add reference to the new word object as translation.
                w.translations.add(transl)
                w.save()
            else:
                if lang == "sme":
                    transl, created = Wordsme.objects.get_or_create(wordid=translation)
                    if created:
                        transl.lemma = translation
                        transl.save()
                    w.translations2sme.add(transl)
                    w.save()
                elif lang == "deu":
                    transl, created = Worddeu.objects.get_or_create(wordid=translation)
                    if created:
                        transl.lemma = translation
                        transl.save()
                    w.translations2deu.add(transl)
                    w.save()
                elif lang == "eng":
                    transl, created = Wordeng.objects.get_or_create(wordid=translation)
                    if created:
                        transl.lemma = translation
                        transl.save()
                    w.translations2eng.add(transl)
                    w.save()
                    # Special treatment for the to-infinitive: also store the
                    # verb without the infinitive marker. lstrip() strips a
                    # character set and would also eat e.g. the start of "tow",
                    # so strip the marker as a prefix instead.
                    if pos == "V":
                        oo = "to".decode('utf8')
                        if translation.startswith(oo + " "):
                            wordform = translation[len(oo) + 1:]
                        else:
                            wordform = translation
                        transl, created = Wordeng.objects.get_or_create(wordid=wordform)
                        if created:
                            transl.lemma = wordform
                            transl.save()
                        # Add reference to the new word object as translation.
                        w.translations2eng.add(transl)
                        w.save()
                elif lang == "nob":
                    transl, created = Wordnob.objects.get_or_create(wordid=translation)
                    if created:
                        if lemma:
                            transl.lemma = lemma
                        elif phrase:
                            transl.phrase = phrase
                        elif explanation:
                            transl.explanation = explanation
                        transl.save()
                    w.translations2nob.add(transl)
                    w.save()
                    # Special treatment for the å-infinitive, stripped as a
                    # prefix for the same reason as above.
                    if pos == "V":
                        oo = "å".decode('utf8')
                        if translation.startswith(oo + " "):
                            wordform = translation[len(oo) + 1:]
                        else:
                            wordform = translation
                        transl, created = Wordnob.objects.get_or_create(wordid=wordform)
                        if created:
                            transl.lemma = wordform
                            transl.save()
                        # Add reference to the new word object as translation.
                        w.translations2nob.add(transl)
                        w.save()
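    # Note: the get_or_create()-then-save() pattern above can fill the lemma in
    # a single round trip with Django's `defaults` argument; a sketch, assuming
    # the same field names:
    #
    #     transl, created = Wordnob.objects.get_or_create(
    #         wordid=translation, defaults={'lemma': translation})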
    def add_semantics(self, e, w):
        """Attach semantic classes found in entry `e` to word `w`."""
        semantics = e.getElementsByTagName("semantics")[0]
        elements = semantics.getElementsByTagName("sem")
        for el in elements:
            sem = el.getAttribute("class")
            if sem:
                print >> sys.stdout, "Semantic cls: ", sem
                # Add semantics entry if not found.
                # Leave this if DTD is used.
                sem_entry, created = Semtype.objects.get_or_create(semtype=sem)
                if created:
                    print >> sys.stdout, "Created semtype entry with name ", sem
                w.semtype.add(sem_entry)
                w.save()

    def add_sources(self, e, w):
        """Attach book sources found in entry `e` to word `w`."""
        sources = e.getElementsByTagName("sources")[0]
        elements = sources.getElementsByTagName("book")
        for el in elements:
            book = el.getAttribute("name")
            if book:
                # Add book to the database.
                # Leave this if DTD is used.
                book_entry, created = Source.objects.get_or_create(name=book)
                if created:
                    print >> sys.stdout, "Created book entry with name ", book
                w.source.add(book_entry)
                w.save()

    def store_word(self, e, linginfo, mainlang, paradigmfile, delete):
        """Store one lexicon entry `e`, its forms, and its translations."""
        # Store first unique fields.
        wid = e.getAttribute("id")
        lemma = e.getElementsByTagName("l")[0].firstChild.data
        if not wid:
            wid = lemma
        self.all_wordids.append(wid)

        stem = ""
        forms = ""
        diphthong = "no"
        gradation = ""
        rime = ""
        wordclass = ""
        if e.getElementsByTagName("l")[0].getAttribute("class"):
            wordclass = e.getElementsByTagName("l")[0].getAttribute("class")
            print >> sys.stdout, wordclass

        attrsuffix = ""
        soggi = ""
        valency = ""
        compare = ""
        frequency = ""
        geography = ""
        presentationform = ""
        only_sg = 0
        only_pl = 0
        noleksa = 0

        if e.getElementsByTagName("forms"):
            forms = e.getElementsByTagName("forms")[0]
        if e.getElementsByTagName("presentationform"):
            presentationform = e.getElementsByTagName("presentationform")[0].firstChild.data
        if e.getElementsByTagName("stem"):
            stem = e.getElementsByTagName("stem")[0].getAttribute("class")
            diphthong = e.getElementsByTagName("stem")[0].getAttribute("diphthong")
            gradation = e.getElementsByTagName("stem")[0].getAttribute("gradation")
            rime = e.getElementsByTagName("stem")[0].getAttribute("rime")
            if rime == "0":
                rime = "norime"
            soggi = e.getElementsByTagName("stem")[0].getAttribute("soggi")
            compare = e.getElementsByTagName("stem")[0].getAttribute("compare")
            attrsuffix = e.getElementsByTagName("stem")[0].getAttribute("attrsuff")
            if attrsuffix == "0":
                attrsuffix = "noattr"
        if e.getElementsByTagName("frequency"):
            frequency = e.getElementsByTagName("frequency")[0].getAttribute("class")
        if e.getElementsByTagName("geography"):
            geography = e.getElementsByTagName("geography")[0].getAttribute("class")
        if e.getElementsByTagName("only-sg"):
            only_sg = 1
        if e.getElementsByTagName("only-pl"):
            only_pl = 1
        if e.getElementsByTagName("noleksa"):
            noleksa = 1
        if e.getElementsByTagName("valency"):
            valencies = e.getElementsByTagName("valency")[0]
            for val in valencies.getElementsByTagName("val"):
                valency = val.getAttribute("class")
                if valency:
                    break

        # Part of speech information.
        pos = e.getElementsByTagName("l")[0].getAttribute("pos")
        if pos.startswith('phrase_'):
            # Strip the phrase_ prefix; the original [7:8] kept only one character.
            pos = pos[len('phrase_'):]
        # if not pos:
        #     print "Part of speech information not found for ", lemma, ". give it command line: --pos=N"
        #     sys.exit()

        # Search for existing word in the database.
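        # The if/elif chain below could also be table-driven; a sketch, assuming
        # the same model classes (the generic Word case would also need pos=pos):
        #
        #     MODELS = {"nob": Wordnob, "swe": Wordswe, "sme": Wordsme,
        #               "eng": Wordeng, "deu": Worddeu}
        #     w, created = MODELS.get(mainlang, Word).objects.get_or_create(wordid=wid)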
        w = None
        # print 'bryllup, wordid/wid: ', wid
        # raw_input()
        if mainlang == "nob":
            w, created = Wordnob.objects.get_or_create(wordid=wid)
        elif mainlang == "swe":
            w, created = Wordswe.objects.get_or_create(wordid=wid)
        elif mainlang == "sme":
            w, created = Wordsme.objects.get_or_create(wordid=wid)
        elif mainlang == "eng":
            w, created = Wordeng.objects.get_or_create(wordid=wid)
        elif mainlang == "deu":
            w, created = Worddeu.objects.get_or_create(wordid=wid)
        else:
            w, created = Word.objects.get_or_create(wordid=wid, pos=pos)

        w.wordclass = wordclass
        w.pos = pos
        w.lemma = lemma
        w.presentationform = presentationform
        w.stem = stem
        w.rime = rime
        w.compare = compare
        w.attrsuffix = attrsuffix
        w.soggi = soggi
        w.gradation = gradation
        w.diphthong = diphthong
        w.valency = valency
        w.frequency = frequency
        w.geography = geography
        w.save()

        # Add forms and tags.
        if paradigmfile:
            linginfo.create_paradigm(lemma=lemma, pos=pos, forms=forms)

            # Remove old forms.
            old_forms = Form.objects.filter(word=w)
            for f in old_forms:
                f.delete()

            for f in linginfo.paradigm:
                g = f.classes
                # Skip comparative/superlative forms for non-comparing adjectives.
                if w.pos == "A" and w.compare == "no" and \
                   (g.get('Grade') == "Comp" or g.get('Grade') == "Superl"):
                    continue
                t, created = Tag.objects.get_or_create(string=f.tags,
                                                       pos=g.get('Wordclass', ""),
                                                       number=g.get('Number', ""),
                                                       case=g.get('Case', ""),
                                                       possessive=g.get('Possessive', ""),
                                                       grade=g.get('Grade', ""),
                                                       infinite=g.get('Infinite', ""),
                                                       personnumber=g.get('Person-Number', ""),
                                                       polarity=g.get('Polarity', ""),
                                                       tense=g.get('Tense', ""),
                                                       mood=g.get('Mood', ""),
                                                       subclass=g.get('Subclass', ""),
                                                       attributive=g.get('Attributive', ""))
                t.save()
                form = Form(fullform=f.form, tag=t, word=w)
                print >> sys.stdout, "Created form: ", f.form
                form.save()
                try:
                    print >> _D, f.dialects
                    # The original fell back to an undefined `dialects` name when
                    # there was more than one dialect; copy f.dialects in both cases.
                    dialects2 = f.dialects[:]
                    for d in dialects2:
                        dia, created = Dialect.objects.get_or_create(dialect=d)
                        form.dialects.add(dia)
                        form.save()
                except AttributeError:
                    print >> _D, "No dialects specified"
                form.save()

        if only_sg:
            print >> sys.stdout, "deleting plural forms for", w.lemma
            Form.objects.filter(Q(word=w.id) & Q(tag__number="Pl")).delete()
        if only_pl:
            print >> sys.stdout, "deleting singular forms for", w.lemma
            # The original was missing the call parentheses here, so nothing was deleted.
            Form.objects.filter(Q(word=w.id) & Q(tag__number="Sg")).delete()

        if noleksa:
            print >> sys.stdout, "word not in leksa", w.lemma
            w.leksa = 0
        else:
            w.leksa = 1
        w.save()  # persist the leksa flag

        if e.getElementsByTagName("sources"):
            self.add_sources(e, w)
        if e.getElementsByTagName("semantics"):
            self.add_semantics(e, w)

        # Add translations. minidom is ridiculous: there is no way to select
        # just the direct t/tf/te children, so filter childNodes by tag name.
        translations = e.getElementsByTagName("tg")[0]
        elements = []
        for node in translations.childNodes:
            try:
                tagname = node.tagName
            except AttributeError:
                continue
            if tagname.startswith('t'):
                elements.append(node)
        # elements = translations.getElementsByTagName("t")
        for el in elements:
            self.add_translation(el, w, pos)

    def delete_word(self, wid=None, pos=None):
        """Delete the word with id `wid` and part of speech `pos`."""
        if not pos:
            print "specify the part of speech with option -p"

        # to debug and fix: delete word routine
        # wordruss = Wordrus.objects.filter(wordid=wid)
        # for w in wordruss:
        #     print "Removing", w.wordid
        #     w.delete()

        if wid and pos:
            words = Word.objects.filter(wordid=wid, pos=pos)
            for w in words:
                print >> sys.stdout, "Removing", w.wordid
                w.delete()
            if not words:
                print wid, "not found"
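# Typical driver code lives in the calling management command; a hypothetical
# invocation for reference only (the file name and `linginfo` object are
# placeholders):
#
#     words = Words()
#     words.install_lexicon(infile='nouns.xml', linginfo=linginfo,
#                           delete=True, paradigmfile='paradigm.txt')
#     words.delete_word(wid=u'giella', pos='N')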