# -*- coding: utf-8 -*- from os import environ environ['DJANGO_SETTINGS_MODULE'] = 'settings' from settings import * from drill.models import * from xml.dom import minidom as _dom from optparse import OptionParser from django.db.models import Q import sys import re import codecs from ling import Paradigm, Questions parser = OptionParser() parser.add_option("-f", "--file", dest="infile", help="lexicon file name") parser.add_option("-p", "--pos", dest="pos", help="Pos info") parser.add_option("-d", "--db", dest="add_db", action="store_true", default=False, help="Used for adding tag infoformation to database") parser.add_option("-t", "--tagfile", dest="tagfile", help="List of tags and tagsets") parser.add_option("-r", "--paradigmfile", dest="paradigmfile", help="Generate paradigms") parser.add_option("-q", "--questionfile", dest="questionfile", help="XML-file that contains questions") parser.add_option("-g", "--grammarfile", dest="grammarfile", help="XML-file for grammar defaults for questions") parser.add_option("-v", "--vasta", dest="vasta", action="store_true", default=False, help="Questions are for vasta") parser.add_option("-e", "--feedbackfile", dest="feedbackfile", help="XML-file for feedback") parser.add_option("-m", "--messagefile", dest="messagefile", help="XML-file for feedback messages") parser.add_option("-s", "--sem", dest="semtypefile", help="XML-file semantic subclasses") parser.add_option("-n", "--num", dest="numerals", action="store_true", default=False, help="Generate numerals") parser.add_option("-l", "--place", dest="placenamefile", action="store_true", default=False, help="If placenames") parser.add_option("-c", "--comments", dest="commentfile", help="XML-file for comments") parser.add_option("-u", "--update", dest="update", action="store_true", default=False, help="If update data") (options, args) = parser.parse_args() linginfo = Paradigm() questions = Questions() if options.tagfile: linginfo.handle_tags(options.tagfile, options.add_db) if options.paradigmfile: linginfo.read_paradigms(options.paradigmfile, options.tagfile, options.add_db) if options.questionfile and options.grammarfile: questions.read_questions(options.questionfile,options.grammarfile,options.vasta) exit() if options.grammarfile: questions.read_grammar(options.grammarfile) exit() if options.semtypefile: questions.read_semtypes(options.semtypefile) exit() if options.feedbackfile: if options.pos: questions.read_feedback(options.feedbackfile, options.pos, options.messagefile) exit() if options.numerals: linginfo.generate_numerals() exit() if options.messagefile: questions.read_messages(options.messagefile) exit() if options.commentfile: questions.read_comments(options.commentfile) exit() if not options.infile: exit() xmlfile=file(options.infile) tree = _dom.parse(options.infile) lex = tree.getElementsByTagName("lexicon")[0] mainlang = lex.getAttribute("xml:lang") if not mainlang: mainlang="sme" for e in tree.getElementsByTagName("entry"): # Store first unique fields id=e.getAttribute("id") lemma=e.getElementsByTagName("lemma")[0].firstChild.data if not id: id=lemma stem="" dialect="" diphthong="no" gradation="" rime="" attrsuffix="" soggi="" valency="" compare="" frequency="" geography="" only_sg = 0 only_pl = 0 if e.getElementsByTagName("stem"): stem=e.getElementsByTagName("stem")[0].getAttribute("class") diphthong_text=e.getElementsByTagName("stem")[0].getAttribute("diphthong") gradation=e.getElementsByTagName("stem")[0].getAttribute("gradation") rime=e.getElementsByTagName("stem")[0].getAttribute("rime") if rime=="0": rime="norime" soggi=e.getElementsByTagName("stem")[0].getAttribute("soggi") compare=e.getElementsByTagName("stem")[0].getAttribute("compare") attrsuffix=e.getElementsByTagName("stem")[0].getAttribute("attrsuff") if attrsuffix == "0": attrsuffix="noattr" if e.getElementsByTagName("dialect"): dialect=e.getElementsByTagName("dialect")[0].getAttribute("class") if e.getElementsByTagName("frequency"): frequency=e.getElementsByTagName("frequency")[0].getAttribute("class") if e.getElementsByTagName("geography"): geography=e.getElementsByTagName("geography")[0].getAttribute("class") if e.getElementsByTagName("only-sg"): only_sg = 1 if e.getElementsByTagName("only-pl"): only_pl = 1 if e.getElementsByTagName("valency"): valencies = e.getElementsByTagName("valency")[0] for val in valencies.getElementsByTagName("val"): valency = val.getAttribute("class") if valency: break # Part of speech information # Is it in lexicon file or not.. if options.pos: pos=options.pos else: pos=e.getElementsByTagName("pos")[0].getAttribute("class") if not pos: print "Part of speech information not found for ", lemma, ". give it command line: --pos=N" sys.exit() # Search for existing word in the database. if mainlang == "nob": word_elements = Wordnob.objects.filter(Q(wordid=id)) else: word_elements = Word.objects.filter(Q(wordid=id) & Q(pos=pos)) # Update old one if the word was found if word_elements: #if not options.update: # print "Entry exists for ", lemma; w=word_elements[0] w.pos=pos w.lemma=lemma w.stem=stem w.rime=rime w.compare = compare w.attrsuffix = attrsuffix w.soggi=soggi w.gradation=gradation w.diphthong=diphthong w.dialect=dialect w.valency = valency w.frequency = frequency w.geography = geography w.save() # If adding placenames, do not update anymore #if options.placenamefile: continue else: if options.update: print "Adding entry for ", lemma , "."; # Otherwise create new word if mainlang=="nob": w=Wordnob(wordid=id,lemma=id,pos=pos); else: w=Word(wordid=id,lemma=lemma,pos=pos,stem=stem,diphthong=diphthong,\ rime=rime,soggi=soggi,gradation=gradation,dialect=dialect,attrsuffix=attrsuffix); w.save() # Add forms and tags if options.paradigmfile: linginfo.create_paradigm(lemma,pos) for form in linginfo.paradigm: g=form.classes if w.pos == "A" and w.compare == "no" and (g.get('Grade')=="Comp" or g.get('Grade')=="Superl"): #print g.get('Grade') continue #if w.pos == "N" and w.plural == "no" and (form.count('Pl')>0): # continue t,created=Tag.objects.get_or_create(string=form.tags,pos=g.get('Wordclass', ""),\ number=g.get('Number',""),case=g.get('Case',""),\ possessive=g.get('Possessive',""),grade=g.get('Grade',""),\ infinite=g.get('Infinite',""), \ personnumber=g.get('Person-Number',""),\ polarity=g.get('Polarity',""),\ tense=g.get('Tense',""),mood=g.get('Mood',""), \ subclass=g.get('Subclass',""),attributive=g.get('Attributive',"")) t.save() #print form.form, t.string, w.lemma form, created = Form.objects.get_or_create(fullform=form.form,tag=t,word=w,dialect=form.dialect) form.save() if only_sg: print "deleting forms for", w.lemma Form.objects.filter(Q(word=w.id) & Q(tag__number="Pl")).delete() if only_pl: print "deleting forms for", w.lemma Form.objects.filter(Q(word=w.id) & Q(tag__number="Sg")).delete if e.getElementsByTagName("sources"): sources = e.getElementsByTagName("sources")[0] elements=sources.getElementsByTagName("book") for el in elements: book=el.getAttribute("name") if book: # Add book to the database # Leave this if DTD is used book_entry, created = Source.objects.get_or_create(name=book) if created: print "Created book entry with name ", book w.source.add(book_entry) w.save() if e.getElementsByTagName("semantics"): # Give placenames special semantic tag # This is temporary solution. if options.placenamefile: sem_entry, created = Semtype.objects.get_or_create(semtype="PLACE-NAME-LEKSA") if created: print "Created semtype entry with name PLACE-NAME-LEKSA" w.semtype.add(sem_entry) w.save() else: semantics = e.getElementsByTagName("semantics")[0] elements=semantics.getElementsByTagName("sem") for el in elements: sem=el.getAttribute("class") if sem: # Add semantics entry if not found. # Leave this if DTD is used. sem_entry, created = Semtype.objects.get_or_create(semtype=sem) if created: print "Created semtype entry with name ", sem w.semtype.add(sem_entry) w.save() # Add translations translations = e.getElementsByTagName("translations")[0] elements=translations.getElementsByTagName("tr") for el in elements: if el.firstChild: translation=el.firstChild.data #print translation lang=el.getAttribute("xml:lang") if lang == "sme": if Word.objects.filter(wordid=translation,pos=pos).count()>0: transl = Word.objects.filter(wordid=translation,pos=pos)[0] else: transl, created = Word.objects.get_or_create(wordid=translation,pos=pos) if created: transl.lemma=translation transl.save() # Add reference to the new word object as translation. w.translations.add(transl) w.save() else: if lang == "nob": transl, created = Wordnob.objects.get_or_create(wordid=translation) if created: transl.lemma=translation transl.save() w.translations.add(transl) w.save() # special treatment for å-verbs. if pos=="V": oo = "å".decode('utf8') wordform = translation.lstrip(oo + " ") #print wordform transl, created = Wordnob.objects.get_or_create(wordid=wordform) if created: transl.lemma=wordform transl.save() # Add reference to the new word object as translation. w.translations.add(transl) w.save() # If placenames # Give norwegian words same semantic classes as sami words. # Temporary solution. if options.placenamefile: sem_entry, created = Semtype.objects.get_or_create(semtype="PLACE-NAME-LEKSA") if created: print "Created semtype entry with name PLACE-NAME-LEKSA" transl.semtype.add(sem_entry) transl.frequency=w.frequency transl.geography=w.geography transl.save() transl.save() else: continue """ semtypes = ['ABSTRACTS','ACTIONS','AMOUNTS','ANIMALS','BODYPART','CHRISTMAS','CLOTHES','CONTAINERS','CONVERSATION','EDUCATION','FAMILY','FEELINGS','GROUPS','HUMANS','HANDICRAFTS','ILLNESS','JOB','NATURE','OTHERS','PLACES','PLANTS','PROFESSION','SCHOOL','SOUNDS','SOUP','SUBJECTS','THINGS','TIME','TRAVELLING','WEATHER'] for type in semtypes: st=Semtype(semtype=type) st.save() """