# -*- coding: utf-8 -*- import settings from drills.models import * # from xml.dom import minidom as _dom # from django.db.models import Q import sys import os import re # import string import codecs # Using django settings paths, need to make these more central. # # _D = open('/dev/ttys005', 'w') _D = open('/dev/null', 'w') try: fstdir = settings.FST_DIRECTORY except: fstdir = "/opt/smi/sme/bin" try: lookup = settings.LOOKUP_TOOL except: lookup = "/usr/local/bin/lookup" try: language = settings.MAIN_LANGUAGE[0] except: language = "sme" numfst = fstdir + "/" + language + "-num.fst" class Entry: pass class Paradigm: def __init__(self): self.tagset = {} self.paradigms = {} def handle_tags(self, tagfile, add_db): # try: # with codecs.open(tagfile, "r", "utf-8" ) as fileObj: # tags = fileObj.readlines() # except IOError: # print >> sys.stderr, 'Could not open %s. Check paths?' % tagfile # sys.exit() fileObj = codecs.open(tagfile, "r", "utf-8" ) tags = fileObj.readlines() fileObj.close() classObj=re.compile(r'^#\s*(?P[\w\-]*)\s*$', re.U) stringObj=re.compile(r'^(?P[\w]*)\s*$', re.U) tagclass="" for line in tags: line.strip() matchObj=classObj.search(line) if matchObj: tagclass = matchObj.expand(r'\g') else: matchObj=stringObj.search(line) if matchObj: string = matchObj.expand(r'\g') self.tagset[string]=tagclass if add_db and tagclass and string: #print "adding " + tagclass + " " + string tagset, created = Tagset.objects.get_or_create(tagset=tagclass) pos, created = Tagname.objects.get_or_create(tagname=string,tagset=tagset) def read_paradigms(self, paradigmfile, tagfile, add_database): if not self.tagset: self.handle_tags(tagfile) fileObj = codecs.open(paradigmfile, "r", "utf-8" ) posObj = re.compile(r'^(?:\+)?(?P[\w]+)\+.*$', re.U) while True: line = fileObj.readline() if not line: break if not line.strip(): continue matchObj = posObj.search(line) if matchObj: pos = matchObj.expand(r'\g') try: if not self.paradigms.has_key(pos): self.paradigms[pos]=[] except UnboundLocalError: print >> sys.stderr, ' * Could not match pos. Check format of paradigm file.' print >> sys.stderr, ' * Error on line: %s' % line sys.exit() self.paradigms[pos].append(line) def create_paradigm(self, lemma, pos, forms): pos = pos.upper() if not self.tagset: self.handle_tags() self.paradigm = [] # TODO: is this preventing matching south sámi forms? # How can we do this so we don't need to constantly rewrite this to specify a new alphabet? # genObj_re = r'^(?P[\wáŋčžšđŧ]+)\+(?P[\w\+]+)[\t\s]+(?P[\wáŋčžšđŧ]*)$' genObj_re = r'^(?P[\w]+)\+(?P[\w\+]+)[\t\s]+(?P[\w]*)$' genObj=re.compile(genObj_re, re.U) lookups = "" if self.paradigms.has_key(pos): for a in self.paradigms[pos]: lookups = lookups + lemma + "+" + a # generator call # Moving paths up # fstdir = "/opt/smi/sme/bin" # lookup = "/usr/local/bin/lookup" gen_norm_fst = fstdir + "/i%s-norm.fst" % language # None of these dialects in sma # gen_gg_restr_fst = fstdir + "/isme-KJ.restr.fst" # gen_kj_restr_fst = fstdir + "/isme-GG.restr.fst" print >> _D, lookups.encode('utf-8') gen_norm_lookup = "echo \"" + lookups.encode('utf-8') + "\" | " + lookup + " -flags mbTT -utf8 -d " + gen_norm_fst # gen_gg_restr_lookup = "echo \"" + lookups.encode('utf-8') + "\" | " + lookup + " -flags mbTT -utf8 -d " + gen_gg_restr_fst # gen_kj_restr_lookup = "echo \"" + lookups.encode('utf-8') + "\" | " + lookup + " -flags mbTT -utf8 -d " + gen_kj_restr_fst # TODO: check where de/code is? lines_tmp = [a.decode('utf-8') for a in os.popen(gen_norm_lookup).readlines()] # lines_gg_restr_tmp = os.popen(gen_gg_restr_lookup).readlines() # lines_kj_restr_tmp = os.popen(gen_kj_restr_lookup).readlines() extraforms={} if forms: if forms.getElementsByTagName("form"): form_els = forms.getElementsByTagName("form") for f in form_els: tagstring = f.getAttribute("tag") wordform = f.firstChild.data extraforms[tagstring] = wordform print "adding extra wordform..", wordform for line in lines_tmp: if not line.strip(): continue matchObj=genObj.search(line) if matchObj: g = Entry() g.classes={} lemma = matchObj.expand(r'\g') g.form = matchObj.expand(r'\g') if re.compile("\?").match(g.form): continue g.tags = matchObj.expand(r'\g') for t in g.tags.split('+'): if self.tagset.has_key(t): tagclass=self.tagset[t] g.classes[tagclass]=t self.paradigm.append(g) #extraforms override generated ones if extraforms.has_key(g.tags): g.form=extraforms[g.tags] def generate_numerals(self): """ Generate all the cardinal numbers Create paradigms and store to db """ print >> _D, 'generate_numerals called' # Moving paths up # language = "sme" # #fstdir = "/opt/smi/" + language + "/bin" # #lookup = /usr/local/bin/lookup # # fstdir = "/Users/saara/gt/" + language + "/bin" # lookup = "/Users/saara/bin/lookup" # # numfst = fstdir + "/" + language + "-num.fst" for num in range(1,20): num_lookup = "echo \"" + str(num) + "\" | " + lookup + " -flags mbTT -utf8 -d " + numfst numerals = os.popen(num_lookup).readlines() # Take only first one. # Change this if needed! num_list=[] for num in numerals: line = num.strip() if line: nums = line.split('\t') num_list.append(nums[1].decode('utf-8')) numstring = num_list[0] w, created = Word.objects.get_or_create(wordid=num, lemma=numstring, pos="Num") w.save() self.create_paradigm(numstring, "Num") for form in self.paradigm: form.form = form.form.replace("#","") g=form.classes t,created=Tag.objects.get_or_create(string=form.tags,pos=g.get('Wordclass', ""),\ number=g.get('Number',""),case=g.get('Case',""),\ possessive=g.get('Possessive',""),grade=g.get('Grade',""),\ infinite=g.get('Infinite',""), \ personnumber=g.get('Person-Number',""),\ polarity=g.get('Polarity',""),\ tense=g.get('Tense',""),mood=g.get('Mood',""), \ subclass=g.get('Subclass',""), \ attributive=g.get('Attributive',"")) t.save() form, created = Form.objects.get_or_create(fullform=form.form,tag=t,word=w) form.save()