# -*- coding: utf-8 -*-
import settings
from django.db.models import Q
from xml.dom import minidom as _dom
from django.utils.encoding import force_unicode
import sys

# Word, Form, Tag, Source, Semtype
from drills.models import (Word, Wordnob, Wordswe, Wordsme, Wordeng, Worddeu,
                           Form, Tag, Source, Semtype, Dialect)

# import re
# import string
# import codecs

# For easier debugging.
# _D = open('/dev/ttys005', 'w')
_D = open('/dev/null', 'w')

# Lexicons: words

# TODO: 1. How many queries is this running?
# @cip: The answer to this question (which is one of very many I would have
#       asked Saara) should be in the maintenance document on the web.
# TODO: 2. Doing all of this in one single transaction would be good.
# @cip: I would like to do so, but after the delete-entry bug I am not so sure.
# @ryan: Musing while waiting for a database to be populated... What was the bug?
#        I tested with one big transaction just to see, and it sped things up a
#        bit... But I think the major slowdown is from words being generated
#        incrementally. It would be much faster to scan for all the words,
#        generate them in one go, and store them in a data structure for easy
#        access later, so that `lookup` only has to be called once. `lookup`
#        seems to take a good deal of time to start up, but then processes
#        much faster, of course. Thus, transactions probably don't matter as
#        much as eliminating the incremental generation would.
#        ... Maybe I'll try this out when there's some free time.
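# A minimal sketch of the "scan everything once, keep it in a data structure"
# idea from the note above. `collect_entry_pos` is a hypothetical helper and is
# not wired into the pipeline; it assumes the <e>/<lg>/<l> layout used below.
def collect_entry_pos(tree):
    """Map each entry's word id (or lemma) to its part of speech."""
    entries = {}
    for e in tree.getElementsByTagName("e"):
        l = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0]
        wid = e.getAttribute("id") or (l.firstChild and l.firstChild.data)
        entries[wid] = l.getAttribute("pos")
    return entries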
# def hasApp(elem):
#     apps = elem.getElementsByTagName("app")
#
#     for a in apps:
#         name = a.getAttribute("name")
#         if name and name == "oahpa":
#             return True
#
#     return False
#
# def getPOS(elem):
#     return elem.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].getAttribute("pos")

from django.db import transaction

class Words:

    @transaction.commit_manually
    def install_lexicon(self, infile, linginfo, delete=None, paradigmfile=None):
        """Parse the lexicon XML in `infile` and store its entries."""
        # xmlfile = file(infile)  # never used
        tree = _dom.parse(infile)
        lex = tree.getElementsByTagName("r")[0]
        mainlang = lex.getAttribute("xml:lang")
        print >> sys.stdout, "Mainlang defined ", mainlang
        if not mainlang:
            print >> sys.stderr, "Attribute mainlang not defined in", infile, "stop."
            sys.exit()

        self.all_wordids = []
        es = tree.getElementsByTagName("e")
        total = len(es)
        count = 0
        for e in es:
            hasApp = False
            apps = e.getElementsByTagName("app")
            for a in apps:
                name = a.getAttribute("name")
                if name and name == "oahpa":
                    hasApp = True

            pos = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].getAttribute("pos")
            # Uppercase POS.
            pos = pos.upper()
            if pos.startswith('PHRASE_'):
                pos = pos.replace('PHRASE_', '')  # Just in case we have a longer POS.

            # this should be checked, too
            # semantics = e.getElementsByTagName("semantics")[0]
            # elements = semantics.getElementsByTagName("sem")

            if hasApp:
                if pos:
                    print >> sys.stdout, "pos defined ", pos
                    self.store_word(e=e, linginfo=linginfo, mainlang=mainlang,
                                    delete=delete, paradigmfile=paradigmfile)
                else:
                    try:
                        __data = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].firstChild.data
                    except AttributeError:
                        __data = 'None'
                    print >> sys.stdout, "undefined pos for ", __data
            else:
                try:
                    __data = e.getElementsByTagName("lg")[0].getElementsByTagName("l")[0].firstChild.data
                except AttributeError:
                    __data = 'None'
                print >> sys.stdout, "NON-oahpa; empty lemma and no pos defined", __data

            count += 1
            print >> sys.stdout, '--- %d/%d entries processed' % (count, total)

        if delete and pos:
            # Remove database words of this POS that no longer occur in the XML.
            allids = Word.objects.filter(
                Q(pos=pos) & ~Q(semtype__semtype="PLACE-NAME-LEKSA")
            ).values_list('wordid', flat=True)
            existing = set(self.all_wordids)
            for a in allids:
                if force_unicode(a) not in existing:
                    print >> sys.stdout, "Word id not found in xml. Deleting:", a
                    word = Word.objects.get(pos=pos, wordid=a)
                    word.delete()

        transaction.commit()

    def add_translation(self, el, w, pos):
        """Store one translation element `el` and link it to word `w`."""
        if el.firstChild:
            lemma = phrase = explanation = False
            if el.tagName == 't':
                translation = lemma = el.firstChild.data
            if el.tagName == 'tf':
                translation = phrase = el.firstChild.data
            if el.tagName == 'te':
                translation = explanation = el.firstChild.data

            # TODO: For the moment the translations and explanations
            # are only in effect for nob.
            lang = el.getAttribute("xml:lang")
            pos = pos.upper()
            if lang == "sma":
                if Word.objects.filter(wordid=translation, pos__iexact=pos).count() > 0:
                    transl = Word.objects.filter(wordid=translation, pos__iexact=pos)[0]
                else:
                    # get_or_create() cannot create through an __iexact lookup,
                    # so use the plain field here.
                    transl, created = Word.objects.get_or_create(wordid=translation, pos=pos)
                    if created:
                        transl.lemma = translation
                        transl.save()
                # Add reference to the new word object as translation.
                w.translations.add(transl)
                w.save()
            else:
                if lang == "sme":
                    transl, created = Wordsme.objects.get_or_create(wordid=translation)
                    if created:
                        transl.lemma = translation
                        transl.save()
                    w.translations2sme.add(transl)
                    w.save()
                elif lang == "deu":
                    transl, created = Worddeu.objects.get_or_create(wordid=translation)
                    if created:
                        transl.lemma = translation
                        transl.save()
                    w.translations2deu.add(transl)
                    w.save()
                elif lang == "eng":
                    transl, created = Wordeng.objects.get_or_create(wordid=translation)
                    if created:
                        transl.lemma = translation
                        transl.save()
                    w.translations2eng.add(transl)
                    w.save()
                    # Special treatment for the to-infinitive: also store the
                    # verb without the infinitive marker. lstrip() strips a
                    # character set and would also eat e.g. the start of "tow",
                    # so strip the marker as a prefix instead.
                    if pos == "V":
                        oo = "to".decode('utf8')
                        if translation.startswith(oo + " "):
                            wordform = translation[len(oo) + 1:]
                        else:
                            wordform = translation
                        transl, created = Wordeng.objects.get_or_create(wordid=wordform)
                        if created:
                            transl.lemma = wordform
                            transl.save()
                        # Add reference to the new word object as translation.
                        w.translations2eng.add(transl)
                        w.save()
                elif lang == "nob":
                    transl, created = Wordnob.objects.get_or_create(wordid=translation)
                    if created:
                        if lemma:
                            transl.lemma = lemma
                        elif phrase:
                            transl.phrase = phrase
                        elif explanation:
                            transl.explanation = explanation
                        transl.save()
                    w.translations2nob.add(transl)
                    w.save()
                    # Special treatment for the å-infinitive, stripped as a
                    # prefix for the same reason as above.
                    if pos == "V":
                        oo = "å".decode('utf8')
                        if translation.startswith(oo + " "):
                            wordform = translation[len(oo) + 1:]
                        else:
                            wordform = translation
                        transl, created = Wordnob.objects.get_or_create(wordid=wordform)
                        if created:
                            transl.lemma = wordform
                            transl.save()
                        # Add reference to the new word object as translation.
                        w.translations2nob.add(transl)
                        w.save()
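    # Note: the get_or_create()-then-save() pattern above can fill the lemma in
    # a single round trip with Django's `defaults` argument; a sketch, assuming
    # the same field names:
    #
    #     transl, created = Wordnob.objects.get_or_create(
    #         wordid=translation, defaults={'lemma': translation})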
    def add_semantics(self, e, w):
        """Attach semantic classes found in entry `e` to word `w`."""
        semantics = e.getElementsByTagName("semantics")[0]
        elements = semantics.getElementsByTagName("sem")
        for el in elements:
            sem = el.getAttribute("class")
            if sem:
                print >> sys.stdout, "Semantic cls: ", sem
                # Add semantics entry if not found.
                # Leave this if DTD is used.
                sem_entry, created = Semtype.objects.get_or_create(semtype=sem)
                if created:
                    print >> sys.stdout, "Created semtype entry with name ", sem
                w.semtype.add(sem_entry)
                w.save()

    def add_sources(self, e, w):
        """Attach book sources found in entry `e` to word `w`."""
        sources = e.getElementsByTagName("sources")[0]
        elements = sources.getElementsByTagName("book")
        for el in elements:
            book = el.getAttribute("name")
            if book:
                # Add book to the database.
                # Leave this if DTD is used.
                book_entry, created = Source.objects.get_or_create(name=book)
                if created:
                    print >> sys.stdout, "Created book entry with name ", book
                w.source.add(book_entry)
                w.save()

    def store_word(self, e, linginfo, mainlang, paradigmfile, delete):
        """Store one lexicon entry `e`, its forms, and its translations."""
        # Store first unique fields.
        wid = e.getAttribute("id")
        lemma = e.getElementsByTagName("l")[0].firstChild.data
        if not wid:
            wid = lemma
        self.all_wordids.append(wid)

        stem = ""
        forms = ""
        diphthong = "no"
        gradation = ""
        rime = ""
        wordclass = ""
        if e.getElementsByTagName("l")[0].getAttribute("class"):
            wordclass = e.getElementsByTagName("l")[0].getAttribute("class")
            print >> sys.stdout, wordclass

        attrsuffix = ""
        soggi = ""
        valency = ""
        compare = ""
        frequency = ""
        geography = ""
        presentationform = ""
        only_sg = 0
        only_pl = 0
        noleksa = 0

        if e.getElementsByTagName("forms"):
            forms = e.getElementsByTagName("forms")[0]
        if e.getElementsByTagName("presentationform"):
            presentationform = e.getElementsByTagName("presentationform")[0].firstChild.data
        if e.getElementsByTagName("stem"):
            stem = e.getElementsByTagName("stem")[0].getAttribute("class")
            diphthong = e.getElementsByTagName("stem")[0].getAttribute("diphthong")
            gradation = e.getElementsByTagName("stem")[0].getAttribute("gradation")
            rime = e.getElementsByTagName("stem")[0].getAttribute("rime")
            if rime == "0":
                rime = "norime"
            soggi = e.getElementsByTagName("stem")[0].getAttribute("soggi")
            compare = e.getElementsByTagName("stem")[0].getAttribute("compare")
            attrsuffix = e.getElementsByTagName("stem")[0].getAttribute("attrsuff")
            if attrsuffix == "0":
                attrsuffix = "noattr"
        if e.getElementsByTagName("frequency"):
            frequency = e.getElementsByTagName("frequency")[0].getAttribute("class")
        if e.getElementsByTagName("geography"):
            geography = e.getElementsByTagName("geography")[0].getAttribute("class")
        if e.getElementsByTagName("only-sg"):
            only_sg = 1
        if e.getElementsByTagName("only-pl"):
            only_pl = 1
        if e.getElementsByTagName("noleksa"):
            noleksa = 1
        if e.getElementsByTagName("valency"):
            valencies = e.getElementsByTagName("valency")[0]
            for val in valencies.getElementsByTagName("val"):
                valency = val.getAttribute("class")
                if valency:
                    break

        # Part of speech information.
        pos = e.getElementsByTagName("l")[0].getAttribute("pos")
        if pos.startswith('phrase_'):
            # Strip the phrase_ prefix; the original [7:8] kept only one character.
            pos = pos[len('phrase_'):]
        # if not pos:
        #     print "Part of speech information not found for ", lemma, ". give it command line: --pos=N"
        #     sys.exit()

        # Search for existing word in the database.
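        # The if/elif chain below could also be table-driven; a sketch, assuming
        # the same model classes (the generic Word case would also need pos=pos):
        #
        #     MODELS = {"nob": Wordnob, "swe": Wordswe, "sme": Wordsme,
        #               "eng": Wordeng, "deu": Worddeu}
        #     w, created = MODELS.get(mainlang, Word).objects.get_or_create(wordid=wid)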
        w = None
        # print 'bryllup, wordid/wid: ', wid
        # raw_input()
        if mainlang == "nob":
            w, created = Wordnob.objects.get_or_create(wordid=wid)
        elif mainlang == "swe":
            w, created = Wordswe.objects.get_or_create(wordid=wid)
        elif mainlang == "sme":
            w, created = Wordsme.objects.get_or_create(wordid=wid)
        elif mainlang == "eng":
            w, created = Wordeng.objects.get_or_create(wordid=wid)
        elif mainlang == "deu":
            w, created = Worddeu.objects.get_or_create(wordid=wid)
        else:
            w, created = Word.objects.get_or_create(wordid=wid, pos=pos)

        w.wordclass = wordclass
        w.pos = pos
        w.lemma = lemma
        w.presentationform = presentationform
        w.stem = stem
        w.rime = rime
        w.compare = compare
        w.attrsuffix = attrsuffix
        w.soggi = soggi
        w.gradation = gradation
        w.diphthong = diphthong
        w.valency = valency
        w.frequency = frequency
        w.geography = geography
        w.save()

        # Add forms and tags.
        if paradigmfile:
            linginfo.create_paradigm(lemma=lemma, pos=pos, forms=forms)

            # Remove old forms.
            old_forms = Form.objects.filter(word=w)
            for f in old_forms:
                f.delete()

            for f in linginfo.paradigm:
                g = f.classes
                # Skip comparative/superlative forms for non-comparing adjectives.
                if w.pos == "A" and w.compare == "no" and \
                   (g.get('Grade') == "Comp" or g.get('Grade') == "Superl"):
                    continue
                t, created = Tag.objects.get_or_create(string=f.tags,
                                                       pos=g.get('Wordclass', ""),
                                                       number=g.get('Number', ""),
                                                       case=g.get('Case', ""),
                                                       possessive=g.get('Possessive', ""),
                                                       grade=g.get('Grade', ""),
                                                       infinite=g.get('Infinite', ""),
                                                       personnumber=g.get('Person-Number', ""),
                                                       polarity=g.get('Polarity', ""),
                                                       tense=g.get('Tense', ""),
                                                       mood=g.get('Mood', ""),
                                                       subclass=g.get('Subclass', ""),
                                                       attributive=g.get('Attributive', ""))
                t.save()
                form = Form(fullform=f.form, tag=t, word=w)
                print >> sys.stdout, "Created form: ", f.form
                form.save()
                try:
                    print >> _D, f.dialects
                    # The original fell back to an undefined `dialects` name when
                    # there was more than one dialect; copy f.dialects in both cases.
                    dialects2 = f.dialects[:]
                    for d in dialects2:
                        dia, created = Dialect.objects.get_or_create(dialect=d)
                        form.dialects.add(dia)
                        form.save()
                except AttributeError:
                    print >> _D, "No dialects specified"
                form.save()

        if only_sg:
            print >> sys.stdout, "deleting plural forms for", w.lemma
            Form.objects.filter(Q(word=w.id) & Q(tag__number="Pl")).delete()
        if only_pl:
            print >> sys.stdout, "deleting singular forms for", w.lemma
            # The original was missing the call parentheses here, so nothing was deleted.
            Form.objects.filter(Q(word=w.id) & Q(tag__number="Sg")).delete()

        if noleksa:
            print >> sys.stdout, "word not in leksa", w.lemma
            w.leksa = 0
        else:
            w.leksa = 1
        w.save()  # persist the leksa flag

        if e.getElementsByTagName("sources"):
            self.add_sources(e, w)
        if e.getElementsByTagName("semantics"):
            self.add_semantics(e, w)

        # Add translations. minidom is ridiculous: there is no way to select
        # just the direct t/tf/te children, so filter childNodes by tag name.
        translations = e.getElementsByTagName("tg")[0]
        elements = []
        for node in translations.childNodes:
            try:
                tagname = node.tagName
            except AttributeError:
                continue
            if tagname.startswith('t'):
                elements.append(node)
        # elements = translations.getElementsByTagName("t")
        for el in elements:
            self.add_translation(el, w, pos)

    def delete_word(self, wid=None, pos=None):
        """Delete the word with id `wid` and part of speech `pos`."""
        if not pos:
            print "specify the part of speech with option -p"

        # to debug and fix: delete word routine
        # wordruss = Wordrus.objects.filter(wordid=wid)
        # for w in wordruss:
        #     print "Removing", w.wordid
        #     w.delete()

        if wid and pos:
            words = Word.objects.filter(wordid=wid, pos=pos)
            for w in words:
                print >> sys.stdout, "Removing", w.wordid
                w.delete()
            if not words:
                print wid, "not found"
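# Typical driver code lives in the calling management command; a hypothetical
# invocation for reference only (the file name and `linginfo` object are
# placeholders):
#
#     words = Words()
#     words.install_lexicon(infile='nouns.xml', linginfo=linginfo,
#                           delete=True, paradigmfile='paradigm.txt')
#     words.delete_word(wid=u'giella', pos='N')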