#!/usr/bin/env python
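"""Collect the head elements of the terms-xxx.xml files in the current
directory and write them to a tab separated file that can be imported into
the termwiki.

The name of the output file is taken from the first command line argument.
"""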

import lxml.etree as etree
import sys

# Map the language codes used in the file names to language codes that
# Wikimedia accepts
lan = {
    "eng": "en",
    "fin": "fi",
    "lat": "lat",
    "nor": "nb",
    "sma": "sma",
    "sme": "se",
    "smj": "smj",
    "smn": "smn",
    "sms": "sms",
    "swe": "sv",
}

# Map the pos values to the ones found in $GTHOME/gt/sme/src
pos = {
    "A": "A",
    "Adjektiv": "A",
    "a": "A",
    "ABBR": "N",
    "Adv": "Adv",
    "adv": "Adv",
    "PP": "N",
    "Pron": "Pron",
    "S": "N",
    "s": "N",
    "d": "N",
    "V": "V",
    "v": "V",
}


def head2wiki(head):
    """Receives a head xml element and turns it into a string containing
    wiki page content.
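
    A head with lang="se" and pos="N", for example, is turned into the
    string '{{Expression|language=se|pos=N}}'.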
    """
    r = '{{Expression'
    r = r + '|language=' + head.get('lang')
    r = r + '|pos=' + head.get('pos')
    r = r + '}}'

    return r

def collectAllExpressions():
    """Parses all terms-xxx.xml files, collect all head elements
    Returns the collected head elements in the expressions variable
    """
    # A dict that holds etrees containing the terms-lang.xml files
    tree = {}

    # An etree containing all head elements from the terms-xxx.xml files
    expressions = etree.Element("expressions")

    for lang in ["eng", "fin", "lat", "nor", "sma", "sme", "smj", "smn", "sms", "swe"]:
        tree[lang] = etree.parse("terms-" + lang + ".xml")
        root = tree[lang].getroot()
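        # Collect every head element with non-empty text; appending a head
        # to expressions moves it out of its source tree, since an lxml
        # element can only have one parent.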
        for head in root.xpath('//head'):
            if head.text:
                head.set('lang', lan[lang])
                head.set('pos', pos[head.get('pos')])
                head.text = head.text.strip()
                if head.text != "":
                    expressions.append(head)

    return expressions

def printCSVFile(expressions):
    """Prints a csv file where the first field is the expression
    (the text of head elements).
    The second field will become the content of the expression page on termwiki
    """
    tofile = open(sys.argv[1], 'w')
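
    # One line per unique expression text is written: the expression, a tab,
    # and then one {{Expression}} template for each distinct language/pos
    # combination found for that expression.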

    for text in set(expressions.xpath('//head/text()')):
        tofile.write(text.encode('utf8'))
        tofile.write('\t')
        sys.stdout.write(".")
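        # Gather every head element carrying exactly this text; when there
        # is more than one, duplicate {{Expression}} templates are collapsed
        # through a set so each language/pos pair is written only once.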
        syns = expressions.xpath('//head[text()="' + text + '"]')
        if len(syns) > 1:
            synset = set()
            for syn in syns:
                synset.add(head2wiki(syn))
            for s in synset:
                tofile.write(s.encode('utf8'))
        else:
            tofile.write(head2wiki(syns[0]).encode('utf8'))
        tofile.write('\n')
        sys.stdout.flush()

    tofile.close()

expressions = collectAllExpressions()
printCSVFile(expressions)
# Report the number of unique expressions that were written
print "\n", len(set(expressions.xpath('//head/text()')))
