#!/usr/bin/env python
"""Collect head elements from terms-xxx.xml files and emit a tab-separated
file (named by sys.argv[1]) suitable for import into the termwiki:
first field is the expression text, second field the wiki page content."""
import sys

import lxml.etree as etree

# Map the ISO-639-3 style codes used in the terms-xxx.xml filenames to
# language codes that wikimedia digests.
lan = {
    "eng": "en",
    "fin": "fi",
    "lat": "lat",
    "nor": "nb",
    "sma": "sma",
    "sme": "se",
    "smj": "smj",
    "smn": "smn",
    "sms": "sms",
    "swe": "sv",
}

# Normalise the pos attribute to the tags found in $GTHOME/gt/sme/src.
pos = {
    "A": "A",
    "Adjektiv": "A",
    "a": "A",
    "ABBR": "N",
    "Adv": "Adv",
    "adv": "Adv",
    "PP": "N",
    "Pron": "Pron",
    "S": "N",
    "s": "N",
    "d": "N",
    "V": "V",
    "v": "V",
}


def head2wiki(head):
    """Turn a head xml element into a string of wiki page content.

    The element must already carry normalised 'lang' and 'pos'
    attributes (set by collectAllExpressions).
    """
    return ('{{Expression'
            '|language=' + head.get('lang') +
            '|pos=' + head.get('pos') +
            '}}')


def collectAllExpressions():
    """Parse all terms-xxx.xml files and collect their head elements.

    Returns an 'expressions' element containing every head whose text
    is non-empty after stripping, with its lang/pos attributes
    normalised through the lan/pos maps.

    Raises KeyError if a head carries a pos value missing from the pos
    map, and whatever lxml raises if a terms-xxx.xml file is absent or
    malformed.
    """
    expressions = etree.Element("expressions")
    for lang in ["eng", "fin", "lat", "nor", "sma",
                 "sme", "smj", "smn", "sms", "swe"]:
        root = etree.parse("terms-" + lang + ".xml").getroot()
        for head in root.xpath('//head'):
            if head.text:
                head.set('lang', lan[lang])
                head.set('pos', pos[head.get('pos')])
                head.text = head.text.strip()
                if head.text != "":
                    expressions.append(head)
    return expressions


def printCSVFile(expressions):
    """Write the csv file named by sys.argv[1].

    First field is the expression (the text of head elements); the
    second field becomes the content of the expression page on
    termwiki. A progress dot is printed per expression.
    """
    # Bug fix: the file handle was previously never closed; the
    # with-statement guarantees close/flush even on error.
    with open(sys.argv[1], 'w') as tofile:
        for text in set(expressions.xpath('//head/text()')):
            tofile.write(text.encode('utf8'))
            tofile.write('\t')
            sys.stdout.write(".")
            # NOTE(review): text is interpolated into the xpath
            # unescaped — an expression containing '"' would break the
            # query. Left as-is; confirm against the source data.
            syns = expressions.xpath('//head[text()="' + text + '"]')
            if len(syns) > 1:
                # Deduplicate identical wiki snippets among synonyms.
                synset = set()
                for syn in syns:
                    synset.add(head2wiki(syn))
                for s in synset:
                    tofile.write(s.encode('utf8'))
            else:
                tofile.write(head2wiki(syns[0]).encode('utf8'))
            tofile.write('\n')
            sys.stdout.flush()


if __name__ == "__main__":
    # Bug fix: the original called printCSVFile(collectAllExpressions())
    # and then referenced the undefined name 'expressions' in its final
    # print, raising NameError. Bind the result once and reuse it.
    expressions = collectAllExpressions()
    printCSVFile(expressions)
    # Byte-identical to the old `print "\n", count` (newline, space,
    # number, newline), but valid syntax on both Python 2 and 3.
    count = len(set(expressions.xpath('//head/text()')))
    sys.stdout.write("\n " + str(count) + "\n")