#!/usr/bin/env python3
"""Convert the entries of ``gt_rapl-ril.xml`` into ``pages.xml``.

Every ``<e>`` element of the input becomes one ``<page>`` element whose
``<concept>`` child holds the string form of a ``read_termwiki.Concept``
built from the entry's ``<l>`` element and its ``<tg>``/``<t>`` elements.
The conversion runs when the module is executed (or imported).
"""
from lxml import etree

from termwikiimporter import read_termwiki

# Maps the xml:lang values found on <tg> elements to the language codes
# stored on the generated expressions.
LANGS = {
    "fin": "fi",
    "sme": "se",
}

# Clark-notation name of the ``xml:lang`` attribute.
_XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'


def l_to_expression(lang, lemma_element):
    """Yield one expression dict per comma-separated term of an element.

    Args:
        lang: Language code stored under the ``language`` key.
        lemma_element: Element whose text holds the comma-separated terms
            and whose ``pos`` attribute is copied to every expression.
            May be ``None`` (e.g. the result of a failed ``find``).

    Yields:
        Dicts with the keys ``expression``, ``pos``, ``language`` and
        ``sanctioned`` (always the string ``'True'``).
    """
    # A missing element or an element without text carries no terms;
    # yield nothing instead of failing with AttributeError on ``None``.
    if lemma_element is None or not lemma_element.text:
        return

    pos = lemma_element.get('pos')
    for fragment in lemma_element.text.split(','):
        exp = fragment.strip()
        if not exp:
            # Stray or trailing commas would otherwise produce expressions
            # with an empty term.
            continue
        yield {
            'expression': exp,
            'pos': pos,
            'language': lang,
            'sanctioned': 'True',
        }


def entry2concept(entry):
    """Build a Concept from one ``<e>`` entry and return its string form.

    The first ``<l>`` descendant supplies the ``nb`` expressions; the first
    ``<t>`` descendant of every ``<tg>`` supplies the expressions for the
    language named by that group's ``xml:lang`` attribute.

    Args:
        entry: The ``<e>`` element to convert.

    Returns:
        ``str(concept)`` for the populated ``read_termwiki.Concept``
        (presumably the TermWiki page text -- confirm against
        ``Concept.__str__``).

    Raises:
        KeyError: If a ``<tg>`` has an ``xml:lang`` value (or lacks one)
            that is not a key of ``LANGS``.
    """
    concept = read_termwiki.Concept()
    concept.data['concept']['collection'] = {'JustermTana'}

    for expression in l_to_expression('nb', entry.find('.//l')):
        concept.related_expressions.append(expression)

    for translation_group in entry.iter('tg'):
        lang = LANGS[translation_group.get(_XML_LANG)]
        # NOTE(review): only the first <t> of each group is read; confirm
        # that a <tg> never carries more than one <t>.
        for expression in l_to_expression(
                lang, translation_group.find('.//t')):
            concept.related_expressions.append(expression)

    return str(concept)


TREE = etree.parse('gt_rapl-ril.xml')
PAGES = etree.Element('pages')
for index, entry in enumerate(TREE.getroot().iter('e')):
    page = etree.SubElement(PAGES, 'page')
    page.set('title', f'Juridihkka:JustermTana {index}')
    concept = etree.SubElement(page, 'concept')
    concept.text = entry2concept(entry)

# ``tostring(..., encoding='unicode')`` returns a str without an XML
# declaration, so the file must be written as UTF-8 explicitly rather than
# in whatever encoding the current locale happens to default to.
with open('pages.xml', 'w', encoding='utf-8') as to_file:
    to_file.write(
        etree.tostring(PAGES, pretty_print=True, encoding='unicode'))