#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert mekanikk-1999 to TermWiki."""
import os

from lxml import etree

from termwikiimporter import read_termwiki

# Directory holding the terms-xxx.xml input files; pages.xml is written here.
# GTHOME must be set in the environment: os.getenv() returns None otherwise
# and os.path.join() then raises TypeError at import time.
SRCDIR = os.path.join(os.getenv('GTHOME'), 'words/terms/mekanikk-1999/newsrc')


def read_terms(termfile, title_index: dict, language: str) -> None:
    """Read a terms-xxx.xml file.

    Every ``sense`` element carrying an ``idref`` attribute is merged into
    the concept stored under that idref in ``title_index``; a new
    ``read_termwiki.Concept`` is created when the idref is seen for the
    first time.

    Args:
        termfile: parsed XML tree that is searched for ``sense`` elements.
        title_index: mapping from idref to concept, updated in place.
        language: language code recorded on the definitions and on the
            expressions that carry no bracket marker.
    """
    for sense in termfile.iter('sense'):
        identifier = sense.get('idref')
        if identifier is None:
            # Senses without an idref cannot be tied to a concept.
            continue

        # Only build a new Concept when this idref has not been seen yet.
        concept = title_index.get(identifier)
        if concept is None:
            concept = read_termwiki.Concept()

        category = sense.get('class')
        if category:
            concept.data['concept']['category'] = category

        if not concept.data['concept'].get('collection'):
            concept.data['concept']['collection'] = set()
        concept.data['concept']['collection'].add('Mekanikk-1999')

        definition = sense.find('.//def')
        if definition is not None and definition.text is not None:
            concept.data['concept_infos'].append({
                'language': language,
                # Collapse all runs of whitespace to single spaces.
                'definition': ' '.join(definition.text.split()),
            })

        # The expressions live in the <head> found under the sense's
        # grandparent, as a comma separated list.
        # NOTE(review): assumes that element exists and has text -- a missing
        # or empty <head> raises AttributeError here; confirm against data.
        head = sense.getparent().getparent().find('.//head')
        for exp in head.text.split(','):
            exp = exp.strip()
            lang = language
            if '[' in exp:
                # A bracket marker overrides the file language: '[b]' means
                # 'nb', any other bracket is treated as 'nn'.
                lang = 'nb' if '[b]' in exp else 'nn'
                # Drop the trailing three-character marker, then the
                # whitespace that separated it from the term.
                exp = exp[:-3].strip()
            concept.clean_up_expression({
                'language': lang,
                'expression': exp,
                'sanctioned': 'True',
            })

        title_index[identifier] = concept


def read_sdterm() -> dict:
    """Read term entries from SD-terms files.

    Returns:
        Mapping from idref to the concept collected from all language files.
    """
    title_index = {}

    # Change the given languages to something wikimedia digests
    langs = {
        'eng': 'en',
        'fin': 'fi',
        'nor': 'nb',
        'sme': 'se',
        'swe': 'sv',
    }

    for file_lang, wiki_lang in langs.items():
        filename = os.path.join(SRCDIR, f'terms-{file_lang}.xml')
        read_terms(etree.parse(filename), title_index, wiki_lang)

    return title_index


def write_pages() -> None:
    """Write the content of the mekanikk-1999 files to a tw like format.

    Builds a ``pages`` element with one ``page`` child per concept (its
    ``title`` attribute is the idref, its ``concept`` child holds
    ``str(concept)``), prints the number of concepts and writes the tree to
    ``pages.xml`` in SRCDIR.
    """
    pages = etree.Element('pages')
    title_index = read_sdterm()
    print(len(title_index))

    for title, concept in title_index.items():
        page = etree.SubElement(pages, 'page')
        page.set('title', title)
        xml_concept = etree.SubElement(page, 'concept')
        xml_concept.text = str(concept)

    # Explicit UTF-8: the serialised tree is a str that may hold non-ASCII
    # characters, and the platform default encoding is not guaranteed to
    # be able to represent them.
    with open(os.path.join(SRCDIR, 'pages.xml'), 'w',
              encoding='utf-8') as to_file:
        to_file.write(
            etree.tostring(pages, pretty_print=True, encoding='unicode'))


write_pages()