#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this file. If not, see <http://www.gnu.org/licenses/>.
#
#   Copyright © 2016-2018 The University of Tromsø &
#                         the Norwegian Sámi Parliament
#   http://giellatekno.uit.no & http://divvun.no
#
"""Sort tags in lexc lines.

We are only interested in lexc lines that have two or more tags.
Other lines should go untouched.
"""

import fileinput
import glob
import os
import re
from collections import defaultdict

LEXC_LINE_RE = re.compile(r'''
    (?P<exclam>^\s*!\s*)?          # optional comment
    (?P<content>(<.+>)|(.+))?      # optional content
    (?P<contlex_space>\s+)         # space between content and contlex
    (?P<contlex>\S+)               # any nonspace
    (?P<translation>\s+".*")?      # optional translation, might be empty
    (?P<semicolon>\s*;\s*)         # semicolon and space surrounding it
    (?P<comment>!.*)?              # followed by an optional comment
    $
''', re.VERBOSE | re.UNICODE)

TAG = re.compile(r'''\+[^+]+''')


def is_interesting_line(line):
    """Sort the tags of a lexc line if it has two or more tags."""
    lexc_match = LEXC_LINE_RE.match(line.replace('% ', '%¥'))
    if lexc_match:
        groupdict = lexc_match.groupdict()
        if not groupdict.get('exclam') and groupdict.get('content'):
            content = groupdict.get('content').replace('%¥', '% ')
            lexc_line_match = content.find(':')

            if (not (content.startswith('<') and content.endswith('>'))
                    and lexc_line_match != -1):
                upper = content[:lexc_line_match]
                lower = content[lexc_line_match:]
                tags = TAG.findall(upper)
                if len(tags) > 1:
                    new_parts = [TAG.sub('', upper), sort_tags(tags), lower]
                    new_parts.extend([
                        groupdict[key] for key in [
                            'contlex_space', 'contlex', 'translation',
                            'semicolon', 'comment'
                        ] if groupdict.get(key)
                    ])
                    return ''.join(new_parts)

    return line


def sort_tags(tags):
    """Group the tags, then return them joined in the canonical order."""
    tagsets = defaultdict(list)
    for tag in tags:
        if tag in ['+NomAg', '+G3'] or tag.startswith('+Hom'):
            tagsets['Hom'].append(tag)
        elif tag.startswith('+v'):
            tagsets['v'].append(tag)
        elif tag.startswith('+CmpNP'):
            tagsets['CmpNP'].append(tag)
        elif tag.startswith('+CmpN'):
            tagsets['CmpN'].append(tag)
        elif tag.startswith('+Sem'):
            tagsets['Sem'].append(tag)
        elif tag in [
                '+N', '+A', '+Adv', '+V', '+Pron', '+CS', '+CC', '+Adp',
                '+Po', '+Pr', '+Interj', '+Pcle', '+Num'
        ]:
            tagsets['Pos'].append(tag)
        else:
            tagsets['resten'].append(tag)

    if len(tagsets['v']) > 1:
        raise ValueError('too many v')
    if len(tagsets['Hom']) > 1:
        raise ValueError('too many hom')

    return ''.join(valid_tags(tagsets))


def valid_tags(tagsets):
    """Yield the tags group by group, in the wanted order."""
    for tag_group in ['v', 'Hom', 'Pos', 'Sem', 'CmpN', 'CmpNP', 'resten']:
        if tagsets.get(tag_group):
            for tag in tagsets[tag_group]:
                yield tag


def stemroots():
    """Yield the stems directory of each language found in $GTHOME."""
    for lang in [
            'chp', 'cor', 'deu', 'est', 'fin', 'hdn', 'kal', 'koi', 'kpv',
            'mdf', 'mhr', 'myv', 'nob', 'olo', 'sje', 'sma', 'sme', 'smj',
            'smn', 'sms', 'som', 'vro'
    ]:
        yield os.path.join(
            os.getenv('GTHOME'), 'langs', lang, 'src/morphology/stems/')


def filenames():
    """Yield every lexc file found in the stems directories."""
    for stemroot in stemroots():
        for filename in glob.glob(stemroot + '*.lexc'):
            yield filename


def main():
    """Sort the tags of all lexc lines in the stem files, in place."""
    for filename in filenames():
        print(filename)
        for line in fileinput.input(filename, inplace=True):
            print(
                is_interesting_line(line[:-1] if line[-1] == '\n' else line))


if __name__ == '__main__':
    main()
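

# A minimal usage sketch (not part of the original script): the lexc entries
# below are illustrative only, and the continuation lexicon name HOUSE is a
# made-up placeholder. With two or more tags on the upper side,
# is_interesting_line() rewrites the line with the tags in canonical order
# (v, Hom, Pos, Sem, CmpN, CmpNP, rest); other lines come back untouched.
#
#   is_interesting_line('viessu+Sem/Build+N:viessu HOUSE ;')
#   -> 'viessu+N+Sem/Build:viessu HOUSE ;'
#
#   is_interesting_line('viessu+N:viessu HOUSE ;')
#   -> 'viessu+N:viessu HOUSE ;'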