#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright © 2016-2018 The University of Tromsø &
# the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
"""Sort tags in lexc lines.
We are only interested in lexc lines that have two or more tags. Other
lines should go untouched.
"""
import fileinput
import glob
import os
import re
from collections import defaultdict
# Matches one lexc entry line and splits it into named parts.
# NOTE: the group names were lost in a previous edit (the pattern read
# "(?P^...)", which does not even compile); they are restored here to
# match the groupdict() keys consumed by is_interesting_line().
LEXC_LINE_RE = re.compile(r'''
(?P<exclam>^\s*!\s*)?          # optional comment
(?P<content>(<.+>)|(.+))?      # optional content
(?P<contlex_space>\s+)         # space between content and contlex
(?P<contlex>\S+)               # any nonspace
(?P<translation>\s+".*")?      # optional translation, might be empty
(?P<semicolon>\s*;\s*)         # semicolon and space surrounding it
(?P<comment>!.*)?              # followed by an optional comment
$
''', re.VERBOSE | re.UNICODE)
# One tag: a '+' followed by everything up to (excluding) the next '+'.
TAG = re.compile(r'''\+[^+]+''')
def is_interesting_line(line):
    """Return *line* with its tags sorted, or unchanged if not applicable.

    Only non-comment entry lines whose upper side carries two or more
    tags are rewritten; every other line is returned untouched.
    """
    # '% ' is lexc's escaped space; shield it with '%¥' so the regex
    # does not treat it as the content/contlex separator.
    match = LEXC_LINE_RE.match(line.replace('% ', '%¥'))
    if not match:
        return line

    groups = match.groupdict()
    if groups.get('exclam') or not groups.get('content'):
        return line

    content = groups['content'].replace('%¥', '% ')
    colon_pos = content.find(':')
    # Skip regex-lexicon entries (<...>) and lines without an upper:lower split.
    if colon_pos == -1 or (content.startswith('<') and content.endswith('>')):
        return line

    upper = content[:colon_pos]
    lower = content[colon_pos:]
    tags = TAG.findall(upper)
    if len(tags) < 2:
        return line

    # Rebuild the line: tag-free stem, sorted tags, then the remaining parts.
    rebuilt = [TAG.sub('', upper), sort_tags(tags), lower]
    for key in ('contlex_space', 'contlex', 'translation', 'semicolon',
                'comment'):
        part = groups.get(key)
        if part:
            rebuilt.append(part)
    return ''.join(rebuilt)
def sort_tags(tags):
    """Return *tags* joined into one string in canonical group order.

    Tags are bucketed (homonym, verb class, compounding, semantics,
    part of speech, rest) and emitted via valid_tags().

    Raises ValueError when more than one verb-class or homonym tag
    is present.
    """
    pos_tags = {
        '+N', '+A', '+Adv', '+V', '+Pron', '+CS', '+CC', '+Adp', '+Po',
        '+Pr', '+Interj', '+Pcle', '+Num'
    }
    buckets = defaultdict(list)
    for tag in tags:
        if tag in ('+NomAg', '+G3') or tag.startswith('+Hom'):
            bucket = 'Hom'
        elif tag.startswith('+v'):
            bucket = 'v'
        elif tag.startswith('+CmpNP'):
            bucket = 'CmpNP'
        elif tag.startswith('+CmpN'):
            bucket = 'CmpN'
        elif tag.startswith('+Sem'):
            bucket = 'Sem'
        elif tag in pos_tags:
            bucket = 'Pos'
        else:
            bucket = 'resten'
        buckets[bucket].append(tag)
    if len(buckets['v']) > 1:
        raise ValueError('too many v')
    if len(buckets['Hom']) > 1:
        raise ValueError('too many hom')
    return ''.join(valid_tags(buckets))
def valid_tags(tagsets):
    """Yield all bucketed tags in the canonical group order."""
    for group in ('v', 'Hom', 'Pos', 'Sem', 'CmpN', 'CmpNP', 'resten'):
        yield from tagsets.get(group) or ()
def stemroots():
    """Yield the stems directory of each supported language under $GTHOME.

    NOTE(review): if GTHOME is unset, os.getenv returns None and
    os.path.join raises TypeError — presumably the script is only run
    with GTHOME exported; confirm before hardening.
    """
    languages = (
        'chp', 'cor', 'deu', 'est', 'fin', 'hdn', 'kal', 'koi', 'kpv',
        'mdf', 'mhr', 'myv', 'nob', 'olo', 'sje', 'sma', 'sme', 'smj',
        'smn', 'sms', 'som', 'vro',
    )
    for language in languages:
        yield os.path.join(
            os.getenv('GTHOME'), 'langs', language, 'src/morphology/stems/')
def filenames():
    """Yield every .lexc file found directly inside each stems directory."""
    for stems_dir in stemroots():
        yield from glob.glob(stems_dir + '*.lexc')
def main():
    """Rewrite every stem .lexc file in place with its tags sorted.

    Prints each processed filename to the real stdout before editing it
    (fileinput redirects stdout into the file only while iterating).
    """
    for filename in filenames():
        print(filename)
        # Fix: the original never closed the FileInput object, leaking the
        # file handle and leaving the in-place replacement to GC timing.
        with fileinput.input(filename, inplace=True) as lines:
            for line in lines:
                # Strip the single trailing newline; print() adds it back.
                print(is_interesting_line(
                    line[:-1] if line.endswith('\n') else line))
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()