#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright © 2016 The University of Tromsø & the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
from __future__ import absolute_import, print_function
import argparse
import codecs
import io
import re
import sys
import unittest
from collections import defaultdict
from io import open
class TestLines(unittest.TestCase):
    """Test that Lines parses lexc lines and realigns their columns."""

    def test_non_lexc_line(self):
        """A line that is not a lexc entry is passed through untouched."""
        input = u'''
abb ; babb
'''
        expected_result = u'''
abb ; babb
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_longest(self):
        """The widest string seen per column is recorded in ``longest``."""
        input = u'''
+N+Sg: N_ODD_SG ;
+N+Pl: N_ODD_PL ;
+N: N_ODD_ESS ;
+N+SgNomCmp:e%^DISIMP R ;
+N+SgGenCmp:e%>%^DISIMPn R ;
+N+PlGenCmp:%>%^DISIMPi R ;
+N+Der1+Der/Dimin+N:%»adtj GIERIEHTSADTJE ;
+A:%>X7 NomVadj "good A" ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))

        # Expected column widths for the input above.
        longest = {}
        longest[u'upper'] = 19
        longest[u'lower'] = 12
        longest[u'contlex'] = 14
        longest[u'translation'] = 8
        longest[u'divisor'] = 1
        self.assertEqual(longest, l.longest)

    def test_output_with_empty_upper_lower(self):
        """Entries consisting of only a continuation lexicon align, too."""
        input = u'''
FINAL1 ;
+N+Sg: N_ODD_SG ;
'''
        expected_result = u'''
FINAL1 ;
+N+Sg: N_ODD_SG ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_output_with_lexicon_and_semicolon(self):
        """LEXICON header lines are not treated as lexc entries."""
        input = u'''
LEXICON GOAHTI-NE !!= * __@CODE@__ Bisyll. V-Nouns
NomV ;
EssV ;
'''
        expected_result = u'''
LEXICON GOAHTI-NE !!= * __@CODE@__ Bisyll. V-Nouns
NomV ;
EssV ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_output_with_lines_starting_with_chars(self):
        """Entries with a bare stem (no colon) are aligned as upper."""
        input = u'''
LEXICON Conjunction
jïh Cc ;
jah Cc ;
'''
        expected_result = u'''
LEXICON Conjunction
jïh Cc ;
jah Cc ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_output_with_lines_starting_with_exclam(self):
        """A leading ! (commented-out entry) gets its own column."""
        input = u'''
LEXICON Conjunction
!dovne Cc ; ! dovne A jïh B
jïh Cc ;
jah Cc ;
'''
        expected_result = u'''
LEXICON Conjunction
! dovne Cc ; ! dovne A jïh B
jïh Cc ;
jah Cc ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_output_with_lines_with_leading_non_w(self):
        """Non-word characters (e.g. # as contlex) are handled."""
        input = u'''
LEXICON Cc
+CC:0 # ;
'''
        expected_result = u'''
LEXICON Cc
+CC:0 # ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_output(self):
        """A full LEXICON section with comments and test data aligns."""
        input = u'''
LEXICON DAKTERE
+N+Sg: N_ODD_SG ;
+N+Pl: N_ODD_PL ;
+N: N_ODD_ESS ;
+N+SgNomCmp:e%^DISIMP R ;
+N+SgGenCmp:e%>%^DISIMPn R ;
+N+PlGenCmp:%>%^DISIMPi R ;
+N+Der1+Der/Dimin+N:%»adtj GIERIEHTSADTJE ;
+A+Comp+Attr:%>abpa ATTRCONT ; ! båajasabpa, *båajoesabpa
+A:%>X7 NomVadj "good A" ;
! Test data:
!!€gt-norm: daktere # Odd-syllable test
'''
        expected_result = u'''
LEXICON DAKTERE
+N+Sg: N_ODD_SG ;
+N+Pl: N_ODD_PL ;
+N: N_ODD_ESS ;
+N+SgNomCmp:e%^DISIMP R ;
+N+SgGenCmp:e%>%^DISIMPn R ;
+N+PlGenCmp:%>%^DISIMPi R ;
+N+Der1+Der/Dimin+N:%»adtj GIERIEHTSADTJE ;
+A+Comp+Attr:%>abpa ATTRCONT ; ! båajasabpa, *båajoesabpa
+A:%>X7 NomVadj "good A" ;
! Test data:
!!€gt-norm: daktere # Odd-syllable test
''' # nopep8
        self.maxDiff = None
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_less_great(self):
        """Regex pair entries (< ... >) are kept whole in the upper column."""
        input = u'''
LEXICON test
+V+IV+Inf+Err/Orth-a/á:uvvát K ;
< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» "+Der/NomAct":m > ContLex ;
+V+IV+Inf+Err/Orth-a/á:uvvát K ;
'''
        expected_result = u'''
LEXICON test
+V+IV+Inf+Err/Orth-a/á:uvvát K ;
< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» "+Der/NomAct":m > ContLex ;
+V+IV+Inf+Err/Orth-a/á:uvvát K ;
'''
        self.maxDiff = None
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_line_percent_space_ending(self):
        """A lower ending in an escaped space ("% ") keeps that space."""
        input = u'''
abb:babb% ContLex;
uff:puf Contlex;
'''
        expected_result = u'''
abb:babb% ContLex ;
uff:puf Contlex ;
'''
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_line_multiple_percent_space(self):
        """Several escaped spaces ("% ") in one lower survive alignment."""
        input = u'''
LEXICON GOAHTILONGSHORT !!= * __@CODE@__ Sometimes long nom-compound-forms, long gen
+N:%> GOAHTILONGSHORTCMP ;
+N+Sg+Nom: K ;
< "+N":0 "+Sg":0 "+Nom":%> "@R.Nom3Px.add@" > NPx3V ;
+N+Der+Der/viđá+Adv+Use/-PLX:»X7% viđá% K ;
+N+Der+Der/viđi+Adv+Use/-PLX:»X7viđi K ;
'''
        expected_result = u'''
LEXICON GOAHTILONGSHORT !!= * __@CODE@__ Sometimes long nom-compound-forms, long gen
+N:%> GOAHTILONGSHORTCMP ;
+N+Sg+Nom: K ;
< "+N":0 "+Sg":0 "+Nom":%> "@R.Nom3Px.add@" > NPx3V ;
+N+Der+Der/viđá+Adv+Use/-PLX:»X7% viđá% K ;
+N+Der+Der/viđi+Adv+Use/-PLX:»X7viđi K ;
'''
        self.maxDiff = None
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))

    def test_line_startswith_contlex(self):
        """Bare continuation-lexicon entries and comments round-trip."""
        input = u'''
LEXICON NounRoot
N_NEWWORDS ;
N_sms2x ;
! N-INCOMING ;
LEXICON nouns
!! This is a temporary solution until nouns are moved to xml
N_NEWWORDS ;
'''
        expected_result = u'''
LEXICON NounRoot
N_NEWWORDS ;
N_sms2x ;
! N-INCOMING ;
LEXICON nouns
!! This is a temporary solution until nouns are moved to xml
N_NEWWORDS ;
'''
        self.maxDiff = None
        l = Lines()
        l.parse_lines(input.split(u'\n'))
        self.assertEqual(expected_result, '\n'.join(l.adjust_lines()))
class TestLine(unittest.TestCase):
    """Test parse_line on single lexc lines.

    Each test runs the two Lines regexes over one line (the same way
    Lines.parse_lines does) and checks the dict parse_line builds.
    """

    def test_line_parser_upper_lower(self):
        """Both upper and lower parts are split on the colon."""
        l = Lines()
        line = u' +N+SgNomCmp:e%^DISIMP R ;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'upper': u'+N+SgNomCmp',
            u'lower': u'e%^DISIMP',
            u'contlex': u'R',
            u'divisor': u':'
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_no_lower(self):
        """A trailing colon yields an empty lower part."""
        l = Lines()
        line = (
            u' +N+Sg: N_ODD_SG ;')
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'upper': u'+N+Sg',
            u'lower': u'',
            u'contlex': u'N_ODD_SG',
            u'divisor': u':'
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_no_upper_no_lower(self):
        """Only a continuation lexicon: no upper/lower/divisor keys."""
        l = Lines()
        line = u' N_ODD_ESS;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'contlex': u'N_ODD_ESS',
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_empty_upper_lower(self):
        """A lone colon gives empty upper and lower but keeps the divisor."""
        l = Lines()
        line = u' : N_ODD_E;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'upper': u'', u'lower': u'',
            u'contlex': u'N_ODD_E',
            u'divisor': u':'
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_with_comment(self):
        """A trailing ! comment is captured separately."""
        l = Lines()
        line = (
            u'+A+Comp+Attr:%>abpa ATTRCONT; '
            u'! båajasabpa, *båajoesabpa')
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'upper': u'+A+Comp+Attr',
            u'lower': u'%>abpa',
            u'contlex': u'ATTRCONT',
            u'comment': u'! båajasabpa, *båajoesabpa',
            u'divisor': u':'
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_with_translation(self):
        """A quoted translation after the contlex is captured."""
        l = Lines()
        line = u' +A:%>X7 NomVadj "good A" ;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'upper': u'+A', u'lower': u'%>X7',
            u'contlex': u'NomVadj',
            u'translation': u'"good A"',
            u'divisor': u':'
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_with_leading_upper_and_contlex(self):
        """A bare stem with no colon becomes the upper part."""
        l = Lines()
        line = u'jïh Cc ;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'upper': u'jïh',
            u'contlex': u'Cc',
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_with_leading_exclam(self):
        """A leading ! marks the entry as commented out (exclam key)."""
        l = Lines()
        line = u'!dovne Cc ; ! dovne A jïh B'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {
            u'comment': u'! dovne A jïh B',
            u'upper': u'dovne',
            u'contlex': u'Cc',
            u'exclam': u'!'
        }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_less_great(self):
        """A < ... > regex pair stays whole in the upper part."""
        l = Lines()
        line = (
            u'< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» '
            u'"+Der/NomAct":m > ContLex ;')
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {u'contlex': u'ContLex',
                           u'upper':
                           u'< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> '
                           u'"+Der4":» "+Der/NomAct":m >'}
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_lower_ends_with_percent(self):
        """%¥ (protected "% ") at the end of lower is restored to "% "."""
        l = Lines()
        line = u'abb:babb%¥ ContLex ;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {u'contlex': u'ContLex',
                           u'upper': u'abb',
                           u'lower': u'babb% ',
                           u'divisor': u':', }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_line_parser_multiple_percent_space(self):
        """Several protected "% " sequences in one lower are restored."""
        l = Lines()
        line = u'+N+Der+Der/viđá+Adv+Use/-PLX:»X7%¥viđá%¥ K ;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {u'contlex': u'K',
                           u'upper': u'+N+Der+Der/viđá+Adv+Use/-PLX',
                           u'lower': u'»X7% viđá% ',
                           u'divisor': u':', }
        self.assertDictEqual(parse_line(input), expected_result)

    def test_only_contlex(self):
        """A line with nothing but a contlex yields only that key."""
        l = Lines()
        line = u'N_NEWWORDS ;'
        input = l.lexc_line_re.search(line).groupdict()
        input.update(l.lexc_content_re.match(
            l.lexc_line_re.sub('', line)).groupdict())
        expected_result = {u'contlex': u'N_NEWWORDS'}
        self.assertDictEqual(parse_line(input), expected_result)
class Lines(object):
    """Parse lexc entry lines and realign their columns.

    Feed raw lines to parse_lines(); adjust_lines() then returns them
    with upper/lower/contlex/translation columns padded so the entries
    line up. Non-entry lines (LEXICON headers, comments, prose) are
    passed through unchanged.
    """

    # Matches the tail of a lexc entry: the continuation lexicon, an
    # optional quoted translation, the terminating semicolon and an
    # optional trailing comment.
    # NOTE: the named-group brackets were lost in this file
    # ("(?P\S+)" is invalid); names restored from groupdict() usage.
    lexc_line_re = re.compile(r'''
        (?P<contlex>\S+)            # any nonspace
        (?P<translation>\s+".+")?   # optional translation
        \s*;\s*                     # skip space and semicolon
        (?P<comment>!.*)?           # followed by an optional comment
        $
    ''', re.VERBOSE | re.UNICODE)

    # Matches what is left once lexc_line_re is removed from the line:
    # an optional leading "!" and the upper[:lower] content (a < ... >
    # regex pair is kept whole by the first alternative).
    lexc_content_re = re.compile(r'''
        (?P<exclam>^\s*!\s*)?       # optional comment
        (?P<content>(<.+>)|(.+))?   # optional content
    ''', re.VERBOSE | re.UNICODE)

    def __init__(self):
        # Widest string seen per column; drives the padding widths.
        self.longest = defaultdict(int)
        # Mix of dicts (parsed lexc entries) and plain passthrough strings.
        self.lines = []

    def parse_lines(self, lines):
        """Parse raw lines into entry dicts or passthrough strings.

        Args:
            lines: iterable of unicode strings.
        """
        for line in lines:
            line = line.rstrip()
            # Protect escaped spaces ("% ") so they survive splitting.
            line = line.replace(u'% ', u'%¥')
            lexc_line_match = self.lexc_line_re.search(line)
            if lexc_line_match and not line.startswith('LEXICON '):
                groups = lexc_line_match.groupdict()
                groups.update(self.lexc_content_re.match(
                    self.lexc_line_re.sub('', line)).groupdict())
                line_dict = parse_line(groups)
                self.lines.append(line_dict)
                self.find_longest(line_dict)
            else:
                self.lines.append(line)

    def find_longest(self, l):
        """Update self.longest with the column widths found in l."""
        for name in l:
            if self.longest[name] < len(l[name]):
                self.longest[name] = len(l[name])

    def adjust_lines(self):
        """Return the parsed lines as strings with aligned columns."""
        newlines = []
        for l in self.lines:
            if isinstance(l, dict):
                s = io.StringIO()
                # The "!" column exists only if some entry is commented out.
                if self.longest[u'exclam']:
                    if l[u'exclam']:
                        s.write(l[u'exclam'])
                    else:
                        s.write(u' ')
                # Upper is right-aligned against the widest upper.
                s.write(u' ' *
                        (self.longest[u'upper'] - len(l[u'upper']) + 1))
                s.write(l[u'upper'])
                if l[u'divisor']:
                    s.write(l[u'divisor'])
                elif self.longest[u'divisor']:
                    # Keep columns aligned for entries without a colon.
                    s.write(u' ')
                s.write(l[u'lower'])
                s.write(u' ' *
                        (self.longest[u'lower'] - len(l[u'lower']) + 1))
                s.write(l[u'contlex'])
                s.write(u' ' *
                        (self.longest[u'contlex'] -
                         len(l[u'contlex']) + 1))
                s.write(l[u'translation'])
                if self.longest[u'translation'] > 0:
                    s.write(u' ' *
                            (self.longest[u'translation'] -
                             len(l[u'translation']) + 1))
                s.write(u';')
                if l[u'comment'] != u'':
                    s.write(u' ')
                    s.write(l[u'comment'])
                newlines.append(s.getvalue())
            else:
                # Non-entry line: pass through unchanged.
                newlines.append(l)
        return newlines
def parse_line(old_match):
    """Transform raw regex groups into a lexc line dict.

    Args:
        old_match: dict of the groups produced by Lines.lexc_line_re
            and Lines.lexc_content_re (keys: contlex, translation,
            comment, exclam, content; values may be None).

    Returns:
        defaultdict mapping column names (upper, lower, divisor,
        contlex, translation, comment, exclam) to text; absent
        columns default to the empty string.
    """
    # type(u'') is unicode on Python 2 and str on Python 3; the bare
    # name `unicode` would raise NameError on Python 3.
    line_dict = defaultdict(type(u''))

    if old_match.get('exclam'):
        line_dict[u'exclam'] = u'!'

    line_dict[u'contlex'] = old_match.get(u'contlex')

    if old_match.get(u'translation'):
        line_dict[u'translation'] = old_match.get(
            u'translation').strip().replace(u'%¥', u'% ')

    if old_match.get(u'comment'):
        line_dict[u'comment'] = old_match.get(
            u'comment').strip().replace(u'%¥', u'% ')

    line = old_match.get('content')
    if line:
        # Restore the escaped "% " sequences protected in parse_lines.
        line = line.replace(u'%¥', u'% ')
        if line.startswith(u'<') and line.endswith(u'>'):
            # A < ... > regex pair goes unsplit into the upper column.
            line_dict[u'upper'] = line
        else:
            colon_pos = line.find(u":")
            if colon_pos != -1:
                line_dict[u'upper'] = line[:colon_pos].strip()
                line_dict[u'divisor'] = u':'
                line_dict[u'lower'] = line[colon_pos + 1:].strip()
                if line_dict[u'lower'].endswith('%'):
                    # strip() ate the escaped trailing space; put it back.
                    line_dict[u'lower'] = line_dict[u'lower'] + u' '
            else:
                if line.strip():
                    line_dict[u'upper'] = line.strip()

    return line_dict
def parse_options():
    """Build the command line interface and parse sys.argv.

    Returns:
        argparse.Namespace with the single attribute ``lexcfile``.
    """
    arg_parser = argparse.ArgumentParser(
        description=u'Align rules given in lexc files')
    arg_parser.add_argument(u'lexcfile',
                            help=u'Lexc file where rules should be aligned\n'
                            'If filename is -, then the file is read from '
                            'stdin and written to stdout.')
    return arg_parser.parse_args()
if __name__ == u'__main__':
    if sys.version_info[0] < 3:
        # Python 2: wrap the byte streams so lexc text is decoded and
        # encoded as UTF-8. Python 3 standard streams are already text,
        # and wrapping them in codecs readers/writers would break them.
        UTF8Reader = codecs.getreader('utf8')
        sys.stdin = UTF8Reader(sys.stdin)
        UTF8Writer = codecs.getwriter('utf8')
        sys.stdout = UTF8Writer(sys.stdout)

    args = parse_options()

    # BUGFIX: the original compared `args.lexcfile is not "-"`, which
    # tests object identity (unreliable string interning), not equality.
    with open(args.lexcfile, encoding=u'utf8') if args.lexcfile != u'-' \
            else sys.stdin as f:
        newlines = []
        readlines = []
        # Realign one LEXICON section at a time: each header flushes
        # the lines collected so far through a fresh Lines instance.
        for l in f.readlines():
            if l.startswith(u'LEXICON '):
                lines = Lines()
                lines.parse_lines(readlines)
                newlines += lines.adjust_lines()
                readlines = []
            readlines.append(l)

        # Flush the final section.
        lines = Lines()
        lines.parse_lines(readlines)
        newlines += lines.adjust_lines()

    with open(args.lexcfile, u'w', encoding=u'utf8') \
            if args.lexcfile != u'-' else sys.stdout as f:
        f.write(u'\n'.join(newlines))
        f.write(u'\n')