#!/usr/bin/env python # -*- coding: utf-8 -*- # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this file. If not, see . # # Copyright © 2016-2017 The University of Tromsø & # the Norwegian Sámi Parliament # http://giellatekno.uit.no & http://divvun.no # """Script to sort and align lexc entries.""" from __future__ import absolute_import, print_function import argparse import codecs import io import re import sys import unittest from collections import defaultdict LEXC_LINE_RE = re.compile(r''' (?P\S+) # any nonspace (?P\s+".*")? # optional translation, might be empty \s*;\s* # skip space and semicolon (?P!.*)? # followed by an optional comment $ ''', re.VERBOSE | re.UNICODE) LEXC_CONTENT_RE = re.compile(r''' (?P^\s*!\s*)? # optional comment (?P(<.+>)|(.+))? # optional content ''', re.VERBOSE | re.UNICODE) class TestLexcAligner(unittest.TestCase): """Test that lexc alignment works as supposed to.""" def test_non_lexc_line(self): """Test how non lexc line is handled.""" content = u''' abb ; babb ''' expected_result = u''' abb ; babb ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_longest(self): """Check that the longest attribute is set correctly.""" content = u''' +N+Sg: N_ODD_SG ; +N+Pl: N_ODD_PL ; +N: N_ODD_ESS ; +N+SgNomCmp:e%^DISIMP R ; +N+SgGenCmp:e%>%^DISIMPn R ; +N+PlGenCmp:%>%^DISIMPi R ; +N+Der1+Der/Dimin+N:%»adtj GIERIEHTSADTJE ; +A:%>X7 NomVadj "good A" ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) longest = {} longest[u'upper'] = 19 longest[u'lower'] = 12 longest[u'contlex'] = 14 longest[u'translation'] = 8 longest[u'divisor'] = 1 self.assertEqual(longest, aligner.longest) def test_only_contlex(self): """Test how contlex only entries are handled.""" content = u''' FINAL1 ; +N+Sg: N_ODD_SG ; ''' expected_result = u''' FINAL1 ; +N+Sg: N_ODD_SG ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_lexicon_contlex(self): """Check how lexicon and contlex only entries are handled.""" content = u''' LEXICON GOAHTI-NE !!= * __@CODE@__ Bisyll. V-Nouns NomV ; EssV ; ''' expected_result = u''' LEXICON GOAHTI-NE !!= * __@CODE@__ Bisyll. V-Nouns NomV ; EssV ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_lexicon_charientries(self): """Check how lexicon entries with only chars are handled.""" content = u''' LEXICON Conjunction jïh Cc ; jah Cc ; ''' expected_result = u''' LEXICON Conjunction jïh Cc ; jah Cc ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_lexicon_comment(self): """Check how commented lexc entries are handled.""" content = u''' LEXICON Conjunction !dovne Cc ; ! dovne A jïh B jïh Cc ; jah Cc ; ''' expected_result = u''' LEXICON Conjunction ! dovne Cc ; ! dovne A jïh B jïh Cc ; jah Cc ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_nonalphabetic(self): """Test how entries starting with nonalphetic chars are handled.""" content = u''' LEXICON Cc +CC:0 # ; ''' expected_result = u''' LEXICON Cc +CC:0 # ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_output(self): """Test that only lexc entries are adjusted.""" content = u''' LEXICON DAKTERE +N+Sg: N_ODD_SG ; +N+Pl: N_ODD_PL ; +N: N_ODD_ESS ; +N+SgNomCmp:e%^DISIMP R ; +N+SgGenCmp:e%>%^DISIMPn R ; +N+PlGenCmp:%>%^DISIMPi R ; +N+Der1+Der/Dimin+N:%»adtj GIERIEHTSADTJE ; +A+Comp+Attr:%>abpa ATTRCONT ; ! båajasabpa, *båajoesabpa +A:%>X7 NomVadj "good A" ; ! Test data: !!€gt-norm: daktere # Odd-syllable test ''' expected_result = u''' LEXICON DAKTERE +N+Sg: N_ODD_SG ; +N+Pl: N_ODD_PL ; +N: N_ODD_ESS ; +N+SgNomCmp:e%^DISIMP R ; +N+SgGenCmp:e%>%^DISIMPn R ; +N+PlGenCmp:%>%^DISIMPi R ; +N+Der1+Der/Dimin+N:%»adtj GIERIEHTSADTJE ; +A+Comp+Attr:%>abpa ATTRCONT ; ! båajasabpa, *båajoesabpa +A:%>X7 NomVadj "good A" ; ! Test data: !!€gt-norm: daktere # Odd-syllable test ''' # nopep8 aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_less_great(self): """Content inside <> should be untouched, but aligned.""" content = u''' LEXICON test +V+IV+Inf+Err/Orth-a/á:uvvát K ; < "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» "+Der/NomAct":m > ContLex ; +V+IV+Inf+Err/Orth-a/á:uvvát K ; ''' # nopep8 expected_result = u''' LEXICON test +V+IV+Inf+Err/Orth-a/á:uvvát K ; < "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» "+Der/NomAct":m > ContLex ; +V+IV+Inf+Err/Orth-a/á:uvvát K ; ''' # nopep8 aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_line_percent_space_ending(self): """Check how lower parts ending on percent are handled.""" content = u''' abb:babb% ContLex; uff:puf Contlex; ''' expected_result = u''' abb:babb% ContLex ; uff:puf Contlex ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_percent_space(self): """Check how lines containing multiple percent signs are handled.""" content = u''' LEXICON GOAHTILONGSHORT !!= * __@CODE@__ Sometimes long nom-compound-forms, long gen +N:%> GOAHTILONGSHORTCMP ; +N+Sg+Nom: K ; < "+N":0 "+Sg":0 "+Nom":%> "@R.Nom3Px.add@" > NPx3V ; +N+Der+Der/viđá+Adv+Use/-PLX:»X7% viđá% K ; +N+Der+Der/viđi+Adv+Use/-PLX:»X7viđi K ; ''' # nopep8 expected_result = u''' LEXICON GOAHTILONGSHORT !!= * __@CODE@__ Sometimes long nom-compound-forms, long gen +N:%> GOAHTILONGSHORTCMP ; +N+Sg+Nom: K ; < "+N":0 "+Sg":0 "+Nom":%> "@R.Nom3Px.add@" > NPx3V ; +N+Der+Der/viđá+Adv+Use/-PLX:»X7% viđá% K ; +N+Der+Der/viđi+Adv+Use/-PLX:»X7viđi K ; ''' # nopep8 aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) def test_line_startswith_contlex(self): """Check how lines with only contlex entries are handled.""" content = u''' LEXICON NounRoot N_NEWWORDS ; N_sms2x ; ! N-INCOMING ; LEXICON nouns !! This is a temporary solution until nouns are moved to xml N_NEWWORDS ; ''' expected_result = u''' LEXICON NounRoot N_NEWWORDS ; N_sms2x ; ! N-INCOMING ; LEXICON nouns !! This is a temporary solution until nouns are moved to xml N_NEWWORDS ; ''' aligner = LexcAligner() aligner.parse_lines(content.split(u'\n')) self.assertEqual(expected_result, '\n'.join(aligner.adjust_lines())) class TestLineParser(unittest.TestCase): """Test how individual lines are parsed.""" def test_line_parser_upper_lower(self): """Check that lines with upper and lower defined are handled.""" line = u' +N+SgNomCmp:e%^DISIMP R ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'upper': u'+N+SgNomCmp', u'lower': u'e%^DISIMP', u'contlex': u'R', u'divisor': u':' } self.assertDictEqual(parse_line(content), expected_result) def test_line_parser_no_lower(self): """Check how lines with empty lower are handled.""" line = ( u' +N+Sg: N_ODD_SG ;') content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'upper': u'+N+Sg', u'lower': u'', u'contlex': u'N_ODD_SG', u'divisor': u':' } self.assertDictEqual(parse_line(content), expected_result) def test_line_contlex_only(self): """Check how lines without upper and lower parts are handled.""" line = u' N_ODD_ESS;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'contlex': u'N_ODD_ESS', } self.assertDictEqual(parse_line(content), expected_result) def test_empty_upper_lower(self): """Check how empty upper/lower combo is handled.""" line = u' : N_ODD_E;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'upper': u'', u'lower': u'', u'contlex': u'N_ODD_E', u'divisor': u':' } self.assertDictEqual(parse_line(content), expected_result) def test_comment(self): """Check how commented lines are handled.""" line = ( u'+A+Comp+Attr:%>abpa ATTRCONT; ' u'! båajasabpa, *båajoesabpa') content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'upper': u'+A+Comp+Attr', u'lower': u'%>abpa', u'contlex': u'ATTRCONT', u'comment': u'! båajasabpa, *båajoesabpa', u'divisor': u':' } self.assertDictEqual(parse_line(content), expected_result) def test_translation(self): """Check how lines containing translations are handled.""" line = u' +A:%>X7 NomVadj "good A" ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'upper': u'+A', u'lower': u'%>X7', u'contlex': u'NomVadj', u'translation': u'"good A"', u'divisor': u':' } self.assertDictEqual(parse_line(content), expected_result) def test_upper_contlex(self): """Check how entries with only upper and contlex are handled.""" line = u'jïh Cc ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'upper': u'jïh', u'contlex': u'Cc', } self.assertDictEqual(parse_line(content), expected_result) def test_leading_exclam(self): """Check how entries with a leading exclam are handled.""" line = u'!dovne Cc ; ! dovne A jïh B' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = { u'comment': u'! dovne A jïh B', u'upper': u'dovne', u'contlex': u'Cc', u'exclam': u'!' } self.assertDictEqual(parse_line(content), expected_result) def test_less_great(self): """Check that entries within <> are correctly handled.""" line = ( u'< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» ' u'"+Der/NomAct":m > ContLex ;') content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = {u'contlex': u'ContLex', u'upper': u'< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> ' u'"+Der4":» "+Der/NomAct":m >'} self.assertDictEqual(parse_line(content), expected_result) def test_ends_with_percent(self): """Check that entries containing percent are correctly handled.""" line = u'abb:babb%¥ ContLex ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = {u'contlex': u'ContLex', u'upper': u'abb', u'lower': u'babb% ', u'divisor': u':', } self.assertDictEqual(parse_line(content), expected_result) def test_multiple_percent(self): """Check how entries with multiple percent signs are handled.""" line = u'+N+Der+Der/viđá+Adv+Use/-PLX:»X7%¥viđá%¥ K ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = {u'contlex': u'K', u'upper': u'+N+Der+Der/viđá+Adv+Use/-PLX', u'lower': u'»X7% viđá% ', u'divisor': u':', } self.assertDictEqual(parse_line(content), expected_result) def test_only_contlex(self): """Check how contlex only lines are handled.""" line = u'N_NEWWORDS ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = {u'contlex': u'N_NEWWORDS'} self.assertDictEqual(parse_line(content), expected_result) def test_empty_translation(self): """Check lines with empty translation.""" line = u'tsollegidh:tsolleg GOLTELIDH_IV "" ;' content = LEXC_LINE_RE.search(line).groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) expected_result = {u'contlex': u'GOLTELIDH_IV', u'upper': u'tsollegidh', u'lower': u'tsolleg', u'translation': u'""', u'divisor': u':'} self.assertDictEqual(parse_line(content), expected_result) class TestLineCompactor(unittest.TestCase): """Test how individual lines are compacted.""" def test_line_parser_upper_lower(self): """Check that lines with upper and lower defined are handled.""" content = { u'upper': u'+N+SgNomCmp', u'lower': u'e%^DISIMP', u'contlex': u'R', u'divisor': u':' } expected_result = u'+N+SgNomCmp:e%^DISIMP R ;' self.assertEqual(compact_line(content), expected_result) def test_line_parser_no_lower(self): """Check how lines with empty lower are handled.""" content = { u'upper': u'+N+Sg', u'lower': u'', u'contlex': u'N_ODD_SG', u'divisor': u':' } expected_result = (u'+N+Sg: N_ODD_SG ;') self.assertEqual(compact_line(content), expected_result) def test_line_contlex_only(self): """Check how lines without upper and lower parts are handled.""" content = { u'contlex': u'N_ODD_ESS', } expected_result = u'N_ODD_ESS ;' self.assertEqual(compact_line(content), expected_result) def test_empty_upper_lower(self): """Check how empty upper/lower combo is handled.""" content = { u'upper': u'', u'lower': u'', u'contlex': u'N_ODD_E', u'divisor': u':' } expected_result = u': N_ODD_E ;' self.assertEqual(compact_line(content), expected_result) def test_comment(self): """Check how commented lines are handled.""" content = { u'upper': u'+A+Comp+Attr', u'lower': u'%>abpa', u'contlex': u'ATTRCONT', u'comment': u'! båajasabpa, *båajoesabpa', u'divisor': u':' } expected_result = ( u'+A+Comp+Attr:%>abpa ATTRCONT ; ' u'! båajasabpa, *båajoesabpa') self.assertEqual(compact_line(content), expected_result) def test_translation(self): """Check how lines containing translations are handled.""" content = { u'upper': u'+A', u'lower': u'%>X7', u'contlex': u'NomVadj', u'translation': u'"good A"', u'divisor': u':' } expected_result = u'+A:%>X7 NomVadj "good A" ;' self.assertEqual(compact_line(content), expected_result) def test_upper_contlex(self): """Check how entries with only upper and contlex are handled.""" content = { u'upper': u'jïh', u'contlex': u'Cc', } expected_result = u'jïh Cc ;' self.assertEqual(compact_line(content), expected_result) def test_leading_exclam(self): """Check how entries with a leading exclam are handled.""" content = { u'comment': u'! dovne A jïh B', u'upper': u'dovne', u'contlex': u'Cc', u'exclam': u'!' } expected_result = u'!dovne Cc ; ! dovne A jïh B' self.assertEqual(compact_line(content), expected_result) def test_less_great(self): """Check that entries within <> are correctly handled.""" content = {u'contlex': u'ContLex', u'upper': u'< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> ' u'"+Der4":» "+Der/NomAct":m >'} expected_result = ( u'< "@P.Px.add@" 0:u 0:v 0:v "+V":a "+IV":%> "+Der4":» ' u'"+Der/NomAct":m > ContLex ;') self.assertEqual(compact_line(content), expected_result) def test_ends_with_percent(self): """Check that entries containing percent are correctly handled.""" content = {u'contlex': u'ContLex', u'upper': u'abb', u'lower': u'babb% ', u'divisor': u':', } expected_result = u'abb:babb% ContLex ;' self.assertEqual(compact_line(content), expected_result) def test_multiple_percent(self): """Check how entries with multiple percent signs are handled.""" content = {u'contlex': u'K', u'upper': u'+N+Der+Der/vida+Adv+Use/-PLX', u'lower': u'»X7% vida% ', u'divisor': u':', } expected_result = u'+N+Der+Der/vida+Adv+Use/-PLX:»X7% vida% K ;' self.assertEqual(compact_line(content), expected_result) def test_only_contlex(self): """Check how contlex only lines are handled.""" expected_result = u'N_NEWWORDS ;' content = {u'contlex': u'N_NEWWORDS'} self.assertEqual(compact_line(content), expected_result) class TestSorting(unittest.TestCase): """Test how individual lines are parsed.""" def setUp(self): """Set up common resources.""" self.sorting_lines = [ u'ábčđ:cdef ABBR;', u'aŋđŧá:abcd CABBR;', u'bžčŋ:bcde BABBR;', ] def test_alpha(self): """Test sorting by lemma.""" self.assertListEqual( [u'aŋđŧá:abcd CABBR ;', u'bžčŋ:bcde BABBR ;', u'ábčđ:cdef ABBR ;', u''], sort_lexicon(self.sorting_lines, mode='alpha')) def test_contlex(self): """Test sorting by continuation lexicon.""" self.assertListEqual( [u'ábčđ:cdef ABBR ;', u'bžčŋ:bcde BABBR ;', u'aŋđŧá:abcd CABBR ;', u''], sort_lexicon(self.sorting_lines, mode='contlex')) def test_revstem(self): """Test sorting by reverted stem.""" self.assertListEqual( [u'aŋđŧá:abcd CABBR ;', u'bžčŋ:bcde BABBR ;', u'ábčđ:cdef ABBR ;', u''], sort_lexicon(self.sorting_lines, mode='revstem')) class LexcAligner(object): """Class to align lexc elements inside a lexicon.""" def __init__(self): """Initialise the LexcAligner class.""" self.longest = defaultdict(int) self.lines = [] def parse_lines(self, lines): """Parse the lines given. Arguments: lines (list of str): the entries of a lexicon. """ for line in lines: line = line.replace(u'% ', u'%¥') lexc_line_match = LEXC_LINE_RE.search(line) if lexc_line_match and not line.startswith('LEXICON '): content = lexc_line_match.groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) line_dict = parse_line(content) self.lines.append(line_dict) self.set_longest(line_dict) else: self.lines.append(line) def set_longest(self, line_dict): """Record the longest entries.""" for name in line_dict: if self.longest[name] < len(line_dict[name]): self.longest[name] = len(line_dict[name]) def adjust_lines(self): """Align the lines of a lexicon.""" adjusted_lines = [] for line in self.lines: if isinstance(line, dict): string_buffer = [] if self.longest[u'exclam']: if line[u'exclam']: string_buffer.append(line[u'exclam']) else: string_buffer.append(u' ') string_buffer.append(u' ' * (self.longest[u'upper'] - len(line[u'upper']) + 1)) string_buffer.append(line[u'upper']) if line[u'divisor']: string_buffer.append(line[u'divisor']) elif self.longest[u'divisor']: string_buffer.append(u' ') string_buffer.append(line[u'lower']) string_buffer.append(u' ' * (self.longest[u'lower'] - len(line[u'lower']) + 1)) string_buffer.append(line[u'contlex']) string_buffer.append(u' ' * (self.longest[u'contlex'] - len(line[u'contlex']) + 1)) string_buffer.append(line[u'translation']) if self.longest[u'translation'] > 0: string_buffer.append(u' ' * (self.longest[u'translation'] - len(line[u'translation']) + 1)) string_buffer.append(u';') if line[u'comment'] != u'': string_buffer.append(u' ') string_buffer.append(line[u'comment']) adjusted_lines.append(''.join(string_buffer)) else: adjusted_lines.append(line) return adjusted_lines class LexcSorter(object): """Sort entries in a lexc lexicon.""" def __init__(self, mode): """Initialise the LexcSorter class.""" self.lines = [] self.lexc_lines = [] self.mode = mode def parse_lines(self, lines): """Parse the lines of a lexicon. Arguments: lines (list of str): the lines of a lexicon. """ for line in lines: lexc_line_match = LEXC_LINE_RE.search(line) if lexc_line_match and not line.startswith('LEXICON '): content = lexc_line_match.groupdict() content.update(LEXC_CONTENT_RE.match( LEXC_LINE_RE.sub('', line)).groupdict()) line_dict = parse_line(content) self.lexc_lines.append((self.sorting_key(line_dict), compact_line(line_dict))) else: if line.strip(): self.lines.append(line) def sorting_key(self, line_tuple): """Revert the sorting key depending on sorting mode. Arguments: line_tuple (dict): dict containing the different parts of a lexc line. Returns: unicode """ if self.mode == 'alpha': return line_tuple['upper'] elif self.mode == 'revstem': # nopep8 https://stackoverflow.com/questions/931092/reverse-a-string-in-python return line_tuple['lower'][::-1] if line_tuple.get('lower') \ else line_tuple['upper'][::-1] elif self.mode == 'contlex': return line_tuple['contlex'] else: raise KeyError('No sorting mode given') def adjust_lines(self): """Sort the lines.""" self.lines.extend([line_tuple[1] for line_tuple in sorted(self.lexc_lines)]) self.lines.append('') def compact_line(line_dict): """Remove unneeded white space from a lexc entry.""" string_buffer = [] if line_dict.get(u'exclam'): string_buffer.append(line_dict[u'exclam']) if line_dict.get(u'upper'): string_buffer.append(line_dict[u'upper']) if line_dict.get(u'divisor'): string_buffer.append(line_dict[u'divisor']) if line_dict.get(u'lower'): string_buffer.append(line_dict[u'lower']) if string_buffer: string_buffer.append(' ') string_buffer.append(line_dict[u'contlex']) if line_dict.get(u'translation'): string_buffer.append(' ') string_buffer.append(line_dict[u'translation']) string_buffer.append(u' ;') if line_dict.get(u'comment'): string_buffer.append(u' ') string_buffer.append(line_dict[u'comment']) return ''.join(string_buffer) def parse_line(old_match): """Parse a lexc line. Arguments: old_match: Returns: dict of unicode: The entries inside the lexc line expressed as a dict """ line_dict = defaultdict(unicode) if old_match.get('exclam'): line_dict[u'exclam'] = u'!' line_dict[u'contlex'] = old_match.get(u'contlex') if old_match.get(u'translation'): line_dict[u'translation'] = old_match.get( u'translation').strip().replace(u'%¥', u'% ') if old_match.get(u'comment'): line_dict[u'comment'] = old_match.get( u'comment').strip().replace(u'%¥', u'% ') line = old_match.get('content') if line: line = line.replace(u'%¥', u'% ') if line.startswith(u'<') and line.endswith(u'>'): line_dict[u'upper'] = line else: lexc_line_match = line.find(u":") if lexc_line_match != -1: line_dict[u'upper'] = line[:lexc_line_match].strip() line_dict[u'divisor'] = u':' line_dict[u'lower'] = line[lexc_line_match + 1:].strip() if line_dict[u'lower'].endswith('%'): line_dict[u'lower'] = line_dict[u'lower'] + u' ' else: if line.strip(): line_dict[u'upper'] = line.strip() return line_dict def align_lexicon(lexc_lines): """Align lexicons. Arguments: lexc_lines (list of str): contents of a lexicon to be aligned. Returns: list of str: aligned lines. """ lines = LexcAligner() lines.parse_lines(lexc_lines) return lines.adjust_lines() def sort_lexicon(lexc_lines, mode): """Sort lexicons. Arguments: lexc_lines (list of str): contents of a lexicon to be sorted. mode (str): the sorting mode applied Returns: list of str: sorted lines. """ lines = LexcSorter(mode=mode) lines.parse_lines(lexc_lines) lines.adjust_lines() return lines.lines def parse_options(): """Parse command line arguments.""" parser = argparse.ArgumentParser( description=u'Align or sort rules given in lexc files') group = parser.add_mutually_exclusive_group(required=True) group.add_argument(u'--align', action=u'store_true', help=u'Align lexicon entries') group.add_argument(u'--sort', choices=['alpha', 'revstem', 'contlex'], help=u'Sort lexicon entries') parser.add_argument(u'lexcfile', help=u'Lexc file where lexicon entries should ' 'be manipulated. If filename is -, then the file ' 'is read from stdin and written to stdout.') arguments = parser.parse_args() return arguments if __name__ == u'__main__': # nopep8 https://stackoverflow.com/questions/2737966/how-to-change-the-stdin-encoding-on-python UTF8READER = codecs.getreader('utf8') sys.stdin = UTF8READER(sys.stdin) UTF8WRITER = codecs.getwriter('utf8') sys.stdout = UTF8WRITER(sys.stdout) ARGS = parse_options() with io.open(ARGS.lexcfile) if ARGS.lexcfile is not "-" \ else sys.stdin as file_: NEWLINES = [] READLINES = [] for lexc_line in file_: if lexc_line.startswith(u'LEXICON '): NEWLINES.extend(READLINES) READLINES = [lexc_line.rstrip()] break READLINES.append(lexc_line.rstrip()) for lexc_line in file_: if lexc_line.startswith(u'LEXICON ') or lexc_line.startswith('!!'): if ARGS.align: NEWLINES.extend(align_lexicon(READLINES)) if ARGS.sort: NEWLINES.extend(sort_lexicon(READLINES, ARGS.sort)) READLINES = [] READLINES.append(lexc_line.rstrip()) if ARGS.align: NEWLINES.extend(align_lexicon(READLINES)) if ARGS.sort: NEWLINES.extend(sort_lexicon(READLINES, ARGS.sort)) with io.open(ARGS.lexcfile, u'w') if ARGS.lexcfile is not "-" \ else sys.stdout as file_: file_.write(u'\n'.join(NEWLINES)) file_.write(u'\n')