#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Test the perl script preprocess."""

import os
import subprocess
import sys
import unittest

from nose_parameterized import parameterized


class TestConversion(unittest.TestCase):
    """Run the external ``preprocess`` command and check its output."""

    def get_preprocess_result(self, expression, language):
        """Run an expression through preprocess.

        Args:
            expression (str): input sent to preprocess
            language (str): language of the input

        Returns:
            str: the output of preprocess

        Raises:
            AssertionError: if preprocess exits with a non-zero status.
        """
        # Build the argument vector directly rather than splitting a
        # formatted string: splitting on whitespace would break the
        # --abbr option apart if the abbr path contained a space.
        preprocess_command = [
            'preprocess',
            '--abbr={}'.format(self.get_abbr_file(language)),
        ]
        subp = subprocess.Popen(preprocess_command,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        (output, error) = subp.communicate(expression.encode('utf-8'))

        if subp.returncode != 0:
            # Decode leniently so that undecodable diagnostics cannot mask
            # the real problem with a UnicodeDecodeError.
            print('Could not preprocess', file=sys.stderr)
            print(output.decode('utf-8', 'replace'), file=sys.stderr)
            print(error.decode('utf-8', 'replace'), file=sys.stderr)
            # Fail this test instead of calling sys.exit(), which would
            # request interpreter exit with a success status.
            self.fail('preprocess exited with status {} for input {!r}'
                      .format(subp.returncode, expression))

        return output.decode('utf8')

    def get_abbr_file(self, language):
        """Get the correct abbr file.

        Args:
            language (str): language of the abbr file

        Returns:
            str: path to the abbr file

        Raises:
            KeyError: if the GTHOME environment variable is not set.
            IOError: if the abbr file does not exist.
        """
        abbrfile = os.path.join(os.environ['GTHOME'], 'langs', language,
                                'tools/preprocess/abbr.txt')
        if not os.path.exists(abbrfile):
            raise IOError('You must run "make abbr" in '
                          '$GTHOME/langs/{}/tools/preprocess '
                          'before running this test'.format(language))

        return abbrfile

    # Each entry is (test name, input, expected output, language).
    @parameterized.expand([
        ('nr at end of sentence',
         'nr. Nysetning',
         'nr.\n.\nNysetning\n',
         'sme'),
        ('b at end of sentence',
         'b. Nysetning.',
         'b.\n.\nNysetning\n.\n',
         'sme'),
        ('numbers and newline 1',
         '2/00 7',
         '2/00 7\n',
         'sme'),
        ('numbers and newline 2',
         '2/00\n7',
         '2/00 7\n',
         'sme'),
        ('test b inside sentence',
         'njukčamánu 1. b. dii. 09.00.',
         'njukčamánu\n1.\nb.\ndii.\n09.00\n.\n',
         'sme'),
        ('du at end of sentences '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=761',
         'Oahpaheaddji, du.\nOahpaheaddji, du.\nOahppit bohte.\n',
         'Oahpaheaddji\n,\ndu\n.\nOahpaheaddji\n,\ndu\n.\nOahppit\nbohte\n.\n',
         'sme'),
        ('etc with parenthesis',
         '(gamle melkegjerder, fangstgraver etc.).',
         '(\ngamle\nmelkegjerder\n,\nfangstgraver\netc.\n)\n.\n',
         'nob'),
        ('number followed by lowercase',
         'Nu lei dassážii go ollejin 3. dehe 4. klássii.',
         'Nu\nlei\ndassážii go\nollejin\n3.\ndehe\n4.\nklássii\n.\n',
         'sme'),
        ('num lexicon',
         'Sámi Ællin / Sameliv, 1961–63.',
         'Sámi\nÆllin\n/\nSameliv\n,\n1961–63\n.\n',
         'sme'),
        ('idiom lexicon',
         'feara máid',
         'feara máid\n',
         'sme'),
        ('iešguđetládje with pound',
         'iešguđetládje £',
         'iešguđetládje\n£\n',
         'sme'),
        ('iešguđet ládje with pound',
         'iešguđet ládje £',
         'iešguđet\nládje\n£\n',
         'sme'),
        ('trab lexicon',
         'Lámispenš. 16–66 jagi – pr. 1000',
         'Lámispenš.\n16–66\njagi\n–\npr.\n1000\n',
         'sme'),
        ('trnumab lexicon 1',
         'RR 2005 Buš 2006',
         'RR\n2005\nBuš\n2006\n',
         'sme'),
        ('trnumab lexicon 2',
         '500 kr.',
         '500\nkr.\n.\n',
         'sme'),
        ('trnumab lexicon 3',
         '500 kr. Juohke',
         '500\nkr.\n.\nJuohke\n',
         'sme'),
        ('trnumab lexicon add .',
         'ILO-konv. nr. 169.',
         'ILO-konv. nr.\n169\n.\n',
         'sme'),
        ('itrab lexicon 1',
         'Lyngmos Eftf.-gávpái Dáččavággái',
         'Lyngmos\nEftf.\n-\ngávpái\nDáččavággái\n',
         'sme'),
        ('itrab lexicon 2',
         '500 ru. Juohke',
         '500\nru.\n.\nJuohke\n',
         'sme'),
        ('itrab lexicon 3',
         '500 ru.',
         '500\nru.\n.\n',
         'sme'),
        ('itrab lexicon 4',
         'Hei Lola-vuoiti 2007.',
         'Hei Lola-vuoiti\n2007\n.\n',
         'sme'),
        ('numeral combination 1',
         '2. - 3. mars.',
         '2. - 3.\nmars\n.\n',
         'nob'),
        ('numeral combination 2',
         '02. - 03.03.00',
         '02. - 03.03.00\n',
         'nob'),
        ('numeral combination 3',
         '29.02. - 03.03.00',
         '29.02. - 03.03.00\n',
         'nob'),
        ('dot followed by hyphen',
         'dr.polit.-utdanningen',
         'dr.polit.\n-\nutdanningen\n',
         'nob'),
        ('quotemark_followed_by_parenthesis',
         'urfolksspørsmål i bistanden”)',
         'urfolksspørsmål\ni\nbistanden\n”\n)\n',
         'nob'),
        ('number_followed_by_quotemark',
         '25. «tar',
         '25.\n«\ntar\n',
         'nob'),
        ('numeral_followed_by_parenthesis',
         '1950-tallet. (',
         '1950-tallet\n.\n(\n',
         'nob'),
        ('numeral_with_hyphen',
         '12.-16. juli',
         '12.-16.\njuli\n',
         'nob'),
        ('su 1',
         'Dat mávssii su. njeallje ruvnno.',
         'Dat\nmávssii\nsu.\nnjeallje\nruvnno\n.\n',
         'sme'),
        ('su 2',
         'Dat mávssii su. 4 ruvnno.',
         'Dat\nmávssii\nsu.\n4\nruvnno\n.\n',
         'sme'),
        ('su 3',
         'Mun oidnen su. Son lei olgun.',
         'Mun\noidnen\nsu\n.\nSon\nlei\nolgun\n.\n',
         'sme'),
        ('su 4',
         'Mun oidnen su.',
         'Mun\noidnen\nsu\n.\n',
         'sme'),
        ('dot followed by parenthesis',
         'lulás.)',
         'lulás\n.\n)\n',
         'sme'),
        ('numeral_followed_by_dot_and_degree',
         '2007. °',
         '2007\n.\n°\n',
         'sme'),
        ('abbreviation_without_punctum',
         'St.meld. nr',
         'St.meld. nr\n',
         'nob'),
        ('bug1342',
         'mii jua mii lea',
         'mii\njua\nmii\nlea\n',
         'sme'),
        ('i_with_capital_letter_nob. Fixed in svn commit r61957',
         'i B i b',
         'i\nB\ni\nb\n',
         'nob'),
        ('i_dot_with_capital_letter_nob. Fixed in svn commit r64544',
         'norsk. Bare i. Bare',
         'norsk\n.\nBare\ni\n.\nBare\n',
         'nob'),
        ('colon. http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=1482',
         'L.H:',
         'L.\nH\n:\n',
         'sme'),
        ('alphabetical_word_with_three_and_more_dots_in_front 1',
         '.....ape',
         '...\nape\n',
         'nob'),
        ('alphabetical_word_with_three_and_more_dots_in_front 2',
         '....ape',
         '...\nape\n',
         'nob'),
        ('alphabetical_word_with_three_and_more_dots_in_front 3',
         '...ape',
         '...\nape\n',
         'nob'),
        ('numerical_word_with_three_and_more_dots_in_front 1',
         '.....123',
         '...\n123\n',
         'nob'),
        ('numerical_word_with_three_and_more_dots_in_front 2',
         '....123',
         '...\n123\n',
         'nob'),
        ('numerical_word_with_three_and_more_dots_in_front 3',
         '...123',
         '...\n123\n',
         'nob'),
        ('alphabetical_word_with_two_and_less_dots_in_front 1',
         '..ape',
         '.\n.\nape\n',
         'nob'),
        ('alphabetical_word_with_two_and_less_dots_in_front 2',
         '.ape',
         '.\nape\n',
         'nob'),
        ('numerical_word_with_two_and_less_dots_in_front 1',
         '..123',
         '.\n.\n123\n',
         'nob'),
        ('numerical_word_with_two_and_less_dots_in_front 2',
         '.123',
         '.\n123\n',
         'nob'),
        ('three_or_more_dots 1',
         '.....',
         '...\n',
         'nob'),
        ('three_or_more_dots 2',
         '...',
         '...\n',
         'nob'),
        ('two or less dots 1',
         '..',
         '.\n.\n',
         'nob'),
        ('two or less dots 2',
         '.',
         '.\n',
         'nob'),
        ('word_with_leading_hyphen',
         '-ape',
         '-\nape\n',
         'nob'),
        ('word ending with numbers 1',
         'ape12',
         'ape12\n',
         'nob'),
        ('word ending with numbers 2',
         '34ape12',
         '34ape12\n',
         'nob'),
        ('word starting with numbers 1',
         '34ape',
         '34\nape\n',
         'nob'),
        ('word starting with numbers 2',
         '34s',
         '34\ns\n',
         'nob'),
        ('word starting with numbers 3',
         '34:s',
         '34:s\n',
         'nob'),
        ('word starting with numbers 4',
         "34's",
         "34's\n",
         'nob'),
        ('double_newline_in_input',
         'abc\n\ndef',
         'abc\n¶\ndef\n',
         'sme'),
        ('comma_inside_mwe_is_moved_behind_mwe 1',
         'feara, mii eale',
         'feara\n,\nmii\neale\n',
         'sme'),
        ('comma_inside_mwe_is_moved_behind_mwe 2',
         'feara mii, eale',
         'feara mii\n,\neale\n',
         'sme'),
        ('capital_first_letter_in_mwe 1 '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=1296#c31',
         'Feara mii',
         'Feara mii\n',
         'sme'),
        ('capital_first_letter_in_mwe 2 '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=1296#c31',
         'feara mii',
         'feara mii\n',
         'sme'),
        ('capital_first_letter_in_second word of_mwe 1 '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=1296#c31',
         'Min áigge',
         'Min\náigge\n',
         'sme'),
        ('capital_first_letter_in_second word of_mwe 2 '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=1296#c31',
         'Min Áigge',
         'Min Áigge\n',
         'sme'),
        ('paragraph_sign',
         'guoli. ¶',
         'guoli\n.\n¶\n',
         'sme'),
        ('parenthesis with no leading space '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=2228',
         'ja(20)',
         'ja\n(\n20\n)\n',
         'sme'),
        ('parenthesis with leading space '
         'http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=2228',
         'ja (20)',
         'ja\n(\n20\n)\n',
         'sme'),
    ])
    def test_preprocess(self, test_name, test_expression, want_expression,
                        language):
        """Compare the output of preprocess with the expected output.

        Args:
            test_name (str): name of the test; not used in the body, it
                is the leading element of each parameter tuple.
            test_expression (str): input to preprocess.
            want_expression (str): the string preprocess is expected
                to return.
            language (str): language whose abbr file is handed to
                preprocess.

        Raises:
            AssertionError: if the output from preprocess is not equal
                to the expected output.
        """
        self.assertEqual(
            self.get_preprocess_result(test_expression, language),
            want_expression)


if __name__ == "__main__":
    unittest.main()