#!/usr/bin/env python
# -*- coding: utf-8 -*-
import unittest
import lxml.doctestcompare
import lxml.etree
import doctest
import os
import sys
import argparse
import difflib
import parallelize
class TestParallelFile(unittest.TestCase):
"""
A test class for the ParallelFile class
"""
def setUp(self):
self.pfile = parallelize.ParallelFile()
def testName(self):
self.pfile.setName("test")
self.assertEqual(self.pfile.getName(), "test")
def testLang(self):
self.pfile.setLang("sme")
self.assertEqual(self.pfile.getLang(), "sme")
class TestParallelize(unittest.TestCase):
"""
A test class for the Parallelize module
"""
def setUp(self):
self.parallelize = parallelize.Parallelize(os.environ['GTFREE'] + "/prestable/converted/sme/facta/skuvlahistorja2/aarseth2-s.htm.xml", "nob")
def tuToString(self, tu):
"""
Extract the two strings of a tu element
"""
string = ""
try:
string = string + tu[0][0].text.strip()
except(AttributeError):
pass
string = string + "\t"
try:
string = string + tu[1][0].text.strip()
except(AttributeError):
pass
return string.encode('utf-8')
def tmxToStringlist(self, tmx):
"""
Extract all string pairs in a tmx to a list of strings
"""
all_tu = tmx.findall('.//tu')
strings = []
for tu in all_tu:
strings.append(self.tuToString(tu))
return strings
def diffOfTmxes(self, gotTmx, wantTmx):
"""
Find the diff of two tmx elements
"""
gotStrings = self.tmxToStringlist(gotTmx)
wantStrings = self.tmxToStringlist(wantTmx)
return difflib.context_diff(gotStrings, wantStrings, n = 0)
def writeDiffFile(self, contextDiff):
"""
Given a context_diff, write it and some info to a file
"""
diff = []
numDiffs = -1
numDiffLines = 0
for line in contextDiff:
if line[:3] == '---':
numDiffs += 1
if line[:1] == '!':
numDiffLines += 1
diff.append(line + '\n')
f = open('diff.txt', 'w')
f.write('Number of diffs ' + str(numDiffs) + '\n')
f.write('Number of lines in the diff ' + str(numDiffLines) + '\n\n')
f.writelines(diff)
f.close()
def assertXmlEqual(self, got, want):
"""
Check if two xml snippets are equal
"""
string_got = lxml.etree.tostring(got, pretty_print = True)
string_want = lxml.etree.tostring(want, pretty_print = True)
checker = lxml.doctestcompare.LXMLOutputChecker()
if not checker.check_output(string_got, string_want, 0):
self.writeDiffFile(self.diffOfTmxes(got, want))
raise AssertionError("xml equal failed")
def testFindParallelFilename(self):
self.assertEqual(self.parallelize.findParallelFilename(), 'aarseth2-n.htm')
def testOrigPath(self):
self.assertEqual(self.parallelize.getorigfile1(), os.environ['GTFREE'] + "/prestable/converted/sme/facta/skuvlahistorja2/aarseth2-s.htm.xml")
def testParallelPath(self):
self.assertEqual(self.parallelize.getorigfile2(), os.environ['GTFREE'] + "/prestable/converted/nob/facta/skuvlahistorja2/aarseth2-n.htm.xml")
def testLang1(self):
self.assertEqual(self.parallelize.getlang1(), "sme")
def testLang2(self):
self.assertEqual(self.parallelize.getlang2(), "nob")
def testGetSentFilename(self):
self.assertEqual(self.parallelize.getSentFilename(self.parallelize.getorigfile1()), os.environ['GTFREE'] + "/tmp/aarseth2-s.htm_sent.xml")
def testMakeTu(self):
line1 = 'ubba gubba. ibba gibba.'
line2 = 'abba gabba. ebba gebba.'
gotTu = self.parallelize.makeTu(line1, line2)
wantTu = lxml.etree.XML('ubba gubba. ibba gibba.abba gabba. ebba gebba.')
self.assertXmlEqual(gotTu, wantTu)
def testMakeTuv(self):
line = 'ubba gubba. ibba gibba.'
lang = 'smi'
gotTuv = self.parallelize.makeTuv(line, lang)
wantTuv = lxml.etree.XML('ubba gubba. ibba gibba.')
self.assertXmlEqual(gotTuv, wantTuv)
def testMakeTmxHeader(self):
lang = 'smi'
gotTuv = self.parallelize.makeTmxHeader(lang)
wantTuv = lxml.etree.XML('')
self.assertXmlEqual(gotTuv, wantTuv)
def testRemoveSTag(self):
got = self.parallelize.removeSTag('ubba gubba. ibba gibba.')
want = 'ubba gubba. ibba gibba.'
self.assertEqual(got, want)
def testDividePIntoSentences(self):
self.assertEqual(self.parallelize.dividePIntoSentences(), 0)
def testParallizeFiles(self):
self.assertEqual(self.parallelize.parallelizeFiles(), 0)
def testPrintTmxFile(self):
got = lxml.etree.parse("aarseth2-s.htm.tmx")
want = lxml.etree.parse(self.parallelize.printTmxFile(self.parallelize.makeTmx()))
self.assertXmlEqual(got, want)
def testGoldstandard(self):
goldstandard = {}
goldstandard['/prestable/tmx/goldstandard/nob2sme/samisk_strategiplan_samisk.doc.tmx'] = '/prestable/converted/sme/admin/others/samisk_strategiplan_samisk.doc.xml'
goldstandard['/prestable/tmx/goldstandard/nob2sme/dc_05_1.doc.tmx'] = 'prestable/converted/sme/admin/sd/other_files/dc_05_1.doc.xml'
goldstandard['/prestable/tmx/goldstandard/nob2sme/finnmarkkulahka_web_lettere.pdf.tmx'] = 'prestable/converted/sme/laws/other_files/finnmarkkulahka_web_lettere.pdf.xml'
for tmxFile, xmlFile in goldstandard.items():
self.parallelize = parallelize.Parallelize(os.environ['GTFREE'] + "/" + xmlFile, 'nob')
self.parallelize.dividePIntoSentences()
self.parallelize.parallelizeFiles()
got = lxml.etree.parse(self.parallelize.printTmxFile(self.parallelize.makeTmx()))
want = lxml.etree.parse(os.environ['GTFREE'] + "/" + tmxFile)
self.assertXmlEqual(got, want)
def lightTests():
independentSuite = unittest.TestSuite()
independentSuite.addTest(TestParallelize("testRemoveSTag"))
independentSuite.addTest(TestParallelize("testMakeTmxHeader"))
independentSuite.addTest(TestParallelize("testMakeTuv"))
independentSuite.addTest(TestParallelize("testMakeTu"))
independentSuite.addTest(TestParallelize("testGetSentFilename"))
independentSuite.addTest(TestParallelize("testLang1"))
independentSuite.addTest(TestParallelize("testLang2"))
independentSuite.addTest(TestParallelize("testParallelPath"))
independentSuite.addTest(TestParallelize("testOrigPath"))
independentSuite.addTest(TestParallelize("testFindParallelFilename"))
return independentSuite
def defaultChainTest():
chainTestSuite = unittest.TestSuite()
chainTestSuite.addTest(TestParallelize("testDividePIntoSentences"))
chainTestSuite.addTest(TestParallelize("testParallizeFiles"))
chainTestSuite.addTest(TestParallelize("testPrintTmxFile"))
return chainTestSuite
def customChainTest():
customSuite = unittest.TestSuite()
customSuite.addTest(TestParallelize("testGoldstandard"))
return customSuite
if __name__ == '__main__':
parser = argparse.ArgumentParser(description = 'Test various parts of the alignment process')
parser.add_argument('-g', '--goldstandard', dest = 'goldstandard', help = 'Check if the current aligner pipeline agrees with the goldstandard docs', action = 'store_true')
args = parser.parse_args()
if args.goldstandard:
unittest.TextTestRunner().run(customChainTest())
else:
unittest.TextTestRunner().run(lightTests())
unittest.TextTestRunner().run(defaultChainTest())