'
df = DocumentFixer(origDoc)
df.setWordCount()
self.assertXmlEqual(etree.tostring(df.etree), expectedDoc)
class DocumentFixer:
    """
    Clean up a converted document.

    The document is kept in self.etree and is modified in place: known
    artefacts in the paragraph text are replaced, the encoding of the body
    is repaired, quotes are wrapped in span elements and the word count in
    the header is updated.
    """

    # Artefact -> replacement, applied to the text of every p element.
    # The fi/fl/ff/ffi/ffl keys are written as escapes of the ligature code
    # points: with plain letters as keys the entries are identity mappings
    # whose only effect is to swallow the space after every "fi", "fl", ...
    _replacements = {
        u"[dstrok]": u"đ",
        u"[Dstrok]": u"Đ",
        u"[tstrok]": u"ŧ",
        u"[Tstrok]": u"Ŧ",
        u"[scaron]": u"š",
        u"[Scaron]": u"Š",
        u"[zcaron]": u"ž",
        u"[Zcaron]": u"Ž",
        u"[ccaron]": u"č",
        u"[Ccaron]": u"Č",
        u"[eng": u"ŋ",
        " ]": "",
        u"Ď": u"đ", # cough
        u"ď": u"đ", # cough
        "\x03": "",
        "\x04": "",
        "\x07": "",
        "\x08": "",
        "\x0F": "",
        "\x10": "",
        "\x11": "",
        "\x13": "",
        "\x14": "",
        "\x15": "",
        "\x17": "",
        "\x18": "",
        "\x1A": "",
        "\x1B": "",
        "\x1C": "",
        "\x1D": "",
        "\x1E": "",
        u"\ufb01": "fi",
        u"\ufb02": "fl",
        u"\ufb00": "ff",
        u"\ufb03": "ffi",
        u"\ufb04": "ffl",
        u"ſt": "ft",
    }

    # One pattern per supported pair of quotation marks.
    _quoteRegexes = (
        re.compile('".+?"'),
        re.compile(u'«.+?»'),
        re.compile(u'“.+?”'),
    )

    # Header elements after which a new wordcount element may be placed;
    # the first one that exists in the header is used.
    _wordcountAnchors = ('collection', 'publChannel', 'place', 'year',
                         'translated_from', 'translator', 'author')

    def __init__(self, document):
        """Store document, the tree that the other methods work on."""
        self.etree = document

    def replaceLigatures(self):
        """
        Replace the artefacts listed in _replacements in the text of every
        p element. An artefact followed by a space is replaced together
        with that space.
        """
        for element in self.etree.iter('p'):
            if element.text:
                text = element.text
                for key, value in self._replacements.items():
                    text = text.replace(key + ' ', value)
                    text = text.replace(key, value)
                element.text = text

    def fixBodyEncoding(self):
        """
        Send a stringified version of the body into the EncodingGuesser class.
        It returns the same version, but with fixed characters.
        Parse the returned string, insert it into the document

        Ligatures are replaced before and quotes are detected after the
        encoding fix. Returns a new tree parsed from the serialised
        self.etree.
        """
        self.replaceLigatures()

        if isinstance(self.etree, etree._XSLTResultTree):
            sys.stderr.write("xslt!\n")

        body = self.etree.find('body')
        bodyString = etree.tostring(body, encoding='utf-8')
        body.getparent().remove(body)

        eg = decode.EncodingGuesser()
        encoding = eg.guessBodyEncoding(bodyString)

        body = etree.fromstring(eg.decodePara(encoding, bodyString))
        self.etree.append(body)

        self.detectQuotes()

        return etree.parse(io.BytesIO(etree.tostring(self.etree)))

    def _splitQuotes(self, text):
        """Split text at its quotes.

        Returns a tuple (leading, spans): leading is the text in front of
        the first quote and spans is a list of span elements of type
        "quote", each carrying the text up to the next quote as its tail.
        Without quotes the result is (text, []).
        """
        positions = []
        for quoteRegex in self._quoteRegexes:
            positions.extend(m.span() for m in quoteRegex.finditer(text))
        if not positions:
            return text, []

        positions.sort()
        spans = []
        for x, (start, end) in enumerate(positions):
            span = etree.Element('span')
            span.set('type', 'quote')
            span.text = text[start:end]
            if x + 1 < len(positions):
                span.tail = text[end:positions[x + 1][0]]
            else:
                span.tail = text[end:]
            spans.append(span)

        return text[0:positions[0][0]], spans

    def detectQuote(self, element):
        """Detect quotes in an etree element.

        The element is emptied and rebuilt from a copy of itself: quotes
        found in its text and in the tails of its children become span
        elements of type "quote"; the children are processed recursively.
        Returns the (modified) element.
        """
        newelement = deepcopy(element)

        element.text = ''
        # Iterate over a copy, the children are removed along the way.
        for child in list(element):
            element.remove(child)

        text = newelement.text
        if text:
            element.text, spans = self._splitQuotes(text)
            for span in spans:
                element.append(span)

        for child in list(newelement):
            element.append(self.detectQuote(child))

            if child.tail:
                # Quotes of the tail follow the child as its siblings.
                child.tail, spans = self._splitQuotes(child.tail)
                for span in spans:
                    element.append(span)

        return element

    def detectQuotes(self):
        """Detect quotes in all paragraphs
        """
        # The paragraphs are collected first, detectQuote rebuilds them.
        for paragraph in list(self.etree.iter('p')):
            self.detectQuote(paragraph)

    def _insertWordcount(self):
        """Create a wordcount element in the header and return it.

        It is put right after the first element of _wordcountAnchors that
        exists in the header. Raises ConversionException when there is no
        header or none of the anchors, instead of failing later on None.
        """
        header = self.etree.find('header')
        if header is None:
            raise ConversionException('no header to put the wordcount in')

        for tag in self._wordcountAnchors:
            found = header.find(tag)
            if found is not None:
                wordcount = etree.Element('wordcount')
                header.insert(list(header).index(found) + 1, wordcount)
                return wordcount

        raise ConversionException(
            'found no element in the header to put the wordcount after')

    def setWordCount(self):
        """Count the words in the file

        The words of all p elements are counted and the number is written
        to header/wordcount, which is created if it does not exist.
        """
        # tostring returns encoded text, hence the byte string literals.
        plist = [etree.tostring(paragraph, method='text', encoding='utf8')
                 for paragraph in self.etree.iter('p')]
        words = len(re.findall(br'\S+', b' '.join(plist)))

        wordcount = self.etree.find('header/wordcount')
        if wordcount is None:
            wordcount = self._insertWordcount()

        wordcount.text = str(words)
class TestXslMaker(unittest.TestCase):
    def assertXmlEqual(self, got, want):
        """Raise AssertionError with a diff unless got matches want.

        Both arguments are stringified xml snippets.
        """
        checker = doctestcompare.LXMLOutputChecker()
        if checker.check_output(want, got, 0):
            return

        example = doctest.Example("", want)
        difference = checker.output_difference(example, got, 0)
        raise AssertionError(difference.encode('utf-8'))

    def testGetXsl(self):
        # The stylesheet made from the article's xsl file must equal the
        # stored reference stylesheet.
        got = XslMaker('parallelize_data/samediggi-article-48.html.xsl').getXsl()
        want = etree.parse('parallelize_data/test.xsl')

        self.assertXmlEqual(etree.tostring(got), etree.tostring(want))
class XslMaker:
    """
    To convert the intermediate xml to a fullfledged giellatekno document
    a combination of three xsl files + the intermediate files is needed
    This class makes the xsl file
    """

    def __init__(self, xslfile):
        """Build the final stylesheet from xslfile.

        preprocxsl.xsl and common.xsl are looked up below the directory
        named by the GTHOME environment variable.

        Raises ConversionException if xslfile is not well formed; the
        parser's error log is then written to xslfile + '.log'.
        """
        preprocessXsl = etree.parse(
            os.path.join(os.getenv('GTHOME'),
                         'gt/script/corpus/preprocxsl.xsl'))
        preprocessXslTransformer = etree.XSLT(preprocessXsl)

        self.filename = xslfile
        try:
            filexsl = etree.parse(xslfile)
        except etree.XMLSyntaxError as e:
            # The with statement closes the log even if a write fails.
            with open(self.filename + '.log', 'w') as logfile:
                for entry in e.error_log:
                    logfile.write(str(entry))
                    logfile.write('\n')

            raise ConversionException("Syntax error in " + self.filename)

        commonXsl = 'file://' + os.path.join(os.getenv('GTHOME'),
                                             'gt/script/corpus/common.xsl')
        self.finalXsl = preprocessXslTransformer(
            filexsl, commonxsl=etree.XSLT.strparam(commonXsl))

    def getXsl(self):
        """Return the stylesheet made in __init__."""
        return self.finalXsl
class TestLanguageDetector(unittest.TestCase):
"""
Test the functionality of LanguageDetector
"""
def setUp(self):
    # Every test gets a freshly parsed copy of the document that still
    # has its multilingual tag.
    fixture = 'parallelize_data/samediggi-article-48s-before-lang-detection-with-multilingual-tag.xml'
    self.document = etree.parse(fixture)
def assertXmlEqual(self, got, want):
    """Compare two stringified xml snippets.

    Raises AssertionError carrying the difference if they are not equal.
    """
    checker = doctestcompare.LXMLOutputChecker()
    if checker.check_output(want, got, 0):
        return

    wanted = doctest.Example("", want)
    raise AssertionError(
        checker.output_difference(wanted, got, 0).encode('utf-8'))
def testGetMainLang(self):
    # getMainlang() of a detector made from self.document must be sme.
    detector = LanguageDetector(self.document)
    self.assertEqual('sme', detector.getMainlang())
def testSetParagraphLanguageMainlanguage(self):
# NOTE(review): the literals below are not valid Python as they stand.
# Each single quoted literal runs over several lines, and
# expectedParagraph is assigned four times before it is used.
# Presumably this is what is left of several test methods whose xml
# markup and surrounding code got lost -- TODO restore them from the
# version history.
origParagraph = '
Sámegiella lea 2004 čavčča rájes standárda giellaválga Microsofta operatiivavuogádagas Windows XP. Dat mearkkaša ahte sámegiel bustávaid ja hámiid sáhttá válljet buot prográmmain. Buot leat dás dán fitnodaga Service Pack 2-páhkas, maid ferte viežžat ja bidjat dihtorii. Boađus lea ahte buot boahttevaš Microsoft prográmmat dorjot sámegiela. Dattetge sáhttet deaividit váttisvuođat go čálát sámegiela Outlook-kaleandaris dahje e-poastta namahussajis, ja go čálát sámegillii dakkár prográmmain, maid Microsoft ii leat ráhkadan.
'
expectedParagraph = '
Sámegiella lea 2004 čavčča rájes standárda giellaválga Microsofta operatiivavuogádagas Windows XP. Dat mearkkaša ahte sámegiel bustávaid ja hámiid sáhttá válljet buot prográmmain. Buot leat dás dán fitnodaga Service Pack 2-páhkas, maid ferte viežžat ja bidjat dihtorii. Boađus lea ahte buot boahttevaš Microsoft prográmmat dorjot sámegiela. Dattetge sáhttet deaividit váttisvuođat go čálát sámegiela Outlook-kaleandaris dahje e-poastta namahussajis, ja go čálát sámegillii dakkár prográmmain, maid Microsoft ii leat ráhkadan.
Sámegiella lea 2004 čavčča rájes standárda giellaválga Microsofta operatiivavuogádagas Windows XP. Dat mearkkaša ahte sámegiel bustávaid ja hámiid sáhttá válljet buot prográmmain. «Alt finnes i den foreliggende Service Pack 2 fra selskapet, som må lastes ned og installeres på din datamaskin. Konsekvensen er at all framtidig programvare fra Microsoft vil inneholde støtte for samisk». Boađus lea ahte buot boahttevaš Microsoft prográmmat dorjot sámegiela. Dattetge sáhttet deaividit váttisvuođat go čálát sámegiela Outlook-kaleandaris dahje e-poastta namahussajis, ja go čálát sámegillii dakkár prográmmain, maid Microsoft ii leat ráhkadan.
'
expectedParagraph = '
Sámegiella lea 2004 čavčča rájes standárda giellaválga Microsofta operatiivavuogádagas Windows XP. Dat mearkkaša ahte sámegiel bustávaid ja hámiid sáhttá válljet buot prográmmain. «Alt finnes i den foreliggende Service Pack 2 fra selskapet, som må lastes ned og installeres på din datamaskin. Konsekvensen er at all framtidig programvare fra Microsoft vil inneholde støtte for samisk». Boađus lea ahte buot boahttevaš Microsoft prográmmat dorjot sámegiela. Dattetge sáhttet deaividit váttisvuođat go čálát sámegiela Outlook-kaleandaris dahje e-poastta namahussajis, ja go čálát sámegillii dakkár prográmmain, maid Microsoft ii leat ráhkadan.
Samisk er fra høsten 2004 et standard språkvalg Microsofts operativsystem Windows XP. I praksis betyr det at samiske bokstaver og formater kan velges i alle programmer. Alt finnes i den foreliggende Service Pack 2 fra selskapet, som må lastes ned og installeres på din datamaskin. Konsekvensen er at all framtidig programvare fra Microsoft vil inneholde støtte for samisk. Du vil imidlertid fremdeles kunne oppleve problemer med å skrive samisk i Outlook-kalenderen eller i tittel-feltet i e-post, og med å skrive samisk i programmer levert av andre enn Microsoft.
'
expectedParagraph = '
Samisk er fra høsten 2004 et standard språkvalg Microsofts operativsystem Windows XP. I praksis betyr det at samiske bokstaver og formater kan velges i alle programmer. Alt finnes i den foreliggende Service Pack 2 fra selskapet, som må lastes ned og installeres på din datamaskin. Konsekvensen er at all framtidig programvare fra Microsoft vil inneholde støtte for samisk. Du vil imidlertid fremdeles kunne oppleve problemer med å skrive samisk i Outlook-kalenderen eller i tittel-feltet i e-post, og med å skrive samisk i programmer levert av andre enn Microsoft.
bla bla bla1 bla ble ble bla2 blabli bli bla3 bla blo blo
'
expectedParagraph = 'bla bla ble ble bli bli blo blo'
# The text that removeQuote() returns for the parsed origParagraph is
# compared with the last value assigned to expectedParagraph.
ld = LanguageDetector(self.document)
gotParagraph = ld.removeQuote(etree.fromstring(origParagraph))
self.assertEqual(gotParagraph, expectedParagraph)
def testDetectLanguageWithMultilingualtag(self):
    # Language detection on the stored "before" document must give the
    # stored "after" document.
    before = etree.parse('parallelize_data/samediggi-article-48s-before-lang-detection-with-multilingual-tag.xml')
    detector = LanguageDetector(before)
    detector.detectLanguage()
    got = etree.tostring(detector.getDocument())

    after = etree.parse('parallelize_data/samediggi-article-48s-after-lang-detection-with-multilingual-tag.xml')
    self.assertXmlEqual(got, etree.tostring(after))
def testDetectLanguageWithoutMultilingualtag(self):
    # Same comparison as for the document with a multilingual tag, with
    # the pair of stored documents that lack the tag.
    before = etree.parse('parallelize_data/samediggi-article-48s-before-lang-detection-without-multilingual-tag.xml')
    detector = LanguageDetector(before)
    detector.detectLanguage()
    got = etree.tostring(detector.getDocument())

    after = etree.parse('parallelize_data/samediggi-article-48s-after-lang-detection-without-multilingual-tag.xml')
    self.assertXmlEqual(got, etree.tostring(after))
class LanguageDetector:
    """
    Receive an etree.
    Detect the languages of quotes.
    Detect the languages of the paragraphs.
    """

    # Name of the xml:lang attribute the way etree wants it.
    _XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'

    def __init__(self, document):
        """Read the languages of document and set up the language guesser.

        The main language is the xml:lang attribute of the root element
        (KeyError if it is missing). The languages listed in
        header/multilingual/language, plus the main language, are handed
        to the guesser.

        Raises ConversionException if languages are listed but the main
        language is empty.
        """
        self.document = document
        self.mainlang = self.document.getroot().attrib[self._XML_LANG]

        inlangs = [language.get(self._XML_LANG) for language in
                   self.document.findall('header/multilingual/language')]
        if inlangs:
            if self.mainlang == '':
                raise ConversionException('mainlang not set')
            inlangs.append(self.mainlang)

        self.languageGuesser = ngram.NGram(os.path.join(os.getenv('GTHOME'), 'tools/lang-guesser/LM/'), langs=inlangs)

    def getDocument(self):
        """Return the document given to __init__."""
        return self.document

    def getMainlang(self):
        """
        Get the mainlang of the file
        """
        return self.mainlang

    def setParagraphLanguage(self, paragraph):
        """Extract the text outside the quotes, use this text to set language of
        the paragraph.
        Set the language of the quotes in the paragraph

        xml:lang is only set where the guessed language differs from the
        main language. Returns the paragraph.
        """
        paragraphText = self.removeQuote(paragraph)
        lang = self.languageGuesser.classify(paragraphText.encode("ascii", "ignore"))
        if lang != self.getMainlang():
            paragraph.set(self._XML_LANG, lang)

        for element in paragraph.iter("span"):
            # A quote without text has nothing to classify.
            if element.get("type") == "quote" and element.text is not None:
                lang = self.languageGuesser.classify(element.text.encode("ascii", "ignore"))
                if lang != self.getMainlang():
                    element.set(self._XML_LANG, lang)

        return paragraph

    def removeQuote(self, paragraph):
        """Extract all text except the one inside quote spans.

        The tail of a quote span is text that follows the quote, so it
        is kept.
        """
        parts = []
        for element in paragraph.iter():
            isQuote = (element.tag == 'span' and
                       element.get('type') == 'quote')
            # The text of a quote is dropped whether it has a tail or not.
            if not isQuote and element.text is not None:
                parts.append(element.text)
            if element.tail is not None:
                parts.append(element.tail)

        return ''.join(parts)

    def detectLanguage(self):
        """Detect language in all the paragraphs in self.document

        Nothing is done if the document has no header/multilingual element.
        """
        if self.document.find('header/multilingual') is not None:
            for paragraph in self.document.iter('p'):
                self.setParagraphLanguage(paragraph)
class TestDocumentTester(unittest.TestCase):
def setUp(self):
    """Nothing to prepare, the tests make their own documents."""
def assertXmlEqual(self, got, want):
    """Check that the stringified xml snippet got equals want.

    An AssertionError containing the difference is raised otherwise.
    """
    checker = doctestcompare.LXMLOutputChecker()
    equal = checker.check_output(want, got, 0)
    if not equal:
        expected = doctest.Example("", want)
        message = checker.output_difference(expected, got, 0)
        raise AssertionError(message.encode('utf-8'))
def testRemoveForeignLanguage1(self):
# NOTE(review): the single quoted literal handed to io.BytesIO runs over
# several lines and holds plain text only, so this is not valid Python
# and there is no markup for etree.parse to read. Presumably the xml
# tags of the test document got lost -- TODO restore it from the
# version history.
origDoc = etree.parse(io.BytesIO('
Sámegiellaqw leaqw 2004 čavččaqw rájesqw standárdaqw giellaválgaqw Microsoftaqw qwoperatiivavuogádagas qwWindows qwXP. qwDat mearkkaša ahte sámegiel bustávaid ja hámiid sáhttá válljet buot prográmmain. «Alt finnes i den foreliggende Service Pack 2 fra selskapet, som må lastes ned og installeres på din datamaskin. Konsekvensen er at all framtidig programvare fra Microsoft vil inneholde støtte for samisk». Boađus lea ahte buot boahttevaš Microsoft prográmmat dorjot sámegiela. Dattetge sáhttet deaividit váttisvuođat go čálát sámegiela Outlook-kaleandaris dahje e-poastta namahussajis, ja go čálát sámegillii dakkár prográmmain, maid Microsoft ii leat ráhkadan.
'))
# The ratio of unknown words is cut to one decimal (rounded down) and
# compared with 0.2 treated the same way.
dt = DocumentTester(origDoc)
self.assertEqual(decimal.Decimal(dt.getUnknownWordsRatio()).quantize(decimal.Decimal('.1'), rounding=decimal.ROUND_DOWN) , decimal.Decimal('0.2').quantize(decimal.Decimal('.1'), rounding=decimal.ROUND_DOWN))
class DocumentTester:
    """
    Measure how much of a document is written in its main language and
    how many of these words the analyser of that language does not know.
    """

    # Name of the xml:lang attribute the way etree wants it.
    _XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'

    def __init__(self, document):
        """Store document and remove the text marked as foreign from it.

        The main language is the xml:lang attribute of the root element
        (KeyError if it is missing). document is modified in place.
        """
        self.document = document
        self.mainlang = self.document.getroot().attrib[self._XML_LANG]
        self.removeForeignLanguage()

    def getMainlang(self):
        """
        Get the mainlang of the file
        """
        return self.mainlang

    def getMainlangWordcount(self):
        """Return the number of words in the text of the p elements."""
        # getMainlangWords returns encoded text, hence the byte pattern.
        return len(re.findall(br'\S+', self.getMainlangWords()))

    def getUnknownWordsRatio(self):
        """Return unknown words divided by main language words.

        Raises ZeroDivisionError if there are no main language words.
        """
        return 1.0 * self.getUnknownWordcount() / self.getMainlangWordcount()

    def getMainlangRatio(self):
        """Return main language words divided by header/wordcount."""
        return 1.0 * self.getMainlangWordcount() / float(self.document.find('header/wordcount').text)

    def removeForeignLanguage(self):
        """Remove text mark as not belonging to mainlang
        First remove foreign language in quotes
        Then look for paragraphs with foreign language
        If they contain quotes in the original language, set that as the text
        of the paragraph and remove the xml:lang attribute
        If it contains only foreign text, remove the whole paragraph
        """
        for span in self.document.xpath('//span[@xml:lang]'):
            span.text = ''

        for paragraph in self.document.xpath('//p[@xml:lang]'):
            paragraph.text = ''
            # Decided for each paragraph on its own; a hit in an earlier
            # paragraph must not save a later one.
            hit = False
            mainlangQuotes = []
            # './/' keeps the search inside this paragraph, '//' would
            # search the whole document.
            for span in paragraph.xpath('.//span[@type="quote"]'):
                # Quotes without xml:lang are in the main language.
                if span.get(self._XML_LANG) is None:
                    hit = True
                    mainlangQuotes.append(span.text or '')
                span.getparent().remove(span)

            if hit:
                paragraph.text = ''.join(mainlangQuotes)
                del paragraph.attrib[self._XML_LANG]
            else:
                paragraph.getparent().remove(paragraph)

    def getUnknownWordcount(self):
        """Return the number of words that lookup does not recognise.

        The preprocessed text is sent through the external lookup program
        with the analyser of the main language; output tokens containing
        '+?' are counted.

        Raises ConversionException if lookup exits with an error.
        """
        lang = self.getMainlang()
        lookupCommand = ['lookup', '-flags', 'mbTT']
        if lang == 'sme':
            lookupCommand.append(os.getenv('GTHOME') + '/gt/' + lang + '/bin/' + lang + '.fst')
        else:
            lookupCommand.append(os.getenv('GTHOME') + '/langs/' + lang + '/src/analyser-gt-desc.xfst')

        subp = subprocess.Popen(lookupCommand, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (output, error) = subp.communicate(self.getPreprocessedMainlangWords())

        if subp.returncode != 0:
            sys.stderr.write('Could not lookup text\n')
            sys.stderr.write(output + b'\n')
            raise ConversionException(error)

        return sum(1 for line in output.split() if b'+?' in line)

    def getPreprocessedMainlangWords(self):
        """Send the text into preprocess, return the result.

        Newlines in the text are turned into spaces first. For sme the
        abbr.txt and corr.txt files below GTHOME are handed to preprocess.

        Raises ConversionException if preprocess exits with an error.
        """
        if self.getMainlang() == 'sme':
            abbrFile = os.path.join(os.environ['GTHOME'], 'gt/sme/bin/abbr.txt')
            corrFile = os.path.join(os.environ['GTHOME'], 'gt/sme/bin/corr.txt')
            preprocessCommand = ['preprocess', '--abbr=' + abbrFile, '--corr=' + corrFile]
        else:
            preprocessCommand = ['preprocess']

        subp = subprocess.Popen(preprocessCommand, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (output, error) = subp.communicate(self.getMainlangWords().replace(b'\n', b' '))

        if subp.returncode != 0:
            sys.stderr.write(output + b'\n')
            sys.stderr.write(error + b'\n')
            raise ConversionException('Could not preprocess text')

        return output

    def getMainlangWords(self):
        """Return the utf8 encoded text of all p elements, joined by spaces."""
        plist = [etree.tostring(paragraph, method='text', encoding='utf8')
                 for paragraph in self.document.iter('p')]

        return b' '.join(plist)