#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
#   This is a program to pick out parallel files to prestable/converted
#   inside a corpus directory
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this file. If not, see <http://www.gnu.org/licenses/>.
#
#   Copyright 2012 Børre Gaup
#

from __future__ import print_function

import os
import sys
import argparse

# Only extend sys.path when GTHOME is set; concatenating None with a string
# would raise a TypeError before the more telling ImportError below.
_gthome = os.getenv('GTHOME')
if _gthome is not None:
    sys.path.append(_gthome + '/gt/script/langTools')
import parallelize

from lxml import etree
from lxml import doctestcompare
import doctest
import shutil
import inspect
import unittest


def PrintFrame(input="empty"):
    """
    Print debug output: the caller's line number and function name,
    followed by input.
    """
    # Index 0 of the stack is this function's own frame,
    # index 1 is the frame of the caller.
    callerframerecord = inspect.stack()[1]
    frame = callerframerecord[0]
    info = inspect.getframeinfo(frame)
    print(info.lineno, info.function, input)


class TestParallelPicker(unittest.TestCase):
    """
    Tests for ParallelPicker, run against the fakecorpus directory
    below $GTHOME/gt/script/langTools.
    """

    def setUp(self):
        self.language1ConvertedDir = os.path.join(
            os.environ['GTHOME'],
            'gt/script/langTools/fakecorpus/converted/sme')
        self.pp = ParallelPicker(self.language1ConvertedDir,
                                 'nob', '73', '110')

    def _prestableConvertedDir(self):
        """Return the prestable/converted/sme directory of the fakecorpus."""
        return os.path.join(
            os.environ['GTHOME'],
            'gt/script/langTools/fakecorpus/prestable/converted/sme')

    def _corpusFile(self, directory, basename):
        """Build a CorpusXMLFile for basename inside directory."""
        return parallelize.CorpusXMLFile(os.path.join(directory, basename),
                                         self.pp.getParallelLanguage())

    def testCalculateLanguage1(self):
        self.pp.calculateLanguage1(self.language1ConvertedDir)
        self.assertEqual(self.pp.getLanguage1(), 'sme')

    def testGetParallelLanguage(self):
        self.assertEqual(self.pp.getParallelLanguage(), 'nob')

    def testHasOrig(self):
        language1PrestableConvertedDir = self._prestableConvertedDir()

        fileWithOrig1 = self._corpusFile(self.language1ConvertedDir,
                                         'samediggi-article-47.html.xml')
        self.assertEqual(self.pp.hasOrig(fileWithOrig1), True)

        fileWithOrig2 = self._corpusFile(language1PrestableConvertedDir,
                                         'samediggi-article-47.html.xml')
        self.assertEqual(self.pp.hasOrig(fileWithOrig2), True)

        fileWithoutOrig1 = self._corpusFile(self.language1ConvertedDir,
                                            'samediggi-article-1.html.xml')
        self.assertEqual(self.pp.hasOrig(fileWithoutOrig1), False)

        fileWithoutOrig2 = self._corpusFile(language1PrestableConvertedDir,
                                            'samediggi-article-1.html.xml')
        self.assertEqual(self.pp.hasOrig(fileWithoutOrig2), False)

    def testHasParallel(self):
        language1PrestableConvertedDir = self._prestableConvertedDir()

        fileWithParallel1 = self._corpusFile(self.language1ConvertedDir,
                                             'samediggi-article-47.html.xml')
        self.assertEqual(self.pp.hasParallel(fileWithParallel1), True)

        fileWithParallel2 = self._corpusFile(language1PrestableConvertedDir,
                                             'samediggi-article-47.html.xml')
        self.assertEqual(self.pp.hasParallel(fileWithParallel2), True)

        fileWithoutParallel1 = self._corpusFile(
            self.language1ConvertedDir, 'samediggi-article-53.html.xml')
        self.assertEqual(self.pp.hasParallel(fileWithoutParallel1), False)

        fileWithoutParallel2 = self._corpusFile(
            language1PrestableConvertedDir, 'samediggi-article-53.html.xml')
        self.assertEqual(self.pp.hasParallel(fileWithoutParallel2), False)


class ParallelPicker:
    """
    Pick parallel files from a converted directory and copy them to the
    corresponding prestable/converted directory.

    The picker keeps one list per outcome (old files, files without an
    original, files without a parallel, pairs with a poor word ratio, ...)
    so that treatLists() can print a summary at the end of a run.
    """

    def __init__(self, language1Dir, parallelLanguage, minratio, maxratio):
        """
        language1Dir     -- directory holding the converted files of language1
        parallelLanguage -- the language to look for parallel files in
        minratio         -- lower bound (exclusive) of the word count ratio,
                            in percent; anything float() accepts
        maxratio         -- upper bound (exclusive) of the word count ratio,
                            in percent; anything float() accepts
        """
        self.language1Dir = language1Dir
        self.calculateLanguage1(language1Dir)
        self.parallelLanguage = parallelLanguage
        self.minratio = minratio
        self.maxratio = maxratio
        self.oldFiles = []
        self.noOrig = []
        self.noParallel = []
        self.poorRatio = []
        self.tooFewWords = []
        self.changedFiles = []
        self.noFilesTranslations = []
        self.bothFilesTranslated = []

    def calculateLanguage1(self, language1Dir):
        """
        Set self.language1 to the path component that follows the first
        'converted/' in language1Dir.

        NOTE: when language1Dir does not contain 'converted/', find()
        returns -1 and the slice below starts at an unrelated offset, so
        the result is meaningless for such input.
        """
        convertedPos = language1Dir.find('converted/')
        partAfterConverted = language1Dir[convertedPos + len('converted/'):]

        # Everything up to the first '/', or the whole rest if there is none
        self.language1 = partAfterConverted.split('/')[0]

    def getLanguage1(self):
        """Return the language computed by calculateLanguage1"""
        return self.language1

    def getParallelLanguage(self):
        """Return the parallel language given to the constructor"""
        return self.parallelLanguage

    def addOldfiles(self, filename):
        """
        Add a filename to the list of files that were in prestable
        before new files were added
        """
        self.oldFiles.append(filename)

    def addNoOrig(self, filename):
        """
        Add a filename to the list of files in prestable that had no
        original file before new files were added
        """
        self.noOrig.append(filename)

    def addNoParallel(self, filename):
        """
        Add a filename to the list of files in prestable that had no
        parallel file before new files were added
        """
        self.noParallel.append(filename)

    def addNoFilesTranslations(self, language1File, parallelFile):
        """
        Record a pair of files where neither file is marked as a
        translation of the other
        """
        self.noFilesTranslations.append(
            language1File.getName() + ' ,' + parallelFile.getName())

    def addBothFilesTranslated(self, language1File, parallelFile):
        """
        Record a pair of files where each file claims to be a translation
        of the other
        """
        self.bothFilesTranslated.append(
            language1File.getName() + ',' + parallelFile.getName())

    def removeFile(self, filename):
        """
        Remove the given file
        """
        os.remove(filename)

    def checkPrestableFile(self, corpusFile):
        """
        Remove a file and its parallel file from prestable if it has no
        orig file
        Remove a file from prestable if it has no parallel
        If not, add the file name to the list of old files
        """
        name = corpusFile.getName()

        if not self.hasOrig(corpusFile):
            self.addNoOrig(name)
            self.removeFile(name)
            if self.hasParallel(corpusFile):
                parallelName = corpusFile.getParallelFilename()
                self.addNoOrig(parallelName)
                self.removeFile(parallelName)
                # The parallel file may already have been registered as an
                # old file; it is gone now, so treatLists must not try to
                # remove it a second time.
                if parallelName in self.oldFiles:
                    self.oldFiles.remove(parallelName)
        elif not self.hasParallel(corpusFile):
            self.addNoParallel(name)
            self.removeFile(name)
        else:
            self.addOldfiles(name)

    def _xmlFilesIn(self, directory):
        """
        Yield the path of every file below directory whose name ends
        with '.xml'
        """
        for root, dirs, files in os.walk(directory):
            for f in files:
                if f.endswith('.xml'):
                    yield os.path.join(root, f)

    def getOldFileNames(self):
        """
        Check all the xml files in prestable for the language pair that
        is given to the program, first the language1 files, then the
        files of the parallel language
        """
        prestableDir = self.language1Dir.replace('converted/',
                                                 'prestable/converted/')
        for path in self._xmlFilesIn(prestableDir):
            corpusFile = parallelize.CorpusXMLFile(
                path, self.getParallelLanguage())
            self.checkPrestableFile(corpusFile)

        # The directory of the parallel language is found by swapping the
        # language code in the path of the language1 directory
        l2prestableDir = prestableDir.replace(
            '/' + self.getLanguage1(), '/' + self.getParallelLanguage())
        for path in self._xmlFilesIn(l2prestableDir):
            corpusFile = parallelize.CorpusXMLFile(path, self.getLanguage1())
            self.checkPrestableFile(corpusFile)

    def findLang1Files(self):
        """
        Return a list with a CorpusXMLFile for each xml file found below
        the language1 directory
        """
        return [parallelize.CorpusXMLFile(path, self.parallelLanguage)
                for path in self._xmlFilesIn(self.language1Dir)]

    def hasParallel(self, language1File):
        """
        Check if the given file has a parallel file
        """
        parallelName = language1File.getParallelFilename()
        return parallelName is not None and os.path.isfile(parallelName)

    def hasOrig(self, language1File):
        """
        Check if the given file has an original file
        """
        origName = language1File.getOriginalFilename()
        return origName is not None and os.path.isfile(origName)

    def hasSufficientWords(self, language1File, parallelFile):
        """
        Check if both files contain more words than the threshold (30).
        A pair that does not is added to the tooFewWords list.
        """
        if (language1File.getWordCount() is not None and
                float(language1File.getWordCount()) > 30 and
                parallelFile.getWordCount() is not None and
                float(parallelFile.getWordCount()) > 30):
            return True

        self.addTooFewWords(language1File.getName(), parallelFile.getName())
        return False

    def addTooFewWords(self, name1, name2):
        """
        Add the file names of the files with too few words
        """
        self.tooFewWords.append(name1 + ' ' + name2)

    def hasSufficientRatio(self, file1, file2):
        """
        See if the ratio of words is good enough, that is strictly
        between minratio and maxratio. The ratio is the word count of
        file1 in percent of the word count of file2. A pair outside the
        limits is added to the poorRatio list.

        Both word counts must be numeric and the one of file2 nonzero;
        traverseFiles ensures this by calling hasSufficientWords first.
        """
        ratio = float(file1.getWordCount())/float(file2.getWordCount())*100
        if float(self.minratio) < ratio < float(self.maxratio):
            return True

        self.addPoorRatio(file1.getName(), file2.getName(), ratio)
        return False

    def addPoorRatio(self, name1, name2, ratio):
        """
        Add filenames to the poorRatio list
        """
        self.poorRatio.append(name1 + ',' + name2 + ',' + repr(ratio))

    def addChangedFile(self, corpusFile):
        """
        Register corpusFile as changed, print the name of its prestable
        counterpart and take that name off the list of old files
        """
        self.changedFiles.append(corpusFile.getName())
        prestableFilename = corpusFile.getName().replace(
            'converted/', 'prestable/converted/')
        print(prestableFilename)
        if prestableFilename in self.oldFiles:
            self.oldFiles.remove(prestableFilename)

    def bothFilesTranslatedFrom(self, parallelFile, language1File):
        """
        Return True, and record the pair, if both files claim to be
        translations of the other
        """
        if (parallelFile.getTranslatedFrom() == language1File.getLang() and
                language1File.getTranslatedFrom() == self.parallelLanguage):
            self.addBothFilesTranslated(language1File, parallelFile)
            return True

        return False

    def oneFileTranslatedFrom(self, language1File, parallelFile):
        """
        If one of the files is marked as translated from the language of
        the other, copy each file that differs from its prestable version
        to prestable. Otherwise record the pair in noFilesTranslations.
        """
        if (language1File.getTranslatedFrom() == self.parallelLanguage or
                parallelFile.getTranslatedFrom() == language1File.getLang()):
            if self.validDiff(language1File, parallelFile.getLang()):
                self.addChangedFile(language1File)
                self.copyFile(language1File)

            if self.validDiff(parallelFile, language1File.getLang()):
                self.addChangedFile(parallelFile)
                self.copyFile(parallelFile)
        else:
            self.addNoFilesTranslations(language1File, parallelFile)

    def traverseFiles(self):
        """
        Go through all language1 files; pairs that have a parallel file,
        enough words and an acceptable ratio are handed on to the
        translated_from checks
        """
        for language1File in self.findLang1Files():
            if not self.hasParallel(language1File):
                continue

            parallelFile = parallelize.CorpusXMLFile(
                language1File.getParallelFilename(),
                language1File.getLang())

            if (self.hasSufficientWords(language1File, parallelFile) and
                    self.hasSufficientRatio(language1File, parallelFile)):
                if not self.bothFilesTranslatedFrom(parallelFile,
                                                    language1File):
                    self.oneFileTranslatedFrom(language1File, parallelFile)

    def validDiff(self, convertedFile, parallelLanguage):
        """
        Check if there are differences between the files in converted
        and prestable/converted. Return True when the prestable file
        does not exist.
        """
        prestableFilename = convertedFile.getName().replace(
            'converted/', 'prestable/converted/')

        if not os.path.isfile(prestableFilename):
            return True

        prestableFile = parallelize.CorpusXMLFile(prestableFilename,
                                                  parallelLanguage)

        # Strip the version from both files before comparing them
        prestableFile.removeVersion()
        convertedFile.removeVersion()

        return self.checkDiff(convertedFile.geteTree(),
                              prestableFile.geteTree())

    def checkDiff(self, eTree1, eTree2):
        """
        Return True if there is a difference between the content of
        eTree1 and eTree2
        """
        doc1 = etree.tostring(eTree1)
        doc2 = etree.tostring(eTree2)

        checker = doctestcompare.LXMLOutputChecker()
        return not checker.check_output(doc1, doc2, 0)

    def copyFile(self, xmlFile):
        """
        Copy xmlFile to prestable/converted, creating the target
        directory when needed.

        Raises OSError if the target directory can not be created.
        """
        prestableDir = xmlFile.getDirname().replace('converted/',
                                                    'prestable/converted/')

        if not os.path.isdir(prestableDir):
            try:
                os.makedirs(prestableDir)
            except OSError:
                # Only an error if the directory still is missing
                if not os.path.isdir(prestableDir):
                    raise

        shutil.copy(xmlFile.getName(), prestableDir)

    def treatLists(self):
        """
        Remove the files left in the oldFiles list and print a summary
        of what happened to the files
        """
        for oldFile in self.oldFiles:
            self.removeFile(oldFile)

        print(len(self.oldFiles),
              'of the original prestable files were deleted')
        print(len(self.noOrig),
              'of the original prestable files had no original file')
        print(len(self.noParallel),
              'of the original prestable files had no parallel file')
        print(len(self.poorRatio),
              'pairs of the candidate files had too bad ratio')
        print(len(self.tooFewWords),
              'pairs of the candidate files had too few words')
        print(len(self.changedFiles),
              'of the candidate files were copied into prestable')
        print(len(self.noFilesTranslations),
              'pairs of the candidate files had no translated_from entry')
        print(len(self.bothFilesTranslated),
              'pairs of the candidate files both claimed to be '
              'translations of the other')

    def writeLog(self):
        """
        Write the names in the oldFiles list to pick.log in the current
        directory
        """
        # The with statement closes the file even if a write fails
        with open('pick.log', 'w') as logFile:
            logFile.write('oldFiles' + '\n')
            for oldFile in self.oldFiles:
                logFile.write(oldFile + '\n')
            logFile.write('\n')


def parseOptions():
    """
    Parse the command line and return the argparse namespace with the
    attributes language1Dir, parallelLanguage, minratio and maxratio
    """
    parser = argparse.ArgumentParser(
        description='Pick out parallel files from converted to '
        'prestable/converted.')

    parser.add_argument(
        'language1Dir',
        help="directory where the files of language1 exist")
    parser.add_argument(
        '-p', '--parallelLanguage', dest='parallelLanguage',
        help="The language where we would like to find parallel documents",
        required=True)
    # type=float rejects an invalid ratio before any file is touched
    parser.add_argument('--minratio', dest='minratio', type=float,
                        help="The minimum ratio", required=True)
    parser.add_argument('--maxratio', dest='maxratio', type=float,
                        help="The maximum ratio", required=True)

    args = parser.parse_args()
    return args


def main():
    """
    Check the files already in prestable, pick new parallel files from
    converted, then print the summary and write the log
    """
    args = parseOptions()

    pp = ParallelPicker(args.language1Dir, args.parallelLanguage,
                        args.minratio, args.maxratio)
    pp.getOldFileNames()
    pp.traverseFiles()
    pp.treatLists()
    pp.writeLog()


if __name__ == '__main__':
    main()