# -*- coding: utf-8 -*-
#
# This file contains a class to analyse text in giellatekno xml format
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright 2013 Børre Gaup
#
import os
import sys
import subprocess
import multiprocessing
import re
import datetime
import lxml.etree as etree
from io import open
import StringIO
import ccat
def unwrap_self_analyse(arg, **kwarg):
    u"""Call Analyser.analyse with an unpacked argument tuple.

    Analyser.analyseInParallel hands this module-level function to
    multiprocessing.Pool.map together with (analyser, xmlFile) tuples.

    :param arg: sequence that is unpacked into the positional arguments of
        Analyser.analyse, i.e. the instance followed by the xml file name
    :param kwarg: keyword arguments forwarded unchanged
    :returns: whatever Analyser.analyse returns
    """
    result = Analyser.analyse(*arg, **kwarg)
    return result
class Analyser(object):
    u"""Analyse converted giellatekno xml files with external tools.

    For every input file the text is extracted with ccat.XMLPrinter and
    piped through the external commands preprocess, lookup, lookup2cg and
    vislcg3 (the latter run with the disambiguation, function and
    dependency files in turn). The disambiguation and dependency output
    then replaces the body of the xml document, which is written to the
    matching ``analysed/`` path.
    """

    # Fully qualified name of the xml:lang attribute
    _XML_LANG = u'{http://www.w3.org/XML/1998/namespace}lang'

    def __init__(self, lang, old=False):
        u"""Set up the analyser.

        :param lang: language code; handed to ccat.XMLPrinter and used to
            select files and tool options
        :param old: stored as self.old; not read anywhere in this class
        """
        self.lang = lang
        self.old = old
        self.xp = ccat.XMLPrinter(lang=lang, allP=True)
        self.xp.setOutfile(StringIO.StringIO())
        # preprocess() tests these two against None, so they must exist
        # even when setAnalysisFiles()/setCorrFile() were never called.
        self.abbrFile = None
        self.corrFile = None

    def exitOnError(self, filename):
        u"""Exit with status 4 unless filename names an existing path.

        A message is printed to stderr before exiting.

        :param filename: path to check; None counts as an error
        """
        if filename is None:
            print >>sys.stderr, filename, 'is not defined'
        elif not os.path.exists(filename):
            print >>sys.stderr, filename, 'does not exist'
        else:
            return
        sys.exit(4)

    def setAnalysisFiles(self,
                         abbrFile=None,
                         fstFile=None,
                         disambiguationAnalysisFile=None,
                         functionAnalysisFile=None,
                         dependencyAnalysisFile=None):
        u"""Validate and store the files used by the external tools.

        The abbreviation file is only required for sma, sme and smj; the
        other four files must exist for every language. The process exits
        (see exitOnError) when a required file is missing.
        """
        if self.lang in ['sma', 'sme', 'smj']:
            self.exitOnError(abbrFile)
        for required in (fstFile,
                         disambiguationAnalysisFile,
                         functionAnalysisFile,
                         dependencyAnalysisFile):
            self.exitOnError(required)

        self.abbrFile = abbrFile
        self.fstFile = fstFile
        self.disambiguationAnalysisFile = disambiguationAnalysisFile
        self.functionAnalysisFile = functionAnalysisFile
        self.dependencyAnalysisFile = dependencyAnalysisFile

    def setCorrFile(self, corrFile):
        u"""Validate and store the file passed to preprocess as --corr
        """
        self.exitOnError(corrFile)
        self.corrFile = corrFile

    def collectFiles(self, convertedDirs):
        u"""Collect the xml files to analyse into self.xmlFiles

        :param convertedDirs: a list of directories containing converted
            xml files. Only files ending in .xml that live in a directory
            whose path contains self.lang are collected.
        """
        self.xmlFiles = []
        for cdir in convertedDirs:
            for root, _dirs, files in os.walk(cdir):
                if self.lang not in root:
                    continue
                self.xmlFiles.extend(
                    os.path.join(root, f)
                    for f in files if f.endswith(u'.xml'))

    def makedirs(self):
        u"""Make the directory that will hold self.analysisXmlFile

        An already existing directory is fine. Any other failure to create
        it is raised instead of being silently ignored.
        """
        directory = os.path.dirname(self.analysisXmlFile)
        # An empty dirname means the current directory: nothing to create.
        if not directory or os.path.isdir(directory):
            return
        try:
            os.makedirs(directory)
        except OSError:
            # Another process may have created it in the meantime.
            if not os.path.isdir(directory):
                raise

    def getLang(self):
        u"""
        @brief Get the mainlang from the xml file
        :returns: the xml:lang attribute of the root element, or u'none'
        when the attribute is missing
        """
        lang = self.eTree.getroot().attrib.get(self._XML_LANG)
        if lang is None:
            return u'none'
        return lang

    def getGenre(self):
        u"""
        @brief Get the genre from the xml file
        :returns: the code attribute of the first genre element, or u'none'
        when there is no genre element
        """
        genre = self.eTree.getroot().find(u".//genre")
        if genre is None:
            return u'none'
        return genre.attrib[u"code"]

    def getOcr(self):
        u"""
        @brief Check if the ocr element exists
        :returns: the ocr element or None
        """
        return self.eTree.getroot().find(u".//ocr")

    def getTranslatedfrom(self):
        u"""
        @brief Get the translated_from value from the xml file
        :returns: the xml:lang attribute of the first translated_from
        element, or u'none' when there is no such element
        """
        translated = self.eTree.getroot().find(u".//translated_from")
        if translated is None:
            return u'none'
        return translated.attrib[self._XML_LANG]

    def calculateFilenames(self, xmlFile):
        u"""Set self.dependencyAnalysisName from the name of the xml file

        The /converted/ path component is replaced by /analysed/. The
        trailing slash is needed; without it the directory name would be
        fused with the next path component.
        """
        self.dependencyAnalysisName = xmlFile.replace(u'/converted/',
                                                      u'/analysed/')

    def ccat(self):
        u"""Runs ccat on the input file
        Returns the output of ccat
        """
        # Start from an empty buffer so that getvalue() only holds the text
        # of this file. NOTE(review): presumably processFile appends to the
        # outfile; confirm against ccat.XMLPrinter.
        self.xp.setOutfile(StringIO.StringIO())
        self.xp.processFile(self.xmlFile)
        return self.xp.outfile.getvalue()

    def runExternalCommand(self, command, input):
        u"""Run the command with input using subprocess

        :param command: the command and its arguments as a list
        :param input: data written to the stdin of the command
        :returns: what the command wrote to stdout. Anything written to
            stderr is reported through checkError.
        """
        subp = subprocess.Popen(command,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        (output, error) = subp.communicate(input)
        self.checkError(command, error)

        return output

    def preprocess(self):
        u"""Runs preprocess on the ccat output.
        Returns the output of preprocess
        """
        preProcessCommand = [u'preprocess']

        if self.abbrFile is not None:
            preProcessCommand.append(u'--abbr=' + self.abbrFile)

        # The corr file is only passed on for sme
        if self.lang == 'sme' and self.corrFile is not None:
            preProcessCommand.append(u'--corr=' + self.corrFile)

        return self.runExternalCommand(preProcessCommand, self.ccat())

    def lookup(self):
        u"""Runs lookup on the preprocess output
        Returns the output of lookup
        """
        lookupCommand = [u'lookup', u'-q', u'-flags', u'mbTT', self.fstFile]
        return self.runExternalCommand(lookupCommand, self.preprocess())

    def lookup2cg(self):
        u"""Runs lookup2cg on the lookup output
        Returns the output of lookup2cg
        """
        lookup2cgCommand = [u'lookup2cg']
        return self.runExternalCommand(lookup2cgCommand, self.lookup())

    def disambiguationAnalysis(self):
        u"""Runs vislcg3 on the lookup2cg output, which produces a
        disambiguation analysis
        The output is stored in self.disambiguation; nothing is returned
        """
        disAnalysisCommand = \
            [u'vislcg3', u'-g', self.disambiguationAnalysisFile]
        self.disambiguation = \
            self.runExternalCommand(disAnalysisCommand, self.lookup2cg())

    def functionAnalysis(self):
        u"""Runs vislcg3 with the function analysis file on the
        disambiguation analysis, which is computed first
        Return the output of this process
        """
        self.disambiguationAnalysis()

        functionAnalysisCommand = \
            [u'vislcg3', u'-g', self.functionAnalysisFile]
        return self.runExternalCommand(functionAnalysisCommand,
                                       self.getDisambiguation())

    def dependencyAnalysis(self):
        u"""Runs vislcg3 with the dependency file on the function analysis
        The output is stored in self.dependency; nothing is returned
        """
        depAnalysisCommand = \
            [u'vislcg3', u'-g', self.dependencyAnalysisFile]
        self.dependency = \
            self.runExternalCommand(depAnalysisCommand,
                                    self.functionAnalysis())

    def getDisambiguation(self):
        u"""Return the output stored by disambiguationAnalysis
        """
        return self.disambiguation

    def _textElement(self, tag, text):
        u"""Make an element named tag holding text decoded from utf8
        """
        element = etree.Element(tag)
        element.text = text.decode(u'utf8')
        return element

    def _replaceBody(self, elements):
        u"""Replace the body of self.eTree with a body holding elements

        :returns: self.eTree
        """
        body = etree.Element(u'body')
        for element in elements:
            body.append(element)

        oldbody = self.eTree.find(u'.//body')
        oldbody.getparent().replace(oldbody, body)

        return self.eTree

    def getDisambiguationXml(self):
        u"""Run the disambiguation analysis and put the result in the body

        :returns: self.eTree, with a body containing only a disambiguation
            element
        """
        # disambiguationAnalysis() returns nothing; it stores its result,
        # which is then read back through getDisambiguation().
        self.disambiguationAnalysis()
        return self._replaceBody([
            self._textElement(u'disambiguation', self.getDisambiguation())])

    def getDependency(self):
        u"""Return the output stored by dependencyAnalysis
        """
        return self.dependency

    def getAnalysisXml(self):
        u"""Put the stored disambiguation and dependency analyses in the body

        dependencyAnalysis must have been run first.

        :returns: self.eTree, with a body containing a disambiguation and a
            dependency element
        """
        return self._replaceBody([
            self._textElement(u'disambiguation', self.getDisambiguation()),
            self._textElement(u'dependency', self.getDependency())])

    def checkError(self, command, error):
        u"""Print the file name, the command and its stderr output to
        stderr, if the command wrote anything to stderr
        """
        if error:
            print >>sys.stderr, self.xmlFile
            print >>sys.stderr, command
            print >>sys.stderr, error

    def analyse(self, xmlFile):
        u"""Analyse a file if it is not ocr'ed

        The result is written to the name of the xml file with converted/
        replaced by analysed/. Files containing an ocr element are skipped.
        """
        self.xmlFile = xmlFile
        self.analysisXmlFile = self.xmlFile.replace(u'converted/',
                                                    u'analysed/')
        self.eTree = etree.parse(xmlFile)
        self.calculateFilenames(xmlFile)

        if self.getOcr() is not None:
            return

        self.dependencyAnalysis()
        self.makedirs()
        self.getAnalysisXml().write(
            self.analysisXmlFile,
            encoding=u'utf8',
            xml_declaration=True)

    def analyseInParallel(self):
        u"""Analyse the files in self.xmlFiles using a pool of processes

        The pool has twice as many processes as there are cpus.
        """
        poolSize = multiprocessing.cpu_count() * 2
        pool = multiprocessing.Pool(processes=poolSize,)
        try:
            pool.map(
                unwrap_self_analyse,
                [(self, xmlFile) for xmlFile in self.xmlFiles])
        finally:
            pool.close()  # no more tasks
            pool.join()  # wrap up current tasks

    def analyseSerially(self):
        u"""Analyse the files in self.xmlFiles one after the other
        """
        for xmlFile in self.xmlFiles:
            print >>sys.stderr, u'Analysing', xmlFile
            self.analyse(xmlFile)
class AnalysisConcatenator(object):
    u"""Concatenate analysis files into files named after their first line.

    The first line of every .dis/.dep (and optionally .disold/.depold)
    file names the output file it belongs to; the rest of the file is
    written to that output file, which lives in a directory named after
    today's date inside the goal directory.
    """

    def __init__(self, goalDir, xmlFiles, old=False):
        u"""
        @brief Receives a list of filenames that has been analysed

        :param goalDir: directory in which a subdirectory named after
            today's date (ISO format) is created for the output
        :param xmlFiles: sequence of items whose element at index 1 is the
            name of an xml file
        :param old: also concatenate the .disold and .depold files
        """
        self.basenames = xmlFiles
        self.old = old
        # Open output files, keyed by output path without extension. All
        # four dicts always exist, so getToFile works for every extension
        # regardless of old.
        self.disFiles = {}
        self.depFiles = {}
        self.disoldFiles = {}
        self.depoldFiles = {}
        # Output paths opened so far by this instance, see getToFile
        self._openedPaths = set()

        self.goalDir = os.path.join(goalDir,
                                    datetime.date.today().isoformat())
        if not os.path.isdir(self.goalDir):
            try:
                os.makedirs(self.goalDir)
            except OSError:
                # Someone else may have created it in the meantime; any
                # other failure is raised instead of silently ignored.
                if not os.path.isdir(self.goalDir):
                    raise

    def concatenateAnalysedFiles(self):
        u"""
        @brief Concatenates the analysed files belonging to self.basenames

        The output files are closed when done, also when an error occurs,
        so that everything written is flushed to disk.
        """
        extensions = [u".dis", u".dep"]
        if self.old:
            extensions += [u".disold", u".depold"]

        try:
            for xmlFile in self.basenames:
                for extension in extensions:
                    self.concatenateAnalysedFile(
                        xmlFile[1].replace(u".xml", extension))
        finally:
            self.closeFiles()

    def concatenateAnalysedFile(self, filename):
        u"""
        @brief Adds the content of the given file to file it belongs to

        The first line of the file selects the output file, the rest of
        the file is written to it. The given file is removed afterwards.
        Nothing happens if the file does not exist.
        """
        if not os.path.isfile(filename):
            return

        with open(filename) as fromFile:
            # The first line is consumed before the rest is read
            toFile = self.getToFile(fromFile.readline(), filename)
            toFile.write(fromFile.read())

        os.unlink(filename)

    def getToFile(self, prefix, filename):
        u"""
        @brief Gets the output file belonging to prefix and the extension
        of filename. Opens it if it is not open already.

        :param prefix: name of the output file without extension;
            surrounding whitespace is stripped
        :param filename: name of the input file; only its extension
            (.dis, .dep, .disold or .depold) is used
        :returns: File object belonging to the prefix, or None if filename
            has none of the known extensions
        """
        prefix = os.path.join(self.goalDir, prefix.strip())
        handlesBySuffix = (
            (u".dis", self.disFiles),
            (u".dep", self.depFiles),
            (u".disold", self.disoldFiles),
            (u".depold", self.depoldFiles),
        )

        for suffix, handles in handlesBySuffix:
            if not filename.endswith(suffix):
                continue
            if prefix not in handles:
                path = prefix + suffix
                # The first time this instance opens a path it starts from
                # scratch. After closeFiles() a reopened path is appended
                # to, so that earlier output is not lost.
                mode = u"a" if path in self._openedPaths else u"w"
                handles[prefix] = open(path, mode)
                self._openedPaths.add(path)
            return handles[prefix]

        return None

    def closeFiles(self):
        u"""
        @brief Closes all open output files and forgets about them
        """
        for handles in (self.disFiles, self.depFiles,
                        self.disoldFiles, self.depoldFiles):
            for handle in handles.values():
                handle.close()
            handles.clear()