#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file contains a program that converts corpus files, using
# make-like dependency checking to decide which files need converting
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see <http://www.gnu.org/licenses/>.
#
# Copyright 2012 Børre Gaup
#
import os
import sys
import subprocess
import argparse
class CorpusBuilder:
    """Convert original corpus files to xml with make-like dependency checks.

    Every ``.xsl`` file found below ``orig_dir`` is taken to describe one
    source file (the same path without the ``.xsl`` suffix).  The source is
    handed to ``convert2xml.pl`` when its xml target is missing or older
    than any of its dependencies.

    Attributes set by ``__init__``:
        orig_dir: directory that is searched for ``.xsl`` files.
        convertable_files: number of ``.xsl`` files found by
            ``find_xsl_files``.
        failed_files: number of sources ``convert2xml.pl`` failed on.
        common_dep, bible_dep, pdf_dep, html_dep, svg_dep, avvir_dep,
        doc_dep: lists of converter files below ``$GTHOME``.
    """

    def __init__(self, orig_dir):
        """Set up the counters and the per-type dependency lists.

        Args:
            orig_dir: directory containing the original files.

        Raises:
            SystemExit: (exit status 1) if the environment variable GTHOME
                is not set, or if ``orig_dir`` is not a directory.
        """
        gthome = os.getenv('GTHOME')
        if gthome is None:
            # Without this guard os.path.join() below fails with an
            # unhelpful traceback.
            sys.exit('The environment variable GTHOME must be set')
        if not os.path.isdir(orig_dir):
            # sys.exit() with a string prints it to stderr and exits with 1.
            sys.exit('%s is not a directory' % orig_dir)

        def gt_paths(*relative_paths):
            # Anchor the given paths at $GTHOME.
            return [os.path.join(gthome, rel) for rel in relative_paths]

        self.orig_dir = orig_dir
        self.convertable_files = 0
        self.failed_files = 0
        self.bible_dep = gt_paths(
            'gt/script/langTools/BibleXMLConverter.pm',
            'gt/script/corpus/bible2xml.pl',
            'gt/script/corpus/paratext2xml.pl',
            'gt/script/langTools/ParatextConverter.pm')
        self.pdf_dep = gt_paths(
            'gt/script/langTools/PDFConverter.pm',
            'gt/script/langTools/PlaintextConverter.pm')
        self.html_dep = gt_paths(
            'gt/script/langTools/HTMLConverter.pm',
            'gt/script/langTools/RTFConverter.pm',
            'gt/script/corpus/xhtml2corpus.xsl')
        self.svg_dep = gt_paths(
            'gt/script/langTools/SVGConverter.pm',
            'gt/script/corpus/svg2corpus.xsl')
        self.common_dep = gt_paths(
            'gt/script/langTools/CantHandle.pm',
            'gt/script/langTools/CorrectXMLConverter.pm',
            'gt/script/corpus/common.xsl',
            'gt/script/corpus/convert2xml.pl',
            'gt/script/langTools/Preconverter.pm',
            'gt/script/langTools/Converter.pm',
            'gt/script/langTools/Corpus.pm',
            'gt/script/langTools/Decode.pm',
            'gt/script/corpus/XSL-template.xsl',
            'gt/script/preprocess')
        self.avvir_dep = gt_paths(
            'gt/script/langTools/AvvirXMLConverter.pm',
            'gt/script/corpus/avvir2corpus.xsl')
        self.doc_dep = gt_paths(
            'gt/script/corpus/docbook2corpus2.xsl',
            'gt/script/langTools/DOCConverter.pm')

    def find_dependencies(self, xsl_files):
        """Convert every source whose xml target is missing or out of date.

        Args:
            xsl_files: paths of ``.xsl`` files; the source file of each is
                the same path with the last four characters removed.

        Raises:
            OSError: if the target exists but one of the dependencies
                does not.
        """
        for xsl_file in xsl_files:
            # NOTE(review): both replace() calls touch every occurrence in
            # the path, not only the suffix / the directory name -- kept
            # as it was; confirm this matches where convert2xml.pl writes.
            xml_file = xsl_file.replace('.xsl', '.xml').replace(
                'orig', 'converted')
            source = xsl_file[:-4]
            # Build a fresh list for each file.  Appending to
            # self.common_dep itself would make the shared list grow with
            # every file, so later files would also depend on the sources
            # of all earlier ones.
            dependencies = (self.common_dep + [xsl_file, source] +
                            self._type_dependencies(source))
            if self._is_outdated(dependencies, xml_file):
                self.convert_file(source)

    def _type_dependencies(self, source):
        """Return the converter files specific to the kind of ``source``."""
        if source.endswith('.doc'):
            return self.doc_dep
        if source.endswith('.pdf'):
            return self.pdf_dep
        if 'Avvir_xml-filer' in source:
            return self.avvir_dep
        if source.endswith('.svg'):
            return self.svg_dep
        if 'bible' in source:
            return self.bible_dep
        if source.endswith(('.htm', '.html')) or 'html_id' in source:
            return self.html_dep
        return []

    def _is_outdated(self, dependencies, target):
        """Tell whether ``target`` is missing or older than a dependency.

        Stands in for ``distutils.dep_util.newer_group``, which is no
        longer part of the standard library in recent Python versions.
        Modification times are compared as returned by
        ``os.path.getmtime``.  A missing dependency raises OSError when
        the target exists.
        """
        if not os.path.exists(target):
            return True
        target_mtime = os.path.getmtime(target)
        return any(os.path.getmtime(dependency) > target_mtime
                   for dependency in dependencies)

    def convert_file(self, source):
        """Run ``convert2xml.pl --debug`` on ``source``.

        The output of the command is captured and discarded; a non-zero
        exit status only increments ``self.failed_files``.
        """
        subp = subprocess.Popen(['convert2xml.pl', '--debug', source],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() drains both pipes and waits for the process to end.
        subp.communicate()
        if subp.returncode != 0:
            self.failed_files += 1

    def find_xsl_files(self):
        """Return the paths of all ``.xsl`` files below ``self.orig_dir``.

        The number of files found is stored in ``self.convertable_files``.
        """
        xsl_files = [os.path.join(root, name)
                     for root, _dirs, files in os.walk(self.orig_dir)
                     for name in files
                     if name.endswith('.xsl')]
        self.convertable_files = len(xsl_files)
        return xsl_files

    def final_call(self):
        """Print a one-line summary of the conversion run."""
        # A single formatted string prints the same text whether print is
        # a statement (Python 2) or a function (Python 3).
        if self.failed_files > 0:
            print("Couldn't convert %s files of %s convertable files" %
                  (self.failed_files, self.convertable_files))
        else:
            print("Converted all %s convertible files" %
                  self.convertable_files)
def parse_options():
    """Parse the command line and return the resulting namespace.

    The only argument defined is the positional ``orig_dir``.
    """
    option_parser = argparse.ArgumentParser(
        description='Convert original files to giellatekno xml, using dependency checking.')
    option_parser.add_argument(
        'orig_dir',
        help="directory where the original files exist")
    return option_parser.parse_args()
if __name__ == '__main__':
    # Script entry point: collect the .xsl files below the directory given
    # on the command line, convert what needs converting, then print a
    # summary.
    options = parse_options()
    builder = CorpusBuilder(options.orig_dir)
    found_xsl_files = builder.find_xsl_files()
    builder.find_dependencies(found_xsl_files)
    builder.final_call()