#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#   This file contains a program to convert corpus files with a
#   make like function
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this file. If not, see <http://www.gnu.org/licenses/>.
#
#   Copyright 2012 Børre Gaup
#

import os
import sys
import subprocess
import argparse


def _newer_group(sources, target):
    """Tell whether target has to be rebuilt from sources.

    Returns True when target does not exist or when at least one of the
    files in sources has a later modification time than target.

    Raises OSError if a source that has to be inspected does not exist.
    """
    if not os.path.exists(target):
        return True

    target_mtime = os.stat(target).st_mtime
    return any(os.stat(source).st_mtime > target_mtime
               for source in sources)


class CorpusBuilder(object):
    """Convert original corpus files to xml, make style.

    Every original file is accompanied by a metadata stylesheet named
    <original file>.xsl. An original file is converted by calling
    convert2xml.pl when its xml file is missing or older than any of
    its dependencies.

    Attributes:
        orig_dir: the directory that is searched for .xsl files
        convertable_files: number of .xsl files found by find_xsl_files
        failed_files: number of files convert2xml.pl failed to convert
        common_dep: files that every conversion depends on
        bible_dep, pdf_dep, html_dep, svg_dep, avvir_dep, doc_dep:
            additional files that conversions of one kind of original
            file depend on
    """

    def __init__(self, orig_dir):
        """Set up the dependency lists.

        orig_dir is the directory where the original files are found.

        Exits the program with status 1 if orig_dir is not a directory
        or if the environment variable GTHOME is not set.
        """
        if not os.path.isdir(orig_dir):
            sys.stderr.write('%s is not a directory\n' % orig_dir)
            sys.exit(1)

        gthome = os.getenv('GTHOME')
        if not gthome:
            # Without GTHOME os.path.join would fail with a traceback
            sys.stderr.write('The environment variable GTHOME is not set\n')
            sys.exit(1)

        def in_gthome(*relative_paths):
            return [os.path.join(gthome, path) for path in relative_paths]

        self.orig_dir = orig_dir
        self.convertable_files = 0
        self.failed_files = 0

        self.bible_dep = in_gthome(
            'gt/script/langTools/BibleXMLConverter.pm',
            'gt/script/corpus/bible2xml.pl',
            'gt/script/corpus/paratext2xml.pl',
            'gt/script/langTools/ParatextConverter.pm')
        self.pdf_dep = in_gthome(
            'gt/script/langTools/PDFConverter.pm',
            'gt/script/langTools/PlaintextConverter.pm')
        self.html_dep = in_gthome(
            'gt/script/langTools/HTMLConverter.pm',
            'gt/script/langTools/RTFConverter.pm',
            'gt/script/corpus/xhtml2corpus.xsl')
        self.svg_dep = in_gthome(
            'gt/script/langTools/SVGConverter.pm',
            'gt/script/corpus/svg2corpus.xsl')
        self.common_dep = in_gthome(
            'gt/script/langTools/CantHandle.pm',
            'gt/script/langTools/CorrectXMLConverter.pm',
            'gt/script/corpus/common.xsl',
            'gt/script/corpus/convert2xml.pl',
            'gt/script/langTools/Preconverter.pm',
            'gt/script/langTools/Converter.pm',
            'gt/script/langTools/Corpus.pm',
            'gt/script/langTools/Decode.pm',
            'gt/script/corpus/XSL-template.xsl',
            'gt/script/preprocess')
        self.avvir_dep = in_gthome(
            'gt/script/langTools/AvvirXMLConverter.pm',
            'gt/script/corpus/avvir2corpus.xsl')
        self.doc_dep = in_gthome(
            'gt/script/corpus/docbook2corpus2.xsl',
            'gt/script/langTools/DOCConverter.pm')

    def _format_dependencies(self, source):
        """Return the dependencies specific to the kind of file source is.

        The first matching rule wins. An empty list is returned when no
        rule matches.
        """
        if source.endswith('.doc'):
            return self.doc_dep
        if source.endswith('.pdf'):
            return self.pdf_dep
        if 'Avvir_xml-filer' in source:
            return self.avvir_dep
        if source.endswith('.svg'):
            return self.svg_dep
        if 'bible' in source:
            return self.bible_dep
        if source.endswith(('.htm', '.html')) or 'html_id' in source:
            return self.html_dep
        return []

    def find_dependencies(self, xsl_files):
        """Convert the original files that are out of date.

        xsl_files is a list of paths to metadata stylesheets, each named
        <original file>.xsl. For each of them convert_file is called with
        the original file if the xml file is missing or older than the
        stylesheet, the original file, the common dependencies or the
        dependencies of that kind of file.

        Raises OSError if the xml file exists and a dependency that has
        to be inspected is missing.
        """
        for xsl_file in xsl_files:
            # Strip the trailing '.xsl' to get the original file
            source = xsl_file[:-4]
            # NOTE(review): this replaces every occurrence of '.xsl' and
            # 'orig' in the path, not only the file suffix and the name
            # of the orig directory. Verify against the path
            # convert2xml.pl writes to.
            xml_file = xsl_file.replace('.xsl', '.xml').replace(
                'orig', 'converted')

            # Build a new list for each file. Appending to
            # self.common_dep would make every later file depend on all
            # the files handled before it.
            dependencies = (self.common_dep + [xsl_file, source] +
                            self._format_dependencies(source))

            if _newer_group(dependencies, xml_file):
                self.convert_file(source)

    def convert_file(self, source):
        """Convert the original file source by running convert2xml.pl.

        The output of convert2xml.pl is captured and discarded. A non-zero
        exit status is counted in failed_files.

        Raises OSError if convert2xml.pl can not be started.
        """
        subp = subprocess.Popen(['convert2xml.pl', '--debug', source],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # communicate() drains both pipes, so the child can not block on
        # a full pipe, and waits for the child to finish
        subp.communicate()

        if subp.returncode != 0:
            self.failed_files += 1

    def find_xsl_files(self):
        """Return the paths of all .xsl files found below orig_dir.

        The number of files found is stored in convertable_files.
        """
        xsl_files = [os.path.join(root, name)
                     for root, _dirs, names in os.walk(self.orig_dir)
                     for name in names
                     if name.endswith('.xsl')]
        self.convertable_files = len(xsl_files)

        return xsl_files

    def final_call(self):
        """Print a summary of the conversion to stdout."""
        # A single argument to print gives the same output with both the
        # print statement of python 2 and the print function of python 3
        if self.failed_files > 0:
            print("Couldn't convert %s files of %s convertable files" %
                  (self.failed_files, self.convertable_files))
        else:
            print("Converted all %s convertible files" %
                  self.convertable_files)


def parse_options():
    """Parse the command line and return the resulting namespace.

    The namespace has one attribute, orig_dir.
    """
    parser = argparse.ArgumentParser(
        description='Convert original files to giellatekno xml, '
        'using dependency checking.')
    parser.add_argument('orig_dir',
                        help="directory where the original files exist")

    return parser.parse_args()


def main():
    """Convert the out of date files in the directory given by the user."""
    args = parse_options()
    cb = CorpusBuilder(args.orig_dir)
    cb.find_dependencies(cb.find_xsl_files())
    cb.final_call()


if __name__ == '__main__':
    main()