#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file contains routines to convert pdf files to xml
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this file. If not, see .
#
# Copyright 2012 Børre Gaup
#
import os
from lxml import etree
from lxml import doctestcompare
import doctest
import unittest
class TestPdf2Xml(unittest.TestCase):
"""
A class to test pdf to "our" xml conversion
"""
def setup():
pass
def assertXmlEqual(self, got, want):
"""
Check if two xml snippets are equal
"""
string_got = etree.tostring(got, pretty_print = True)
string_want = etree.tostring(want, pretty_print = True)
checker = doctestcompare.LXMLOutputChecker()
if not checker.check_output(string_got, string_want, 0):
message = checker.output_difference(doctest.Example("", string_got), string_want, 0).encode('utf-8')
raise AssertionError(message)
def testConstruction(self):
"""
Test the constructor
"""
# Make sure that an IOError is thrown on a non-existing file
self.assertRaises(IOError, Pdf2Xml, "foofile")
# Make sure that an etree.XMLSyntaxError is raised when opening a non xml file
self.assertRaises(etree.XMLSyntaxError, Pdf2Xml, os.path.join(os.environ['GTFREE'], "orig/sme/admin/others/jahkediedahus_2009.pdf"))
# Check that we raise a ValueError when the input doc isn't a pdf2xml doc
self.assertRaises(ValueError, Pdf2Xml, "pdf2xml_data/non_pdf2xml.xml")
def testRemoveTableOfContent(self):
pass
def testHandlePdf2xml(self):
# First make "our" xml
pdf2xml = Pdf2Xml("pdf2xml_data/simple.pdf.xml")
gotXml = pdf2xml.handlePdf2xml()
# Then parse what we want
wantXml = etree.parse("pdf2xml_data/simple.xml")
self.assertXmlEqual(gotXml, wantXml)
class Pdf2Xml:
"""
A class to convert pdf to "our" xml format.
Input is a file that has been converted to libpopplers pdf2xml format.
This file is then further processed and then converted to "our" format
"""
def __init__(self, inXmlFile):
"""
Parse the infile
"""
self.etree = etree.parse(inXmlFile)
root = self.etree.getroot()
# Raise an exception if this isn't the kind of xml doc this program can handle
if root.tag != "pdf2xml":
raise ValueError(root.tag)
def handlePdf2xml(self):
"""
Handle the root element of the input doc, convert it to "our" format
"""
document = etree.Element("document")
body = etree.Element("body")
document.append(body)
return document
def removeTableOfContent(self):
"""
Remove lines containing four or more consecutive . marks
"""
pass
def removeHeader(self):
"""
Remove page numbers and other repeated content at the top of the page
"""
pass
def removeFooter(self):
"""
Remove page numbers and other repeated content at the top of the page
"""
pass
def findStandardFont(self):
"""
Find the font that is used mostly in the doc
"""
pass
def makeParagraphs(self):
"""
Make the paragraphs out the content on the page
Insert the paragraphs as they are made. Indicate whether
"""
pass
def handlePage(self):
"""
Parse a page.
Strip away unwanted elements.
Indicate whether the content continues to the next page or not
"""
self.removeHeader()
self.removeFooter()
self.removeTableOfContent()
self.makeParagraphs()
pass
if __name__ == '__main__':
unittest.main()
#testSuite = unittest.TestSuite()
#testSuite.addTest(unittest.makeSuite(TestPdf2Xml))
#unittest.TextTestRunner().run(testSuite)