#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""tidy.py -- clean up an HTML file and write it back out as prettified XHTML."""
import sys
import os
# HTMLParseError is what the HTMLParser-based BeautifulSoup 3 builds raise on
# unparseable input; the original except clause used it without this import.
from HTMLParser import HTMLParseError

import BeautifulSoup


def Usage():
    print 'Usage: tidy.py inputfile outputfile'


if len(sys.argv) != 3:
    Usage()
    sys.exit(1)

if not os.path.exists(sys.argv[1]):
    print 'Input file', sys.argv[1], "doesn't exist"
    Usage()
    sys.exit(2)

# Read the whole input file into memory.
s = open(sys.argv[1])
sbuffer = s.read()
s.close()

# Parse as UTF-8 and convert HTML entities to Unicode characters.
try:
    soup = BeautifulSoup.BeautifulSoup(
        sbuffer,
        fromEncoding="utf-8",
        convertEntities=BeautifulSoup.BeautifulStoneSoup.HTML_ENTITIES)
except HTMLParseError, e:
    print 'Cannot parse', sys.argv[1]
    print 'Reason:', e
    sys.exit(4)

# Strip comments, processing instructions and declarations (e.g. DOCTYPE).
for node in soup.findAll(text=lambda text: isinstance(text, BeautifulSoup.Comment)):
    node.extract()
for node in soup.findAll(text=lambda text: isinstance(text, BeautifulSoup.ProcessingInstruction)):
    node.extract()
for node in soup.findAll(text=lambda text: isinstance(text, BeautifulSoup.Declaration)):
    node.extract()

# Remove tags we never want in the output, including the MS Office/VML
# namespace tags (v:*, o:p, st1:*) that Word-generated HTML is full of.
remove_tags = ['noscript', 'script', 'input', 'img', 'v:shapetype', 'v:shape',
               'textarea', 'label', 'o:p', 'st1:metricconverter', 'st1:placename',
               'st1:place', 'meta']
for remove_tag in remove_tags:
    for remove in soup.findAll(remove_tag):
        remove.extract()

# Make sure the <html> element declares the XHTML namespace.
try:
    if ("xmlns", "http://www.w3.org/1999/xhtml") not in soup.html.attrs:
        soup.html.attrs.append(("xmlns", "http://www.w3.org/1999/xhtml"))
except AttributeError:
    # No <html> element at all; nothing to do.
    pass

# Write the prettified result to the output file.
s = open(sys.argv[2], 'w')
s.write(soup.prettify())
s.close()