#!/usr/bin/env python3
"""Check if forrest docs are wellformed and that the addresses are correct."""
import os
import re
import sys

import lxml.etree as etree

from corpustools import util


def is_correct_link(link_content, filename, xdocs_dir):
    """Determine if link_content is a valid link."""
    return (
        re.match("""\d+""", link_content) or
        'static_files' in link_content or
        link_content.startswith('http://') or
        link_content.startswith('https://') or
        link_content.startswith('mailto:') or
        link_content.startswith('news:') or
        link_content.startswith('ftp://') or
        link_content.startswith('svn://') or
        link_content.startswith('see://') or
        jspwiki_file_exists(link_content, filename, xdocs_dir)
    )


def jspwiki_file_exists(link_content, filename, xdocs_dir):
    """Check if link_content points to a file."""
    # util.print_frame(link_content)
    link_content = link_content.split('#')[0].strip()
    link_content = link_content.replace('slidy/', '')
    if (link_content and link_content != '/' and not
            link_content.startswith('cgi')):
        # util.print_frame(link_content)
        dirname = os.path.dirname(os.path.abspath(filename))
        if link_content.startswith('/'):
            dirname = xdocs_dir
            link_content = link_content[1:]

        return is_forrest_file(os.path.normpath(os.path.join(dirname,
                                                             link_content)))
    else:
        return True


def is_forrest_file(normpath):
    """Check if file exists."""
    ext_replacements = []

    (normpath, ext) = os.path.splitext(normpath)
    ext_replacements.append(ext)
    if ext in ['.html', '.pdf']:
        ext_replacements.append('.jspwiki')
        ext_replacements.append('.xml')
        ext_replacements.append('.pdf')
        ext_replacements.append('.en' + ext)
        ext_replacements.append('.en' + '.jspwiki')
        ext_replacements.append('.en' + '.xml')
        ext_replacements.append('.en' + '.pdf')

    for r in ext_replacements:
        if os.path.exists(normpath + r):
            return True
    else:
        return False


def check_xml_file(filename, xdocs_dir):
    """Check if xml file is wellformed and valid."""
    errors = 0
    for a in get_tree(filename).iter('a'):
        if not is_correct_link(a.get('href').strip(), filename, xdocs_dir):
            errors += 1
            util.print_frame('{} :#{}: wrong address {}\n'.format(
                filename, a.sourceline, a.get('href')))

    return errors


def get_tree(filename):
    """Get the etree of a xml file."""
    try:
        return etree.parse(filename)
    except etree.XMLSyntaxError as e:
        util.print_frame(filename, 'is not wellformed\nError:', e)


def parse_site(xdocs_dir):
    """Parse forrests site.xml file."""
    filename = os.path.join(xdocs_dir, 'site.xml')
    site = get_tree(filename)
    get_site_href(site.getroot(), xdocs_dir, filename)


def get_site_href(element, directory, filename):
    """Compute the href from a site.xml file."""
    href = element.get('href')

    if href is not None:
        if element.tag == '{http://www.w3.org/2001/XInclude}include':
            parts = href.split('#')
            site_dirname = os.path.dirname(filename)
            site_filename = os.path.join(site_dirname, parts[0])
            try:
                site = get_tree(site_filename)
                if len(parts) == 2:
                    for element in site.xpath(re.match('xpointer\((.+)\)',
                                                       parts[1]).group(1)):
                        get_tabs_href(element, directory, site_filename)
                elif len(parts) == 1:
                    get_site_href(site.getroot(), directory, site_filename)
            except OSError as e:
                util.print_frame(
                    '{}: #{}: {} {} does not exist.\nError: {}'.format(
                        filename, element.sourceline,
                        etree.tostring(element, encoding='unicode'),
                        site_filename, str(e)))
        else:
            if (href.endswith('.html') and not href.startswith('cgi') and not
                    href.startswith('http')):
                if not is_forrest_file(os.path.join(directory, href)):
                    util.print_frame('{} :#{}: wrong address {}\n'.format(
                        filename, element.sourceline, os.path.join(directory,
                                                                   href)))
            else:
                directory = os.path.join(directory, href)

    if len(element):
        for node in element:
            get_site_href(node, directory, filename)


def get_tabs_href(element, directory, filename):
    """Compute the address from the tabs.xml file."""
    try:
        href = os.path.join(directory, element.get('dir'),
                            element.get('indexfile'))
        if not is_forrest_file(href):
            util.print_frame('{} :#{}: {} wrong address {}\n'.format(
                filename, element.sourceline,
                etree.tostring(element, encoding='unicode'),
                os.path.join(directory, href)))
    except TypeError as e:
        try:
            element.attrib['href']
        except KeyError:
            util.print_frame(e, etree.tostring(element, encoding='unicode'),
                             element.sourceline, filename)


def parse_tabs(xdocs_dir):
    """Parse forrests tabs.xml file."""
    filename = os.path.join(xdocs_dir, 'tabs.xml')
    tabs = get_tree(filename)
    for element in tabs.getroot().iter('tab'):
        get_tabs_href(element, xdocs_dir, filename)
    for element in tabs.getroot().iter(
            '{http://www.w3.org/2001/XInclude}include'):
        parts = element.get('href').split('#')
        include_name = os.path.join(xdocs_dir, parts[0])
        tabs = etree.parse(os.path.join(xdocs_dir, parts[0]))
        if len(parts) == 2:
            for element in tabs.xpath(re.match('xpointer\((.+)\)',
                                               parts[1]).group(1)):
                get_tabs_href(element, xdocs_dir, include_name)
        elif len(parts) == 1:
            for element in tabs.getroot().iter('tab'):
                get_tabs_href(element, xdocs_dir, include_name)


def main():
    """Parse xml files found in forrest sites."""
    sites = {
        'divvun': 'xtdoc/divvun/src/documentation/content/xdocs',
        'gtuit': 'xtdoc/gtuit/src/documentation/content/xdocs',
        'techdoc': 'xtdoc/techdoc/src/documentation/content/xdocs',
        'ped': 'ped/userdoc',
        'dicts': 'xtdoc/dicts/src/documentation/content/xdocs',
        'divvun.org': 'xtdoc/divvun.org/src/documentation/content/xdocs',
    }
    errors = 0
    no_files = 0

    sites_to_check = sites if len(sys.argv) == 1 else sys.argv[1:]

    for site in sites_to_check:
        fullpath = os.path.join(os.getenv('GTHOME'), sites[site])
        parse_site(fullpath)
        parse_tabs(fullpath)
        for root, dirs, files in os.walk(fullpath, followlinks=True):
            for f in files:
                no_files += 1
                path = os.path.join(root, f)
                if (f.endswith('.xml') and 'obsolete' not in path and
                        '/cgi' not in path and 'uped/' not in path):
                    errors += check_xml_file(path, fullpath)

    util.print_frame(errors)
    return errors


if __name__ == "__main__":
    sys.exit(main())
