# -*- test-case-name: openid.test.test_rpverify -*-
"""
This module contains the C{L{TrustRoot}} class, which helps handle
trust root checking.  This module is used by the
C{L{openid.server.server}} module, but it is also available to server
implementers who wish to use it for additional trust root checking.

It also implements relying party return_to URL verification, based on
the realm.
"""

__all__ = [
    'TrustRoot',
    'RP_RETURN_TO_URL_TYPE',
    'extractReturnToURLs',
    'returnToMatches',
    'verifyReturnTo',
    ]

from openid import oidutil
from openid import urinorm
from openid.yadis import services

from urlparse import urlparse, urlunparse
import re

############################################
# Module-level tables used for trust root parsing and sanity checks.

# URL schemes that TrustRoot.parse accepts for a trust root.
_protocols = ['http', 'https']

# Top-level domains recognised by TrustRoot.isSane.  A trust root
# whose last host label is not in this list is reported as not sane.
# This is a static list; it is not refreshed at runtime.
_top_level_domains = [
    'ac', 'ad', 'ae', 'aero', 'af', 'ag', 'ai', 'al', 'am', 'an',
    'ao', 'aq', 'ar', 'arpa', 'as', 'asia', 'at', 'au', 'aw',
    'ax', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', 'bh', 'bi',
    'biz', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw',
    'by', 'bz', 'ca', 'cat', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci',
    'ck', 'cl', 'cm', 'cn', 'co', 'com', 'coop', 'cr', 'cu', 'cv',
    'cx', 'cy', 'cz', 'de', 'dj', 'dk', 'dm', 'do', 'dz', 'ec',
    'edu', 'ee', 'eg', 'er', 'es', 'et', 'eu', 'fi', 'fj', 'fk',
    'fm', 'fo', 'fr', 'ga', 'gb', 'gd', 'ge', 'gf', 'gg', 'gh',
    'gi', 'gl', 'gm', 'gn', 'gov', 'gp', 'gq', 'gr', 'gs', 'gt',
    'gu', 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id',
    'ie', 'il', 'im', 'in', 'info', 'int', 'io', 'iq', 'ir', 'is',
    'it', 'je', 'jm', 'jo', 'jobs', 'jp', 'ke', 'kg', 'kh', 'ki',
    'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', 'lc',
    'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc',
    'md', 'me', 'mg', 'mh', 'mil', 'mk', 'ml', 'mm', 'mn', 'mo',
    'mobi', 'mp', 'mq', 'mr', 'ms', 'mt', 'mu', 'museum', 'mv',
    'mw', 'mx', 'my', 'mz', 'na', 'name', 'nc', 'ne', 'net', 'nf',
    'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'org',
    'pa', 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr',
    'pro', 'ps', 'pt', 'pw', 'py', 'qa', 're', 'ro', 'rs', 'ru',
    'rw', 'sa', 'sb', 'sc', 'sd', 'se', 'sg', 'sh', 'si', 'sj',
    'sk', 'sl', 'sm', 'sn', 'so', 'sr', 'st', 'su', 'sv', 'sy',
    'sz', 'tc', 'td', 'tel', 'tf', 'tg', 'th', 'tj', 'tk', 'tl',
    'tm', 'tn', 'to', 'tp', 'tr', 'travel', 'tt', 'tv', 'tw',
    'tz', 'ua', 'ug', 'uk', 'us', 'uy', 'uz', 'va', 'vc', 've',
    'vg', 'vi', 'vn', 'vu', 'wf', 'ws', 'xn--0zwm56d',
    'xn--11b5bs3a9aj6g', 'xn--80akhbyknj4f', 'xn--9t4b11yi5a',
    'xn--deba0ad', 'xn--g6w251d', 'xn--hgbk6aj7f53bba',
    'xn--hlcj6aya9esc7a', 'xn--jxalpdlp', 'xn--kgbechtv',
    'xn--zckzah', 'ye', 'yt', 'yu', 'za', 'zm', 'zw']

# Built from RFC3986, section 3.2.2. Used to reject hosts with invalid
# characters.
#
# The pattern accepts one or more of: the literal characters listed in
# the character class, or a '%' followed by two alphanumeric
# characters.  Note that the escape check is looser than strict hex
# digits.  It is anchored with '$' and is applied with .match(), so
# the whole host string has to consist of permitted characters.  '*'
# is permitted here, which is what lets wildcard trust roots through
# this check.
host_segment_re = re.compile(
    r"(?:[-a-zA-Z0-9!$&'\(\)\*+,;=._~]|%[a-zA-Z0-9]{2})+$")

class RealmVerificationRedirected(Exception):
    """Signals that discovery on a realm's URL did not stay on that URL.

    @ivar relying_party_url: The URL on which discovery was attempted.

    @ivar rp_url_after_redirects: The URL that discovery reported
        after following redirects.

    @since: 2.1.0
    """
    def __init__(self, relying_party_url, rp_url_after_redirects):
        self.rp_url_after_redirects = rp_url_after_redirects
        self.relying_party_url = relying_party_url

    def __str__(self):
        # Both URLs are rendered with %r so that odd characters in
        # them remain visible in log output.
        template = ("Attempting to verify %r resulted in "
                    "redirect to %r")
        urls = (self.relying_party_url, self.rp_url_after_redirects)
        return template % urls


def _parseURL(url):
    """Normalize a URL and break it into the pieces used for matching.

    @param url: The URL (or trust root pattern) to parse.

    @return: A C{(proto, host, port, path)} tuple, or C{None} if the
        URL cannot be normalized, has a malformed host/port section,
        or has a host containing characters rejected by
        C{host_segment_re}.  C{host} is lower-cased, C{port} is a
        string of digits or C{''} when absent, and C{path} is the
        path with any params, query and fragment re-attached.
    """
    try:
        normalized = urinorm.urinorm(url)
    except ValueError:
        return None

    scheme, authority, path, params, query, fragment = urlparse(normalized)
    if not path:
        # Python <2.4 does not parse URLs with no path properly: the
        # query string can be left glued onto the network location.
        if '?' in authority and not query:
            authority, query = authority.split('?', 1)

        path = '/'

    # Fold everything after the authority back into a single string.
    full_path = urlunparse(('', '', path, params, query, fragment))

    host, port = authority, ''
    if ':' in authority:
        pieces = authority.split(':')
        if len(pieces) != 2:
            # More than one colon: not a plain host:port pair.
            return None

        host, port = pieces
        if re.match(r'\d+$', port) is None:
            return None

    host = host.lower()
    if host_segment_re.match(host) is None:
        return None

    return scheme, host, port, full_path

class TrustRoot(object):
    """
    This class represents an OpenID trust root.  The C{L{parse}}
    classmethod accepts a trust root string, producing a
    C{L{TrustRoot}} object.  The method OpenID server implementers
    would be most likely to use is the C{L{isSane}} method, which
    checks the trust root for given patterns that indicate that the
    trust root is too broad or points to a local network resource.

    @ivar unparsed: The trust root string this object was built from.

    @ivar proto: The URL scheme of the trust root.

    @ivar wildcard: Whether the trust root started with a C{*}
        wildcard.

    @ivar host: The host of the trust root.  For a wildcard trust
        root produced by C{L{parse}} the leading C{*} is stripped, so
        this is either empty or begins with a dot.

    @ivar port: The port as a string, or C{''} if there is none.

    @ivar path: The path portion, including any query string.

    @sort: parse, isSane
    """

    def __init__(self, unparsed, proto, wildcard, host, port, path):
        self.unparsed = unparsed
        self.proto = proto
        self.wildcard = wildcard
        self.host = host
        self.port = port
        self.path = path

    def isSane(self):
        """
        This method checks to see if a trust root represents a
        reasonable (sane) set of URLs.  'http://*.com/', for example
        is not a reasonable pattern, as it cannot meaningfully specify
        the site claiming it.  This function attempts to find many
        related examples, but it can only work via heuristics.
        Negative responses from this method should be treated as
        advisory, used only to alert the user to examine the trust
        root carefully.


        @return: Whether the trust root is sane

        @rtype: C{bool}
        """

        if self.host == 'localhost':
            return True

        host_parts = self.host.split('.')
        if self.wildcard:
            # The wildcard host is stored without its '*', so the
            # first label is the empty string before the leading dot.
            assert host_parts[0] == '', host_parts
            del host_parts[0]

        # If it's an absolute domain name, remove the empty string
        # from the end.
        if host_parts and not host_parts[-1]:
            del host_parts[-1]

        if not host_parts:
            return False

        # Do not allow adjacent dots
        if '' in host_parts:
            return False

        tld = host_parts[-1]
        if tld not in _top_level_domains:
            return False

        # A bare top-level domain is never a sane trust root.
        if len(host_parts) == 1:
            return False

        if self.wildcard:
            if len(tld) == 2 and len(host_parts[-2]) <= 3:
                # It's a 2-letter tld with a short second to last segment
                # so there needs to be more than two segments specified
                # (e.g. *.co.uk is insane)
                return len(host_parts) > 2

        # Passed all tests for insanity.
        return True

    def validateURL(self, url):
        """
        Validates a URL against this trust root.


        @param url: The URL to check

        @type url: C{str}


        @return: Whether the given URL is within this trust root.

        @rtype: C{bool}
        """

        url_parts = _parseURL(url)
        if url_parts is None:
            return False

        proto, host, port, path = url_parts

        if proto != self.proto:
            return False

        if port != self.port:
            return False

        # A concrete URL may never contain a wildcard itself.
        if '*' in host:
            return False

        if not self.wildcard:
            if host != self.host:
                return False
        elif ((not host.endswith(self.host)) and
              ('.' + host) != self.host):
            # For a wildcard, self.host begins with a dot, so the URL
            # host must either end with it (a subdomain) or be the
            # bare domain itself.
            return False

        if path != self.path:
            path_len = len(self.path)

            # must be equal up to the length of the path, at least
            if path[:path_len] != self.path:
                return False

            # These characters must be on the boundary between the end
            # of the trust root's path and the start of the URL's
            # path.
            if '?' in self.path:
                allowed = '&'
            else:
                allowed = '?/'

            # The URL path is strictly longer than the trust root path
            # at this point, so indexing path[path_len] is safe.
            return (self.path[-1] in allowed or
                    path[path_len] in allowed)

        return True

    def parse(cls, trust_root):
        """
        This method creates a C{L{TrustRoot}} instance from the given
        input, if possible.


        @param trust_root: This is the trust root to parse into a
        C{L{TrustRoot}} object.

        @type trust_root: C{str}


        @return: A C{L{TrustRoot}} instance if trust_root parses as a
        trust root, C{None} otherwise.

        @rtype: C{NoneType} or C{L{TrustRoot}}
        """
        url_parts = _parseURL(trust_root)
        if url_parts is None:
            return None

        proto, host, port, path = url_parts

        # check for valid prototype
        if proto not in _protocols:
            return None

        # check for URI fragment
        if '#' in path:
            return None

        # extract wildcard if it is there
        if '*' in host[1:]:
            # wildcard must be at start of domain:  *.foo.com, not foo.*.com
            return None

        if host.startswith('*'):
            # Starts with star, so must have a dot after it (if a
            # domain is specified)
            if len(host) > 1 and host[1] != '.':
                return None

            host = host[1:]
            wildcard = True
        else:
            wildcard = False

        # we have a valid trust root
        return cls(trust_root, proto, wildcard, host, port, path)

    parse = classmethod(parse)

    def checkSanity(cls, trust_root_string):
        """str -> bool

        is this a sane trust root?

        Returns C{False} when the string does not parse as a trust
        root at all.
        """
        trust_root = cls.parse(trust_root_string)
        if trust_root is None:
            return False
        else:
            return trust_root.isSane()

    checkSanity = classmethod(checkSanity)

    def checkURL(cls, trust_root, url):
        """quick func for validating a url against a trust root.  See the
        TrustRoot class if you need more control.

        Returns C{False} when the trust root does not parse."""
        tr = cls.parse(trust_root)
        return tr is not None and tr.validateURL(url)

    checkURL = classmethod(checkURL)

    def buildDiscoveryURL(self):
        """Return a discovery URL for this realm.

        For a wildcard realm the star is replaced by C{www}; the
        scheme, port (if any) and path are kept.  For a non-wildcard
        realm the original, unparsed realm string is returned.

        This function does not check to make sure that the realm is
        valid. Its behaviour on invalid inputs is undefined.

        @rtype: str

        @returns: The URL upon which relying party discovery should be run
            in order to verify the return_to URL

        @since: 2.1.0
        """
        if self.wildcard:
            # Use "www." in place of the star
            assert self.host.startswith('.'), self.host
            netloc = 'www' + self.host

            # Keep an explicit port; dropping it would send discovery
            # to a different server than the one the realm names.
            if self.port:
                netloc = '%s:%s' % (netloc, self.port)

            return '%s://%s%s' % (self.proto, netloc, self.path)
        else:
            return self.unparsed

    def __repr__(self):
        return "TrustRoot(%r, %r, %r, %r, %r, %r)" % (
            self.unparsed, self.proto, self.wildcard, self.host, self.port,
            self.path)

    def __str__(self):
        return repr(self)

# The service type URI for relying party discovery, used in realm
# verification.  Discovered endpoints are matched against this type to
# decide whether they describe a return_to URL.
#
# XXX: This should probably live somewhere else (like in
# openid.consumer or openid.yadis somewhere)
RP_RETURN_TO_URL_TYPE = 'http://specs.openid.net/auth/2.0/return_to'

def _extractReturnURL(endpoint):
    """Yadis filter that picks out relying party return_to endpoints.

    The endpoint's URI is returned when the endpoint advertises the
    C{RP_RETURN_TO_URL_TYPE} service type; every other endpoint is
    filtered out by returning None.

    @see: C{L{openid.yadis.services}}
    @see: C{L{openid.yadis.filters}}

    @param endpoint: An XRDS BasicServiceEndpoint, as returned by
        performing Yadis discovery.

    @returns: The endpoint URL or None if the endpoint is not a
        relying party endpoint.
    @rtype: str or NoneType
    """
    is_return_to = endpoint.matchTypes([RP_RETURN_TO_URL_TYPE])
    if not is_return_to:
        return None

    return endpoint.uri

def returnToMatches(allowed_return_to_urls, return_to):
    """Report whether return_to falls under any of the allowed
    return_to URLs.

    Each allowed URL is treated like a realm pattern, with the
    restriction that wildcard patterns never match.

    @param allowed_return_to_urls: The return_to URL patterns to
        check against.

    @param return_to: The return_to URL being verified.

    @returns: True if at least one allowed URL matches, otherwise
        False.

    @since: 2.1.0
    """
    for candidate in allowed_return_to_urls:
        pattern = TrustRoot.parse(candidate)

        # Entries that do not parse as a trust root cannot match.
        if pattern is None:
            continue

        # A return_to pattern is not allowed to use a wildcard.
        if pattern.wildcard:
            continue

        if pattern.validateURL(return_to):
            return True

    # Nothing in the list matched
    return False

def getAllowedReturnURLs(relying_party_url):
    """Discover the return_to URLs advertised at a relying party
    discovery URL.

    @param relying_party_url: The URL on which to run discovery.

    @returns: The return_to URLs extracted from the discovered
        service endpoints.

    @raises RealmVerificationRedirected: If the URL reported by
        discovery differs from C{relying_party_url}.

    @since: 2.1.0
    """
    discovered = services.getServiceEndpoints(
        relying_party_url, _extractReturnURL)
    final_url, return_to_urls = discovered

    if final_url == relying_party_url:
        return return_to_urls

    # Verification caused a redirect
    raise RealmVerificationRedirected(relying_party_url, final_url)

# _vrfy parameter is there to make testing easier
def verifyReturnTo(realm_str, return_to, _vrfy=getAllowedReturnURLs):
    """Verify that a return_to URL is valid for the given realm.

    This function builds a discovery URL, performs Yadis discovery on
    it, makes sure that the URL does not redirect, parses out the
    return_to URLs, and finally checks to see if the current return_to
    URL matches the return_to.

    @raises DiscoveryFailure: When Yadis discovery fails
    @returns: True if the return_to URL is valid for the realm

    @since: 2.1.0
    """
    realm = TrustRoot.parse(realm_str)
    if realm is None:
        # The realm does not parse as a URL pattern
        return False

    try:
        allowable_urls = _vrfy(realm.buildDiscoveryURL())
    except RealmVerificationRedirected, err:
        oidutil.log(str(err))
        return False

    if returnToMatches(allowable_urls, return_to):
        return True
    else:
        oidutil.log("Failed to validate return_to %r for realm %r, was not "
                    "in %s" % (return_to, realm_str, allowable_urls))
        return False