#! /usr/bin/python
"""
urlresolver: makes relative urls absolute

TODO:
 - check for malformed urls, possibly in a separate function.
"""

try:
    # Python 2 only: the ``exceptions`` module does not exist on Python 3,
    # where ``Exception`` is simply a builtin.
    from exceptions import Exception
except ImportError:
    pass

from lib import textfilters

filters = textfilters

# Schemes recognised at the start of the base url and of the relative url.
_SCHEMES = ("http://", "https://")
# Scheme put on the result when the base url carries none.
_DEFAULT_SCHEME = "http://"


class URLResolveError(Exception):
    """Raised when a relative url cannot be resolved against its base."""
    pass


def _split_scheme(url):
    """Split a leading ``http://`` / ``https://`` off ``url``.

    Returns a ``(scheme, rest)`` tuple.  ``scheme`` is lower-cased and is the
    empty string when ``url`` does not start with a recognised scheme, in
    which case ``rest`` is ``url`` unchanged.
    """
    lowered = url.lower()
    for scheme in _SCHEMES:
        if lowered.startswith(scheme):
            return scheme, url[len(scheme):]
    return "", url


def _is_absolute(url):
    """Return True when ``url`` starts with a recognised scheme.

    Leading quote characters are ignored so that a still-quoted value such
    as ``"http://host/"`` is recognised as absolute.
    """
    return _split_scheme(url.lstrip("'\""))[0] != ""


def _strip_leading_levels(rel_url):
    """Strip leading ``./`` and ``../`` segments from ``rel_url``.

    Returns ``(remainder, up_levels)`` where ``up_levels`` is the number of
    ``../`` segments removed.  A stray ``/`` directly after a ``../`` is
    dropped as well.
    """
    up_levels = 0
    while rel_url:
        if rel_url[:3] == '../':
            rel_url = rel_url[3:]
            up_levels += 1
        elif rel_url[:2] == './':
            rel_url = rel_url[2:]
        elif up_levels > 0 and rel_url[0] == '/':
            rel_url = rel_url[1:]
        else:
            break
    return rel_url, up_levels


def _resolve_error(abs_url, rel_url):
    """Build the URLResolveError for an unresolvable base/relative pair."""
    error_msg = "URL Cannot Be Resolved Base: %s, Relative: %s" % (abs_url, rel_url)
    return URLResolveError(error_msg)


def urlresolver(abs_url, rel_url):
    """Resolve ``rel_url`` against the base url ``abs_url``.

    Both arguments are stripped of surrounding whitespace first.

    Returns:
      - ``abs_url`` unchanged when ``rel_url`` is empty;
      - ``rel_url`` unchanged when it already starts with ``http://`` or
        ``https://`` (a leading quote is ignored for this check);
      - otherwise the combined url, passed through
        ``textfilters.remove_quotes`` (defined outside this file; presumably
        it strips quote characters -- confirm against lib/textfilters).

    The result keeps the scheme of ``abs_url`` and falls back to ``http://``
    when the base has none.

    Raises:
      URLResolveError: when the base is empty, or when ``rel_url`` contains
      more ``../`` segments than the base has directories below the host.
    """
    abs_url = abs_url.strip()
    rel_url = rel_url.strip()
    orig_abs_url = abs_url
    orig_rel_url = rel_url

    # first check to see if we need to do anything.
    if not rel_url:
        return abs_url
    if _is_absolute(rel_url):
        return rel_url

    # remove the scheme before processing; it is put back at the end.
    scheme, base = _split_scheme(abs_url)
    if not scheme:
        scheme = _DEFAULT_SCHEME
    if not base:
        # nothing to resolve against (base was empty or only a scheme).
        raise _resolve_error(orig_abs_url, orig_rel_url)

    if base[-1] != "/":
        if "/" in base:
            # there is a file at the end of the base: keep its directory.
            base = base[:base.rfind("/") + 1]
        else:
            # bare host such as 'www.python.org': it is not a file name.
            base += "/"

    # NOTE(review): a root-relative reference ('/x') is not resolved against
    # the host root; it is appended to the base directory (the doubled slash
    # is collapsed below).  Kept as-is for backward compatibility.
    path, up_levels = _strip_leading_levels(rel_url)

    if up_levels:
        segments = [part for part in base.split("/") if part]
        # the first segment is the host, which must always survive.
        if up_levels >= len(segments):
            raise _resolve_error(orig_abs_url, orig_rel_url)
        base = "/".join(segments[:-up_levels]) + "/"

    resolved = (base + path).replace("//", "/")
    return filters.remove_quotes(scheme + resolved)


if __name__ == '__main__':
    # normal cases
    u1 = 'http://www.python.org/doc/2.3.4/lib/module-exceptions.html'
    u2 = 'http://www.python.org/doc/2.3.4/lib/'
    r0 = 'http://www.python.org/doc/2.3.4/lib/'
    r1 = "../tut/tut.html"
    r2 = 'module-weakref.html'
    r3 = './module-weakref.html'
    r4 = "../../tut/tut.html"

    NORMAL_TEST = 1
    VERBOSE = 0
    BREAK_TEST = 1

    if NORMAL_TEST:
        assert urlresolver(u1, r4) == 'http://www.python.org/doc/tut/tut.html'
        assert urlresolver(u1, r0) == r0
        assert urlresolver(u1, r1) == 'http://www.python.org/doc/2.3.4/tut/tut.html'
        assert urlresolver(u1, r2) == 'http://www.python.org/doc/2.3.4/lib/module-weakref.html'
        assert urlresolver(u1, r3) == 'http://www.python.org/doc/2.3.4/lib/module-weakref.html'
        assert urlresolver(u2, r0) == r0
        assert urlresolver(u2, r1) == 'http://www.python.org/doc/2.3.4/tut/tut.html'
        assert urlresolver(u2, r2) == 'http://www.python.org/doc/2.3.4/lib/module-weakref.html'
        assert urlresolver(u2, r3) == 'http://www.python.org/doc/2.3.4/lib/module-weakref.html'
        # a relative name that merely contains 'http' is still relative
        assert urlresolver(u2, 'http-intro.html') == 'http://www.python.org/doc/2.3.4/lib/http-intro.html'
        # the scheme of the base is preserved
        assert urlresolver('https://www.python.org/doc/', r2) == 'https://www.python.org/doc/module-weakref.html'
        # a base without any path keeps its host
        assert urlresolver('http://www.python.org', r2) == 'http://www.python.org/module-weakref.html'
        # mixed './' and '../' prefixes
        assert urlresolver(u2, './../tut/tut.html') == 'http://www.python.org/doc/2.3.4/tut/tut.html'

    if VERBOSE:
        absolute = [u1, u2]
        relative = [r0, r1, r2, r3]
        for u in absolute:
            for r in relative:
                print(urlresolver(u, r))

    if BREAK_TEST:
        u0 = 'http://www.python.org/doc/2.3.4/lib/'
        u1 = ""
        u2 = 'http://www.python.org/'
        r1 = ""
        r2 = "../tut/tut.html"

        assert urlresolver(u0, r1) == u0
        assert urlresolver(u1, r1) == ''

        # climbing above the host cannot be resolved
        exception_raised = False
        try:
            urlresolver(u2, r2)
        except URLResolveError:
            exception_raised = True
        assert exception_raised

        # an empty base with a non-empty relative url cannot be resolved
        exception_raised = False
        try:
            urlresolver(u1, r2)
        except URLResolveError:
            exception_raised = True
        assert exception_raised