#! /usr/bin/python
"""Parse, inspect, modify and rebuild http, https and ftp URLs.

A URL is split into
(scheme, network location, path, parameters, query, fragment identifier)
by the standard library and then refined further (credentials, host or IPv4
address, port, top-level domain, file name and extension).

Known limitations:
  * Only http, https and ftp style URLs are handled; other schemes
    (mailto, ...) would need their own parsing.
  * The standard library only recognises ``;parameters`` on the last path
    segment.  Parameters that trail the query string are split off here by
    hand; parameters embedded in earlier path segments stay in the path.

TODO:
  1. proper handling of parameters embedded in the path
WISH LIST:
  2. validation: raise a URLNotValid exception describing the parse error
"""

import copy
import re
import urllib

try:  # Python 2
    import urlparse
    from urllib import quote as _quote
except ImportError:  # Python 3
    from urllib import parse as urlparse
    from urllib.parse import quote as _quote

VERSION = 0.2


def paramsNeedFixin(query):
    """Return True when *query* carries ``;params`` after a non-empty query.

    Only parameters in the query string matter here; a ``;`` in the very
    first position is left alone.
    """
    return query.find(';') > 0


def fixParams(query):
    """Split *query* at its first ``;`` and return ``[query, params]``.

    Everything after the first ``;`` is treated as the parameter string, so a
    query followed by several ``;``-separated parameters still yields exactly
    two items.  When there is no ``;`` the params item is ``''``.
    """
    head, _, tail = query.partition(';')
    return [head, tail]


class WebURL(object):
    """A mutable view of an http, https or ftp URL.

    Every component is exposed through a method that returns the current
    value when called without an argument and stores a new value when called
    with a non-empty one (an empty argument is always treated as "get").
    Relative URLs (no host) are supported; ``base + relative`` builds an
    absolute URL.
    """

    # Ports assumed when the URL does not name one explicitly.
    DefaultPorts = {
        'http': '80',
        'https': '443',
        'ftp': '21',
    }

    # Dotted-quad IPv4 address (octet ranges are not validated).
    _IPV4 = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

    def __init__(self, URL):
        """Parse the string *URL* into its components."""
        self._scheme = ''      # http, ftp, etc.
        self._user = ''        # username
        self._password = ''    # password
        self._host = ''        # canonical host name
        self._toplevel = ''    # com, edu, etc.
        self._ipaddr = ''      # dotted ip; mutually exclusive with _host
        self._port = ''        # port; the scheme default when none is given
        self._path = ''        # path without its leading '/'
        self._doctype = ''     # extension of the document
        self._filename = ''    # only set when the last segment has an extension,
                               # otherwise it could be a directory
        self._params = ''
        self._query = ''
        self._frag = ''        # stuff after the '#' char
        self._origPort = False     # True when the port must appear in url()
        self._paramsInFront = True  # params are rendered before the query

        scheme, address, path, params, query, frag = urlparse.urlparse(URL)
        self._scheme = scheme.lower()

        # Normalise the path: drop a leading "./" or "/" but keep "../" and
        # names that merely start with a dot (".hidden").
        if path == '.':
            path = ''
        elif path.startswith('./'):
            path = path[2:]
        if path.startswith('/'):
            path = path[1:]
        self._assignPath(path)
        self._frag = frag

        # Parameters written after the query end up inside the query string
        # and have to be split off by hand; remember where they came from so
        # url() can put them back in the same place.
        if paramsNeedFixin(query):
            self._query, self._params = fixParams(query)
            self._paramsInFront = False
        else:
            self._query, self._params = query, params

        self._parseAuthority(address)

    def _parseAuthority(self, address):
        """Split ``user:password@host:port`` into the matching attributes."""
        # Credentials come first and are case sensitive, so peel them off
        # before the host is lower-cased.  The last '@' ends the userinfo.
        userinfo, at, hostport = address.rpartition('@')
        if at:
            self._user, _, self._password = userinfo.partition(':')

        # Only an all-digit suffix counts as a port; this also leaves
        # bracketed literals such as "[::1]" in one piece.
        host, colon, port = hostport.rpartition(':')
        if colon and port.isdigit():
            self._port = port
            self._origPort = True
        else:
            host = hostport
            self._port = self.DefaultPorts.get(self._scheme, '')

        host = host.lower()
        if self._IPV4.match(host):
            self._ipaddr = host
        else:
            if '.' in host:
                self._toplevel = host.split('.')[-1]
            self._host = host

    def _assignPath(self, path):
        """Store *path* and re-derive the file name and extension from it."""
        self._path = path
        self._filename = ''
        self._doctype = ''
        last = path.split('/')[-1]
        # A last segment counts as a file only when it has an extension.
        if '.' in last and last not in ('.', '..'):
            self._filename = last
            self._doctype = last.split('.')[-1]

    def scheme(self, scheme=''):
        """Get the scheme, or set it when *scheme* is non-empty."""
        if scheme:
            self._scheme = scheme
        else:
            return self._scheme

    def user(self, user=''):
        """Get the user name, or set it when *user* is non-empty."""
        if user:
            self._user = user
        else:
            return self._user

    def password(self, password=''):
        """Get the password, or set it when *password* is non-empty."""
        if password:
            self._password = password
        else:
            return self._password

    def host(self, host=''):
        """Get the host name, or set it when *host* is non-empty."""
        if host:
            self._host = host
        else:
            return self._host

    def toplevel(self):
        """Return the top-level domain found at parse time ('' if none)."""
        return self._toplevel

    def ipaddr(self, ip=''):
        """Get the dotted IPv4 address, or set it when *ip* is non-empty."""
        if ip:
            self._ipaddr = ip
        else:
            return self._ipaddr

    def port(self, port=''):
        """Get the port (a string), or set it when *port* is truthy.

        An explicitly set port is always rendered by url().
        """
        if port:
            self._port = str(port)
            self._origPort = True
        else:
            return self._port

    def path(self, path=''):
        """Get the path, or set it when *path* is non-empty.

        Setting the path also refreshes filename() and doctype().
        """
        if path:
            self._assignPath(path)
        else:
            return self._path

    def queryDict(self, queryDict=None):
        """Get the query as a dict, or replace the query from a dict.

        The dict is always built from / folded back into the query string on
        the fly; nothing is cached.  Keys and values are used verbatim (no
        percent-encoding or decoding).  A key without ``=`` maps to ``''``
        and only the first ``=`` of a pair separates key from value.

        Returns the dict in both modes.
        """
        if queryDict is not None:
            self._query = '&'.join(
                "%s=%s" % (k, v) for k, v in queryDict.items())
            return queryDict

        result = {}
        for pair in self._query.split('&'):
            if not pair:  # empty query, or a stray '&'
                continue
            key, _, value = pair.partition('=')
            result[key] = value
        return result

    # This is not a perfect way to implement params, since params can occur
    # in a path as well as before or after the query.
    def params(self, params=''):
        """Get the parameter string, or set it when *params* is non-empty."""
        if params:
            self._params = params
        else:
            return self._params

    def query(self, q=''):
        """Get the raw query string, or set it when *q* is non-empty."""
        if q:
            self._query = q
        else:
            return self._query

    def fragment(self, frag=''):
        """Get the fragment, or set it when *frag* is non-empty."""
        if frag:
            self._frag = frag
        else:
            return self._frag

    def doctype(self):
        """Return the extension of the document ('' if none)."""
        return self._doctype

    def filename(self):
        """Return the last path segment if it has an extension, else ''."""
        return self._filename

    def isAbsolute(self):
        """Return True when the URL names a host or an IP address."""
        return self._host != '' or self._ipaddr != ''

    def isRelative(self):
        """Return True when the URL names neither a host nor an IP address."""
        return not self.isAbsolute()

    # makes relative urls absolute.
    # EXPERIMENTAL
    def __add__(self, webURL):
        """Resolve the relative *webURL* against this (base) URL.

        An absolute *webURL* is returned unchanged.  Otherwise a copy of it
        is returned that takes scheme, credentials, host / IP and port from
        the base and whose path is appended to the base's directory.  An
        empty relative path keeps the base path as is.  ".." segments are
        not collapsed.
        """
        # we can only add a relative to an absolute.
        if webURL.isAbsolute():
            return webURL

        resolved = copy.deepcopy(webURL)
        if self._scheme:
            resolved._scheme = self._scheme
        for attr in ('_user', '_password', '_host', '_toplevel',
                     '_ipaddr', '_port', '_origPort'):
            setattr(resolved, attr, getattr(self, attr))

        relPath = resolved._path
        if not relPath:
            resolved._assignPath(self._path)
            return resolved

        # The base's file name (if any) is not part of its directory.
        if self._filename:
            baseDir = self._path[:-len(self._filename)]
        else:
            baseDir = self._path
        needsSep = (baseDir and not baseDir.endswith('/')
                    and not relPath.startswith('/'))
        resolved._assignPath(baseDir + ('/' if needsSep else '') + relPath)
        return resolved

    def __str__(self):
        return self.url()

    def url(self):
        """Rebuild and return the URL string from the current components.

        Formats:
          http/https  scheme://[user[:password]@]host[:port]/path[;params][?query][#frag]
          ftp         ftp://[user[:password]@]host[:port]/path[;params]

        The port is only written when the original URL had one or one was
        set explicitly.  Parameters go back where they were found (before or
        after the query).  A URL without host or IP address is rendered in
        relative form, unless its scheme is ftp.
        """
        host = self._ipaddr or self._host  # mutually exclusive
        port = ':' + self._port if self._origPort else ''
        params = ';' + self._params if self._params else ''

        auth = ''
        if self._user:
            auth = self._user
            if self._password:
                auth += ':' + self._password
            auth += '@'

        if self._scheme == 'ftp':
            return "%s://%s%s%s/%s%s" % (
                self._scheme, auth, host, port, self._path, params)

        query = '?' + self._query if self._query else ''
        frag = '#' + self._frag if self._frag else ''
        if self._paramsInFront:
            tail = params + query + frag
        else:
            tail = query + params + frag

        if not self.isAbsolute():
            return self._path + tail
        return "%s://%s%s%s/%s%s" % (
            self._scheme or 'http', auth, host, port, self._path, tail)

    def urlEncoded(self):
        """Return url() percent-encoded so it can be embedded in another URL.

        Every reserved character, including '/' and ':', is escaped.
        """
        return _quote(self.url(), safe='')
def unittest():
    """Self-test for WebURL: parse each scenario URL and check every accessor.

    Raises AssertionError on the first mismatch and returns None otherwise.
    """
    sampleQuery = {'q': '1', 'y': '2'}

    def accessorChecks(scheme, port, host, path, params, query,
                       doctype, toplevel, filename=None):
        # Ordered (accessor name, expected value) pairs for one scenario.
        checks = [
            ('scheme', scheme),
            ('port', port),
            ('host', host),
            ('path', path),
            ('params', params),
            ('query', query),
            ('fragment', ''),
        ]
        if filename is not None:
            checks.append(('filename', filename))
        checks.extend([
            ('doctype', doctype),
            ('ipaddr', ''),
            ('toplevel', toplevel),
            ('user', ''),
            ('password', ''),
            ('ipaddr', ''),
        ])
        return checks

    relativeUrl = "somepath/;param=1?q=1&y=2"

    # Each scenario: (url, accessor checks, expected queryDict or None,
    #                 host to assign before rebuilding or None, expected url())
    scenarios = [
        # Test 1 simple url
        ("http://www.python.org/",
         accessorChecks('http', '80', 'www.python.org', '', '', '', '', 'org'),
         None, None,
         "http://www.python.org/"),
        # Test 2 url with params and query
        ("http://www.python.org/somepath/index.html;param=1?q=1&y=2",
         accessorChecks('http', '80', 'www.python.org', 'somepath/index.html',
                        'param=1', 'q=1&y=2', 'html', 'org'),
         sampleQuery, None,
         "http://www.python.org/somepath/index.html;param=1?q=1&y=2"),
        # Test 3 same as 2 but params after query instead of before query
        ("http://www.python.org/somepath/index.html?q=1&y=2;param=1",
         accessorChecks('http', '80', 'www.python.org', 'somepath/index.html',
                        'param=1', 'q=1&y=2', 'html', 'org'),
         sampleQuery, None,
         "http://www.python.org/somepath/index.html?q=1&y=2;param=1"),
        # Test 4 no filename
        ("http://www.python.org/somepath/?q=1&y=2;param=1",
         accessorChecks('http', '80', 'www.python.org', 'somepath/',
                        'param=1', 'q=1&y=2', '', 'org'),
         sampleQuery, None,
         "http://www.python.org/somepath/?q=1&y=2;param=1"),
        # Test 5 test new filename method
        ("http://www.python.org:8080/somepath/index.html;param=1?q=1&y=2",
         accessorChecks('http', '8080', 'www.python.org',
                        'somepath/index.html', 'param=1', 'q=1&y=2',
                        'html', 'org', filename='index.html'),
         sampleQuery, None,
         "http://www.python.org:8080/somepath/index.html;param=1?q=1&y=2"),
        # Test 6 testing relative urls
        (relativeUrl,
         accessorChecks('', '', '', 'somepath/', 'param=1', 'q=1&y=2', '', ''),
         sampleQuery, None,
         relativeUrl),
        # Test 7 create absolute from relative.
        (relativeUrl,
         accessorChecks('', '', '', 'somepath/', 'param=1', 'q=1&y=2', '', ''),
         sampleQuery, 'www.mikesiley.org',
         "http://www.mikesiley.org/" + relativeUrl),
    ]

    for source, checks, wantedQuery, newHost, wantedUrl in scenarios:
        parsed = WebURL(source)
        for accessor, wanted in checks:
            assert getattr(parsed, accessor)() == wanted
        if wantedQuery is not None:
            assert parsed.queryDict() == wantedQuery
        if newHost is not None:
            parsed.host(newHost)
        assert parsed.url() == wantedUrl


if __name__ == '__main__':
    unittest()