#! /usr/bin/python
"""
crawler.py

Purpose and Features:
    - recursively crawls the links of a website and downloads each page.
    - can be set to a certain depth limit.
    - each page downloaded is kept in the current or specified directory.
    - downloads each page only once.  each page's unique signature is the
      url and character count. ???

NOTE(review): the code in this file implements neither a depth limit nor a
configurable output directory; pages are always written to the current
directory.  How uniqueness is decided is delegated to UniqItemServer and is
not visible here -- confirm before relying on the last bullet.
"""

import glob
import os
import subprocess
import sys
import tempfile

sys.path.append("../..")

from lib.uniqitemserver import UniqItemServer
from lib.urlresolver import urlresolver, URLResolveError
from lib import textfilters

filters = textfilters


class Crawler(object):
    """Fetch pages starting from a root URL and follow the links they contain.

    Every fetched page is stored in a temporary ``.html`` file in the current
    directory.  Links found in a page are queued for fetching when they pass
    the positive/negative substring constraints registered on the crawler.
    """

    def __init__(self, rootURL, file):
        """Create a crawler seeded with *rootURL*.

        rootURL -- first URL to fetch.
        file    -- writable file-like object; crawl() writes one
                   "<url> <saved file>" line per fetched page to it.
                   (The name shadows the builtin but is kept because it is
                   part of the constructor's interface.)
        """
        self._foundURLs = UniqItemServer([])
        self._crawledURLs = []
        self._foundURLs.append(rootURL)
        self._urlFile = file
        self._posConstraints = []
        self._negConstraints = []
        self._verbose = False

    def verbose(self):
        """Turn on progress messages on stdout."""
        self._verbose = True

    def _download(self):
        """Fetch the next pending URL with wget into a fresh temp file.

        Returns a ``(url, filename)`` tuple.  StopIteration raised by the URL
        generator is deliberately left to propagate; crawl() uses it as the
        "nothing left to fetch" signal.
        """
        # presumably each new generator resumes after the items already
        # served -- verify against UniqItemServer.item_generator.
        urlServer = self._foundURLs.item_generator()
        url = next(urlServer)
        if self._verbose:
            print("NextURL -> %s" % (url,))

        fd, tfile = tempfile.mkstemp(suffix='.html', dir='.')
        os.close(fd)
        # The file lives in the current directory, so its base name is a
        # valid relative path whether mkstemp reported "./name" or an
        # absolute path (the old "drop the first two characters" trick
        # mangled the latter).
        tfile = os.path.basename(tfile)

        # XXX this needs to go into a thread
        # URLs come from downloaded pages and are untrusted: pass them as a
        # separate argv entry (no shell), after "--" so a URL can never be
        # parsed as a wget option.  The exit status is ignored, as before.
        subprocess.call(['wget', '-a', 'log',
                         '--output-document=%s' % tfile,
                         '--', url])

        self._crawledURLs.append([url, tfile])
        if self._verbose:
            print("SavedTo -> %s" % (tfile,))
        return (url, tfile)

    def addPosConstraint(self, constraint):
        """Require *constraint* to appear as a substring of accepted URLs."""
        self._posConstraints.append(constraint)

    def addNegConstraint(self, constraint):
        """Reject any URL that contains *constraint* as a substring."""
        self._negConstraints.append(constraint)

    def _urlMeetsConstraint(self, url):
        """Return True when *url* contains every positive constraint and
        none of the negative ones.  Empty constraint lists accept anything.
        """
        hasAllRequired = all(c in url for c in self._posConstraints)
        hasForbidden = any(c in url for c in self._negConstraints)
        return hasAllRequired and not hasForbidden

    def _extractLinks(self, baseurl, tfile):
        """Queue the acceptable links found in the saved page *tfile*.

        baseurl -- URL the page was fetched from; links are resolved
                   against it with urlresolver().
        tfile   -- path of the downloaded page.

        Fragments ("#...") are dropped before resolving.  Links that raise
        URLResolveError are skipped (and reported in verbose mode).
        """
        with open(tfile) as page:
            text = page.read()

        # extract_anchors yields sequences whose first element is the link.
        urls = [anchor[0] for anchor in filters.extract_anchors(text)]
        for url in urls:
            try:
                if "#" in url:
                    url = url.split("#")[0]
                url = urlresolver(baseurl, url)
            except URLResolveError as e:
                if self._verbose:
                    print("ERROR -> %s" % (e,))
                continue
            if self._urlMeetsConstraint(url):
                self._foundURLs.append(url)

    def crawl(self):
        """Download and scan pages until no URLs are left.

        When the pending URLs are exhausted (StopIteration), one
        "<url> <saved file>" line per fetched page is written to the file
        object given to the constructor.
        """
        while True:
            try:
                url, tfile = self._download()
                self._extractLinks(url, tfile)
            except StopIteration:
                for urlEntry in self._crawledURLs:
                    self._urlFile.write(' '.join(urlEntry) + '\n')
                break


if __name__ == '__main__':
    rootURL = "http://www.python.org/"
    logFile = "found_urls"

    # "with" guarantees the log is closed even if the crawl raises.
    with open(logFile, 'w') as fd:
        crawler = Crawler(rootURL, fd)

        # these should go into a file.
        crawler.addPosConstraint("www.python.org")
        for unwanted in ("cgi-bin", ".exe", ".pdf", ".tar", ".tgz", ".bz2",
                         ".jpeg", ".jpg", ".png", ".gif", "mailto", "txt",
                         "ftp"):
            crawler.addNegConstraint(unwanted)

        crawler.crawl()