#! /usr/bin/python2.3
"""
htmlextractor.py

library of functions and classes for extracting information from HTML.

All extraction is regex based (no real HTML parser is involved), so the
results are best-effort on malformed or unusual markup.

TODO:
    - extract flash and other objects.
"""

import re
import sys

#
# Regular Expressions for Parsing
#
# re.S is used wherever a pattern contains "." so that comments, scripts,
# tags and element contents that span several lines are still matched.
#

# filters
comment_re = re.compile(r'<!--.*?-->', re.S)
script_re = re.compile(r'<\s*script\b.*?<\s*\/script\s*>', re.I | re.S)
style_re = re.compile(r'<\s*style\b.*?<\s*\/style\s*>', re.I | re.S)
tag_re = re.compile(r'<.*?>', re.S)
apos_re = re.compile(r'&apos;', re.I)
amp_re = re.compile(r'&amp;', re.I)
# Named (&copy;), decimal (&#169;) and hex (&#xA9;) character references.
# Deliberately narrow so a stray "&" does not swallow text up to the next ";".
entity_re = re.compile(r'&#?\w+;')

# extractors
# NOTE: only anchors whose first attribute is href are recognised.
anchor_re = re.compile(
    r'<\s*a\s+href\s*=\s*[\'"]?([^>]*?)[\'"]?\s*>(.*?)<\s*\/a\s*>',
    re.I | re.S)
title_re = re.compile(r'<\s*title\s*>(.*?)<\s*\/title\s*>', re.I | re.S)
# The optional "/" keeps the slash of a self-closing <img ... /> out of the
# captured attribute text.
image_re = re.compile(
    r'<\s*img[^>]*?src\s*=\s*([^>]*?)\s*\/?\s*>',
    re.I | re.S)

# Order matters: comments, scripts and styles must be removed before the
# generic tag filter, otherwise their bodies would be left behind as text.
_MARKUP_FILTERS = (
    comment_re,
    script_re,
    style_re,
    tag_re,
    apos_re,
    amp_re,
    entity_re,
)

_QUOTE_CHARS = ("'", '"')

# Punctuation that norm_word never keeps, even directly before a letter/digit.
_DROPPED_JOINERS = ("(", ")", "*", "&", "^", "[", "]", "{", "}", "|", "!", "%")


#
# Utility Function(s)
#

def remove_quotes(s):
    """
    remove_quotes(string: s) -> string

    strips at most one single or double quote character from each end of s.
    Empty strings and None are returned unchanged.
    """
    if not s:
        return s
    start = 0
    end = len(s)
    if s[0] in _QUOTE_CHARS:
        start = 1
    if s[-1] in _QUOTE_CHARS:
        end -= 1
    # For a lone quote character start > end and the slice is simply ''.
    return s[start:end]


def norm_word(word):
    """
    norm_word(string: word) -> string: word

    cleans up a word. Alphanumeric characters are kept; a punctuation
    character is kept only when it sits directly before an alphanumeric
    character (the apostrophe in "don't", the dot in "3.14") and is not one
    of the bracket/symbol characters in _DROPPED_JOINERS. Whitespace is
    dropped, and finally one leading/trailing quote is removed.
    """
    kept = []
    prev = ''
    for ch in word:
        if ch.isalnum():
            if (not prev.isalnum() and not prev.isspace()
                    and prev not in _DROPPED_JOINERS):
                # prev is '' for the first character; appending it is a no-op.
                kept.append(prev)
            kept.append(ch)
        prev = ch
    return remove_quotes(''.join(kept))


def _strip_markup(text):
    """Return text with newlines joined by spaces and every markup filter
    (comments, scripts, styles, tags, entities) replaced by a space."""
    text = ' '.join(text.split('\n'))
    for regex in _MARKUP_FILTERS:
        text = regex.sub(" ", text)
    return text


#
# Public Functions
#

#
# Functions for extracting words from html.
#

# deprecated
def extract_words(text):
    """
    extract_words(string: text) -> list of string: words

    extracts content words from html: markup is stripped, the remainder is
    split on whitespace, each piece is cleaned with norm_word and pieces
    that end up empty are discarded.
    """
    words = []
    for token in _strip_markup(text).split():
        word = norm_word(token)
        if word != '':
            words.append(word)
    return words


def extract_tokens(text):
    """
    extract_tokens(string: text) -> list of string: tokens

    extracts content tokens (possibly words) from html. Markup is stripped
    exactly as in extract_words, but the whitespace-separated tokens are
    returned as they are, without norm_word clean-up.
    """
    return _strip_markup(text).split()


#
# Functions for extracting certain tags and tag contents from HTML
#

def extract_anchors(text):
    """
    extract_anchors(string: text) -> list of tuple: (href, contents)

    returns one (href, contents) pair per <a href=...>...</a> element found.
    When the tag carries further attributes after href, only the first
    whitespace-separated piece is kept as the href and a quote left on it
    is removed. An empty href is returned as ''.
    """
    anchors = []
    for href, contents in anchor_re.findall(text):
        pieces = href.split()
        if pieces:
            href = remove_quotes(pieces[0])
        anchors.append((href, contents))
    return anchors


def extract_title_words(text):
    """
    extract_title_words(string: text) -> list of string: words

    returns the norm_word-cleaned, non-empty words of every <title> element.
    """
    words = []
    for title in title_re.findall(text):
        for token in title.split():
            word = norm_word(token)
            if word != '':
                words.append(word)
    return words


def extract_images(text):
    """
    extract_images(string: text) -> list of string: image urls

    returns the src value of every <img> tag, with surrounding quotes
    removed. Tags whose src has no value at all are skipped.
    """
    # leaving out image dimensions... might want to add back in later.
    img_urls = []
    for attr_text in image_re.findall(text):
        pieces = attr_text.split()
        if not pieces:
            # <img src=> : nothing to report, and pieces[0] would fail.
            continue
        img_urls.append(remove_quotes(pieces[0]))
    return img_urls


def main():
    """
    command line driver: for every file named on the command line print its
    words, anchors, title words and image urls, one value per line.
    Exits with status 1 and a usage message when no file is given.
    """
    args = sys.argv[1:]
    if len(args) < 1:
        print("usage: htmlwords.py htmlfiles")
        sys.exit(1)

    for filename in args:
        fd = open(filename, 'r')
        try:
            text = fd.read()
        finally:
            # close the handle even if the read fails
            fd.close()

        # words
        for word in extract_words(text):
            print(word)

        # links
        for value in extract_anchors(text):
            print(value)

        # title words
        for value in extract_title_words(text):
            print(value)

        # images
        for value in extract_images(text):
            print(value)


if __name__ == '__main__':
    main()