#! /usr/bin/python2.3
"""
htmlextractor.py
library of functions and classes for extracting
information from HTML.
TODO:
- extract flash and other objects.
"""
import sys, re
#
# Regular Expressions for Parsing
#
# filters
# Every filter below is substituted with a single space by the word/token
# extractors, in the order: comment, script, style, tag, apos, amp, entity.
# NOTE: re.M only changes how ^ and $ behave; '.' still stops at a newline,
# so these patterns match within one line. extract_words/extract_tokens
# join the lines of the document before applying the filters.
# html comments: <!-- ... -->
comment_re = re.compile(r'<!--.*?-->', re.M)
# whole <script> ... </script> elements, including the code inside them
script_re = re.compile(r'<\s*script\b.*?<\s*\/script\s*>', re.I | re.M)
# whole <style> ... </style> elements, including the css inside them
style_re = re.compile(r'<\s*style\b.*?<\s*\/style\s*>', re.I | re.M)
# any remaining tag
tag_re = re.compile(r'<.*?>', re.I | re.M)
# the &apos; and &amp; entities
apos_re = re.compile(r'&apos;', re.I)
amp_re = re.compile(r'&amp;', re.I)
# any other named (&nbsp;) or numeric (&#160;, &#xA0;) entity. requiring
# word characters keeps plain text such as "AT&T rocks; yes" intact.
entity_re = re.compile(r'&#?\w+;')
# extractors
# group 1: raw href value (may still carry a closing quote and further
# attributes), group 2: anchor contents up to the closing </a>, or up to the
# end of the line when the anchor is not closed on that line.
anchor_re = re.compile(r'<\s*a\s+href\s*\=\s*[\'\"]{0,1}(.*?)[\'\"]{0,1}\s*?>(.*?)(?:<\s*\/a\s*>|$)', re.I | re.M)
# group 1: text between <title> and </title>
title_re = re.compile(r'<\s*title\s*>(.*?)<\s*\/title\s*>', re.I | re.M)
# group 1: everything after "src=" up to the end of the <img> tag
image_re = re.compile(r'<\s*img.*?src\s*\=\s*(.*?)\s*>', re.I | re.M)
#
# Utility Function(s)
#
def remove_quotes(s):
    """
    remove_quotes(string: s) -> string: s
    strips at most one leading and one trailing quote character
    (' or ") from s. the two ends are checked independently, so the
    quotes do not have to match. empty or false values are returned
    unchanged.
    """
    if not s:
        return s
    quote_chars = ("'", '"')
    start, end = 0, len(s)
    if s[0] in quote_chars:
        start = 1
    if s[-1] in quote_chars:
        end -= 1
    # for a lone quote character start passes end and the slice is empty.
    return s[start:end]
def norm_word(word):
    """
    norm_word(string: word) -> string: word
    cleans up a word. keeps the alphanumeric characters, plus any
    other character that sits directly in front of an alphanumeric
    one (the apostrophe in "don't", the dot in "a.b"), unless that
    character is a bracket or one of * & ^ | ! %. everything else,
    trailing punctuation included, is dropped, and a surrounding
    quote is removed at the end.
    """
    # characters that are never kept, even directly before an alphanumeric
    skipped = ("(", ")", "*", "&", "^", "[", "]", "{", "}", "|", "!", "%")
    kept = []
    prev = ''
    for ch in word:
        if ch.isalnum():
            # the previous character is only emitted once we know an
            # alphanumeric follows it; alphanumerics were emitted on their
            # own turn and whitespace is never emitted.
            if not (prev.isalnum() or prev.isspace()) and prev not in skipped:
                kept.append(prev)
            kept.append(ch)
        prev = ch
    return remove_quotes(''.join(kept))
#
# Public Functions
#
#
# Functions for extracting words from html.
#
# deprecated
def extract_words(text):
    """
    extract_words(string: text) -> list of string: words
    extracts content words from html: markup and entities are
    replaced by spaces, the rest is split on whitespace and every
    piece is cleaned with norm_word. pieces that clean down to the
    empty string are left out.
    """
    # put the whole document on one line before filtering.
    text = ' '.join(text.split('\n'))
    # the order matters: comments, scripts and styles have to go before the
    # generic tag filter, and the specific entities before the generic one.
    for pattern in (comment_re, script_re, style_re, tag_re,
                    apos_re, amp_re, entity_re):
        text = pattern.sub(" ", text)
    words = []
    for token in text.split():
        cleaned = norm_word(token)
        if cleaned != '':
            words.append(cleaned)
    return words
def extract_tokens(text):
    """
    extract_tokens(string: text) -> list of string: tokens
    extracts content tokens (possibly words) from html. markup and
    entities are replaced by spaces and the remaining text is split
    on whitespace. unlike extract_words the tokens are returned as
    they are, without being cleaned by norm_word.
    """
    # put the whole document on one line before filtering.
    lines = text.split('\n')
    text = ' '.join(lines)
    # the order matters: comments, scripts and styles have to go before the
    # generic tag filter, and the specific entities before the generic one.
    regex_filters = [
        comment_re,
        script_re,
        style_re,
        tag_re,
        apos_re,
        amp_re,
        entity_re,
    ]
    for regex in regex_filters:
        text = regex.sub(" ", text)
    # the original returned an undefined name here (NameError on every
    # call); the tokens are the whitespace separated pieces that are left.
    return text.split()
#
# Functions for extracting certain tags and tag contents from HTML
#
def extract_anchors(text):
    """
    extract_anchors(string: text) -> list of tuple: (href, contents)
    finds the anchors matched by anchor_re. the href is cut down to
    its first whitespace separated piece and stripped of a
    surrounding quote; contents is passed through as captured.
    """
    anchors = []
    for href, contents in anchor_re.findall(text):
        # the captured href can drag further attributes along, as in
        # x" class="y, so only the first piece is the url. it may still end
        # in the closing quote, which remove_quotes takes off (the same
        # treatment extract_images gives its urls).
        pieces = href.split()
        if pieces:
            href = remove_quotes(pieces[0])
        # an empty or blank href is kept as captured; this is the one case
        # the former bare "except: pass" was really covering.
        anchors.append((href, contents))
    return anchors
def extract_title_words(text):
    """
    extract_title_words(string: text) -> list of string: words
    collects the words of every title matched by title_re, in
    document order. each whitespace separated piece is cleaned with
    norm_word and pieces that clean down to the empty string are
    left out.
    """
    cleaned = [norm_word(token)
               for title in title_re.findall(text)
               for token in title.split()]
    return [word for word in cleaned if word != '']
def extract_images(text):
    """
    extract_images(string: text) -> list of string: image urls
    returns the src url of every image matched by image_re, without
    its surrounding quotes. images with an empty src are skipped.
    """
    # leaving out image dimensions... might want to add back in later.
    img_urls = []
    for captured in image_re.findall(text):
        # the capture runs to the end of the tag, so attributes that follow
        # src come along; the url is the first whitespace separated piece.
        pieces = captured.split()
        if not pieces:
            # e.g. <img src=> captures nothing. indexing the empty list
            # used to raise IndexError and abort the whole extraction.
            continue
        img_urls.append(remove_quotes(pieces[0]))
    return img_urls
def main():
    """
    main()
    command line driver. for every html file named on the command
    line prints, one item per line, its content words, its anchors,
    its title words and its image urls. prints a usage message and
    exits with status 1 when no file is given.
    """
    args = sys.argv[1:]
    if not args:
        # report the name the script was started as; the hard coded
        # "htmlwords.py" did not agree with the module's own name.
        print("usage: %s htmlfiles" % sys.argv[0])
        sys.exit(1)
    for filename in args:
        fd = open(filename, 'r')
        try:
            text = fd.read()
        finally:
            # close the file even when reading it fails.
            fd.close()
        # words
        for word in extract_words(text):
            print(word)
        # links
        for value in extract_anchors(text):
            print(value)
        # title words
        for value in extract_title_words(text):
            print(value)
        # images
        for value in extract_images(text):
            print(value)
# run the command line driver only when this file is executed as a
# script, not when it is imported as a library.
if __name__ == '__main__':
    main()