#! /usr/bin/python2.3
"""Utilities for pulling words, anchors, titles and image URLs out of HTML.

The code deliberately sticks to constructs that behave the same on
Python 2 and Python 3 (list comprehensions instead of map/filter,
single-argument print calls, try/finally instead of ``with``).
"""

import sys
import re
#sys.path.append("../..")
sys.path.append("..")
from pprint import pprint


#
# Basic Utility Functions
#

def unique(items):
    """
    unique(variant: items) -> list: unique items in items

    Items must be hashable.  The result is a real list that keeps the
    first-seen order of the input.
    """
    seen = {}
    result = []
    for item in items:
        if item not in seen:
            seen[item] = 0
            result.append(item)
    return result


def remove_quotes(s):
    """
    remove_quotes(string: s) -> string: s without quotes

    Strips at most one leading and one trailing quote character
    (single or double).  Falsy input is returned unchanged.
    """
    if not s:
        return s
    quotes = ("'", '"')
    start = 0
    end = len(s)
    if s[0] in quotes:
        start = 1
    if s[-1] in quotes:
        end -= 1
    # For a lone quote character start > end, which slices to ''.
    return s[start:end]


def is_image(token):
    """
    is_image(string: token) -> boolean

    True when the text after the last '.' is a known image extension.
    The comparison ignores case, so "LOGO.PNG" counts as an image.
    """
    if "." in token:
        ext = token.split(".")[-1].lower()
        if ext in ["png", "gif", "jpeg", "jpg", "bmp"]:
            return True
    return False


# file extensions of files we want to test for.
__file_types = [
    "pdf", "gif", "jpeg", "jpg", "doc", "png", "xls", "ps",
    "html", "htm", "org", "com", "net", "gov", "info",
    "c", "h", "make", "py", "pl", "js", "csh", "sh",
    "gz", "tar", "tgz", "bz2", "app", "m", "txt", "ttf", "ttc",
]


def is_filename(token, file_types=__file_types):
    """
    is_filename(string: token [, list: file_types]) -> boolean

    True when the text after the last '.' in token is listed in
    file_types.  The comparison is case-sensitive.
    """
    if '.' in token:
        ext = token.split(".")[-1]
        if ext in file_types:
            return True
    return False


def is_url(token):
    """
    is_url(string: token) -> boolean

    True when token contains a '/' and starts with 'http', 'file'
    or 'ftp'.
    """
    if "/" not in token:
        return False
    return (token.startswith('http')
            or token.startswith('file')
            or token.startswith('ftp'))


def is_dir(token):
    """
    is_dir(string: token) -> boolean

    True when token ends with a slash or backslash (a suspected
    directory).  An empty token is not a directory.
    """
    # Slicing (rather than indexing) keeps the empty string from raising.
    return token[-1:] in ("\\", "/")


# chars we do not want to include in words.
__bad_chars = ["(", ")", "*", "&", "^", "[", "]", "{", "}", "|", "!", "%"]


def remove_punct(word, punct=__bad_chars):
    """
    remove_punct(string: word [, list: punct]) -> string: word

    Removes punctuation from a string.  Alphanumeric characters are
    always kept.  A non-alphanumeric, non-space character is kept only
    when it immediately precedes an alphanumeric character and is not
    listed in punct (so the apostrophe in "don't" survives, while
    trailing punctuation is dropped).  Surrounding quotes are stripped
    from the result.
    """
    charlist = []
    lastc = ''
    for c in word:
        if c.isalnum():
            # Keep a single "joining" character (e.g. apostrophe, dash)
            # that sits directly in front of this alphanumeric one.
            if not lastc.isalnum() and not lastc.isspace():
                if lastc not in punct:
                    charlist.append(lastc)
            charlist.append(c)
        lastc = c
    return remove_quotes(''.join(charlist))


# chars we want to split on to form more words.
__delims = ['!', '&', '|', ';', ':', '-', '=', '"', '/', ',']


def slice_word(word, delims=__delims):
    """
    slice_word(string: word [, list: delims]) -> list of string

    Splits word on every delimiter in delims, in order, and returns the
    non-empty pieces.  With no delimiters the word is returned as a
    one-element list, even when it is empty.
    """
    if not delims:
        return [word]
    pieces = [word]
    for delim in delims:
        split_pieces = []
        for piece in pieces:
            split_pieces.extend(piece.split(delim))
        pieces = split_pieces
    return [piece for piece in pieces if piece != '']


#
# Functions for HTML filtering
#

__apos_re = re.compile(r'&apos;', re.I)
__amp_re = re.compile(r'&amp;', re.I)


def replace_apos_entity(text):
    """
    replace_apos_entity(string: text) -> string

    Replaces every "&apos;" entity (any case) with an apostrophe.
    """
    return __apos_re.sub("'", text)


def replace_amp_entity(text):
    """
    replace_amp_entity(string: text) -> string

    Replaces every "&amp;" entity (any case) with an ampersand.
    """
    return __amp_re.sub("&", text)


# Applied in order: comments and script/style bodies must go before the
# generic tag filter, otherwise their contents would survive as text.
__regex_filters = [
    re.compile(r'<!--.*?-->', re.S),                            # comments
    re.compile(r'<\s*script.*?>.*?<\s*/\s*script\s*>',
               re.I | re.S),                                    # script tag contents
    re.compile(r'<\s*style.*?>.*?<\s*/\s*style\s*>',
               re.I | re.S),                                    # style tag contents
    re.compile(r'<.*?>', re.I | re.S),                          # html tags
    # Only real entities (&name; / &#123;) so that a stray '&' followed
    # by a later ';' does not swallow ordinary text.
    re.compile(r'&#?\w+;'),                                     # entity tags
]


def remove_html(text):
    """
    remove_html(string: text) -> string

    Returns text with HTML comments, script and style blocks, tags and
    character entities each replaced by a single space.  Newlines are
    turned into spaces first.
    """
    text = ' '.join(text.split('\n'))
    # filter out html, javascript, css, entity tags
    for regex in __regex_filters:
        text = regex.sub(" ", text)
    return text


__anchor_re = re.compile(
    r'<\s*a\s+href\s*=\s*[\'"]?([^>]*?)[\'"]?\s*>(.*?)<\s*/\s*a\s*>',
    re.I | re.S)


def extract_anchors(text):
    """
    extract_anchors(string: text) -> list of tuple: (href, contents)

    Finds <a href=...>contents</a> elements.  When the opening tag has
    further attributes after the href, only the first whitespace
    separated token is kept as the href and its quotes are stripped.
    An empty href is returned as ''.
    """
    def fix_href(match):
        href, contents = match
        parts = href.split()
        if parts:
            # Anything after the first token is another attribute.
            href = remove_quotes(parts[0])
        return (href, contents)

    return [fix_href(match) for match in __anchor_re.findall(text)]


__title_re = re.compile(r'<\s*title\s*>(.*?)<\s*\/title\s*>', re.I | re.S)


def extract_title(text):
    """
    extract_title(string: text) -> string: title

    Returns the contents of the first <title> element, or '' when the
    document has none.
    """
    titles = __title_re.findall(text)
    if titles:
        # assume only 1 title per html doc.
        return titles[0]
    return ''


__image_re = re.compile(r'<\s*img[^>]*?src\s*=\s*([^>]*?)\s*>', re.I)


def extract_images(text):
    """
    extract_images(string: text) -> list of string: image urls

    Returns the src value of every <img> tag, with quotes removed.
    Tags whose src is empty are skipped.
    """
    # leaving out image dimensions... might want to add back in later.
    img_urls = []
    for match in __image_re.findall(text):
        parts = match.split()
        if parts:
            # The first token is the src value; the rest are attributes.
            img_urls.append(remove_quotes(parts[0]))
    return img_urls


#
# Application Specific filters
#

# words to filter out
__stopwords = [
    'the', 'and', 'a', 'to', 'of', 'in', 'i', 'is', 'that', 'it',
    'on', 'you', 'this', 'for', 'but', 'with', 'are', 'have', 'be',
    'at', 'or', 'as', 'was', 'so', 'if', 'out', 'not', 'e.g', 'i.e'
]


def remove_stopwords(words, stopwords=__stopwords):
    """
    remove_stopwords(list: words [, list: stopwords])
        -> tuple: (clean words, stopwords found)

    Splits words into those not listed in stopwords and those that are,
    keeping the input order (and duplicates) in both lists.
    """
    clean_words = []
    stopwords_found = []
    for word in words:
        if word in stopwords:
            stopwords_found.append(word)
        else:
            clean_words.append(word)
    return clean_words, stopwords_found


def norm_words(words):
    """
    norm_words(list: words) -> list of string: normalized words

    The main parsing function for filtering and extracting words.
    Each word is lower-cased; file names, urls and directories are
    dropped; the rest is split on delimiter characters and stripped of
    punctuation.  Single-character pieces, and pieces that consist of
    nothing but punctuation, are dropped.
    """
    wanted_words = []
    for w in words:
        if not w:
            continue
        w = w.lower()
        if is_filename(w) or is_url(w) or is_dir(w):
            continue
        for mw in slice_word(w):
            if len(mw) == 1:
                continue
            mw = remove_punct(mw)
            if not mw:
                # Nothing but punctuation: do not emit an empty word.
                continue
            wanted_words.append(mw)
    return wanted_words


def extract_words(text):
    """
    extract_words(string: text) -> list of string: normalized words

    Strips the HTML from text and normalizes the remaining tokens.
    """
    return norm_words(remove_html(text).split())


def _print_list(lst):
    """Print every item of lst on its own line."""
    for item in lst:
        print(item)


def _main(paths, test=False):
    """
    Print the normalized words of each file in paths.

    With test set, the intermediate results (images, title, anchors,
    stripped text and normed words) are printed first.
    """
    for path in paths:
        f = open(path)
        try:
            text = f.read()
        finally:
            # Close the handle even when reading fails.
            f.close()
        text = ' '.join(text.split())
        if test:
            print(" Images")
            _print_list(extract_images(text))
            print(" Title")
            print(extract_title(text))
            print(" Anchors")
            _print_list(extract_anchors(text))
            print(" Remove HTML")
            content = remove_html(text)
            print(content)
            print(" Normed Words")
            _print_list(norm_words(content.split()))
        for w in extract_words(text):
            print(w)


if __name__ == '__main__':
    _main(sys.argv[1:])