#! /usr/bin/python
"""Word normalisation helpers for query strings and HTML text.

The parsers take raw whitespace-separated tokens, drop URLs, file names,
directories and stopwords, split tokens on embedded delimiters and strip
unwanted punctuation, leaving a list of lower-cased words.
"""

import re
import sys

#sys.path.append("../..")
sys.path.append("..")

try:
    import lib.htmlextractor as htmlx
except ImportError:
    # Only the command-line entry point at the bottom of this file uses
    # htmlx; the parser classes stay importable without the project package.
    htmlx = None


class Set(list):
    """A list that silently ignores appends of values it already holds.

    Insertion order is preserved.  Values passed to the constructor are
    stored as given (they are not de-duplicated); only ``append`` enforces
    uniqueness.
    """

    def __init__(self, values=()):
        list.__init__(self, values)

    def append(self, value):
        """Append ``value`` unless an equal value is already present."""
        if value not in self:
            list.append(self, value)


def remove_quotes(s):
    """Return ``s`` without one leading and/or one trailing quote character.

    Both ``'`` and ``"`` count as quotes, and each end is checked on its
    own, so an unbalanced quote is stripped as well.  Falsy input (``None``
    or the empty string) is returned unchanged.
    """
    if not s:
        return s
    quotes = ("'", '"')
    start = 1 if s[0] in quotes else 0
    end = len(s) - 1 if s[-1] in quotes else len(s)
    # For a lone quote character start > end and the slice is empty.
    return s[start:end]


class NormWordsParser(object):
    """Normalise a list of raw tokens into clean, lower-cased words.

    Typical use: call ``set_word_list(words)`` and then read
    ``normed_words()``, ``uniq_normed_words()`` and ``found_stopwords()``.
    ``reset()`` clears all accumulated state.
    """

    def __init__(self):
        # words we want to eliminate from the query.
        self._stopwords = [
            'the', 'and', 'a', 'to', 'of', 'in', 'i', 'is', 'that', 'it',
            'on', 'you', 'this', 'for', 'but', 'with', 'are', 'have', 'be',
            'at', 'or', 'as', 'was', 'so', 'if', 'out', 'not', 'e.g', 'i.e'
        ]
        # file extensions of files we want to remove.
        self._file_types = [
            "pdf", "gif", "jpeg", "jpg", "doc", "png", "xls", "ps", "html",
            "htm", "org", "com", "net", "gov", "info", "c", "h", "make",
            "py", "pl", "js", "csh", "sh", "gz", "tar", "tgz", "bz2", "app",
            "m", "txt", "ttf", "ttc",
        ]
        # chars we want to split on to form more words.
        self._delims = ['!', '&', '|', ';', ':', '-', '=', '"', '/', ',']
        # chars we do not want to include in words.
        self._bad_chars = ["(", ")", "*", "&", "^", "[", "]", "{", "}",
                           "|", "!", "%"]
        self.reset()

    def reset(self):
        """Forget every word list and result seen so far."""
        self._normed_words = []
        self._stopwords_found = Set([])
        self._original_words = []
        self._uniq_normed_words = Set([])

    def set_word_list(self, words):
        """Normalise ``words`` (a list of strings) and store the result.

        The stopwords found keep accumulating across calls until
        ``reset()`` is called; the normalised words are replaced.
        """
        self._original_words = words[:]
        # The unique-word cache is derived from the normalised words, so it
        # must not survive a new word list.
        self._uniq_normed_words = Set([])
        self._norm_words()
        self._remove_stopwords()

    def found_stopwords(self):
        """Return the distinct stopwords removed so far."""
        return self._stopwords_found

    def original_words(self):
        """Return a copy of the list last given to ``set_word_list``."""
        return self._original_words

    def normed_words(self):
        """Return the normalised words, duplicates included."""
        return self._normed_words

    def uniq_normed_words(self):
        """Return the normalised words without duplicates, in order.

        The result is built lazily on first use and cached.
        """
        if not self._uniq_normed_words:
            for word in self._normed_words:
                self._uniq_normed_words.append(word)
        return self._uniq_normed_words

    def _remove_stopwords(self):
        """Move stopwords out of the normalised words into the found set."""
        clean_words = []
        for word in self._normed_words:
            if word in self._stopwords:
                self._stopwords_found.append(word)
            else:
                clean_words.append(word)
        self._normed_words = clean_words

    def _clean_word(self, word):
        """Strip unwanted punctuation from ``word``.

        Alphanumeric characters are kept.  A single non-alphanumeric,
        non-space character is kept only when it directly precedes an
        alphanumeric one and is not listed in ``_bad_chars`` (so "don't"
        and "e.g" survive while "(foo)" becomes "foo").  Surrounding quotes
        are removed last.
        """
        kept = []
        previous = ''
        for char in word:
            if char.isalnum():
                if (not previous.isalnum() and not previous.isspace()
                        and previous not in self._bad_chars):
                    kept.append(previous)
                kept.append(char)
            previous = char
        return remove_quotes(''.join(kept))

    def _split_on_delims(self, word):
        """Break ``word`` on every delimiter in ``_delims``.

        Returns the list of all pieces; pieces may be empty strings when
        delimiters are adjacent or at either end of the word.
        """
        tokens = [word]
        for delim in self._delims:
            pieces = []
            for token in tokens:
                pieces.extend(token.split(delim))
            tokens = pieces
        return tokens

    def _norm_words(self):
        """Filter and split the original words into ``_normed_words``.

        This is the main parsing step: it lower-cases each token, drops
        URLs, file names and directories, splits the rest on embedded
        delimiters and cleans every resulting piece.
        """
        wanted_words = []
        for raw in self._original_words:
            if not raw:
                continue
            word = raw.lower()

            # remove urls
            if "/" in word and word.startswith(('http', 'file', 'ftp')):
                continue

            # remove filenames
            if '.' in word and word.split(".")[-1] in self._file_types:
                continue

            # remove suspected directories
            if word[-1] in ("\\", "/"):
                continue

            # there may be words embedded within the 'words' we have been
            # given, so break them up on the delimiters.
            for piece in self._split_on_delims(word):
                # skip single characters and the empty pieces produced by
                # adjacent or trailing delimiters.
                if len(piece) < 2:
                    continue
                cleaned = self._clean_word(piece)
                if cleaned:
                    wanted_words.append(cleaned)

        self._normed_words = wanted_words


class QueryWordsParser(NormWordsParser):
    """Parser for query strings; behaves exactly like NormWordsParser."""

    def __init__(self):
        NormWordsParser.__init__(self)


class HTMLWordsParser(NormWordsParser):
    """Parser that extracts and normalises the words of an HTML document."""

    # Entities that are replaced by their literal character instead of
    # being filtered out.
    _apos_re = re.compile(r'&apos;', re.I)
    _amp_re = re.compile(r'&amp;', re.I)

    # Applied in order; every match is replaced by a single space.  The
    # script and style patterns must not be empty: an empty pattern matches
    # between every character and would pad the whole text with spaces.
    _regex_filters = [
        re.compile(r'<!--.*?-->', re.S),                  # comments
        re.compile(r'<script.*?</script>', re.I | re.S),  # script tag contents
        re.compile(r'<style.*?</style>', re.I | re.S),    # style tag contents
        re.compile(r'<.*?>', re.I | re.S),                # html tags
        # entity tags; restricted to real entity syntax so that a bare '&'
        # does not swallow the text up to the next ';'.
        re.compile(r'&#?\w+;'),
    ]

    def __init__(self):
        NormWordsParser.__init__(self)

    def _extract_tokens(self, html_text):
        """Return the textual content of ``html_text`` as one string.

        Line breaks become spaces, ``&apos;`` and ``&amp;`` are replaced by
        their literal characters, and comments, scripts, styles, tags and
        the remaining entities are replaced by spaces.
        """
        text = ' '.join(html_text.split('\n'))

        # keep certain entity tags w/ literal chars.
        text = self._apos_re.sub("'", text)
        text = self._amp_re.sub("&", text)

        # filter out html, javascript, css, entity tags
        for regex in HTMLWordsParser._regex_filters:
            text = regex.sub(" ", text)
        return text

    def set_html(self, html_text):
        """Strip the markup from ``html_text`` and normalise its words."""
        self.set_text(self._extract_tokens(html_text))

    def set_text(self, text):
        """Split plain ``text`` on whitespace and normalise the words."""
        self.set_word_list(text.split())


if __name__ == '__main__':
    if htmlx is None:
        raise ImportError("lib.htmlextractor is required to run this script")
    for filename in sys.argv[1:]:
        # the context manager closes the file even if reading fails.
        with open(filename) as fd:
            text = fd.read()
        words = htmlx.extract_words(text)