#TextGen
#Dan Richert
#2005
#
#
#TextGen is a text generation suite.
#requires Soundex
#requires BeautifulSoup
#requires Google API (SOAPpy, pygoogle, various dependencies)

#TextGen().build_dictionary() done (with variable `nextwords` length) 08/03/05
##copied from ./text_ca/5x5n.py 08/09/05
#081805 -- build_soundex_dict method added to TextGen
#081905 --- successful syllable counter added

##TO DO:
# use `Grid` object for all output formatting (all TextGen function write to Grid)
# soundex implementation (started on 081605)
# implement CMU Pronouncing Dictionary ( c06d ) DONE 081905
# add fancy PostScript output to Grid
# work on GParser to pull text from web. needs to be able to parse hyperlinks from google
#   search and body text from resulting pages.
# *may require config file or some other method to specify the location of the cmu pronouncing dictionary
#   right now, it has to be in the same directory as textgen
#########################################################
#########################################################
#########################################################

import random
import re
import string
import sys
import urllib

# The third-party packages are only needed by some features (Soundex
# switching, Google search, HTML scraping).  Importing them defensively lets
# the pure-text helpers (Formatter, Markov generation, syllable counting) be
# used on machines where they are not installed.
try:
    import soundex
except ImportError:
    soundex = None

try:
    import google
except ImportError:
    google = None

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    BeautifulSoup = None

if google is not None:
    google.LICENSE_KEY = '...'  #get license key from www.google.com/apis

# Location of the CMU pronouncing dictionary used when build_pdict() is
# called without an explicit file object.
_DEFAULT_PDICT_PATH = '/Users/danrichert/thesis/c06d'

# Matches markup tags (including "<!...>" and "<?...>" forms) left in text.
_TAG_PATTERN = re.compile('<[!?]?.*?>', re.DOTALL)

# Matches '&'-started special characters (HTML entities).
# NOTE(review): this also swallows ordinary text between a bare '&' and the
# next ';' on the same line (e.g. "AT&T rocks; yes"); kept unchanged so the
# extracted text stays identical to what existing callers receive.
_ENTITY_PATTERN = re.compile('&.*?;')


def _require(module, name):
    """Return `module`, or raise ImportError if that optional package is missing."""
    if module is None:
        raise ImportError('%s is required for this feature but is not installed' % name)
    return module


def remove_nonletters(s):
    """Return `s` with every character other than a-z / A-Z removed."""
    return re.sub('[^a-zA-Z]', '', s)


###build pronouncing dictionary for syllable counting reference
def build_pdict(fp=None):
    """Parse a CMU-pronouncing-dictionary style file into a dict.

    Each non-blank line is split on whitespace; the first field becomes the
    key and the remaining fields (the pronunciation elements) become the
    value, as a list.

    fp -- an open, readable file object.  When omitted, the dictionary at
          _DEFAULT_PDICT_PATH is opened (and closed again) on demand, so
          merely importing this module never touches the filesystem.

    Returns the {word: [pronunciation elements]} dictionary.
    """
    if fp is None:
        with open(_DEFAULT_PDICT_PATH) as default_fp:
            raw = default_fp.read()
    else:
        # The caller owns a file object it passed in; do not close it here.
        raw = fp.read()

    pdict = {}
    for line in raw.split('\n'):
        fields = line.split()
        if not fields:
            # Blank lines (including the empty string after a trailing
            # newline) carry no entry.
            continue
        pdict[fields[0]] = fields[1:]
    return pdict


def count_syl(word, pdict):
    """Count the syllables of `word` using pronouncing dictionary `pdict`.

    The word is stripped of non-letters and upper-cased before lookup.  A
    syllable is counted for every pronunciation element that ends in a digit.

    Returns the syllable count, or -1 if the word is not in `pdict`.
    """
    try:
        elements = pdict[remove_nonletters(word).upper()]
    except KeyError:
        return -1
    return sum(1 for element in elements
               if element[-1:] and element[-1:] in string.digits)


#--- make line lengths variable per line (set up lines as sequence?)
class Formatter:
    """Lays the words in `buffer` out as a `width` x `height` grid of text."""

    def __init__(self, width=5, height=5):
        self.width = width
        self.height = height
        self.buffer = []

    def format(self):
        """Return the buffered words as grid text.

        At most width*height words are used.  Every word is followed by a
        space, except the last word of each row, which is followed by a
        newline.  One extra newline terminates the output.
        """
        cell_count = max(self.width * self.height, 0)
        pieces = []
        # Positions are 1-based so that a multiple of `width` marks row end.
        for position, word in enumerate(self.buffer[:cell_count], 1):
            if position % self.width == 0:
                pieces.append(word + '\n')
            else:
                pieces.append(word + ' ')
        return ''.join(pieces) + '\n'


def get_google_links(search_term):
    "Pulls links from Google Search (uses Google API with pygoogle)"
    data = _require(google, 'pygoogle (google)').doGoogleSearch(search_term)
    return [result.URL for result in data.results]


class Scraper:
    """Fetches a page and exposes its hyperlinks and body text."""

    def __init__(self, url_string):
        """Download `url_string`; an unreachable page is treated as ' '."""
        try:
            response = urllib.urlopen(url_string)
            try:
                self.page_source = response.read()
            finally:
                response.close()
        except IOError:
            self.page_source = ' '
        self.soupify()

    def soupify(self):
        """(Re)parse self.page_source into self.soup."""
        parser = _require(BeautifulSoup, 'BeautifulSoup')
        self.soup = parser(self.page_source)

    def get_links(self):
        "HTML hyperlink extractor: returns list of links"
        anchors = self.soup.fetch('a', {'href': re.compile('.+')})
        return [anchor['href'] for anchor in anchors]

    def get_body_text(self):
        "HTML body text extractor (not formatting sensitive)"
        # Returns one ' '-joined string with '&'-started special characters
        # and any remaining tags replaced by spaces.
        fetched_text = self.soup.fetchText(re.compile('.'))
        joined = ' '.join(fetched_text)
        joined = _ENTITY_PATTERN.sub(' ', joined)
        return _TAG_PATTERN.sub(' ', joined)


#all generative methods return a list for greatest flexibility
#master text defined in __init__ is now string rather than fp arg 081205
#methods that return formatted text output removed. instead, non-formatting
# versions to send output to Grid 081205

###dictionaries and references:
##dict -- markov dictionary
##cpd_dict -- CMU Pronouncing Dictionary with words as keys, pronunciation key elements as list for value
##sndx_ref -- wordlist paired with Soundex hashes (not a dictionary, zipped list)
class TextGen:
    """Generates text from a source text (random, Markov, Soundex swaps)."""

    def __init__(self, text):
        # Kept as given (normally a string) so the text within the object
        # can be replaced; it is split into words on first use.
        self.text = text

    def _split_text(self):
        """Split self.text into a word list if it is still a string; return it."""
        try:
            self.text = self.text.split()
        except AttributeError:
            # Already a sequence of words.
            pass
        return self.text

    def build_dict(self, num_of_nextwords=2, verbose=0):
        "Builds Markov dictionary into self.dict"
        # Keys are the lower-cased, letters-only form of each word; each
        # value is a list of follower groups, one group (a list of the next
        # `num_of_nextwords` words) per occurrence of the key.
        tokens = self._split_text()
        total = len(tokens)
        self.dict = {}
        for index in range(total - num_of_nextwords):
            if verbose:
                print(str(index) + ' of ' + str(total))
            key = remove_nonletters(tokens[index]).lower()
            followers = list(tokens[index + 1:index + 1 + num_of_nextwords])
            self.dict.setdefault(key, []).append(followers)

    ###SOUNDEX STUFF###
    def build_soundex_ref(self, s):
        """Store in self.sndx_ref a list of (word, Soundex value) pairs for `s`."""
        encoder = _require(soundex, 'soundex')
        wordlist = s.split()
        hashes = [encoder.get_soundex(word) for word in wordlist]
        # Materialised as a list so it can be scanned repeatedly.
        self.sndx_ref = list(zip(wordlist, hashes))

    def soundex_switch(self, word):
        """Return a random reference word sharing `word`'s Soundex value.

        Returns `word` itself when no reference word matches.
        """
        target = _require(soundex, 'soundex').get_soundex(word)
        possible_words = [candidate for candidate, code in self.sndx_ref
                          if code == target]
        if possible_words:
            return random.choice(possible_words)
        return word
    ###############

    ###GENERATORS#####
    def random_text(self, length):
        """Return a list of `length` words drawn at random from the text.

        Returns an empty list when the text has no words.
        """
        tokens = self._split_text()
        if not tokens:
            return []
        return [random.choice(tokens) for _ in range(length)]

    def prob_text(self, length):
        """Return Markov-generated text as a list of words.

        Starts from a random word of the text, then `length` times appends a
        randomly chosen follower group of the last word.  Requires
        build_dict() to have been called.  When the last word has no
        recorded followers (e.g. it only occurs at the very end of the
        text), a follower group of a random known word is used instead so
        generation can continue.

        Returns an empty list when the text has no words.
        """
        tokens = self._split_text()
        if not tokens:
            return []
        output = [random.choice(tokens)]  #get firstword
        known_words = list(self.dict)
        for _ in range(length):
            if not known_words:
                # Nothing to chain from (text shorter than the follower span).
                break
            lastword = remove_nonletters(output[-1]).lower()
            groups = self.dict.get(lastword)
            if not groups:
                groups = self.dict[random.choice(known_words)]
            output.extend(random.choice(groups))
        return output
    ###############