|
import bisect
import itertools
import json
import operator
import random
import re

from unidecode import unidecode

from splitters import split_into_sentences
| 8 | + |
| 9 | +try: # pragma: no cover |
| 10 | + basestring |
| 11 | +except NameError: # pragma: no cover |
| 12 | + basestring = str |
| 13 | + |
| 14 | +BEGIN = "__BEGIN__" |
| 15 | +END = "__END__" |
| 16 | + |
| 17 | + |
| 18 | +def accumulate(iterable, func=operator.add): |
| 19 | + """ |
| 20 | + Cumulative calculations. (Summation, by default.) |
| 21 | + """ |
| 22 | + it = iter(iterable) |
| 23 | + total = next(it) |
| 24 | + yield total |
| 25 | + for element in it: |
| 26 | + total = func(total, element) |
| 27 | + yield total |
| 28 | + |
| 29 | + |
| 30 | +class MarkovChain(object): |
| 31 | + """ |
| 32 | + Markov Chain which has beginning and end. |
| 33 | + """ |
| 34 | + |
| 35 | + def __init__(self, corpus, state_size): |
| 36 | + """ |
| 37 | + corpus: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence. |
| 38 | +
|
| 39 | + state_size: items used to represent the state of the model. |
| 40 | + """ |
| 41 | + self.state_size = state_size |
| 42 | + self.model = self.build(corpus, self.state_size) |
| 43 | + # self.precompute_begin_state() |
| 44 | + |
| 45 | + def build(self, corpus, state_size): |
| 46 | + """ |
| 47 | + Returns a dict of dicts where the keys of the outer dict represent all possible states, and point to the inner dicts. The inner dicts represent all possibilities for the "next" item in the chain, along with the count of times it appears. |
| 48 | + """ |
| 49 | + model = {} |
| 50 | + |
| 51 | + for run in corpus: |
| 52 | + items = ([BEGIN] * state_size) + run + [END] |
| 53 | + for i in range(len(run)+1): |
| 54 | + state = tuple(items[i:i+state_size]) |
| 55 | + follow = items[i+state_size] |
| 56 | + if state not in model: |
| 57 | + model[state] = {} |
| 58 | + |
| 59 | + if follow not in model[state]: |
| 60 | + model[state][follow] = 0 |
| 61 | + |
| 62 | + model[state][follow] += 1 |
| 63 | + |
| 64 | + return model |
| 65 | + |
| 66 | + def move(self, state): |
| 67 | + """ |
| 68 | + Randomly chooses item based on the given state.. |
| 69 | + """ |
| 70 | + choices, weights = zip(*self.model[state].items()) |
| 71 | + cumdist = list(accumulate(weights)) |
| 72 | + r = random.random() * cumdist[-1] |
| 73 | + selection = choices[bisect.bisect(cumdist, r)] |
| 74 | + return selection |
| 75 | + |
| 76 | + def gen(self, init_state=None): |
| 77 | + """ |
| 78 | + Starting with a naive "BEGIN" state, RETURNS a generator that will yield successive items until the chain reaches the "END" state. |
| 79 | + """ |
| 80 | + state = init_state or (BEGIN,) * self.state_size |
| 81 | + while True: |
| 82 | + next_word = self.move(state) |
| 83 | + if next_word == END: |
| 84 | + break |
| 85 | + yield next_word |
| 86 | + state = tuple(state[1:]) + (next_word,) |
| 87 | + |
| 88 | + def walk(self, init_state=None): |
| 89 | + """ |
| 90 | + Returns a list representing a single run of the markov model |
| 91 | + """ |
| 92 | + return list(self.gen(init_state)) |
| 93 | + |
| 94 | + def to_json(self): |
| 95 | + """ |
| 96 | + Converts the list into a string |
| 97 | + """ |
| 98 | + return json.dumps(list(self.model.items())) |
| 99 | + |
| 100 | + |
| 101 | +"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------""" |
| 102 | + |
| 103 | +DEFAULT_MAX_OVERLAP_RATIO = 0.7 |
| 104 | +DEFAULT_MAX_OVERLAP_TOTAL = 20 |
| 105 | +DEFAULT_TRIES = 8 |
| 106 | + |
| 107 | + |
| 108 | +class Text(object): |
| 109 | + def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True): |
| 110 | + """ |
| 111 | + input_text: A string. |
| 112 | + state_size: An integer, indicating the number of words in the model's state. |
| 113 | + parsed_sentences: It is a list of lists where the outer list like a sentence and the inner list is contains the words that make the sentence. |
| 114 | + """ |
| 115 | + |
| 116 | + can_make_sentences = parsed_sentences is not None or input_text is not None |
| 117 | + self.retain_original = retain_original and can_make_sentences |
| 118 | + self.state_size = state_size |
| 119 | + |
| 120 | + if self.retain_original: |
| 121 | + self.parsed_sentences = parsed_sentences or list( |
| 122 | + self.generate_corpus(input_text)) |
| 123 | + |
| 124 | + # Rejoined text lets us assess the novelty of generated sentences |
| 125 | + self.rejoined_text = self.sentence_join( |
| 126 | + map(self.word_join, self.parsed_sentences)) |
| 127 | + self.chain = chain or MarkovChain( |
| 128 | + self.parsed_sentences, state_size) |
| 129 | + |
| 130 | + else: |
| 131 | + if not chain: |
| 132 | + parsed = parsed_sentences or self.generate_corpus(input_text) |
| 133 | + self.chain = chain or MarkovChain(parsed, state_size) |
| 134 | + |
| 135 | + def to_dict(self): |
| 136 | + """ |
| 137 | + Makes a Python dict of all the data |
| 138 | + """ |
| 139 | + return { |
| 140 | + "state_size": self.state_size, |
| 141 | + "chain": self.chain.to_json, |
| 142 | + "parsed_sentences": self.parsed_sentences if self.retain_original else None |
| 143 | + } |
| 144 | + |
| 145 | + def sentence_split(self, text): |
| 146 | + """ |
| 147 | + Splits full-text string into a list of sentences |
| 148 | + """ |
| 149 | + return split_into_sentences(text) |
| 150 | + |
| 151 | + def sentence_join(self, sentences): |
| 152 | + """ |
| 153 | + Rejoins a list of sentences into the full text |
| 154 | + """ |
| 155 | + return " ".join(sentences) |
| 156 | + |
| 157 | + word_split_pattern = re.compile(r"\s+") |
| 158 | + |
| 159 | + def word_split(self, sentence): |
| 160 | + """ |
| 161 | + Splits the sentence into list of words |
| 162 | + """ |
| 163 | + return re.split(self.word_split_pattern, sentence) |
| 164 | + |
| 165 | + def word_join(self, words): |
| 166 | + """ |
| 167 | + Rejoins a list of words into a sentence |
| 168 | + """ |
| 169 | + return " ".join(words) |
| 170 | + |
| 171 | + def text_sentences_input(self, sentence): |
| 172 | + """ |
| 173 | + A sentence filter that will reject any sentences that has strange punctuation in it |
| 174 | + """ |
| 175 | + if len(sentence.strip()) == 0: |
| 176 | + return False |
| 177 | + |
| 178 | + reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]") |
| 179 | + |
| 180 | + # Decode unicode, mainly to normalize fancy quotation marks |
| 181 | + |
| 182 | + if sentence.__class__.__name__ == "str": |
| 183 | + decoded = sentence |
| 184 | + |
| 185 | + else: |
| 186 | + decoded = unidecode(sentence) |
| 187 | + |
| 188 | + # Sentence shouldn't contain problematic characters |
| 189 | + |
| 190 | + if re.search(reject_pat, decoded): |
| 191 | + return False |
| 192 | + |
| 193 | + return True |
| 194 | + |
| 195 | + def generate_corpus(self, text): |
| 196 | + """ |
| 197 | + Returns a list of list of sentences, each containing list of words. |
| 198 | + """ |
| 199 | + if isinstance(text, str): |
| 200 | + sentences = self.sentence_split(text) |
| 201 | + |
| 202 | + else: |
| 203 | + sentences = [] |
| 204 | + for line in text: |
| 205 | + sentences += self.sentence_split(line) |
| 206 | + |
| 207 | + passing = filter(self.text_sentences_input, sentences) |
| 208 | + runs = map(self.word_split, passing) |
| 209 | + |
| 210 | + return runs |
| 211 | + |
| 212 | + def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total): |
| 213 | + """ |
| 214 | + Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 15). |
| 215 | + """ |
| 216 | + # Rejects chunk that is similar |
| 217 | + |
| 218 | + overlap_ratio = int(round(max_overlap_ratio * len(words))) |
| 219 | + overlap_max = min(max_overlap_total, overlap_ratio) |
| 220 | + overlap_over = overlap_max + 1 |
| 221 | + |
| 222 | + gram_count = max((len(words) - overlap_max), 1) |
| 223 | + grams = [words[i:i+overlap_over] for i in range(gram_count)] |
| 224 | + |
| 225 | + for gm in grams: |
| 226 | + gram_joined = self.word_join(gm) |
| 227 | + if gram_joined in self.rejoined_text: |
| 228 | + return False |
| 229 | + |
| 230 | + return True |
| 231 | + |
| 232 | + def make_sentences(self, init_state=None, **kwargs): |
| 233 | + """ |
| 234 | + Attempts "tries" (default: 10) times to generate a valid sentence, based on the model and "test_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "test_sentences_output". |
| 235 | +
|
| 236 | + If successful, returns the sentence as a string. If not, returns None. |
| 237 | +
|
| 238 | + If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model. |
| 239 | +
|
| 240 | + If "test_output" is set as False then the "text_sentences_output" check will be skipped. |
| 241 | +
|
| 242 | + If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit. |
| 243 | + """ |
| 244 | + |
| 245 | + tries = kwargs.get("tries", DEFAULT_TRIES) |
| 246 | + mor = kwargs.get("max_overlap_ratio", DEFAULT_MAX_OVERLAP_RATIO) |
| 247 | + mot = kwargs.get("max_overlap_total", DEFAULT_MAX_OVERLAP_TOTAL) |
| 248 | + test_output = kwargs.get("test_output", True) |
| 249 | + max_words = kwargs.get("max_words", None) |
| 250 | + |
| 251 | + if init_state != None: |
| 252 | + prefix = list(init_state) |
| 253 | + for word in prefix: |
| 254 | + if word == BEGIN: |
| 255 | + prefix = prefix[1:] |
| 256 | + else: |
| 257 | + break |
| 258 | + |
| 259 | + else: |
| 260 | + prefix = [] |
| 261 | + |
| 262 | + for _ in range(tries): |
| 263 | + words = prefix + self.chain.walk(init_state) |
| 264 | + if max_words != None and len(words) > max_words: |
| 265 | + continue |
| 266 | + if test_output and hasattr(self, "rejoined_text"): |
| 267 | + if self.text_sentences_output(words, mor, mot): |
| 268 | + return self.word_join(words) |
| 269 | + |
| 270 | + else: |
| 271 | + return self.word_join(words) |
| 272 | + |
| 273 | + return None |
| 274 | + |
| 275 | + def make_short_sentence(self, max_chars, min_chars=0, **kwargs): |
| 276 | + """ |
| 277 | + Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" charcaters, passing **kwargs to "self.make_sentence". |
| 278 | + """ |
| 279 | + tries = kwargs.get("tries", DEFAULT_TRIES) |
| 280 | + |
| 281 | + for _ in range(tries): |
| 282 | + sentence = self.make_sentences(**kwargs) |
| 283 | + if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars: |
| 284 | + return sentence |
| 285 | + |
| 286 | + |
| 287 | +"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------""" |
0 commit comments