Skip to content

Commit 4bf579c

Browse files
authored
Add files via upload
1 parent 5209394 commit 4bf579c

3 files changed

Lines changed: 358 additions & 0 deletions

File tree

chain.py

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
import re
2+
import random
3+
import operator
4+
import bisect
5+
import json
6+
from unidecode import unidecode
7+
from splitters import split_into_sentences
8+
9+
# Python 2/3 compatibility: `basestring` does not exist on Python 3,
# so alias it to `str` there.
try: # pragma: no cover
    basestring
except NameError: # pragma: no cover
    basestring = str

# Sentinel tokens that pad the start of every run and mark its end.
BEGIN = "__BEGIN__"
END = "__END__"
16+
17+
18+
def accumulate(iterable, func=operator.add):
    """
    Cumulative calculations. (Summation, by default.)

    Yields the running result of `func` applied over `iterable`: the
    first item as-is, then func(total, item) for each following item.
    Yields nothing for an empty iterable.
    """
    it = iter(iterable)
    try:
        total = next(it)
    except StopIteration:
        # PEP 479: a StopIteration escaping a generator body becomes a
        # RuntimeError on Python 3.7+, so return cleanly instead of
        # letting the bare next() raise for an empty input.
        return
    yield total
    for element in it:
        total = func(total, element)
        yield total
28+
29+
30+
class MarkovChain(object):
    """
    Markov Chain which has beginning and end.
    """

    def __init__(self, corpus, state_size):
        """
        corpus: a list of runs, where each run is a list of the words
        making up one sentence.

        state_size: number of items used to represent one model state.
        """
        self.state_size = state_size
        self.model = self.build(corpus, self.state_size)
        # self.precompute_begin_state()

    def build(self, corpus, state_size):
        """
        Return a dict of dicts: each outer key is a possible state and
        its inner dict maps every observed "next" item to the number of
        times that item followed the state.
        """
        model = {}
        for run in corpus:
            # Pad with BEGIN so the first real word has a full-size
            # state, and terminate with END.
            padded = ([BEGIN] * state_size) + run + [END]
            for start in range(len(run) + 1):
                state = tuple(padded[start:start + state_size])
                follow = padded[start + state_size]
                followers = model.setdefault(state, {})
                followers[follow] = followers.get(follow, 0) + 1
        return model

    def move(self, state):
        """
        Randomly choose the next item for the given state, weighted by
        the observed follow counts.
        """
        choices, weights = zip(*self.model[state].items())
        cumdist = list(accumulate(weights))
        target = random.random() * cumdist[-1]
        return choices[bisect.bisect(cumdist, target)]

    def gen(self, init_state=None):
        """
        Yield successive items, starting from `init_state` (or the naive
        all-BEGIN state), until the chain reaches the END token.
        """
        state = init_state or (BEGIN,) * self.state_size
        while True:
            word = self.move(state)
            if word == END:
                return
            yield word
            # Slide the state window forward by one word.
            state = tuple(state[1:]) + (word,)

    def walk(self, init_state=None):
        """
        Return a single complete run of the markov model as a list.
        """
        return [word for word in self.gen(init_state)]

    def to_json(self):
        """
        Serialize the model's (state, followers) pairs as a JSON string.
        """
        return json.dumps(list(self.model.items()))
99+
100+
101+
"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""
102+
103+
# Maximum fraction of a generated sentence's words that may appear as a
# contiguous overlap with the original text before it is rejected.
DEFAULT_MAX_OVERLAP_RATIO = 0.7
# Absolute cap (in words) on any overlapping sequence, regardless of ratio.
DEFAULT_MAX_OVERLAP_TOTAL = 20
# Number of generation attempts before giving up and returning None.
DEFAULT_TRIES = 8
106+
107+
108+
class Text(object):
    """
    Wraps a body of text with a MarkovChain and provides sentence
    generation plus filters for input and output sentences.
    """

    # Compiled once at class level (recompiling per call was wasteful).
    # Rejects sentences containing stray single quotes, double quotes,
    # parentheses or brackets.
    reject_pat = re.compile(r"(^')|('$)|\s'|'\s|[\"(\(\)\[\])]")

    # Whitespace pattern used to split sentences into words.
    word_split_pattern = re.compile(r"\s+")

    def __init__(self, input_text, state_size=2, chain=None, parsed_sentences=None, retain_original=True):
        """
        input_text: A string.
        state_size: An integer, indicating the number of words in the model's state.
        chain: An optional, pre-built MarkovChain to reuse instead of building one.
        parsed_sentences: It is a list of lists where the outer list is like a sentence and the inner list contains the words that make the sentence.
        retain_original: Whether to keep the parsed corpus (and rejoined text) on the instance.
        """
        can_make_sentences = parsed_sentences is not None or input_text is not None
        self.retain_original = retain_original and can_make_sentences
        self.state_size = state_size

        if self.retain_original:
            self.parsed_sentences = parsed_sentences or list(
                self.generate_corpus(input_text))

            # Rejoined text lets us assess the novelty of generated sentences
            self.rejoined_text = self.sentence_join(
                map(self.word_join, self.parsed_sentences))
            self.chain = chain or MarkovChain(
                self.parsed_sentences, state_size)
        else:
            if not chain:
                parsed = parsed_sentences or self.generate_corpus(input_text)
            self.chain = chain or MarkovChain(parsed, state_size)

    def to_dict(self):
        """
        Makes a Python dict of all the data.
        """
        return {
            "state_size": self.state_size,
            # BUG FIX: the method must be called — the original stored
            # the bound method object `self.chain.to_json` instead of
            # the JSON string it produces.
            "chain": self.chain.to_json(),
            "parsed_sentences": self.parsed_sentences if self.retain_original else None
        }

    def sentence_split(self, text):
        """
        Splits full-text string into a list of sentences.
        """
        return split_into_sentences(text)

    def sentence_join(self, sentences):
        """
        Rejoins a list of sentences into the full text.
        """
        return " ".join(sentences)

    def word_split(self, sentence):
        """
        Splits the sentence into a list of words.
        """
        return re.split(self.word_split_pattern, sentence)

    def word_join(self, words):
        """
        Rejoins a list of words into a sentence.
        """
        return " ".join(words)

    def text_sentences_input(self, sentence):
        """
        A sentence filter that will reject any sentence that has strange
        punctuation in it (stray quotes, parentheses, brackets) or is
        empty/whitespace-only.
        """
        if len(sentence.strip()) == 0:
            return False

        # Decode unicode, mainly to normalize fancy quotation marks
        if isinstance(sentence, str):
            decoded = sentence
        else:
            decoded = unidecode(sentence)

        # Sentence shouldn't contain problematic characters
        if re.search(self.reject_pat, decoded):
            return False

        return True

    def generate_corpus(self, text):
        """
        Returns an iterable of sentences, each a list of words, built
        from a string or an iterable of lines.
        """
        if isinstance(text, str):
            sentences = self.sentence_split(text)
        else:
            sentences = []
            for line in text:
                sentences += self.sentence_split(line)

        passing = filter(self.text_sentences_input, sentences)
        runs = map(self.word_split, passing)

        return runs

    def text_sentences_output(self, words, max_overlap_ratio, max_overlap_total):
        """
        Given a generated list of words, accept or reject it. This one rejects sentences that too closely match the original text, namely those that contain any identical sequence of words of X length, where X is the smaller number of (a) `max_overlap_ratio` (default: 0.7) of the total number of words, and (b) `max_overlap_total` (default: 20).
        """
        # Rejects any chunk that also appears verbatim in the original.
        overlap_ratio = int(round(max_overlap_ratio * len(words)))
        overlap_max = min(max_overlap_total, overlap_ratio)
        overlap_over = overlap_max + 1

        gram_count = max((len(words) - overlap_max), 1)
        grams = [words[i:i+overlap_over] for i in range(gram_count)]

        for gm in grams:
            gram_joined = self.word_join(gm)
            if gram_joined in self.rejoined_text:
                return False

        return True

    def make_sentences(self, init_state=None, **kwargs):
        """
        Attempts "tries" (default: 8) times to generate a valid sentence, based on the model and "text_sentences_output". Passes "max_overlap_ratio" and "max_overlap_total" to "text_sentences_output".

        If successful, returns the sentence as a string. If not, returns None.

        If "init_state" (a tuple of "self.chain.state_size" words) is not specified, this method chooses a sentence-start at random, in accordance with the model.

        If "test_output" is set as False then the "text_sentences_output" check will be skipped.

        If "max_words" is specified, the word count for the sentence will be evaluated against the provided limit.
        """
        tries = kwargs.get("tries", DEFAULT_TRIES)
        mor = kwargs.get("max_overlap_ratio", DEFAULT_MAX_OVERLAP_RATIO)
        mot = kwargs.get("max_overlap_total", DEFAULT_MAX_OVERLAP_TOTAL)
        test_output = kwargs.get("test_output", True)
        max_words = kwargs.get("max_words", None)

        if init_state is not None:
            prefix = list(init_state)
            # Drop any leading BEGIN padding so it never appears in output.
            for word in prefix:
                if word == BEGIN:
                    prefix = prefix[1:]
                else:
                    break
        else:
            prefix = []

        for _ in range(tries):
            words = prefix + self.chain.walk(init_state)
            if max_words is not None and len(words) > max_words:
                continue
            if test_output and hasattr(self, "rejoined_text"):
                if self.text_sentences_output(words, mor, mot):
                    return self.word_join(words)
            else:
                return self.word_join(words)

        return None

    def make_short_sentence(self, max_chars, min_chars=0, **kwargs):
        """
        Tries making a sentence of no more than "max_chars" characters and optionally no less than "min_chars" characters, passing **kwargs to "self.make_sentences". Returns None when no attempt fits.
        """
        tries = kwargs.get("tries", DEFAULT_TRIES)

        for _ in range(tries):
            sentence = self.make_sentences(**kwargs)
            if sentence and len(sentence) <= max_chars and len(sentence) >= min_chars:
                return sentence
285+
286+
287+
"""-------------------------------------------------------------------------------------------------------------------------------------------------------------------"""

splitters.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -*- coding: utf-8 -*-
2+
import re
3+
4+
ascii_lowercase = "abcdefghijklmnopqrstuvwxyz"
ascii_uppercase = ascii_lowercase.upper()

# States w/ thanks to https://github.com/unitedstates/python-us
# Titles w/ thanks to https://github.com/nytimes/emphasis and @donohoe
# Abbreviations that normally appear capitalized. The pipe-joined string
# is immediately split back into a flat list of lowercase abbreviations.
# NOTE(review): "may" is absent from the month list — presumably because
# "May" is also a common word; confirm this is intentional.
abbr_capped = "|".join([
    "ala|ariz|ark|calif|colo|conn|del|fla|ga|ill|ind|kan|ky|la|md|mass|mich|minn|miss|mo|mont|neb|nev|okla|ore|pa|tenn|vt|va|wash|wis|wyo", # States
    "u.s",
    "mr|ms|mrs|msr|dr|gov|pres|sen|sens|rep|reps|prof|gen|messrs|col|sr|jf|sgt|mgr|fr|rev|jr|snr|atty|supt", # Titles
    "ave|blvd|st|rd|hwy", # Streets
    "jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec", # Months
    "|".join(ascii_lowercase) # Initials
]).split("|")

# Pipe-joined string of abbreviations that appear lowercase.
abbr_lowercase = "etc|v|vs|viz|al|pct"

# Dotted uppercase initialisms that end with "." but do not end a sentence.
exceptions = "U.S.|U.N.|E.U.|F.B.I.|C.I.A.".split("|")
21+
22+
def is_abbreviation(dotted_word):
    """
    Return True if `dotted_word` (a word ending in ".") is a known
    abbreviation rather than a genuine sentence-ending word.
    """
    clipped = dotted_word[:-1]
    if clipped[0] in ascii_uppercase:
        # Capitalized abbreviations: states, titles, streets, months,
        # single-letter initials.
        return clipped.lower() in abbr_capped
    # Lowercase abbreviations. BUG FIX: split the pipe-joined string so
    # this is a whole-word membership test — the original tested
    # `clipped in abbr_lowercase` on the raw string, a SUBSTRING check
    # that wrongly matched fragments like "t", "c" or "e".
    return clipped in abbr_lowercase.split("|")
30+
31+
def is_sentence_ender(word):
    """
    Decide whether `word` terminates a sentence.
    """
    if word in exceptions:
        return False
    last_char = word[-1]
    if last_char in ("?", "!"):
        return True
    # Two or more capital letters suggest an initialism (e.g. "U.K.")
    # not covered by the exception list; treat it as an ender.
    capitals_only = re.sub(r"[^A-Z]", "", word)
    if len(capitals_only) > 1:
        return True
    # A trailing period ends the sentence unless the word is a known
    # abbreviation.
    return last_char == "." and not is_abbreviation(word)
40+
41+
def split_into_sentences(text):
    """
    Split `text` into sentences; returns a list of stripped strings.
    """
    potential_end_pat = re.compile(
        r"([\w\.'’&\]\)]+[\.\?!])"  # A word that ends with punctuation
        r"([‘’“”'\"\)\]]*)"         # Followed by optional quote/parens/etc
        r"(\s+(?![a-z\-–—]))",      # Followed by whitespace + non-(lowercase or dash)
        re.U,
    )
    end_indices = []
    for match in potential_end_pat.finditer(text):
        if is_sentence_ender(match.group(1)):
            # Sentence ends after the word plus any trailing quotes/parens.
            end_indices.append(
                match.start() + len(match.group(1)) + len(match.group(2)))
    starts = [None] + end_indices
    ends = end_indices + [None]
    return [text[start:end].strip() for start, end in zip(starts, ends)]

test.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import chain

# Hard-coded local paths retained from the original script.
INPUT_PATH = "/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/caption_para.txt"
OUTPUT_PATH = "/home/saurabh/Personal/Stuff/Next Tech Lab AP/LSSC/text3.txt"

with open(INPUT_PATH) as f:
    text = f.read()

text_model = chain.Text(text, state_size=3)

# `with` guarantees the output file is closed even if generation fails
# (the original opened it bare and shadowed the `file` builtin).
with open(OUTPUT_PATH, "w") as out_file:
    for _ in range(200):
        print(" ")
        sentence = text_model.make_short_sentence(max_chars=230, min_chars=70)
        # make_short_sentence returns None when no valid sentence was
        # found within its tries; skip instead of crashing on
        # `None + "\n"` as the original did.
        if sentence is None:
            continue
        line = sentence + "\n"
        print(line)
        out_file.write(line)

0 commit comments

Comments
 (0)