Skip to content

Commit e171ce0

Browse files
authored
Add summary backfill (#1948)
* Initial commit * Fill out TODOs * Move it so it works * Update documentation on summary script * Update * Remove temporary exit * Progress * Update with new CSV writer * Minor writerow updates * Minor clean-up * Name the tests * Address feedback * Address feedback
1 parent 567266c commit e171ce0

4 files changed

Lines changed: 168 additions & 1 deletion

File tree

llm/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
venv/
22
__pycache__/
33
databases/
4-
.secret.local
4+
.secret.local
5+
summaries-and-topics.csv

llm/backfill_summaries.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""This script fills any missing 'summary' or 'topics' fields on the data model.
2+
3+
The document must have a 'Title' and 'DocumentText' field to generate them. The
4+
script queries only the general court 194 bills, modifies the firebase database
5+
in-place, and generates a CSV with a description of what happened. The header for
6+
the CSV is `bill_id,status,summary,topics`. The possible statuses are,
7+
8+
- `skipped` - the bill doesn't have either a title or text, skip it
9+
- `previous_summary` - the bill previously had a summary, skip it
10+
- `failed_summary` - something went wrong when trying to summarize, skip it
11+
- `previous_topics` - the bill previously had topics, skip it
12+
- `failed_topics` - something went wrong when trying to generate topics, skip it
13+
- `generated_summary` - both the summary and topics were generated successfully
14+
15+
Developer notes:
16+
- you'll need to set the 'OPENAI_API_KEY' environment variable
17+
"""
18+
19+
import firebase_admin
20+
from llm_functions import get_summary_api_function, get_tags_api_function_v2
21+
from firebase_admin import firestore
22+
from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC
23+
import csv
24+
from normalize_summaries import normalize_summary
25+
26+
# Module constants
# Firestore collection that is queried for bills; only general court 194 is
# covered by this script.
FIREBASE_COLLECTION_PATH = "generalCourts/194/bills"
# Path of the CSV report (bill_id, status, summary, topics) written by the run.
CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv"

# Application Default credentials are automatically created.
# NOTE: both calls run at import time, so importing this module already
# initializes the Firebase app and creates the Firestore client.
app = firebase_admin.initialize_app()
db = firestore.client()
33+
34+
35+
def make_bill_summary(bill_id, status, summary, topics):
    """Build one CSV report row in the column order of the header.

    Routing every ``csv.writerow`` call through this helper keeps the four
    columns (bill id, status, summary, topics) from being forgotten or
    reordered at the call sites.

    Each value is converted to its string form, so a missing value (``None``)
    is emitted as the literal text ``None``.
    """
    columns = (bill_id, status, summary, topics)
    return [str(value) for value in columns]
42+
43+
44+
# Backfill driver: walk every bill in the collection, generate the missing
# summary (and then topics) and record one or more report rows per bill.
bills_ref = db.collection(FIREBASE_COLLECTION_PATH)
bills = bills_ref.get()
# `newline=""` is required by the csv module so that row terminators and
# newlines embedded in quoted fields are written correctly on every platform;
# the explicit encoding keeps non-ASCII bill text from depending on the locale.
with open(CSV_SUMMARY_OUTPUT, "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["bill_id", "status", "summary", "topics"])
    for bill in bills:
        document = bill.to_dict()
        bill_id = document["id"]
        # `or {}` also covers a document whose `content` field is explicitly null.
        content = document.get("content") or {}
        document_text = content.get("DocumentText")
        document_title = content.get("Title")
        summary = document.get("summary")

        # No document text or title, skip it because we can't summarize it
        if document_text is None or document_title is None:
            csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None))
            continue

        # If the summary is already populated move on
        if summary is not None:
            csv_writer.writerow(
                make_bill_summary(bill_id, "previous_summary", None, None)
            )
            continue

        summary = get_summary_api_function(bill_id, document_title, document_text)
        # Statuses -1 and -2 are treated as summarization failures.
        if summary["status"] in [-1, -2]:
            csv_writer.writerow(
                make_bill_summary(bill_id, "failed_summary", None, None)
            )
            continue
        # Note: `normalize_summary` does some post-processing to clean up the summaries
        # As of 2025-10-21 this was necessary due to the LLM prompt
        summary = normalize_summary(summary["summary"])
        bill.reference.update({"summary": summary})

        # If the topics are already populated, make a note of it and leave them
        # untouched: this script only fills in missing fields, so existing
        # topics must not be regenerated and overwritten.
        topics = document.get("topics")
        if topics is not None:
            csv_writer.writerow(
                make_bill_summary(bill_id, "previous_topics", None, None)
            )
            continue

        tags = get_tags_api_function_v2(bill_id, document_title, summary)
        # If the tags fail, make a note and at least write the summary for debugging
        if tags["status"] != 1:
            csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None))
            csv_writer.writerow(
                make_bill_summary(bill_id, "generated_summary", summary, None)
            )
            continue
        topics_and_categories = get_categories_from_topics(
            tags["tags"], CATEGORY_BY_TOPIC
        )
        bill.reference.update({"topics": topics_and_categories})
        csv_writer.writerow(
            make_bill_summary(
                bill_id, "generated_summary_and_topics", summary, topics_and_categories
            )
        )

llm/normalize_summaries.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""Normalize summary outputs from the LLM
2+
3+
The summary prompt has some formatting prose that we don't want to persist into
4+
the database. For example, it prefixes every summary with `Summary:`. We apply a
5+
few post-processing steps to every summary to keep things uniform. The steps are:
6+
7+
1. Remove leading `Summary:` from the input text
8+
2. Split any newlines created by unordered lists in the input text
9+
3. Remove leading `- ` from the split unordered lists
10+
4. Remove any remaining whitespace
11+
5. Put everything back together separated with spaces
12+
"""
13+
14+
import re
15+
16+
17+
def normalize_summary(summary: str) -> str:
    """Collapse a raw LLM summary into a single line of plain prose.

    The steps are:

    1. Remove a leading ``Summary:`` label, even when it is preceded by
       whitespace or blank lines.
    2. Split the text on line breaks.
    3. Remove a leading ``- `` bullet marker from each line, even when the
       bullet is indented.
    4. Trim surrounding whitespace and drop lines that end up empty.
    5. Join the remaining lines with single spaces.

    Args:
        summary: The raw summary text.

    Returns:
        The normalized summary, or an empty string when nothing is left.
    """
    # `\s*` tolerates output that starts with a newline or spaces before the label.
    without_label = re.sub(r"^\s*Summary:", "", summary)
    cleaned_lines = []
    for line in without_label.splitlines():
        # Allow indentation before the bullet; otherwise an indented "- item"
        # would keep its dash once the line is stripped.
        text = re.sub(r"^\s*- ", "", line).strip()
        if text:
            cleaned_lines.append(text)
    return " ".join(cleaned_lines)

llm/test_normalize_summaries.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import normalize_summaries
2+
3+
4+
def test_normalize_summary_handles_summary_prefix_and_bullets():
    """A `Summary:` header followed by a bulleted list becomes one sentence run."""
    raw_text = """Summary:
- The bill allows Joe, the chief of police in Gravity, to continue working.
- The city can require annual health examinations
"""
    expected = "The bill allows Joe, the chief of police in Gravity, to continue working. The city can require annual health examinations"
    actual = normalize_summaries.normalize_summary(raw_text)
    assert actual == expected
13+
14+
15+
def test_normalize_summary_handles_summary_prefix_and_no_bullets():
    """A `Summary:` header followed by a plain line keeps only that line."""
    raw_text = """Summary:
The bill allows Joe, the chief of police in Gravity, to continue working.
"""
    expected = (
        "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
    actual = normalize_summaries.normalize_summary(raw_text)
    assert actual == expected
23+
24+
25+
def test_normalize_summary_handles_summary_prefix_with_no_linebreak():
    """An inline `Summary: ...` prefix is removed along with the space after it."""
    raw_text = "Summary: The bill allows Joe, the chief of police in Gravity, to continue working."
    expected = (
        "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
    actual = normalize_summaries.normalize_summary(raw_text)
    assert actual == expected
31+
32+
33+
def test_normalize_summary_handles_bare_summary():
    """Text without a prefix or bullets passes through unchanged."""
    raw_text = "The bill allows Joe, the chief of police in Gravity, to continue working."
    expected = (
        "The bill allows Joe, the chief of police in Gravity, to continue working."
    )
    actual = normalize_summaries.normalize_summary(raw_text)
    assert actual == expected

0 commit comments

Comments
 (0)