|
| 1 | +"""This script fills any missing 'summary' or 'topics' fields on the data model. |
| 2 | +
|
| 3 | +The document must have a 'Title' and 'DocumentText' field to generate them. The |
| 4 | +script queries only the general court 194 bills, modifies the firebase database |
| 5 | +in-place, and generates a CSV with a description of what happened. The header for |
| 6 | +the CSV is `bill_id,status,summary,topics`. The possible statuses are, |
| 7 | +
|
| 8 | +- `skipped` - the bill doesn't have either a title or text, skip it |
| 9 | +- `previous_summary` - the bill previously had a summary, skip it |
| 10 | +- `failed_summary` - something went wrong when trying to summarize, skip it |
| 11 | +- `previous_topics` - the bill previously had topics, skip it |
| 12 | +- `failed_topics` - something went wrong when trying to generate topics, skip it |
| 13 | +- `generated_summary_and_topics` - both the summary and topics were generated successfully (`generated_summary` alone means only the summary was generated) |
| 14 | +
|
| 15 | +Developer notes: |
| 16 | +- you'll need to set the 'OPENAI_API_KEY' environment variable |
| 17 | +""" |
| 18 | + |
| 19 | +import firebase_admin |
| 20 | +from llm_functions import get_summary_api_function, get_tags_api_function_v2 |
| 21 | +from firebase_admin import firestore |
| 22 | +from bill_on_document_created import get_categories_from_topics, CATEGORY_BY_TOPIC |
| 23 | +import csv |
| 24 | +from normalize_summaries import normalize_summary |
| 25 | + |
# Module constants
# Firestore collection path that is queried for bills, and the path of the CSV
# report this script writes.
FIREBASE_COLLECTION_PATH = "generalCourts/194/bills"
CSV_SUMMARY_OUTPUT = "./summaries-and-topics.csv"

# Application Default credentials are automatically created.
# The app is initialized first, then the Firestore client used for all reads
# and writes below is created.
app = firebase_admin.initialize_app()
db = firestore.client()
| 33 | + |
| 34 | + |
def make_bill_summary(bill_id, status, summary, topics):
    """Build one CSV row, as a list of strings, for csv.writerow.

    Routing every row through this helper keeps the four columns
    (bill_id, status, summary, topics) present and in the same order at
    each call site. Each value is string-formatted as-is, so a value of
    None is rendered as the text "None".
    """
    columns = (bill_id, status, summary, topics)
    return [f"{value}" for value in columns]
| 42 | + |
| 43 | + |
# Fetch every bill in the collection, then walk them one at a time: generate
# and store a summary and topics where they are missing, and record the
# outcome of each step as a row in the CSV report.
bills_ref = db.collection(FIREBASE_COLLECTION_PATH)
bills = bills_ref.get()
# newline="" lets the csv module control line endings itself (no blank rows on
# Windows, embedded newlines in summaries stay correctly quoted). The explicit
# encoding keeps the report independent of the machine's locale.
with open(CSV_SUMMARY_OUTPUT, "w", newline="", encoding="utf-8") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["bill_id", "status", "summary", "topics"])
    for bill in bills:
        document = bill.to_dict()
        # Fall back to the Firestore document id when the payload has no "id"
        # field, instead of aborting the whole run with a KeyError.
        bill_id = document.get("id", bill.id)
        # "content" may be missing or stored as null; treat both as empty.
        content = document.get("content") or {}
        document_text = content.get("DocumentText")
        document_title = content.get("Title")

        # No document text or title, skip it because we can't summarize it
        if document_text is None or document_title is None:
            csv_writer.writerow(make_bill_summary(bill_id, "skipped", None, None))
            continue

        # If the summary is already populated move on
        if document.get("summary") is not None:
            csv_writer.writerow(
                make_bill_summary(bill_id, "previous_summary", None, None)
            )
            continue

        summary_response = get_summary_api_function(
            bill_id, document_title, document_text
        )
        # Statuses -1 and -2 are treated as a failed summarization
        if summary_response["status"] in [-1, -2]:
            csv_writer.writerow(
                make_bill_summary(bill_id, "failed_summary", None, None)
            )
            continue
        # Note: `normalize_summary` does some post-processing to clean up the summaries
        # As of 2025-10-21 this was necessary due to the LLM prompt
        summary = normalize_summary(summary_response["summary"])
        bill.reference.update({"summary": summary})

        # If the topics are already populated, make a note of it and leave them
        # untouched rather than regenerating and overwriting them.
        if document.get("topics") is not None:
            csv_writer.writerow(
                make_bill_summary(bill_id, "previous_topics", None, None)
            )
            continue

        tags = get_tags_api_function_v2(bill_id, document_title, summary)
        # If the tags fail, make a note and at least write the summary for debugging
        if tags["status"] != 1:
            csv_writer.writerow(make_bill_summary(bill_id, "failed_topics", None, None))
            csv_writer.writerow(
                make_bill_summary(bill_id, "generated_summary", summary, None)
            )
            continue

        topics_and_categories = get_categories_from_topics(
            tags["tags"], CATEGORY_BY_TOPIC
        )
        bill.reference.update({"topics": topics_and_categories})
        csv_writer.writerow(
            make_bill_summary(
                bill_id, "generated_summary_and_topics", summary, topics_and_categories
            )
        )
0 commit comments