Skip to content
This repository was archived by the owner on Mar 26, 2026. It is now read-only.

Commit dd3e090

Browse files
committed
optimize rst
1 parent a4a0179 commit dd3e090

1 file changed

Lines changed: 129 additions & 42 deletions

File tree

gapic/utils/rst.py

Lines changed: 129 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,105 @@
1313
# limitations under the License.
1414

1515
import re
16-
from typing import Optional
16+
from typing import Optional, List, Dict
1717

1818
import pypandoc # type: ignore
1919

2020
from gapic.utils.lines import wrap
2121

22+
# --- PERFORMANCE CACHE ---
23+
_RAW_RST_CACHE: Dict[str, str] = {}
24+
25+
26+
def _aggressive_fast_convert(text: str) -> Optional[str]:
27+
"""
28+
Converts common Markdown (Code, Links, Lists) to RST using pure Python.
29+
Only gives up (returns None) for complex structures like Tables.
30+
"""
31+
# 1. TABLE CHECK (The only thing we strictly need Pandoc for)
32+
# If we see a pipe surrounded by spaces, it's likely a table.
33+
if re.search(r" \| ", text) or re.search(r"\|\n", text):
34+
return None
35+
36+
# 2. CODE BLOCKS: `code` -> ``code``
37+
# RST requires double backticks. Markdown uses one.
38+
# We look for backticks that aren't already double.
39+
# Regex: Negative lookbehind/lookahead to ensure we don't match ``already rst``.
40+
converted = re.sub(r"(?<!`)`([^`]+)`(?!`)", r"``\1``", text)
41+
42+
# 3. LINKS: [Text](URL) -> `Text <URL>`__
43+
# We use anonymous links (__) to avoid collision issues.
44+
converted = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"`\1 <\2>`__", converted)
45+
46+
# 4. BOLD: **text** -> **text** (Compatible, no change needed)
47+
48+
# 5. HEADINGS: # Heading -> Heading\n=======
49+
# (Simple fix for H1/H2, mostly sufficient for docstrings)
50+
converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE)
51+
converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE)
52+
53+
# 6. LISTS: Markdown lists (- item) work in RST mostly fine.
54+
# We just ensure there's a newline before a list starts to satisfy RST strictness.
55+
converted = re.sub(r"(\n[^-*].*)\n\s*[-*] ", r"\1\n\n- ", converted)
56+
57+
return converted
58+
59+
60+
def batch_convert_docstrings(docstrings: List[str]) -> None:
    """Pre-populate the RST cache for a batch of docstrings.

    Each distinct, non-empty docstring that is not cached yet and contains
    at least one formatting character is first handed to the pure-Python
    converter. Whatever that converter declines (it returns ``None``) is
    joined into a single payload and converted with one Pandoc call.

    This is best effort: if the Pandoc call raises, or its output cannot be
    split back into exactly one piece per input, the function returns
    without caching those docstrings.

    Args:
        docstrings (List[str]): The raw docstrings to convert.
    """
    # dict.fromkeys de-duplicates like a set but keeps the input order, so
    # the payload sent to Pandoc is the same from run to run.
    candidates = [
        doc
        for doc in dict.fromkeys(docstrings)
        if doc
        and doc not in _RAW_RST_CACHE
        and re.search(r"[|*`_[\]#]", doc)  # Only text with formatting chars.
    ]
    if not candidates:
        return

    # 1. Pure-Python conversion; collect what it cannot handle.
    pandoc_batch: List[str] = []
    for doc in candidates:
        fast_result = _aggressive_fast_convert(doc)
        if fast_result is None:
            pandoc_batch.append(doc)
        else:
            _RAW_RST_CACHE[doc] = fast_result.strip()

    if not pandoc_batch:
        return

    # 2. One Pandoc call for the remainder.
    #
    # The marker must come out of the conversion unchanged, so it is purely
    # alphanumeric. A marker wrapped in double underscores is strong
    # emphasis in CommonMark and would be rewritten by the conversion,
    # after which the split below could never find it.
    split_marker = "GAPICBATCHSPLITMARKER"
    separator = f"\n\n{split_marker}\n\n"

    try:
        converted_payload = pypandoc.convert_text(
            separator.join(pandoc_batch),
            "rst",
            format="commonmark",
            extra_args=["--columns=1000"],
        )
    except Exception:
        # Deliberate best effort: leave these docstrings uncached rather
        # than fail the whole batch.
        return

    results = converted_payload.split(split_marker)

    # Only trust the split when it yields exactly one piece per input;
    # otherwise pieces cannot be paired with their source docstrings.
    if len(results) == len(pandoc_batch):
        for original, converted in zip(pandoc_batch, results):
            _RAW_RST_CACHE[original] = converted.strip()
114+
22115

23116
def rst(
24117
text: str,
@@ -27,59 +120,53 @@ def rst(
27120
nl: Optional[bool] = None,
28121
source_format: str = "commonmark",
29122
):
30-
"""Convert the given text to ReStructured Text.
31-
32-
Args:
33-
text (str): The text to convert.
34-
width (int): The number of columns.
35-
indent (int): The number of columns to indent each line of text
36-
(except the first).
37-
nl (bool): Whether to append a trailing newline.
38-
Defaults to appending a newline if the result is more than
39-
one line long.
40-
source_format (str): The source format. This is ``commonmark`` by
41-
default, which is what is used by convention in protocol buffers.
42-
43-
Returns:
44-
str: The same text, in RST format.
45-
"""
46-
# Quick check: If the text block does not appear to have any formatting,
47-
# do not convert it.
48-
# (This makes code generation significantly faster; calling out to pandoc
49-
# is by far the most expensive thing we do.)
50-
if not re.search(r"[|*`_[\]]", text):
123+
"""Convert the given text to ReStructured Text."""
124+
125+
# 1. Super Fast Path: No special chars? Just wrap.
126+
if not re.search(r"[|*`_[\]#]", text):
51127
answer = wrap(
52128
text,
53129
indent=indent,
54130
offset=indent + 3,
55131
width=width - indent,
56132
)
133+
return _finalize(answer, nl, indent)
134+
135+
# 2. Check Cache
136+
if text in _RAW_RST_CACHE:
137+
raw_rst = _RAW_RST_CACHE[text]
57138
else:
58-
# Convert from CommonMark to ReStructured Text.
59-
answer = (
60-
pypandoc.convert_text(
139+
# Slow Path: Missed by batch or new string.
140+
# TRY PYTHON CONVERT FIRST.
141+
# This prevents the 'Slow Path' from actually being slow.
142+
fast_result = _aggressive_fast_convert(text)
143+
144+
if fast_result is not None:
145+
raw_rst = fast_result.strip()
146+
else:
147+
# The absolute last resort: Shell out to Pandoc
148+
raw_rst = pypandoc.convert_text(
61149
text,
62150
"rst",
63151
format=source_format,
64-
extra_args=["--columns=%d" % (width - indent)],
65-
)
66-
.strip()
67-
.replace("\n", f"\n{' ' * indent}")
68-
)
152+
extra_args=["--columns=1000"]
153+
).strip()
154+
155+
_RAW_RST_CACHE[text] = raw_rst
156+
157+
# 3. Python Formatting
158+
if "::" in raw_rst or ".. code" in raw_rst:
159+
answer = raw_rst.replace("\n", f"\n{' ' * indent}")
160+
else:
161+
answer = wrap(raw_rst, indent=indent, offset=indent, width=width - indent)
162+
163+
return _finalize(answer, nl, indent)
69164

70-
# Add a newline to the end of the document if any line breaks are
71-
# already present.
72-
#
73-
# This causes the closing """ to be on the subsequent line only when
74-
# appropriate.
165+
166+
def _finalize(answer, nl, indent):
167+
"""Helper to handle trailing newlines and quotes."""
75168
if nl or ("\n" in answer and nl is None):
76169
answer += "\n" + " " * indent
77-
78-
# If the text ends in a double-quote, append a period.
79-
# This ensures that we do not get a parse error when this output is
80-
# followed by triple-quotes.
81170
if answer.endswith('"'):
82171
answer += "."
83-
84-
# Done; return the answer.
85-
return answer
172+
return answer

0 commit comments

Comments
 (0)