diff --git a/docs/changelog.md b/docs/changelog.md index be1cf69b7a..3d36473071 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,10 @@ This changelog documents user-relevant changes to the GitHub runner charm. +## 2026-03-13 + +- Reduced planner pressure stream reconnect log noise by treating transient disconnects/timeouts as warnings without duplicate stack traces, while preserving error-level logging for non-transient planner API failures. + ## 2026-03-11 - Added Prometheus metrics for GithubClient operation counts, errors, latency, and rate-limit visibility. diff --git a/github-runner-manager/pyproject.toml b/github-runner-manager/pyproject.toml index a1365a6551..9a8d0749b0 100644 --- a/github-runner-manager/pyproject.toml +++ b/github-runner-manager/pyproject.toml @@ -3,7 +3,7 @@ [project] name = "github-runner-manager" -version = "0.15.0" +version = "0.15.1" authors = [ { name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" }, ] diff --git a/github-runner-manager/src/github_runner_manager/manager/pressure_reconciler.py b/github-runner-manager/src/github_runner_manager/manager/pressure_reconciler.py index c3c1a45d93..8b43e1fd36 100644 --- a/github-runner-manager/src/github_runner_manager/manager/pressure_reconciler.py +++ b/github-runner-manager/src/github_runner_manager/manager/pressure_reconciler.py @@ -14,7 +14,6 @@ import grp import logging import os -import time from dataclasses import dataclass from threading import Event, Lock from typing import Optional @@ -32,6 +31,7 @@ PlannerApiError, PlannerClient, PlannerConfiguration, + PlannerConnectionError, ) from github_runner_manager.platform.github_provider import GitHubRunnerPlatform @@ -131,14 +131,29 @@ def start_create_loop(self) -> None: if self._stop.is_set(): return self._handle_create_runners(update.pressure) + except PlannerConnectionError as exc: + fallback = max(self._last_pressure or 0, self._config.min_pressure) + logger.warning( + "Pressure stream interrupted for flavor %s (%s), falling back to %s runners.", + self._config.flavor_name, + exc, + fallback, + ) + if self._stop.is_set(): + return + self._handle_create_runners(fallback) + self._stop.wait(5) except PlannerApiError: fallback = max(self._last_pressure or 0, self._config.min_pressure) logger.exception( - "Error in pressure stream loop, falling back to %s runners.", + "Error in pressure stream loop for flavor %s, falling back to %s runners.", + self._config.flavor_name, fallback, ) + if self._stop.is_set(): + return self._handle_create_runners(fallback) - time.sleep(5) + self._stop.wait(5) def start_reconcile_loop(self) -> None: """Periodically reconcile runners: sync state, scale up/down, and clean up.""" diff --git a/github-runner-manager/src/github_runner_manager/planner_client.py b/github-runner-manager/src/github_runner_manager/planner_client.py index f47c207e0b..6ff371d1f5 100644 --- a/github-runner-manager/src/github_runner_manager/planner_client.py +++ b/github-runner-manager/src/github_runner_manager/planner_client.py @@ -46,6 +46,10 @@ class PlannerApiError(Exception): """Represents an error while interacting with the planner service.""" +class PlannerConnectionError(PlannerApiError): + """Transient connection or stream interruption error.""" + + class PlannerClient: # pylint: disable=too-few-public-methods """An HTTP client for the planner service.""" @@ -69,7 +73,8 @@ def stream_pressure(self, name: str) -> Iterable[PressureInfo]: Parsed pressure updates. Raises: - PlannerApiError: On HTTP or stream errors. + PlannerConnectionError: On transient stream disconnects or timeouts. + PlannerApiError: On other HTTP or stream errors. """ base = str(self._config.base_url).rstrip("/") + "/" url = urljoin(base, f"api/v1/flavors/{name}/pressure?stream=true") @@ -93,9 +98,14 @@ def stream_pressure(self, name: str) -> Iterable[PressureInfo]: except json.JSONDecodeError: logger.warning("Skipping malformed stream line: %s", line) continue + except ( + requests.ConnectionError, + requests.Timeout, + requests.exceptions.ChunkedEncodingError, + ) as exc: + raise PlannerConnectionError(str(exc)) from exc except requests.RequestException as exc: - logger.exception("Error while streaming pressure for flavor '%s' from planner.", name) - raise PlannerApiError from exc + raise PlannerApiError(str(exc)) from exc @staticmethod def _create_session() -> requests.Session: diff --git a/github-runner-manager/tests/unit/manager/test_pressure_reconciler.py b/github-runner-manager/tests/unit/manager/test_pressure_reconciler.py index a2d84911a4..66b2e660d2 100644 --- a/github-runner-manager/tests/unit/manager/test_pressure_reconciler.py +++ b/github-runner-manager/tests/unit/manager/test_pressure_reconciler.py @@ -12,7 +12,7 @@ PressureReconciler, PressureReconcilerConfig, ) -from github_runner_manager.planner_client import PlannerApiError +from github_runner_manager.planner_client import PlannerApiError, PlannerConnectionError class _FakeManager: @@ -64,66 +64,82 @@ class _FakePlanner: def __init__( self, stream_updates: list[int] | None = None, - stream_raises: bool = False, + stream_exception: Exception | None = None, ): """Initialize with configurable stream behavior.""" self._stream_updates = stream_updates or [] - self._stream_raises = stream_raises + self._stream_exception = stream_exception def stream_pressure(self, name: str): # noqa: ARG002 - """Yield pressure updates or raise PlannerApiError based on configuration. + """Yield pressure updates or raise the configured exception. Yields: Namespace objects with a pressure attribute. """ - if self._stream_raises: - raise PlannerApiError + if self._stream_exception is not None: + raise self._stream_exception for p in self._stream_updates: yield SimpleNamespace(pressure=p) -def test_min_pressure_used_as_fallback_when_stream_errors(monkeypatch: pytest.MonkeyPatch): +@pytest.mark.parametrize( + "planner_error", + [ + pytest.param(PlannerApiError("request failed"), id="planner_api_error"), + pytest.param(PlannerConnectionError("connection dropped"), id="planner_connection_error"), + ], +) +def test_min_pressure_used_as_fallback_when_stream_errors( + monkeypatch: pytest.MonkeyPatch, planner_error: Exception +): """ - arrange: A reconciler whose planner stream raises PlannerApiError and no prior pressure. + arrange: A reconciler whose planner stream raises a planner error and no prior pressure. act: Call start_create_loop. assert: min_pressure is used as fallback to create runners. """ mgr = _FakeManager() - planner = _FakePlanner(stream_raises=True) + planner = _FakePlanner(stream_exception=planner_error) cfg = PressureReconcilerConfig(flavor_name="small", min_pressure=2) reconciler = PressureReconciler(mgr, planner, cfg, lock=Lock()) - def _stop_after_backoff(_seconds: int): - """Stop the reconciler after the backoff sleep is triggered.""" + def _stop_after_backoff(_seconds: int) -> bool: + """Stop the reconciler after the backoff wait is triggered.""" reconciler.stop() + return True - monkeypatch.setattr( - "github_runner_manager.manager.pressure_reconciler.time.sleep", _stop_after_backoff - ) + monkeypatch.setattr(reconciler._stop, "wait", _stop_after_backoff) reconciler.start_create_loop() assert 2 in mgr.created_args -def test_fallback_preserves_last_pressure_when_higher(monkeypatch: pytest.MonkeyPatch): +@pytest.mark.parametrize( + "planner_error", + [ + pytest.param(PlannerApiError("request failed"), id="planner_api_error"), + pytest.param(PlannerConnectionError("connection dropped"), id="planner_connection_error"), + ], +) +def test_fallback_preserves_last_pressure_when_higher( + monkeypatch: pytest.MonkeyPatch, planner_error: Exception +): """ arrange: A reconciler with last_pressure=10 and min_pressure=2 whose stream errors. act: Call start_create_loop. assert: The higher last_pressure is used as fallback instead of min_pressure. """ mgr = _FakeManager() - planner = _FakePlanner(stream_raises=True) + planner = _FakePlanner(stream_exception=planner_error) cfg = PressureReconcilerConfig(flavor_name="small", min_pressure=2) reconciler = PressureReconciler(mgr, planner, cfg, lock=Lock()) reconciler._last_pressure = 10 - def _stop_after_backoff(_seconds: int): - """Stop the reconciler after the backoff sleep is triggered.""" + def _stop_after_backoff(_seconds: int) -> bool: + """Stop the reconciler after the backoff wait is triggered.""" reconciler.stop() + return True - monkeypatch.setattr( - "github_runner_manager.manager.pressure_reconciler.time.sleep", _stop_after_backoff - ) + monkeypatch.setattr(reconciler._stop, "wait", _stop_after_backoff) reconciler.start_create_loop() assert 10 in mgr.created_args diff --git a/github-runner-manager/tests/unit/test_planner_client.py b/github-runner-manager/tests/unit/test_planner_client.py index a0c4dcddc7..c8e0782909 100644 --- a/github-runner-manager/tests/unit/test_planner_client.py +++ b/github-runner-manager/tests/unit/test_planner_client.py @@ -6,26 +6,36 @@ import json from types import SimpleNamespace +import pytest import requests from github_runner_manager.planner_client import ( + PlannerApiError, PlannerClient, PlannerConfiguration, + PlannerConnectionError, ) class _FakeResponse: """Minimal Response-like object used to stub `requests.Response`.""" - def __init__(self, status_code: int = 200, lines: list[str] | None = None) -> None: + def __init__( + self, + status_code: int = 200, + lines: list[str] | None = None, + iter_lines_exception: Exception | None = None, + ) -> None: """Minimal Response-like object used to stub `requests.Response`. Args: status_code: HTTP status code to emulate. lines: Lines yielded by `iter_lines()` for streaming tests. + iter_lines_exception: Exception raised while iterating stream lines. """ self.status_code = status_code self._lines = lines or [] + self._iter_lines_exception = iter_lines_exception self._closed = False def raise_for_status(self) -> None: @@ -48,6 +58,8 @@ def iter_lines(self, decode_unicode: bool = True): """ for line in self._lines: yield line + if self._iter_lines_exception is not None: + raise self._iter_lines_exception def close(self) -> None: """Mark the response as closed (context manager support).""" @@ -100,11 +112,16 @@ def get(self, url: str, headers: dict[str, str], timeout: int, stream: bool = Fa return _FakeResponse(status_code=200) -def _fake_get_stream_response(lines: list[str], status_code: int = 200): - """Build a stub `Session.get` returning stream lines (NDJSON). +def _fake_get_stream_response( + lines: list[str], + stream_error: Exception | None = None, + status_code: int = 200, +): + """Build a stub `Session.get` returning a fake streaming response. Args: lines: List of NDJSON lines to yield. + stream_error: Optional exception raised after all lines are yielded. status_code: HTTP status code for the response (default: 200). Returns: @@ -112,18 +129,12 @@ def _fake_get_stream_response(lines: list[str], status_code: int = 200): """ def _fake_get(url, headers, timeout, stream=False): - """Return a fake streaming response yielding predefined lines. - - Args: - url: Request URL (ignored by stub). - headers: Request headers (ignored by stub). - timeout: Request timeout in seconds (ignored by stub). - stream: Whether streaming is requested (ignored by stub). - - Returns: - _FakeResponse: Response configured with the provided NDJSON lines. - """ - return _FakeResponse(status_code=status_code, lines=lines) + """Return a fake streaming response.""" + return _FakeResponse( + status_code=status_code, + lines=lines, + iter_lines_exception=stream_error, + ) return _fake_get @@ -146,3 +157,111 @@ def test_stream_pressure_success(monkeypatch): updates = list(client.stream_pressure("small")) assert updates[0].pressure == 2 assert updates[1].pressure == 5 + + +@pytest.mark.parametrize( + ("request_error", "expected_error", "message"), + [ + pytest.param( + requests.ConnectionError, + PlannerConnectionError, + "connection dropped", + id="connection_error", + ), + pytest.param( + requests.Timeout, + PlannerConnectionError, + "timed out", + id="timeout", + ), + pytest.param( + requests.exceptions.ChunkedEncodingError, + PlannerConnectionError, + "Response ended prematurely", + id="chunked_encoding_error", + ), + pytest.param( + requests.RequestException, + PlannerApiError, + "request failed", + id="request_exception", + ), + ], +) +def test_stream_pressure_raises_expected_error_on_connection_failures( + monkeypatch, request_error, expected_error, message +): + """ + arrange: Fake session raises a requests error during connection. + act: Consume `stream_pressure('small')`. + assert: Raises the expected planner client error wrapper. + """ + cfg = PlannerConfiguration(base_url="http://localhost:8080", token="t") + client = PlannerClient(cfg) + + fake_session = _FakeSession() + monkeypatch.setattr(client, "_session", fake_session) + + def _raise_error(url, headers, timeout, stream=False): + """Raise the parametrized requests error.""" + raise request_error(message) + + monkeypatch.setattr(fake_session, "get", _raise_error) + + with pytest.raises(expected_error, match=message): + next(client.stream_pressure("small")) + + +@pytest.mark.parametrize( + ("stream_error", "expected_error", "message"), + [ + pytest.param( + requests.ConnectionError("stream connection dropped"), + PlannerConnectionError, + "stream connection dropped", + id="connection_error", + ), + pytest.param( + requests.Timeout("stream timed out"), + PlannerConnectionError, + "stream timed out", + id="timeout", + ), + pytest.param( + requests.exceptions.ChunkedEncodingError("Response ended prematurely"), + PlannerConnectionError, + "Response ended prematurely", + id="chunked_encoding_error", + ), + pytest.param( + requests.RequestException("stream request failed"), + PlannerApiError, + "stream request failed", + id="request_exception", + ), + ], +) +def test_stream_pressure_raises_expected_error_on_midstream_request_failures( + monkeypatch, stream_error, expected_error, message +): + """ + arrange: Fake response raises a requests error during `iter_lines()`. + act: Consume `stream_pressure('small')` until the stream fails. + assert: Raises the expected planner client error wrapper. + """ + cfg = PlannerConfiguration(base_url="http://localhost:8080", token="t") + client = PlannerClient(cfg) + + fake_session = _FakeSession() + monkeypatch.setattr(client, "_session", fake_session) + monkeypatch.setattr( + fake_session, + "get", + _fake_get_stream_response([json.dumps({"small": 2})], stream_error=stream_error), + ) + + stream = client.stream_pressure("small") + + assert next(stream).pressure == 2 + with pytest.raises(expected_error, match=message): + next(stream)