Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

This changelog documents user-relevant changes to the GitHub runner charm.

## 2026-03-13

- Reduced planner pressure stream reconnect log noise by treating transient disconnects/timeouts as warnings without duplicate stack traces, while preserving error-level logging for non-transient planner API failures.

## 2026-03-11

- Added Prometheus metrics for GithubClient operation counts, errors, latency, and rate-limit visibility.
Expand Down
2 changes: 1 addition & 1 deletion github-runner-manager/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

[project]
name = "github-runner-manager"
version = "0.15.0"
version = "0.15.1"
authors = [
{ name = "Canonical IS DevOps", email = "is-devops-team@canonical.com" },
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import grp
import logging
import os
import time
from dataclasses import dataclass
from threading import Event, Lock
from typing import Optional
Expand All @@ -32,6 +31,7 @@
PlannerApiError,
PlannerClient,
PlannerConfiguration,
PlannerConnectionError,
)
from github_runner_manager.platform.github_provider import GitHubRunnerPlatform

Expand Down Expand Up @@ -131,14 +131,29 @@ def start_create_loop(self) -> None:
if self._stop.is_set():
return
self._handle_create_runners(update.pressure)
except PlannerConnectionError as exc:
fallback = max(self._last_pressure or 0, self._config.min_pressure)
logger.warning(
"Pressure stream interrupted for flavor %s (%s), falling back to %s runners.",
self._config.flavor_name,
exc,
fallback,
)
Comment thread
cbartz marked this conversation as resolved.
if self._stop.is_set():
return
self._handle_create_runners(fallback)
self._stop.wait(5)
Comment thread
cbartz marked this conversation as resolved.
Comment thread
cbartz marked this conversation as resolved.
Comment thread
cbartz marked this conversation as resolved.
except PlannerApiError:
fallback = max(self._last_pressure or 0, self._config.min_pressure)
logger.exception(
"Error in pressure stream loop, falling back to %s runners.",
"Error in pressure stream loop for flavor %s, falling back to %s runners.",
self._config.flavor_name,
fallback,
)
Comment thread
cbartz marked this conversation as resolved.
if self._stop.is_set():
return
self._handle_create_runners(fallback)
time.sleep(5)
self._stop.wait(5)
Comment thread
cbartz marked this conversation as resolved.
Comment thread
cbartz marked this conversation as resolved.

def start_reconcile_loop(self) -> None:
"""Periodically reconcile runners: sync state, scale up/down, and clean up."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ class PlannerApiError(Exception):
"""Represents an error while interacting with the planner service."""


class PlannerConnectionError(PlannerApiError):
"""Transient connection error (dropped stream, timeout)."""

Comment thread
cbartz marked this conversation as resolved.

class PlannerClient: # pylint: disable=too-few-public-methods
"""An HTTP client for the planner service."""

Expand All @@ -69,7 +73,8 @@ def stream_pressure(self, name: str) -> Iterable[PressureInfo]:
Parsed pressure updates.

Raises:
PlannerApiError: On HTTP or stream errors.
PlannerConnectionError: On transient stream disconnects or timeouts.
PlannerApiError: On other HTTP or stream errors.
"""
base = str(self._config.base_url).rstrip("/") + "/"
url = urljoin(base, f"api/v1/flavors/{name}/pressure?stream=true")
Expand All @@ -93,9 +98,10 @@ def stream_pressure(self, name: str) -> Iterable[PressureInfo]:
except json.JSONDecodeError:
logger.warning("Skipping malformed stream line: %s", line)
continue
except (requests.ConnectionError, requests.Timeout) as exc:
raise PlannerConnectionError(str(exc)) from exc
except requests.RequestException as exc:
logger.exception("Error while streaming pressure for flavor '%s' from planner.", name)
raise PlannerApiError from exc
raise PlannerApiError(str(exc)) from exc
Comment thread
cbartz marked this conversation as resolved.
Outdated
Comment thread
cbartz marked this conversation as resolved.

@staticmethod
def _create_session() -> requests.Session:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
PressureReconciler,
PressureReconcilerConfig,
)
from github_runner_manager.planner_client import PlannerApiError
from github_runner_manager.planner_client import PlannerApiError, PlannerConnectionError


class _FakeManager:
Expand Down Expand Up @@ -64,69 +64,128 @@ class _FakePlanner:
def __init__(
self,
stream_updates: list[int] | None = None,
stream_raises: bool = False,
stream_exception: Exception | None = None,
):
"""Initialize with configurable stream behavior."""
self._stream_updates = stream_updates or []
self._stream_raises = stream_raises
self._stream_exception = stream_exception

def stream_pressure(self, name: str): # noqa: ARG002
"""Yield pressure updates or raise PlannerApiError based on configuration.

Yields:
Namespace objects with a pressure attribute.
"""
if self._stream_raises:
raise PlannerApiError
if self._stream_exception is not None:
raise self._stream_exception
Comment thread
cbartz marked this conversation as resolved.
for p in self._stream_updates:
yield SimpleNamespace(pressure=p)


def test_min_pressure_used_as_fallback_when_stream_errors(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize(
"planner_error",
[
pytest.param(PlannerApiError("request failed"), id="planner_api_error"),
pytest.param(PlannerConnectionError("connection dropped"), id="planner_connection_error"),
],
)
def test_min_pressure_used_as_fallback_when_stream_errors(
monkeypatch: pytest.MonkeyPatch, planner_error: Exception
):
"""
arrange: A reconciler whose planner stream raises PlannerApiError and no prior pressure.
arrange: A reconciler whose planner stream raises a planner error and no prior pressure.
act: Call start_create_loop.
assert: min_pressure is used as fallback to create runners.
"""
mgr = _FakeManager()
planner = _FakePlanner(stream_raises=True)
planner = _FakePlanner(stream_exception=planner_error)
cfg = PressureReconcilerConfig(flavor_name="small", min_pressure=2)
reconciler = PressureReconciler(mgr, planner, cfg, lock=Lock())

def _stop_after_backoff(_seconds: int):
"""Stop the reconciler after the backoff sleep is triggered."""
def _stop_after_backoff(_seconds: int) -> bool:
"""Stop the reconciler after the backoff wait is triggered."""
reconciler.stop()
return True

monkeypatch.setattr(
"github_runner_manager.manager.pressure_reconciler.time.sleep", _stop_after_backoff
)
monkeypatch.setattr(reconciler._stop, "wait", _stop_after_backoff)
reconciler.start_create_loop()

assert 2 in mgr.created_args


def test_fallback_preserves_last_pressure_when_higher(monkeypatch: pytest.MonkeyPatch):
@pytest.mark.parametrize(
"planner_error",
[
pytest.param(PlannerApiError("request failed"), id="planner_api_error"),
pytest.param(PlannerConnectionError("connection dropped"), id="planner_connection_error"),
],
)
def test_fallback_preserves_last_pressure_when_higher(
monkeypatch: pytest.MonkeyPatch, planner_error: Exception
):
"""
arrange: A reconciler with last_pressure=10 and min_pressure=2 whose stream errors.
act: Call start_create_loop.
assert: The higher last_pressure is used as fallback instead of min_pressure.
"""
mgr = _FakeManager()
planner = _FakePlanner(stream_raises=True)
planner = _FakePlanner(stream_exception=planner_error)
cfg = PressureReconcilerConfig(flavor_name="small", min_pressure=2)
reconciler = PressureReconciler(mgr, planner, cfg, lock=Lock())
reconciler._last_pressure = 10

def _stop_after_backoff(_seconds: int):
"""Stop the reconciler after the backoff sleep is triggered."""
def _stop_after_backoff(_seconds: int) -> bool:
"""Stop the reconciler after the backoff wait is triggered."""
reconciler.stop()
return True

monkeypatch.setattr(reconciler._stop, "wait", _stop_after_backoff)
reconciler.start_create_loop()

assert 10 in mgr.created_args


@pytest.mark.parametrize(
("planner_error", "logger_method"),
[
pytest.param(PlannerApiError("request failed"), "exception", id="planner_api_error"),
pytest.param(
PlannerConnectionError("connection dropped"),
"warning",
id="planner_connection_error",
),
],
)
def test_create_loop_does_not_scale_up_after_stop_requested_during_error_handling(
monkeypatch: pytest.MonkeyPatch, planner_error: Exception, logger_method: str
):
"""
arrange: A reconciler whose stream fails and shutdown is requested during logging.
act: Call start_create_loop.
assert: No fallback runners are created after shutdown is requested.
"""
mgr = _FakeManager()
planner = _FakePlanner(stream_exception=planner_error)
cfg = PressureReconcilerConfig(flavor_name="small", min_pressure=2)
reconciler = PressureReconciler(mgr, planner, cfg, lock=Lock())

def _stop_during_logging(*args, **kwargs):
"""Stop the reconciler while the exception handler is logging."""
reconciler.stop()

def _wait_should_not_run(_seconds: int) -> bool:
"""Fail if backoff wait runs after shutdown has already been requested."""
raise AssertionError("_stop.wait() should not be called after shutdown")

monkeypatch.setattr(
"github_runner_manager.manager.pressure_reconciler.time.sleep", _stop_after_backoff
f"github_runner_manager.manager.pressure_reconciler.logger.{logger_method}",
_stop_during_logging,
)
monkeypatch.setattr(reconciler._stop, "wait", _wait_should_not_run)

reconciler.start_create_loop()

assert 10 in mgr.created_args
assert mgr.created_args == []


def test_timer_loop_uses_cached_pressure(monkeypatch: pytest.MonkeyPatch):
Expand Down
Loading
Loading