Skip to content

Commit 1563d7c

Browse files
committed
Introduce telemetry for observability
This introduces the foundational telemetry infrastructure to improve the observability of LDK Server. It adds a new `/metrics` endpoint exposed on the REST service address, which serves Prometheus-compatible metrics. This endpoint is public and does not require HMAC authentication, allowing for easy integration with monitoring systems. - Added a `Metrics` utility struct to hold all the metrics we need to expose. - Introduced a basic `ldk_server_health_score` gauge (0-100) that reflects the node's operational status based on connection to peer, sync state, and running status. This is the first step in a larger effort to provide comprehensive telemetry. Future updates will expand this to include metrics for channels, balances, payments, and other critical node activities.
1 parent 9ff0939 commit 1563d7c

File tree

5 files changed

+208
-6
lines changed

5 files changed

+208
-6
lines changed

ldk-server-protos/src/endpoints.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@ pub const SPONTANEOUS_SEND_PATH: &str = "SpontaneousSend";
3030
pub const SIGN_MESSAGE_PATH: &str = "SignMessage";
3131
pub const VERIFY_SIGNATURE_PATH: &str = "VerifySignature";
3232
pub const EXPORT_PATHFINDING_SCORES_PATH: &str = "ExportPathfindingScores";
33+
pub const GET_METRICS_PATH: &str = "metrics";

ldk-server/src/main.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ use crate::io::persist::{
5050
use crate::service::NodeService;
5151
use crate::util::config::{load_config, ArgsConfig, ChainSource};
5252
use crate::util::logger::ServerLogger;
53+
use crate::util::metrics::{Metrics, BUILD_METRICS_INTERVAL};
5354
use crate::util::proto_adapter::{forwarded_payment_to_proto, payment_to_proto};
5455
use crate::util::tls::get_or_generate_tls_config;
5556

@@ -256,6 +257,19 @@ fn main() {
256257
}
257258
};
258259
let event_node = Arc::clone(&node);
260+
261+
let metrics_node = Arc::clone(&node);
262+
let mut interval = tokio::time::interval(BUILD_METRICS_INTERVAL);
263+
let metrics = Arc::new(Metrics::new());
264+
let metrics_bg = Arc::clone(&metrics);
265+
266+
runtime.spawn(async move {
267+
loop {
268+
interval.tick().await;
269+
metrics_bg.update_service_health_score(&metrics_node);
270+
}
271+
});
272+
259273
let rest_svc_listener = TcpListener::bind(config_file.rest_service_addr)
260274
.await
261275
.expect("Failed to bind listening port");
@@ -415,7 +429,7 @@ fn main() {
415429
res = rest_svc_listener.accept() => {
416430
match res {
417431
Ok((stream, _)) => {
418-
let node_service = NodeService::new(Arc::clone(&node), Arc::clone(&paginated_store), api_key.clone());
432+
let node_service = NodeService::new(Arc::clone(&node), Arc::clone(&paginated_store), api_key.clone(), Arc::clone(&metrics));
419433
let acceptor = tls_acceptor.clone();
420434
runtime.spawn(async move {
421435
match acceptor.accept(stream).await {

ldk-server/src/service.rs

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@ use ldk_node::Node;
2121
use ldk_server_protos::endpoints::{
2222
BOLT11_RECEIVE_PATH, BOLT11_SEND_PATH, BOLT12_RECEIVE_PATH, BOLT12_SEND_PATH,
2323
CLOSE_CHANNEL_PATH, CONNECT_PEER_PATH, EXPORT_PATHFINDING_SCORES_PATH,
24-
FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, GET_NODE_INFO_PATH, GET_PAYMENT_DETAILS_PATH,
25-
LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH, ONCHAIN_RECEIVE_PATH,
26-
ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SIGN_MESSAGE_PATH, SPLICE_IN_PATH, SPLICE_OUT_PATH,
27-
SPONTANEOUS_SEND_PATH, UPDATE_CHANNEL_CONFIG_PATH, VERIFY_SIGNATURE_PATH,
24+
FORCE_CLOSE_CHANNEL_PATH, GET_BALANCES_PATH, GET_METRICS_PATH, GET_NODE_INFO_PATH,
25+
GET_PAYMENT_DETAILS_PATH, LIST_CHANNELS_PATH, LIST_FORWARDED_PAYMENTS_PATH, LIST_PAYMENTS_PATH,
26+
ONCHAIN_RECEIVE_PATH, ONCHAIN_SEND_PATH, OPEN_CHANNEL_PATH, SIGN_MESSAGE_PATH, SPLICE_IN_PATH,
27+
SPLICE_OUT_PATH, SPONTANEOUS_SEND_PATH, UPDATE_CHANNEL_CONFIG_PATH, VERIFY_SIGNATURE_PATH,
2828
};
2929
use prost::Message;
3030

@@ -52,6 +52,7 @@ use crate::api::spontaneous_send::handle_spontaneous_send_request;
5252
use crate::api::update_channel_config::handle_update_channel_config_request;
5353
use crate::api::verify_signature::handle_verify_signature_request;
5454
use crate::io::persist::paginated_kv_store::PaginatedKVStore;
55+
use crate::util::metrics::Metrics;
5556
use crate::util::proto_adapter::to_error_response;
5657

5758
// Maximum request body size: 10 MB
@@ -63,13 +64,15 @@ pub struct NodeService {
6364
node: Arc<Node>,
6465
paginated_kv_store: Arc<dyn PaginatedKVStore>,
6566
api_key: String,
67+
metrics: Arc<Metrics>,
6668
}
6769

6870
impl NodeService {
6971
pub(crate) fn new(
7072
node: Arc<Node>, paginated_kv_store: Arc<dyn PaginatedKVStore>, api_key: String,
73+
metrics: Arc<Metrics>,
7174
) -> Self {
72-
Self { node, paginated_kv_store, api_key }
75+
Self { node, paginated_kv_store, api_key, metrics }
7376
}
7477
}
7578

@@ -153,6 +156,17 @@ impl Service<Request<Incoming>> for NodeService {
153156
type Future = Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send>>;
154157

155158
fn call(&self, req: Request<Incoming>) -> Self::Future {
159+
// Handle metrics endpoint separately to bypass auth and return plain text
160+
if req.uri().path().len() > 1 && &req.uri().path()[1..] == GET_METRICS_PATH {
161+
let metrics = Arc::clone(&self.metrics);
162+
return Box::pin(async move {
163+
Ok(Response::builder()
164+
.header("Content-Type", "text/plain")
165+
.body(Full::new(Bytes::from(metrics.gather_metrics())))
166+
.unwrap())
167+
});
168+
}
169+
156170
// Extract auth params from headers (validation happens after body is read)
157171
let auth_params = match extract_auth_params(&req) {
158172
Ok(params) => params,

ldk-server/src/util/metrics.rs

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
// This file is Copyright its original authors, visible in version control
2+
// history.
3+
//
4+
// This file is licensed under the Apache License, Version 2.0 <LICENSE-APACHE
5+
// or http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your option.
7+
// You may not use this file except in accordance with one or both of these
8+
// licenses.
9+
10+
use std::sync::atomic::{AtomicI64, Ordering};
11+
use std::time::Duration;
12+
13+
use ldk_node::Node;
14+
15+
pub const BUILD_METRICS_INTERVAL: Duration = Duration::from_secs(60);
16+
17+
/// This represents a [`Metrics`] type that can go up and down in value.
18+
pub struct IntGauge {
19+
inner: AtomicI64,
20+
}
21+
22+
impl IntGauge {
23+
pub fn new() -> Self {
24+
Self { inner: AtomicI64::new(0) }
25+
}
26+
27+
pub fn set(&self, value: i64) {
28+
self.inner.store(value, Ordering::Relaxed);
29+
}
30+
31+
pub fn get(&self) -> i64 {
32+
self.inner.load(Ordering::Relaxed)
33+
}
34+
}
35+
36+
/// Represents the [`Metrics`] output values and type.
37+
pub struct MetricsOutput {
38+
name: String,
39+
help_text: String,
40+
metric_type: String,
41+
value: String,
42+
}
43+
44+
impl MetricsOutput {
45+
pub fn new(name: &str, help_text: &str, metric_type: &str, value: &str) -> Self {
46+
Self {
47+
name: name.to_string(),
48+
help_text: help_text.to_string(),
49+
metric_type: metric_type.to_string(),
50+
value: value.to_string(),
51+
}
52+
}
53+
}
54+
55+
pub struct Metrics {
56+
pub service_health_score: IntGauge,
57+
}
58+
59+
impl Metrics {
60+
pub fn new() -> Self {
61+
Self { service_health_score: IntGauge::new() }
62+
}
63+
64+
pub fn update_service_health_score(&self, node: &Node) {
65+
let score = self.calculate_ldk_server_health_score(node);
66+
self.service_health_score.set(score);
67+
}
68+
69+
/// The health score computation is pretty basic for now and simply
70+
/// calculated based on the impacted events on the components of the
71+
/// `Node`. The events severity and weightage value are as follows:
72+
///
73+
/// - Critical: 0 (Total failure)
74+
/// - Major: 35%
75+
/// - Minor: 25%
76+
///
77+
/// Using the assigned score above, the health score of the `Node` is
78+
/// computed as:
79+
///
80+
/// Health score = Maximum health score - Sum(Event severity score)
81+
///
82+
/// Where:
83+
///
84+
/// - Maximum health score = 100
85+
///
86+
/// If the `Node` is not running/online, i.e `is_running` is false,
87+
/// the severity is critical with a weightage value of -100%.
88+
///
89+
/// If the `Node` is running but isn't connected to any peer yet,
90+
/// the severity is major with a weightage value of -35%.
91+
///
92+
/// If the `Node` is running but the Lightning Wallet hasn't been synced
93+
/// yet, the severity is minor with a weightage value of -25%.
94+
pub fn calculate_ldk_server_health_score(&self, node: &Node) -> i64 {
95+
Self::compute_health_score(
96+
node.status().is_running,
97+
!node.list_peers().is_empty(),
98+
node.status().latest_lightning_wallet_sync_timestamp.is_some(),
99+
)
100+
}
101+
102+
pub fn format_metrics_output(&self, buffer: &mut String, options: &MetricsOutput) {
103+
buffer.push_str(&format!("# HELP {} {}\n", options.name, options.help_text));
104+
buffer.push_str(&format!("# TYPE {} {}\n", options.name, options.metric_type));
105+
buffer.push_str(&format!("{} {}\n", options.name, options.value));
106+
}
107+
108+
pub fn gather_metrics(&self) -> String {
109+
let mut buffer = String::new();
110+
let options = &MetricsOutput::new(
111+
"ldk_server_health_score",
112+
"Current health score (0-100)",
113+
"gauge",
114+
&self.service_health_score.get().to_string(),
115+
);
116+
117+
self.format_metrics_output(&mut buffer, options);
118+
119+
buffer
120+
}
121+
122+
fn compute_health_score(is_running: bool, has_peers: bool, is_wallet_synced: bool) -> i64 {
123+
if !is_running {
124+
return 0;
125+
}
126+
127+
let mut health_score = 100;
128+
129+
if !has_peers {
130+
health_score -= 35;
131+
}
132+
133+
if !is_wallet_synced {
134+
health_score -= 25;
135+
}
136+
137+
health_score
138+
}
139+
}
140+
141+
#[cfg(test)]
142+
mod tests {
143+
144+
use super::*;
145+
146+
#[test]
147+
fn test_compute_health_score() {
148+
// Node is not running
149+
assert_eq!(Metrics::compute_health_score(false, true, true), 0);
150+
assert_eq!(Metrics::compute_health_score(false, false, false), 0);
151+
152+
// Node is running, connected to a peer and wallet is synced
153+
assert_eq!(Metrics::compute_health_score(true, true, true), 100);
154+
155+
// Node is running, not connected to a peer but wallet is synced
156+
assert_eq!(Metrics::compute_health_score(true, false, true), 65);
157+
158+
// Node is running, connected to a peer but wallet is not synced
159+
assert_eq!(Metrics::compute_health_score(true, true, false), 75);
160+
161+
// Node is running, not connected to a peer and wallet is not synced
162+
assert_eq!(Metrics::compute_health_score(true, false, false), 40);
163+
}
164+
165+
#[test]
166+
fn test_gather_metrics_format() {
167+
let metrics = Metrics::new();
168+
169+
let result = metrics.gather_metrics();
170+
assert!(result.contains("ldk_server_health_score"));
171+
}
172+
}

ldk-server/src/util/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@
99

1010
pub(crate) mod config;
1111
pub(crate) mod logger;
12+
pub(crate) mod metrics;
1213
pub(crate) mod proto_adapter;
1314
pub(crate) mod tls;

0 commit comments

Comments
 (0)