openzeppelin_relayer/metrics/
mod.rs

1//! Metrics module for the application.
2//!
3//! - This module contains the global Prometheus registry.
4//! - Defines specific metrics for the application.
5
6pub mod middleware;
7use lazy_static::lazy_static;
8use prometheus::{
9    CounterVec, Encoder, Gauge, GaugeVec, HistogramOpts, HistogramVec, Opts, Registry, TextEncoder,
10};
11use sysinfo::{Disks, System};
12
13lazy_static! {
14    // Global Prometheus registry.
15    pub static ref REGISTRY: Registry = Registry::new();
16
17    // Counter: Total HTTP requests.
18    pub static ref REQUEST_COUNTER: CounterVec = {
19        let opts = Opts::new("requests_total", "Total number of HTTP requests");
20        let counter_vec = CounterVec::new(opts, &["endpoint", "method", "status"]).unwrap();
21        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
22        counter_vec
23    };
24
25    // Counter: Total HTTP requests by raw URI.
26    pub static ref RAW_REQUEST_COUNTER: CounterVec = {
27      let opts = Opts::new("raw_requests_total", "Total number of HTTP requests by raw URI");
28      let counter_vec = CounterVec::new(opts, &["raw_uri", "method", "status"]).unwrap();
29      REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
30      counter_vec
31    };
32
33    // Histogram for request latency in seconds.
34    pub static ref REQUEST_LATENCY: HistogramVec = {
35      let histogram_opts = HistogramOpts::new("request_latency_seconds", "Request latency in seconds")
36          .buckets(vec![0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0]);
37      let histogram_vec = HistogramVec::new(histogram_opts, &["endpoint", "method", "status"]).unwrap();
38      REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
39      histogram_vec
40    };
41
42    // Counter for error responses.
43    pub static ref ERROR_COUNTER: CounterVec = {
44        let opts = Opts::new("error_requests_total", "Total number of error responses");
45        // Using "status" to record the HTTP status code (or a special label like "service_error")
46        let counter_vec = CounterVec::new(opts, &["endpoint", "method", "status"]).unwrap();
47        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
48        counter_vec
49    };
50
51    // Gauge for CPU usage percentage.
52    pub static ref CPU_USAGE: Gauge = {
53      let gauge = Gauge::new("cpu_usage_percentage", "Current CPU usage percentage").unwrap();
54      REGISTRY.register(Box::new(gauge.clone())).unwrap();
55      gauge
56    };
57
58    // Gauge for memory usage percentage.
59    pub static ref MEMORY_USAGE_PERCENT: Gauge = {
60      let gauge = Gauge::new("memory_usage_percentage", "Memory usage percentage").unwrap();
61      REGISTRY.register(Box::new(gauge.clone())).unwrap();
62      gauge
63    };
64
65    // Gauge for memory usage in bytes.
66    pub static ref MEMORY_USAGE: Gauge = {
67        let gauge = Gauge::new("memory_usage_bytes", "Memory usage in bytes").unwrap();
68        REGISTRY.register(Box::new(gauge.clone())).unwrap();
69        gauge
70    };
71
72    // Gauge for total memory in bytes.
73    pub static ref TOTAL_MEMORY: Gauge = {
74      let gauge = Gauge::new("total_memory_bytes", "Total memory in bytes").unwrap();
75      REGISTRY.register(Box::new(gauge.clone())).unwrap();
76      gauge
77    };
78
79    // Gauge for available memory in bytes.
80    pub static ref AVAILABLE_MEMORY: Gauge = {
81        let gauge = Gauge::new("available_memory_bytes", "Available memory in bytes").unwrap();
82        REGISTRY.register(Box::new(gauge.clone())).unwrap();
83        gauge
84    };
85
86    // Gauge for used disk space in bytes.
87    pub static ref DISK_USAGE: Gauge = {
88      let gauge = Gauge::new("disk_usage_bytes", "Used disk space in bytes").unwrap();
89      REGISTRY.register(Box::new(gauge.clone())).unwrap();
90      gauge
91    };
92
93    // Gauge for disk usage percentage.
94    pub static ref DISK_USAGE_PERCENT: Gauge = {
95      let gauge = Gauge::new("disk_usage_percentage", "Disk usage percentage").unwrap();
96      REGISTRY.register(Box::new(gauge.clone())).unwrap();
97      gauge
98    };
99
100    // Gauge for in-flight requests.
101    pub static ref IN_FLIGHT_REQUESTS: GaugeVec = {
102        let gauge_vec = GaugeVec::new(
103            Opts::new("in_flight_requests", "Number of in-flight requests"),
104            &["endpoint"]
105        ).unwrap();
106        REGISTRY.register(Box::new(gauge_vec.clone())).unwrap();
107        gauge_vec
108    };
109
110    // Counter for request timeouts.
111    pub static ref TIMEOUT_COUNTER: CounterVec = {
112        let opts = Opts::new("request_timeouts_total", "Total number of request timeouts");
113        let counter_vec = CounterVec::new(opts, &["endpoint", "method", "timeout_type"]).unwrap();
114        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
115        counter_vec
116    };
117
118    // Gauge for file descriptor count.
119    pub static ref FILE_DESCRIPTORS: Gauge = {
120        let gauge = Gauge::new("file_descriptors_count", "Current file descriptor count").unwrap();
121        REGISTRY.register(Box::new(gauge.clone())).unwrap();
122        gauge
123    };
124
125    // Gauge for CLOSE_WAIT socket count.
126    pub static ref CLOSE_WAIT_SOCKETS: Gauge = {
127        let gauge = Gauge::new("close_wait_sockets_count", "Number of CLOSE_WAIT sockets").unwrap();
128        REGISTRY.register(Box::new(gauge.clone())).unwrap();
129        gauge
130    };
131
132    // Counter for successful transactions (Confirmed status).
133    pub static ref TRANSACTIONS_SUCCESS: CounterVec = {
134        let opts = Opts::new("transactions_success_total", "Total number of successful transactions");
135        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
136        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
137        counter_vec
138    };
139
140    // Counter for failed transactions (Failed, Expired, Canceled statuses).
141    pub static ref TRANSACTIONS_FAILED: CounterVec = {
142        let opts = Opts::new("transactions_failed_total", "Total number of failed transactions");
143        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type", "failure_reason"]).unwrap();
144        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
145        counter_vec
146    };
147
148    // Counter for RPC failures during API requests (before transaction creation).
149    // This tracks failures that occur during operations like get_status, get_balance, etc.
150    // that happen before a transaction is created.
151    pub static ref API_RPC_FAILURES: CounterVec = {
152        let opts = Opts::new("api_rpc_failures_total", "Total number of RPC failures during API requests (before transaction creation)");
153        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type", "operation_name", "error_type"]).unwrap();
154        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
155        counter_vec
156    };
157
158    // Counter for transaction creation (when a transaction is successfully created in the repository).
159    pub static ref TRANSACTIONS_CREATED: CounterVec = {
160        let opts = Opts::new("transactions_created_total", "Total number of transactions created");
161        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
162        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
163        counter_vec
164    };
165
166    // Counter for transaction submissions (when status changes to Submitted).
167    pub static ref TRANSACTIONS_SUBMITTED: CounterVec = {
168        let opts = Opts::new("transactions_submitted_total", "Total number of transactions submitted to the network");
169        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
170        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
171        counter_vec
172    };
173
174    // Gauge for transaction status distribution (current count of transactions in each status).
175    pub static ref TRANSACTIONS_BY_STATUS: GaugeVec = {
176        let gauge_vec = GaugeVec::new(
177            Opts::new("transactions_by_status", "Current number of transactions by status"),
178            &["relayer_id", "network_type", "status"]
179        ).unwrap();
180        REGISTRY.register(Box::new(gauge_vec.clone())).unwrap();
181        gauge_vec
182    };
183
184    // Histogram for transaction processing times (creation to submission).
185    pub static ref TRANSACTION_PROCESSING_TIME: HistogramVec = {
186        let histogram_opts = HistogramOpts::new("transaction_processing_seconds", "Transaction processing time in seconds")
187            .buckets(vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0]);
188        let histogram_vec = HistogramVec::new(histogram_opts, &["relayer_id", "network_type", "stage"]).unwrap();
189        REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
190        histogram_vec
191    };
192
193    // Histogram for RPC call latency.
194    pub static ref RPC_CALL_LATENCY: HistogramVec = {
195        let histogram_opts = HistogramOpts::new("rpc_call_latency_seconds", "RPC call latency in seconds")
196            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0]);
197        let histogram_vec = HistogramVec::new(histogram_opts, &["relayer_id", "network_type", "operation_name"]).unwrap();
198        REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
199        histogram_vec
200    };
201
202    // Counter for plugin calls (tracks requests to /api/v1/plugins/{plugin_id}/call endpoints).
203    pub static ref PLUGIN_CALLS: CounterVec = {
204        let opts = Opts::new("plugin_calls_total", "Total number of plugin calls");
205        let counter_vec = CounterVec::new(opts, &["plugin_id", "method", "status"]).unwrap();
206        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
207        counter_vec
208    };
209}
210
211/// Gather all metrics and encode into the provided format.
212pub fn gather_metrics() -> Result<Vec<u8>, Box<dyn std::error::Error>> {
213    let encoder = TextEncoder::new();
214    let metric_families = REGISTRY.gather();
215    let mut buffer = Vec::new();
216    encoder.encode(&metric_families, &mut buffer)?;
217    Ok(buffer)
218}
219
/// Counts the file descriptors open in the current process.
///
/// - Linux: the number of entries under `/proc/<pid>/fd`.
/// - macOS: the number of lines printed by `lsof -p <pid>`, minus one for the
///   header line.
/// - Any other platform: always `Ok(0)`.
///
/// # Errors
///
/// Returns the I/O error from reading the directory (Linux) or from spawning
/// `lsof` (macOS).
fn get_fd_count() -> Result<usize, std::io::Error> {
    let pid = std::process::id();

    #[cfg(target_os = "linux")]
    {
        let entries = std::fs::read_dir(format!("/proc/{pid}/fd"))?;
        Ok(entries.count())
    }

    #[cfg(target_os = "macos")]
    {
        let pid_arg = pid.to_string();
        let lsof = std::process::Command::new("lsof")
            .arg("-p")
            .arg(&pid_arg)
            .output()?;
        let listing = String::from_utf8_lossy(&lsof.stdout);
        // The first line of the listing is a column header, not a descriptor.
        let line_total = listing.lines().count();
        Ok(line_total.saturating_sub(1))
    }

    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
    {
        // Unsupported platform: report zero rather than failing.
        Ok(0)
    }
}
248
/// Counts sockets in the `CLOSE_WAIT` state, as listed by `netstat -an`.
///
/// On Linux and macOS this runs `netstat -an` and counts the output lines
/// that contain `CLOSE_WAIT`. On any other platform it returns `Ok(0)`.
///
/// `netstat` is invoked directly rather than through a shell pipeline so that
/// a missing or failing `netstat` surfaces as an error. With a
/// `netstat | grep | wc -l` pipeline the exit status is that of `wc`, so such
/// failures would be reported as a count of zero.
///
/// # Errors
///
/// Returns an error if `netstat` cannot be spawned or exits unsuccessfully.
fn get_close_wait_count() -> Result<usize, std::io::Error> {
    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        let output = std::process::Command::new("netstat").arg("-an").output()?;
        if !output.status.success() {
            return Err(std::io::Error::other(format!(
                "netstat exited with {}",
                output.status
            )));
        }

        let listing = String::from_utf8_lossy(&output.stdout);
        let close_wait = listing
            .lines()
            .filter(|line| line.contains("CLOSE_WAIT"))
            .count();
        Ok(close_wait)
    }

    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
    {
        // Unsupported platform: report zero rather than failing.
        Ok(0)
    }
}
269
270/// Updates the system metrics for CPU and memory usage.
271pub fn update_system_metrics() {
272    let mut sys = System::new_all();
273    sys.refresh_all();
274
275    // Overall CPU usage.
276    let cpu_usage = sys.global_cpu_usage();
277    CPU_USAGE.set(cpu_usage as f64);
278
279    // Total memory (in bytes).
280    let total_memory = sys.total_memory();
281    TOTAL_MEMORY.set(total_memory as f64);
282
283    // Available memory (in bytes).
284    let available_memory = sys.available_memory();
285    AVAILABLE_MEMORY.set(available_memory as f64);
286
287    // Used memory (in bytes).
288    let memory_usage = sys.used_memory();
289    MEMORY_USAGE.set(memory_usage as f64);
290
291    // Calculate memory usage percentage
292    let memory_percentage = if total_memory > 0 {
293        (memory_usage as f64 / total_memory as f64) * 100.0
294    } else {
295        0.0
296    };
297    MEMORY_USAGE_PERCENT.set(memory_percentage);
298
299    // Calculate disk usage:
300    // Sum total space and available space across all disks.
301    let disks = Disks::new_with_refreshed_list();
302    let mut total_disk_space: u64 = 0;
303    let mut total_disk_available: u64 = 0;
304    for disk in disks.list() {
305        total_disk_space += disk.total_space();
306        total_disk_available += disk.available_space();
307    }
308    // Used disk space is total minus available ( in bytes).
309    let used_disk_space = total_disk_space.saturating_sub(total_disk_available);
310    DISK_USAGE.set(used_disk_space as f64);
311
312    // Calculate disk usage percentage.
313    let disk_percentage = if total_disk_space > 0 {
314        (used_disk_space as f64 / total_disk_space as f64) * 100.0
315    } else {
316        0.0
317    };
318    DISK_USAGE_PERCENT.set(disk_percentage);
319
320    // Update file descriptor count.
321    if let Ok(fd_count) = get_fd_count() {
322        FILE_DESCRIPTORS.set(fd_count as f64);
323    }
324
325    // Update CLOSE_WAIT socket count.
326    if let Ok(close_wait) = get_close_wait_count() {
327        CLOSE_WAIT_SOCKETS.set(close_wait as f64);
328    }
329}
330
#[cfg(test)]
mod actix_tests {
    //! Integration-style tests for the metric statics, the system metric
    //! updater, and the metrics middleware.

    use super::*;
    use actix_web::{
        dev::{Service, ServiceRequest, ServiceResponse, Transform},
        http, test, Error, HttpResponse,
    };
    use futures::future::{self};
    use middleware::MetricsMiddleware;
    use prometheus::proto::MetricFamily;
    use std::{
        pin::Pin,
        task::{Context, Poll},
    };

    /// Boxed future type shared by both dummy services.
    type DummyFuture = Pin<Box<dyn future::Future<Output = Result<ServiceResponse, Error>>>>;

    /// Dummy service that always returns a successful response (HTTP 200 OK).
    struct DummySuccessService;

    impl Service<ServiceRequest> for DummySuccessService {
        type Response = ServiceResponse;
        type Error = Error;
        type Future = DummyFuture;

        fn poll_ready(&self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&self, req: ServiceRequest) -> Self::Future {
            let resp = req.into_response(HttpResponse::Ok().finish());
            Box::pin(async move { Ok(resp) })
        }
    }

    /// Dummy service that always returns an error.
    struct DummyErrorService;

    impl Service<ServiceRequest> for DummyErrorService {
        type Response = ServiceResponse;
        type Error = Error;
        type Future = DummyFuture;

        fn poll_ready(&self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&self, _req: ServiceRequest) -> Self::Future {
            Box::pin(async move { Err(actix_web::error::ErrorInternalServerError("dummy error")) })
        }
    }

    /// Finds a metric family by exact name.
    fn find_metric_family<'a>(
        name: &str,
        families: &'a [MetricFamily],
    ) -> Option<&'a MetricFamily> {
        families.iter().find(|mf| mf.name() == name)
    }

    /// Returns `true` when the text exposition declares a family named exactly
    /// `family` on a `# TYPE <name> <type>` line.
    ///
    /// A plain substring search is not enough: `requests_total` is a substring
    /// of `raw_requests_total` and `error_requests_total`, so it would match
    /// even if the `requests_total` family were missing.
    fn exposes_family(exposition: &str, family: &str) -> bool {
        exposition.lines().any(|line| {
            line.strip_prefix("# TYPE ")
                .and_then(|rest| rest.split_whitespace().next())
                == Some(family)
        })
    }

    /// Collects the counter values of every metric in `family` whose
    /// `endpoint` label equals `endpoint`.
    fn counter_values_for_endpoint(family: &MetricFamily, endpoint: &str) -> Vec<f64> {
        family
            .get_metric()
            .iter()
            .filter(|metric| {
                metric
                    .get_label()
                    .iter()
                    .any(|label| label.name() == "endpoint" && label.value() == endpoint)
            })
            .map(|metric| metric.get_counter().value())
            .collect()
    }

    #[actix_rt::test]
    async fn test_gather_metrics_contains_expected_names() {
        // Update system metrics
        update_system_metrics();

        // Touch the request metrics so that each vec has at least one child
        // and therefore appears in the output.
        REQUEST_COUNTER
            .with_label_values(&["/test", "GET", "200"])
            .inc();
        RAW_REQUEST_COUNTER
            .with_label_values(&["/test?param=value", "GET", "200"])
            .inc();
        REQUEST_LATENCY
            .with_label_values(&["/test", "GET", "200"])
            .observe(0.1);
        ERROR_COUNTER
            .with_label_values(&["/test", "GET", "500"])
            .inc();

        let metrics = gather_metrics().expect("failed to gather metrics");
        let output = String::from_utf8(metrics).expect("metrics output is not valid UTF-8");

        let expected_families = [
            // System metrics
            "cpu_usage_percentage",
            "memory_usage_percentage",
            "memory_usage_bytes",
            "total_memory_bytes",
            "available_memory_bytes",
            "disk_usage_bytes",
            "disk_usage_percentage",
            // Request metrics
            "requests_total",
            "raw_requests_total",
            "request_latency_seconds",
            "error_requests_total",
        ];
        for family in expected_families {
            assert!(
                exposes_family(&output, family),
                "metric family '{family}' missing from gathered output"
            );
        }
    }

    #[actix_rt::test]
    async fn test_update_system_metrics() {
        // Reset metrics to ensure clean state
        CPU_USAGE.set(0.0);
        TOTAL_MEMORY.set(0.0);
        AVAILABLE_MEMORY.set(0.0);
        MEMORY_USAGE.set(0.0);
        MEMORY_USAGE_PERCENT.set(0.0);
        DISK_USAGE.set(0.0);
        DISK_USAGE_PERCENT.set(0.0);

        // Call the function we're testing
        update_system_metrics();

        // Verify that metrics have been updated with reasonable values
        let cpu_usage = CPU_USAGE.get();
        assert!(
            (0.0..=100.0).contains(&cpu_usage),
            "CPU usage should be between 0-100%, got {cpu_usage}"
        );

        let memory_usage = MEMORY_USAGE.get();
        assert!(
            memory_usage >= 0.0,
            "Memory usage should be >= 0, got {memory_usage}"
        );

        let memory_percent = MEMORY_USAGE_PERCENT.get();
        assert!(
            (0.0..=100.0).contains(&memory_percent),
            "Memory usage percentage should be between 0-100%, got {memory_percent}"
        );

        let total_memory = TOTAL_MEMORY.get();
        assert!(
            total_memory > 0.0,
            "Total memory should be > 0, got {total_memory}"
        );

        let available_memory = AVAILABLE_MEMORY.get();
        assert!(
            available_memory >= 0.0,
            "Available memory should be >= 0, got {available_memory}"
        );

        let disk_usage = DISK_USAGE.get();
        assert!(
            disk_usage >= 0.0,
            "Disk usage should be >= 0, got {disk_usage}"
        );

        let disk_percent = DISK_USAGE_PERCENT.get();
        assert!(
            (0.0..=100.0).contains(&disk_percent),
            "Disk usage percentage should be between 0-100%, got {disk_percent}"
        );

        // Verify that memory usage doesn't exceed total memory
        assert!(
            memory_usage <= total_memory,
            "Memory usage should be <= total memory, got {memory_usage}"
        );

        // Verify that available memory plus used memory doesn't exceed total memory
        assert!(
            (available_memory + memory_usage) <= total_memory,
            "Available memory plus used memory should be <= total memory {}, got {}",
            total_memory,
            available_memory + memory_usage
        );
    }

    #[actix_rt::test]
    async fn test_middleware_success() {
        let req = test::TestRequest::with_uri("/test_success").to_srv_request();

        let middleware = MetricsMiddleware;
        let service = middleware.new_transform(DummySuccessService).await.unwrap();

        let resp = service.call(req).await.unwrap();
        assert_eq!(resp.response().status(), http::StatusCode::OK);

        let families = REGISTRY.gather();
        let counter_fam = find_metric_family("requests_total", &families)
            .expect("requests_total metric family not found");

        let values = counter_values_for_endpoint(counter_fam, "/test_success");
        assert!(
            !values.is_empty(),
            "Expected metric with endpoint '/test_success' not found"
        );
        assert!(values.iter().all(|value| *value >= 1.0));
    }

    #[actix_rt::test]
    async fn test_middleware_error() {
        let req = test::TestRequest::with_uri("/test_error").to_srv_request();

        let middleware = MetricsMiddleware;
        let service = middleware.new_transform(DummyErrorService).await.unwrap();

        let result = service.call(req).await;
        assert!(result.is_err());

        let families = REGISTRY.gather();
        let error_counter_fam = find_metric_family("error_requests_total", &families)
            .expect("error_requests_total metric family not found");

        let values = counter_values_for_endpoint(error_counter_fam, "/test_error");
        assert!(
            !values.is_empty(),
            "Expected error metric with endpoint '/test_error' not found"
        );
        assert!(values.iter().all(|value| *value >= 1.0));
    }
}
561
#[cfg(test)]
mod property_tests {
    use proptest::{prelude::*, test_runner::Config};

    /// Returns `used` as a percentage of `total`, or `0.0` when `total` is zero.
    fn compute_percentage(used: u64, total: u64) -> f64 {
        match total {
            0 => 0.0,
            nonzero => (used as f64 / nonzero as f64) * 100.0,
        }
    }

    proptest! {
        // Run 1000 cases per property.
        #![proptest_config(Config {
            cases: 1000,
            ..Config::default()
        })]

        /// For any non-zero `total` and any `used <= total`, the computed
        /// percentage stays within 0..=100.
        #[test]
        fn prop_compute_percentage((total, used) in {
            (1u64..1_000_000u64).prop_flat_map(|upper| (Just(upper), 0u64..=upper))
        }) {
            let pct = compute_percentage(used, total);
            prop_assert!(pct >= 0.0);
            prop_assert!(pct <= 100.0);
        }

        /// Generated endpoint/method/status label values are non-empty and
        /// bounded in length; an empty endpoint is replaced with "/".
        #[test]
        fn prop_labels_are_reasonable(
            endpoint in ".*",
            method in prop::sample::select(vec![
                "GET".to_string(),
                "POST".to_string(),
                "PUT".to_string(),
                "DELETE".to_string(),
            ])
        ) {
            let endpoint_label = match endpoint.as_str() {
                "" => String::from("/"),
                other => other.to_owned(),
            };
            let method_label = method;

            prop_assert!(endpoint_label.chars().count() <= 1024, "Endpoint label too long");
            prop_assert!(method_label.chars().count() <= 16, "Method label too long");

            let status = String::from("200");

            for label in [endpoint_label, method_label, status] {
                prop_assert!(!label.is_empty());
                prop_assert!(label.len() < 1024);
            }
        }
    }
}
617}