openzeppelin_relayer/services/provider/
rpc_health_store.rs

1//! RPC Health Store
2//!
3//! This module provides a shared in-memory store for RPC health metadata.
4//! Health state is shared across all relayers using the same RPC URL.
5
6use std::collections::HashMap;
7use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
8
9use chrono::{DateTime, Utc};
10use once_cell::sync::Lazy;
11use tracing::{debug, warn};
12
/// Metadata for tracking RPC endpoint health.
///
/// Stored per RPC URL inside [`RpcHealthStore`]. Both fields are interpreted
/// relative to the caller-supplied threshold / expiration durations passed to
/// `mark_failed` and `is_paused`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct RpcConfigMetadata {
    /// Timestamps of recent failures. Only failures within the expiration window are kept.
    /// Limited to a reasonable size (threshold * 2) to prevent unbounded growth.
    pub failure_timestamps: Vec<DateTime<Utc>>,
    /// Timestamp until which this RPC endpoint is paused due to failures.
    /// If set and in the future, the endpoint is considered paused.
    pub paused_until: Option<DateTime<Utc>>,
}
23
/// Shared in-memory store for RPC health metadata.
///
/// This store is shared across all relayers, so health state for a given RPC URL
/// is consistent across all relayers using that URL.
pub struct RpcHealthStore {
    /// Health metadata keyed by RPC URL. `RwLock` allows many concurrent
    /// readers with exclusive writers; `Arc` lets the map be shared.
    metadata: Arc<RwLock<HashMap<String, RpcConfigMetadata>>>,
}
31
// Process-wide singleton backing `RpcHealthStore::instance()`.
// Lazily initialized on first access.
static HEALTH_STORE: Lazy<RpcHealthStore> = Lazy::new(|| RpcHealthStore {
    metadata: Arc::new(RwLock::new(HashMap::new())),
});
35
36impl RpcHealthStore {
37    /// Gets the singleton instance of the health store.
38    pub fn instance() -> &'static RpcHealthStore {
39        &HEALTH_STORE
40    }
41
42    /// Acquires a read lock, recovering from poison if necessary.
43    ///
44    /// If the lock is poisoned (a thread panicked while holding it), we recover
45    /// by extracting the inner data. This prevents cascading failures.
46    fn acquire_read_lock(&self) -> RwLockReadGuard<'_, HashMap<String, RpcConfigMetadata>> {
47        match self.metadata.read() {
48            Ok(guard) => guard,
49            Err(poisoned) => {
50                warn!("RpcHealthStore read lock was poisoned, recovering");
51                poisoned.into_inner()
52            }
53        }
54    }
55
56    /// Acquires a write lock, recovering from poison if necessary.
57    ///
58    /// If the lock is poisoned (a thread panicked while holding it), we recover
59    /// by extracting the inner data. This prevents cascading failures.
60    fn acquire_write_lock(&self) -> RwLockWriteGuard<'_, HashMap<String, RpcConfigMetadata>> {
61        match self.metadata.write() {
62            Ok(guard) => guard,
63            Err(poisoned) => {
64                warn!("RpcHealthStore write lock was poisoned, recovering");
65                poisoned.into_inner()
66            }
67        }
68    }
69
70    /// Gets metadata for a given RPC URL.
71    ///
72    /// Returns default (empty) metadata if the URL is not in the store.
73    ///
74    /// # Arguments
75    /// * `url` - The RPC endpoint URL
76    ///
77    /// # Returns
78    /// * `RpcConfigMetadata` - The metadata for the URL, or default if not found
79    pub fn get_metadata(&self, url: &str) -> RpcConfigMetadata {
80        let store = self.acquire_read_lock();
81        store.get(url).cloned().unwrap_or_default()
82    }
83
84    /// Updates metadata for a given RPC URL.
85    ///
86    /// # Arguments
87    /// * `url` - The RPC endpoint URL
88    /// * `metadata` - The metadata to store
89    pub fn update_metadata(&self, url: &str, metadata: RpcConfigMetadata) {
90        let mut store = self.acquire_write_lock();
91        store.insert(url.to_string(), metadata);
92    }
93
94    /// Marks an RPC endpoint as failed, adding a failure timestamp.
95    /// If the number of recent failures (within expiration window) reaches the threshold, pauses the endpoint.
96    /// Stale failures (older than failure_expiration) are automatically removed.
97    ///
98    /// # Arguments
99    /// * `url` - The RPC endpoint URL
100    /// * `threshold` - The number of failures before pausing
101    /// * `pause_duration` - The duration to pause for
102    /// * `failure_expiration` - Duration after which failures are considered stale and removed
103    pub fn mark_failed(
104        &self,
105        url: &str,
106        threshold: u32,
107        pause_duration: chrono::Duration,
108        failure_expiration: chrono::Duration,
109    ) {
110        let mut store = self.acquire_write_lock();
111        let mut metadata = store.get(url).cloned().unwrap_or_default();
112
113        let now = Utc::now();
114
115        // Remove stale failures (older than expiration window)
116        metadata
117            .failure_timestamps
118            .retain(|&ts| now - ts <= failure_expiration);
119
120        // Add current failure timestamp
121        metadata.failure_timestamps.push(now);
122
123        // Limit size to prevent unbounded growth (keep slightly more than threshold for safety)
124        let max_size = (threshold * 2) as usize;
125        if metadata.failure_timestamps.len() > max_size {
126            // Keep only the most recent failures (they're already in chronological order)
127            // Remove the oldest ones
128            let remove_count = metadata.failure_timestamps.len() - max_size;
129            metadata.failure_timestamps.drain(0..remove_count);
130        }
131
132        // Check if we've reached the threshold
133        let recent_failures = metadata.failure_timestamps.len() as u32;
134        let was_paused = metadata.paused_until.is_some();
135
136        if recent_failures >= threshold {
137            let paused_until = now + pause_duration;
138            metadata.paused_until = Some(paused_until);
139
140            if !was_paused {
141                // Provider just got paused
142                debug!(
143                    provider_url = %url,
144                    failure_count = %recent_failures,
145                    threshold = %threshold,
146                    paused_until = %paused_until,
147                    pause_duration_secs = %pause_duration.num_seconds(),
148                    "RPC provider paused due to failures"
149                );
150            } else {
151                // Provider was already paused, but pause duration extended
152                debug!(
153                    provider_url = %url,
154                    failure_count = %recent_failures,
155                    threshold = %threshold,
156                    paused_until = %paused_until,
157                    pause_duration_secs = %pause_duration.num_seconds(),
158                    "RPC provider pause extended due to additional failures"
159                );
160            }
161        }
162
163        store.insert(url.to_string(), metadata);
164    }
165
166    /// Resets the failure count and unpauses the endpoint.
167    ///
168    /// # Arguments
169    /// * `url` - The RPC endpoint URL
170    pub fn reset_failures(&self, url: &str) {
171        let mut store = self.acquire_write_lock();
172        store.remove(url);
173    }
174
175    /// Resets the failure count only if the provider has failures recorded.
176    ///
177    /// # Arguments
178    /// * `url` - The RPC endpoint URL
179    ///
180    /// # Returns
181    /// * `bool` - True if failures were reset, false if no failures existed
182    #[must_use]
183    pub fn reset_failures_if_exists(&self, url: &str) -> bool {
184        let mut store = self.acquire_write_lock();
185        store.remove(url).is_some()
186    }
187
188    /// Checks if an RPC endpoint is currently paused.
189    ///
190    /// An endpoint is considered paused if:
191    /// - It has reached the failure threshold (within expiration window) AND
192    /// - It has a paused_until timestamp that is in the future
193    ///
194    /// Stale failures (older than failure_expiration) are automatically removed
195    /// to allow the provider to be retried.
196    ///
197    /// This method uses a read-lock-first pattern for better concurrency:
198    /// - First acquires a read lock to check if modification is needed
199    /// - Only upgrades to write lock if cleanup is required
200    ///
201    /// # Arguments
202    /// * `url` - The RPC endpoint URL
203    /// * `threshold` - The failure threshold to check against
204    /// * `failure_expiration` - Duration after which failures are considered stale and removed
205    ///
206    /// # Returns
207    /// * `bool` - True if the endpoint is paused, false otherwise
208    pub fn is_paused(
209        &self,
210        url: &str,
211        threshold: u32,
212        failure_expiration: chrono::Duration,
213    ) -> bool {
214        let now = Utc::now();
215
216        // Fast path: check with read lock first
217        let needs_write = {
218            let store = self.acquire_read_lock();
219            match store.get(url) {
220                None => return false, // No entry, definitely not paused
221                Some(meta) => {
222                    // Check if we need to modify anything
223                    let has_stale_failures = meta
224                        .failure_timestamps
225                        .iter()
226                        .any(|&ts| now - ts > failure_expiration);
227                    let pause_expired = meta
228                        .paused_until
229                        .is_some_and(|paused_until| now >= paused_until);
230                    let needs_cleanup =
231                        meta.failure_timestamps.is_empty() && meta.paused_until.is_none();
232
233                    if has_stale_failures || pause_expired || needs_cleanup {
234                        // Need write lock to clean up
235                        true
236                    } else {
237                        // No cleanup needed, can determine pause status with read lock
238                        let recent_failures = meta.failure_timestamps.len() as u32;
239                        if recent_failures >= threshold {
240                            if let Some(paused_until) = meta.paused_until {
241                                return now < paused_until;
242                            }
243                        }
244                        return false;
245                    }
246                }
247            }
248        };
249
250        // Slow path: need write lock for cleanup
251        if needs_write {
252            self.is_paused_with_cleanup(url, threshold, failure_expiration, now)
253        } else {
254            false
255        }
256    }
257
258    /// Internal helper that performs pause check with cleanup (requires write lock).
259    ///
260    /// This is called when the read-lock check determined that modifications are needed.
261    fn is_paused_with_cleanup(
262        &self,
263        url: &str,
264        threshold: u32,
265        failure_expiration: chrono::Duration,
266        now: DateTime<Utc>,
267    ) -> bool {
268        let mut store = self.acquire_write_lock();
269
270        // Re-check after acquiring write lock (state may have changed)
271        let Some(meta) = store.get_mut(url) else {
272            return false;
273        };
274
275        // Remove stale failures (older than expiration window)
276        meta.failure_timestamps
277            .retain(|&ts| now - ts <= failure_expiration);
278
279        // If pause has expired, clear it
280        if let Some(paused_until) = meta.paused_until {
281            if now >= paused_until {
282                // Pause expired - clear pause (but keep failure timestamps for tracking)
283                debug!(
284                    provider_url = %url,
285                    paused_until = %paused_until,
286                    current_time = %now,
287                    remaining_failures = %meta.failure_timestamps.len(),
288                    "RPC provider pause expired, provider available again"
289                );
290                meta.paused_until = None;
291                // If no recent failures remain, clear everything
292                if meta.failure_timestamps.is_empty() {
293                    store.remove(url);
294                }
295                return false;
296            }
297        }
298
299        // Check if paused: must have reached threshold AND be within pause window
300        let recent_failures = meta.failure_timestamps.len() as u32;
301        if recent_failures >= threshold {
302            if let Some(paused_until) = meta.paused_until {
303                return now < paused_until;
304            }
305            // If we've reached threshold but no pause_until is set, not paused
306            return false;
307        }
308
309        // If no recent failures remain, remove the entry
310        if meta.failure_timestamps.is_empty() && meta.paused_until.is_none() {
311            store.remove(url);
312        }
313
314        false
315    }
316
317    /// Clears all metadata from the store.
318    /// Primarily useful for testing.
319    #[cfg(test)]
320    pub fn clear_all(&self) {
321        let mut store = self.acquire_write_lock();
322        store.clear();
323    }
324}
325
326#[cfg(test)]
327mod tests {
328    use super::*;
329
330    #[test]
331    fn test_get_metadata_returns_default_when_not_found() {
332        let store = RpcHealthStore::instance();
333
334        // Use a unique URL to avoid interference from other tests
335        let url = "https://test-get-metadata.example.com";
336        let metadata = store.get_metadata(url);
337        assert_eq!(metadata, RpcConfigMetadata::default());
338        assert_eq!(metadata.failure_timestamps.len(), 0);
339        assert_eq!(metadata.paused_until, None);
340    }
341
342    #[test]
343    fn test_update_and_get_metadata() {
344        let store = RpcHealthStore::instance();
345
346        let url = "https://test-update-metadata.example.com";
347        let mut metadata = RpcConfigMetadata::default();
348        metadata.failure_timestamps.push(Utc::now());
349        metadata.failure_timestamps.push(Utc::now());
350        metadata.failure_timestamps.push(Utc::now());
351
352        store.update_metadata(url, metadata.clone());
353
354        let retrieved = store.get_metadata(url);
355        assert_eq!(
356            retrieved.failure_timestamps.len(),
357            metadata.failure_timestamps.len()
358        );
359    }
360
    // Walks the failure counter 1 -> 2 -> 3 against threshold 3: below the
    // threshold the endpoint stays unpaused, and the third failure sets
    // `paused_until`. Also re-reads metadata after each `is_paused` call to
    // verify the read path does not clean up still-valid entries.
    #[test]
    fn test_mark_failed_increments_count() {
        let store = RpcHealthStore::instance();

        // Use a unique URL to avoid interference
        let url = "https://test-increment-count.example.com";
        let expiration = chrono::Duration::seconds(60);
        let threshold = 3;

        // First failure
        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
        let metadata = store.get_metadata(url);
        assert_eq!(
            metadata.failure_timestamps.len(),
            1,
            "Should have 1 failure after first mark"
        );
        assert!(metadata.paused_until.is_none(), "Should not be paused yet");
        // Check pause status after verifying metadata
        assert!(
            !store.is_paused(url, threshold, expiration),
            "Should not be paused with 1 failure"
        );
        // Verify metadata still exists after is_paused call
        let metadata_after = store.get_metadata(url);
        assert_eq!(
            metadata_after.failure_timestamps.len(),
            1,
            "Should still have 1 failure"
        );

        // Second failure
        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
        let metadata = store.get_metadata(url);
        assert_eq!(
            metadata.failure_timestamps.len(),
            2,
            "Should have 2 failures after second mark"
        );
        assert!(metadata.paused_until.is_none(), "Should not be paused yet");
        assert!(
            !store.is_paused(url, threshold, expiration),
            "Should not be paused with 2 failures"
        );
        let metadata_after = store.get_metadata(url);
        assert_eq!(
            metadata_after.failure_timestamps.len(),
            2,
            "Should still have 2 failures"
        );

        // Third failure - should pause
        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
        let metadata = store.get_metadata(url);
        assert_eq!(
            metadata.failure_timestamps.len(),
            3,
            "Should have 3 failures after third mark"
        );
        assert!(
            metadata.paused_until.is_some(),
            "Should be paused after reaching threshold"
        );
        assert!(
            store.is_paused(url, threshold, expiration),
            "Should be paused"
        );
        // A positive is_paused answer must not mutate the entry either.
        let metadata_after = store.get_metadata(url);
        assert_eq!(
            metadata_after.failure_timestamps.len(),
            3,
            "Should still have 3 failures"
        );
        assert!(
            metadata_after.paused_until.is_some(),
            "Should still be paused"
        );
    }
439
440    #[test]
441    fn test_reset_failures() {
442        let store = RpcHealthStore::instance();
443
444        let url = "https://test-reset-failures.example.com";
445        let expiration = chrono::Duration::seconds(60);
446        let threshold = 3;
447
448        // Mark failed 3 times to trigger pause
449        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
450        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
451        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
452        assert!(store.is_paused(url, threshold, expiration));
453
454        store.reset_failures(url);
455        assert!(!store.is_paused(url, threshold, expiration));
456        let metadata = store.get_metadata(url);
457        assert_eq!(metadata, RpcConfigMetadata::default());
458    }
459
460    #[test]
461    fn test_is_paused_with_failure_count_below_threshold() {
462        let store = RpcHealthStore::instance();
463
464        let url = "https://test-below-threshold.example.com";
465        let expiration = chrono::Duration::seconds(60);
466        let mut metadata = RpcConfigMetadata::default();
467        metadata.failure_timestamps.push(Utc::now());
468        store.update_metadata(url, metadata);
469
470        // Should not be paused if below threshold
471        assert!(!store.is_paused(url, 3, expiration));
472    }
473
474    #[test]
475    fn test_is_paused_with_time_based_pause() {
476        let store = RpcHealthStore::instance();
477
478        let url = "https://test-time-based-pause.example.com";
479        let expiration = chrono::Duration::seconds(60);
480        let mut metadata = RpcConfigMetadata::default();
481        // Add 3 failures to reach threshold
482        metadata.failure_timestamps.push(Utc::now());
483        metadata.failure_timestamps.push(Utc::now());
484        metadata.failure_timestamps.push(Utc::now());
485        metadata.paused_until = Some(Utc::now() + chrono::Duration::seconds(60));
486        store.update_metadata(url, metadata);
487
488        assert!(store.is_paused(url, 3, expiration));
489    }
490
491    #[test]
492    fn test_is_paused_expires_after_time() {
493        let store = RpcHealthStore::instance();
494
495        let url = "https://test-expires-after-time.example.com";
496        let expiration = chrono::Duration::seconds(60);
497        let mut metadata = RpcConfigMetadata::default();
498        // Set pause to expire in the past
499        metadata.paused_until = Some(Utc::now() - chrono::Duration::seconds(60));
500        store.update_metadata(url, metadata);
501
502        // Should not be paused if pause time has expired
503        assert!(!store.is_paused(url, 3, expiration));
504    }
505
506    #[test]
507    fn test_shared_state_across_instances() {
508        let store1 = RpcHealthStore::instance();
509        let store2 = RpcHealthStore::instance();
510
511        let url = "https://test-shared-state.example.com";
512        let expiration = chrono::Duration::seconds(60);
513        let threshold = 3;
514
515        // Mark failed 3 times to trigger pause
516        store1.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
517        store1.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
518        store1.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
519
520        // Both instances should see the same state
521        assert!(store1.is_paused(url, threshold, expiration));
522        assert!(store2.is_paused(url, threshold, expiration));
523
524        let metadata1 = store1.get_metadata(url);
525        let metadata2 = store2.get_metadata(url);
526        assert_eq!(
527            metadata1.failure_timestamps.len(),
528            metadata2.failure_timestamps.len()
529        );
530    }
531
532    #[test]
533    fn test_stale_failures_are_expired() {
534        let store = RpcHealthStore::instance();
535
536        let url = "https://test-stale-failures.example.com";
537        let expiration = chrono::Duration::seconds(60);
538
539        // Add failures that are old (outside expiration window)
540        let mut metadata = RpcConfigMetadata::default();
541        metadata
542            .failure_timestamps
543            .push(Utc::now() - chrono::Duration::seconds(120)); // 2 minutes ago
544        metadata
545            .failure_timestamps
546            .push(Utc::now() - chrono::Duration::seconds(90)); // 1.5 minutes ago
547        store.update_metadata(url, metadata);
548
549        // Old failures should be expired when checking pause status
550        assert!(!store.is_paused(url, 3, expiration));
551
552        // Metadata should be cleaned up (no recent failures)
553        let metadata = store.get_metadata(url);
554        assert_eq!(metadata.failure_timestamps.len(), 0);
555    }
556
557    #[test]
558    fn test_failure_timestamps_size_limit() {
559        let store = RpcHealthStore::instance();
560
561        let url = "https://test-size-limit.example.com";
562        let expiration = chrono::Duration::seconds(60);
563        let threshold = 3;
564
565        // Add many failures quickly (within expiration window)
566        for _ in 0..10 {
567            store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
568        }
569
570        let metadata = store.get_metadata(url);
571        // Should be limited to threshold * 2 = 6 entries
572        assert!(metadata.failure_timestamps.len() <= (threshold * 2) as usize);
573    }
574
575    #[test]
576    fn test_mixed_stale_and_recent_failures() {
577        let store = RpcHealthStore::instance();
578
579        let url = "https://test-mixed-failures.example.com";
580        let expiration = chrono::Duration::seconds(60);
581        let threshold = 3;
582
583        // Add old failures manually
584        let mut metadata = RpcConfigMetadata::default();
585        metadata
586            .failure_timestamps
587            .push(Utc::now() - chrono::Duration::seconds(120)); // Stale
588        metadata
589            .failure_timestamps
590            .push(Utc::now() - chrono::Duration::seconds(90)); // Stale
591        store.update_metadata(url, metadata);
592
593        // Add recent failures - mark_failed will remove stale ones first
594        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
595        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
596
597        let metadata = store.get_metadata(url);
598        // Should only have 2 recent failures (stale ones removed during mark_failed)
599        assert_eq!(metadata.failure_timestamps.len(), 2);
600        assert!(!store.is_paused(url, threshold, expiration)); // Below threshold
601    }
602
    // Verifies that a failure recorded while the endpoint is already paused
    // pushes `paused_until` further into the future (pause extension) instead
    // of leaving the original deadline in place.
    #[test]
    fn test_pause_extension_when_already_paused() {
        let store = RpcHealthStore::instance();

        // Use a unique URL to avoid interference
        let url = "https://test-pause-extension.example.com";
        let expiration = chrono::Duration::seconds(60);
        let pause_duration = chrono::Duration::seconds(60);
        let threshold = 3;

        // Mark failed 3 times to trigger pause
        store.mark_failed(url, threshold, pause_duration, expiration);
        store.mark_failed(url, threshold, pause_duration, expiration);
        store.mark_failed(url, threshold, pause_duration, expiration);

        // Verify pause was set (check metadata before calling is_paused)
        let metadata1 = store.get_metadata(url);
        assert_eq!(
            metadata1.failure_timestamps.len(),
            3,
            "Should have 3 failures"
        );
        assert!(
            metadata1.paused_until.is_some(),
            "Should be paused after 3 failures"
        );
        let initial_paused_until = metadata1.paused_until.unwrap();

        // Verify it's actually paused
        assert!(
            store.is_paused(url, threshold, expiration),
            "Should be paused"
        );
        // Verify metadata still exists after is_paused
        let metadata1_after = store.get_metadata(url);
        assert_eq!(
            metadata1_after.failure_timestamps.len(),
            3,
            "Should still have 3 failures"
        );

        // Wait a bit (simulate time passing)
        // The sleep guarantees the extended deadline is strictly later,
        // since paused_until = now + pause_duration at mark time.
        std::thread::sleep(std::time::Duration::from_millis(10));

        // Mark failed again while already paused - should extend pause
        store.mark_failed(url, threshold, pause_duration, expiration);

        let metadata2 = store.get_metadata(url);
        assert_eq!(
            metadata2.failure_timestamps.len(),
            4,
            "Should have 4 failures now"
        );
        assert!(
            metadata2.paused_until.is_some(),
            "Should still be paused after 4th failure"
        );
        let new_paused_until = metadata2.paused_until.unwrap();

        // Pause should be extended (new paused_until should be later)
        assert!(
            new_paused_until > initial_paused_until,
            "Pause should be extended"
        );
        assert!(
            store.is_paused(url, threshold, expiration),
            "Should still be paused"
        );
    }
672
673    #[test]
674    fn test_stale_failures_removed_during_mark_failed() {
675        let store = RpcHealthStore::instance();
676
677        let url = "https://test-stale-removed.example.com";
678        let expiration = chrono::Duration::seconds(60);
679
680        // Add old failures manually
681        let mut metadata = RpcConfigMetadata::default();
682        metadata
683            .failure_timestamps
684            .push(Utc::now() - chrono::Duration::seconds(120)); // Stale
685        metadata
686            .failure_timestamps
687            .push(Utc::now() - chrono::Duration::seconds(90)); // Stale
688        store.update_metadata(url, metadata);
689
690        // Mark failed - should remove stale failures and add new one
691        store.mark_failed(url, 3, chrono::Duration::seconds(60), expiration);
692
693        let metadata = store.get_metadata(url);
694        // Should only have 1 failure (stale ones removed, new one added)
695        assert_eq!(metadata.failure_timestamps.len(), 1);
696        // Verify the remaining failure is recent
697        let remaining_failure = metadata.failure_timestamps[0];
698        let age = Utc::now() - remaining_failure;
699        assert!(age.num_seconds() < 5); // Should be very recent
700    }
701
702    #[test]
703    fn test_pause_expiration_cleans_up_metadata() {
704        let store = RpcHealthStore::instance();
705
706        let url = "https://test-pause-expiration-cleanup.example.com";
707        let expiration = chrono::Duration::seconds(60);
708
709        // Create metadata with expired pause but no recent failures
710        let mut metadata = RpcConfigMetadata::default();
711        metadata.paused_until = Some(Utc::now() - chrono::Duration::seconds(10)); // Expired
712        store.update_metadata(url, metadata);
713
714        // Check pause status - should expire and clean up
715        assert!(!store.is_paused(url, 3, expiration));
716
717        // Metadata should be removed since pause expired and no failures remain
718        let metadata_after = store.get_metadata(url);
719        assert_eq!(metadata_after, RpcConfigMetadata::default());
720    }
721
722    #[test]
723    fn test_pause_expiration_keeps_recent_failures() {
724        let store = RpcHealthStore::instance();
725
726        // Use a unique URL to avoid interference
727        let url = "https://test-pause-expiration.example.com";
728        let expiration = chrono::Duration::seconds(60);
729        let threshold = 3;
730
731        // Create metadata with expired pause but recent failures
732        // We need at least threshold failures to have been paused, but pause is now expired
733        let mut metadata = RpcConfigMetadata::default();
734        // Add threshold failures (but they're recent, not stale)
735        metadata
736            .failure_timestamps
737            .push(Utc::now() - chrono::Duration::seconds(30)); // Recent
738        metadata
739            .failure_timestamps
740            .push(Utc::now() - chrono::Duration::seconds(25)); // Recent
741        metadata
742            .failure_timestamps
743            .push(Utc::now() - chrono::Duration::seconds(20)); // Recent
744        metadata.paused_until = Some(Utc::now() - chrono::Duration::seconds(10)); // Expired pause
745        store.update_metadata(url, metadata);
746
747        // Check pause status - should expire pause but keep failures
748        // Note: is_paused will modify the metadata (remove expired pause)
749        // Since we have threshold failures but pause is expired, should return false
750        assert!(
751            !store.is_paused(url, threshold, expiration),
752            "Should not be paused when pause expired"
753        );
754
755        // Metadata should still exist with failures but no pause
756        let metadata_after = store.get_metadata(url);
757        assert_eq!(
758            metadata_after.failure_timestamps.len(),
759            3,
760            "Should keep all recent failures"
761        );
762        assert!(
763            metadata_after.paused_until.is_none(),
764            "Pause should be cleared"
765        );
766    }
767
768    #[test]
769    fn test_reset_failures_if_exists_returns_true_when_entry_exists() {
770        let store = RpcHealthStore::instance();
771
772        let url = "https://test-reset-if-exists-true.example.com";
773        let expiration = chrono::Duration::seconds(60);
774        let threshold = 3;
775
776        // Add some failures
777        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
778        store.mark_failed(url, threshold, chrono::Duration::seconds(60), expiration);
779
780        // Should return true when entry exists
781        let result = store.reset_failures_if_exists(url);
782        assert!(result, "Should return true when entry existed");
783
784        // Verify entry was removed
785        let metadata = store.get_metadata(url);
786        assert_eq!(metadata, RpcConfigMetadata::default());
787    }
788
789    #[test]
790    fn test_reset_failures_if_exists_returns_false_when_no_entry() {
791        let store = RpcHealthStore::instance();
792
793        // Use a URL that was never used
794        let url = "https://test-reset-if-exists-false.example.com";
795
796        // Should return false when entry doesn't exist
797        let result = store.reset_failures_if_exists(url);
798        assert!(!result, "Should return false when entry doesn't exist");
799    }
800
801    #[test]
802    fn test_is_paused_fast_path_no_cleanup_needed() {
803        let store = RpcHealthStore::instance();
804
805        // This test verifies the fast path in is_paused (lines 237-244)
806        // where no cleanup is needed and we can return directly with read lock
807        let url = "https://test-fast-path.example.com";
808        let expiration = chrono::Duration::seconds(60);
809        let threshold = 3;
810
811        // Create metadata with recent failures (not stale) and future pause
812        let mut metadata = RpcConfigMetadata::default();
813        metadata.failure_timestamps.push(Utc::now());
814        metadata.failure_timestamps.push(Utc::now());
815        metadata.failure_timestamps.push(Utc::now());
816        metadata.paused_until = Some(Utc::now() + chrono::Duration::seconds(60));
817        store.update_metadata(url, metadata);
818
819        // This should hit the fast path and return true without needing write lock
820        assert!(
821            store.is_paused(url, threshold, expiration),
822            "Should be paused via fast path"
823        );
824
825        // Verify the metadata is unchanged (no cleanup performed)
826        let metadata_after = store.get_metadata(url);
827        assert_eq!(
828            metadata_after.failure_timestamps.len(),
829            3,
830            "Should have 3 failures unchanged"
831        );
832        assert!(
833            metadata_after.paused_until.is_some(),
834            "Pause should be unchanged"
835        );
836    }
837
838    #[test]
839    fn test_is_paused_fast_path_below_threshold_returns_false() {
840        let store = RpcHealthStore::instance();
841
842        // Test fast path when failures are below threshold
843        let url = "https://test-fast-path-below-threshold.example.com";
844        let expiration = chrono::Duration::seconds(60);
845        let threshold = 3;
846
847        // Create metadata with recent failures below threshold
848        let mut metadata = RpcConfigMetadata::default();
849        metadata.failure_timestamps.push(Utc::now());
850        metadata.failure_timestamps.push(Utc::now());
851        // Only 2 failures, threshold is 3
852        store.update_metadata(url, metadata);
853
854        // Should hit fast path and return false
855        assert!(
856            !store.is_paused(url, threshold, expiration),
857            "Should not be paused - below threshold"
858        );
859    }
860
861    #[test]
862    fn test_is_paused_threshold_reached_but_no_pause_until() {
863        let store = RpcHealthStore::instance();
864
865        // Test edge case: threshold reached but no paused_until set
866        // This can happen if metadata is manipulated directly
867        let url = "https://test-threshold-no-pause.example.com";
868        let expiration = chrono::Duration::seconds(60);
869        let threshold = 3;
870
871        // Create metadata with failures at threshold but no paused_until
872        let mut metadata = RpcConfigMetadata::default();
873        metadata.failure_timestamps.push(Utc::now());
874        metadata.failure_timestamps.push(Utc::now());
875        metadata.failure_timestamps.push(Utc::now());
876        // Note: paused_until is None
877        store.update_metadata(url, metadata);
878
879        // Should return false because no paused_until is set
880        // This tests line 305-306 in is_paused_with_cleanup
881        assert!(
882            !store.is_paused(url, threshold, expiration),
883            "Should not be paused - no paused_until set despite threshold reached"
884        );
885    }
886
887    #[test]
888    fn test_is_paused_cleans_up_empty_entry() {
889        let store = RpcHealthStore::instance();
890
891        // Test that empty entries get cleaned up
892        let url = "https://test-cleanup-empty.example.com";
893        let expiration = chrono::Duration::seconds(60);
894
895        // Create an empty metadata entry (simulating a state after all failures expired)
896        let metadata = RpcConfigMetadata::default();
897        store.update_metadata(url, metadata);
898
899        // Calling is_paused should clean up the empty entry
900        assert!(!store.is_paused(url, 3, expiration));
901
902        // Verify entry was removed (get_metadata returns default for non-existent entries)
903        // We can't directly verify removal, but the behavior is correct
904    }
905
906    #[test]
907    fn test_mark_failed_logs_new_pause_vs_extended_pause() {
908        let store = RpcHealthStore::instance();
909
910        // This test exercises both logging branches in mark_failed:
911        // - Line 142-149: "RPC provider paused due to failures" (first pause)
912        // - Line 151-159: "RPC provider pause extended" (already paused)
913        let url = "https://test-pause-logging.example.com";
914        let expiration = chrono::Duration::seconds(60);
915        let pause_duration = chrono::Duration::seconds(60);
916        let threshold = 3;
917
918        // First pause (exercises lines 142-149)
919        store.mark_failed(url, threshold, pause_duration, expiration);
920        store.mark_failed(url, threshold, pause_duration, expiration);
921        store.mark_failed(url, threshold, pause_duration, expiration);
922
923        let metadata1 = store.get_metadata(url);
924        assert!(metadata1.paused_until.is_some(), "Should be paused");
925
926        // Extended pause (exercises lines 151-159)
927        store.mark_failed(url, threshold, pause_duration, expiration);
928
929        let metadata2 = store.get_metadata(url);
930        assert!(
931            metadata2.paused_until.is_some(),
932            "Should still be paused after extension"
933        );
934    }
935}