Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions core/src/subgraph/context/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ use graph::{
slog::Logger,
};
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::sync::Arc;

use graph::parking_lot::RwLock;
use tokio::sync::mpsc;

use self::instance::SubgraphInstance;
Expand All @@ -44,18 +46,18 @@ impl SubgraphKeepAlive {
}

pub fn remove(&self, deployment_id: &DeploymentId) {
self.alive_map.write().unwrap().remove(deployment_id);
self.alive_map.write().remove(deployment_id);
self.sg_metrics.running_count.dec();
}
pub fn insert(&self, deployment_id: DeploymentId, guard: CancelGuard) {
let old = self.alive_map.write().unwrap().insert(deployment_id, guard);
let old = self.alive_map.write().insert(deployment_id, guard);
if old.is_none() {
self.sg_metrics.running_count.inc();
}
}

pub fn contains(&self, deployment_id: &DeploymentId) -> bool {
self.alive_map.read().unwrap().contains_key(deployment_id)
self.alive_map.read().contains_key(deployment_id)
}
}

Expand Down
5 changes: 5 additions & 0 deletions docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,11 @@ those.
decisions. Set to `true` to turn simulation on, defaults to `false`
- `GRAPH_STORE_CONNECTION_TIMEOUT`: How long to wait to connect to a
database before assuming the database is down in ms. Defaults to 5000ms.
- `GRAPH_STORE_CONNECTION_UNAVAILABLE_RETRY`: When a database shard is marked
unavailable due to connection timeouts, this controls how often to allow a
single probe request through to check if the database has recovered. Only one
request per interval will attempt a connection; all others fail instantly.
Value is in seconds and defaults to 2s.
- `EXPERIMENTAL_SUBGRAPH_VERSION_SWITCHING_MODE`: default is `instant`, set
to `synced` to only switch a named subgraph to a new deployment once it
has synced, making the new deployment the "Pending" version.
Expand Down
1 change: 1 addition & 0 deletions graph/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ futures03 = { version = "0.3.31", package = "futures", features = ["compat"] }
wasmparser = "0.118.1"
thiserror = { workspace = true }
parking_lot = "0.12.5"
portable-atomic = { version = "1.11", features = ["fallback"] }
itertools = "0.14.0"
defer = "0.2"

Expand Down
31 changes: 11 additions & 20 deletions graph/src/components/metrics/registry.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::sync::Arc;

use crate::parking_lot::RwLock;

use prometheus::{labels, Histogram, IntCounterVec};
use prometheus::{IntCounter, IntGauge};
Expand Down Expand Up @@ -109,14 +111,13 @@ impl MetricsRegistry {
};
let counters = CounterVec::new(opts, variable_labels)?;
let id = counters.desc().first().unwrap().id;
let maybe_counter = self.global_counter_vecs.read().unwrap().get(&id).cloned();
let maybe_counter = self.global_counter_vecs.read().get(&id).cloned();
if let Some(counters) = maybe_counter {
Ok(counters)
} else {
self.register(name, Box::new(counters.clone()));
self.global_counter_vecs
.write()
.unwrap()
.insert(id, counters.clone());
Ok(counters)
}
Expand Down Expand Up @@ -161,15 +162,12 @@ impl MetricsRegistry {
) -> Result<Counter, PrometheusError> {
let counter = counter_with_labels(name, help, const_labels)?;
let id = counter.desc().first().unwrap().id;
let maybe_counter = self.global_counters.read().unwrap().get(&id).cloned();
let maybe_counter = self.global_counters.read().get(&id).cloned();
if let Some(counter) = maybe_counter {
Ok(counter)
} else {
self.register(name, Box::new(counter.clone()));
self.global_counters
.write()
.unwrap()
.insert(id, counter.clone());
self.global_counters.write().insert(id, counter.clone());
Ok(counter)
}
}
Expand Down Expand Up @@ -210,15 +208,12 @@ impl MetricsRegistry {
) -> Result<Gauge, PrometheusError> {
let gauge = gauge_with_labels(name, help, const_labels)?;
let id = gauge.desc().first().unwrap().id;
let maybe_gauge = self.global_gauges.read().unwrap().get(&id).cloned();
let maybe_gauge = self.global_gauges.read().get(&id).cloned();
if let Some(gauge) = maybe_gauge {
Ok(gauge)
} else {
self.register(name, Box::new(gauge.clone()));
self.global_gauges
.write()
.unwrap()
.insert(id, gauge.clone());
self.global_gauges.write().insert(id, gauge.clone());
Ok(gauge)
}
}
Expand All @@ -232,15 +227,12 @@ impl MetricsRegistry {
let opts = Opts::new(name, help);
let gauges = GaugeVec::new(opts, variable_labels)?;
let id = gauges.desc().first().unwrap().id;
let maybe_gauge = self.global_gauge_vecs.read().unwrap().get(&id).cloned();
let maybe_gauge = self.global_gauge_vecs.read().get(&id).cloned();
if let Some(gauges) = maybe_gauge {
Ok(gauges)
} else {
self.register(name, Box::new(gauges.clone()));
self.global_gauge_vecs
.write()
.unwrap()
.insert(id, gauges.clone());
self.global_gauge_vecs.write().insert(id, gauges.clone());
Ok(gauges)
}
}
Expand All @@ -254,14 +246,13 @@ impl MetricsRegistry {
let opts = HistogramOpts::new(name, help);
let histograms = HistogramVec::new(opts, variable_labels)?;
let id = histograms.desc().first().unwrap().id;
let maybe_histogram = self.global_histogram_vecs.read().unwrap().get(&id).cloned();
let maybe_histogram = self.global_histogram_vecs.read().get(&id).cloned();
if let Some(histograms) = maybe_histogram {
Ok(histograms)
} else {
self.register(name, Box::new(histograms.clone()));
self.global_histogram_vecs
.write()
.unwrap()
.insert(id, histograms.clone());
Ok(histograms)
}
Expand Down
8 changes: 4 additions & 4 deletions graph/src/components/store/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::fmt;
use std::fmt::Display;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};
use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
Expand All @@ -42,7 +42,7 @@ use crate::env::ENV_VARS;
use crate::internal_error;
use crate::prelude::{s, Attribute, DeploymentHash, ValueType};
use crate::schema::{ast as sast, EntityKey, EntityType, InputSchema};
use crate::util::stats::MovingStats;
use crate::util::stats::AtomicMovingStats;

#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct EntityFilterDerivative(bool);
Expand Down Expand Up @@ -742,8 +742,8 @@ impl Display for DeploymentLocator {
}

// The type that the connection pool uses to track wait times for
// connection checkouts
pub type PoolWaitStats = Arc<RwLock<MovingStats>>;
// connection checkouts. Uses lock-free atomic operations internally.
pub type PoolWaitStats = Arc<AtomicMovingStats>;

/// Determines which columns should be selected in a table.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
Expand Down
18 changes: 10 additions & 8 deletions graph/src/data/graphql/load_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ use prometheus::core::GenericCounter;
use rand::{prelude::Rng, rng};
use std::collections::{HashMap, HashSet};
use std::iter::FromIterator;
use std::sync::{Arc, RwLock};
use std::sync::Arc;
use std::time::{Duration, Instant};

use crate::parking_lot::RwLock;

use crate::components::metrics::{Counter, GaugeVec, MetricsRegistry};
use crate::components::store::{DeploymentId, PoolWaitStats};
use crate::data::graphql::shape_hash::shape_hash;
Expand Down Expand Up @@ -57,7 +59,7 @@ impl ShardEffort {
}

pub fn add(&self, shard: &str, qref: QueryRef, duration: Duration, gauge: &GaugeVec) {
let mut inner = self.inner.write().unwrap();
let mut inner = self.inner.write();
inner.add(qref, duration);
gauge
.with_label_values(&[shard])
Expand All @@ -70,7 +72,7 @@ impl ShardEffort {
/// data for the particular query, return `None` as the effort
/// for the query
pub fn current_effort(&self, qref: &QueryRef) -> (Option<Duration>, Duration) {
let inner = self.inner.read().unwrap();
let inner = self.inner.read();
let total_effort = inner.total.duration();
let query_effort = inner.effort.get(qref).map(|stats| stats.duration());
(query_effort, total_effort)
Expand Down Expand Up @@ -381,7 +383,7 @@ impl LoadManager {

let qref = QueryRef::new(deployment, shape_hash);

if self.jailed_queries.read().unwrap().contains(&qref) {
if self.jailed_queries.read().contains(&qref) {
return if ENV_VARS.load_simulate {
Proceed
} else {
Expand Down Expand Up @@ -426,7 +428,7 @@ impl LoadManager {
"query_effort_ms" => query_effort,
"total_effort_ms" => total_effort,
"ratio" => format!("{:.4}", query_effort/total_effort));
self.jailed_queries.write().unwrap().insert(qref);
self.jailed_queries.write().insert(qref);
return if ENV_VARS.load_simulate {
Proceed
} else {
Expand Down Expand Up @@ -457,15 +459,15 @@ impl LoadManager {
}

fn overloaded(&self, wait_stats: &PoolWaitStats) -> (bool, Duration) {
let store_avg = wait_stats.read().unwrap().average();
let store_avg = wait_stats.average();
let overloaded = store_avg
.map(|average| average > ENV_VARS.load_threshold)
.unwrap_or(false);
(overloaded, store_avg.unwrap_or(Duration::ZERO))
}

fn kill_state(&self, shard: &str) -> (f64, Instant) {
let state = self.kill_state.get(shard).unwrap().read().unwrap();
let state = self.kill_state.get(shard).unwrap().read();
(state.kill_rate, state.last_update)
}

Expand Down Expand Up @@ -505,7 +507,7 @@ impl LoadManager {
kill_rate = (kill_rate - KILL_RATE_STEP_DOWN).max(0.0);
}
let event = {
let mut state = self.kill_state.get(shard).unwrap().write().unwrap();
let mut state = self.kill_state.get(shard).unwrap().write();
state.kill_rate = kill_rate;
state.last_update = now;
state.log_event(now, kill_rate, overloaded)
Expand Down
25 changes: 25 additions & 0 deletions graph/src/env/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,20 @@ pub struct EnvVarsStore {
/// Disables storing or reading `eth_call` results from the store call cache.
/// Set by `GRAPH_STORE_DISABLE_CALL_CACHE`. Defaults to false.
pub disable_call_cache: bool,
/// Set by `GRAPH_STORE_DISABLE_CHAIN_HEAD_PTR_CACHE`. Default is false.
/// Set to true to disable chain_head_ptr caching (safety escape hatch).
pub disable_chain_head_ptr_cache: bool,
/// Minimum idle time before running connection health check (SELECT 67).
/// Connections used more recently than this threshold skip validation.
/// Set to 0 to always validate (previous behavior).
/// Set by `GRAPH_STORE_CONNECTION_VALIDATION_IDLE_SECS`. Default is 30 seconds.
pub connection_validation_idle_secs: Duration,
/// When a database shard is marked unavailable due to connection timeouts,
/// this controls how often to allow a single probe request through to check
/// if the database has recovered. Only one request per interval will attempt
/// a connection; all others fail instantly with DatabaseUnavailable.
/// Set by `GRAPH_STORE_CONNECTION_UNAVAILABLE_RETRY`. Default is 2 seconds.
pub connection_unavailable_retry: Duration,
}

// This does not print any values avoid accidentally leaking any sensitive env vars
Expand Down Expand Up @@ -224,6 +238,11 @@ impl TryFrom<InnerStore> for EnvVarsStore {
account_like_min_versions_count: x.account_like_min_versions_count,
account_like_max_unique_ratio: x.account_like_max_unique_ratio.map(|r| r.0),
disable_call_cache: x.disable_call_cache,
disable_chain_head_ptr_cache: x.disable_chain_head_ptr_cache,
connection_validation_idle_secs: Duration::from_secs(x.connection_validation_idle_secs),
connection_unavailable_retry: Duration::from_secs(
x.connection_unavailable_retry_in_secs,
),
};
if let Some(timeout) = vars.batch_timeout {
if timeout < 2 * vars.batch_target_duration {
Expand Down Expand Up @@ -331,6 +350,12 @@ pub struct InnerStore {
account_like_max_unique_ratio: Option<ZeroToOneF64>,
#[envconfig(from = "GRAPH_STORE_DISABLE_CALL_CACHE", default = "false")]
disable_call_cache: bool,
#[envconfig(from = "GRAPH_STORE_DISABLE_CHAIN_HEAD_PTR_CACHE", default = "false")]
disable_chain_head_ptr_cache: bool,
#[envconfig(from = "GRAPH_STORE_CONNECTION_VALIDATION_IDLE_SECS", default = "30")]
connection_validation_idle_secs: u64,
#[envconfig(from = "GRAPH_STORE_CONNECTION_UNAVAILABLE_RETRY", default = "2")]
connection_unavailable_retry_in_secs: u64,
}

#[derive(Clone, Copy, Debug)]
Expand Down
2 changes: 1 addition & 1 deletion graph/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ pub mod prelude {
pub use crate::log::split::split_logger;
pub use crate::util::cache_weight::CacheWeight;
pub use crate::util::futures::{retry, TimeoutError};
pub use crate::util::stats::MovingStats;
pub use crate::util::stats::{AtomicMovingStats, MovingStats};

macro_rules! static_graphql {
($m:ident, $m2:ident, {$($n:ident,)*}) => {
Expand Down
Loading