resolver: auto-tune nx-heavy batch share by run health
This commit is contained in:
@@ -218,10 +218,13 @@ type resolverLiveBatchStats struct {
|
||||
P1 int
|
||||
P2 int
|
||||
P3 int
|
||||
NXHeavyPct int
|
||||
NXHeavyTotal int
|
||||
NXHeavySkip int
|
||||
NextTarget int
|
||||
NextReason string
|
||||
NextNXPct int
|
||||
NextNXReason string
|
||||
DNSAttempts int
|
||||
DNSTimeout int
|
||||
DNSCoolSkips int
|
||||
@@ -413,13 +416,28 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
liveBatchDefault = liveBatchMax
|
||||
}
|
||||
liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax)
|
||||
liveBatchNXHeavyPct := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
|
||||
if liveBatchNXHeavyPct < 0 {
|
||||
liveBatchNXHeavyPct = 0
|
||||
liveBatchNXHeavyMin := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MIN_PCT", 5)
|
||||
liveBatchNXHeavyMax := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MAX_PCT", 35)
|
||||
liveBatchNXHeavyDefault := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
|
||||
if liveBatchNXHeavyMin < 0 {
|
||||
liveBatchNXHeavyMin = 0
|
||||
}
|
||||
if liveBatchNXHeavyPct > 100 {
|
||||
liveBatchNXHeavyPct = 100
|
||||
if liveBatchNXHeavyMin > 100 {
|
||||
liveBatchNXHeavyMin = 100
|
||||
}
|
||||
if liveBatchNXHeavyMax < liveBatchNXHeavyMin {
|
||||
liveBatchNXHeavyMax = liveBatchNXHeavyMin
|
||||
}
|
||||
if liveBatchNXHeavyMax > 100 {
|
||||
liveBatchNXHeavyMax = 100
|
||||
}
|
||||
if liveBatchNXHeavyDefault < liveBatchNXHeavyMin {
|
||||
liveBatchNXHeavyDefault = liveBatchNXHeavyMin
|
||||
}
|
||||
if liveBatchNXHeavyDefault > liveBatchNXHeavyMax {
|
||||
liveBatchNXHeavyDefault = liveBatchNXHeavyMax
|
||||
}
|
||||
liveBatchNXHeavyPct := loadResolverLiveBatchNXHeavyPct(precheckStatePath, liveBatchNXHeavyDefault, liveBatchNXHeavyMin, liveBatchNXHeavyMax)
|
||||
precheckEnvForced := resolvePrecheckForceEnvEnabled()
|
||||
precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath)
|
||||
precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec))
|
||||
@@ -484,7 +502,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
wildcardPolicy := wildcardDNSAttemptPolicy(1)
|
||||
cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot()
|
||||
logf(
|
||||
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
|
||||
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d live_batch_nx_heavy_min=%d live_batch_nx_heavy_max=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
|
||||
directPolicy.TryLimit,
|
||||
directPolicy.DomainBudget.Milliseconds(),
|
||||
wildcardPolicy.TryLimit,
|
||||
@@ -501,6 +519,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
liveBatchMin,
|
||||
liveBatchMax,
|
||||
liveBatchNXHeavyPct,
|
||||
liveBatchNXHeavyMin,
|
||||
liveBatchNXHeavyMax,
|
||||
staleKeepSec,
|
||||
precheckEverySec,
|
||||
precheckMaxDomains,
|
||||
@@ -783,7 +803,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
dnsErrors := dnsStats.totalErrors()
|
||||
unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred
|
||||
logf(
|
||||
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
|
||||
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_pct=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
|
||||
len(domains),
|
||||
len(fresh),
|
||||
cacheNegativeHits,
|
||||
@@ -798,6 +818,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
liveP1,
|
||||
liveP2,
|
||||
liveP3,
|
||||
liveBatchNXHeavyPct,
|
||||
liveNXHeavyTotal,
|
||||
liveNXHeavySkip,
|
||||
len(staticEntries),
|
||||
@@ -850,6 +871,27 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
}
|
||||
if precheckDue {
|
||||
nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred)
|
||||
nextNXPct, nextNXReason := computeNextLiveBatchNXHeavyPct(
|
||||
liveBatchNXHeavyPct,
|
||||
liveBatchNXHeavyMin,
|
||||
liveBatchNXHeavyMax,
|
||||
dnsStats,
|
||||
resolvedNowDNS,
|
||||
liveP3,
|
||||
liveNXHeavyTotal,
|
||||
liveNXHeavySkip,
|
||||
)
|
||||
if logf != nil {
|
||||
logf(
|
||||
"resolve live-batch nxheavy: pct=%d next=%d reason=%s selected=%d total=%d skipped=%d",
|
||||
liveBatchNXHeavyPct,
|
||||
nextNXPct,
|
||||
nextNXReason,
|
||||
liveP3,
|
||||
liveNXHeavyTotal,
|
||||
liveNXHeavySkip,
|
||||
)
|
||||
}
|
||||
saveResolverPrecheckState(
|
||||
precheckStatePath,
|
||||
now,
|
||||
@@ -861,10 +903,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
||||
P1: liveP1,
|
||||
P2: liveP2,
|
||||
P3: liveP3,
|
||||
NXHeavyPct: liveBatchNXHeavyPct,
|
||||
NXHeavyTotal: liveNXHeavyTotal,
|
||||
NXHeavySkip: liveNXHeavySkip,
|
||||
NextTarget: nextTarget,
|
||||
NextReason: nextReason,
|
||||
NextNXPct: nextNXPct,
|
||||
NextNXReason: nextNXReason,
|
||||
DNSAttempts: dnsStats.Attempts,
|
||||
DNSTimeout: dnsStats.Timeout,
|
||||
DNSCoolSkips: dnsStats.Skipped,
|
||||
@@ -2473,6 +2518,34 @@ func loadResolverLiveBatchTarget(path string, fallback, minV, maxV int) int {
|
||||
return v
|
||||
}
|
||||
|
||||
func loadResolverLiveBatchNXHeavyPct(path string, fallback, minV, maxV int) int {
|
||||
if fallback < minV {
|
||||
fallback = minV
|
||||
}
|
||||
if fallback > maxV {
|
||||
fallback = maxV
|
||||
}
|
||||
m := loadJSONMap(path)
|
||||
if len(m) == 0 {
|
||||
return fallback
|
||||
}
|
||||
raw := m["live_batch_nxheavy_next_pct"]
|
||||
if raw == nil {
|
||||
raw = m["live_batch_nxheavy_pct"]
|
||||
}
|
||||
v, ok := parseAnyInt(raw)
|
||||
if !ok {
|
||||
return fallback
|
||||
}
|
||||
if v < minV {
|
||||
v = minV
|
||||
}
|
||||
if v > maxV {
|
||||
v = maxV
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) {
|
||||
if current < minV {
|
||||
current = minV
|
||||
@@ -2517,6 +2590,73 @@ func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, de
|
||||
return next, reason
|
||||
}
|
||||
|
||||
func computeNextLiveBatchNXHeavyPct(
|
||||
current, minV, maxV int,
|
||||
dnsStats dnsMetrics,
|
||||
resolvedNowDNS int,
|
||||
liveP3 int,
|
||||
liveNXHeavyTotal int,
|
||||
liveNXHeavySkip int,
|
||||
) (int, string) {
|
||||
if current < minV {
|
||||
current = minV
|
||||
}
|
||||
if current > maxV {
|
||||
current = maxV
|
||||
}
|
||||
next := current
|
||||
reason := "stable"
|
||||
attempts := dnsStats.Attempts
|
||||
timeoutRate := 0.0
|
||||
nxRate := 0.0
|
||||
okRate := 0.0
|
||||
if attempts > 0 {
|
||||
timeoutRate = float64(dnsStats.Timeout) / float64(attempts)
|
||||
nxRate = float64(dnsStats.NXDomain) / float64(attempts)
|
||||
okRate = float64(dnsStats.OK) / float64(attempts)
|
||||
}
|
||||
nxSelectedRatio := 0.0
|
||||
if liveNXHeavyTotal > 0 {
|
||||
nxSelectedRatio = float64(liveP3) / float64(liveNXHeavyTotal)
|
||||
}
|
||||
|
||||
switch {
|
||||
case attempts == 0:
|
||||
reason = "no_dns_attempts"
|
||||
case timeoutRate >= 0.20 || dnsStats.Skipped > 0:
|
||||
next = current - 3
|
||||
reason = "timeout_very_high_or_cooldown"
|
||||
case timeoutRate >= 0.12:
|
||||
next = current - 2
|
||||
reason = "timeout_high"
|
||||
case dnsStats.OK == 0 && dnsStats.NXDomain > 0:
|
||||
next = current - 2
|
||||
reason = "no_success_nx_only"
|
||||
case nxRate >= 0.90 && resolvedNowDNS == 0:
|
||||
next = current - 2
|
||||
reason = "nx_dominant_no_resolve"
|
||||
case nxSelectedRatio >= 0.95 && resolvedNowDNS == 0:
|
||||
next = current - 1
|
||||
reason = "nxheavy_selected_dominant"
|
||||
case timeoutRate <= 0.02 && okRate >= 0.10 && liveNXHeavySkip > 0:
|
||||
next = current + 2
|
||||
reason = "healthy_fast_reintroduce_nxheavy"
|
||||
case timeoutRate <= 0.04 && resolvedNowDNS > 0 && liveNXHeavySkip > 0:
|
||||
next = current + 1
|
||||
reason = "healthy_reintroduce_nxheavy"
|
||||
}
|
||||
if next < minV {
|
||||
next = minV
|
||||
}
|
||||
if next > maxV {
|
||||
next = maxV
|
||||
}
|
||||
if next == current && reason != "no_dns_attempts" {
|
||||
reason = "stable"
|
||||
}
|
||||
return next, reason
|
||||
}
|
||||
|
||||
func classifyLiveBatchHost(
|
||||
host string,
|
||||
cache domainCacheState,
|
||||
@@ -2701,8 +2841,11 @@ func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeout
|
||||
state["live_batch_p1"] = live.P1
|
||||
state["live_batch_p2"] = live.P2
|
||||
state["live_batch_p3"] = live.P3
|
||||
state["live_batch_nxheavy_pct"] = live.NXHeavyPct
|
||||
state["live_batch_nxheavy_total"] = live.NXHeavyTotal
|
||||
state["live_batch_nxheavy_skip"] = live.NXHeavySkip
|
||||
state["live_batch_nxheavy_next_pct"] = live.NextNXPct
|
||||
state["live_batch_nxheavy_next_reason"] = live.NextNXReason
|
||||
state["live_batch_next_target"] = live.NextTarget
|
||||
state["live_batch_next_reason"] = live.NextReason
|
||||
state["live_batch_dns_attempts"] = live.DNSAttempts
|
||||
|
||||
@@ -672,6 +672,7 @@ class DashboardController:
|
||||
live_batch_p1 = int(pairs.get("live_batch_p1", 0))
|
||||
live_batch_p2 = int(pairs.get("live_batch_p2", 0))
|
||||
live_batch_p3 = int(pairs.get("live_batch_p3", 0))
|
||||
live_batch_nxheavy_pct = int(pairs.get("live_batch_nxheavy_pct", 0))
|
||||
live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0))
|
||||
|
||||
r_checked = int(pairs.get("timeout_recheck_checked", 0))
|
||||
@@ -688,7 +689,7 @@ class DashboardController:
|
||||
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} "
|
||||
f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} "
|
||||
f"| live_batch={live_batch_target} deferred={live_batch_deferred} "
|
||||
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_skip={live_batch_nxheavy_skip})"
|
||||
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_pct={live_batch_nxheavy_pct}, nx_skip={live_batch_nxheavy_skip})"
|
||||
)
|
||||
recheck_text = (
|
||||
f"Timeout recheck: checked={r_checked} recovered={r_recovered} "
|
||||
|
||||
Reference in New Issue
Block a user