resolver: auto-tune nx-heavy batch share by run health

This commit is contained in:
beckline
2026-02-25 12:15:11 +03:00
parent f94ba6214e
commit f458bd8433
2 changed files with 152 additions and 8 deletions

View File

@@ -218,10 +218,13 @@ type resolverLiveBatchStats struct {
P1 int P1 int
P2 int P2 int
P3 int P3 int
NXHeavyPct int
NXHeavyTotal int NXHeavyTotal int
NXHeavySkip int NXHeavySkip int
NextTarget int NextTarget int
NextReason string NextReason string
NextNXPct int
NextNXReason string
DNSAttempts int DNSAttempts int
DNSTimeout int DNSTimeout int
DNSCoolSkips int DNSCoolSkips int
@@ -413,13 +416,28 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
liveBatchDefault = liveBatchMax liveBatchDefault = liveBatchMax
} }
liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax) liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax)
liveBatchNXHeavyPct := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10) liveBatchNXHeavyMin := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MIN_PCT", 5)
if liveBatchNXHeavyPct < 0 { liveBatchNXHeavyMax := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MAX_PCT", 35)
liveBatchNXHeavyPct = 0 liveBatchNXHeavyDefault := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
if liveBatchNXHeavyMin < 0 {
liveBatchNXHeavyMin = 0
} }
if liveBatchNXHeavyPct > 100 { if liveBatchNXHeavyMin > 100 {
liveBatchNXHeavyPct = 100 liveBatchNXHeavyMin = 100
} }
if liveBatchNXHeavyMax < liveBatchNXHeavyMin {
liveBatchNXHeavyMax = liveBatchNXHeavyMin
}
if liveBatchNXHeavyMax > 100 {
liveBatchNXHeavyMax = 100
}
if liveBatchNXHeavyDefault < liveBatchNXHeavyMin {
liveBatchNXHeavyDefault = liveBatchNXHeavyMin
}
if liveBatchNXHeavyDefault > liveBatchNXHeavyMax {
liveBatchNXHeavyDefault = liveBatchNXHeavyMax
}
liveBatchNXHeavyPct := loadResolverLiveBatchNXHeavyPct(precheckStatePath, liveBatchNXHeavyDefault, liveBatchNXHeavyMin, liveBatchNXHeavyMax)
precheckEnvForced := resolvePrecheckForceEnvEnabled() precheckEnvForced := resolvePrecheckForceEnvEnabled()
precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath) precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath)
precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec)) precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec))
@@ -484,7 +502,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
wildcardPolicy := wildcardDNSAttemptPolicy(1) wildcardPolicy := wildcardDNSAttemptPolicy(1)
cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot() cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot()
logf( logf(
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t", "resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d live_batch_nx_heavy_min=%d live_batch_nx_heavy_max=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
directPolicy.TryLimit, directPolicy.TryLimit,
directPolicy.DomainBudget.Milliseconds(), directPolicy.DomainBudget.Milliseconds(),
wildcardPolicy.TryLimit, wildcardPolicy.TryLimit,
@@ -501,6 +519,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
liveBatchMin, liveBatchMin,
liveBatchMax, liveBatchMax,
liveBatchNXHeavyPct, liveBatchNXHeavyPct,
liveBatchNXHeavyMin,
liveBatchNXHeavyMax,
staleKeepSec, staleKeepSec,
precheckEverySec, precheckEverySec,
precheckMaxDomains, precheckMaxDomains,
@@ -783,7 +803,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
dnsErrors := dnsStats.totalErrors() dnsErrors := dnsStats.totalErrors()
unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred
logf( logf(
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d", "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_pct=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
len(domains), len(domains),
len(fresh), len(fresh),
cacheNegativeHits, cacheNegativeHits,
@@ -798,6 +818,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
liveP1, liveP1,
liveP2, liveP2,
liveP3, liveP3,
liveBatchNXHeavyPct,
liveNXHeavyTotal, liveNXHeavyTotal,
liveNXHeavySkip, liveNXHeavySkip,
len(staticEntries), len(staticEntries),
@@ -850,6 +871,27 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
} }
if precheckDue { if precheckDue {
nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred) nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred)
nextNXPct, nextNXReason := computeNextLiveBatchNXHeavyPct(
liveBatchNXHeavyPct,
liveBatchNXHeavyMin,
liveBatchNXHeavyMax,
dnsStats,
resolvedNowDNS,
liveP3,
liveNXHeavyTotal,
liveNXHeavySkip,
)
if logf != nil {
logf(
"resolve live-batch nxheavy: pct=%d next=%d reason=%s selected=%d total=%d skipped=%d",
liveBatchNXHeavyPct,
nextNXPct,
nextNXReason,
liveP3,
liveNXHeavyTotal,
liveNXHeavySkip,
)
}
saveResolverPrecheckState( saveResolverPrecheckState(
precheckStatePath, precheckStatePath,
now, now,
@@ -861,10 +903,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
P1: liveP1, P1: liveP1,
P2: liveP2, P2: liveP2,
P3: liveP3, P3: liveP3,
NXHeavyPct: liveBatchNXHeavyPct,
NXHeavyTotal: liveNXHeavyTotal, NXHeavyTotal: liveNXHeavyTotal,
NXHeavySkip: liveNXHeavySkip, NXHeavySkip: liveNXHeavySkip,
NextTarget: nextTarget, NextTarget: nextTarget,
NextReason: nextReason, NextReason: nextReason,
NextNXPct: nextNXPct,
NextNXReason: nextNXReason,
DNSAttempts: dnsStats.Attempts, DNSAttempts: dnsStats.Attempts,
DNSTimeout: dnsStats.Timeout, DNSTimeout: dnsStats.Timeout,
DNSCoolSkips: dnsStats.Skipped, DNSCoolSkips: dnsStats.Skipped,
@@ -2473,6 +2518,34 @@ func loadResolverLiveBatchTarget(path string, fallback, minV, maxV int) int {
return v return v
} }
// loadResolverLiveBatchNXHeavyPct returns the NX-heavy batch percentage to use
// for the current run, read from the JSON state loaded from path.
//
// The value stored under "live_batch_nxheavy_next_pct" is preferred; when that
// entry is nil the value under "live_batch_nxheavy_pct" is used instead. The
// fallback is returned when the state map is empty or the chosen entry is not
// accepted by parseAnyInt. Both the fallback and any loaded value are clamped
// into [minV, maxV] before being returned.
func loadResolverLiveBatchNXHeavyPct(path string, fallback, minV, maxV int) int {
	// Lower bound is applied first, then the upper bound, so maxV wins if the
	// caller ever passes an inverted range.
	clamp := func(pct int) int {
		if pct < minV {
			pct = minV
		}
		if pct > maxV {
			pct = maxV
		}
		return pct
	}

	def := clamp(fallback)

	state := loadJSONMap(path)
	if len(state) == 0 {
		return def
	}

	raw := state["live_batch_nxheavy_next_pct"]
	if raw == nil {
		raw = state["live_batch_nxheavy_pct"]
	}

	if parsed, ok := parseAnyInt(raw); ok {
		return clamp(parsed)
	}
	return def
}
func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) { func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) {
if current < minV { if current < minV {
current = minV current = minV
@@ -2517,6 +2590,73 @@ func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, de
return next, reason return next, reason
} }
// computeNextLiveBatchNXHeavyPct derives the NX-heavy batch percentage for the
// next run from the DNS counters and live-batch counters of the current run.
//
// current is first clamped into [minV, maxV]. If no DNS attempts were made the
// clamped value is returned unchanged with reason "no_dns_attempts". Otherwise
// the first matching rule (evaluated top to bottom) picks a step:
//
//	timeout rate >= 0.20 or any cooldown skips        -> -3
//	timeout rate >= 0.12                               -> -2
//	no OK answers but some NXDOMAIN answers            -> -2
//	NXDOMAIN rate >= 0.90 and nothing resolved via DNS -> -2
//	liveP3/liveNXHeavyTotal >= 0.95, nothing resolved  -> -1
//	timeout rate <= 0.02, OK rate >= 0.10, skips > 0   -> +2
//	timeout rate <= 0.04, resolved > 0, skips > 0      -> +1
//
// The stepped value is clamped into [minV, maxV]. When the result equals the
// clamped current value the reason is reported as "stable", regardless of
// which rule matched.
//
// It returns the next percentage and a short reason tag.
func computeNextLiveBatchNXHeavyPct(
	current, minV, maxV int,
	dnsStats dnsMetrics,
	resolvedNowDNS int,
	liveP3 int,
	liveNXHeavyTotal int,
	liveNXHeavySkip int,
) (int, string) {
	// Lower bound first, then upper bound: maxV wins on an inverted range.
	clamp := func(pct int) int {
		if pct < minV {
			pct = minV
		}
		if pct > maxV {
			pct = maxV
		}
		return pct
	}
	current = clamp(current)

	total := dnsStats.Attempts
	if total == 0 {
		return current, "no_dns_attempts"
	}

	// Rates stay at zero unless there is a positive attempt count to divide by.
	var timeoutShare, nxShare, okShare float64
	if total > 0 {
		denom := float64(total)
		timeoutShare = float64(dnsStats.Timeout) / denom
		nxShare = float64(dnsStats.NXDomain) / denom
		okShare = float64(dnsStats.OK) / denom
	}

	// Share of the known NX-heavy hosts that were selected into this batch.
	var selectedShare float64
	if liveNXHeavyTotal > 0 {
		selectedShare = float64(liveP3) / float64(liveNXHeavyTotal)
	}

	nothingResolved := resolvedNowDNS == 0
	hadNXSkips := liveNXHeavySkip > 0

	// Ordered rule table: the first rule whose condition holds decides the step.
	rules := []struct {
		hit    bool
		delta  int
		reason string
	}{
		{timeoutShare >= 0.20 || dnsStats.Skipped > 0, -3, "timeout_very_high_or_cooldown"},
		{timeoutShare >= 0.12, -2, "timeout_high"},
		{dnsStats.OK == 0 && dnsStats.NXDomain > 0, -2, "no_success_nx_only"},
		{nxShare >= 0.90 && nothingResolved, -2, "nx_dominant_no_resolve"},
		{selectedShare >= 0.95 && nothingResolved, -1, "nxheavy_selected_dominant"},
		{timeoutShare <= 0.02 && okShare >= 0.10 && hadNXSkips, 2, "healthy_fast_reintroduce_nxheavy"},
		{timeoutShare <= 0.04 && resolvedNowDNS > 0 && hadNXSkips, 1, "healthy_reintroduce_nxheavy"},
	}

	step, why := 0, "stable"
	for _, r := range rules {
		if r.hit {
			step, why = r.delta, r.reason
			break
		}
	}

	next := clamp(current + step)
	if next == current {
		// Clamping cancelled the step (or no rule matched): report no change.
		why = "stable"
	}
	return next, why
}
func classifyLiveBatchHost( func classifyLiveBatchHost(
host string, host string,
cache domainCacheState, cache domainCacheState,
@@ -2701,8 +2841,11 @@ func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeout
state["live_batch_p1"] = live.P1 state["live_batch_p1"] = live.P1
state["live_batch_p2"] = live.P2 state["live_batch_p2"] = live.P2
state["live_batch_p3"] = live.P3 state["live_batch_p3"] = live.P3
state["live_batch_nxheavy_pct"] = live.NXHeavyPct
state["live_batch_nxheavy_total"] = live.NXHeavyTotal state["live_batch_nxheavy_total"] = live.NXHeavyTotal
state["live_batch_nxheavy_skip"] = live.NXHeavySkip state["live_batch_nxheavy_skip"] = live.NXHeavySkip
state["live_batch_nxheavy_next_pct"] = live.NextNXPct
state["live_batch_nxheavy_next_reason"] = live.NextNXReason
state["live_batch_next_target"] = live.NextTarget state["live_batch_next_target"] = live.NextTarget
state["live_batch_next_reason"] = live.NextReason state["live_batch_next_reason"] = live.NextReason
state["live_batch_dns_attempts"] = live.DNSAttempts state["live_batch_dns_attempts"] = live.DNSAttempts

View File

@@ -672,6 +672,7 @@ class DashboardController:
live_batch_p1 = int(pairs.get("live_batch_p1", 0)) live_batch_p1 = int(pairs.get("live_batch_p1", 0))
live_batch_p2 = int(pairs.get("live_batch_p2", 0)) live_batch_p2 = int(pairs.get("live_batch_p2", 0))
live_batch_p3 = int(pairs.get("live_batch_p3", 0)) live_batch_p3 = int(pairs.get("live_batch_p3", 0))
live_batch_nxheavy_pct = int(pairs.get("live_batch_nxheavy_pct", 0))
live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0)) live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0))
r_checked = int(pairs.get("timeout_recheck_checked", 0)) r_checked = int(pairs.get("timeout_recheck_checked", 0))
@@ -688,7 +689,7 @@ class DashboardController:
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} " f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} "
f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} " f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} "
f"| live_batch={live_batch_target} deferred={live_batch_deferred} " f"| live_batch={live_batch_target} deferred={live_batch_deferred} "
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_skip={live_batch_nxheavy_skip})" f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_pct={live_batch_nxheavy_pct}, nx_skip={live_batch_nxheavy_skip})"
) )
recheck_text = ( recheck_text = (
f"Timeout recheck: checked={r_checked} recovered={r_recovered} " f"Timeout recheck: checked={r_checked} recovered={r_recovered} "