resolver: auto-tune nx-heavy batch share by run health

This commit is contained in:
beckline
2026-02-25 12:15:11 +03:00
parent f94ba6214e
commit f458bd8433
2 changed files with 152 additions and 8 deletions

View File

@@ -218,10 +218,13 @@ type resolverLiveBatchStats struct {
P1 int
P2 int
P3 int
NXHeavyPct int
NXHeavyTotal int
NXHeavySkip int
NextTarget int
NextReason string
NextNXPct int
NextNXReason string
DNSAttempts int
DNSTimeout int
DNSCoolSkips int
@@ -413,13 +416,28 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
liveBatchDefault = liveBatchMax
}
liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax)
liveBatchNXHeavyPct := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
if liveBatchNXHeavyPct < 0 {
liveBatchNXHeavyPct = 0
liveBatchNXHeavyMin := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MIN_PCT", 5)
liveBatchNXHeavyMax := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MAX_PCT", 35)
liveBatchNXHeavyDefault := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
if liveBatchNXHeavyMin < 0 {
liveBatchNXHeavyMin = 0
}
if liveBatchNXHeavyPct > 100 {
liveBatchNXHeavyPct = 100
if liveBatchNXHeavyMin > 100 {
liveBatchNXHeavyMin = 100
}
if liveBatchNXHeavyMax < liveBatchNXHeavyMin {
liveBatchNXHeavyMax = liveBatchNXHeavyMin
}
if liveBatchNXHeavyMax > 100 {
liveBatchNXHeavyMax = 100
}
if liveBatchNXHeavyDefault < liveBatchNXHeavyMin {
liveBatchNXHeavyDefault = liveBatchNXHeavyMin
}
if liveBatchNXHeavyDefault > liveBatchNXHeavyMax {
liveBatchNXHeavyDefault = liveBatchNXHeavyMax
}
liveBatchNXHeavyPct := loadResolverLiveBatchNXHeavyPct(precheckStatePath, liveBatchNXHeavyDefault, liveBatchNXHeavyMin, liveBatchNXHeavyMax)
precheckEnvForced := resolvePrecheckForceEnvEnabled()
precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath)
precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec))
@@ -484,7 +502,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
wildcardPolicy := wildcardDNSAttemptPolicy(1)
cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot()
logf(
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d live_batch_nx_heavy_min=%d live_batch_nx_heavy_max=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
directPolicy.TryLimit,
directPolicy.DomainBudget.Milliseconds(),
wildcardPolicy.TryLimit,
@@ -501,6 +519,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
liveBatchMin,
liveBatchMax,
liveBatchNXHeavyPct,
liveBatchNXHeavyMin,
liveBatchNXHeavyMax,
staleKeepSec,
precheckEverySec,
precheckMaxDomains,
@@ -783,7 +803,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
dnsErrors := dnsStats.totalErrors()
unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred
logf(
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_pct=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
len(domains),
len(fresh),
cacheNegativeHits,
@@ -798,6 +818,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
liveP1,
liveP2,
liveP3,
liveBatchNXHeavyPct,
liveNXHeavyTotal,
liveNXHeavySkip,
len(staticEntries),
@@ -850,6 +871,27 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
}
if precheckDue {
nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred)
nextNXPct, nextNXReason := computeNextLiveBatchNXHeavyPct(
liveBatchNXHeavyPct,
liveBatchNXHeavyMin,
liveBatchNXHeavyMax,
dnsStats,
resolvedNowDNS,
liveP3,
liveNXHeavyTotal,
liveNXHeavySkip,
)
if logf != nil {
logf(
"resolve live-batch nxheavy: pct=%d next=%d reason=%s selected=%d total=%d skipped=%d",
liveBatchNXHeavyPct,
nextNXPct,
nextNXReason,
liveP3,
liveNXHeavyTotal,
liveNXHeavySkip,
)
}
saveResolverPrecheckState(
precheckStatePath,
now,
@@ -861,10 +903,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
P1: liveP1,
P2: liveP2,
P3: liveP3,
NXHeavyPct: liveBatchNXHeavyPct,
NXHeavyTotal: liveNXHeavyTotal,
NXHeavySkip: liveNXHeavySkip,
NextTarget: nextTarget,
NextReason: nextReason,
NextNXPct: nextNXPct,
NextNXReason: nextNXReason,
DNSAttempts: dnsStats.Attempts,
DNSTimeout: dnsStats.Timeout,
DNSCoolSkips: dnsStats.Skipped,
@@ -2473,6 +2518,34 @@ func loadResolverLiveBatchTarget(path string, fallback, minV, maxV int) int {
return v
}
// loadResolverLiveBatchNXHeavyPct picks the NX-heavy batch share (percent)
// for the current run from the state map that loadJSONMap returns for path.
//
// The "live_batch_nxheavy_next_pct" entry is preferred; when it is missing or
// nil, "live_batch_nxheavy_pct" is consulted instead. fallback is used when
// the map is empty or the chosen entry is not accepted by parseAnyInt.
// Whatever is returned, stored value or fallback, is limited to [minV, maxV].
func loadResolverLiveBatchNXHeavyPct(path string, fallback, minV, maxV int) int {
	// bound applies the lower limit first and the upper limit second, so the
	// upper limit wins if the two are ever inverted.
	bound := func(n int) int {
		if n < minV {
			n = minV
		}
		if n > maxV {
			n = maxV
		}
		return n
	}
	def := bound(fallback)

	state := loadJSONMap(path)
	if len(state) == 0 {
		return def
	}

	candidate := state["live_batch_nxheavy_next_pct"]
	if candidate == nil {
		candidate = state["live_batch_nxheavy_pct"]
	}

	if pct, ok := parseAnyInt(candidate); ok {
		return bound(pct)
	}
	return def
}
func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) {
if current < minV {
current = minV
@@ -2517,6 +2590,73 @@ func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, de
return next, reason
}
// computeNextLiveBatchNXHeavyPct derives the NX-heavy batch share (percent)
// to use next from the counters of the run that just finished.
//
// current is first limited to [minV, maxV]. The first matching rule below
// then decides the adjustment; order matters because several conditions can
// hold at the same time:
//
//   - dnsStats.Attempts == 0: keep current, reason "no_dns_attempts"
//   - timeout rate >= 20% or dnsStats.Skipped > 0: -3
//   - timeout rate >= 12%: -2
//   - no OK answers but at least one NXDOMAIN: -2
//   - NXDOMAIN rate >= 90% and resolvedNowDNS == 0: -2
//   - liveP3/liveNXHeavyTotal >= 95% and resolvedNowDNS == 0: -1
//   - timeout rate <= 2%, OK rate >= 10% and liveNXHeavySkip > 0: +2
//   - timeout rate <= 4%, resolvedNowDNS > 0 and liveNXHeavySkip > 0: +1
//
// All rates are fractions of dnsStats.Attempts. The adjusted value is limited
// to [minV, maxV] again and returned with the name of the rule that fired.
// If the final value equals the (limited) current one, the reason is reported
// as "stable", except that "no_dns_attempts" is always kept.
func computeNextLiveBatchNXHeavyPct(
	current, minV, maxV int,
	dnsStats dnsMetrics,
	resolvedNowDNS int,
	liveP3 int,
	liveNXHeavyTotal int,
	liveNXHeavySkip int,
) (int, string) {
	const (
		reasonNoAttempts = "no_dns_attempts"
		reasonUnchanged  = "stable"
	)

	// clamp applies the lower limit first and the upper limit second.
	clamp := func(v int) int {
		if v < minV {
			v = minV
		}
		if v > maxV {
			v = maxV
		}
		return v
	}
	current = clamp(current)

	// Per-attempt outcome shares; they stay at zero when nothing was attempted.
	attempts := dnsStats.Attempts
	var timeoutShare, nxShare, okShare float64
	if attempts > 0 {
		timeoutShare = float64(dnsStats.Timeout) / float64(attempts)
		nxShare = float64(dnsStats.NXDomain) / float64(attempts)
		okShare = float64(dnsStats.OK) / float64(attempts)
	}

	// liveP3 relative to liveNXHeavyTotal.
	var selectedShare float64
	if liveNXHeavyTotal > 0 {
		selectedShare = float64(liveP3) / float64(liveNXHeavyTotal)
	}

	nothingResolved := resolvedNowDNS == 0
	nxHeavyHeldBack := liveNXHeavySkip > 0

	delta, why := 0, reasonUnchanged
	switch {
	case attempts == 0:
		why = reasonNoAttempts
	case timeoutShare >= 0.20 || dnsStats.Skipped > 0:
		delta, why = -3, "timeout_very_high_or_cooldown"
	case timeoutShare >= 0.12:
		delta, why = -2, "timeout_high"
	case dnsStats.OK == 0 && dnsStats.NXDomain > 0:
		delta, why = -2, "no_success_nx_only"
	case nxShare >= 0.90 && nothingResolved:
		delta, why = -2, "nx_dominant_no_resolve"
	case selectedShare >= 0.95 && nothingResolved:
		delta, why = -1, "nxheavy_selected_dominant"
	case timeoutShare <= 0.02 && okShare >= 0.10 && nxHeavyHeldBack:
		delta, why = 2, "healthy_fast_reintroduce_nxheavy"
	case timeoutShare <= 0.04 && resolvedNowDNS > 0 && nxHeavyHeldBack:
		delta, why = 1, "healthy_reintroduce_nxheavy"
	}

	proposed := clamp(current + delta)
	// A rule that fired but was fully absorbed by the limits is not a change.
	if proposed == current && why != reasonNoAttempts {
		why = reasonUnchanged
	}
	return proposed, why
}
func classifyLiveBatchHost(
host string,
cache domainCacheState,
@@ -2701,8 +2841,11 @@ func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeout
state["live_batch_p1"] = live.P1
state["live_batch_p2"] = live.P2
state["live_batch_p3"] = live.P3
state["live_batch_nxheavy_pct"] = live.NXHeavyPct
state["live_batch_nxheavy_total"] = live.NXHeavyTotal
state["live_batch_nxheavy_skip"] = live.NXHeavySkip
state["live_batch_nxheavy_next_pct"] = live.NextNXPct
state["live_batch_nxheavy_next_reason"] = live.NextNXReason
state["live_batch_next_target"] = live.NextTarget
state["live_batch_next_reason"] = live.NextReason
state["live_batch_dns_attempts"] = live.DNSAttempts

View File

@@ -672,6 +672,7 @@ class DashboardController:
live_batch_p1 = int(pairs.get("live_batch_p1", 0))
live_batch_p2 = int(pairs.get("live_batch_p2", 0))
live_batch_p3 = int(pairs.get("live_batch_p3", 0))
live_batch_nxheavy_pct = int(pairs.get("live_batch_nxheavy_pct", 0))
live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0))
r_checked = int(pairs.get("timeout_recheck_checked", 0))
@@ -688,7 +689,7 @@ class DashboardController:
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} "
f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} "
f"| live_batch={live_batch_target} deferred={live_batch_deferred} "
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_skip={live_batch_nxheavy_skip})"
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_pct={live_batch_nxheavy_pct}, nx_skip={live_batch_nxheavy_skip})"
)
recheck_text = (
f"Timeout recheck: checked={r_checked} recovered={r_recovered} "