resolver: auto-tune nx-heavy batch share by run health
This commit is contained in:
@@ -218,10 +218,13 @@ type resolverLiveBatchStats struct {
|
|||||||
P1 int
|
P1 int
|
||||||
P2 int
|
P2 int
|
||||||
P3 int
|
P3 int
|
||||||
|
NXHeavyPct int
|
||||||
NXHeavyTotal int
|
NXHeavyTotal int
|
||||||
NXHeavySkip int
|
NXHeavySkip int
|
||||||
NextTarget int
|
NextTarget int
|
||||||
NextReason string
|
NextReason string
|
||||||
|
NextNXPct int
|
||||||
|
NextNXReason string
|
||||||
DNSAttempts int
|
DNSAttempts int
|
||||||
DNSTimeout int
|
DNSTimeout int
|
||||||
DNSCoolSkips int
|
DNSCoolSkips int
|
||||||
@@ -413,13 +416,28 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
liveBatchDefault = liveBatchMax
|
liveBatchDefault = liveBatchMax
|
||||||
}
|
}
|
||||||
liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax)
|
liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax)
|
||||||
liveBatchNXHeavyPct := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
|
liveBatchNXHeavyMin := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MIN_PCT", 5)
|
||||||
if liveBatchNXHeavyPct < 0 {
|
liveBatchNXHeavyMax := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MAX_PCT", 35)
|
||||||
liveBatchNXHeavyPct = 0
|
liveBatchNXHeavyDefault := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10)
|
||||||
|
if liveBatchNXHeavyMin < 0 {
|
||||||
|
liveBatchNXHeavyMin = 0
|
||||||
}
|
}
|
||||||
if liveBatchNXHeavyPct > 100 {
|
if liveBatchNXHeavyMin > 100 {
|
||||||
liveBatchNXHeavyPct = 100
|
liveBatchNXHeavyMin = 100
|
||||||
}
|
}
|
||||||
|
if liveBatchNXHeavyMax < liveBatchNXHeavyMin {
|
||||||
|
liveBatchNXHeavyMax = liveBatchNXHeavyMin
|
||||||
|
}
|
||||||
|
if liveBatchNXHeavyMax > 100 {
|
||||||
|
liveBatchNXHeavyMax = 100
|
||||||
|
}
|
||||||
|
if liveBatchNXHeavyDefault < liveBatchNXHeavyMin {
|
||||||
|
liveBatchNXHeavyDefault = liveBatchNXHeavyMin
|
||||||
|
}
|
||||||
|
if liveBatchNXHeavyDefault > liveBatchNXHeavyMax {
|
||||||
|
liveBatchNXHeavyDefault = liveBatchNXHeavyMax
|
||||||
|
}
|
||||||
|
liveBatchNXHeavyPct := loadResolverLiveBatchNXHeavyPct(precheckStatePath, liveBatchNXHeavyDefault, liveBatchNXHeavyMin, liveBatchNXHeavyMax)
|
||||||
precheckEnvForced := resolvePrecheckForceEnvEnabled()
|
precheckEnvForced := resolvePrecheckForceEnvEnabled()
|
||||||
precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath)
|
precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath)
|
||||||
precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec))
|
precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec))
|
||||||
@@ -484,7 +502,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
wildcardPolicy := wildcardDNSAttemptPolicy(1)
|
wildcardPolicy := wildcardDNSAttemptPolicy(1)
|
||||||
cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot()
|
cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot()
|
||||||
logf(
|
logf(
|
||||||
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
|
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d live_batch_nx_heavy_min=%d live_batch_nx_heavy_max=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
|
||||||
directPolicy.TryLimit,
|
directPolicy.TryLimit,
|
||||||
directPolicy.DomainBudget.Milliseconds(),
|
directPolicy.DomainBudget.Milliseconds(),
|
||||||
wildcardPolicy.TryLimit,
|
wildcardPolicy.TryLimit,
|
||||||
@@ -501,6 +519,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
liveBatchMin,
|
liveBatchMin,
|
||||||
liveBatchMax,
|
liveBatchMax,
|
||||||
liveBatchNXHeavyPct,
|
liveBatchNXHeavyPct,
|
||||||
|
liveBatchNXHeavyMin,
|
||||||
|
liveBatchNXHeavyMax,
|
||||||
staleKeepSec,
|
staleKeepSec,
|
||||||
precheckEverySec,
|
precheckEverySec,
|
||||||
precheckMaxDomains,
|
precheckMaxDomains,
|
||||||
@@ -783,7 +803,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
dnsErrors := dnsStats.totalErrors()
|
dnsErrors := dnsStats.totalErrors()
|
||||||
unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred
|
unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred
|
||||||
logf(
|
logf(
|
||||||
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
|
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_pct=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
|
||||||
len(domains),
|
len(domains),
|
||||||
len(fresh),
|
len(fresh),
|
||||||
cacheNegativeHits,
|
cacheNegativeHits,
|
||||||
@@ -798,6 +818,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
liveP1,
|
liveP1,
|
||||||
liveP2,
|
liveP2,
|
||||||
liveP3,
|
liveP3,
|
||||||
|
liveBatchNXHeavyPct,
|
||||||
liveNXHeavyTotal,
|
liveNXHeavyTotal,
|
||||||
liveNXHeavySkip,
|
liveNXHeavySkip,
|
||||||
len(staticEntries),
|
len(staticEntries),
|
||||||
@@ -850,6 +871,27 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
}
|
}
|
||||||
if precheckDue {
|
if precheckDue {
|
||||||
nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred)
|
nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred)
|
||||||
|
nextNXPct, nextNXReason := computeNextLiveBatchNXHeavyPct(
|
||||||
|
liveBatchNXHeavyPct,
|
||||||
|
liveBatchNXHeavyMin,
|
||||||
|
liveBatchNXHeavyMax,
|
||||||
|
dnsStats,
|
||||||
|
resolvedNowDNS,
|
||||||
|
liveP3,
|
||||||
|
liveNXHeavyTotal,
|
||||||
|
liveNXHeavySkip,
|
||||||
|
)
|
||||||
|
if logf != nil {
|
||||||
|
logf(
|
||||||
|
"resolve live-batch nxheavy: pct=%d next=%d reason=%s selected=%d total=%d skipped=%d",
|
||||||
|
liveBatchNXHeavyPct,
|
||||||
|
nextNXPct,
|
||||||
|
nextNXReason,
|
||||||
|
liveP3,
|
||||||
|
liveNXHeavyTotal,
|
||||||
|
liveNXHeavySkip,
|
||||||
|
)
|
||||||
|
}
|
||||||
saveResolverPrecheckState(
|
saveResolverPrecheckState(
|
||||||
precheckStatePath,
|
precheckStatePath,
|
||||||
now,
|
now,
|
||||||
@@ -861,10 +903,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
P1: liveP1,
|
P1: liveP1,
|
||||||
P2: liveP2,
|
P2: liveP2,
|
||||||
P3: liveP3,
|
P3: liveP3,
|
||||||
|
NXHeavyPct: liveBatchNXHeavyPct,
|
||||||
NXHeavyTotal: liveNXHeavyTotal,
|
NXHeavyTotal: liveNXHeavyTotal,
|
||||||
NXHeavySkip: liveNXHeavySkip,
|
NXHeavySkip: liveNXHeavySkip,
|
||||||
NextTarget: nextTarget,
|
NextTarget: nextTarget,
|
||||||
NextReason: nextReason,
|
NextReason: nextReason,
|
||||||
|
NextNXPct: nextNXPct,
|
||||||
|
NextNXReason: nextNXReason,
|
||||||
DNSAttempts: dnsStats.Attempts,
|
DNSAttempts: dnsStats.Attempts,
|
||||||
DNSTimeout: dnsStats.Timeout,
|
DNSTimeout: dnsStats.Timeout,
|
||||||
DNSCoolSkips: dnsStats.Skipped,
|
DNSCoolSkips: dnsStats.Skipped,
|
||||||
@@ -2473,6 +2518,34 @@ func loadResolverLiveBatchTarget(path string, fallback, minV, maxV int) int {
|
|||||||
return v
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadResolverLiveBatchNXHeavyPct(path string, fallback, minV, maxV int) int {
|
||||||
|
if fallback < minV {
|
||||||
|
fallback = minV
|
||||||
|
}
|
||||||
|
if fallback > maxV {
|
||||||
|
fallback = maxV
|
||||||
|
}
|
||||||
|
m := loadJSONMap(path)
|
||||||
|
if len(m) == 0 {
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
raw := m["live_batch_nxheavy_next_pct"]
|
||||||
|
if raw == nil {
|
||||||
|
raw = m["live_batch_nxheavy_pct"]
|
||||||
|
}
|
||||||
|
v, ok := parseAnyInt(raw)
|
||||||
|
if !ok {
|
||||||
|
return fallback
|
||||||
|
}
|
||||||
|
if v < minV {
|
||||||
|
v = minV
|
||||||
|
}
|
||||||
|
if v > maxV {
|
||||||
|
v = maxV
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) {
|
func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) {
|
||||||
if current < minV {
|
if current < minV {
|
||||||
current = minV
|
current = minV
|
||||||
@@ -2517,6 +2590,73 @@ func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, de
|
|||||||
return next, reason
|
return next, reason
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func computeNextLiveBatchNXHeavyPct(
|
||||||
|
current, minV, maxV int,
|
||||||
|
dnsStats dnsMetrics,
|
||||||
|
resolvedNowDNS int,
|
||||||
|
liveP3 int,
|
||||||
|
liveNXHeavyTotal int,
|
||||||
|
liveNXHeavySkip int,
|
||||||
|
) (int, string) {
|
||||||
|
if current < minV {
|
||||||
|
current = minV
|
||||||
|
}
|
||||||
|
if current > maxV {
|
||||||
|
current = maxV
|
||||||
|
}
|
||||||
|
next := current
|
||||||
|
reason := "stable"
|
||||||
|
attempts := dnsStats.Attempts
|
||||||
|
timeoutRate := 0.0
|
||||||
|
nxRate := 0.0
|
||||||
|
okRate := 0.0
|
||||||
|
if attempts > 0 {
|
||||||
|
timeoutRate = float64(dnsStats.Timeout) / float64(attempts)
|
||||||
|
nxRate = float64(dnsStats.NXDomain) / float64(attempts)
|
||||||
|
okRate = float64(dnsStats.OK) / float64(attempts)
|
||||||
|
}
|
||||||
|
nxSelectedRatio := 0.0
|
||||||
|
if liveNXHeavyTotal > 0 {
|
||||||
|
nxSelectedRatio = float64(liveP3) / float64(liveNXHeavyTotal)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case attempts == 0:
|
||||||
|
reason = "no_dns_attempts"
|
||||||
|
case timeoutRate >= 0.20 || dnsStats.Skipped > 0:
|
||||||
|
next = current - 3
|
||||||
|
reason = "timeout_very_high_or_cooldown"
|
||||||
|
case timeoutRate >= 0.12:
|
||||||
|
next = current - 2
|
||||||
|
reason = "timeout_high"
|
||||||
|
case dnsStats.OK == 0 && dnsStats.NXDomain > 0:
|
||||||
|
next = current - 2
|
||||||
|
reason = "no_success_nx_only"
|
||||||
|
case nxRate >= 0.90 && resolvedNowDNS == 0:
|
||||||
|
next = current - 2
|
||||||
|
reason = "nx_dominant_no_resolve"
|
||||||
|
case nxSelectedRatio >= 0.95 && resolvedNowDNS == 0:
|
||||||
|
next = current - 1
|
||||||
|
reason = "nxheavy_selected_dominant"
|
||||||
|
case timeoutRate <= 0.02 && okRate >= 0.10 && liveNXHeavySkip > 0:
|
||||||
|
next = current + 2
|
||||||
|
reason = "healthy_fast_reintroduce_nxheavy"
|
||||||
|
case timeoutRate <= 0.04 && resolvedNowDNS > 0 && liveNXHeavySkip > 0:
|
||||||
|
next = current + 1
|
||||||
|
reason = "healthy_reintroduce_nxheavy"
|
||||||
|
}
|
||||||
|
if next < minV {
|
||||||
|
next = minV
|
||||||
|
}
|
||||||
|
if next > maxV {
|
||||||
|
next = maxV
|
||||||
|
}
|
||||||
|
if next == current && reason != "no_dns_attempts" {
|
||||||
|
reason = "stable"
|
||||||
|
}
|
||||||
|
return next, reason
|
||||||
|
}
|
||||||
|
|
||||||
func classifyLiveBatchHost(
|
func classifyLiveBatchHost(
|
||||||
host string,
|
host string,
|
||||||
cache domainCacheState,
|
cache domainCacheState,
|
||||||
@@ -2701,8 +2841,11 @@ func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeout
|
|||||||
state["live_batch_p1"] = live.P1
|
state["live_batch_p1"] = live.P1
|
||||||
state["live_batch_p2"] = live.P2
|
state["live_batch_p2"] = live.P2
|
||||||
state["live_batch_p3"] = live.P3
|
state["live_batch_p3"] = live.P3
|
||||||
|
state["live_batch_nxheavy_pct"] = live.NXHeavyPct
|
||||||
state["live_batch_nxheavy_total"] = live.NXHeavyTotal
|
state["live_batch_nxheavy_total"] = live.NXHeavyTotal
|
||||||
state["live_batch_nxheavy_skip"] = live.NXHeavySkip
|
state["live_batch_nxheavy_skip"] = live.NXHeavySkip
|
||||||
|
state["live_batch_nxheavy_next_pct"] = live.NextNXPct
|
||||||
|
state["live_batch_nxheavy_next_reason"] = live.NextNXReason
|
||||||
state["live_batch_next_target"] = live.NextTarget
|
state["live_batch_next_target"] = live.NextTarget
|
||||||
state["live_batch_next_reason"] = live.NextReason
|
state["live_batch_next_reason"] = live.NextReason
|
||||||
state["live_batch_dns_attempts"] = live.DNSAttempts
|
state["live_batch_dns_attempts"] = live.DNSAttempts
|
||||||
|
|||||||
@@ -672,6 +672,7 @@ class DashboardController:
|
|||||||
live_batch_p1 = int(pairs.get("live_batch_p1", 0))
|
live_batch_p1 = int(pairs.get("live_batch_p1", 0))
|
||||||
live_batch_p2 = int(pairs.get("live_batch_p2", 0))
|
live_batch_p2 = int(pairs.get("live_batch_p2", 0))
|
||||||
live_batch_p3 = int(pairs.get("live_batch_p3", 0))
|
live_batch_p3 = int(pairs.get("live_batch_p3", 0))
|
||||||
|
live_batch_nxheavy_pct = int(pairs.get("live_batch_nxheavy_pct", 0))
|
||||||
live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0))
|
live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0))
|
||||||
|
|
||||||
r_checked = int(pairs.get("timeout_recheck_checked", 0))
|
r_checked = int(pairs.get("timeout_recheck_checked", 0))
|
||||||
@@ -688,7 +689,7 @@ class DashboardController:
|
|||||||
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} "
|
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} "
|
||||||
f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} "
|
f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} "
|
||||||
f"| live_batch={live_batch_target} deferred={live_batch_deferred} "
|
f"| live_batch={live_batch_target} deferred={live_batch_deferred} "
|
||||||
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_skip={live_batch_nxheavy_skip})"
|
f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_pct={live_batch_nxheavy_pct}, nx_skip={live_batch_nxheavy_skip})"
|
||||||
)
|
)
|
||||||
recheck_text = (
|
recheck_text = (
|
||||||
f"Timeout recheck: checked={r_checked} recovered={r_recovered} "
|
f"Timeout recheck: checked={r_checked} recovered={r_recovered} "
|
||||||
|
|||||||
Reference in New Issue
Block a user