resolver: adaptive live batch sizing and cooldown-aware summary

This commit is contained in:
beckline
2026-02-25 10:24:02 +03:00
parent 29dde73f04
commit 4b1a189152
3 changed files with 165 additions and 9 deletions

View File

@@ -211,6 +211,17 @@ type resolverTimeoutRecheckStats struct {
NoSignal int
}
type resolverLiveBatchStats struct {
Target int
Total int
Deferred int
NextTarget int
NextReason string
DNSAttempts int
DNSTimeout int
DNSCoolSkips int
}
type dnsCooldownState struct {
Attempts int
TimeoutLike int
@@ -375,6 +386,28 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
}
precheckStatePath := opts.CachePath + ".precheck.json"
precheckLastRun := loadResolverPrecheckLastRun(precheckStatePath)
liveBatchMin := envInt("RESOLVE_LIVE_BATCH_MIN", 1200)
liveBatchMax := envInt("RESOLVE_LIVE_BATCH_MAX", 5000)
liveBatchDefault := envInt("RESOLVE_LIVE_BATCH_DEFAULT", 3000)
if liveBatchMin < 200 {
liveBatchMin = 200
}
if liveBatchMin > 50000 {
liveBatchMin = 50000
}
if liveBatchMax < liveBatchMin {
liveBatchMax = liveBatchMin
}
if liveBatchMax > 50000 {
liveBatchMax = 50000
}
if liveBatchDefault < liveBatchMin {
liveBatchDefault = liveBatchMin
}
if liveBatchDefault > liveBatchMax {
liveBatchDefault = liveBatchMax
}
liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax)
precheckEnvForced := resolvePrecheckForceEnvEnabled()
precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath)
precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec))
@@ -439,7 +472,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
wildcardPolicy := wildcardDNSAttemptPolicy(1)
cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot()
logf(
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
directPolicy.TryLimit,
directPolicy.DomainBudget.Milliseconds(),
wildcardPolicy.TryLimit,
@@ -452,6 +485,9 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
cStreak,
cBan,
cMaxBan,
liveBatchTarget,
liveBatchMin,
liveBatchMax,
staleKeepSec,
precheckEverySec,
precheckMaxDomains,
@@ -544,9 +580,27 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
for k, v := range fresh {
resolved[k] = v
}
toResolveTotal := len(toResolve)
liveDeferred := 0
if liveBatchTarget > 0 && len(toResolve) > liveBatchTarget {
startIdx := 0
if len(toResolve) > 0 {
startIdx = now % len(toResolve)
if startIdx < 0 {
startIdx = 0
}
}
limited := make([]string, 0, liveBatchTarget)
for i := 0; i < liveBatchTarget; i++ {
idx := (startIdx + i) % len(toResolve)
limited = append(limited, toResolve[idx])
}
liveDeferred = len(toResolve) - len(limited)
toResolve = limited
}
if logf != nil {
logf("resolve: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d precheck_due=%t precheck_scheduled=%d to_resolve=%d", len(domains), len(fresh), cacheNegativeHits, quarantineHits, staleHits, precheckDue, precheckScheduled, len(toResolve))
logf("resolve: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d precheck_due=%t precheck_scheduled=%d to_resolve=%d to_resolve_total=%d deferred_by_live_batch=%d", len(domains), len(fresh), cacheNegativeHits, quarantineHits, staleHits, precheckDue, precheckScheduled, len(toResolve), toResolveTotal, liveDeferred)
}
dnsStats := dnsMetrics{}
@@ -712,8 +766,9 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
if logf != nil {
dnsErrors := dnsStats.totalErrors()
unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred
logf(
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
len(domains),
len(fresh),
cacheNegativeHits,
@@ -722,7 +777,9 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
len(resolved)-len(fresh),
len(domains)-len(resolved),
unresolvedAfterAttempts,
cacheNegativeHits+quarantineHits,
unresolvedSuppressed,
liveBatchTarget,
liveDeferred,
len(staticEntries),
staticSkipped,
len(res.IPs),
@@ -758,12 +815,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
logf("resolve domain states: %s", stateSummary)
}
logf(
"resolve breakdown: resolved_now_total=%d resolved_now_dns=%d resolved_now_stale=%d skipped_neg=%d skipped_quarantine=%d unresolved_after_attempts=%d",
"resolve breakdown: resolved_now_total=%d resolved_now_dns=%d resolved_now_stale=%d skipped_neg=%d skipped_quarantine=%d deferred_live_batch=%d unresolved_after_attempts=%d",
len(resolved)-len(fresh),
resolvedNowDNS,
resolvedNowStale,
cacheNegativeHits,
quarantineHits,
liveDeferred,
unresolvedAfterAttempts,
)
if precheckDue {
@@ -771,7 +829,22 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
}
}
if precheckDue {
saveResolverPrecheckState(precheckStatePath, now, timeoutRecheck)
nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred)
saveResolverPrecheckState(
precheckStatePath,
now,
timeoutRecheck,
resolverLiveBatchStats{
Target: liveBatchTarget,
Total: toResolveTotal,
Deferred: liveDeferred,
NextTarget: nextTarget,
NextReason: nextReason,
DNSAttempts: dnsStats.Attempts,
DNSTimeout: dnsStats.Timeout,
DNSCoolSkips: dnsStats.Skipped,
},
)
}
if precheckFileForced {
_ = os.Remove(precheckForcePath)
@@ -2347,7 +2420,79 @@ func loadResolverPrecheckLastRun(path string) int {
return v
}
func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeoutRecheckStats) {
func loadResolverLiveBatchTarget(path string, fallback, minV, maxV int) int {
if fallback < minV {
fallback = minV
}
if fallback > maxV {
fallback = maxV
}
m := loadJSONMap(path)
if len(m) == 0 {
return fallback
}
raw := m["live_batch_next_target"]
if raw == nil {
raw = m["live_batch_target"]
}
v, ok := parseAnyInt(raw)
if !ok || v <= 0 {
return fallback
}
if v < minV {
v = minV
}
if v > maxV {
v = maxV
}
return v
}
func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) {
if current < minV {
current = minV
}
if current > maxV {
current = maxV
}
next := current
reason := "stable"
attempts := dnsStats.Attempts
timeoutRate := 0.0
if attempts > 0 {
timeoutRate = float64(dnsStats.Timeout) / float64(attempts)
}
switch {
case attempts == 0:
reason = "no_dns_attempts"
case dnsStats.Skipped > 0 || timeoutRate >= 0.15:
next = int(float64(current) * 0.75)
reason = "timeout_high_or_cooldown"
case timeoutRate >= 0.08:
next = int(float64(current) * 0.90)
reason = "timeout_medium"
case timeoutRate <= 0.03 && deferred > 0:
next = int(float64(current) * 1.15)
reason = "timeout_low_expand"
case timeoutRate <= 0.03:
next = int(float64(current) * 1.10)
reason = "timeout_low"
}
if next < minV {
next = minV
}
if next > maxV {
next = maxV
}
if next == current && reason == "timeout_low" {
reason = "stable"
}
return next, reason
}
func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeoutRecheckStats, live resolverLiveBatchStats) {
if path == "" || ts <= 0 {
return
}
@@ -2366,6 +2511,14 @@ func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeout
"now_other": timeoutStats.NowOther,
"no_signal": timeoutStats.NoSignal,
}
state["live_batch_target"] = live.Target
state["live_batch_total"] = live.Total
state["live_batch_deferred"] = live.Deferred
state["live_batch_next_target"] = live.NextTarget
state["live_batch_next_reason"] = live.NextReason
state["live_batch_dns_attempts"] = live.DNSAttempts
state["live_batch_dns_timeout"] = live.DNSTimeout
state["live_batch_dns_cooldown_skips"] = live.DNSCoolSkips
saveJSON(state, path)
}