resolver: split unresolved metrics and soften NX hard quarantine

This commit is contained in:
beckline
2026-02-25 09:54:54 +03:00
parent 5bd7f1c9f4
commit e8fb361b4c
3 changed files with 27 additions and 4 deletions

View File

@@ -6,7 +6,7 @@
- [x] ~~3. Add stale-keep policy.~~
- [x] ~~4. Wire 24h precheck cycle (soft pruning only).~~
- [x] ~~5. Expose metrics/log clarity in API + GUI (API/trace done; DNS benchmark load-profile UI done; route badges done).~~
- [ ] 6. Tune thresholds with production data (pass-1 done: timeout-only now stays suspect/no quarantine by default).
- [ ] 6. Tune thresholds with production data (pass-1 done: timeout-only stays suspect; pass-2 done: NX hard-quarantine disabled by default).
## 1) Goal
- Stabilize resolver behavior under high domain volume.

View File

@@ -408,12 +408,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
directPolicy := directDNSAttemptPolicy(len(cfg.Default))
wildcardPolicy := wildcardDNSAttemptPolicy(1)
logf(
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
directPolicy.TryLimit,
directPolicy.DomainBudget.Milliseconds(),
wildcardPolicy.TryLimit,
wildcardPolicy.DomainBudget.Milliseconds(),
resolveNXEarlyStopEnabled(),
resolveNXHardQuarantineEnabled(),
staleKeepSec,
precheckEverySec,
precheckMaxDomains,
@@ -675,7 +676,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
if logf != nil {
dnsErrors := dnsStats.totalErrors()
logf(
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
len(domains),
len(fresh),
cacheNegativeHits,
@@ -683,6 +684,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
staleHits,
len(resolved)-len(fresh),
len(domains)-len(resolved),
unresolvedAfterAttempts,
cacheNegativeHits+quarantineHits,
len(staticEntries),
staticSkipped,
len(res.IPs),
@@ -1164,6 +1167,15 @@ func resolveNXEarlyStopEnabled() bool {
}
}
// resolveNXHardQuarantineEnabled reports whether the RESOLVE_NX_HARD_QUARANTINE
// environment variable holds a truthy value. The value is trimmed of
// surrounding whitespace and compared case-insensitively against "1", "true",
// "yes" and "on"; anything else, including an unset or empty variable, yields
// false.
func resolveNXHardQuarantineEnabled() bool {
	raw := os.Getenv("RESOLVE_NX_HARD_QUARANTINE")
	flag := strings.ToLower(strings.TrimSpace(raw))
	return flag == "1" || flag == "true" || flag == "yes" || flag == "on"
}
func resolvePrecheckForceEnvEnabled() bool {
switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_PRECHECK_FORCE"))) {
case "1", "true", "yes", "on":
@@ -1825,6 +1837,14 @@ func (s *domainCacheState) setErrorWithStats(domain string, source domainCacheSo
}
entry.State = domainStateSuspect
}
// NXDOMAIN-heavy synthetic subdomains create large false "hard quarantine" pools.
// By default, keep NX failures in regular quarantine (24h), not hard quarantine.
if normKind == dnsErrorNXDomain && !resolveNXHardQuarantineEnabled() && entry.State == domainStateHardQuar {
entry.State = domainStateQuarantine
if entry.Score < -30 {
entry.Score = -30
}
}
entry.LastErrorKind = string(normKind)
entry.LastErrorAt = now
switch entry.State {

View File

@@ -661,6 +661,8 @@ class DashboardController:
direct_ips = int(pairs.get("direct_ips", 0))
wildcard_ips = int(pairs.get("wildcard_ips", 0))
unresolved = int(pairs.get("unresolved", 0))
unresolved_live = int(pairs.get("unresolved_live", 0))
unresolved_suppressed = int(pairs.get("unresolved_suppressed", 0))
q_hits = int(pairs.get("quarantine_hits", 0))
dns_attempts = int(pairs.get("dns_attempts", 0))
dns_timeout = int(pairs.get("dns_timeout", 0))
@@ -674,7 +676,8 @@ class DashboardController:
text = (
f"Resolve: ips={unique_ips} (direct={direct_ips}, wildcard={wildcard_ips}, "
f"+recheck_ips={r_recovered_ips}) | unresolved={unresolved} | "
f"+recheck_ips={r_recovered_ips}) | unresolved={unresolved} "
f"(live={unresolved_live}, suppressed={unresolved_suppressed}) | "
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} | attempts={dns_attempts}"
)
recheck_text = (