resolver: split unresolved metrics and soften NX hard quarantine
This commit is contained in:
@@ -6,7 +6,7 @@
|
|||||||
- [x] ~~3. Add stale-keep policy.~~
|
- [x] ~~3. Add stale-keep policy.~~
|
||||||
- [x] ~~4. Wire 24h precheck cycle (soft pruning only).~~
|
- [x] ~~4. Wire 24h precheck cycle (soft pruning only).~~
|
||||||
- [x] ~~5. Expose metrics/log clarity in API + GUI (API/trace done; DNS benchmark load-profile UI done; route badges done).~~
|
- [x] ~~5. Expose metrics/log clarity in API + GUI (API/trace done; DNS benchmark load-profile UI done; route badges done).~~
|
||||||
- [ ] 6. Tune thresholds with production data (pass-1 done: timeout-only now stays suspect/no quarantine by default).
|
- [ ] 6. Tune thresholds with production data (pass-1 done: timeout-only stays suspect; pass-2 done: NX hard-quarantine disabled by default).
|
||||||
|
|
||||||
## 1) Goal
|
## 1) Goal
|
||||||
- Stabilize resolver behavior under high domain volume.
|
- Stabilize resolver behavior under high domain volume.
|
||||||
|
|||||||
@@ -408,12 +408,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
directPolicy := directDNSAttemptPolicy(len(cfg.Default))
|
directPolicy := directDNSAttemptPolicy(len(cfg.Default))
|
||||||
wildcardPolicy := wildcardDNSAttemptPolicy(1)
|
wildcardPolicy := wildcardDNSAttemptPolicy(1)
|
||||||
logf(
|
logf(
|
||||||
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
|
"resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t",
|
||||||
directPolicy.TryLimit,
|
directPolicy.TryLimit,
|
||||||
directPolicy.DomainBudget.Milliseconds(),
|
directPolicy.DomainBudget.Milliseconds(),
|
||||||
wildcardPolicy.TryLimit,
|
wildcardPolicy.TryLimit,
|
||||||
wildcardPolicy.DomainBudget.Milliseconds(),
|
wildcardPolicy.DomainBudget.Milliseconds(),
|
||||||
resolveNXEarlyStopEnabled(),
|
resolveNXEarlyStopEnabled(),
|
||||||
|
resolveNXHardQuarantineEnabled(),
|
||||||
staleKeepSec,
|
staleKeepSec,
|
||||||
precheckEverySec,
|
precheckEverySec,
|
||||||
precheckMaxDomains,
|
precheckMaxDomains,
|
||||||
@@ -675,7 +676,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
if logf != nil {
|
if logf != nil {
|
||||||
dnsErrors := dnsStats.totalErrors()
|
dnsErrors := dnsStats.totalErrors()
|
||||||
logf(
|
logf(
|
||||||
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
|
"resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d",
|
||||||
len(domains),
|
len(domains),
|
||||||
len(fresh),
|
len(fresh),
|
||||||
cacheNegativeHits,
|
cacheNegativeHits,
|
||||||
@@ -683,6 +684,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul
|
|||||||
staleHits,
|
staleHits,
|
||||||
len(resolved)-len(fresh),
|
len(resolved)-len(fresh),
|
||||||
len(domains)-len(resolved),
|
len(domains)-len(resolved),
|
||||||
|
unresolvedAfterAttempts,
|
||||||
|
cacheNegativeHits+quarantineHits,
|
||||||
len(staticEntries),
|
len(staticEntries),
|
||||||
staticSkipped,
|
staticSkipped,
|
||||||
len(res.IPs),
|
len(res.IPs),
|
||||||
@@ -1164,6 +1167,15 @@ func resolveNXEarlyStopEnabled() bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// resolveNXHardQuarantineEnabled reports whether the RESOLVE_NX_HARD_QUARANTINE
// environment variable is set to an affirmative value.
//
// The value is trimmed of surrounding whitespace and compared
// case-insensitively; "1", "true", "yes" and "on" enable the flag. An unset
// variable or any other value leaves it disabled, so the flag is opt-in.
func resolveNXHardQuarantineEnabled() bool {
	raw := os.Getenv("RESOLVE_NX_HARD_QUARANTINE")
	flag := strings.ToLower(strings.TrimSpace(raw))
	return flag == "1" || flag == "true" || flag == "yes" || flag == "on"
}
|
||||||
|
|
||||||
func resolvePrecheckForceEnvEnabled() bool {
|
func resolvePrecheckForceEnvEnabled() bool {
|
||||||
switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_PRECHECK_FORCE"))) {
|
switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_PRECHECK_FORCE"))) {
|
||||||
case "1", "true", "yes", "on":
|
case "1", "true", "yes", "on":
|
||||||
@@ -1825,6 +1837,14 @@ func (s *domainCacheState) setErrorWithStats(domain string, source domainCacheSo
|
|||||||
}
|
}
|
||||||
entry.State = domainStateSuspect
|
entry.State = domainStateSuspect
|
||||||
}
|
}
|
||||||
|
// NXDOMAIN-heavy synthetic subdomains create large false "hard quarantine" pools.
|
||||||
|
// By default, keep NX failures in regular quarantine (24h), not hard quarantine.
|
||||||
|
if normKind == dnsErrorNXDomain && !resolveNXHardQuarantineEnabled() && entry.State == domainStateHardQuar {
|
||||||
|
entry.State = domainStateQuarantine
|
||||||
|
if entry.Score < -30 {
|
||||||
|
entry.Score = -30
|
||||||
|
}
|
||||||
|
}
|
||||||
entry.LastErrorKind = string(normKind)
|
entry.LastErrorKind = string(normKind)
|
||||||
entry.LastErrorAt = now
|
entry.LastErrorAt = now
|
||||||
switch entry.State {
|
switch entry.State {
|
||||||
|
|||||||
@@ -661,6 +661,8 @@ class DashboardController:
|
|||||||
direct_ips = int(pairs.get("direct_ips", 0))
|
direct_ips = int(pairs.get("direct_ips", 0))
|
||||||
wildcard_ips = int(pairs.get("wildcard_ips", 0))
|
wildcard_ips = int(pairs.get("wildcard_ips", 0))
|
||||||
unresolved = int(pairs.get("unresolved", 0))
|
unresolved = int(pairs.get("unresolved", 0))
|
||||||
|
unresolved_live = int(pairs.get("unresolved_live", 0))
|
||||||
|
unresolved_suppressed = int(pairs.get("unresolved_suppressed", 0))
|
||||||
q_hits = int(pairs.get("quarantine_hits", 0))
|
q_hits = int(pairs.get("quarantine_hits", 0))
|
||||||
dns_attempts = int(pairs.get("dns_attempts", 0))
|
dns_attempts = int(pairs.get("dns_attempts", 0))
|
||||||
dns_timeout = int(pairs.get("dns_timeout", 0))
|
dns_timeout = int(pairs.get("dns_timeout", 0))
|
||||||
@@ -674,7 +676,8 @@ class DashboardController:
|
|||||||
|
|
||||||
text = (
|
text = (
|
||||||
f"Resolve: ips={unique_ips} (direct={direct_ips}, wildcard={wildcard_ips}, "
|
f"Resolve: ips={unique_ips} (direct={direct_ips}, wildcard={wildcard_ips}, "
|
||||||
f"+recheck_ips={r_recovered_ips}) | unresolved={unresolved} | "
|
f"+recheck_ips={r_recovered_ips}) | unresolved={unresolved} "
|
||||||
|
f"(live={unresolved_live}, suppressed={unresolved_suppressed}) | "
|
||||||
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} | attempts={dns_attempts}"
|
f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} | attempts={dns_attempts}"
|
||||||
)
|
)
|
||||||
recheck_text = (
|
recheck_text = (
|
||||||
|
|||||||
Reference in New Issue
Block a user