diff --git a/PLAN_DHSQ_GLOBAL.md b/PLAN_DHSQ_GLOBAL.md index 7017711..070c2d2 100644 --- a/PLAN_DHSQ_GLOBAL.md +++ b/PLAN_DHSQ_GLOBAL.md @@ -6,7 +6,7 @@ - [x] ~~3. Add stale-keep policy.~~ - [x] ~~4. Wire 24h precheck cycle (soft pruning only).~~ - [x] ~~5. Expose metrics/log clarity in API + GUI (API/trace done; DNS benchmark load-profile UI done; route badges done).~~ -- [ ] 6. Tune thresholds with production data (pass-1 done: timeout-only now stays suspect/no quarantine by default). +- [ ] 6. Tune thresholds with production data (pass-1 done: timeout-only stays suspect; pass-2 done: NX hard-quarantine disabled by default). ## 1) Goal - Stabilize resolver behavior under high domain volume. diff --git a/selective-vpn-api/app/resolver.go b/selective-vpn-api/app/resolver.go index 4ba02a1..3c75996 100644 --- a/selective-vpn-api/app/resolver.go +++ b/selective-vpn-api/app/resolver.go @@ -408,12 +408,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul directPolicy := directDNSAttemptPolicy(len(cfg.Default)) wildcardPolicy := wildcardDNSAttemptPolicy(1) logf( - "resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t", + "resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t", directPolicy.TryLimit, directPolicy.DomainBudget.Milliseconds(), wildcardPolicy.TryLimit, wildcardPolicy.DomainBudget.Milliseconds(), resolveNXEarlyStopEnabled(), + resolveNXHardQuarantineEnabled(), staleKeepSec, precheckEverySec, precheckMaxDomains, @@ -675,7 +676,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul if logf != nil { dnsErrors := dnsStats.totalErrors() logf( - "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d", + "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d", len(domains), len(fresh), cacheNegativeHits, @@ -683,6 +684,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul staleHits, len(resolved)-len(fresh), len(domains)-len(resolved), + unresolvedAfterAttempts, + cacheNegativeHits+quarantineHits, len(staticEntries), staticSkipped, len(res.IPs), @@ -1164,6 +1167,15 @@ func resolveNXEarlyStopEnabled() bool { } } +func resolveNXHardQuarantineEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_NX_HARD_QUARANTINE"))) { + case "1", "true", "yes", "on": + return true + default: + return false + } +} + func resolvePrecheckForceEnvEnabled() bool { switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_PRECHECK_FORCE"))) { case "1", "true", "yes", "on": @@ -1825,6 +1837,14 @@ func (s *domainCacheState) setErrorWithStats(domain string, source domainCacheSo } entry.State = domainStateSuspect } + // NXDOMAIN-heavy synthetic subdomains create large false "hard quarantine" pools. + // By default, keep NX failures in regular quarantine (24h), not hard quarantine. + if normKind == dnsErrorNXDomain && !resolveNXHardQuarantineEnabled() && entry.State == domainStateHardQuar { + entry.State = domainStateQuarantine + if entry.Score < -30 { + entry.Score = -30 + } + } entry.LastErrorKind = string(normKind) entry.LastErrorAt = now switch entry.State { diff --git a/selective-vpn-gui/dashboard_controller.py b/selective-vpn-gui/dashboard_controller.py index 51aed40..4c58c8f 100644 --- a/selective-vpn-gui/dashboard_controller.py +++ b/selective-vpn-gui/dashboard_controller.py @@ -661,6 +661,8 @@ class DashboardController: direct_ips = int(pairs.get("direct_ips", 0)) wildcard_ips = int(pairs.get("wildcard_ips", 0)) unresolved = int(pairs.get("unresolved", 0)) + unresolved_live = int(pairs.get("unresolved_live", 0)) + unresolved_suppressed = int(pairs.get("unresolved_suppressed", 0)) q_hits = int(pairs.get("quarantine_hits", 0)) dns_attempts = int(pairs.get("dns_attempts", 0)) dns_timeout = int(pairs.get("dns_timeout", 0)) @@ -674,7 +676,8 @@ class DashboardController: text = ( f"Resolve: ips={unique_ips} (direct={direct_ips}, wildcard={wildcard_ips}, " - f"+recheck_ips={r_recovered_ips}) | unresolved={unresolved} | " + f"+recheck_ips={r_recovered_ips}) | unresolved={unresolved} " + f"(live={unresolved_live}, suppressed={unresolved_suppressed}) | " f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} | attempts={dns_attempts}" ) recheck_text = (