From f458bd8433b3dddd81376db7f5760202f09581fa Mon Sep 17 00:00:00 2001 From: beckline Date: Wed, 25 Feb 2026 12:15:11 +0300 Subject: [PATCH] resolver: auto-tune nx-heavy batch share by run health --- selective-vpn-api/app/resolver.go | 157 +++++++++++++++++++++- selective-vpn-gui/dashboard_controller.py | 3 +- 2 files changed, 152 insertions(+), 8 deletions(-) diff --git a/selective-vpn-api/app/resolver.go b/selective-vpn-api/app/resolver.go index 3a0d3b3..c2581f7 100644 --- a/selective-vpn-api/app/resolver.go +++ b/selective-vpn-api/app/resolver.go @@ -218,10 +218,13 @@ type resolverLiveBatchStats struct { P1 int P2 int P3 int + NXHeavyPct int NXHeavyTotal int NXHeavySkip int NextTarget int NextReason string + NextNXPct int + NextNXReason string DNSAttempts int DNSTimeout int DNSCoolSkips int @@ -413,13 +416,28 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul liveBatchDefault = liveBatchMax } liveBatchTarget := loadResolverLiveBatchTarget(precheckStatePath, liveBatchDefault, liveBatchMin, liveBatchMax) - liveBatchNXHeavyPct := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10) - if liveBatchNXHeavyPct < 0 { - liveBatchNXHeavyPct = 0 + liveBatchNXHeavyMin := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MIN_PCT", 5) + liveBatchNXHeavyMax := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_MAX_PCT", 35) + liveBatchNXHeavyDefault := envInt("RESOLVE_LIVE_BATCH_NX_HEAVY_PCT", 10) + if liveBatchNXHeavyMin < 0 { + liveBatchNXHeavyMin = 0 } - if liveBatchNXHeavyPct > 100 { - liveBatchNXHeavyPct = 100 + if liveBatchNXHeavyMin > 100 { + liveBatchNXHeavyMin = 100 } + if liveBatchNXHeavyMax < liveBatchNXHeavyMin { + liveBatchNXHeavyMax = liveBatchNXHeavyMin + } + if liveBatchNXHeavyMax > 100 { + liveBatchNXHeavyMax = 100 + } + if liveBatchNXHeavyDefault < liveBatchNXHeavyMin { + liveBatchNXHeavyDefault = liveBatchNXHeavyMin + } + if liveBatchNXHeavyDefault > liveBatchNXHeavyMax { + liveBatchNXHeavyDefault = liveBatchNXHeavyMax + } + liveBatchNXHeavyPct := loadResolverLiveBatchNXHeavyPct(precheckStatePath, liveBatchNXHeavyDefault, liveBatchNXHeavyMin, liveBatchNXHeavyMax) precheckEnvForced := resolvePrecheckForceEnvEnabled() precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath) precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec)) @@ -484,7 +502,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul wildcardPolicy := wildcardDNSAttemptPolicy(1) cEnabled, cMin, cRate, cStreak, cBan, cMaxBan := cooldown.configSnapshot() logf( - "resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t", + "resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t nx_hard_quarantine=%t cooldown_enabled=%t cooldown_min_attempts=%d cooldown_timeout_rate=%d cooldown_fail_streak=%d cooldown_ban_sec=%d cooldown_max_ban_sec=%d live_batch_target=%d live_batch_min=%d live_batch_max=%d live_batch_nx_heavy_pct=%d live_batch_nx_heavy_min=%d live_batch_nx_heavy_max=%d stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t", directPolicy.TryLimit, directPolicy.DomainBudget.Milliseconds(), wildcardPolicy.TryLimit, @@ -501,6 +519,8 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul liveBatchMin, liveBatchMax, liveBatchNXHeavyPct, + liveBatchNXHeavyMin, + liveBatchNXHeavyMax, staleKeepSec, precheckEverySec, precheckMaxDomains, @@ -783,7 +803,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul dnsErrors := dnsStats.totalErrors() unresolvedSuppressed := cacheNegativeHits + quarantineHits + liveDeferred logf( - "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d", + "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d unresolved_live=%d unresolved_suppressed=%d live_batch_target=%d live_batch_deferred=%d live_batch_p1=%d live_batch_p2=%d live_batch_p3=%d live_batch_nxheavy_pct=%d live_batch_nxheavy_total=%d live_batch_nxheavy_skip=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_cooldown_skips=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d", len(domains), len(fresh), cacheNegativeHits, @@ -798,6 +818,7 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul liveP1, liveP2, liveP3, + liveBatchNXHeavyPct, liveNXHeavyTotal, liveNXHeavySkip, len(staticEntries), @@ -850,6 +871,27 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul } if precheckDue { nextTarget, nextReason := computeNextLiveBatchTarget(liveBatchTarget, liveBatchMin, liveBatchMax, dnsStats, liveDeferred) + nextNXPct, nextNXReason := computeNextLiveBatchNXHeavyPct( + liveBatchNXHeavyPct, + liveBatchNXHeavyMin, + liveBatchNXHeavyMax, + dnsStats, + resolvedNowDNS, + liveP3, + liveNXHeavyTotal, + liveNXHeavySkip, + ) + if logf != nil { + logf( + "resolve live-batch nxheavy: pct=%d next=%d reason=%s selected=%d total=%d skipped=%d", + liveBatchNXHeavyPct, + nextNXPct, + nextNXReason, + liveP3, + liveNXHeavyTotal, + liveNXHeavySkip, + ) + } saveResolverPrecheckState( precheckStatePath, now, @@ -861,10 +903,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul P1: liveP1, P2: liveP2, P3: liveP3, + NXHeavyPct: liveBatchNXHeavyPct, NXHeavyTotal: liveNXHeavyTotal, NXHeavySkip: liveNXHeavySkip, NextTarget: nextTarget, NextReason: nextReason, + NextNXPct: nextNXPct, + NextNXReason: nextNXReason, DNSAttempts: dnsStats.Attempts, DNSTimeout: dnsStats.Timeout, DNSCoolSkips: dnsStats.Skipped, @@ -2473,6 +2518,34 @@ func loadResolverLiveBatchTarget(path string, fallback, minV, maxV int) int { return v } +func loadResolverLiveBatchNXHeavyPct(path string, fallback, minV, maxV int) int { + if fallback < minV { + fallback = minV + } + if fallback > maxV { + fallback = maxV + } + m := loadJSONMap(path) + if len(m) == 0 { + return fallback + } + raw := m["live_batch_nxheavy_next_pct"] + if raw == nil { + raw = m["live_batch_nxheavy_pct"] + } + v, ok := parseAnyInt(raw) + if !ok { + return fallback + } + if v < minV { + v = minV + } + if v > maxV { + v = maxV + } + return v +} + func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, deferred int) (int, string) { if current < minV { current = minV @@ -2517,6 +2590,73 @@ func computeNextLiveBatchTarget(current, minV, maxV int, dnsStats dnsMetrics, de return next, reason } +func computeNextLiveBatchNXHeavyPct( + current, minV, maxV int, + dnsStats dnsMetrics, + resolvedNowDNS int, + liveP3 int, + liveNXHeavyTotal int, + liveNXHeavySkip int, +) (int, string) { + if current < minV { + current = minV + } + if current > maxV { + current = maxV + } + next := current + reason := "stable" + attempts := dnsStats.Attempts + timeoutRate := 0.0 + nxRate := 0.0 + okRate := 0.0 + if attempts > 0 { + timeoutRate = float64(dnsStats.Timeout) / float64(attempts) + nxRate = float64(dnsStats.NXDomain) / float64(attempts) + okRate = float64(dnsStats.OK) / float64(attempts) + } + nxSelectedRatio := 0.0 + if liveNXHeavyTotal > 0 { + nxSelectedRatio = float64(liveP3) / float64(liveNXHeavyTotal) + } + + switch { + case attempts == 0: + reason = "no_dns_attempts" + case timeoutRate >= 0.20 || dnsStats.Skipped > 0: + next = current - 3 + reason = "timeout_very_high_or_cooldown" + case timeoutRate >= 0.12: + next = current - 2 + reason = "timeout_high" + case dnsStats.OK == 0 && dnsStats.NXDomain > 0: + next = current - 2 + reason = "no_success_nx_only" + case nxRate >= 0.90 && resolvedNowDNS == 0: + next = current - 2 + reason = "nx_dominant_no_resolve" + case nxSelectedRatio >= 0.95 && resolvedNowDNS == 0: + next = current - 1 + reason = "nxheavy_selected_dominant" + case timeoutRate <= 0.02 && okRate >= 0.10 && liveNXHeavySkip > 0: + next = current + 2 + reason = "healthy_fast_reintroduce_nxheavy" + case timeoutRate <= 0.04 && resolvedNowDNS > 0 && liveNXHeavySkip > 0: + next = current + 1 + reason = "healthy_reintroduce_nxheavy" + } + if next < minV { + next = minV + } + if next > maxV { + next = maxV + } + if next == current && reason != "no_dns_attempts" { + reason = "stable" + } + return next, reason +} + func classifyLiveBatchHost( host string, cache domainCacheState, @@ -2701,8 +2841,11 @@ func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeout state["live_batch_p1"] = live.P1 state["live_batch_p2"] = live.P2 state["live_batch_p3"] = live.P3 + state["live_batch_nxheavy_pct"] = live.NXHeavyPct state["live_batch_nxheavy_total"] = live.NXHeavyTotal state["live_batch_nxheavy_skip"] = live.NXHeavySkip + state["live_batch_nxheavy_next_pct"] = live.NextNXPct + state["live_batch_nxheavy_next_reason"] = live.NextNXReason state["live_batch_next_target"] = live.NextTarget state["live_batch_next_reason"] = live.NextReason state["live_batch_dns_attempts"] = live.DNSAttempts diff --git a/selective-vpn-gui/dashboard_controller.py b/selective-vpn-gui/dashboard_controller.py index a8570c0..6c75a08 100644 --- a/selective-vpn-gui/dashboard_controller.py +++ b/selective-vpn-gui/dashboard_controller.py @@ -672,6 +672,7 @@ class DashboardController: live_batch_p1 = int(pairs.get("live_batch_p1", 0)) live_batch_p2 = int(pairs.get("live_batch_p2", 0)) live_batch_p3 = int(pairs.get("live_batch_p3", 0)) + live_batch_nxheavy_pct = int(pairs.get("live_batch_nxheavy_pct", 0)) live_batch_nxheavy_skip = int(pairs.get("live_batch_nxheavy_skip", 0)) r_checked = int(pairs.get("timeout_recheck_checked", 0)) @@ -688,7 +689,7 @@ class DashboardController: f"quarantine_hits={q_hits} | dns_timeout={dns_timeout} " f"| cooldown_skips={dns_cooldown_skips} | attempts={dns_attempts} " f"| live_batch={live_batch_target} deferred={live_batch_deferred} " - f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_skip={live_batch_nxheavy_skip})" + f"(p1={live_batch_p1}, p2={live_batch_p2}, p3={live_batch_p3}, nx_pct={live_batch_nxheavy_pct}, nx_skip={live_batch_nxheavy_skip})" ) recheck_text = ( f"Timeout recheck: checked={r_checked} recovered={r_recovered} "