diff --git a/selective-vpn-api/app/resolver.go b/selective-vpn-api/app/resolver.go index 9aead3b..0bb3107 100644 --- a/selective-vpn-api/app/resolver.go +++ b/selective-vpn-api/app/resolver.go @@ -136,11 +136,72 @@ func (m dnsMetrics) formatPerUpstream() string { return strings.Join(parts, "; ") } +func (m dnsMetrics) formatResolverHealth() string { + if len(m.PerUpstream) == 0 { + return "" + } + keys := make([]string, 0, len(m.PerUpstream)) + for k := range m.PerUpstream { + keys = append(keys, k) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, k := range keys { + v := m.PerUpstream[k] + if v == nil || v.Attempts <= 0 { + continue + } + okRate := float64(v.OK) / float64(v.Attempts) + timeoutRate := float64(v.Timeout) / float64(v.Attempts) + score := okRate*100.0 - timeoutRate*50.0 + state := "bad" + switch { + case score >= 70 && timeoutRate <= 0.05: + state = "good" + case score >= 35: + state = "degraded" + default: + state = "bad" + } + parts = append(parts, fmt.Sprintf("%s{score=%.1f state=%s attempts=%d ok=%d timeout=%d nxdomain=%d temporary=%d other=%d}", k, score, state, v.Attempts, v.OK, v.Timeout, v.NXDomain, v.Temporary, v.Other)) + } + return strings.Join(parts, "; ") +} + type wildcardMatcher struct { exact map[string]struct{} suffix []string } +type dnsAttemptPolicy struct { + TryLimit int + DomainBudget time.Duration + StopOnNX bool +} + +const ( + domainStateActive = "active" + domainStateStable = "stable" + domainStateSuspect = "suspect" + domainStateQuarantine = "quarantine" + domainStateHardQuar = "hard_quarantine" + domainScoreMin = -100 + domainScoreMax = 100 + defaultQuarantineTTL = 24 * 3600 + defaultHardQuarantineTT = 7 * 24 * 3600 +) + +type resolverTimeoutRecheckStats struct { + Checked int + Recovered int + RecoveredIPs int + StillTimeout int + NowNXDomain int + NowTemporary int + NowOther int + NoSignal int +} + // Empty by default: primary resolver pool comes from DNS upstream pool state. // Optional fallback list can still be provided via RESOLVE_DNS_FALLBACKS env. var resolverFallbackDNS []string @@ -265,6 +326,37 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul domainCache := loadDomainCacheState(opts.CachePath, logf) ptrCache := loadJSONMap(opts.PtrCachePath) now := int(time.Now().Unix()) + precheckEverySec := envInt("RESOLVE_PRECHECK_EVERY_SEC", 24*3600) + if precheckEverySec < 0 { + precheckEverySec = 0 + } + precheckMaxDomains := envInt("RESOLVE_PRECHECK_MAX_DOMAINS", 3000) + if precheckMaxDomains < 0 { + precheckMaxDomains = 0 + } + if precheckMaxDomains > 50000 { + precheckMaxDomains = 50000 + } + timeoutRecheckMax := envInt("RESOLVE_TIMEOUT_RECHECK_MAX", precheckMaxDomains) + if timeoutRecheckMax < 0 { + timeoutRecheckMax = 0 + } + if timeoutRecheckMax > 50000 { + timeoutRecheckMax = 50000 + } + precheckStatePath := opts.CachePath + ".precheck.json" + precheckLastRun := loadResolverPrecheckLastRun(precheckStatePath) + precheckEnvForced := resolvePrecheckForceEnvEnabled() + precheckFileForced := resolvePrecheckForceFileEnabled(precheckForcePath) + precheckDue := precheckEnvForced || precheckFileForced || (precheckEverySec > 0 && (precheckLastRun <= 0 || now-precheckLastRun >= precheckEverySec)) + precheckScheduled := 0 + staleKeepSec := envInt("RESOLVE_STALE_KEEP_SEC", 48*3600) + if staleKeepSec < 0 { + staleKeepSec = 0 + } + if staleKeepSec > 7*24*3600 { + staleKeepSec = 7 * 24 * 3600 + } negTTLNX := envInt("RESOLVE_NEGATIVE_TTL_NX", 6*3600) negTTLTimeout := envInt("RESOLVE_NEGATIVE_TTL_TIMEOUT", 15*60) negTTLTemporary := envInt("RESOLVE_NEGATIVE_TTL_TEMPORARY", 10*60) @@ -295,13 +387,46 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul return domainCacheSourceDirect } + timeoutRecheck := resolverTimeoutRecheckStats{} + if precheckDue && timeoutRecheckMax > 0 { + timeoutRecheck = runTimeoutQuarantineRecheck( + domains, + cfg, + metaSpecial, + wildcards, + dnsTimeout, + &domainCache, + cacheSourceForHost, + now, + timeoutRecheckMax, + workers, + ) + } + if logf != nil { logf("resolver start: domains=%d ttl=%ds workers=%d dns_timeout_ms=%d", len(domains), ttl, workers, dnsTimeoutMs) + directPolicy := directDNSAttemptPolicy(len(cfg.Default)) + wildcardPolicy := wildcardDNSAttemptPolicy(1) + logf( + "resolver policy: direct_try=%d direct_budget_ms=%d wildcard_try=%d wildcard_budget_ms=%d nx_early_stop=%t stale_keep_sec=%d precheck_every_sec=%d precheck_max=%d precheck_forced_env=%t precheck_forced_file=%t", + directPolicy.TryLimit, + directPolicy.DomainBudget.Milliseconds(), + wildcardPolicy.TryLimit, + wildcardPolicy.DomainBudget.Milliseconds(), + resolveNXEarlyStopEnabled(), + staleKeepSec, + precheckEverySec, + precheckMaxDomains, + precheckEnvForced, + precheckFileForced, + ) } start := time.Now() fresh := map[string][]string{} cacheNegativeHits := 0 + quarantineHits := 0 + staleHits := 0 var toResolve []string for _, d := range domains { source := cacheSourceForHost(d) @@ -312,7 +437,62 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul } continue } + // Quarantine has priority over negative TTL cache so 24h quarantine + // is not silently overridden by shorter negative cache windows. + if state, age, ok := domainCache.getQuarantine(d, source, now); ok { + kind, hasKind := domainCache.getLastErrorKind(d, source) + timeoutKind := hasKind && kind == dnsErrorTimeout + if precheckDue && precheckScheduled < precheckMaxDomains { + // Timeout-based quarantine is rechecked in background batch and should + // not flood trace with per-domain debug lines. + if timeoutKind { + quarantineHits++ + if staleKeepSec > 0 { + if staleIPs, staleAge, ok := domainCache.getStale(d, source, now, staleKeepSec); ok { + staleHits++ + fresh[d] = staleIPs + if logf != nil { + logf("cache stale-keep (quarantine)[age=%ds]: %s -> %v", staleAge, d, staleIPs) + } + } + } + continue + } + precheckScheduled++ + toResolve = append(toResolve, d) + if logf != nil { + logf("precheck schedule[quarantine/%s age=%ds]: %s (%s)", state, age, d, source) + } + continue + } + quarantineHits++ + if logf != nil { + logf("cache quarantine hit[%s age=%ds]: %s (%s)", state, age, d, source) + } + if staleKeepSec > 0 { + if staleIPs, staleAge, ok := domainCache.getStale(d, source, now, staleKeepSec); ok { + staleHits++ + fresh[d] = staleIPs + if logf != nil { + logf("cache stale-keep (quarantine)[age=%ds]: %s -> %v", staleAge, d, staleIPs) + } + } + } + continue + } if kind, age, ok := domainCache.getNegative(d, source, now, negTTLNX, negTTLTimeout, negTTLTemporary, negTTLOther); ok { + if precheckDue && precheckScheduled < precheckMaxDomains { + if kind == dnsErrorTimeout { + cacheNegativeHits++ + continue + } + precheckScheduled++ + toResolve = append(toResolve, d) + if logf != nil { + logf("precheck schedule[negative/%s age=%ds]: %s (%s)", kind, age, d, source) + } + continue + } cacheNegativeHits++ if logf != nil { logf("cache neg hit[%s/%s age=%ds]: %s", source, kind, age, d) @@ -328,10 +508,13 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul } if logf != nil { - logf("resolve: domains=%d cache_hits=%d cache_neg_hits=%d to_resolve=%d", len(domains), len(fresh), cacheNegativeHits, len(toResolve)) + logf("resolve: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d precheck_due=%t precheck_scheduled=%d to_resolve=%d", len(domains), len(fresh), cacheNegativeHits, quarantineHits, staleHits, precheckDue, precheckScheduled, len(toResolve)) } dnsStats := dnsMetrics{} + resolvedNowDNS := 0 + resolvedNowStale := 0 + unresolvedAfterAttempts := 0 if len(toResolve) > 0 { type job struct { @@ -369,20 +552,36 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul } if len(r.ips) > 0 { resolved[r.host] = r.ips + resolvedNowDNS++ source := cacheSourceForHost(r.host) domainCache.set(r.host, source, r.ips, now) if logf != nil { logf("%s -> %v", r.host, r.ips) } } else { + staleApplied := false if hostErrors > 0 { source := cacheSourceForHost(r.host) - if kind, ok := classifyHostErrorKind(r.stats); ok { - domainCache.setError(r.host, source, kind, now) + domainCache.setErrorWithStats(r.host, source, r.stats, now) + if staleKeepSec > 0 && shouldUseStaleOnError(r.stats) { + if staleIPs, staleAge, ok := domainCache.getStale(r.host, source, now, staleKeepSec); ok { + staleHits++ + resolvedNowStale++ + staleApplied = true + resolved[r.host] = staleIPs + if logf != nil { + logf("cache stale-keep (error)[age=%ds]: %s -> %v", staleAge, r.host, staleIPs) + } + } } } + if !staleApplied { + unresolvedAfterAttempts++ + } if logf != nil { - logf("%s: no IPs", r.host) + if _, ok := resolved[r.host]; !ok { + logf("%s: no IPs", r.host) + } } } } @@ -476,10 +675,12 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul if logf != nil { dnsErrors := dnsStats.totalErrors() logf( - "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d resolved_now=%d unresolved=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d duration_ms=%d", + "resolve summary: domains=%d cache_hits=%d cache_neg_hits=%d quarantine_hits=%d stale_hits=%d resolved_now=%d unresolved=%d static_entries=%d static_skipped=%d unique_ips=%d direct_ips=%d wildcard_ips=%d ptr_lookups=%d ptr_errors=%d dns_attempts=%d dns_ok=%d dns_nxdomain=%d dns_timeout=%d dns_temporary=%d dns_other=%d dns_errors=%d timeout_recheck_checked=%d timeout_recheck_recovered=%d timeout_recheck_recovered_ips=%d timeout_recheck_still_timeout=%d timeout_recheck_now_nxdomain=%d timeout_recheck_now_temporary=%d timeout_recheck_now_other=%d timeout_recheck_no_signal=%d duration_ms=%d", len(domains), len(fresh), cacheNegativeHits, + quarantineHits, + staleHits, len(resolved)-len(fresh), len(domains)-len(resolved), len(staticEntries), @@ -496,11 +697,46 @@ func runResolverJob(opts ResolverOpts, logf func(string, ...any)) (resolverResul dnsStats.Temporary, dnsStats.Other, dnsErrors, + timeoutRecheck.Checked, + timeoutRecheck.Recovered, + timeoutRecheck.RecoveredIPs, + timeoutRecheck.StillTimeout, + timeoutRecheck.NowNXDomain, + timeoutRecheck.NowTemporary, + timeoutRecheck.NowOther, + timeoutRecheck.NoSignal, time.Since(start).Milliseconds(), ) if perUpstream := dnsStats.formatPerUpstream(); perUpstream != "" { logf("resolve dns upstreams: %s", perUpstream) } + if health := dnsStats.formatResolverHealth(); health != "" { + logf("resolve dns health: %s", health) + } + if stateSummary := domainCache.formatStateSummary(now); stateSummary != "" { + logf("resolve domain states: %s", stateSummary) + } + logf( + "resolve breakdown: resolved_now_total=%d resolved_now_dns=%d resolved_now_stale=%d skipped_neg=%d skipped_quarantine=%d unresolved_after_attempts=%d", + len(resolved)-len(fresh), + resolvedNowDNS, + resolvedNowStale, + cacheNegativeHits, + quarantineHits, + unresolvedAfterAttempts, + ) + if precheckDue { + logf("resolve precheck done: scheduled=%d state=%s", precheckScheduled, precheckStatePath) + } + } + if precheckDue { + saveResolverPrecheckState(precheckStatePath, now, timeoutRecheck) + } + if precheckFileForced { + _ = os.Remove(precheckForcePath) + if logf != nil { + logf("resolve precheck force-file consumed: %s", precheckForcePath) + } } return res, nil } @@ -534,7 +770,11 @@ func resolveHostGo(host string, cfg dnsConfig, metaSpecial []string, wildcards w primaryViaSmartDNS = true } } - ips, stats := digA(host, dnsList, timeout, logf) + policy := directDNSAttemptPolicy(len(dnsList)) + if primaryViaSmartDNS { + policy = wildcardDNSAttemptPolicy(len(dnsList)) + } + ips, stats := digAWithPolicy(host, dnsList, timeout, logf, policy) if len(ips) == 0 && !primaryViaSmartDNS && cfg.SmartDNS != "" && @@ -551,7 +791,8 @@ func resolveHostGo(host string, cfg dnsConfig, metaSpecial []string, wildcards w stats.Other, ) } - fallbackIPs, fallbackStats := digA(host, []string{cfg.SmartDNS}, timeout, logf) + fallbackPolicy := wildcardDNSAttemptPolicy(1) + fallbackIPs, fallbackStats := digAWithPolicy(host, []string{cfg.SmartDNS}, timeout, logf, fallbackPolicy) stats.merge(fallbackStats) if len(fallbackIPs) > 0 { ips = fallbackIPs @@ -620,26 +861,168 @@ func classifyHostErrorKind(stats dnsMetrics) (dnsErrorKind, bool) { return "", false } +func shouldUseStaleOnError(stats dnsMetrics) bool { + if stats.OK > 0 { + return false + } + return stats.Timeout > 0 || stats.Temporary > 0 || stats.Other > 0 +} + +func runTimeoutQuarantineRecheck( + domains []string, + cfg dnsConfig, + metaSpecial []string, + wildcards wildcardMatcher, + timeout time.Duration, + domainCache *domainCacheState, + cacheSourceForHost func(string) domainCacheSource, + now int, + limit int, + workers int, +) resolverTimeoutRecheckStats { + stats := resolverTimeoutRecheckStats{} + if limit <= 0 || now <= 0 { + return stats + } + if workers < 1 { + workers = 1 + } + if workers > 200 { + workers = 200 + } + seen := map[string]struct{}{} + capHint := len(domains) + if capHint > limit { + capHint = limit + } + candidates := make([]string, 0, capHint) + for _, raw := range domains { + host := strings.TrimSpace(strings.ToLower(raw)) + if host == "" { + continue + } + if _, ok := seen[host]; ok { + continue + } + seen[host] = struct{}{} + source := cacheSourceForHost(host) + if _, _, ok := domainCache.getQuarantine(host, source, now); !ok { + continue + } + kind, ok := domainCache.getLastErrorKind(host, source) + if !ok || kind != dnsErrorTimeout { + continue + } + candidates = append(candidates, host) + if len(candidates) >= limit { + break + } + } + if len(candidates) == 0 { + return stats + } + recoveredIPSet := map[string]struct{}{} + + type result struct { + host string + source domainCacheSource + ips []string + dns dnsMetrics + } + jobs := make(chan string, len(candidates)) + results := make(chan result, len(candidates)) + for i := 0; i < workers; i++ { + go func() { + for host := range jobs { + src := cacheSourceForHost(host) + ips, dnsStats := resolveHostGo(host, cfg, metaSpecial, wildcards, timeout, nil) + results <- result{host: host, source: src, ips: ips, dns: dnsStats} + } + }() + } + for _, host := range candidates { + jobs <- host + } + close(jobs) + + for i := 0; i < len(candidates); i++ { + r := <-results + stats.Checked++ + if len(r.ips) > 0 { + for _, ip := range r.ips { + ip = strings.TrimSpace(ip) + if ip == "" { + continue + } + recoveredIPSet[ip] = struct{}{} + } + domainCache.set(r.host, r.source, r.ips, now) + stats.Recovered++ + continue + } + if r.dns.totalErrors() > 0 { + domainCache.setErrorWithStats(r.host, r.source, r.dns, now) + } + kind, ok := classifyHostErrorKind(r.dns) + if !ok { + stats.NoSignal++ + continue + } + switch kind { + case dnsErrorTimeout: + stats.StillTimeout++ + case dnsErrorNXDomain: + stats.NowNXDomain++ + case dnsErrorTemporary: + stats.NowTemporary++ + default: + stats.NowOther++ + } + } + stats.RecoveredIPs = len(recoveredIPSet) + + return stats +} + // --------------------------------------------------------------------- // EN: `digA` contains core logic for dig a. // RU: `digA` - содержит основную логику для dig a. // --------------------------------------------------------------------- func digA(host string, dnsList []string, timeout time.Duration, logf func(string, ...any)) ([]string, dnsMetrics) { + return digAWithPolicy(host, dnsList, timeout, logf, defaultDNSAttemptPolicy(len(dnsList))) +} + +func digAWithPolicy(host string, dnsList []string, timeout time.Duration, logf func(string, ...any), policy dnsAttemptPolicy) ([]string, dnsMetrics) { stats := dnsMetrics{} if len(dnsList) == 0 { return nil, stats } - tryLimit := envInt("RESOLVE_DNS_TRY_LIMIT", 2) - if tryLimit < 1 { + tryLimit := policy.TryLimit + if tryLimit <= 0 { tryLimit = 1 } if tryLimit > len(dnsList) { tryLimit = len(dnsList) } + budget := policy.DomainBudget + if budget <= 0 { + budget = time.Duration(tryLimit) * timeout + } + if budget < 200*time.Millisecond { + budget = 200 * time.Millisecond + } + deadline := time.Now().Add(budget) start := pickDNSStartIndex(host, len(dnsList)) for attempt := 0; attempt < tryLimit; attempt++ { + remaining := time.Until(deadline) + if remaining <= 0 { + if logf != nil { + logf("dns budget exhausted %s: attempts=%d budget_ms=%d", host, attempt, budget.Milliseconds()) + } + break + } entry := dnsList[(start+attempt)%len(dnsList)] server, port := splitDNS(entry) if server == "" { @@ -656,14 +1039,27 @@ func digA(host string, dnsList []string, timeout time.Duration, logf func(string return d.DialContext(ctx, "udp", addr) }, } - ctx, cancel := context.WithTimeout(context.Background(), timeout) + perAttemptTimeout := timeout + if remaining < perAttemptTimeout { + perAttemptTimeout = remaining + } + if perAttemptTimeout < 100*time.Millisecond { + perAttemptTimeout = 100 * time.Millisecond + } + ctx, cancel := context.WithTimeout(context.Background(), perAttemptTimeout) records, err := r.LookupHost(ctx, host) cancel() if err != nil { kind := classifyDNSError(err) stats.addError(addr, kind) if logf != nil { - logf("dns warn %s via %s: kind=%s err=%v", host, addr, kind, err) + logf("dns warn %s via %s: kind=%s attempt=%d/%d err=%v", host, addr, kind, attempt+1, tryLimit, err) + } + if policy.StopOnNX && kind == dnsErrorNXDomain { + if logf != nil { + logf("dns early-stop %s: nxdomain via %s (attempt=%d/%d)", host, addr, attempt+1, tryLimit) + } + break } continue } @@ -687,6 +1083,104 @@ func digA(host string, dnsList []string, timeout time.Duration, logf func(string return nil, stats } +func defaultDNSAttemptPolicy(dnsCount int) dnsAttemptPolicy { + tryLimit := envInt("RESOLVE_DNS_TRY_LIMIT", 2) + if tryLimit < 1 { + tryLimit = 1 + } + if dnsCount > 0 && tryLimit > dnsCount { + tryLimit = dnsCount + } + budgetMS := envInt("RESOLVE_DNS_DOMAIN_BUDGET_MS", 1200) + if budgetMS < 200 { + budgetMS = 200 + } + if budgetMS > 15000 { + budgetMS = 15000 + } + return dnsAttemptPolicy{ + TryLimit: tryLimit, + DomainBudget: time.Duration(budgetMS) * time.Millisecond, + StopOnNX: resolveNXEarlyStopEnabled(), + } +} + +func directDNSAttemptPolicy(dnsCount int) dnsAttemptPolicy { + tryLimit := envInt("RESOLVE_DIRECT_TRY_LIMIT", 2) + if tryLimit < 1 { + tryLimit = 1 + } + if tryLimit > 3 { + tryLimit = 3 + } + if dnsCount > 0 && tryLimit > dnsCount { + tryLimit = dnsCount + } + budgetMS := envInt("RESOLVE_DIRECT_BUDGET_MS", 1200) + if budgetMS < 200 { + budgetMS = 200 + } + if budgetMS > 15000 { + budgetMS = 15000 + } + return dnsAttemptPolicy{ + TryLimit: tryLimit, + DomainBudget: time.Duration(budgetMS) * time.Millisecond, + StopOnNX: resolveNXEarlyStopEnabled(), + } +} + +func wildcardDNSAttemptPolicy(dnsCount int) dnsAttemptPolicy { + tryLimit := envInt("RESOLVE_WILDCARD_TRY_LIMIT", 1) + if tryLimit < 1 { + tryLimit = 1 + } + if tryLimit > 2 { + tryLimit = 2 + } + if dnsCount > 0 && tryLimit > dnsCount { + tryLimit = dnsCount + } + budgetMS := envInt("RESOLVE_WILDCARD_BUDGET_MS", 1200) + if budgetMS < 200 { + budgetMS = 200 + } + if budgetMS > 15000 { + budgetMS = 15000 + } + return dnsAttemptPolicy{ + TryLimit: tryLimit, + DomainBudget: time.Duration(budgetMS) * time.Millisecond, + StopOnNX: resolveNXEarlyStopEnabled(), + } +} + +func resolveNXEarlyStopEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_NX_EARLY_STOP"))) { + case "0", "false", "no", "off": + return false + default: + return true + } +} + +func resolvePrecheckForceEnvEnabled() bool { + switch strings.ToLower(strings.TrimSpace(os.Getenv("RESOLVE_PRECHECK_FORCE"))) { + case "1", "true", "yes", "on": + return true + default: + return false + } +} + +func resolvePrecheckForceFileEnabled(path string) bool { + if strings.TrimSpace(path) == "" { + return false + } + _, err := os.Stat(path) + return err == nil +} + func classifyDNSError(err error) dnsErrorKind { if err == nil { return dnsErrorOther @@ -850,10 +1344,13 @@ const ( ) type domainCacheEntry struct { - IPs []string `json:"ips,omitempty"` - LastResolved int `json:"last_resolved,omitempty"` - LastErrorKind string `json:"last_error_kind,omitempty"` - LastErrorAt int `json:"last_error_at,omitempty"` + IPs []string `json:"ips,omitempty"` + LastResolved int `json:"last_resolved,omitempty"` + LastErrorKind string `json:"last_error_kind,omitempty"` + LastErrorAt int `json:"last_error_at,omitempty"` + Score int `json:"score,omitempty"` + State string `json:"state,omitempty"` + QuarantineUntil int `json:"quarantine_until,omitempty"` } type domainCacheRecord struct { @@ -868,7 +1365,7 @@ type domainCacheState struct { func newDomainCacheState() domainCacheState { return domainCacheState{ - Version: 3, + Version: 4, Domains: map[string]domainCacheRecord{}, } } @@ -920,8 +1417,17 @@ func normalizeDomainCacheEntry(in *domainCacheEntry) *domainCacheEntry { out.LastErrorKind = string(kind) out.LastErrorAt = in.LastErrorAt } + out.Score = clampDomainScore(in.Score) + if st := normalizeDomainState(in.State, out.Score); st != "" { + out.State = st + } + if in.QuarantineUntil > 0 { + out.QuarantineUntil = in.QuarantineUntil + } if out.LastResolved <= 0 && out.LastErrorAt <= 0 { - return nil + if out.Score == 0 && out.QuarantineUntil <= 0 { + return nil + } } return out } @@ -987,7 +1493,7 @@ func loadDomainCacheState(path string, logf func(string, ...any)) domainCacheSta var st domainCacheState if err := json.Unmarshal(data, &st); err == nil && st.Domains != nil { if st.Version <= 0 { - st.Version = 3 + st.Version = 4 } normalized := newDomainCacheState() for host, rec := range st.Domains { @@ -1101,6 +1607,76 @@ func (s domainCacheState) getNegative(domain string, source domainCacheSource, n return kind, age, true } +func (s domainCacheState) getStoredIPs(domain string, source domainCacheSource) []string { + rec, ok := s.Domains[strings.TrimSpace(strings.ToLower(domain))] + if !ok { + return nil + } + entry := getCacheEntryBySource(rec, source) + if entry == nil { + return nil + } + return normalizeCacheIPs(entry.IPs) +} + +func (s domainCacheState) getLastErrorKind(domain string, source domainCacheSource) (dnsErrorKind, bool) { + rec, ok := s.Domains[strings.TrimSpace(strings.ToLower(domain))] + if !ok { + return "", false + } + entry := getCacheEntryBySource(rec, source) + if entry == nil || entry.LastErrorAt <= 0 { + return "", false + } + return normalizeCacheErrorKind(entry.LastErrorKind) +} + +func (s domainCacheState) getQuarantine(domain string, source domainCacheSource, now int) (string, int, bool) { + rec, ok := s.Domains[strings.TrimSpace(strings.ToLower(domain))] + if !ok { + return "", 0, false + } + entry := getCacheEntryBySource(rec, source) + if entry == nil || entry.QuarantineUntil <= 0 { + return "", 0, false + } + if now >= entry.QuarantineUntil { + return "", 0, false + } + state := normalizeDomainState(entry.State, entry.Score) + if state == "" { + state = domainStateQuarantine + } + age := 0 + if entry.LastErrorAt > 0 { + age = now - entry.LastErrorAt + } + return state, age, true +} + +func (s domainCacheState) getStale(domain string, source domainCacheSource, now, maxAge int) ([]string, int, bool) { + if maxAge <= 0 { + return nil, 0, false + } + rec, ok := s.Domains[strings.TrimSpace(strings.ToLower(domain))] + if !ok { + return nil, 0, false + } + entry := getCacheEntryBySource(rec, source) + if entry == nil || entry.LastResolved <= 0 { + return nil, 0, false + } + age := now - entry.LastResolved + if age < 0 || age > maxAge { + return nil, 0, false + } + ips := normalizeCacheIPs(entry.IPs) + if len(ips) == 0 { + return nil, 0, false + } + return ips, age, true +} + func (s *domainCacheState) set(domain string, source domainCacheSource, ips []string, now int) { host := strings.TrimSpace(strings.ToLower(domain)) if host == "" || now <= 0 { @@ -1114,9 +1690,140 @@ func (s *domainCacheState) set(domain string, source domainCacheSource, ips []st s.Domains = map[string]domainCacheRecord{} } rec := s.Domains[host] + prev := getCacheEntryBySource(rec, source) + prevScore := 0 + if prev != nil { + prevScore = prev.Score + } entry := &domainCacheEntry{ - IPs: norm, - LastResolved: now, + IPs: norm, + LastResolved: now, + LastErrorKind: "", + LastErrorAt: 0, + Score: clampDomainScore(prevScore + envInt("RESOLVE_DOMAIN_SCORE_OK", 8)), + QuarantineUntil: 0, + } + entry.State = domainStateFromScore(entry.Score) + switch source { + case domainCacheSourceWildcard: + rec.Wildcard = entry + default: + rec.Direct = entry + } + s.Domains[host] = rec +} + +func getCacheEntryBySource(rec domainCacheRecord, source domainCacheSource) *domainCacheEntry { + switch source { + case domainCacheSourceWildcard: + return rec.Wildcard + default: + return rec.Direct + } +} + +func clampDomainScore(v int) int { + if v < domainScoreMin { + return domainScoreMin + } + if v > domainScoreMax { + return domainScoreMax + } + return v +} + +func domainStateFromScore(score int) string { + switch { + case score >= 20: + return domainStateActive + case score >= 5: + return domainStateStable + case score >= -10: + return domainStateSuspect + case score >= -30: + return domainStateQuarantine + default: + return domainStateHardQuar + } +} + +func normalizeDomainState(raw string, score int) string { + switch strings.TrimSpace(strings.ToLower(raw)) { + case domainStateActive: + return domainStateActive + case domainStateStable: + return domainStateStable + case domainStateSuspect: + return domainStateSuspect + case domainStateQuarantine: + return domainStateQuarantine + case domainStateHardQuar: + return domainStateHardQuar + default: + if score == 0 { + return "" + } + return domainStateFromScore(score) + } +} + +func domainScorePenalty(stats dnsMetrics) int { + if stats.NXDomain >= 2 { + return envInt("RESOLVE_DOMAIN_SCORE_NX_CONFIRMED", -15) + } + if stats.NXDomain > 0 { + return envInt("RESOLVE_DOMAIN_SCORE_NX_SINGLE", -7) + } + if stats.Timeout > 0 { + return envInt("RESOLVE_DOMAIN_SCORE_TIMEOUT", -3) + } + if stats.Temporary > 0 { + return envInt("RESOLVE_DOMAIN_SCORE_TEMPORARY", -2) + } + return envInt("RESOLVE_DOMAIN_SCORE_OTHER", -2) +} + +func (s *domainCacheState) setErrorWithStats(domain string, source domainCacheSource, stats dnsMetrics, now int) { + host := strings.TrimSpace(strings.ToLower(domain)) + if host == "" || now <= 0 { + return + } + kind, ok := classifyHostErrorKind(stats) + if !ok { + return + } + normKind, ok := normalizeCacheErrorKind(string(kind)) + if !ok { + return + } + penalty := domainScorePenalty(stats) + quarantineTTL := envInt("RESOLVE_QUARANTINE_TTL_SEC", defaultQuarantineTTL) + if quarantineTTL < 0 { + quarantineTTL = 0 + } + hardQuarantineTTL := envInt("RESOLVE_HARD_QUARANTINE_TTL_SEC", defaultHardQuarantineTT) + if hardQuarantineTTL < 0 { + hardQuarantineTTL = 0 + } + if s.Domains == nil { + s.Domains = map[string]domainCacheRecord{} + } + rec := s.Domains[host] + entry := getCacheEntryBySource(rec, source) + if entry == nil { + entry = &domainCacheEntry{} + } + entry.Score = clampDomainScore(entry.Score + penalty) + entry.State = domainStateFromScore(entry.Score) + entry.LastErrorKind = string(normKind) + entry.LastErrorAt = now + switch entry.State { + case domainStateHardQuar: + entry.QuarantineUntil = now + hardQuarantineTTL + case domainStateQuarantine: + entry.QuarantineUntil = now + quarantineTTL + default: + entry.QuarantineUntil = 0 } switch source { case domainCacheSourceWildcard: @@ -1127,39 +1834,9 @@ func (s *domainCacheState) set(domain string, source domainCacheSource, ips []st s.Domains[host] = rec } -func (s *domainCacheState) setError(domain string, source domainCacheSource, kind dnsErrorKind, now int) { - host := strings.TrimSpace(strings.ToLower(domain)) - if host == "" || now <= 0 { - return - } - normKind, ok := normalizeCacheErrorKind(string(kind)) - if !ok { - return - } - if s.Domains == nil { - s.Domains = map[string]domainCacheRecord{} - } - rec := s.Domains[host] - switch source { - case domainCacheSourceWildcard: - if rec.Wildcard == nil { - rec.Wildcard = &domainCacheEntry{} - } - rec.Wildcard.LastErrorKind = string(normKind) - rec.Wildcard.LastErrorAt = now - default: - if rec.Direct == nil { - rec.Direct = &domainCacheEntry{} - } - rec.Direct.LastErrorKind = string(normKind) - rec.Direct.LastErrorAt = now - } - s.Domains[host] = rec -} - func (s domainCacheState) toMap() map[string]any { out := map[string]any{ - "version": 3, + "version": 4, "domains": map[string]any{}, } domainsAny := out["domains"].(map[string]any) @@ -1181,6 +1858,15 @@ func (s domainCacheState) toMap() map[string]any { directOut["last_error_kind"] = string(kind) directOut["last_error_at"] = rec.Direct.LastErrorAt } + if rec.Direct.Score != 0 { + directOut["score"] = rec.Direct.Score + } + if st := normalizeDomainState(rec.Direct.State, rec.Direct.Score); st != "" { + directOut["state"] = st + } + if rec.Direct.QuarantineUntil > 0 { + directOut["quarantine_until"] = rec.Direct.QuarantineUntil + } if len(directOut) > 0 { recOut["direct"] = directOut } @@ -1195,6 +1881,15 @@ func (s domainCacheState) toMap() map[string]any { wildOut["last_error_kind"] = string(kind) wildOut["last_error_at"] = rec.Wildcard.LastErrorAt } + if rec.Wildcard.Score != 0 { + wildOut["score"] = rec.Wildcard.Score + } + if st := normalizeDomainState(rec.Wildcard.State, rec.Wildcard.Score); st != "" { + wildOut["state"] = st + } + if rec.Wildcard.QuarantineUntil > 0 { + wildOut["quarantine_until"] = rec.Wildcard.QuarantineUntil + } if len(wildOut) > 0 { recOut["wildcard"] = wildOut } @@ -1206,6 +1901,57 @@ func (s domainCacheState) toMap() map[string]any { return out } +func (s domainCacheState) formatStateSummary(now int) string { + type counters struct { + active int + stable int + suspect int + quarantine int + hardQuar int + } + add := func(c *counters, entry *domainCacheEntry) { + if entry == nil { + return + } + st := normalizeDomainState(entry.State, entry.Score) + if entry.QuarantineUntil > now { + // Keep hard quarantine state if explicitly marked, + // otherwise active quarantine bucket. + if st == domainStateHardQuar { + c.hardQuar++ + return + } + c.quarantine++ + return + } + switch st { + case domainStateActive: + c.active++ + case domainStateStable: + c.stable++ + case domainStateSuspect: + c.suspect++ + case domainStateQuarantine: + c.quarantine++ + case domainStateHardQuar: + c.hardQuar++ + } + } + var c counters + for _, rec := range s.Domains { + add(&c, rec.Direct) + add(&c, rec.Wildcard) + } + total := c.active + c.stable + c.suspect + c.quarantine + c.hardQuar + if total == 0 { + return "" + } + return fmt.Sprintf( + "active=%d stable=%d suspect=%d quarantine=%d hard_quarantine=%d total=%d", + c.active, c.stable, c.suspect, c.quarantine, c.hardQuar, total, + ) +} + func digPTR(ip, upstream string, timeout time.Duration, logf func(string, ...any)) ([]string, error) { server, port := splitDNS(upstream) if server == "" { @@ -1370,6 +2116,40 @@ func loadJSONMap(path string) map[string]any { return out } +func loadResolverPrecheckLastRun(path string) int { + m := loadJSONMap(path) + if len(m) == 0 { + return 0 + } + v, ok := parseAnyInt(m["last_run"]) + if !ok || v <= 0 { + return 0 + } + return v +} + +func saveResolverPrecheckState(path string, ts int, timeoutStats resolverTimeoutRecheckStats) { + if path == "" || ts <= 0 { + return + } + state := loadJSONMap(path) + if state == nil { + state = map[string]any{} + } + state["last_run"] = ts + state["timeout_recheck"] = map[string]any{ + "checked": timeoutStats.Checked, + "recovered": timeoutStats.Recovered, + "recovered_ips": timeoutStats.RecoveredIPs, + "still_timeout": timeoutStats.StillTimeout, + "now_nxdomain": timeoutStats.NowNXDomain, + "now_temporary": timeoutStats.NowTemporary, + "now_other": timeoutStats.NowOther, + "no_signal": timeoutStats.NoSignal, + } + saveJSON(state, path) +} + // --------------------------------------------------------------------- // EN: `saveJSON` saves json to persistent storage. // RU: `saveJSON` - сохраняет json в постоянное хранилище.