resolver: surface timeout-recheck stats and keep timeout-only domains suspect

This commit is contained in:
beckline
2026-02-25 09:39:53 +03:00
parent 90c4a73473
commit 5bd7f1c9f4
2 changed files with 11 additions and 1 deletions

View File

@@ -6,7 +6,7 @@
- [x] ~~3. Add stale-keep policy.~~ - [x] ~~3. Add stale-keep policy.~~
- [x] ~~4. Wire 24h precheck cycle (soft pruning only).~~ - [x] ~~4. Wire 24h precheck cycle (soft pruning only).~~
- [x] ~~5. Expose metrics/log clarity in API + GUI (API/trace done; DNS benchmark load-profile UI done; route badges done).~~ - [x] ~~5. Expose metrics/log clarity in API + GUI (API/trace done; DNS benchmark load-profile UI done; route badges done).~~
- [ ] 6. Tune thresholds with production data. - [ ] 6. Tune thresholds with production data (pass 1 done: timeout-only domains now stay suspect and are not quarantined by default).
## 1) Goal ## 1) Goal
- Stabilize resolver behavior under high domain volume. - Stabilize resolver behavior under high domain volume.

View File

@@ -1813,8 +1813,18 @@ func (s *domainCacheState) setErrorWithStats(domain string, source domainCacheSo
if entry == nil { if entry == nil {
entry = &domainCacheEntry{} entry = &domainCacheEntry{}
} }
prevKind, _ := normalizeCacheErrorKind(entry.LastErrorKind)
entry.Score = clampDomainScore(entry.Score + penalty) entry.Score = clampDomainScore(entry.Score + penalty)
entry.State = domainStateFromScore(entry.Score) entry.State = domainStateFromScore(entry.Score)
// Timeout-only failures are treated as transient transport noise by default.
// Keep the entry in the suspect bucket (no quarantine) unless the previously
// recorded error kind for this entry was NXDOMAIN.
if normKind == dnsErrorTimeout && prevKind != dnsErrorNXDomain {
if entry.Score < -10 {
entry.Score = -10
}
entry.State = domainStateSuspect
}
entry.LastErrorKind = string(normKind) entry.LastErrorKind = string(normKind)
entry.LastErrorAt = now entry.LastErrorAt = now
switch entry.State { switch entry.State {