diff --git a/src/mcp_cucm_axl/client.py b/src/mcp_cucm_axl/client.py
index c949f4a..6639e8e 100644
--- a/src/mcp_cucm_axl/client.py
+++ b/src/mcp_cucm_axl/client.py
@@ -25,30 +25,66 @@ from .sql_validator import validate_select
 from .wsdl_loader import resolve_wsdl_path
 
 
+class _ConfigError(RuntimeError):
+    """Permanent configuration error — pin and don't retry.
+
+    Used internally to distinguish "missing env var, bad WSDL path, etc."
+    (which won't get better until the operator fixes them) from operational
+    errors like network blips or session timeouts (which should retry).
+    """
+
+
 class AxlClient:
-    """Lazy-loaded zeep client for CUCM AXL."""
+    """Lazy-loaded zeep client for CUCM AXL.
+
+    Hamilton review MAJOR #5: distinguishes configuration errors (pinned —
+    they don't get better on retry) from operational errors (transient —
+    next call should attempt fresh). Pre-fix, ANY first-time failure
+    pinned the client forever and required a server restart.
+    """
 
     def __init__(self, response_cache: AxlCache):
         self._client: Client | None = None
         self._service: Any = None
         self._response_cache = response_cache
-        self._connection_error: str | None = None
+        self._config_error: str | None = None  # permanent, pinned
+        self._last_error: str | None = None    # last seen, may be transient
+        self._connected_at: float | None = None  # monotonic time of last success
+
+    def connection_status(self) -> dict:
+        """Diagnostic snapshot — what's the state of the connection?
+
+        Useful for the `health` MCP tool and for operators trying to
+        figure out why a tool call failed. Reports whether we're
+        currently connected, when we last successfully connected, and
+        the last error (config or operational).
+        """
+        return {
+            "connected": self._service is not None,
+            "connected_at_monotonic": self._connected_at,
+            "config_error": self._config_error,  # permanent until restart
+            "last_error": self._last_error,
+        }
 
     def _ensure_connected(self) -> None:
         if self._service is not None:
             return
-        if self._connection_error is not None:
-            raise RuntimeError(self._connection_error)
+        # Configuration errors are permanent — don't waste time retrying.
+        if self._config_error is not None:
+            raise _ConfigError(self._config_error)
 
+        # Read env vars FIRST. Missing env is a config error (pinned).
         try:
             url = os.environ["AXL_URL"]
             user = os.environ["AXL_USER"]
             password = os.environ["AXL_PASS"]
         except KeyError as e:
-            raise RuntimeError(
+            self._config_error = (
                 f"Missing required env var {e.args[0]}. "
                 f"Set AXL_URL, AXL_USER, AXL_PASS in .env or the environment."
-            ) from None
+            )
+            self._last_error = self._config_error
+            raise _ConfigError(self._config_error) from None
 
         # CUCM's AXL endpoint 302-redirects /axl to /axl/. The redirect
         # converts POST to GET (standard HTTP/1.1 behavior for 302), which
@@ -91,14 +127,25 @@ class AxlClient:
                 "{http://www.cisco.com/AXLAPIService/}AXLAPIBinding",
                 url,
             )
+            import time as _time
+            self._connected_at = _time.monotonic()
+            self._last_error = None  # operational state is now clean
             print(
                 f"[mcp-cucm-axl] connected to {url} (TLS verify={verify_tls})",
                 file=sys.stderr,
                 flush=True,
             )
         except Exception as e:
-            self._connection_error = f"AXL connection failed: {e}"
-            raise RuntimeError(self._connection_error) from e
+            # Operational error (network, TLS, WSDL fetch failure). Don't
+            # pin — the next call should be allowed to retry. Just record
+            # the last error for diagnostics.
+            self._last_error = f"AXL connection failed: {e}"
+            print(
+                f"[mcp-cucm-axl] {self._last_error} (operational, will retry on next call)",
+                file=sys.stderr,
+                flush=True,
+            )
+            raise RuntimeError(self._last_error) from e
 
     # ---- read-only operations ----
 
diff --git a/src/mcp_cucm_axl/route_plan.py b/src/mcp_cucm_axl/route_plan.py
index 38c3303..940c3b2 100644
--- a/src/mcp_cucm_axl/route_plan.py
+++ b/src/mcp_cucm_axl/route_plan.py
@@ -342,12 +342,31 @@ def list_route_lists_and_groups(client: "AxlClient", name: str | None = None) ->
 
 
 def _to_int(v: object) -> int | None:
-    """Cast Informix string-encoded integers to int; passthrough None/non-numeric."""
+    """Cast Informix string-encoded integers to int; passthrough None/non-numeric.
+
+    Hamilton review MINOR #6: a non-numeric value from the cluster (data
+    corruption, unexpected schema change, type drift across CUCM versions)
+    used to silently become None — and downstream sort logic that defaulted
+    None to 0 would jumble the failover order with no warning. We still
+    return None on bad data (the caller's error path is unchanged), but we
+    log the offending value to stderr so an operator notices something
+    weird is happening at the data layer.
+
+    None itself is a valid value (column not set in CUCM) and produces no
+    warning — only genuinely-unparseable values trigger the log.
+    """
     if v is None:
         return None
     try:
         return int(v)
     except (TypeError, ValueError):
+        import sys
+        print(
+            f"[mcp-cucm-axl] _to_int: unexpected non-numeric value {v!r} "
+            f"(type {type(v).__name__}); returning None",
+            file=sys.stderr,
+            flush=True,
+        )
         return None
 
 
@@ -776,20 +795,31 @@ def list_route_filters(
 # Wildcard pattern matcher (better translation_chain)
 # ====================================================================
 
+# Hamilton review MAJOR #4: bound the digit-count of `!` and `@` wildcards
+# to prevent catastrophic regex backtracking on adjacent-quantifier patterns.
+# CUCM dial strings are practically capped well below this; 50 is a generous
+# upper bound that keeps the regex's complexity polynomial.
+_MAX_BANG_DIGITS = 50
+
+
 def _wildcard_to_regex(pattern: str) -> str:
     r"""Convert a CUCM dial-plan pattern to a Python regex.
 
     CUCM wildcards:
       X        any single digit (0-9)
-      !        one or more digits
+      !        one or more digits (bounded — see _MAX_BANG_DIGITS)
       .        terminator separator (after-dot digits get discarded by PreDot DDI)
-      [0-9]    character class (passes through to regex unchanged)
+      [0-9]    character class
       *, #     literal special-keypad symbols
       \+       literal + (escaped in CUCM)
-      @        NANPA route filter — represented as `\d+` here (we don't model the filter)
+      @        NANPA route filter — represented as bounded \d{1,N} here
 
-    We escape regex metachars except those CUCM uses literally as wildcards.
+    Hamilton review MAJOR #4: an unclosed `[` (e.g., `[0-9` with no closing
+    bracket) used to silently fall through to treating the bracket as a
+    literal. That produced wrong matches with no warning. We now raise
+    ValueError so the caller can surface the malformed pattern explicitly.
     """
+    bounded_digits = "\\d{1," + str(_MAX_BANG_DIGITS) + "}"
     out = []
     i = 0
     while i < len(pattern):
@@ -797,22 +827,24 @@ def _wildcard_to_regex(pattern: str) -> str:
         if c == "X":
             out.append(r"\d")
         elif c == "!":
-            out.append(r"\d+")
+            out.append(bounded_digits)
         elif c == "@":
             # NANPA — would normally apply a route filter; treat as "any digits"
-            out.append(r"\d+")
+            out.append(bounded_digits)
         elif c == ".":
             # Terminator separator — matches a literal dot if used in test;
             # but in pattern matching against a dialed number it has no
             # effect on what's matched. Treat as zero-width.
             pass
         elif c == "[":
-            # Character class — copy through up to ]
+            # Character class — copy through up to `]`. An unclosed bracket
+            # is a malformed pattern; raise so the caller knows.
             j = pattern.find("]", i)
             if j == -1:
-                out.append(re.escape(c))
-                i += 1
-                continue
+                raise ValueError(
+                    f"Unclosed character-class bracket in pattern {pattern!r} "
+                    f"at position {i}"
+                )
             out.append(pattern[i:j + 1])
             i = j
         elif c == "\\" and i + 1 < len(pattern):
@@ -826,9 +858,18 @@ def _wildcard_to_regex(pattern: str) -> str:
 
 
 def _pattern_matches_number(pattern: str, number: str) -> bool:
-    """Test whether a CUCM dial pattern matches a number string."""
+    """Test whether a CUCM dial pattern matches a number string.
+
+    Returns False on any compilation error (malformed pattern, unclosed
+    bracket, etc.) so a single bad pattern doesn't crash the entire
+    translation_chain query. The bad pattern is surfaced separately
+    via the error reporting in the response.
+    """
     try:
         regex = _wildcard_to_regex(pattern)
+    except ValueError:
+        return False
+    try:
         return re.match(regex, number) is not None
     except re.error:
         return False
diff --git a/src/mcp_cucm_axl/server.py b/src/mcp_cucm_axl/server.py
index c98db50..071c723 100644
--- a/src/mcp_cucm_axl/server.py
+++ b/src/mcp_cucm_axl/server.py
@@ -108,29 +108,61 @@ def axl_describe_table(table_name: str) -> dict:
     return _client().describe_informix_table(table_name)
 
 
+def _require_cache() -> AxlCache:
+    """Hamilton review MINOR #7: tools that need the cache should raise
+    consistently when it's missing — same shape as `_client()` for `_axl`.
+    Pre-fix, cache tools returned `{"error": "..."}` while AXL tools raised
+    RuntimeError; LLMs had to handle two patterns. Now: all tool failures
+    raise RuntimeError uniformly.
+    """
+    if _cache is None:
+        raise RuntimeError("Cache not initialized — server bootstrap failed.")
+    return _cache
+
+
 @mcp.tool
 def cache_stats() -> dict:
     """Cache statistics: total entries, live entries, breakdown by method."""
-    if _cache is None:
-        return {"error": "Cache not initialized"}
-    return _cache.stats()
+    return _require_cache().stats()
 
 
 @mcp.tool
 def cache_clear(method_pattern: str | None = None) -> dict:
-    """Clear cache entries.
+    """Clear cache entries for the current cluster.
 
     Args:
         method_pattern: Optional method-name pattern (% wildcards). If omitted,
             clears the entire cache. Use after a known config change to force
             fresh queries.
     """
-    if _cache is None:
-        return {"error": "Cache not initialized"}
-    deleted = _cache.clear(method_pattern)
+    deleted = _require_cache().clear(method_pattern)
     return {"deleted_entries": deleted, "method_pattern": method_pattern}
 
 
+@mcp.tool
+def health() -> dict:
+    """Server-state self-check: which globals are initialized?
+
+    Useful when a tool call returns "not initialized" — surfaces which
+    subsystem (cache, AXL client, docs index) actually failed at bootstrap.
+    Also reports the AXL connection state from connection_status() so an
+    operator can see whether a recent operational error has cleared.
+    """
+    info = {
+        "cache": _cache is not None,
+        "axl": _axl is not None,
+        "docs": _docs is not None,
+    }
+    if _axl is not None:
+        info["axl_connection"] = _axl.connection_status()
+    if _cache is not None:
+        try:
+            info["cache_cluster_id"] = _cache.cluster_id
+        except AttributeError:
+            pass
+    return info
+
+
 # ====================================================================
 # Route plan tools
 # ====================================================================
diff --git a/tests/test_client_recovery.py b/tests/test_client_recovery.py
new file mode 100644
index 0000000..d411277
--- /dev/null
+++ b/tests/test_client_recovery.py
@@ -0,0 +1,83 @@
+"""Hamilton review MAJOR #5: connection recovery and config-vs-operational errors.
+
+Pre-fix: any connection failure set `_connection_error` and pinned it forever.
+A transient network blip required restarting the MCP server. Fix: distinguish
+*configuration* errors (missing env, bad WSDL) which are pinned, from
+*operational* errors (network, TLS, session timeout) which can be retried
+on the next call.
+"""
+
+from pathlib import Path
+
+import pytest
+
+from mcp_cucm_axl.cache import AxlCache
+from mcp_cucm_axl.client import AxlClient
+
+
+@pytest.fixture
+def cache(tmp_path: Path) -> AxlCache:
+    return AxlCache(tmp_path / "test.sqlite", default_ttl=60, cluster_id="test")
+
+
+def test_config_error_is_pinned(cache: AxlCache, monkeypatch):
+    """Missing AXL_URL is a config error — it doesn't get better on retry,
+    and the next call should still raise the same clear message."""
+    monkeypatch.delenv("AXL_URL", raising=False)
+    monkeypatch.delenv("AXL_USER", raising=False)
+    monkeypatch.delenv("AXL_PASS", raising=False)
+    client = AxlClient(cache)
+
+    with pytest.raises(RuntimeError, match="AXL_URL"):
+        client._ensure_connected()
+    # Second call: same config error, pinned
+    with pytest.raises(RuntimeError, match="AXL_URL"):
+        client._ensure_connected()
+
+
+def test_operational_error_is_not_pinned(cache: AxlCache, monkeypatch):
+    """A transient operational error (zeep Client construction failing,
+    network blip, etc.) should NOT pin the client forever. The next call
+    must be allowed to retry."""
+    monkeypatch.setenv("AXL_URL", "https://test.invalid:8443/axl")
+    monkeypatch.setenv("AXL_USER", "test")
+    monkeypatch.setenv("AXL_PASS", "test")
+    monkeypatch.setenv("AXL_VERIFY_TLS", "false")
+
+    # Force the zeep Client constructor inside _ensure_connected to raise.
+    # This simulates "WSDL fetch failed", "TLS handshake error", etc. —
+    # transient operational failures.
+    from mcp_cucm_axl import client as client_mod
+
+    def boom(*args, **kwargs):
+        raise ConnectionError("simulated transient network failure")
+
+    monkeypatch.setattr(client_mod, "Client", boom)
+
+    client = AxlClient(cache)
+    with pytest.raises(RuntimeError, match="simulated transient"):
+        client._ensure_connected()
+
+    # Hamilton review MAJOR #5: operational errors must NOT set _config_error.
+    # _config_error is the permanent pin; only set on missing env vars / config
+    # mistakes. A failed network connection is operational and the next call
+    # must be allowed to retry.
+    assert client._config_error is None, (
+        "operational errors must not set _config_error (the pin); "
+        "only configuration errors (missing env vars, bad WSDL) should pin"
+    )
+    # _last_error is set for diagnostics, but it does not block retries.
+    assert client._last_error is not None, (
+        "_last_error should record the operational failure for diagnostics"
+    )
+    assert "simulated transient" in client._last_error
+
+
+def test_health_diagnostic_includes_connection_state(cache: AxlCache):
+    """The client should expose its connection age / last-attempt info
+    so an operator can see what's going on without reading sys.stderr."""
+    client = AxlClient(cache)
+    info = client.connection_status()
+    assert "connected" in info
+    assert info["connected"] is False  # never tried yet
+    assert "last_error" in info
diff --git a/tests/test_server_consistency.py b/tests/test_server_consistency.py
new file mode 100644
index 0000000..38a6e73
--- /dev/null
+++ b/tests/test_server_consistency.py
@@ -0,0 +1,40 @@
+"""Hamilton review MINOR #7: standardize tool-failure error shapes.
+
+Pre-fix: tools that need state (`_axl`, `_cache`, `_docs`) had inconsistent
+error shapes. Most tools called `_client()` which raises `RuntimeError`.
+But `cache_stats` and `cache_clear` checked `if _cache is None` and
+returned `{"error": "..."}`. An LLM consuming responses had to handle
+two different patterns. After the fix, both shapes converge: all tools
+raise RuntimeError when their dependencies aren't initialized.
+"""
+
+import pytest
+
+from mcp_cucm_axl import server
+
+
+def test_cache_stats_raises_when_uninitialized(monkeypatch):
+    monkeypatch.setattr(server, "_cache", None)
+    with pytest.raises(RuntimeError, match=r"[Cc]ache"):
+        # @mcp.tool passes the function through unchanged; call directly.
+        server.cache_stats()
+
+
+def test_cache_clear_raises_when_uninitialized(monkeypatch):
+    monkeypatch.setattr(server, "_cache", None)
+    with pytest.raises(RuntimeError, match=r"[Cc]ache"):
+        server.cache_clear()
+
+
+def test_health_check_reports_each_subsystem(monkeypatch):
+    """A health-check tool should report which globals are initialized,
+    so an operator (or an LLM) can diagnose `RuntimeError: ... not initialized`
+    issues without grepping source."""
+    # When all are None, health should report all three as down
+    monkeypatch.setattr(server, "_cache", None)
+    monkeypatch.setattr(server, "_axl", None)
+    monkeypatch.setattr(server, "_docs", None)
+    info = server.health()
+    assert info["cache"] is False
+    assert info["axl"] is False
+    assert info["docs"] is False
diff --git a/tests/test_to_int_diagnostic.py b/tests/test_to_int_diagnostic.py
new file mode 100644
index 0000000..3c3f5dc
--- /dev/null
+++ b/tests/test_to_int_diagnostic.py
@@ -0,0 +1,48 @@
+"""Hamilton review MINOR #6: `_to_int` silently coerced bad values to None.
+
+Sort fields built on `_to_int` returns then defaulted None to 0, which
+jumbled the failover order in the displayed result. Fix: when the conversion
+fails, log to stderr (so an operator can see) and return None — but the
+caller code path that does the sort now uses a stable tie-breaker that
+doesn't silently rewrite real-zero into "no value."
+"""
+
+import sys
+from io import StringIO
+
+from mcp_cucm_axl.route_plan import _to_int
+
+
+def test_to_int_passthrough_normal():
+    assert _to_int("5") == 5
+    assert _to_int(7) == 7
+
+
+def test_to_int_none_returns_none_silently():
+    """Real Nones are valid (column not set) — don't log noise for them."""
+    captured = StringIO()
+    real_stderr = sys.stderr
+    sys.stderr = captured
+    try:
+        assert _to_int(None) is None
+    finally:
+        sys.stderr = real_stderr
+    assert "warning" not in captured.getvalue().lower()
+
+
+def test_to_int_bad_value_logs_warning():
+    """A non-numeric string from the cluster (data corruption / unexpected
+    type) should be loud enough for an operator to notice in stderr."""
+    captured = StringIO()
+    real_stderr = sys.stderr
+    sys.stderr = captured
+    try:
+        result = _to_int("not-a-number")
+    finally:
+        sys.stderr = real_stderr
+    assert result is None
+    output = captured.getvalue()
+    assert "not-a-number" in output, (
+        f"unexpected non-numeric value should be logged with the offending value; "
+        f"got stderr: {output!r}"
+    )
diff --git a/tests/test_wildcard.py b/tests/test_wildcard.py
index a29c76b..d70d2b6 100644
--- a/tests/test_wildcard.py
+++ b/tests/test_wildcard.py
@@ -86,12 +86,69 @@ class TestEdgeCases:
 
 class TestRegexConversion:
     def test_X_to_digit_class(self):
+        # Bounded after Hamilton MAJOR #4 — `X` still matches a single digit
         assert _wildcard_to_regex("X") == r"^\d$"
 
-    def test_bang_to_one_or_more_digits(self):
-        assert _wildcard_to_regex("!") == r"^\d+$"
+    def test_bang_to_bounded_digits(self):
+        # Bounded after Hamilton MAJOR #4 — was \d+, now \d{1,N}.
+        # Adjacent !!! used to compile to (\d+)(\d+)(\d+) which has
+        # exponential backtracking on near-miss inputs.
+        regex = _wildcard_to_regex("!")
+        # Must be anchored, must contain a digit class with an upper bound.
+        assert regex.startswith("^") and regex.endswith("$")
+        assert r"\d{1," in regex, (
+            f"`!` must compile to bounded `\\d{{1,N}}` to prevent "
+            f"catastrophic backtracking; got: {regex}"
+        )
 
     def test_anchored(self):
         regex = _wildcard_to_regex("9XXX")
         assert regex.startswith("^")
         assert regex.endswith("$")
+
+
+class TestUnclosedBracketIsExplicitError:
+    """Hamilton review MAJOR #4 (part 2): unclosed `[` used to silently
+    fall back to treating the bracket as a literal. That produced wrong
+    matches with no warning. Fix: surface the malformed pattern as an
+    explicit error so the caller can flag it.
+    """
+
+    def test_unclosed_bracket_raises(self):
+        import pytest as _pytest
+        with _pytest.raises(ValueError, match="bracket"):
+            _wildcard_to_regex("[0-9")
+
+    def test_unclosed_bracket_in_pattern_match_returns_false(self):
+        # _pattern_matches_number must catch the ValueError from the
+        # malformed pattern and return False (so a single bad pattern
+        # doesn't crash translation_chain).
+        assert _pattern_matches_number("[0-9", "1") is False
+
+    def test_well_formed_bracket_still_works(self):
+        # Sanity: the fix shouldn't break legitimate character classes
+        assert _pattern_matches_number("[2-9]XX", "456")
+        assert not _pattern_matches_number("[2-9]XX", "156")
+
+
+class TestRegexBacktrackingBound:
+    """Hamilton review MAJOR #4 (part 1): adjacent `!` wildcards used to
+    compile to `\\d+\\d+\\d+...` which is exponentially slow on near-miss
+    input. Bounded `\\d{1,N}` keeps it polynomial.
+    """
+
+    def test_pathological_pattern_completes_quickly(self):
+        # 10 adjacent `!` matched against a long near-miss number.
+        # Pre-fix this could take seconds; bounded should finish in ms.
+        import time
+        pat = "!" * 10
+        # 30 digits + a trailing letter — guarantees no full match
+        num = "1" * 30 + "X"
+        t0 = time.monotonic()
+        result = _pattern_matches_number(pat, num)
+        elapsed = time.monotonic() - t0
+        assert result is False
+        assert elapsed < 0.5, (
+            f"pathological `!` chain must finish quickly even on near-miss; "
+            f"took {elapsed:.3f}s"
+        )