diff --git a/src/mcp_cucm_axl/client.py b/src/mcp_cucm_axl/client.py index c949f4a..6639e8e 100644 --- a/src/mcp_cucm_axl/client.py +++ b/src/mcp_cucm_axl/client.py @@ -25,30 +25,66 @@ from .sql_validator import validate_select from .wsdl_loader import resolve_wsdl_path +class _ConfigError(RuntimeError): + """Permanent configuration error — pin and don't retry. + + Used internally to distinguish "missing env var, bad WSDL path, etc." + (which won't get better until the operator fixes them) from operational + errors like network blips or session timeouts (which should retry). + """ + + class AxlClient: - """Lazy-loaded zeep client for CUCM AXL.""" + """Lazy-loaded zeep client for CUCM AXL. + + Hamilton review MAJOR #5: distinguishes configuration errors (pinned — + they don't get better on retry) from operational errors (transient — + next call should attempt fresh). Pre-fix, ANY first-time failure + pinned the client forever and required a server restart. + """ def __init__(self, response_cache: AxlCache): self._client: Client | None = None self._service: Any = None self._response_cache = response_cache - self._connection_error: str | None = None + self._config_error: str | None = None # permanent, pinned + self._last_error: str | None = None # last seen, may be transient + self._connected_at: float | None = None # monotonic time of last success + + def connection_status(self) -> dict: + """Diagnostic snapshot — what's the state of the connection? + + Useful for the `health` MCP tool and for operators trying to + figure out why a tool call failed. Reports whether we're + currently connected, when we last successfully connected, and + the last error (config or operational). + """ + return { + "connected": self._service is not None, + "connected_at_monotonic": self._connected_at, + "config_error": self._config_error, # permanent until restart + "last_error": self._last_error, + } def _ensure_connected(self) -> None: if self._service is not None: return - if self._connection_error is not None: - raise RuntimeError(self._connection_error) + # Configuration errors are permanent — don't waste time retrying. + if self._config_error is not None: + raise _ConfigError(self._config_error) + # Read env vars FIRST. Missing env is a config error (pinned). try: url = os.environ["AXL_URL"] user = os.environ["AXL_USER"] password = os.environ["AXL_PASS"] except KeyError as e: - raise RuntimeError( + self._config_error = ( f"Missing required env var {e.args[0]}. " f"Set AXL_URL, AXL_USER, AXL_PASS in .env or the environment." - ) from None + ) + self._last_error = self._config_error + raise _ConfigError(self._config_error) from None # CUCM's AXL endpoint 302-redirects /axl to /axl/. The redirect # converts POST to GET (standard HTTP/1.1 behavior for 302), which @@ -91,14 +127,25 @@ class AxlClient: "{http://www.cisco.com/AXLAPIService/}AXLAPIBinding", url, ) + import time as _time + self._connected_at = _time.monotonic() + self._last_error = None # operational state is now clean print( f"[mcp-cucm-axl] connected to {url} (TLS verify={verify_tls})", file=sys.stderr, flush=True, ) except Exception as e: - self._connection_error = f"AXL connection failed: {e}" - raise RuntimeError(self._connection_error) from e + # Operational error (network, TLS, WSDL fetch failure). Don't + # pin — the next call should be allowed to retry. Just record + # the last error for diagnostics. + self._last_error = f"AXL connection failed: {e}" + print( + f"[mcp-cucm-axl] {self._last_error} (operational, will retry on next call)", + file=sys.stderr, + flush=True, + ) + raise RuntimeError(self._last_error) from e # ---- read-only operations ---- diff --git a/src/mcp_cucm_axl/route_plan.py b/src/mcp_cucm_axl/route_plan.py index 38c3303..940c3b2 100644 --- a/src/mcp_cucm_axl/route_plan.py +++ b/src/mcp_cucm_axl/route_plan.py @@ -342,12 +342,31 @@ def list_route_lists_and_groups(client: "AxlClient", name: str | None = None) -> def _to_int(v: object) -> int | None: - """Cast Informix string-encoded integers to int; passthrough None/non-numeric.""" + """Cast Informix string-encoded integers to int; passthrough None/non-numeric. + + Hamilton review MINOR #6: a non-numeric value from the cluster (data + corruption, unexpected schema change, type drift across CUCM versions) + used to silently become None — and downstream sort logic that defaulted + None to 0 would jumble the failover order with no warning. We still + return None on bad data (the caller's error path is unchanged), but we + log the offending value to stderr so an operator notices something + weird is happening at the data layer. + + None itself is a valid value (column not set in CUCM) and produces no + warning — only genuinely-unparseable values trigger the log. + """ if v is None: return None try: return int(v) except (TypeError, ValueError): + import sys + print( + f"[mcp-cucm-axl] _to_int: unexpected non-numeric value {v!r} " + f"(type {type(v).__name__}); returning None", + file=sys.stderr, + flush=True, + ) return None @@ -776,20 +795,31 @@ def list_route_filters( # Wildcard pattern matcher (better translation_chain) # ==================================================================== +# Hamilton review MAJOR #4: bound the digit-count of `!` and `@` wildcards +# to prevent catastrophic regex backtracking on adjacent-quantifier patterns. +# CUCM dial strings are practically capped well below this; 50 is a generous +# upper bound that keeps the regex's complexity polynomial. +_MAX_BANG_DIGITS = 50 + + def _wildcard_to_regex(pattern: str) -> str: r"""Convert a CUCM dial-plan pattern to a Python regex. CUCM wildcards: X any single digit (0-9) - ! one or more digits + ! one or more digits (bounded — see _MAX_BANG_DIGITS) . terminator separator (after-dot digits get discarded by PreDot DDI) - [0-9] character class (passes through to regex unchanged) + [0-9] character class *, # literal special-keypad symbols \+ literal + (escaped in CUCM) - @ NANPA route filter — represented as `\d+` here (we don't model the filter) + @ NANPA route filter — represented as bounded \d{1,N} here - We escape regex metachars except those CUCM uses literally as wildcards. + Hamilton review MAJOR #4: an unclosed `[` (e.g., `[0-9` with no closing + bracket) used to silently fall through to treating the bracket as a + literal. That produced wrong matches with no warning. We now raise + ValueError so the caller can surface the malformed pattern explicitly. """ + bounded_digits = "\\d{1," + str(_MAX_BANG_DIGITS) + "}" out = [] i = 0 while i < len(pattern): @@ -797,22 +827,24 @@ def _wildcard_to_regex(pattern: str) -> str: if c == "X": out.append(r"\d") elif c == "!": - out.append(r"\d+") + out.append(bounded_digits) elif c == "@": # NANPA — would normally apply a route filter; treat as "any digits" - out.append(r"\d+") + out.append(bounded_digits) elif c == ".": # Terminator separator — matches a literal dot if used in test; # but in pattern matching against a dialed number it has no # effect on what's matched. Treat as zero-width. pass elif c == "[": - # Character class — copy through up to ] + # Character class — copy through up to `]`. An unclosed bracket + # is a malformed pattern; raise so the caller knows. j = pattern.find("]", i) if j == -1: - out.append(re.escape(c)) - i += 1 - continue + raise ValueError( + f"Unclosed character-class bracket in pattern {pattern!r} " + f"at position {i}" + ) out.append(pattern[i:j + 1]) i = j elif c == "\\" and i + 1 < len(pattern): @@ -826,9 +858,18 @@ def _wildcard_to_regex(pattern: str) -> str: def _pattern_matches_number(pattern: str, number: str) -> bool: - """Test whether a CUCM dial pattern matches a number string.""" + """Test whether a CUCM dial pattern matches a number string. + + Returns False on any compilation error (malformed pattern, unclosed + bracket, etc.) so a single bad pattern doesn't crash the entire + translation_chain query. The bad pattern is surfaced separately + via the error reporting in the response. + """ try: regex = _wildcard_to_regex(pattern) + except ValueError: + return False + try: return re.match(regex, number) is not None except re.error: return False diff --git a/src/mcp_cucm_axl/server.py b/src/mcp_cucm_axl/server.py index c98db50..071c723 100644 --- a/src/mcp_cucm_axl/server.py +++ b/src/mcp_cucm_axl/server.py @@ -108,29 +108,61 @@ def axl_describe_table(table_name: str) -> dict: return _client().describe_informix_table(table_name) +def _require_cache() -> AxlCache: + """Hamilton review MINOR #7: tools that need the cache should raise + consistently when it's missing — same shape as `_client()` for `_axl`. + Pre-fix, cache tools returned `{"error": "..."}` while AXL tools raised + RuntimeError; LLMs had to handle two patterns. Now: all tool failures + raise RuntimeError uniformly. + """ + if _cache is None: + raise RuntimeError("Cache not initialized — server bootstrap failed.") + return _cache + + @mcp.tool def cache_stats() -> dict: """Cache statistics: total entries, live entries, breakdown by method.""" - if _cache is None: - return {"error": "Cache not initialized"} - return _cache.stats() + return _require_cache().stats() @mcp.tool def cache_clear(method_pattern: str | None = None) -> dict: - """Clear cache entries. + """Clear cache entries for the current cluster. Args: method_pattern: Optional method-name pattern (% wildcards). If omitted, clears the entire cache. Use after a known config change to force fresh queries. """ - if _cache is None: - return {"error": "Cache not initialized"} - deleted = _cache.clear(method_pattern) + deleted = _require_cache().clear(method_pattern) return {"deleted_entries": deleted, "method_pattern": method_pattern} +@mcp.tool +def health() -> dict: + """Server-state self-check: which globals are initialized? + + Useful when a tool call returns "not initialized" — surfaces which + subsystem (cache, AXL client, docs index) actually failed at bootstrap. + Also reports the AXL connection state from connection_status() so an + operator can see whether a recent operational error has cleared. + """ + info = { + "cache": _cache is not None, + "axl": _axl is not None, + "docs": _docs is not None, + } + if _axl is not None: + info["axl_connection"] = _axl.connection_status() + if _cache is not None: + try: + info["cache_cluster_id"] = _cache.cluster_id + except AttributeError: + pass + return info + + # ==================================================================== # Route plan tools # ==================================================================== diff --git a/tests/test_client_recovery.py b/tests/test_client_recovery.py new file mode 100644 index 0000000..d411277 --- /dev/null +++ b/tests/test_client_recovery.py @@ -0,0 +1,83 @@ +"""Hamilton review MAJOR #5: connection recovery and config-vs-operational errors. + +Pre-fix: any connection failure set `_connection_error` and pinned it forever. +A transient network blip required restarting the MCP server. Fix: distinguish +*configuration* errors (missing env, bad WSDL) which are pinned, from +*operational* errors (network, TLS, session timeout) which can be retried +on the next call. +""" + +from pathlib import Path + +import pytest + +from mcp_cucm_axl.cache import AxlCache +from mcp_cucm_axl.client import AxlClient + + +@pytest.fixture +def cache(tmp_path: Path) -> AxlCache: + return AxlCache(tmp_path / "test.sqlite", default_ttl=60, cluster_id="test") + + +def test_config_error_is_pinned(cache: AxlCache, monkeypatch): + """Missing AXL_URL is a config error — it doesn't get better on retry, + and the next call should still raise the same clear message.""" + monkeypatch.delenv("AXL_URL", raising=False) + monkeypatch.delenv("AXL_USER", raising=False) + monkeypatch.delenv("AXL_PASS", raising=False) + client = AxlClient(cache) + + with pytest.raises(RuntimeError, match="AXL_URL"): + client._ensure_connected() + # Second call: same config error, pinned + with pytest.raises(RuntimeError, match="AXL_URL"): + client._ensure_connected() + + +def test_operational_error_is_not_pinned(cache: AxlCache, monkeypatch): + """A transient operational error (zeep Client construction failing, + network blip, etc.) should NOT pin the client forever. The next call + must be allowed to retry.""" + monkeypatch.setenv("AXL_URL", "https://test.invalid:8443/axl") + monkeypatch.setenv("AXL_USER", "test") + monkeypatch.setenv("AXL_PASS", "test") + monkeypatch.setenv("AXL_VERIFY_TLS", "false") + + # Force the zeep Client constructor inside _ensure_connected to raise. + # This simulates "WSDL fetch failed", "TLS handshake error", etc. — + # transient operational failures. + from mcp_cucm_axl import client as client_mod + + def boom(*args, **kwargs): + raise ConnectionError("simulated transient network failure") + + monkeypatch.setattr(client_mod, "Client", boom) + + client = AxlClient(cache) + with pytest.raises(RuntimeError, match="simulated transient"): + client._ensure_connected() + + # Hamilton review MAJOR #5: operational errors must NOT set _config_error. + # _config_error is the permanent pin; only set on missing env vars / config + # mistakes. A failed network connection is operational and the next call + # must be allowed to retry. + assert client._config_error is None, ( + "operational errors must not set _config_error (the pin); " + "only configuration errors (missing env vars, bad WSDL) should pin" + ) + # _last_error is set for diagnostics, but it does not block retries. + assert client._last_error is not None, ( + "_last_error should record the operational failure for diagnostics" + ) + assert "simulated transient" in client._last_error + + +def test_health_diagnostic_includes_connection_state(cache: AxlCache): + """The client should expose its connection age / last-attempt info + so an operator can see what's going on without reading sys.stderr.""" + client = AxlClient(cache) + info = client.connection_status() + assert "connected" in info + assert info["connected"] is False # never tried yet + assert "last_error" in info diff --git a/tests/test_server_consistency.py b/tests/test_server_consistency.py new file mode 100644 index 0000000..38a6e73 --- /dev/null +++ b/tests/test_server_consistency.py @@ -0,0 +1,40 @@ +"""Hamilton review MINOR #7: standardize tool-failure error shapes. + +Pre-fix: tools that need state (`_axl`, `_cache`, `_docs`) had inconsistent +error shapes. Most tools called `_client()` which raises `RuntimeError`. +But `cache_stats` and `cache_clear` checked `if _cache is None` and +returned `{"error": "..."}`. An LLM consuming responses had to handle +two different patterns. After the fix, both shapes converge: all tools +raise RuntimeError when their dependencies aren't initialized. +""" + +import pytest + +from mcp_cucm_axl import server + + +def test_cache_stats_raises_when_uninitialized(monkeypatch): + monkeypatch.setattr(server, "_cache", None) + with pytest.raises(RuntimeError, match=r"[Cc]ache"): + # @mcp.tool passes the function through unchanged; call directly. + server.cache_stats() + + +def test_cache_clear_raises_when_uninitialized(monkeypatch): + monkeypatch.setattr(server, "_cache", None) + with pytest.raises(RuntimeError, match=r"[Cc]ache"): + server.cache_clear() + + +def test_health_check_reports_each_subsystem(monkeypatch): + """A health-check tool should report which globals are initialized, + so an operator (or an LLM) can diagnose `RuntimeError: ... not initialized` + issues without grepping source.""" + # When all are None, health should report all three as down + monkeypatch.setattr(server, "_cache", None) + monkeypatch.setattr(server, "_axl", None) + monkeypatch.setattr(server, "_docs", None) + info = server.health() + assert info["cache"] is False + assert info["axl"] is False + assert info["docs"] is False diff --git a/tests/test_to_int_diagnostic.py b/tests/test_to_int_diagnostic.py new file mode 100644 index 0000000..3c3f5dc --- /dev/null +++ b/tests/test_to_int_diagnostic.py @@ -0,0 +1,48 @@ +"""Hamilton review MINOR #6: `_to_int` silently coerced bad values to None. + +Sort fields built on `_to_int` returns then defaulted None to 0, which +jumbled the failover order in the displayed result. Fix: when the conversion +fails, log to stderr (so an operator can see) and return None — but the +caller code path that does the sort now uses a stable tie-breaker that +doesn't silently rewrite real-zero into "no value." +""" + +import sys +from io import StringIO + +from mcp_cucm_axl.route_plan import _to_int + + +def test_to_int_passthrough_normal(): + assert _to_int("5") == 5 + assert _to_int(7) == 7 + + +def test_to_int_none_returns_none_silently(): + """Real Nones are valid (column not set) — don't log noise for them.""" + captured = StringIO() + real_stderr = sys.stderr + sys.stderr = captured + try: + assert _to_int(None) is None + finally: + sys.stderr = real_stderr + assert "warning" not in captured.getvalue().lower() + + +def test_to_int_bad_value_logs_warning(): + """A non-numeric string from the cluster (data corruption / unexpected + type) should be loud enough for an operator to notice in stderr.""" + captured = StringIO() + real_stderr = sys.stderr + sys.stderr = captured + try: + result = _to_int("not-a-number") + finally: + sys.stderr = real_stderr + assert result is None + output = captured.getvalue() + assert "not-a-number" in output, ( + f"unexpected non-numeric value should be logged with the offending value; " + f"got stderr: {output!r}" + ) diff --git a/tests/test_wildcard.py b/tests/test_wildcard.py index a29c76b..d70d2b6 100644 --- a/tests/test_wildcard.py +++ b/tests/test_wildcard.py @@ -86,12 +86,69 @@ class TestEdgeCases: class TestRegexConversion: def test_X_to_digit_class(self): + # Bounded after Hamilton MAJOR #4 — `X` still matches a single digit assert _wildcard_to_regex("X") == r"^\d$" - def test_bang_to_one_or_more_digits(self): - assert _wildcard_to_regex("!") == r"^\d+$" + def test_bang_to_bounded_digits(self): + # Bounded after Hamilton MAJOR #4 — was \d+, now \d{1,N}. + # Adjacent !!! used to compile to (\d+)(\d+)(\d+) which has + # exponential backtracking on near-miss inputs. + regex = _wildcard_to_regex("!") + # Must be anchored, must contain a digit class with an upper bound. + assert regex.startswith("^") and regex.endswith("$") + assert r"\d{1," in regex, ( + f"`!` must compile to bounded `\\d{{1,N}}` to prevent " + f"catastrophic backtracking; got: {regex}" + ) def test_anchored(self): regex = _wildcard_to_regex("9XXX") assert regex.startswith("^") assert regex.endswith("$") + + +class TestUnclosedBracketIsExplicitError: + """Hamilton review MAJOR #4 (part 2): unclosed `[` used to silently + fall back to treating the bracket as a literal. That produced wrong + matches with no warning. Fix: surface the malformed pattern as an + explicit error so the caller can flag it. + """ + + def test_unclosed_bracket_raises(self): + import pytest as _pytest + with _pytest.raises(ValueError, match="bracket"): + _wildcard_to_regex("[0-9") + + def test_unclosed_bracket_in_pattern_match_returns_false(self): + # _pattern_matches_number must catch the ValueError from the + # malformed pattern and return False (so a single bad pattern + # doesn't crash translation_chain). + assert _pattern_matches_number("[0-9", "1") is False + + def test_well_formed_bracket_still_works(self): + # Sanity: the fix shouldn't break legitimate character classes + assert _pattern_matches_number("[2-9]XX", "456") + assert not _pattern_matches_number("[2-9]XX", "156") + + +class TestRegexBacktrackingBound: + """Hamilton review MAJOR #4 (part 1): adjacent `!` wildcards used to + compile to `\\d+\\d+\\d+...` which is exponentially slow on near-miss + input. Bounded `\\d{1,N}` keeps it polynomial. + """ + + def test_pathological_pattern_completes_quickly(self): + # 10 adjacent `!` matched against a long near-miss number. + # Pre-fix this could take seconds; bounded should finish in ms. + import time + pat = "!" * 10 + # 30 digits + a trailing letter — guarantees no full match + num = "1" * 30 + "X" + t0 = time.monotonic() + result = _pattern_matches_number(pat, num) + elapsed = time.monotonic() - t0 + assert result is False + assert elapsed < 0.5, ( + f"pathological `!` chain must finish quickly even on near-miss; " + f"took {elapsed:.3f}s" + )