# mcaxl/src/mcp_cucm_axl/client.py

"""AXL SOAP client wrapper.

Lazy connection — instantiated on first tool call, not at server boot.
This means the FastMCP server registers tools and prompts immediately,
even if the cluster is unreachable, and the user gets a clear error
only when they actually invoke a tool that needs CUCM.
"""

from __future__ import annotations

import os
import sys
import urllib3
from pathlib import Path
from typing import Any

from requests import Session
from requests.auth import HTTPBasicAuth
from zeep import Client, Settings
from zeep.cache import SqliteCache
from zeep.transports import Transport

from .cache import AxlCache
from .sql_validator import validate_select
from .wsdl_loader import resolve_wsdl_path


class _ConfigError(RuntimeError):
    """Raised for configuration problems that retrying cannot fix.

    The client stores the message of such an error and re-raises it on
    every later connection attempt instead of trying again. Operational
    failures (network blips, session timeouts, ...) are reported with a
    plain ``RuntimeError`` and are retried on the next call.
    """


class AxlClient:
    """Lazy-loaded zeep client for CUCM AXL.

    Hamilton review MAJOR #5: distinguishes configuration errors (pinned —
    they don't get better on retry) from operational errors (transient —
    next call should attempt fresh). Pre-fix, ANY first-time failure
    pinned the client forever and required a server restart.
    """

    def __init__(self, response_cache: AxlCache):
        self._client: Client | None = None
        self._service: Any = None
        self._response_cache = response_cache
        self._config_error: str | None = None  # permanent, pinned
        self._last_error: str | None = None    # last seen, may be transient
        self._connected_at: float | None = None  # monotonic time of last success
        self._retry_config: dict | None = None  # populated when session is built

    def connection_status(self) -> dict:
        """Diagnostic snapshot — what's the state of the connection?

        Useful for the `health` MCP tool and for operators trying to
        figure out why a tool call failed. Reports whether we're
        currently connected, when we last successfully connected, the
        last error (config or operational), and the rate-limit retry
        policy in effect.
        """
        return {
            "connected": self._service is not None,
            "connected_at_monotonic": self._connected_at,
            "config_error": self._config_error,  # permanent until restart
            "last_error": self._last_error,
            "retry_config": self._retry_config,
        }

    def _ensure_connected(self) -> None:
        if self._service is not None:
            return
        # Configuration errors are permanent — don't waste time retrying.
        if self._config_error is not None:
            raise _ConfigError(self._config_error)

        # Read env vars FIRST. Missing env is a config error (pinned).
        try:
            url = os.environ["AXL_URL"]
            user = os.environ["AXL_USER"]
            password = os.environ["AXL_PASS"]
        except KeyError as e:
            self._config_error = (
                f"Missing required env var {e.args[0]}. "
                f"Set AXL_URL, AXL_USER, AXL_PASS in .env or the environment."
            )
            self._last_error = self._config_error
            raise _ConfigError(self._config_error) from None

        # CUCM's AXL endpoint 302-redirects /axl to /axl/. The redirect
        # converts POST to GET (standard HTTP/1.1 behavior for 302), which
        # makes the SOAP request silently fail with an HTML status page.
        # Normalize the trailing slash so users don't need to remember.
        if not url.rstrip().endswith("/"):
            url = url.rstrip() + "/"

        verify_tls = os.environ.get("AXL_VERIFY_TLS", "false").lower() in ("1", "true", "yes")
        if not verify_tls:
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        wsdl_path = resolve_wsdl_path()

        session = Session()
        session.verify = verify_tls
        session.auth = HTTPBasicAuth(user, password)

        # Rate-limit / transient-error retry. CUCM's SOAP layer returns 503
        # under load (multiple admins running AXL queries during a change
        # window, etc). 502/504 occur when the publisher is restarting or
        # a load balancer is between us and CUCM. Pre-fix, any of these
        # was a hard failure to the caller; now they're retried with
        # exponential backoff.
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry
        max_retries = int(os.environ.get("AXL_RATE_LIMIT_RETRIES", "3"))
        if max_retries > 0:
            retry = Retry(
                total=max_retries,
                backoff_factor=1.0,           # 1s, 2s, 4s between retries
                status_forcelist=(502, 503, 504),
                allowed_methods=frozenset(["POST", "GET"]),
                raise_on_status=False,        # let zeep see the final response
                respect_retry_after_header=True,
            )
            adapter = HTTPAdapter(max_retries=retry)
            session.mount("https://", adapter)
            session.mount("http://", adapter)
        self._retry_config = {
            "max_retries": max_retries,
            "backoff_factor": 1.0,
            "status_forcelist": [502, 503, 504],
        }

        # zeep's own WSDL cache (separate from our response cache) keeps
        # repeat startups fast — it parses the WSDL once and reuses
        from platformdirs import user_cache_dir
        zeep_cache_path = Path(user_cache_dir("mcp-cucm-axl")) / "zeep_wsdl.db"
        zeep_cache_path.parent.mkdir(parents=True, exist_ok=True)

        transport = Transport(
            session=session,
            cache=SqliteCache(path=str(zeep_cache_path), timeout=86400),
            timeout=30,
        )

        try:
            self._client = Client(
                wsdl=str(wsdl_path),
                settings=Settings(strict=False, xml_huge_tree=True),
                transport=transport,
            )
            # AXL endpoint is the AXL_URL itself; override the WSDL's default
            # service location which usually points at a placeholder host.
            self._service = self._client.create_service(
                "{http://www.cisco.com/AXLAPIService/}AXLAPIBinding",
                url,
            )
            import time as _time
            self._connected_at = _time.monotonic()
            self._last_error = None  # operational state is now clean
            print(
                f"[mcp-cucm-axl] connected to {url} (TLS verify={verify_tls})",
                file=sys.stderr,
                flush=True,
            )
        except Exception as e:
            # Operational error (network, TLS, WSDL fetch failure). Don't
            # pin — the next call should be allowed to retry. Just record
            # the last error for diagnostics.
            self._last_error = f"AXL connection failed: {e}"
            print(
                f"[mcp-cucm-axl] {self._last_error} (operational, will retry on next call)",
                file=sys.stderr,
                flush=True,
            )
            raise RuntimeError(self._last_error) from e

    # ---- read-only operations ----

    def get_ccm_version(self) -> dict:
        cached = self._response_cache.get("getCCMVersion", {})
        if cached is not None:
            return cached
        self._ensure_connected()
        resp = self._service.getCCMVersion()
        # zeep CompoundValue → dict; the actual payload is under "return"
        full = _zeep_to_dict(resp)
        result = full.get("return", full) if isinstance(full, dict) else full
        self._response_cache.set("getCCMVersion", {}, result, ttl=3600)
        return result

    def execute_sql_query(self, query: str) -> dict:
        cleaned = validate_select(query)
        cached = self._response_cache.get("executeSQLQuery", {"sql": cleaned})
        if cached is not None:
            return {**cached, "_cache": "hit"}
        self._ensure_connected()
        resp = self._service.executeSQLQuery(sql=cleaned)
        rows = _parse_sql_rows(resp)
        result = {"row_count": len(rows), "rows": rows, "query": cleaned}
        self._response_cache.set("executeSQLQuery", {"sql": cleaned}, result)
        return {**result, "_cache": "miss"}

    def list_informix_tables(self, pattern: str | None = None) -> dict:
        # systables is the Informix system catalog. tabid > 99 filters out
        # internal/system tables and leaves CUCM's data dictionary tables.
        if pattern:
            safe_pattern = pattern.replace("'", "''")
            sql = (
                "SELECT tabname FROM systables "
                f"WHERE tabid > 99 AND tabname LIKE '{safe_pattern}' "
                "ORDER BY tabname"
            )
        else:
            sql = "SELECT tabname FROM systables WHERE tabid > 99 ORDER BY tabname"
        result = self.execute_sql_query(sql)
        names = [row.get("tabname") for row in result.get("rows", []) if row.get("tabname")]
        return {"table_count": len(names), "tables": names, "pattern": pattern}

    def describe_informix_table(self, table_name: str) -> dict:
        # Join syscolumns to systables to get column metadata for one table.
        # coltype encoding: low byte = type code, high bit = NOT NULL flag.
        safe = table_name.replace("'", "''")
        sql = (
            "SELECT c.colname, c.coltype, c.collength "
            "FROM syscolumns c, systables t "
            f"WHERE t.tabname = '{safe}' AND c.tabid = t.tabid "
            "ORDER BY c.colno"
        )
        result = self.execute_sql_query(sql)
        columns = []
        for row in result.get("rows", []):
            coltype_raw = int(row.get("coltype", 0))
            type_code = coltype_raw & 0xFF
            not_null = bool(coltype_raw & 0x100)
            columns.append({
                "name": row.get("colname"),
                "informix_type_code": type_code,
                "type": _INFORMIX_TYPE_NAMES.get(type_code, f"type_{type_code}"),
                "length": int(row.get("collength", 0)),
                "not_null": not_null,
            })
        if not columns:
            return {"table": table_name, "error": "Table not found or has no columns."}
        return {"table": table_name, "column_count": len(columns), "columns": columns}


# Informix syscolumns.coltype codes (the low byte, after the NOT NULL flag
# has been masked off) mapped to type names — partial list, enough for
# CUCM's data dictionary.
# Full list: https://www.ibm.com/docs/en/informix-servers/14.10?topic=tables-syscolumns
# Codes 40 and 41 are the opaque-type families: 40 is variable-length
# (LVARCHAR), 41 is fixed-length (BLOB, BOOLEAN, CLOB). Telling the members
# of 41 apart needs syscolumns.extended_id, which is not queried here.
_INFORMIX_TYPE_NAMES = {
    0: "CHAR",
    1: "SMALLINT",
    2: "INTEGER",
    3: "FLOAT",
    4: "SMALLFLOAT",
    5: "DECIMAL",
    6: "SERIAL",
    7: "DATE",
    8: "MONEY",
    9: "NULL",
    10: "DATETIME",
    11: "BYTE",
    12: "TEXT",
    13: "VARCHAR",
    14: "INTERVAL",
    15: "NCHAR",
    16: "NVARCHAR",
    17: "INT8",
    18: "SERIAL8",
    19: "SET",
    20: "MULTISET",
    21: "LIST",
    22: "ROW",
    23: "COLLECTION",
    40: "LVARCHAR",
    41: "BLOB/BOOLEAN/CLOB",
    43: "LVARCHAR",
    45: "BOOLEAN",
    52: "BIGINT",
    53: "BIGSERIAL",
}


def _zeep_to_dict(obj: Any) -> Any:
    """Recursively turn zeep CompoundValue objects into plain dicts/lists.

    Anything exposing ``__values__`` is converted through that mapping;
    dicts and lists are rebuilt with their members converted. Every other
    value (including None) is returned unchanged.
    """
    if hasattr(obj, "__values__"):
        members = obj.__values__
    elif isinstance(obj, list):
        return [_zeep_to_dict(element) for element in obj]
    elif isinstance(obj, dict):
        members = obj
    else:
        return obj
    return {key: _zeep_to_dict(value) for key, value in members.items()}


def _find_return_payload(resp: Any) -> Any:
    """Return the first non-None `return` member found on *resp*, else None."""
    probes = (
        # attribute access: resp.return
        lambda: getattr(resp, "return", None) if hasattr(resp, "return") else None,
        # zeep value mapping: resp.__values__["return"]
        lambda: resp.__values__.get("return") if hasattr(resp, "__values__") else None,
        # plain dict: resp["return"]
        lambda: resp.get("return") if isinstance(resp, dict) else None,
    )
    for probe in probes:
        try:
            found = probe()
        except Exception:
            continue
        if found is not None:
            return found
    return None


def _row_to_dict(row: Any) -> dict:
    """Convert one row object of an executeSQLQuery response to a dict."""
    # AXL's executeSQLQuery wraps each row as a list of lxml column
    # elements: [<Element colname1>, <Element colname2>, ...].
    if isinstance(row, list):
        return {col.tag: col.text for col in row if hasattr(col, "tag")}
    # Single element whose children are the columns (some response shapes).
    if hasattr(row, "tag") and not isinstance(row, str):
        try:
            return {col.tag: col.text for col in row}
        except TypeError:
            pass  # not iterable after all; fall through to the generic shapes
    if hasattr(row, "__values__"):
        return {key: _stringify(val) for key, val in row.__values__.items()}
    if isinstance(row, dict):
        return {key: _stringify(val) for key, val in row.items()}
    return {"value": str(row)}


def _parse_sql_rows(resp: Any) -> list[dict]:
    """Extract the rows of an executeSQLQuery response as a list of dicts.

    AXL returns rows as `<return><row><colname>val</colname>...</row></return>`.
    The columns vary per query, so zeep does not schema-bind them; rows
    come through as element-like objects with one child per column.

    A query that matches nothing produces an empty `<return/>`, which
    arrives with its `return` member set to None. That case must yield []
    — the response envelope itself is never parsed as a row, because that
    would invent a phantom row such as `{"return": None, "sequence": None}`.
    """
    if resp is None:
        return []

    payload = _find_return_payload(resp)
    # No `return` member, or it is None → zero rows (see docstring).
    if payload is None:
        return []

    # The payload is either the row list itself or something carrying `row`.
    if isinstance(payload, list):
        rows = payload
    elif hasattr(payload, "row"):
        rows = payload.row or []
    elif isinstance(payload, dict) and "row" in payload:
        rows = payload["row"] or []
    elif hasattr(payload, "__iter__"):
        # No obvious row collection — fall back to iterating the payload.
        rows = list(payload)
    else:
        rows = [payload]

    if not isinstance(rows, list):
        rows = [rows]

    return [_row_to_dict(row) for row in rows]


def _stringify(v: Any) -> Any:
    """Return *v* unchanged if it is None/str/int/float/bool, else ``str(v)``."""
    is_plain = v is None or isinstance(v, (str, int, float, bool))
    return v if is_plain else str(v)