mcarchive-org/tests/test_server_mocked.py
Ryan Malloy 6198defeca Resilience: address Hamilton tier-2 findings
H7 — Process-wide shared httpx.AsyncClient via get_shared_client().
Each tool call no longer pays a TCP+TLS handshake; connection pool is
reused across the server's lifetime. Tests inject mock transports
directly via ArchiveClient(transport=...) so the singleton stays clean.

M1 — Retry/backoff on 429/502/503/504 with Retry-After honored
(both delta-seconds and HTTP-date forms). Exponential backoff with
jitter, capped at 30s, max 3 attempts. Applied to both _fetch_json
and stream_file (retry happens BEFORE any bytes are yielded so it
can't corrupt a partial write).

M2 — Per-(identifier, filename) asyncio.Lock in download_file
serializes concurrent downloads of the same file inside one process.
Different files still download in parallel.

M5 — collection field normalized to list[str] in all output paths
(search docs, scrape items, item metadata). LLMs can write
`if 'foo' in doc['collection']` without checking the type first.

M7 — `is_collection: bool` derived from mediatype on every doc /
metadata response, so LLMs can route collection containers vs.
real media items without re-querying.

H1 — Stream-abort errors (httpx.ReadError, RemoteProtocolError,
ConnectError, ReadTimeout) caught and re-raised as ArchiveError
with bytes-written context so the caller knows where the partial
download ended. Bytes already on disk remain valid for resume.

19 new regression tests (52 total, all green, ruff clean):
- 4 tests covering retry/backoff, exhaustion, HTTP-date Retry-After
- 1 test for stream-abort byte-count surfacing
- 6 tests for collection normalization shapes
- 4 tests for is_collection in real tool flow + shared client lifecycle
- 2 tests verifying download lock: same-file serialized, different files parallel
2026-04-21 20:24:21 -06:00

194 lines
6.1 KiB
Python

"""Server-layer regression tests using a swapped-in shared client.
These exercise the MCP tool functions directly and verify:
- Collection normalization (M5)
- `is_collection` derived flag (M7)
- Shared client lifecycle (H7)
- Concurrent-download serialization (M2)
"""
from __future__ import annotations
import asyncio
from contextlib import asynccontextmanager
import httpx
import pytest
from mcarchive_org import client as client_mod
from mcarchive_org.client import ArchiveClient
from mcarchive_org.server import (
_enrich_doc,
_normalize_collection,
download_file,
get_item_metadata,
search_items,
)
@asynccontextmanager
async def swap_shared_client(handler):
    """Install a mock-transport ``ArchiveClient`` as the shared client.

    The tools in ``server.py`` obtain their client via ``get_shared_client()``
    and offer no way to pass a transport in, so for the duration of the
    ``async with`` body the module-level ``_shared_client`` slot is
    overwritten with a client built on ``httpx.MockTransport(handler)``.

    Args:
        handler: Request handler handed to ``httpx.MockTransport``.

    Yields:
        The temporary mock-backed ``ArchiveClient``.
    """
    previous = client_mod._shared_client
    transport = httpx.MockTransport(handler)
    stub_client = ArchiveClient(transport=transport)
    client_mod._shared_client = stub_client
    try:
        yield stub_client
    finally:
        # Put the earlier value back first, then dispose of the stub.
        client_mod._shared_client = previous
        await stub_client.aclose()
# ---------- M5: collection normalization ----------
@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        (None, []),
        ("", []),
        ("nasa", ["nasa"]),
        (["nasa", "opensource"], ["nasa", "opensource"]),
        ([], []),
        # Falsy entries inside a list are expected to be dropped.
        ([None, "nasa", ""], ["nasa"]),
    ],
)
def test_normalize_collection_shapes(raw, expected):
    """Each raw ``collection`` shape must normalize to the expected list."""
    normalized = _normalize_collection(raw)
    assert normalized == expected
def test_enrich_doc_marks_is_collection():
    """``is_collection`` is True only for docs whose mediatype is "collection"."""
    cases = (
        ({"mediatype": "collection", "identifier": "nasa"}, True),
        ({"mediatype": "audio", "identifier": "x"}, False),
        # A doc with no mediatype at all must still get a False flag.
        ({"identifier": "x"}, False),
    )
    for doc, expected_flag in cases:
        assert _enrich_doc(doc)["is_collection"] is expected_flag
def test_enrich_doc_normalizes_collection_field():
    """A scalar ``collection`` value comes back wrapped in a one-element list."""
    doc = {"identifier": "x", "collection": "single"}
    enriched = _enrich_doc(doc)
    assert enriched["collection"] == ["single"]
# ---------- M7: is_collection in real tool flow ----------
async def test_search_items_decorates_docs_with_is_collection():
    """``search_items`` adds ``is_collection`` and list-valued ``collection``.

    The mocked response holds one collection doc with a scalar ``collection``
    and one audio doc with a list ``collection``.
    """
    payload = {
        "response": {
            "numFound": 2,
            "docs": [
                {"identifier": "nasa", "mediatype": "collection", "collection": "nasa"},
                {"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]},
            ],
        }
    }

    def handler(request: httpx.Request) -> httpx.Response:
        # Every request gets the same canned search payload.
        return httpx.Response(200, json=payload)

    async with swap_shared_client(handler):
        result = await search_items(query="x", rows=2)

    docs = result["docs"]
    assert len(docs) == 2
    collection_doc, audio_doc = docs
    assert collection_doc["is_collection"] is True
    assert collection_doc["collection"] == ["nasa"]
    assert audio_doc["is_collection"] is False
    assert audio_doc["collection"] == ["etree", "GratefulDead"]
async def test_get_item_metadata_normalizes_collection():
    """``get_item_metadata`` reports ``is_collection`` and a list ``collection``.

    The mocked item has mediatype "collection" and a scalar ``collection``.
    """
    item_payload = {
        "metadata": {
            "identifier": "nasa",
            "title": "NASA Images",
            "mediatype": "collection",
            "collection": "internetarchive",
        },
        "files_count": 0,
        "item_size": 0,
    }

    def handler(request: httpx.Request) -> httpx.Response:
        # Every request gets the same canned metadata payload.
        return httpx.Response(200, json=item_payload)

    async with swap_shared_client(handler):
        result = await get_item_metadata(identifier="nasa")

    assert result["is_collection"] is True
    assert result["collection"] == ["internetarchive"]
# ---------- H7: shared client lifecycle ----------
async def test_get_shared_client_returns_same_instance():
    """Two consecutive ``get_shared_client()`` calls return the same object.

    The shared client is closed up front so the test does not depend on what
    an earlier test left behind, and closed again in ``finally`` so a failing
    assertion cannot leave a live process-wide client for later tests.
    """
    await client_mod.close_shared_client()
    try:
        first = await client_mod.get_shared_client()
        second = await client_mod.get_shared_client()
        assert first is second
    finally:
        # Runs on failure too; the original skipped this when the assert failed.
        await client_mod.close_shared_client()
async def test_close_shared_client_clears_singleton():
    """After ``close_shared_client()``, the next ``get_shared_client()`` is new.

    Cleanup is in ``finally`` so a failing assertion (or an error while
    re-acquiring the client) cannot leave a live process-wide client for
    later tests.
    """
    first = await client_mod.get_shared_client()
    try:
        await client_mod.close_shared_client()
        second = await client_mod.get_shared_client()
        # `first` is still referenced, so its identity cannot be reused by `second`.
        assert first is not second
    finally:
        # NOTE(review): assumes close_shared_client() is safe to call when no
        # client is installed, as the sibling lifecycle test already does.
        await client_mod.close_shared_client()
# ---------- M2: concurrent-download serialization ----------
async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch):
    """Parallel ``download_file`` calls for one (id, filename) must not overlap.

    The mock handler counts requests in flight and records the peak. If the
    two downloads were interleaved they would race on the destination file,
    so the peak has to stay at one.
    """
    # Presumably redirects downloads into the per-test temp dir — confirm in server.py.
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
    in_flight = 0
    peak = 0

    async def handler(request: httpx.Request) -> httpx.Response:
        nonlocal in_flight, peak
        in_flight += 1
        if in_flight > peak:
            peak = in_flight
        # Hold the request open long enough that an unserialized peer would overlap.
        await asyncio.sleep(0.05)
        in_flight -= 1
        return httpx.Response(200, content=b"file-content")

    async with swap_shared_client(handler):
        downloads = [
            download_file(identifier="nasa", filename="shared.bin", overwrite=True)
            for _ in range(2)
        ]
        await asyncio.gather(*downloads)

    # The lock should have prevented any overlap.
    assert peak == 1
async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch):
    """Downloads of different filenames are expected to overlap.

    Different filenames get different locks, so the mock handler should see
    both requests in flight at once and record a peak of two.
    """
    # Presumably redirects downloads into the per-test temp dir — confirm in server.py.
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
    in_flight = 0
    peak = 0

    async def handler(request: httpx.Request) -> httpx.Response:
        nonlocal in_flight, peak
        in_flight += 1
        if in_flight > peak:
            peak = in_flight
        # Keep the request open so the second download can arrive meanwhile.
        await asyncio.sleep(0.05)
        in_flight -= 1
        return httpx.Response(200, content=b"data")

    async with swap_shared_client(handler):
        downloads = [
            download_file(identifier="nasa", filename=name, overwrite=True)
            for name in ("a.bin", "b.bin")
        ]
        await asyncio.gather(*downloads)

    # Different files — should overlap.
    assert peak == 2