H7 — Process-wide shared httpx.AsyncClient via get_shared_client(). Each tool call no longer pays a TCP+TLS handshake; the connection pool is reused across the server's lifetime. Tests inject mock transports directly via ArchiveClient(transport=...) so the singleton stays clean.

M1 — Retry/backoff on 429/502/503/504 with Retry-After honored (both delta-seconds and HTTP-date forms). Exponential backoff with jitter, capped at 30s, max 3 attempts. Applied to both _fetch_json and stream_file (retry happens BEFORE any bytes are yielded so it can't corrupt a partial write).

M2 — Per-(identifier, filename) asyncio.Lock in download_file serializes concurrent downloads of the same file inside one process. Different files still download in parallel.

M5 — collection field normalized to list[str] in all output paths (search docs, scrape items, item metadata). LLMs can write `if 'foo' in doc['collection']` without checking the type first.

M7 — `is_collection: bool` derived from mediatype on every doc / metadata response, so LLMs can route collection containers vs. real media items without re-querying.

H1 — Stream-abort errors (httpx.ReadError, RemoteProtocolError, ConnectError, ReadTimeout) caught and re-raised as ArchiveError with bytes-written context so the caller knows where the partial download ended. Bytes already on disk remain valid for resume.

19 new regression tests (52 total, all green, ruff clean):
- 4 tests covering retry/backoff, exhaustion, HTTP-date Retry-After
- 1 test for stream-abort byte-count surfacing
- 6 tests for collection normalization shapes
- 4 tests for is_collection in real tool flow + shared client lifecycle
- 2 tests verifying download lock: same-file serialized, different files parallel
194 lines
6.1 KiB
Python
194 lines
6.1 KiB
Python
"""Server-layer regression tests using a swapped-in shared client.
|
|
|
|
These exercise the MCP tool functions directly and verify:
|
|
- Collection normalization (M5)
|
|
- `is_collection` derived flag (M7)
|
|
- Shared client lifecycle (H7)
|
|
- Concurrent-download serialization (M2)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from contextlib import asynccontextmanager
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
from mcarchive_org import client as client_mod
|
|
from mcarchive_org.client import ArchiveClient
|
|
from mcarchive_org.server import (
|
|
_enrich_doc,
|
|
_normalize_collection,
|
|
download_file,
|
|
get_item_metadata,
|
|
search_items,
|
|
)
|
|
|
|
|
|
@asynccontextmanager
async def swap_shared_client(handler):
    """Install a mock-transport ArchiveClient as the shared client for a test.

    The tool functions in server.py obtain their client through
    get_shared_client() and give no way to pass a transport in, so the
    module-level singleton slot is overwritten for the duration of the
    ``async with`` body instead.

    Args:
        handler: Callable handed to ``httpx.MockTransport``; it receives each
            outgoing request and produces the response.

    Yields:
        The temporary ``ArchiveClient`` backed by the mock transport.
    """
    previous = client_mod._shared_client
    transport = httpx.MockTransport(handler)
    stub = ArchiveClient(transport=transport)
    client_mod._shared_client = stub
    try:
        yield stub
    finally:
        # Put the earlier singleton back first, then dispose of the stub.
        client_mod._shared_client = previous
        await stub.aclose()
|
|
|
|
|
|
# ---------- M5: collection normalization ----------
|
|
|
|
|
|
@pytest.mark.parametrize(
    "raw,expected",
    [
        (None, []),
        ("", []),
        ("nasa", ["nasa"]),
        (["nasa", "opensource"], ["nasa", "opensource"]),
        ([], []),
        ([None, "nasa", ""], ["nasa"]),  # falsy items dropped
    ],
)
def test_normalize_collection_shapes(raw, expected):
    """Check _normalize_collection against each supported input shape."""
    normalized = _normalize_collection(raw)
    assert normalized == expected
|
|
|
|
|
|
def test_enrich_doc_marks_is_collection():
    """Check the is_collection flag for collection, audio and untyped docs."""
    cases = (
        ({"mediatype": "collection", "identifier": "nasa"}, True),
        ({"mediatype": "audio", "identifier": "x"}, False),
        # No mediatype key at all.
        ({"identifier": "x"}, False),
    )
    for doc, flag in cases:
        assert _enrich_doc(doc)["is_collection"] is flag
|
|
|
|
|
|
def test_enrich_doc_normalizes_collection_field():
    """A bare-string collection value comes back as a one-element list."""
    raw_doc = {"identifier": "x", "collection": "single"}
    enriched = _enrich_doc(raw_doc)
    assert enriched["collection"] == ["single"]
|
|
|
|
|
|
# ---------- M7: is_collection in real tool flow ----------
|
|
|
|
|
|
async def test_search_items_decorates_docs_with_is_collection():
    """search_items returns docs carrying is_collection and a list collection."""
    # Canned search response: one collection container (string-valued
    # collection) and one audio item (list-valued collection).
    payload = {
        "response": {
            "numFound": 2,
            "docs": [
                {"identifier": "nasa", "mediatype": "collection", "collection": "nasa"},
                {
                    "identifier": "song1",
                    "mediatype": "audio",
                    "collection": ["etree", "GratefulDead"],
                },
            ],
        }
    }

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=payload)

    async with swap_shared_client(handler):
        result = await search_items(query="x", rows=2)

    docs = result["docs"]
    assert len(docs) == 2

    nasa, song = docs
    assert nasa["is_collection"] is True
    assert nasa["collection"] == ["nasa"]
    assert song["is_collection"] is False
    assert song["collection"] == ["etree", "GratefulDead"]
|
|
|
|
|
|
async def test_get_item_metadata_normalizes_collection():
    """get_item_metadata exposes is_collection and a list-valued collection."""
    # Canned metadata response for a collection item whose own
    # "collection" field is a bare string.
    payload = {
        "metadata": {
            "identifier": "nasa",
            "title": "NASA Images",
            "mediatype": "collection",
            "collection": "internetarchive",
        },
        "files_count": 0,
        "item_size": 0,
    }

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=payload)

    async with swap_shared_client(handler):
        result = await get_item_metadata(identifier="nasa")

    assert result["is_collection"] is True
    assert result["collection"] == ["internetarchive"]
|
|
|
|
|
|
# ---------- H7: shared client lifecycle ----------
|
|
|
|
|
|
async def test_get_shared_client_returns_same_instance():
    """Two consecutive get_shared_client() calls return the identical object."""
    # Close first -- presumably to discard any client an earlier test left
    # behind, so this test creates its own instance.
    await client_mod.close_shared_client()
    try:
        first = await client_mod.get_shared_client()
        second = await client_mod.get_shared_client()
        assert first is second
    finally:
        # Tear the singleton down even when the assertion (or a lookup)
        # fails, so an open shared client never carries over into later tests.
        await client_mod.close_shared_client()
|
|
|
|
|
|
async def test_close_shared_client_clears_singleton():
    """After close_shared_client(), get_shared_client() yields a new object."""
    try:
        before = await client_mod.get_shared_client()
        await client_mod.close_shared_client()
        after = await client_mod.get_shared_client()
        assert before is not after
    finally:
        # Tear the singleton down even when the assertion (or a lookup)
        # fails, so an open shared client never carries over into later tests.
        await client_mod.close_shared_client()
|
|
|
|
|
|
# ---------- M2: concurrent-download serialization ----------
|
|
|
|
|
|
async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch):
    """Parallel download_file calls for one (id, filename) must not overlap.

    If they interleaved they would race on the destination file.
    """
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    # Requests currently inside the handler, and the highest value seen.
    in_flight = 0
    peak = 0

    async def handler(req: httpx.Request) -> httpx.Response:
        nonlocal in_flight, peak
        in_flight += 1
        peak = max(peak, in_flight)
        await asyncio.sleep(0.05)  # hold the request long enough to overlap
        in_flight -= 1
        return httpx.Response(200, content=b"file-content")

    async with swap_shared_client(handler):
        jobs = [
            download_file(identifier="nasa", filename="shared.bin", overwrite=True)
            for _ in range(2)
        ]
        await asyncio.gather(*jobs)

    # The lock should have prevented any overlap.
    assert peak == 1
|
|
|
|
|
|
async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch):
    """Downloads of distinct filenames use distinct locks and may overlap."""
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    # Requests currently inside the handler, and the highest value seen.
    in_flight = 0
    peak = 0

    async def handler(req: httpx.Request) -> httpx.Response:
        nonlocal in_flight, peak
        in_flight += 1
        peak = max(peak, in_flight)
        await asyncio.sleep(0.05)  # keep the request open so the two can overlap
        in_flight -= 1
        return httpx.Response(200, content=b"data")

    async with swap_shared_client(handler):
        jobs = [
            download_file(identifier="nasa", filename=name, overwrite=True)
            for name in ("a.bin", "b.bin")
        ]
        await asyncio.gather(*jobs)

    # Different files — should overlap.
    assert peak == 2
|