Atomic write pattern (tier-3 polish from headless test finding): - download_to_file now writes to <dest>.part and renames to <dest> only on successful stream completion (os.replace is POSIX-atomic). Failed downloads leave only the .part file — no misleading 0-byte dest files in the user's downloads directory. - Resume logic reads from <dest>.part instead of <dest>; the user's directory only ever contains complete files or clearly-marked .part files. - New `already_complete` short-circuit: if dest exists and no .part, skip the network entirely (still re-verify MD5 if requested). The headless Claude test confirmed this avoids redundant CDN load. - Symlink rejection re-added at the new code path: even though os.replace would only replace (not follow) a symlink at dest, predictable refusal beats silent symlink removal. Runtime download root tools (for stdio MCP mode): - get_download_root(): reports current root, source (env var vs default), existence, writability. - set_download_root(path): change MCARCHIVE_DOWNLOAD_ROOT mid-session. Expands ~, creates the dir, refuses system paths (/, /etc, /usr, /bin, /sbin, /var, /sys, /proc, /dev, /boot, /root). The lazy-resolved root means the change takes effect on the next download_file call without restarting the server. 14 new tests (66 total, all green, ruff clean): - 4 staging tests: failed download leaves no dest, success leaves no .part, already_complete short-circuit, MD5 verification on existing files - 6 root-tools tests: env reporting, default reporting, ~ expansion, system-dir refusal (parametrized), set→download takes effect immediately - 4 existing tests rewritten to use .part as the resume staging file Headless Claude smoke test verified end-to-end: get_download_root → set_download_root → search → list → download → second download short-circuits with already_complete=true and zero network bytes.
266 lines
8.8 KiB
Python
266 lines
8.8 KiB
Python
"""Server-layer regression tests using a swapped-in shared client.
|
|
|
|
These exercise the MCP tool functions directly and verify:
|
|
- Collection normalization (M5)
|
|
- `is_collection` derived flag (M7)
|
|
- Shared client lifecycle (H7)
|
|
- Concurrent-download serialization (M2)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import os
|
|
from contextlib import asynccontextmanager
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
from mcarchive_org import client as client_mod
|
|
from mcarchive_org.client import ArchiveClient
|
|
from mcarchive_org.server import (
|
|
_enrich_doc,
|
|
_normalize_collection,
|
|
download_file,
|
|
get_download_root,
|
|
get_item_metadata,
|
|
search_items,
|
|
set_download_root,
|
|
)
|
|
|
|
|
|
@asynccontextmanager
|
|
async def swap_shared_client(handler):
|
|
"""Temporarily replace the process-wide shared client with a mock-backed one.
|
|
|
|
Tests that exercise server.py tools need this because those tools call
|
|
get_shared_client() under the hood, and we can't pass a transport in.
|
|
"""
|
|
saved = client_mod._shared_client
|
|
mock = ArchiveClient(transport=httpx.MockTransport(handler))
|
|
client_mod._shared_client = mock
|
|
try:
|
|
yield mock
|
|
finally:
|
|
client_mod._shared_client = saved
|
|
await mock.aclose()
|
|
|
|
|
|
# ---------- M5: collection normalization ----------
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"raw,expected",
|
|
[
|
|
(None, []),
|
|
("", []),
|
|
("nasa", ["nasa"]),
|
|
(["nasa", "opensource"], ["nasa", "opensource"]),
|
|
([], []),
|
|
([None, "nasa", ""], ["nasa"]), # falsy items dropped
|
|
],
|
|
)
|
|
def test_normalize_collection_shapes(raw, expected):
|
|
assert _normalize_collection(raw) == expected
|
|
|
|
|
|
def test_enrich_doc_marks_is_collection():
|
|
assert _enrich_doc({"mediatype": "collection", "identifier": "nasa"})["is_collection"] is True
|
|
assert _enrich_doc({"mediatype": "audio", "identifier": "x"})["is_collection"] is False
|
|
assert _enrich_doc({"identifier": "x"})["is_collection"] is False
|
|
|
|
|
|
def test_enrich_doc_normalizes_collection_field():
|
|
out = _enrich_doc({"identifier": "x", "collection": "single"})
|
|
assert out["collection"] == ["single"]
|
|
|
|
|
|
# ---------- M7: is_collection in real tool flow ----------
|
|
|
|
|
|
async def test_search_items_decorates_docs_with_is_collection():
|
|
def handler(req: httpx.Request) -> httpx.Response:
|
|
return httpx.Response(
|
|
200,
|
|
json={
|
|
"response": {
|
|
"numFound": 2,
|
|
"docs": [
|
|
{"identifier": "nasa", "mediatype": "collection", "collection": "nasa"},
|
|
{"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]},
|
|
],
|
|
}
|
|
},
|
|
)
|
|
|
|
async with swap_shared_client(handler):
|
|
result = await search_items(query="x", rows=2)
|
|
|
|
assert len(result["docs"]) == 2
|
|
nasa, song = result["docs"]
|
|
assert nasa["is_collection"] is True
|
|
assert nasa["collection"] == ["nasa"]
|
|
assert song["is_collection"] is False
|
|
assert song["collection"] == ["etree", "GratefulDead"]
|
|
|
|
|
|
async def test_get_item_metadata_normalizes_collection():
|
|
def handler(req: httpx.Request) -> httpx.Response:
|
|
return httpx.Response(
|
|
200,
|
|
json={
|
|
"metadata": {
|
|
"identifier": "nasa",
|
|
"title": "NASA Images",
|
|
"mediatype": "collection",
|
|
"collection": "internetarchive",
|
|
},
|
|
"files_count": 0,
|
|
"item_size": 0,
|
|
},
|
|
)
|
|
|
|
async with swap_shared_client(handler):
|
|
result = await get_item_metadata(identifier="nasa")
|
|
|
|
assert result["is_collection"] is True
|
|
assert result["collection"] == ["internetarchive"]
|
|
|
|
|
|
# ---------- H7: shared client lifecycle ----------
|
|
|
|
|
|
async def test_get_shared_client_returns_same_instance():
|
|
await client_mod.close_shared_client()
|
|
a = await client_mod.get_shared_client()
|
|
b = await client_mod.get_shared_client()
|
|
assert a is b
|
|
await client_mod.close_shared_client()
|
|
|
|
|
|
async def test_close_shared_client_clears_singleton():
|
|
a = await client_mod.get_shared_client()
|
|
await client_mod.close_shared_client()
|
|
b = await client_mod.get_shared_client()
|
|
assert a is not b
|
|
await client_mod.close_shared_client()
|
|
|
|
|
|
# ---------- M2: concurrent-download serialization ----------
|
|
|
|
|
|
async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch):
|
|
"""Two parallel download_file calls for the same (id, filename) must not
|
|
interleave — otherwise they'd race on the destination file."""
|
|
monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
|
|
|
|
state = {"active": 0, "max_active": 0}
|
|
|
|
async def handler(req: httpx.Request) -> httpx.Response:
|
|
state["active"] += 1
|
|
state["max_active"] = max(state["max_active"], state["active"])
|
|
await asyncio.sleep(0.05) # hold the request long enough to overlap
|
|
state["active"] -= 1
|
|
return httpx.Response(200, content=b"file-content")
|
|
|
|
async with swap_shared_client(handler):
|
|
await asyncio.gather(
|
|
download_file(identifier="nasa", filename="shared.bin", overwrite=True),
|
|
download_file(identifier="nasa", filename="shared.bin", overwrite=True),
|
|
)
|
|
|
|
# The lock should have prevented any overlap.
|
|
assert state["max_active"] == 1
|
|
|
|
|
|
# ---------- runtime download root management ----------
|
|
|
|
|
|
def test_get_download_root_reports_env_value(tmp_path, monkeypatch):
|
|
monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
|
|
info = get_download_root()
|
|
assert info["download_root"] == str(tmp_path.resolve())
|
|
assert info["source"] == "MCARCHIVE_DOWNLOAD_ROOT env var"
|
|
assert info["raw_env_value"] == str(tmp_path)
|
|
|
|
|
|
def test_get_download_root_reports_default_when_no_env(monkeypatch):
|
|
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
|
|
info = get_download_root()
|
|
assert info["source"] == "default (./downloads under server CWD)"
|
|
assert info["raw_env_value"] is None
|
|
|
|
|
|
def test_set_download_root_changes_env_and_creates_dir(tmp_path, monkeypatch):
|
|
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
|
|
target = tmp_path / "new" / "spot"
|
|
assert not target.exists()
|
|
|
|
info = set_download_root(path=str(target))
|
|
|
|
assert info["download_root"] == str(target.resolve())
|
|
assert info["changed"] is True
|
|
assert target.exists() and target.is_dir()
|
|
assert os.environ["MCARCHIVE_DOWNLOAD_ROOT"] == str(target.resolve())
|
|
|
|
|
|
def test_set_download_root_expands_tilde(tmp_path, monkeypatch):
|
|
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
|
|
monkeypatch.setenv("HOME", str(tmp_path))
|
|
|
|
info = set_download_root(path="~/dl")
|
|
|
|
assert info["download_root"] == str((tmp_path / "dl").resolve())
|
|
assert (tmp_path / "dl").exists()
|
|
|
|
|
|
@pytest.mark.parametrize("forbidden", ["/etc", "/usr/local", "/var/log", "/", "/sys"])
|
|
def test_set_download_root_refuses_system_dirs(forbidden):
|
|
with pytest.raises(ValueError, match="system directory"):
|
|
set_download_root(path=forbidden)
|
|
|
|
|
|
async def test_set_download_root_takes_effect_for_next_download(tmp_path, monkeypatch):
|
|
"""The lazy-resolved root means a runtime change is honored by download_file
|
|
on the very next call without restarting."""
|
|
monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
|
|
set_download_root(path=str(tmp_path / "first"))
|
|
|
|
def handler(req):
|
|
return httpx.Response(200, content=b"data")
|
|
|
|
async with swap_shared_client(handler):
|
|
await download_file(identifier="nasa", filename="a.bin", overwrite=True)
|
|
# Now move the root to a different directory mid-session.
|
|
set_download_root(path=str(tmp_path / "second"))
|
|
await download_file(identifier="nasa", filename="b.bin", overwrite=True)
|
|
|
|
assert (tmp_path / "first" / "nasa" / "a.bin").exists()
|
|
assert (tmp_path / "second" / "nasa" / "b.bin").exists()
|
|
|
|
|
|
# ---------- M2 (continued): cross-file parallelism ----------
|
|
|
|
|
|
async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch):
|
|
"""Different filenames get different locks — they should run concurrently."""
|
|
monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
|
|
|
|
state = {"active": 0, "max_active": 0}
|
|
|
|
async def handler(req: httpx.Request) -> httpx.Response:
|
|
state["active"] += 1
|
|
state["max_active"] = max(state["max_active"], state["active"])
|
|
await asyncio.sleep(0.05)
|
|
state["active"] -= 1
|
|
return httpx.Response(200, content=b"data")
|
|
|
|
async with swap_shared_client(handler):
|
|
await asyncio.gather(
|
|
download_file(identifier="nasa", filename="a.bin", overwrite=True),
|
|
download_file(identifier="nasa", filename="b.bin", overwrite=True),
|
|
)
|
|
|
|
# Different files — should overlap.
|
|
assert state["max_active"] == 2
|