# mcarchive-org/tests/test_server_mocked.py

"""Server-layer regression tests using a swapped-in shared client.

These exercise the MCP tool functions directly and verify:
- Collection normalization (M5)
- `is_collection` derived flag (M7)
- Shared client lifecycle (H7)
- Concurrent-download serialization (M2)
"""

from __future__ import annotations

import asyncio
from contextlib import asynccontextmanager

import httpx
import pytest

from mcarchive_org import client as client_mod
from mcarchive_org.client import ArchiveClient
from mcarchive_org.server import (
    _enrich_doc,
    _normalize_collection,
    download_file,
    get_item_metadata,
    search_items,
)


@asynccontextmanager
async def swap_shared_client(handler):
    """Run a block with the module-level shared client replaced by a mock.

    The server.py tools fetch their client through get_shared_client() and
    offer no way to inject a transport, so tests install an ArchiveClient
    backed by ``httpx.MockTransport(handler)`` directly on the client module.

    Yields the mock-backed client. On exit the previously installed client
    is put back first, and only then is the mock client closed.
    """
    previous = client_mod._shared_client
    transport = httpx.MockTransport(handler)
    stub_client = ArchiveClient(transport=transport)
    client_mod._shared_client = stub_client
    try:
        yield stub_client
    finally:
        # Restore before closing so the module never keeps pointing at a
        # client that is being (or has been) shut down.
        client_mod._shared_client = previous
        await stub_client.aclose()


# ---------- M5: collection normalization ----------


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        # Missing / empty inputs collapse to an empty list.
        (None, []),
        ("", []),
        # A bare string is wrapped in a one-element list.
        ("nasa", ["nasa"]),
        # Lists pass through unchanged.
        (["nasa", "opensource"], ["nasa", "opensource"]),
        ([], []),
        # Falsy items inside a list are dropped.
        ([None, "nasa", ""], ["nasa"]),
    ],
)
def test_normalize_collection_shapes(raw, expected):
    """Check that each raw ``collection`` shape normalizes to the expected list."""
    normalized = _normalize_collection(raw)
    assert normalized == expected


def test_enrich_doc_marks_is_collection():
    """Check the ``is_collection`` flag for collection, audio and untyped docs."""
    cases = [
        ({"mediatype": "collection", "identifier": "nasa"}, True),
        ({"mediatype": "audio", "identifier": "x"}, False),
        ({"identifier": "x"}, False),  # no mediatype key at all
    ]
    for doc, flag in cases:
        assert _enrich_doc(doc)["is_collection"] is flag


def test_enrich_doc_normalizes_collection_field():
    """A string ``collection`` value comes back from _enrich_doc as a list."""
    doc = {"identifier": "x", "collection": "single"}
    enriched = _enrich_doc(doc)
    assert enriched["collection"] == ["single"]


# ---------- M7: is_collection in real tool flow ----------


async def test_search_items_decorates_docs_with_is_collection():
    """search_items adds ``is_collection`` and list-normalizes ``collection``.

    The mocked search response carries one collection doc (string-valued
    ``collection``) and one audio doc (list-valued ``collection``).
    """
    collection_doc = {"identifier": "nasa", "mediatype": "collection", "collection": "nasa"}
    audio_doc = {"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]}
    payload = {"response": {"numFound": 2, "docs": [collection_doc, audio_doc]}}

    def respond(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=payload)

    async with swap_shared_client(respond):
        result = await search_items(query="x", rows=2)

    docs = result["docs"]
    assert len(docs) == 2
    nasa, song = docs

    # Collection item: flag set, string collection wrapped in a list.
    assert nasa["is_collection"] is True
    assert nasa["collection"] == ["nasa"]

    # Audio item: flag cleared, list collection left as-is.
    assert song["is_collection"] is False
    assert song["collection"] == ["etree", "GratefulDead"]


async def test_get_item_metadata_normalizes_collection():
    """get_item_metadata flags a collection item and list-normalizes ``collection``."""
    item_metadata = {
        "identifier": "nasa",
        "title": "NASA Images",
        "mediatype": "collection",
        "collection": "internetarchive",
    }
    payload = {"metadata": item_metadata, "files_count": 0, "item_size": 0}

    def respond(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=payload)

    async with swap_shared_client(respond):
        result = await get_item_metadata(identifier="nasa")

    assert result["is_collection"] is True
    assert result["collection"] == ["internetarchive"]


# ---------- H7: shared client lifecycle ----------


async def test_get_shared_client_returns_same_instance():
    """Two consecutive get_shared_client() calls return the same object.

    The shared client is closed up front so the test starts from a fresh
    singleton, and closed again afterwards so nothing leaks into other tests.
    """
    await client_mod.close_shared_client()
    try:
        a = await client_mod.get_shared_client()
        b = await client_mod.get_shared_client()
        assert a is b
    finally:
        # Always tear the singleton down, even when the assertion fails;
        # otherwise a live shared client would bleed into later tests.
        await client_mod.close_shared_client()


async def test_close_shared_client_clears_singleton():
    """After close_shared_client(), get_shared_client() returns a new object."""
    a = await client_mod.get_shared_client()
    try:
        await client_mod.close_shared_client()
        b = await client_mod.get_shared_client()
        # ``a`` is still referenced here, so identity comparison is reliable.
        assert a is not b
    finally:
        # Guarantee cleanup on failure too, so a leftover shared client
        # cannot affect tests that run afterwards.
        await client_mod.close_shared_client()


# ---------- M2: concurrent-download serialization ----------


async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch):
    """Parallel download_file calls for one (id, filename) must not overlap.

    The mock handler counts how many requests are in flight at once; if the
    two downloads interleaved they would race on the destination file and
    the observed peak would exceed one.
    """
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    in_flight = 0
    peak = 0

    async def handler(req: httpx.Request) -> httpx.Response:
        nonlocal in_flight, peak
        in_flight += 1
        peak = max(peak, in_flight)
        # Keep the request open long enough that an unserialized second
        # call would enter the handler while this one is still active.
        await asyncio.sleep(0.05)
        in_flight -= 1
        return httpx.Response(200, content=b"file-content")

    async with swap_shared_client(handler):
        jobs = [
            download_file(identifier="nasa", filename="shared.bin", overwrite=True)
            for _ in range(2)
        ]
        await asyncio.gather(*jobs)

    # Serialization means at most one request was ever active.
    assert peak == 1


async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch):
    """Downloads of different filenames are expected to proceed concurrently.

    The mock handler tracks the number of simultaneously active requests;
    both downloads should be inside the handler at the same time.
    """
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    in_flight = 0
    peak = 0

    async def handler(req: httpx.Request) -> httpx.Response:
        nonlocal in_flight, peak
        in_flight += 1
        peak = max(peak, in_flight)
        # Hold the request briefly so the other download can overlap it.
        await asyncio.sleep(0.05)
        in_flight -= 1
        return httpx.Response(200, content=b"data")

    async with swap_shared_client(handler):
        jobs = [
            download_file(identifier="nasa", filename=name, overwrite=True)
            for name in ("a.bin", "b.bin")
        ]
        await asyncio.gather(*jobs)

    # Distinct files are not serialized against each other, so both
    # requests were active together.
    assert peak == 2