mcarchive-org/tests/test_server_mocked.py

"""Server-layer regression tests using a swapped-in shared client.

These exercise the MCP tool functions directly and verify:
- Collection normalization (M5)
- `is_collection` derived flag (M7)
- Shared client lifecycle (H7)
- Concurrent-download serialization (M2)
"""

from __future__ import annotations

import asyncio
import os
from contextlib import asynccontextmanager

import httpx
import pytest

from mcarchive_org import client as client_mod
from mcarchive_org.client import ArchiveClient
from mcarchive_org.server import (
    _enrich_doc,
    _normalize_collection,
    download_file,
    get_download_root,
    get_item_metadata,
    search_items,
    set_download_root,
)


@asynccontextmanager
async def swap_shared_client(handler):
    """Temporarily replace the process-wide shared client with a mock-backed one.

    Tests that exercise server.py tools need this because those tools call
    get_shared_client() under the hood, and we can't pass a transport in.
    """
    saved = client_mod._shared_client
    mock = ArchiveClient(transport=httpx.MockTransport(handler))
    client_mod._shared_client = mock
    try:
        yield mock
    finally:
        client_mod._shared_client = saved
        await mock.aclose()


# ---------- M5: collection normalization ----------


@pytest.mark.parametrize(
    "raw,expected",
    [
        (None, []),
        ("", []),
        ("nasa", ["nasa"]),
        (["nasa", "opensource"], ["nasa", "opensource"]),
        ([], []),
        ([None, "nasa", ""], ["nasa"]),  # falsy items dropped
    ],
)
def test_normalize_collection_shapes(raw, expected):
    assert _normalize_collection(raw) == expected


def test_enrich_doc_marks_is_collection():
    assert _enrich_doc({"mediatype": "collection", "identifier": "nasa"})["is_collection"] is True
    assert _enrich_doc({"mediatype": "audio", "identifier": "x"})["is_collection"] is False
    assert _enrich_doc({"identifier": "x"})["is_collection"] is False


def test_enrich_doc_normalizes_collection_field():
    out = _enrich_doc({"identifier": "x", "collection": "single"})
    assert out["collection"] == ["single"]


# ---------- M7: is_collection in real tool flow ----------


async def test_search_items_decorates_docs_with_is_collection():
    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(
            200,
            json={
                "response": {
                    "numFound": 2,
                    "docs": [
                        {"identifier": "nasa", "mediatype": "collection", "collection": "nasa"},
                        {"identifier": "song1", "mediatype": "audio", "collection": ["etree", "GratefulDead"]},
                    ],
                }
            },
        )

    async with swap_shared_client(handler):
        result = await search_items(query="x", rows=2)

    assert len(result["docs"]) == 2
    nasa, song = result["docs"]
    assert nasa["is_collection"] is True
    assert nasa["collection"] == ["nasa"]
    assert song["is_collection"] is False
    assert song["collection"] == ["etree", "GratefulDead"]


async def test_get_item_metadata_normalizes_collection():
    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(
            200,
            json={
                "metadata": {
                    "identifier": "nasa",
                    "title": "NASA Images",
                    "mediatype": "collection",
                    "collection": "internetarchive",
                },
                "files_count": 0,
                "item_size": 0,
            },
        )

    async with swap_shared_client(handler):
        result = await get_item_metadata(identifier="nasa")

    assert result["is_collection"] is True
    assert result["collection"] == ["internetarchive"]


# ---------- H7: shared client lifecycle ----------


async def test_get_shared_client_returns_same_instance():
    await client_mod.close_shared_client()
    a = await client_mod.get_shared_client()
    b = await client_mod.get_shared_client()
    assert a is b
    await client_mod.close_shared_client()


async def test_close_shared_client_clears_singleton():
    a = await client_mod.get_shared_client()
    await client_mod.close_shared_client()
    b = await client_mod.get_shared_client()
    assert a is not b
    await client_mod.close_shared_client()


# ---------- M2: concurrent-download serialization ----------


async def test_concurrent_downloads_same_file_are_serialized(tmp_path, monkeypatch):
    """Two parallel download_file calls for the same (id, filename) must not
    interleave — otherwise they'd race on the destination file."""
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    state = {"active": 0, "max_active": 0}

    async def handler(req: httpx.Request) -> httpx.Response:
        state["active"] += 1
        state["max_active"] = max(state["max_active"], state["active"])
        await asyncio.sleep(0.05)  # hold the request long enough to overlap
        state["active"] -= 1
        return httpx.Response(200, content=b"file-content")

    async with swap_shared_client(handler):
        await asyncio.gather(
            download_file(identifier="nasa", filename="shared.bin", overwrite=True),
            download_file(identifier="nasa", filename="shared.bin", overwrite=True),
        )

    # The lock should have prevented any overlap.
    assert state["max_active"] == 1


# ---------- runtime download root management ----------


def test_get_download_root_reports_env_value(tmp_path, monkeypatch):
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))
    info = get_download_root()
    assert info["download_root"] == str(tmp_path.resolve())
    assert info["source"] == "MCARCHIVE_DOWNLOAD_ROOT env var"
    assert info["raw_env_value"] == str(tmp_path)


def test_get_download_root_reports_default_when_no_env(monkeypatch):
    monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
    info = get_download_root()
    assert info["source"] == "default (./downloads under server CWD)"
    assert info["raw_env_value"] is None


def test_set_download_root_changes_env_and_creates_dir(tmp_path, monkeypatch):
    monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
    target = tmp_path / "new" / "spot"
    assert not target.exists()

    info = set_download_root(path=str(target))

    assert info["download_root"] == str(target.resolve())
    assert info["changed"] is True
    assert target.exists() and target.is_dir()
    assert os.environ["MCARCHIVE_DOWNLOAD_ROOT"] == str(target.resolve())


def test_set_download_root_expands_tilde(tmp_path, monkeypatch):
    monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
    monkeypatch.setenv("HOME", str(tmp_path))

    info = set_download_root(path="~/dl")

    assert info["download_root"] == str((tmp_path / "dl").resolve())
    assert (tmp_path / "dl").exists()


@pytest.mark.parametrize("forbidden", ["/etc", "/usr/local", "/var/log", "/", "/sys"])
def test_set_download_root_refuses_system_dirs(forbidden):
    with pytest.raises(ValueError, match="system directory"):
        set_download_root(path=forbidden)


async def test_set_download_root_takes_effect_for_next_download(tmp_path, monkeypatch):
    """The lazy-resolved root means a runtime change is honored by download_file
    on the very next call without restarting."""
    monkeypatch.delenv("MCARCHIVE_DOWNLOAD_ROOT", raising=False)
    set_download_root(path=str(tmp_path / "first"))

    def handler(req):
        return httpx.Response(200, content=b"data")

    async with swap_shared_client(handler):
        await download_file(identifier="nasa", filename="a.bin", overwrite=True)
        # Now move the root to a different directory mid-session.
        set_download_root(path=str(tmp_path / "second"))
        await download_file(identifier="nasa", filename="b.bin", overwrite=True)

    assert (tmp_path / "first" / "nasa" / "a.bin").exists()
    assert (tmp_path / "second" / "nasa" / "b.bin").exists()


# ---------- M2 (continued): cross-file parallelism ----------


async def test_concurrent_downloads_different_files_run_in_parallel(tmp_path, monkeypatch):
    """Different filenames get different locks — they should run concurrently."""
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", str(tmp_path))

    state = {"active": 0, "max_active": 0}

    async def handler(req: httpx.Request) -> httpx.Response:
        state["active"] += 1
        state["max_active"] = max(state["max_active"], state["active"])
        await asyncio.sleep(0.05)
        state["active"] -= 1
        return httpx.Response(200, content=b"data")

    async with swap_shared_client(handler):
        await asyncio.gather(
            download_file(identifier="nasa", filename="a.bin", overwrite=True),
            download_file(identifier="nasa", filename="b.bin", overwrite=True),
        )

    # Different files — should overlap.
    assert state["max_active"] == 2