# mcarchive-org/tests/test_client_mocked.py

"""Failure-mode regression tests using httpx.MockTransport (no network).

Each test pins down one of the Hamilton review findings (C1/C2/C3/H4 etc.) so
future refactors can't silently regress safety.
"""

from __future__ import annotations

import hashlib

import httpx
import pytest

from mcarchive_org.client import (
    ArchiveClient,
    ArchiveError,
    validate_filename,
    validate_identifier,
)
from mcarchive_org.server import _confine_dest


def _client_with(handler) -> ArchiveClient:
    """Return an ArchiveClient whose transport is a MockTransport around ``handler``.

    ``handler`` is the callable handed to ``httpx.MockTransport``; in this
    module it receives each request and returns the canned response.
    """
    mock_transport = httpx.MockTransport(handler)
    return ArchiveClient(transport=mock_transport)


# ---------- C1: identifier + filename validation ----------


@pytest.mark.parametrize(
    "bad",
    [
        "",
        "../etc",
        "foo/bar",
        "has space",
        "a" * 200,
    ],
)
def test_invalid_identifier_rejected(bad):
    """Every malformed identifier must raise the validator's ValueError."""
    expected_message = r"invalid archive\.org identifier"
    with pytest.raises(ValueError, match=expected_message):
        validate_identifier(bad)


@pytest.mark.parametrize(
    "bad",
    [
        "../escape.txt",  # leading parent-directory component
        "/etc/passwd",  # absolute POSIX path
        "C:\\windows.txt",  # drive letter plus backslash
        "with\x00null.bin",  # embedded NUL byte
        "foo/../bar.mp3",  # '..' in the middle, forward slashes
        "foo\\..\\bar.mp3",  # '..' in the middle, backslashes
        "",  # empty name
    ],
)
def test_invalid_filename_rejected(bad):
    """Each unsafe or empty filename must make validate_filename raise ValueError."""
    rejection = pytest.raises(ValueError)
    with rejection:
        validate_filename(bad)


@pytest.mark.parametrize(
    "ok",
    [
        "song.mp3",
        "cover/back.jpg",
        "subdir/file with space.txt",
        "a.b.c.d",
    ],
)
def test_legitimate_filenames_accepted(ok):
    """A well-formed filename comes back from the validator unchanged."""
    validated = validate_filename(ok)
    assert validated == ok


def test_confine_dest_blocks_traversal(tmp_path, monkeypatch):
    """A '..' filename must be refused when the download root is tmp_path."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    traversal_name = "../escape.txt"
    # The '..' is caught by validate_filename before _confine_dest reaches its
    # own path-resolution check, so the ValueError originates in the validator;
    # the two layers agree on the outcome.
    with pytest.raises(ValueError):
        _confine_dest("nasa", traversal_name, dest_dir=None)


def test_confine_dest_legit_filename_lands_in_root(tmp_path, monkeypatch):
    """A plain filename resolves to a path under the configured download root."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    resolved = _confine_dest("nasa", "globe.jpg", dest_dir=None)

    assert resolved.is_relative_to(tmp_path)
    assert resolved.name == "globe.jpg"


# ---------- C2: symlink refusal ----------


async def test_download_refuses_symlink_at_dest(tmp_path):
    """download_to_file must refuse a destination path that is a symlink.

    The link's target is seeded with known bytes; after the expected
    ArchiveError those bytes must be unchanged. The test is skipped when the
    platform cannot create symlinks, because the scenario cannot be set up.
    """
    target = tmp_path / "real.bin"
    target.write_bytes(b"original-content")

    link = tmp_path / "evil.bin"
    try:
        link.symlink_to(target)
    except (OSError, NotImplementedError) as exc:
        # E.g. Windows without the symlink privilege: the precondition cannot
        # be established, so skip instead of reporting a setup error.
        pytest.skip(f"symlinks are not available here: {exc}")

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=b"new-content-that-should-not-overwrite")

    async with _client_with(handler) as c:
        with pytest.raises(ArchiveError, match="symlink"):
            await c.download_to_file("nasa", "evil.bin", link)

    # Symlink target must be unchanged.
    assert target.read_bytes() == b"original-content"


# ---------- C3: Range-ignored detection ----------


async def test_resume_with_200_response_raises_before_writing(tmp_path):
    """A 200 reply to a Range request must abort the resume before any write.

    Appending a full body to the bytes already staged would corrupt the file
    silently, so the client is expected to raise and leave the staging file as is.
    """
    staged_bytes = b"X" * 100
    dest = tmp_path / "partial.bin"
    part = dest.with_name("partial.bin.part")  # staging file holds resume state
    part.write_bytes(staged_bytes)

    def handler(req: httpx.Request) -> httpx.Response:
        # Simulate a server that ignores Range: full body with a plain 200.
        range_header = req.headers.get("Range")
        assert range_header == "bytes=100-"
        return httpx.Response(200, content=b"FULL_FILE_BODY")

    async with _client_with(handler) as client:
        with pytest.raises(ArchiveError, match="ignored Range"):
            await client.download_to_file("nasa", "partial.bin", dest)

    # Corruption avoided: no final file, and the staged bytes are untouched.
    assert not dest.exists()
    assert part.read_bytes() == staged_bytes


async def test_resume_with_correct_206_succeeds(tmp_path):
    """A 206 with a matching Content-Range completes the staged download.

    The staging file starts with the first 64 bytes; the handler serves the
    remainder, and the finished file must equal the full body with the
    staging file gone.
    """
    full_body = b"0123456789ABCDEF" * 16  # 256 bytes
    already_staged = full_body[:64]
    dest = tmp_path / "resume.bin"
    part = dest.with_name("resume.bin.part")
    part.write_bytes(already_staged)  # first 64 bytes are already in staging

    def handler(req: httpx.Request) -> httpx.Response:
        assert req.headers.get("Range") == "bytes=64-"
        content_range = f"bytes 64-{len(full_body)-1}/{len(full_body)}"
        remainder = full_body[64:]
        return httpx.Response(
            206,
            content=remainder,
            headers={"Content-Range": content_range},
        )

    expected_md5 = hashlib.md5(full_body).hexdigest()
    async with _client_with(handler) as client:
        result = await client.download_to_file(
            "nasa", "resume.bin", dest, verify_md5=expected_md5
        )

    assert result["resumed_from"] == 64
    assert result["bytes_written"] == len(full_body)
    assert result["md5_ok"] is True
    # On success the .part is atomically renamed to dest.
    assert not part.exists()
    assert dest.read_bytes() == full_body


async def test_resume_with_wrong_content_range_start_raises(tmp_path):
    """A 206 whose Content-Range starts at the wrong offset must be rejected.

    100 bytes are staged, but the handler claims its range starts at byte 50;
    the client is expected to raise and leave the staging file untouched.
    """
    staged_bytes = b"X" * 100
    dest = tmp_path / "off.bin"
    part = dest.with_name("off.bin.part")
    part.write_bytes(staged_bytes)

    def handler(req: httpx.Request) -> httpx.Response:
        # 206 status, but the advertised start offset is WRONG.
        mismatched = {"Content-Range": "bytes 50-99/100"}
        return httpx.Response(206, content=b"junk", headers=mismatched)

    async with _client_with(handler) as client:
        with pytest.raises(ArchiveError, match="Content-Range start"):
            await client.download_to_file("nasa", "off.bin", dest)

    # dest never created, .part unchanged.
    assert not dest.exists()
    assert part.read_bytes() == staged_bytes


# ---------- H4: error body surfacing ----------


async def test_search_400_includes_response_body():
    """The body of a 400 reply to search must appear in the ArchiveError text."""

    def bad_request(request: httpx.Request) -> httpx.Response:
        return httpx.Response(400, text='{"error":"bad query syntax"}')

    async with _client_with(bad_request) as client:
        with pytest.raises(ArchiveError, match="bad query syntax"):
            await client.search(query="INVALID:::")


async def test_metadata_404_includes_status():
    """A 404 from the metadata call must surface as ArchiveError naming HTTP 404."""

    def not_found(request: httpx.Request) -> httpx.Response:
        return httpx.Response(404, text="not found")

    async with _client_with(not_found) as client:
        with pytest.raises(ArchiveError, match="HTTP 404"):
            await client.metadata("nasa")


async def test_metadata_empty_dict_means_not_found():
    """A 200 reply carrying an empty JSON object is reported as a missing item."""

    def empty_object(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json={})

    async with _client_with(empty_object) as client:
        with pytest.raises(ArchiveError, match="not found or unavailable"):
            await client.metadata("nasa")


async def test_files_returns_error_payload_as_archive_error():
    """An ``error`` field in a 200 JSON reply must be raised as ArchiveError."""

    def dark_item(request: httpx.Request) -> httpx.Response:
        payload = {"error": "item is dark"}
        return httpx.Response(200, json=payload)

    async with _client_with(dark_item) as client:
        with pytest.raises(ArchiveError, match="item is dark"):
            await client.files("nasa")


async def test_scrape_error_payload_surfaced():
    """Scrape errors must report the errorType followed by the error text."""

    def range_error(request: httpx.Request) -> httpx.Response:
        payload = {"error": "count too small", "errorType": "RangeException"}
        return httpx.Response(200, json=payload)

    expected_message = r"RangeException.*count too small"
    async with _client_with(range_error) as client:
        with pytest.raises(ArchiveError, match=expected_message):
            await client.scrape(query="identifier:nasa", count=100)


async def test_invalid_json_response_surfaced():
    """A 200 reply whose body is not JSON must raise an 'invalid JSON' ArchiveError."""

    def html_page(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text="<html>not json</html>")

    async with _client_with(html_page) as client:
        with pytest.raises(ArchiveError, match="invalid JSON"):
            await client.metadata("nasa")


# ---------- M1: retry/backoff with Retry-After ----------


async def test_retry_on_429_then_success(monkeypatch):
    """A 429 with ``Retry-After: 0`` is retried once and the retry succeeds.

    asyncio.sleep is replaced by a recorder so the requested delay can be
    checked without actually waiting.
    """
    recorded_delays: list[float] = []

    async def fake_sleep(d: float) -> None:
        recorded_delays.append(d)

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", fake_sleep)

    attempts = 0

    def handler(req: httpx.Request) -> httpx.Response:
        nonlocal attempts
        attempts += 1
        if attempts != 1:
            return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})
        # Only the very first request is throttled.
        return httpx.Response(429, headers={"Retry-After": "0"}, json={"error": "slow down"})

    async with _client_with(handler) as client:
        result = await client.search(query="x", rows=1)

    assert result["num_found"] == 0
    assert attempts == 2
    assert recorded_delays == [0.0]  # honored Retry-After: 0


async def test_retry_exhaustion_raises_with_body(monkeypatch):
    """When every attempt gets a 429, the final error carries the response body."""

    def instant_sleep(d):
        # Backoff timing is irrelevant here; hand back an awaitable that
        # finishes immediately.
        return _noop_sleep()

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", instant_sleep)

    def always_throttled(request: httpx.Request) -> httpx.Response:
        return httpx.Response(429, json={"error": "rate limit exhausted"})

    async with _client_with(always_throttled) as client:
        with pytest.raises(ArchiveError, match="rate limit exhausted"):
            await client.search(query="x")


async def _noop_sleep(*_args, **_kwargs):
    """Used in place of asyncio.sleep when we don't care about backoff timing.

    Any positional or keyword arguments (such as a delay) are accepted and
    ignored, so the coroutine can be called with or without them. Awaiting it
    completes immediately and yields None.
    """
    return None


async def test_retry_on_503_for_stream(monkeypatch, tmp_path):
    """A streamed download retries after one 503 and then writes the 200 body."""

    def instant_sleep(d):
        # Skip real backoff waits.
        return _noop_sleep()

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", instant_sleep)

    body = b"actual file body"
    attempts = 0

    def handler(req: httpx.Request) -> httpx.Response:
        nonlocal attempts
        attempts += 1
        if attempts != 1:
            return httpx.Response(200, content=body)
        # First request only: the server reports overload.
        return httpx.Response(503, text="overloaded")

    dest = tmp_path / "f.bin"
    async with _client_with(handler) as client:
        result = await client.download_to_file("nasa", "f.bin", dest)

    assert result["bytes_written"] == len(body)
    assert attempts == 2
    assert dest.read_bytes() == body


async def test_retry_after_http_date_form(monkeypatch):
    """Retry-After given as an HTTP-date must be converted to a delay in seconds.

    The date used is in the past, so the expected recorded delay is 0.0.
    """
    recorded_delays: list[float] = []

    async def fake_sleep(d: float) -> None:
        recorded_delays.append(d)

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", fake_sleep)

    attempts = 0

    def handler(req: httpx.Request) -> httpx.Response:
        nonlocal attempts
        attempts += 1
        if attempts != 1:
            return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})
        # An HTTP-date in the past should produce a 0-or-negative wait, clamped to 0.
        past_date = {"Retry-After": "Wed, 21 Oct 2015 07:28:00 GMT"}
        return httpx.Response(429, headers=past_date)

    async with _client_with(handler) as client:
        await client.search(query="x")

    assert recorded_delays == [0.0]


# ---------- H1: stream-abort error context ----------


async def test_stream_abort_raises_archive_error_with_byte_count(tmp_path):
    """A mid-stream httpx failure is re-raised as ArchiveError with progress info.

    The message must mention how far the download got and the underlying
    error type, and whatever was received must sit in the .part file only.
    """
    # 128KB: enough to flush past httpx's internal chunk buffer (64KB) several
    # times, so at least one chunk reaches our writer before the error fires.
    chunk_payload = b"X" * (128 * 1024)

    async def dropping_body():
        yield chunk_payload
        raise httpx.ReadError("simulated network drop")

    def handler(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=dropping_body())

    dest = tmp_path / "interrupted.bin"
    part = dest.with_name("interrupted.bin.part")
    async with _client_with(handler) as client:
        with pytest.raises(ArchiveError) as exc_info:
            await client.download_to_file("nasa", "interrupted.bin", dest)

    message = str(exc_info.value)
    for fragment in ("interrupted after", "ReadError"):
        assert fragment in message
    # Partial bytes go to .part, NOT dest. dest stays absent until success.
    assert not dest.exists()
    partial = part.read_bytes()
    assert len(partial) > 0
    assert chunk_payload.startswith(partial)


# ---------- happy path ----------


async def test_fresh_download_writes_full_body(tmp_path):
    """A download with nothing on disk sends no Range header and writes the whole body."""
    body = b"hello world" * 100
    expected_md5 = hashlib.md5(body).hexdigest()
    dest = tmp_path / "new.bin"

    def handler(req: httpx.Request) -> httpx.Response:
        # Nothing is staged, so no resume should be requested.
        assert "Range" not in req.headers
        return httpx.Response(200, content=body)

    async with _client_with(handler) as client:
        result = await client.download_to_file(
            "nasa", "new.bin", dest, verify_md5=expected_md5
        )

    assert result["bytes_written"] == len(body)
    assert result["resumed_from"] == 0
    assert result["md5_ok"] is True
    assert result["already_complete"] is False
    # The atomic-rename pattern leaves no .part artifact after success.
    leftover_part = tmp_path / "new.bin.part"
    assert dest.read_bytes() == body
    assert not leftover_part.exists()


# ---------- Atomic .part staging ----------


async def test_failed_download_leaves_no_dest_file(tmp_path):
    """A fresh download that fails must not create the final destination file.

    Only the .part staging file may remain (or nothing at all if no bytes
    arrived); a zero-byte dest would mislead the user.
    """
    dest = tmp_path / "shouldfail.bin"

    def upstream_failure(request: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="upstream cdn miss")

    async with _client_with(upstream_failure) as client:
        with pytest.raises(ArchiveError, match="HTTP 500"):
            await client.download_to_file("nasa", "shouldfail.bin", dest)

    # Critical: no empty dest file may be left behind.
    assert not dest.exists()


async def test_already_complete_short_circuits_without_network(tmp_path):
    """With dest present and no .part file, the download makes no request at all."""
    existing = b"already-here"
    dest = tmp_path / "done.bin"
    dest.write_bytes(existing)

    request_count = 0

    def handler(req: httpx.Request) -> httpx.Response:
        nonlocal request_count
        request_count += 1
        return httpx.Response(500, text="should never fire")

    async with _client_with(handler) as client:
        result = await client.download_to_file("nasa", "done.bin", dest)

    assert request_count == 0  # no network at all
    assert result["already_complete"] is True
    assert result["bytes_written"] == len(existing)
    assert dest.read_bytes() == existing


async def test_already_complete_verifies_md5_against_existing_file(tmp_path):
    """When dest is already complete and verify_md5 is given, the file is re-hashed."""
    body = b"on-disk-content"
    expected_md5 = hashlib.md5(body).hexdigest()
    dest = tmp_path / "done.bin"
    dest.write_bytes(body)

    def must_not_be_called(request: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="should never fire")

    async with _client_with(must_not_be_called) as client:
        result = await client.download_to_file(
            "nasa", "done.bin", dest, verify_md5=expected_md5
        )

    assert result["already_complete"] is True
    assert result["md5_ok"] is True


async def test_already_complete_md5_mismatch_caught(tmp_path):
    """An existing file whose MD5 differs from the expected one yields md5_ok=False."""
    wrong_digest = "0" * 32
    dest = tmp_path / "wrong.bin"
    dest.write_bytes(b"actual-content")

    def must_not_be_called(request: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="should never fire")

    async with _client_with(must_not_be_called) as client:
        result = await client.download_to_file(
            "nasa", "wrong.bin", dest, verify_md5=wrong_digest
        )

    assert result["already_complete"] is True
    assert result["md5_ok"] is False
    assert result["md5_expected"] == wrong_digest