Atomic write pattern (tier-3 polish from headless test finding):

- download_to_file now writes to <dest>.part and renames to <dest> only on successful stream completion (os.replace is POSIX-atomic). Failed downloads leave only the .part file — no misleading 0-byte dest files in the user's downloads directory.
- Resume logic reads from <dest>.part instead of <dest>; the user's directory only ever contains complete files or clearly-marked .part files.
- New `already_complete` short-circuit: if dest exists and no .part, skip the network entirely (still re-verify MD5 if requested). The headless Claude test confirmed this avoids redundant CDN load.
- Symlink rejection re-added at the new code path: even though os.replace would only replace (not follow) a symlink at dest, predictable refusal beats silent symlink removal.

Runtime download root tools (for stdio MCP mode):

- get_download_root(): reports current root, source (env var vs default), existence, writability.
- set_download_root(path): change MCARCHIVE_DOWNLOAD_ROOT mid-session. Expands ~, creates the dir, refuses system paths (/, /etc, /usr, /bin, /sbin, /var, /sys, /proc, /dev, /boot, /root). The lazy-resolved root means the change takes effect on the next download_file call without restarting the server.

14 new tests (66 total, all green, ruff clean):

- 4 staging tests: failed download leaves no dest, success leaves no .part, already_complete short-circuit, MD5 verification on existing files
- 6 root-tools tests: env reporting, default reporting, ~ expansion, system-dir refusal (parametrized), set→download takes effect immediately
- 4 existing tests rewritten to use .part as the resume staging file

Headless Claude smoke test verified end-to-end: get_download_root → set_download_root → search → list → download → second download short-circuits with already_complete=true and zero network bytes.
452 lines
15 KiB
Python
452 lines
15 KiB
Python
"""Failure-mode regression tests using httpx.MockTransport (no network).
|
|
|
|
Each test pins down one of the Hamilton review findings (C1/C2/C3/H4 etc.) so
|
|
future refactors can't silently regress safety.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
|
|
import httpx
|
|
import pytest
|
|
|
|
from mcarchive_org.client import (
|
|
ArchiveClient,
|
|
ArchiveError,
|
|
validate_filename,
|
|
validate_identifier,
|
|
)
|
|
from mcarchive_org.server import _confine_dest
|
|
|
|
|
|
def _client_with(handler) -> ArchiveClient:
    """Return an ArchiveClient whose transport is a MockTransport over ``handler``.

    ``handler`` is the callable handed to ``httpx.MockTransport``; the tests in
    this module pass a function that maps an ``httpx.Request`` to an
    ``httpx.Response``.
    """
    mock_transport = httpx.MockTransport(handler)
    return ArchiveClient(transport=mock_transport)
|
|
|
|
|
|
# ---------- C1: identifier + filename validation ----------
|
|
|
|
|
|
@pytest.mark.parametrize("bad", ["", "../etc", "foo/bar", "has space", "a" * 200])
def test_invalid_identifier_rejected(bad):
    """validate_identifier raises ValueError with the expected message for each bad value."""
    expected_message = r"invalid archive\.org identifier"
    with pytest.raises(ValueError, match=expected_message):
        validate_identifier(bad)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "bad",
    [
        "../escape.txt",  # leading parent-directory component
        "/etc/passwd",  # absolute POSIX path
        "C:\\windows.txt",  # drive-letter prefix with a backslash
        "with\x00null.bin",  # embedded NUL byte
        "foo/../bar.mp3",  # '..' in the middle, forward slashes
        "foo\\..\\bar.mp3",  # '..' in the middle, backslashes
        "",  # empty string
    ],
)
def test_invalid_filename_rejected(bad):
    """validate_filename raises ValueError for every name in the list above."""
    with pytest.raises(ValueError):
        validate_filename(bad)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "ok",
    ["song.mp3", "cover/back.jpg", "subdir/file with space.txt", "a.b.c.d"],
)
def test_legitimate_filenames_accepted(ok):
    """validate_filename hands an acceptable name back unchanged."""
    validated = validate_filename(ok)
    assert validated == ok
|
|
|
|
|
|
def test_confine_dest_blocks_traversal(tmp_path, monkeypatch):
    """_confine_dest raises ValueError for a filename that starts with '../'."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    # Per the original note: validate_filename rejects the '..' before
    # _confine_dest reaches its own path-resolution check, so the ValueError
    # comes from the validator and both layers agree.
    with pytest.raises(ValueError):
        _confine_dest("nasa", "../escape.txt", dest_dir=None)
|
|
|
|
|
|
def test_confine_dest_legit_filename_lands_in_root(tmp_path, monkeypatch):
    """A plain filename resolves to a path under the configured download root."""
    download_root = str(tmp_path)
    monkeypatch.setenv("MCARCHIVE_DOWNLOAD_ROOT", download_root)

    resolved = _confine_dest("nasa", "globe.jpg", dest_dir=None)

    assert resolved.is_relative_to(tmp_path)
    assert resolved.name == "globe.jpg"
|
|
|
|
|
|
# ---------- C2: symlink refusal ----------
|
|
|
|
|
|
async def test_download_refuses_symlink_at_dest(tmp_path):
    """download_to_file raises ArchiveError when dest is a symlink.

    The file the symlink points at must keep its original bytes.
    """
    real_file = tmp_path / "real.bin"
    real_file.write_bytes(b"original-content")

    symlink = tmp_path / "evil.bin"
    symlink.symlink_to(real_file)

    def serve_replacement(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=b"new-content-that-should-not-overwrite")

    async with _client_with(serve_replacement) as client:
        with pytest.raises(ArchiveError, match="symlink"):
            await client.download_to_file("nasa", "evil.bin", symlink)

    # The file behind the symlink must still hold what was written above.
    assert real_file.read_bytes() == b"original-content"
|
|
|
|
|
|
# ---------- C3: Range-ignored detection ----------
|
|
|
|
|
|
async def test_resume_with_200_response_raises_before_writing(tmp_path):
    """A 200 reply to a Range request must raise instead of appending.

    Appending a full body onto the already-staged bytes would corrupt the
    data silently, so the staging file has to stay exactly as it was.
    """
    staged_bytes = b"X" * 100
    dest = tmp_path / "partial.bin"
    part = tmp_path / "partial.bin.part"  # staging file that carries the resume state
    part.write_bytes(staged_bytes)

    def ignore_range(req: httpx.Request) -> httpx.Response:
        # The request asks to resume at byte 100, yet the reply is a plain
        # 200 carrying a full body.
        assert req.headers.get("Range") == "bytes=100-"
        return httpx.Response(200, content=b"FULL_FILE_BODY")

    async with _client_with(ignore_range) as client:
        with pytest.raises(ArchiveError, match="ignored Range"):
            await client.download_to_file("nasa", "partial.bin", dest)

    # Corruption avoided: staging bytes untouched and no dest file created.
    assert part.read_bytes() == staged_bytes
    assert not dest.exists()
|
|
|
|
|
|
async def test_resume_with_correct_206_succeeds(tmp_path):
    """A 206 with a matching Content-Range completes the staged download.

    The result must report the full size, the resume offset and a passing MD5,
    and the staging file must be gone once dest holds the full body.
    """
    full_body = b"0123456789ABCDEF" * 16  # 256 bytes
    dest = tmp_path / "resume.bin"
    part = tmp_path / "resume.bin.part"
    # Staging already holds the first 64 bytes of the body.
    part.write_bytes(full_body[:64])

    def serve_remainder(req: httpx.Request) -> httpx.Response:
        assert req.headers.get("Range") == "bytes=64-"
        content_range = {"Content-Range": f"bytes 64-{len(full_body)-1}/{len(full_body)}"}
        return httpx.Response(206, content=full_body[64:], headers=content_range)

    expected_md5 = hashlib.md5(full_body).hexdigest()
    async with _client_with(serve_remainder) as client:
        result = await client.download_to_file(
            "nasa", "resume.bin", dest, verify_md5=expected_md5
        )

    assert result["bytes_written"] == len(full_body)
    assert result["resumed_from"] == 64
    assert result["md5_ok"] is True
    # After success dest holds the whole body and the .part file is gone.
    assert dest.read_bytes() == full_body
    assert not part.exists()
|
|
|
|
|
|
async def test_resume_with_wrong_content_range_start_raises(tmp_path):
    """A 206 whose Content-Range starts at the wrong offset must raise.

    The staging file keeps its bytes and dest is never created.
    """
    staged_bytes = b"X" * 100
    dest = tmp_path / "off.bin"
    part = tmp_path / "off.bin.part"
    part.write_bytes(staged_bytes)

    def serve_wrong_offset(req: httpx.Request) -> httpx.Response:
        # 206 reply, but the range starts at 50 while 100 bytes are staged.
        wrong_range = {"Content-Range": "bytes 50-99/100"}
        return httpx.Response(206, content=b"junk", headers=wrong_range)

    async with _client_with(serve_wrong_offset) as client:
        with pytest.raises(ArchiveError, match="Content-Range start"):
            await client.download_to_file("nasa", "off.bin", dest)

    # Staging bytes untouched, final file absent.
    assert part.read_bytes() == staged_bytes
    assert not dest.exists()
|
|
|
|
|
|
# ---------- H4: error body surfacing ----------
|
|
|
|
|
|
async def test_search_400_includes_response_body():
    """The ArchiveError raised for a 400 search reply carries the response body text."""

    def reject_query(req: httpx.Request) -> httpx.Response:
        return httpx.Response(400, text='{"error":"bad query syntax"}')

    async with _client_with(reject_query) as client:
        with pytest.raises(ArchiveError, match="bad query syntax"):
            await client.search(query="INVALID:::")
|
|
|
|
|
|
async def test_metadata_404_includes_status():
    """The ArchiveError raised for a 404 metadata reply mentions 'HTTP 404'."""

    def not_found(req: httpx.Request) -> httpx.Response:
        return httpx.Response(404, text="not found")

    async with _client_with(not_found) as client:
        with pytest.raises(ArchiveError, match="HTTP 404"):
            await client.metadata("nasa")
|
|
|
|
|
|
async def test_metadata_empty_dict_means_not_found():
    """A 200 metadata reply with an empty JSON object raises ArchiveError."""

    def empty_object(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json={})

    async with _client_with(empty_object) as client:
        with pytest.raises(ArchiveError, match="not found or unavailable"):
            await client.metadata("nasa")
|
|
|
|
|
|
async def test_files_returns_error_payload_as_archive_error():
    """A 200 reply whose JSON has an 'error' key makes files() raise ArchiveError."""

    def dark_item(req: httpx.Request) -> httpx.Response:
        payload = {"error": "item is dark"}
        return httpx.Response(200, json=payload)

    async with _client_with(dark_item) as client:
        with pytest.raises(ArchiveError, match="item is dark"):
            await client.files("nasa")
|
|
|
|
|
|
async def test_scrape_error_payload_surfaced():
    """scrape() raises ArchiveError naming the errorType followed by the error text."""

    def range_error(req: httpx.Request) -> httpx.Response:
        payload = {"error": "count too small", "errorType": "RangeException"}
        return httpx.Response(200, json=payload)

    async with _client_with(range_error) as client:
        with pytest.raises(ArchiveError, match=r"RangeException.*count too small"):
            await client.scrape(query="identifier:nasa", count=100)
|
|
|
|
|
|
async def test_invalid_json_response_surfaced():
    """A 200 reply with a non-JSON body makes metadata() raise ArchiveError."""

    def html_body(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, text="<html>not json</html>")

    async with _client_with(html_body) as client:
        with pytest.raises(ArchiveError, match="invalid JSON"):
            await client.metadata("nasa")
|
|
|
|
|
|
# ---------- happy path ----------
|
|
|
|
|
|
# ---------- M1: retry/backoff with Retry-After ----------
|
|
|
|
|
|
async def test_retry_on_429_then_success(monkeypatch):
    """A 429 carrying ``Retry-After: 0`` is retried once and then succeeds."""
    recorded_delays: list[float] = []

    async def record_sleep(delay: float) -> None:
        recorded_delays.append(delay)

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", record_sleep)

    calls = {"n": 0}

    def throttle_once(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        if calls["n"] > 1:
            return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})
        # Only the very first request is throttled.
        return httpx.Response(429, headers={"Retry-After": "0"}, json={"error": "slow down"})

    async with _client_with(throttle_once) as client:
        result = await client.search(query="x", rows=1)

    assert result["num_found"] == 0
    assert calls["n"] == 2
    assert recorded_delays == [0.0]  # honored Retry-After: 0
|
|
|
|
|
|
async def test_retry_exhaustion_raises_with_body(monkeypatch):
    """When every attempt returns 429, the raised error carries the reply's error text."""

    def skip_backoff(d):
        # Return an awaitable that finishes immediately.
        return _noop_sleep()

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", skip_backoff)

    def always_throttled(req: httpx.Request) -> httpx.Response:
        return httpx.Response(429, json={"error": "rate limit exhausted"})

    async with _client_with(always_throttled) as client:
        with pytest.raises(ArchiveError, match="rate limit exhausted"):
            await client.search(query="x")
|
|
|
|
|
|
async def _noop_sleep():
|
|
"""Used in place of asyncio.sleep when we don't care about backoff timing."""
|
|
|
|
|
|
async def test_retry_on_503_for_stream(monkeypatch, tmp_path):
    """A download that gets 503 once and then 200 ends with the full body on disk."""

    def skip_backoff(d):
        # Return an awaitable that finishes immediately.
        return _noop_sleep()

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", skip_backoff)

    body = b"actual file body"
    calls = {"n": 0}

    def overloaded_once(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        if calls["n"] > 1:
            return httpx.Response(200, content=body)
        # Only the very first request sees the 503.
        return httpx.Response(503, text="overloaded")

    dest = tmp_path / "f.bin"
    async with _client_with(overloaded_once) as client:
        result = await client.download_to_file("nasa", "f.bin", dest)

    assert result["bytes_written"] == len(body)
    assert calls["n"] == 2
    assert dest.read_bytes() == body
|
|
|
|
|
|
async def test_retry_after_http_date_form(monkeypatch):
    """A Retry-After given as an HTTP-date in the past yields a sleep of 0.0 seconds."""
    recorded_delays: list[float] = []

    async def record_sleep(delay: float) -> None:
        recorded_delays.append(delay)

    monkeypatch.setattr("mcarchive_org.client.asyncio.sleep", record_sleep)
    calls = {"n": 0}

    def throttle_once(req: httpx.Request) -> httpx.Response:
        calls["n"] += 1
        if calls["n"] > 1:
            return httpx.Response(200, json={"response": {"numFound": 0, "docs": []}})
        # The date lies in the past, so the computed wait is zero or negative
        # and is expected to be clamped to 0.
        return httpx.Response(429, headers={"Retry-After": "Wed, 21 Oct 2015 07:28:00 GMT"})

    async with _client_with(throttle_once) as client:
        await client.search(query="x")

    assert recorded_delays == [0.0]
|
|
|
|
|
|
# ---------- H1: stream-abort error context ----------
|
|
|
|
|
|
async def test_stream_abort_raises_archive_error_with_byte_count(tmp_path):
    """A mid-stream httpx error surfaces as ArchiveError describing the interruption.

    The message must contain "interrupted after" and the name of the underlying
    error, so the caller can tell where the partial download stopped.
    """
    # 128KB payload: per the original note this exceeds httpx's internal
    # chunk buffer (64KB), so at least one chunk reaches the writer before
    # the error fires.
    chunk_payload = b"X" * (1 << 17)

    async def dropping_body():
        yield chunk_payload
        raise httpx.ReadError("simulated network drop")

    def serve_then_drop(req: httpx.Request) -> httpx.Response:
        return httpx.Response(200, content=dropping_body())

    dest = tmp_path / "interrupted.bin"
    part = tmp_path / "interrupted.bin.part"
    async with _client_with(serve_then_drop) as client:
        with pytest.raises(ArchiveError) as exc_info:
            await client.download_to_file("nasa", "interrupted.bin", dest)

    message = str(exc_info.value)
    assert "interrupted after" in message
    assert "ReadError" in message
    # Partial bytes belong in the .part file; dest stays absent until success.
    assert not dest.exists()
    staged = part.read_bytes()
    assert len(staged) > 0
    assert chunk_payload.startswith(staged)
|
|
|
|
|
|
# ---------- happy path ----------
|
|
|
|
|
|
async def test_fresh_download_writes_full_body(tmp_path):
    """A download with nothing staged sends no Range header and writes the whole body."""
    body = b"hello world" * 100
    dest = tmp_path / "new.bin"
    expected_md5 = hashlib.md5(body).hexdigest()

    def serve_full(req: httpx.Request) -> httpx.Response:
        assert "Range" not in req.headers
        return httpx.Response(200, content=body)

    async with _client_with(serve_full) as client:
        result = await client.download_to_file(
            "nasa", "new.bin", dest, verify_md5=expected_md5
        )

    assert result["bytes_written"] == len(body)
    assert result["resumed_from"] == 0
    assert result["md5_ok"] is True
    assert result["already_complete"] is False
    # After success dest holds the body and no .part artifact remains.
    assert dest.read_bytes() == body
    assert not (tmp_path / "new.bin.part").exists()
|
|
|
|
|
|
# ---------- Atomic .part staging ----------
|
|
|
|
|
|
async def test_failed_download_leaves_no_dest_file(tmp_path):
    """A fresh download that fails with HTTP 500 must not create the dest file.

    Only the .part staging file (or nothing at all, when no bytes arrived) may
    be left behind; a zero-byte dest would mislead the user.
    """
    dest = tmp_path / "shouldfail.bin"

    def upstream_failure(req: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="upstream cdn miss")

    async with _client_with(upstream_failure) as client:
        with pytest.raises(ArchiveError, match="HTTP 500"):
            await client.download_to_file("nasa", "shouldfail.bin", dest)

    # Critical check: no empty dest file exists after the failure.
    assert not dest.exists()
|
|
|
|
|
|
async def test_already_complete_short_circuits_without_network(tmp_path):
    """With dest present and no .part file, a repeat download makes no request."""
    dest = tmp_path / "done.bin"
    dest.write_bytes(b"already-here")

    calls = {"n": 0}

    def count_and_fail(req: httpx.Request) -> httpx.Response:
        # Any request reaching this handler is counted and answered with 500.
        calls["n"] += 1
        return httpx.Response(500, text="should never fire")

    async with _client_with(count_and_fail) as client:
        result = await client.download_to_file("nasa", "done.bin", dest)

    assert calls["n"] == 0  # no network at all
    assert result["already_complete"] is True
    assert result["bytes_written"] == len(b"already-here")
    assert dest.read_bytes() == b"already-here"
|
|
|
|
|
|
async def test_already_complete_verifies_md5_against_existing_file(tmp_path):
    """With verify_md5 given and dest already present, the result reports md5_ok=True."""
    body = b"on-disk-content"
    dest = tmp_path / "done.bin"
    dest.write_bytes(body)
    expected_md5 = hashlib.md5(body).hexdigest()

    def fail_if_called(req: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="should never fire")

    async with _client_with(fail_if_called) as client:
        result = await client.download_to_file(
            "nasa", "done.bin", dest, verify_md5=expected_md5
        )

    assert result["already_complete"] is True
    assert result["md5_ok"] is True
|
|
|
|
|
|
async def test_already_complete_md5_mismatch_caught(tmp_path):
    """An existing file whose MD5 differs from the expected one reports md5_ok=False."""
    dest = tmp_path / "wrong.bin"
    dest.write_bytes(b"actual-content")

    def fail_if_called(req: httpx.Request) -> httpx.Response:
        return httpx.Response(500, text="should never fire")

    async with _client_with(fail_if_called) as client:
        result = await client.download_to_file(
            "nasa", "wrong.bin", dest, verify_md5="0" * 32
        )

    assert result["already_complete"] is True
    assert result["md5_ok"] is False
    assert result["md5_expected"] == "0" * 32
|