From 52a2be7cc60884ccaab559facc68a15498031fc1 Mon Sep 17 00:00:00 2001 From: Ryan Malloy Date: Tue, 21 Apr 2026 21:20:56 -0600 Subject: [PATCH] Release prep: CHANGELOG, CI workflow, Gitea project URLs - CHANGELOG.md documents the 2026.04.21 initial release: full tool inventory, every reliability claim, and test count (66/66 green). - .github/workflows/ci.yml runs ruff check + pytest -m 'not network' across Python 3.10/3.11/3.12/3.13 on push and PR. Skips live archive.org tests in CI to keep runs fast and avoid hammering archive.org. - pyproject.toml [project.urls]: point Homepage / Repository / Bug Tracker / Changelog at git.supported.systems/rsp2k/mcarchive-org. Keep the archive.org developer docs link for context. --- .github/workflows/ci.yml | 35 ++++++++++++++++++++++++++++++++ CHANGELOG.md | 44 ++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 6 +++++- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.yml create mode 100644 CHANGELOG.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..19a0a04 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + + - name: Install dependencies + run: uv sync --all-extras + + - name: Lint with ruff + run: uv run ruff check src/ tests/ + + - name: Run tests (skip live network tests) + run: uv run pytest -m 'not network' -v diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..44f6832 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,44 @@ +# Changelog + +Versioning is 
date-based: `YYYY.MM.DD` for normal releases, `YYYY.MM.DD.N` (PEP 440 post-release) for same-day fixes. + +## 2026.04.21 — initial release + +First public release. An MCP (Model Context Protocol) server that lets an LLM search, inspect, and download content from the [Internet Archive](https://archive.org). No API key required. + +### Tools + +- `search_items` — Solr-style search via `advancedsearch.php` (1–200 rows, paginated) +- `scrape_items` — bulk cursor-paginated search via the Scrape API (count ≥ 100) +- `get_item_metadata` — item metadata; skips the (potentially huge) files list by default +- `list_files` — files array with optional format / fnmatch glob filtering, includes pre-built `download_url` per file +- `get_file_url` — build a canonical download URL without hitting the network +- `download_file` — stream a file to disk with HTTP Range resume + optional MD5 verification +- `get_download_root` — report current download root and its source (env var vs default) +- `set_download_root` — change the download root mid-session (useful in stdio mode where env vars can't be re-exported) + +Plus an MCP resource template: `archive://item/{identifier}`. + +### Reliability features + +- **Input validation**: identifiers must match `^[A-Za-z0-9._-]+$`; filenames reject `..` components, absolute paths, NUL bytes, and Windows drive letters before any FS or network I/O +- **Path confinement**: download destinations are resolved and asserted to live under `MCARCHIVE_DOWNLOAD_ROOT`; symlinks at the destination are refused +- **`O_NOFOLLOW`**: defense-in-depth against symlink-substitution races on the destination file +- **Range-correctness check**: when resuming, the server's response must be HTTP 206 with a matching `Content-Range` start byte — otherwise the download aborts before any byte is written, eliminating silent file corruption +- **Atomic write staging**: downloads write to `dest.part` and are renamed to `dest` only on successful completion (POSIX-atomic).
Failed downloads leave only `.part`, never an empty `dest` +- **Already-complete short-circuit**: re-downloading an already-complete file skips the network entirely (still re-verifies MD5 if asked) +- **Retry with backoff**: 429/502/503/504 retried up to 3 times with `Retry-After` honored (delta-seconds and HTTP-date forms), exponential backoff with jitter, capped at 30s. Retries happen *before* any bytes are yielded, so retry can never corrupt a partial write +- **Concurrent-download serialization**: per-`(identifier, filename)` `asyncio.Lock` prevents two parallel calls from racing on the same destination file. Different files still download in parallel +- **Stream-abort surfacing**: `httpx.ReadError`/`RemoteProtocolError`/`ConnectError`/`ReadTimeout` mid-stream are caught and re-raised as `ArchiveError` with a byte-count context so the caller knows where the partial download ended +- **Error body surfacing**: 4xx/5xx responses include a body preview in the exception message — invaluable for an LLM trying to fix a bad query +- **Process-wide shared `httpx.AsyncClient`**: one connection pool reused across the server's lifetime (no TCP+TLS handshake per tool call) + +### Output normalization + +- `collection` field is always `list[str]` (archive.org returns string OR list inconsistently) +- Every search doc / metadata response includes a derived `is_collection: bool` so LLMs can route collection containers vs. real media items without re-querying +- File entries always include a ready-to-use `download_url` plus `size_human` ("12.3 MB") alongside raw `size` in bytes + +### Tests + +66 tests total (4 live integration against archive.org + 62 mock-transport regression tests). Mock tests cover every reliability claim above so future refactors can't silently regress safety. 
diff --git a/pyproject.toml b/pyproject.toml index e1602d8..3e46da3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,11 @@ dependencies = [ mcarchive-org = "mcarchive_org.server:main" [project.urls] -Homepage = "https://archive.org/developers/" +Homepage = "https://git.supported.systems/rsp2k/mcarchive-org" +Repository = "https://git.supported.systems/rsp2k/mcarchive-org" +"Bug Tracker" = "https://git.supported.systems/rsp2k/mcarchive-org/issues" +Changelog = "https://git.supported.systems/rsp2k/mcarchive-org/src/branch/main/CHANGELOG.md" +"Archive.org API docs" = "https://archive.org/developers/" [build-system] requires = ["hatchling"]