pg_orrery/bench/build_catalog.py

#!/usr/bin/env python3
"""
Build a merged TLE catalog from multiple sources for pg_orrery benchmarks.

Usage:
    # Merge existing TLE files into SQL
    ./build_catalog.py bench/spacetrack_everything.tle bench/celestrak_active.tle ...

    # Pipe to psql
    ./build_catalog.py bench/*.tle | PGPORT=5499 psql -d contrib_regression

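    # Or generate SQL file
    ./build_catalog.py bench/*.tle > bench/load_catalog.sql

    # Load into a table other than the default bench_catalog
    ./build_catalog.py --table my_catalog bench/*.tle > bench/load_catalog.sql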

Deduplication: when the same NORAD ID appears in multiple files, the entry
with the newest epoch wins.  This means CelesTrak SupGP data (fresher epochs)
automatically overrides stale Space-Track entries.

Alpha-5 NORAD IDs (T0002 etc.) are handled transparently — they parse into
integers >= 100,000 via the same logic as Bill Gray's get_el.c.
"""
import sys
import os
import re
from collections import OrderedDict

# Alpha-5 NORAD decoding — mirrors get_norad_number() in src/sgp4/get_el.c
# NOTE(review): confirm that get_el.c also rejects the letters I and O.
_ALPHA5_SKIP = {'I', 'O'}  # skipped in Alpha-5 encoding
_ASCII_DIGITS = frozenset('0123456789')


def decode_norad(s):
    """Decode a 5-character NORAD field to integer.  Handles Alpha-5.

    Two encodings are accepted (surrounding whitespace is ignored):

    * Plain numeric: one or more ASCII digits, e.g. ``"25544"`` -> 25544.
    * Alpha-5: one ASCII capital letter followed by exactly four ASCII
      digits.  The letter stands for the two leading decimal digits
      (A=10, B=11, ..., Z=33); the letters I and O are not part of the
      alphabet, so ``"T0002"`` -> 270002.

    Returns:
        The catalog number as an int, or None when the field is empty or
        malformed (including the unused letters I and O, lowercase or
        non-ASCII letters, and signs/underscores that ``int()`` alone would
        silently accept).
    """
    s = s.strip()
    if not s:
        return None

    first, rest = s[0], s[1:]

    if first in _ASCII_DIGITS:
        # Check every character ourselves: int() would also accept forms
        # such as "1_234" that can never appear in a valid TLE.
        if not _ASCII_DIGITS.issuperset(rest):
            return None
        return int(s)

    # Only ASCII capitals are valid Alpha-5 prefixes, and I/O are never
    # emitted; accepting them would alias them onto J and P respectively.
    if not ('A' <= first <= 'Z') or first in _ALPHA5_SKIP:
        return None
    if len(rest) != 4 or not _ASCII_DIGITS.issuperset(rest):
        return None

    # Letter index with the two skipped letters closed up.
    val = ord(first) - ord('A')
    if first > 'I':
        val -= 1
    if first > 'O':
        val -= 1
    return val * 10000 + int(rest) + 100000


def _epoch_sort_key(field):
    """Turn a TLE epoch field (YYDDD.DDDDDDDD) into a chronologically sortable float.

    The raw field carries a two-digit year, so comparing it directly ranks
    1999 (99xxx) above 2024 (24xxx).  Years 57-99 are mapped to 19xx and
    00-56 to 20xx (the usual TLE convention), giving YYYYDDD.DDDDDDDD.

    Returns 0.0 when the field cannot be parsed or is out of range, so such
    entries lose against any entry with a valid epoch.
    """
    try:
        raw = float(field.strip())
    except ValueError:
        return 0.0
    # Also filters NaN/inf: every comparison with NaN is False.
    if not (0.0 <= raw < 100000.0):
        return 0.0
    two_digit_year = int(raw // 1000)
    century = 1900 if two_digit_year >= 57 else 2000
    return century * 1000 + raw


def parse_3le_file(filepath):
    """Parse a 3LE (or 2LE) file into a dict of norad_str -> (line1, line2, name, epoch).

    A record is a line starting with ``"1 "`` immediately followed by a line
    starting with ``"2 "``.  If the line just before it is non-empty and is
    not itself a TLE line, it is taken (stripped) as the object name;
    otherwise the name is ``''``.

    Args:
        filepath: Path of the TLE file.  Undecodable bytes are replaced
            rather than raising.

    Returns:
        Dict keyed by the decoded NORAD number as a decimal string.  Each
        value is ``(line1, line2, name, epoch)`` where ``epoch`` is a
        sortable float in YYYYDDD.DDDDDDDD form (four-digit year), or 0.0
        when the epoch field is unparseable.  When a NORAD ID occurs more
        than once, the record with the newest epoch is kept.  Records whose
        NORAD field cannot be decoded are skipped.  A missing file yields an
        empty dict and a ``# SKIP`` note on stderr.
    """
    try:
        # Context manager so the handle is closed even if reading fails.
        with open(filepath, errors='replace') as handle:
            lines = [raw.rstrip('\r\n') for raw in handle]
    except FileNotFoundError:
        print(f"# SKIP {filepath}: not found", file=sys.stderr)
        return {}

    objects = {}
    total = len(lines)
    i = 0
    while i < total:
        line1 = lines[i]
        is_pair = (
            line1.startswith('1 ')
            and i + 1 < total
            and lines[i + 1].startswith('2 ')
        )
        if not is_pair:
            i += 1
            continue

        line2 = lines[i + 1]

        # Look back for name line (3LE format)
        name = ''
        if i > 0:
            prev = lines[i - 1]
            if prev and not prev.startswith(('1 ', '2 ')):
                name = prev.strip()

        # Both lines of the pair are consumed whether or not the record is kept.
        i += 2

        # Extract NORAD ID (works for both standard and Alpha-5)
        norad_int = decode_norad(line1[2:7])
        if norad_int is None:
            continue
        norad_str = str(norad_int)

        # Epoch field of line 1, normalised so that comparison is chronological.
        epoch = _epoch_sort_key(line1[18:32])

        # Keep the entry with the newest epoch
        if norad_str not in objects or epoch > objects[norad_str][3]:
            objects[norad_str] = (line1, line2, name, epoch)

    return objects


def _escape_e_string(text):
    """Escape text for use inside a PostgreSQL E'...' string literal.

    Backslashes are doubled and single quotes are doubled, so the text is
    read back verbatim regardless of the standard_conforming_strings setting.
    """
    return text.replace('\\', '\\\\').replace("'", "''")


def main():
    """Merge the TLE files named on the command line and print SQL to stdout.

    Command line: ``[--table NAME | --table=NAME] FILE...``.  Every other
    argument is treated as a TLE file path.  With no arguments the module
    docstring is printed to stderr and the process exits with status 1.

    For each NORAD ID the record with the newest epoch across all files is
    kept.  Per-file and total statistics go to stderr; stdout receives a
    DROP/CREATE TABLE pair followed by one INSERT per object, ordered by
    NORAD number.
    """
    if len(sys.argv) < 2:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    # Parse the --table option; everything else is an input file.
    table_name = 'bench_catalog'
    files = []
    args = sys.argv[1:]
    idx = 0
    while idx < len(args):
        arg = args[idx]
        if arg == '--table':
            if idx + 1 >= len(args):
                # Previously a trailing --table was silently treated as a
                # file name and the default table was dropped/recreated.
                print("error: --table requires a table name", file=sys.stderr)
                sys.exit(1)
            table_name = args[idx + 1]
            idx += 2
        elif arg.startswith('--table='):
            table_name = arg.split('=', 1)[1]
            idx += 1
        else:
            files.append(arg)
            idx += 1

    # Merge all sources (later files override earlier for same NORAD ID if newer epoch)
    mega = {}
    for filepath in files:
        objs = parse_3le_file(filepath)
        new = updated = 0
        for k, v in objs.items():
            if k not in mega:
                new += 1
                mega[k] = v
            elif v[3] > mega[k][3]:
                updated += 1
                mega[k] = v
        basename = os.path.basename(filepath)
        print(f"-- {basename}: {len(objs)} objects ({new} new, {updated} updated)", file=sys.stderr)

    print(f"-- Total: {len(mega)} unique objects", file=sys.stderr)

    # Emit SQL.  table_name is inserted verbatim as an identifier, so the
    # caller must pass a valid (optionally schema-qualified) SQL name.
    print(f"-- pg_orrery benchmark catalog ({len(mega)} objects)")
    print(f"-- Generated from {len(files)} TLE source files")
    print(f"-- Sources: {', '.join(os.path.basename(f) for f in files)}")
    print()
    print(f"DROP TABLE IF EXISTS {table_name};")
    print(f"CREATE TABLE {table_name} (")
    print("    id serial,")
    print("    name text,")
    print("    tle tle")
    print(");")
    print()

    for norad_str in sorted(mega, key=int):
        line1, line2, name, _epoch = mega[norad_str]

        if not name:
            name = f'NORAD {norad_str}'

        # Both values come from file content, so both are escaped and
        # emitted as E'' literals; the two TLE lines are joined by an
        # escaped newline inside the literal.
        name_sql = _escape_e_string(name)
        tle_sql = f"{_escape_e_string(line1)}\\n{_escape_e_string(line2)}"

        print(f"INSERT INTO {table_name} (name, tle) VALUES (E'{name_sql}', E'{tle_sql}');")

    print()
    print(f"-- Loaded {len(mega)} objects")


if __name__ == '__main__':
    main()