"""Tests for the SELECT-only SQL guardrail.""" import pytest from mcaxl.sql_validator import validate_select, SqlValidationError class TestSelectAccepted: def test_simple_select(self): assert validate_select("SELECT * FROM device") == "SELECT * FROM device" def test_with_cte(self): q = "WITH x AS (SELECT 1 FROM systables) SELECT * FROM x" assert validate_select(q) == q def test_lowercase_select(self): assert validate_select("select * from numplan") == "select * from numplan" def test_trailing_semicolon_stripped(self): assert validate_select("SELECT 1 FROM device;") == "SELECT 1 FROM device" def test_block_comments_stripped(self): q = "/* comment */ SELECT 1 FROM device" cleaned = validate_select(q) assert "SELECT 1 FROM device" in cleaned def test_line_comments_stripped(self): q = "-- a comment\nSELECT 1 FROM device" cleaned = validate_select(q) assert "SELECT 1 FROM device" in cleaned class TestRejected: def test_empty(self): with pytest.raises(SqlValidationError, match="empty"): validate_select("") def test_whitespace_only(self): with pytest.raises(SqlValidationError, match="empty"): validate_select(" \n ") def test_only_comments(self): with pytest.raises(SqlValidationError, match="empty"): validate_select("-- just a comment\n/* and another */") def test_insert_rejected(self): with pytest.raises(SqlValidationError, match="INSERT"): validate_select("INSERT INTO device VALUES (1)") def test_update_rejected(self): with pytest.raises(SqlValidationError, match="UPDATE"): validate_select("UPDATE device SET name='x' WHERE pkid='y'") def test_delete_rejected(self): with pytest.raises(SqlValidationError, match="DELETE"): validate_select("DELETE FROM device WHERE pkid='y'") def test_drop_rejected(self): with pytest.raises(SqlValidationError, match="DROP"): validate_select("DROP TABLE device") def test_select_with_embedded_drop_rejected(self): # Belt-and-suspenders: even if "DROP" appears alongside a SELECT, the # query is rejected. Under the sqlparse-based validator the rejection # reason is now "Multiple statements detected" (caught earlier than # forbidden-keyword scan); under the prior regex validator it was # "DROP". Either is correct — we just want this query refused. with pytest.raises(SqlValidationError, match="DROP|Multiple"): validate_select("SELECT 1 FROM device; DROP TABLE device") def test_truncate_rejected(self): with pytest.raises(SqlValidationError, match="TRUNCATE"): validate_select("TRUNCATE TABLE device") class TestEdgeCases: def test_keyword_as_column_name_blocked(self): # A column named "delete" would be blocked. This is acceptable — # the data dictionary doesn't use SQL keywords as column names, # and conservative blocking is the right call for v1. with pytest.raises(SqlValidationError): validate_select("SELECT delete FROM device") def test_select_with_subquery(self): q = "SELECT name FROM device WHERE pkid IN (SELECT fkdevice FROM numplan)" assert "SELECT name FROM device" in validate_select(q) class TestStringLiterals: """Forbidden keywords inside string literals must be ignored. Otherwise CSS names like 'Call Forward-CSS', DN descriptions containing 'DELETE' (e.g., 'Delete this voicemail line'), or partition names with 'INSERT' would all fail to query, even though the SQL itself is read-only. """ def test_call_inside_string_literal_passes(self): q = "SELECT pkid FROM callingsearchspace WHERE name = 'Call Forward-CSS'" result = validate_select(q) assert "Call Forward-CSS" in result # literal preserved def test_delete_inside_string_literal_passes(self): q = "SELECT pkid FROM numplan WHERE description = 'Delete after audit'" result = validate_select(q) assert "Delete after audit" in result def test_drop_inside_string_literal_passes(self): q = "SELECT pkid FROM numplan WHERE description = 'DROP table backup'" assert validate_select(q) def test_actual_drop_outside_literal_still_blocked(self): # See test_select_with_embedded_drop_rejected — rejection reason # is now multi-statement detection (sqlparse catches it earlier), # but the query still fails closed. with pytest.raises(SqlValidationError, match="DROP|Multiple"): validate_select("SELECT 1 FROM device; DROP TABLE backup") def test_escaped_quote_in_literal(self): # Informix uses '' (doubled) as escaped single quote within literals q = "SELECT pkid FROM numplan WHERE description = 'O''Brien''s line'" result = validate_select(q) assert "O''Brien''s line" in result def test_keyword_just_outside_literal_blocked(self): # The literal 'safe text' is fine; the trailing DROP is not. with pytest.raises(SqlValidationError, match="DROP"): validate_select("SELECT 1 FROM device WHERE x = 'safe text' OR DROP") def test_multiple_literals(self): q = "SELECT 1 FROM numplan WHERE name = 'CALL' AND description = 'UPDATE pending'" assert validate_select(q) class TestLiteralPreservedInOutput: """Hamilton review CRITICAL #1: comment-strip mutated string literals. The query SENT to AXL must preserve the literal contents byte-for-byte. Previously, the comment-strip pass ran before the literal-aware pass, so `--` or `/* */` inside a quoted string were silently eaten on the way to the cluster. An LLM dialing `description LIKE '%-- old%'` got a different query than it asked for. """ def test_dash_dash_inside_literal_preserved(self): q = "SELECT * FROM numplan WHERE description = 'Smith -- old line'" result = validate_select(q) assert "Smith -- old line" in result, ( f"line-comment marker inside literal must NOT be stripped; got: {result!r}" ) def test_block_comment_marker_inside_literal_preserved(self): q = "SELECT * FROM device WHERE name = 'before /* still in literal */ after'" result = validate_select(q) assert "/* still in literal */" in result assert "before" in result and "after" in result def test_like_pattern_with_dash_dash_preserved(self): # Real-world case: an LLM searches for descriptions containing "--" q = "SELECT pkid FROM numplan WHERE description LIKE '%-- old%'" result = validate_select(q) assert "'%-- old%'" in result def test_actual_line_comment_outside_literal_still_handled(self): # An actual --comment outside any literal is fine (AXL handles it), # and the keyword check ignores it. q = "SELECT 1 FROM device -- a real comment at the end" result = validate_select(q) # We don't strip from output, so the comment stays in the returned text. # The important thing is the validator passes and a forbidden keyword # in the comment wouldn't trip the check (covered separately). assert "SELECT 1 FROM device" in result def test_forbidden_keyword_inside_real_comment_does_not_trip(self): # Real comment, with a forbidden keyword in it, should not trip the validator q = "SELECT 1 FROM device -- TODO: someone DELETE the old test data" result = validate_select(q) assert "SELECT 1" in result def test_block_literal_with_drop_inside_preserved(self): q = "SELECT 1 FROM numplan WHERE description = 'log: DROP detected'" result = validate_select(q) assert "'log: DROP detected'" in result class TestSqlparseSpecific: """Cases that exercise wins of the sqlparse-based validator over the earlier regex implementation. Each test names the property being checked. """ def test_multi_statement_explicit_reject(self): # Two SELECTs, no forbidden keywords. The regex validator accepted # this because neither SELECT is in _FORBIDDEN. sqlparse catches it # via the explicit statement-count check. with pytest.raises(SqlValidationError, match="Multiple"): validate_select("SELECT 1; SELECT 2") def test_multi_statement_with_intervening_comments_reject(self): # Comments between statements don't disguise the multi-statement # nature — sqlparse still parses 2 statements. with pytest.raises(SqlValidationError, match="Multiple"): validate_select("SELECT 1 /* break */; /* and again */ SELECT 2") def test_keyword_substring_in_identifier_passes(self): # `inserted_at`, `update_status`, `dropped_call_count` — all # legitimate column names containing forbidden-keyword substrings. # sqlparse classifies them as Token.Name (single tokens), so the # forbidden scan correctly ignores them. for col in ("inserted_at", "update_status", "dropped_call_count"): q = f"SELECT {col} FROM device" assert validate_select(q) == q, f"column {col!r} should pass" def test_forbidden_keyword_inside_cte_rejected(self): # A CTE that internally does a DELETE must still be caught — sqlparse # walks into nested groups, so the DELETE keyword token is found # even though it's inside `WITH x AS (...)`. with pytest.raises(SqlValidationError, match="DELETE"): validate_select("WITH x AS (DELETE FROM y) SELECT * FROM x") def test_drop_inside_block_comment_passes(self): # The DROP is inside a /* ... */ comment. sqlparse classifies the # entire comment as Token.Comment, so its content never reaches the # forbidden-keyword scan. q = "SELECT 1 /* ; DROP TABLE foo */ FROM device" assert validate_select(q) == q def test_nested_cte_all_clean_passes(self): # Multiple CTEs chained, all SELECT-only — must accept. q = ( "WITH a AS (SELECT 1 AS n FROM systables), " "b AS (SELECT n FROM a WHERE n > 0) " "SELECT * FROM b" ) assert validate_select(q) == q