Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions source/tyr/delete_unused_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
Script to delete unused tokens from the TYR database.

Reads a CSV file (semicolon-separated) containing token information and generates
SQL DELETE statements for the 'key' table based on the token prefix column.

For entries where the token prefix was corrupted by Excel (scientific notation or
truncated), the deletion falls back to using the key ID (tyr_id).

Usage:
# Generate SQL file (dry-run, review before executing):
python delete_unused_tokens.py --csv tokens.csv --output delete_tokens.sql

# Execute directly against the database (requires SQLALCHEMY_DATABASE_URI or --db-uri):
python delete_unused_tokens.py --csv tokens.csv --execute --db-uri "postgresql://user:pass@host/db"
"""

import argparse
import csv
import re
import sys


def parse_csv(csv_path):
    """Parse the semicolon-separated CSV and split rows into two lists.

    Expected columns (after one header row): tyr_id; login; token_prefix.

    Args:
        csv_path: path to the semicolon-delimited CSV file.

    Returns:
        A ``(valid_entries, corrupted_entries)`` tuple where:
        - valid_entries: list of (tyr_id, token_prefix) tuples whose prefix is
          a valid 8-character hex string;
        - corrupted_entries: list of (tyr_id, login, raw_prefix) tuples for
          everything else (e.g. prefixes mangled by Excel into scientific
          notation or truncated values).
    """
    valid_entries = []
    corrupted_entries = []

    # Valid token prefixes are exactly 8 hex characters.
    # Compiled once here instead of re.match() on every row.
    hex_prefix = re.compile(r"[0-9a-fA-F]{8}")

    # latin-1 encoding kept from the original script — presumably because the
    # export tool emits non-UTF-8 accented characters; confirm with the
    # producer of the CSV.
    with open(csv_path, "r", encoding="latin-1") as f:
        reader = csv.reader(f, delimiter=";")
        header = next(reader)
        print(f"CSV columns: {header}")

        for row in reader:
            # Skip short or blank rows (trailing empty lines, malformed export).
            if len(row) < 3 or not row[0].strip():
                continue

            tyr_id = row[0].strip()
            login = row[1].strip()
            token_prefix = row[2].strip()

            # Rows with an empty prefix carry nothing to delete by.
            if not token_prefix:
                continue

            if hex_prefix.fullmatch(token_prefix):
                valid_entries.append((tyr_id, token_prefix))
            else:
                corrupted_entries.append((tyr_id, login, token_prefix))

    return valid_entries, corrupted_entries


def generate_sql(valid_entries, corrupted_entries):
    """Generate the SQL DELETE script as a single string.

    Args:
        valid_entries: list of (tyr_id, token_prefix) tuples; each prefix must
            be an 8-character hex string (re-validated here as defense in
            depth even though parse_csv already filters them).
        corrupted_entries: list of (tyr_id, login, raw_prefix) tuples whose
            prefix was corrupted (e.g. by Excel); deletion falls back to
            ``key.id = tyr_id``.

    Returns:
        The full SQL script. The transaction is left open on purpose: COMMIT
        is emitted commented-out so a human reviews the counts first.

    Raises:
        ValueError: if a corrupted entry's tyr_id is not an integer, or a
            "valid" entry's prefix is not 8 hex chars — both checks prevent
            SQL injection through CSV content.
    """
    lines = []
    lines.append("-- =============================================================")
    lines.append("-- Script de suppression des tokens inutilisés depuis 365 jours")
    lines.append("-- =============================================================")
    lines.append("-- ATTENTION: Exécuter dans une transaction pour pouvoir annuler")
    lines.append("-- en cas de problème.")
    lines.append("-- =============================================================")
    lines.append("")
    lines.append("BEGIN;")
    lines.append("")

    # --- Part 1: Delete by token prefix (valid hex entries) ---
    lines.append(f"-- Partie 1: Suppression par préfixe de token ({len(valid_entries)} entrées)")
    lines.append("-- Utilisation d'une table temporaire pour les préfixes")
    lines.append("")
    lines.append("CREATE TEMPORARY TABLE _token_prefixes_to_delete (prefix TEXT NOT NULL);")
    lines.append("")

    # Defense in depth: never interpolate a prefix that is not pure hex,
    # even if the caller claims it was validated.
    for _, prefix in valid_entries:
        if not re.fullmatch(r"[0-9a-fA-F]{8}", prefix):
            raise ValueError(f"Invalid token prefix (not 8 hex chars): {prefix!r}")

    # Batch INSERT for efficiency.
    batch_size = 100
    for i in range(0, len(valid_entries), batch_size):
        batch = valid_entries[i : i + batch_size]
        values = ", ".join(f"('{entry[1]}')" for entry in batch)
        lines.append(f"INSERT INTO _token_prefixes_to_delete (prefix) VALUES {values};")

    lines.append("")
    lines.append("-- Vérification du nombre de tokens qui seront supprimés (préfixes)")
    lines.append(
        "SELECT COUNT(*) AS tokens_to_delete_by_prefix FROM key k "
        "WHERE EXISTS (SELECT 1 FROM _token_prefixes_to_delete t WHERE k.token LIKE t.prefix || '%');"
    )
    lines.append("")
    lines.append("-- Suppression des tokens correspondant aux préfixes")
    lines.append(
        "DELETE FROM key k "
        "WHERE EXISTS (SELECT 1 FROM _token_prefixes_to_delete t WHERE k.token LIKE t.prefix || '%');"
    )
    lines.append("")
    lines.append("DROP TABLE _token_prefixes_to_delete;")
    lines.append("")

    # --- Part 2: Delete by key ID (corrupted entries) ---
    if corrupted_entries:
        lines.append(f"-- Partie 2: Suppression par ID de clé ({len(corrupted_entries)} entrées)")
        lines.append("-- Ces entrées avaient un préfixe de token corrompu par Excel (notation scientifique)")
        lines.append("-- On utilise donc le tyr_id (= key.id) pour les supprimer directement.")
        lines.append("")

        for tyr_id, login, raw_prefix in corrupted_entries:
            # Collapse newlines so CSV content cannot break out of the SQL
            # comment and smuggle in executable statements.
            safe_login = login.replace("\r", " ").replace("\n", " ")
            safe_prefix = raw_prefix.replace("\r", " ").replace("\n", " ")
            lines.append(f"-- login={safe_login}, préfixe corrompu: {safe_prefix}")

        lines.append("")
        # int() validates each id, blocking SQL injection via the tyr_id
        # column (raises ValueError on anything that is not an integer).
        key_ids = ", ".join(str(int(entry[0])) for entry in corrupted_entries)
        lines.append(f"DELETE FROM key WHERE id IN ({key_ids});")
        lines.append("")

    # --- Summary ---
    total = len(valid_entries) + len(corrupted_entries)
    lines.append(f"-- Total attendu: ~{total} tokens supprimés")
    lines.append("")
    lines.append("-- Vérifier le résultat avant de valider:")
    lines.append("-- Si tout est correct, exécuter: COMMIT;")
    lines.append("-- Sinon, exécuter: ROLLBACK;")
    lines.append("")
    lines.append("-- Décommenter la ligne suivante pour valider:")
    lines.append("-- COMMIT;")
    lines.append("")
    lines.append("-- Ou annuler avec:")
    lines.append("-- ROLLBACK;")

    return "\n".join(lines)


def execute_sql(sql, db_uri):
"""Execute SQL directly against the database."""
try:
import sqlalchemy
except ImportError:
print("ERROR: sqlalchemy is required for --execute mode. Install with: pip install sqlalchemy")
sys.exit(1)

engine = sqlalchemy.create_engine(db_uri)
with engine.connect() as conn:
# Split and execute statements
for statement in sql.split(";"):
statement = statement.strip()
if statement and not statement.startswith("--"):
Comment on lines +145 to +147
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Naive SQL splitting on ; causes BEGIN, CREATE TABLE, and DELETE statements to be skipped in --execute mode

In --execute mode, execute_sql splits the entire generated SQL on ; and then skips any chunk that starts with --. Because generate_sql places comment lines immediately before SQL statements (with no intervening ;), the split merges comments with the following SQL statement into a single chunk. Since the chunk starts with --, the actual SQL statement is silently skipped.

Detailed explanation of which statements are skipped and why

The generated SQL looks like:

-- =============================================================
-- Script de suppression...
-- =============================================================

BEGIN;

-- Partie 1: Suppression par préfixe...

CREATE TEMPORARY TABLE _token_prefixes_to_delete ...;

When split on ;, the first chunk is:

-- =============================================================\n...\n\nBEGIN

This starts with --, so BEGIN is never executed — all operations run without a transaction.

The second chunk is:

\n\n-- Partie 1: ...\n\nCREATE TEMPORARY TABLE _token_prefixes_to_delete (prefix TEXT NOT NULL)

This also starts with -- (after stripping), so CREATE TEMPORARY TABLE is never executed. The subsequent INSERT and DELETE statements referencing _token_prefixes_to_delete will then fail with a "relation does not exist" error.

Similarly, the DELETE FROM key WHERE id IN (...) for corrupted entries is in a chunk starting with comments and is also skipped.

Impact: In --execute mode, the script either crashes (table not found) or silently skips critical DELETE statements, and runs without transaction safety.

Prompt for agents
In source/tyr/delete_unused_tokens.py, the execute_sql function (lines 134-154) splits SQL on semicolons and then checks if each chunk starts with '--' to skip comments. This is fundamentally broken because comments and SQL statements get merged into the same chunk after splitting.

The fix should replace the naive split-on-semicolon approach with proper statement-by-statement execution. Options:

1. Instead of generating a single SQL string and splitting it, refactor generate_sql to return a list of individual SQL statements (without comments), and have execute_sql iterate over that list directly.

2. Alternatively, use sqlalchemy's text() to execute the entire SQL script at once if the driver supports it, or use a proper SQL parser.

3. At minimum, filter out comment-only lines from each chunk before checking if it starts with '--'. For example, after splitting on ';', strip each chunk, split it into lines, remove lines that start with '--' or are empty, and then rejoin to get the actual SQL statement.
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

print(f"Executing: {statement[:80]}...")
result = conn.execute(sqlalchemy.text(statement))
if result.returns_rows:
for row in result:
print(f" Result: {row}")
conn.commit()
print("Done.")


def main():
    """Command-line entry point: parse the CSV, then write or execute the SQL."""
    arg_parser = argparse.ArgumentParser(description="Delete unused tokens from the TYR database")
    arg_parser.add_argument(
        "--csv",
        required=True,
        help="Path to the CSV file with token prefixes",
    )
    arg_parser.add_argument(
        "--output",
        default="delete_tokens.sql",
        help="Output SQL file path (default: delete_tokens.sql)",
    )
    arg_parser.add_argument(
        "--execute",
        action="store_true",
        help="Execute SQL directly against the database instead of writing to file",
    )
    arg_parser.add_argument(
        "--db-uri",
        help="Database URI (e.g., postgresql://user:pass@host/db). "
        "Can also be set via SQLALCHEMY_DATABASE_URI env var.",
    )
    args = arg_parser.parse_args()

    # Parse the CSV and report what was found.
    print(f"Reading CSV: {args.csv}")
    valid_entries, corrupted_entries = parse_csv(args.csv)
    print(f"Valid token prefixes: {len(valid_entries)}")
    print(f"Corrupted entries (will use key ID): {len(corrupted_entries)}")

    if corrupted_entries:
        print("\nCorrupted entries details:")
        for tyr_id, login, raw_prefix in corrupted_entries:
            print(f" tyr_id={tyr_id}, login={login}, prefix='{raw_prefix}'")

    sql_script = generate_sql(valid_entries, corrupted_entries)

    # Default mode: write the script to a file for human review.
    if not args.execute:
        with open(args.output, "w", encoding="utf-8") as out_file:
            out_file.write(sql_script)
        print(f"\nSQL written to: {args.output}")
        print("Review the file, then execute it against your database.")
        return

    # --execute mode: resolve the database URI and run the script directly.
    import os

    db_uri = args.db_uri or os.environ.get("SQLALCHEMY_DATABASE_URI")
    if not db_uri:
        print("ERROR: --db-uri or SQLALCHEMY_DATABASE_URI env var required for --execute mode")
        sys.exit(1)
    print("\nExecuting against database...")
    execute_sql(sql_script, db_uri)


# Run only when invoked as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Loading