Private
Public Access
0
0
Files
hk-ipo/scripts/update_sync_state.py
T
geometrybase 08db218b6d Add archivist incremental sync state
Request:
Add archivist support for remembering which IPO archive stages have already been synced and which stages should be updated next.

Changes:
- Add sync_runs, ticker_sync_state, sync_tasks, and price_performance tables to the archive schema.
- Add scripts/update_sync_state.py to derive per-ticker stage status and rebuild the next-sync task queue.
- Export the new sync-state tables as Git-friendly CSV snapshots.
- Document the incremental archive flow in the archivist skill and README.

Verification:
- Ran scripts/bootstrap_historical_data.py.
- Ran scripts/update_sync_state.py with a deterministic as-of timestamp.
- Checked SQLite integrity and DB-to-snapshot row counts with Python sqlite3.
- Parsed Python scripts with ast.parse.
- Ran git diff --check and checked for temporary SQLite/cache files.

Next useful context:
- Current derived queue has 2 open tasks for 06658 and 15 waiting_until_due tasks for future stages.
2026-06-15 06:29:54 +00:00

448 lines
14 KiB
Python

#!/usr/bin/env python3
"""Update per-ticker archive sync state and pending sync tasks."""
from __future__ import annotations
import argparse
import csv
import sqlite3
from dataclasses import dataclass
from datetime import date, datetime, timedelta, timezone
from pathlib import Path
# Default SQLite database and schema locations; overridable with --db / --schema.
DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite")
DEFAULT_SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
# Directory that export_snapshot() writes one "<table>.csv" file per table into.
SNAPSHOT_DIR = Path("data/snapshots")
@dataclass(frozen=True)
class Ticker:
    """Scheduling-related columns of one ``ipo_master`` row (see load_tickers)."""

    # Ticker code; the lookup key for every per-stage query.
    ticker: str
    # Listing date as an ISO date string; basis for the T2/D1..D60 due dates.
    listing_date: str | None
    # Used as the due date of the T0_prospectus stage.
    application_start_date: str | None
    # Used as the due date of the T1_allotment stage.
    allotment_results_expected_date: str | None
@dataclass(frozen=True)
class StageState:
    """Derived sync status of one archive stage for one ticker.

    Instances are written to ``ticker_sync_state`` by replace_state() and
    turned into ``sync_tasks`` rows by rebuild_tasks().
    """

    # Ticker code the state belongs to.
    ticker: str
    # Stage identifier, e.g. "T0_prospectus" or "D20".
    stage: str
    # One of "complete", "pending_due", "pending_not_due" or "blocked".
    status: str
    # 1 in every state built in this file; rebuild_tasks() skips states where it is 0.
    required: int
    # ISO date on which the stage becomes due, or None when unknown.
    due_date: str | None
    # Date taken from the archived evidence row; None while the stage is incomplete.
    completed_at: str | None
    # source_id of the evidence row that was found, if any.
    last_source_id: str | None
    # gap_id of the latest data_gaps row for an incomplete stage, if any.
    data_gap_id: str | None
    # Human-readable explanation of the status.
    notes: str
# Archive stages in chronological order; build_states() returns states in this order.
STAGE_ORDER = [
    "T0_prospectus",
    "T1_allotment",
    "T2_grey_market",
    "D1",
    "D5",
    "D20",
    "D60",
]
# Maps each stage to the task_type value rebuild_tasks() writes into sync_tasks.
# All post-listing price stages (D1..D60) share one task type.
TASK_TYPES = {
    "T0_prospectus": "archive_prospectus_and_terms",
    "T1_allotment": "archive_allotment_results",
    "T2_grey_market": "archive_grey_market_result",
    "D1": "archive_price_performance",
    "D5": "archive_price_performance",
    "D20": "archive_price_performance",
    "D60": "archive_price_performance",
}
def parse_as_of(value: str | None) -> datetime:
    """Return the run's reference timestamp as a timezone-aware datetime.

    Args:
        value: ISO-8601 timestamp (a ``Z`` suffix is accepted). ``None`` or an
            empty string selects the current UTC time truncated to whole
            seconds.

    Returns:
        An aware datetime. A timestamp given without a UTC offset is
        interpreted as UTC, so explicit and default timestamps are stored in
        the same form instead of mixing naive and aware values.

    Raises:
        ValueError: If ``value`` is not a valid ISO-8601 timestamp.
    """
    if not value:
        return datetime.now(timezone.utc).replace(microsecond=0)
    # datetime.fromisoformat() only understands a "Z" suffix from Python 3.11 on.
    parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    if parsed.tzinfo is None:
        # The default path above is UTC; treat offset-less input the same way.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
def parse_date(value: str | None) -> date | None:
    """Convert an ISO date string to a ``date``; ``None`` or ``""`` yields ``None``.

    Raises:
        ValueError: If a non-empty ``value`` is not a valid ISO date.
    """
    return date.fromisoformat(value) if value else None
def due_status(due_date: str | None, as_of_date: date) -> str:
    """Classify an unfinished stage as due or not yet due.

    Args:
        due_date: ISO date string on which the stage becomes due, or ``None``.
        as_of_date: Reference date of the current run.

    Returns:
        ``"pending_not_due"`` only when a due date is known and lies strictly
        after ``as_of_date``; otherwise ``"pending_due"`` (an unknown due date
        counts as due).
    """
    # parse_date() maps both None and "" to None, covering the unknown case.
    deadline = parse_date(due_date)
    if deadline is not None and deadline > as_of_date:
        return "pending_not_due"
    return "pending_due"
def query_one(conn: sqlite3.Connection, sql: str, params: tuple[object, ...]) -> sqlite3.Row | None:
    """Execute ``sql`` with ``params`` and return its first row, or ``None`` if there is none."""
    cursor = conn.execute(sql, params)
    first_row = cursor.fetchone()
    return first_row
def load_tickers(conn: sqlite3.Connection) -> list[Ticker]:
    """Load every ``ipo_master`` row as a ``Ticker``, ordered by ticker code.

    Rows are converted with ``dict(row)``, so the connection must use a
    mapping-style row factory (main() sets ``sqlite3.Row``).
    """
    cursor = conn.execute(
        """
        SELECT ticker, listing_date, application_start_date, allotment_results_expected_date
        FROM ipo_master
        ORDER BY ticker
        """
    )
    tickers: list[Ticker] = []
    for record in cursor.fetchall():
        tickers.append(Ticker(**dict(record)))
    return tickers
def data_gap_for(conn: sqlite3.Connection, ticker: str, stage: str) -> sqlite3.Row | None:
    """Return the most recently created ``data_gaps`` row for a ticker and stage.

    Returns:
        A row with ``gap_id``, ``reason`` and ``expected_resolution_date``, or
        ``None`` when no gap is recorded for that ticker and stage.
    """
    latest_gap_sql = """
        SELECT gap_id, reason, expected_resolution_date
        FROM data_gaps
        WHERE ticker = ? AND stage = ?
        ORDER BY created_at DESC, gap_id DESC
        LIMIT 1
    """
    lookup_key = (ticker, stage)
    return query_one(conn, latest_gap_sql, lookup_key)
def prospectus_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState:
    """Derive the ``T0_prospectus`` state for one ticker.

    The stage is complete when both a ``prospectus`` source reference and an
    ``offering_terms`` row exist. Otherwise it is ``blocked`` when the latest
    data gap has no expected resolution date, and due / not due relative to
    the application start date in all other cases.
    """
    symbol = ticker.ticker
    due = ticker.application_start_date

    prospectus = query_one(
        conn,
        """
        SELECT source_id, source_date
        FROM source_refs
        WHERE ticker = ? AND source_type = 'prospectus'
        ORDER BY source_date DESC, source_id DESC
        LIMIT 1
        """,
        (symbol,),
    )
    terms = query_one(conn, "SELECT ticker FROM offering_terms WHERE ticker = ?", (symbol,))

    if prospectus and terms:
        return StageState(
            ticker=symbol,
            stage="T0_prospectus",
            status="complete",
            required=1,
            due_date=due,
            completed_at=prospectus["source_date"],
            last_source_id=prospectus["source_id"],
            data_gap_id=None,
            notes="Prospectus source and offering terms are archived.",
        )

    gap = data_gap_for(conn, symbol, "T0_prospectus")
    if gap and gap["expected_resolution_date"] is None:
        # A gap with no expected resolution cannot be retried on a schedule.
        status = "blocked"
    else:
        status = due_status(due, as_of_date)

    return StageState(
        ticker=symbol,
        stage="T0_prospectus",
        status=status,
        required=1,
        due_date=due,
        completed_at=None,
        # The prospectus may be present while the offering terms are missing.
        last_source_id=prospectus["source_id"] if prospectus else None,
        data_gap_id=gap["gap_id"] if gap else None,
        notes="Missing prospectus source or offering terms.",
    )
def allotment_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState:
    """Derive the ``T1_allotment`` state for one ticker.

    The stage is complete when either an ``ipo_demand`` row or an
    ``allotment_results`` source reference exists; the demand row takes
    precedence as evidence. Otherwise the status is ``blocked`` when the
    latest data gap has no expected resolution date, and due / not due
    relative to the expected allotment-results date in all other cases.
    """
    symbol = ticker.ticker
    due = ticker.allotment_results_expected_date

    demand = query_one(
        conn,
        """
        SELECT source_id, stage_date
        FROM ipo_demand
        WHERE ticker = ?
        ORDER BY stage_date DESC, demand_id DESC
        LIMIT 1
        """,
        (symbol,),
    )
    source = query_one(
        conn,
        """
        SELECT source_id, source_date
        FROM source_refs
        WHERE ticker = ? AND source_type = 'allotment_results'
        ORDER BY source_date DESC, source_id DESC
        LIMIT 1
        """,
        (symbol,),
    )

    if demand or source:
        if demand:
            archived_on = demand["stage_date"]
            evidence_id = demand["source_id"]
        else:
            archived_on = source["source_date"]
            evidence_id = source["source_id"]
        return StageState(
            ticker=symbol,
            stage="T1_allotment",
            status="complete",
            required=1,
            due_date=due,
            completed_at=archived_on,
            last_source_id=evidence_id,
            data_gap_id=None,
            notes="Allotment result facts are archived.",
        )

    gap = data_gap_for(conn, symbol, "T1_allotment")
    if gap and gap["expected_resolution_date"] is None:
        status = "blocked"
    else:
        status = due_status(due, as_of_date)

    return StageState(
        ticker=symbol,
        stage="T1_allotment",
        status=status,
        required=1,
        due_date=due,
        completed_at=None,
        last_source_id=None,
        data_gap_id=gap["gap_id"] if gap else None,
        notes="Allotment result facts are not archived yet.",
    )
def price_state(conn: sqlite3.Connection, ticker: Ticker, stage: str, due_date: str | None, as_of_date: date) -> StageState:
    """Derive the state of a price/performance stage (``T2_grey_market``, ``D1``..``D60``).

    Args:
        conn: Open database connection.
        ticker: Ticker whose stage is evaluated.
        stage: Stage identifier to evaluate.
        due_date: ISO date on which the stage becomes due, or ``None``.
        as_of_date: Reference date of the current run.

    Returns:
        A ``complete`` state when a ``price_performance`` row exists for the
        ticker and stage or, for ``T2_grey_market`` only, when a grey/dark
        market source reference exists. Otherwise a ``blocked`` state when the
        latest data gap has no expected resolution date, or a due / not-due
        state derived from ``due_date``.
    """
    perf = query_one(
        conn,
        # ORDER BY makes the chosen row deterministic when several rows exist
        # for the same ticker and stage; without it SQLite may return any of them.
        """
        SELECT source_id, as_of_date
        FROM price_performance
        WHERE ticker = ? AND stage = ?
        ORDER BY as_of_date DESC, source_id DESC
        LIMIT 1
        """,
        (ticker.ticker, stage),
    )
    # Only the grey-market stage can also be satisfied by a bare source reference.
    source_types = {
        "T2_grey_market": ("grey_market", "dark_market", "grey_market_performance"),
    }.get(stage, ())
    source = None
    if source_types:
        placeholders = ", ".join("?" for _ in source_types)
        source = query_one(
            conn,
            f"""
            SELECT source_id, source_date
            FROM source_refs
            WHERE ticker = ? AND source_type IN ({placeholders})
            ORDER BY source_date DESC, source_id DESC
            LIMIT 1
            """,
            (ticker.ticker, *source_types),
        )
    if perf or source:
        # A performance row takes precedence over a source reference.
        row = perf or source
        completed_key = "as_of_date" if perf else "source_date"
        return StageState(
            ticker.ticker,
            stage,
            "complete",
            1,
            due_date,
            row[completed_key],
            row["source_id"],
            None,
            "Performance source is archived.",
        )
    gap = data_gap_for(conn, ticker.ticker, stage)
    status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(due_date, as_of_date)
    note = "Price/performance source is not archived yet."
    if stage in {"D5", "D20", "D60"}:
        note += " Due date uses calendar days until trading-calendar support is added."
    return StageState(
        ticker.ticker,
        stage,
        status,
        1,
        due_date,
        None,
        None,
        gap["gap_id"] if gap else None,
        note,
    )
def listing_offset_due(listing_date: str | None, days_after_listing: int) -> str | None:
    """Return the ISO date ``days_after_listing`` calendar days after listing.

    Returns ``None`` when the listing date is unknown (``None`` or empty).
    """
    listed_on = parse_date(listing_date)
    if listed_on is not None:
        due_on = listed_on + timedelta(days=days_after_listing)
        return due_on.isoformat()
    return None
def build_states(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> list[StageState]:
    """Build the state of every archive stage for one ticker, in stage order.

    The grey-market stage is due on the listing date itself. ``D<n>`` stages
    are due ``n - 1`` calendar days after listing, so ``D1`` is the listing day.
    """
    listed = ticker.listing_date
    states = [
        prospectus_state(conn, ticker, as_of_date),
        allotment_state(conn, ticker, as_of_date),
        price_state(conn, ticker, "T2_grey_market", listed, as_of_date),
    ]
    day_offsets = (("D1", 0), ("D5", 4), ("D20", 19), ("D60", 59))
    for stage, offset in day_offsets:
        due = listing_offset_due(listed, offset)
        states.append(price_state(conn, ticker, stage, due, as_of_date))
    return states
def upsert_sync_run(conn: sqlite3.Connection, run_id: str, mode: str, as_of: str, status: str, notes: str) -> None:
    """Insert the ``sync_runs`` row for ``run_id``, or overwrite it if it exists.

    ``as_of`` is stored in the ``as_of``, ``started_at`` and ``finished_at``
    columns alike, so the row does not depend on the wall clock.
    """
    upsert_sql = """
        INSERT INTO sync_runs (sync_run_id, mode, as_of, started_at, finished_at, status, notes)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(sync_run_id) DO UPDATE SET
            mode = excluded.mode,
            as_of = excluded.as_of,
            started_at = excluded.started_at,
            finished_at = excluded.finished_at,
            status = excluded.status,
            notes = excluded.notes
    """
    started_at = finished_at = as_of
    values = (run_id, mode, as_of, started_at, finished_at, status, notes)
    conn.execute(upsert_sql, values)
def replace_state(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None:
    """Upsert every state into ``ticker_sync_state``, keyed by (ticker, stage).

    Each written row is stamped with ``run_id`` and ``updated_at``. Existing
    rows for (ticker, stage) pairs not present in ``states`` are left as is.
    """
    upsert_sql = """
        INSERT INTO ticker_sync_state (
            ticker, stage, status, required, due_date, completed_at, last_source_id,
            data_gap_id, last_sync_run_id, updated_at, notes
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(ticker, stage) DO UPDATE SET
            status = excluded.status,
            required = excluded.required,
            due_date = excluded.due_date,
            completed_at = excluded.completed_at,
            last_source_id = excluded.last_source_id,
            data_gap_id = excluded.data_gap_id,
            last_sync_run_id = excluded.last_sync_run_id,
            updated_at = excluded.updated_at,
            notes = excluded.notes
    """
    parameter_rows = []
    for item in states:
        parameter_rows.append(
            (
                item.ticker,
                item.stage,
                item.status,
                item.required,
                item.due_date,
                item.completed_at,
                item.last_source_id,
                item.data_gap_id,
                run_id,
                updated_at,
                item.notes,
            )
        )
    conn.executemany(upsert_sql, parameter_rows)
def rebuild_tasks(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None:
    """Replace the whole ``sync_tasks`` table with tasks for unfinished stages.

    Completed states, states with ``required == 0`` and states with an
    unrecognised status produce no task. Task ids are ``<ticker>_<stage>``.
    """
    # State status -> task status; "complete" is deliberately absent.
    task_status_for = {
        "pending_due": "open",
        "pending_not_due": "waiting_until_due",
        "blocked": "blocked",
    }
    # The queue is fully derived, so it is rebuilt from scratch on every run.
    conn.execute("DELETE FROM sync_tasks")
    task_rows = [
        (
            f"{state.ticker}_{state.stage}",
            state.ticker,
            state.stage,
            TASK_TYPES[state.stage],
            task_status_for[state.status],
            state.due_date,
            state.data_gap_id,
            run_id,
            updated_at,
            state.notes,
        )
        for state in states
        if state.required != 0 and state.status in task_status_for
    ]
    insert_sql = """
        INSERT INTO sync_tasks (
            task_id, ticker, stage, task_type, task_status, due_date, data_gap_id,
            last_sync_run_id, updated_at, notes
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    conn.executemany(insert_sql, task_rows)
def export_snapshot(conn: sqlite3.Connection, table: str, order_by: str = "1") -> None:
    """Write ``table`` to ``SNAPSHOT_DIR/<table>.csv`` with a header row.

    Args:
        conn: Open database connection.
        table: Table name; interpolated into the SQL text, so it must come
            from trusted code, never from user input.
        order_by: SQL ``ORDER BY`` expression (also interpolated); the default
            sorts by the first column.
    """
    SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
    result = conn.execute(f"SELECT * FROM {table} ORDER BY {order_by}")
    header = [column[0] for column in result.description]
    target = SNAPSHOT_DIR / f"{table}.csv"
    # Fixed "\n" line endings keep the snapshot identical across platforms.
    with target.open("w", newline="", encoding="utf-8") as out:
        sheet = csv.writer(out, lineterminator="\n")
        sheet.writerows([header, *result.fetchall()])
def print_summary(states: list[StageState]) -> None:
    """Print per-status counts followed by the due and blocked stages.

    Counts are listed in alphabetical status order; actionable items keep the
    order of ``states``.
    """
    tally: dict[str, int] = {}
    for state in states:
        tally.setdefault(state.status, 0)
        tally[state.status] += 1

    lines = ["sync state updated"]
    lines.extend(f"{name}: {tally[name]}" for name in sorted(tally))

    actionable = [item for item in states if item.status in ("pending_due", "blocked")]
    if actionable:
        lines.append("actionable items:")
        lines.extend(
            f"- {item.ticker} {item.stage}: {item.status} due={item.due_date or ''}"
            for item in actionable
        )
    print("\n".join(lines))
def main() -> int:
    """Refresh the derived sync state, rebuild the task queue and export snapshots.

    Applies the schema script, records the run in ``sync_runs``, recomputes
    ``ticker_sync_state`` and ``sync_tasks`` for every ticker, writes the CSV
    snapshots and prints a summary. All database changes made after the
    schema script are committed together, or rolled back if any step raises.

    Returns:
        Process exit code; always 0, since failures propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
    parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH), help="Repo-relative schema path.")
    parser.add_argument("--as-of", help="ISO timestamp for deterministic sync-state snapshots.")
    parser.add_argument("--run-id", help="Stable sync run id. Defaults to sync_state_<timestamp>.")
    parser.add_argument("--mode", default="state_refresh", help="Sync run mode label.")
    args = parser.parse_args()

    as_of_dt = parse_as_of(args.as_of)
    as_of = as_of_dt.isoformat().replace("+00:00", "Z")
    as_of_date = as_of_dt.date()
    # Strip separators to get an id-friendly token, e.g. sync_state_20260615T062954Z.
    run_id = args.run_id or "sync_state_" + as_of.replace(":", "").replace("-", "").replace("+", "")
    db_path = Path(args.db)
    schema_path = Path(args.schema)

    # Rank stages by their position in STAGE_ORDER so the snapshot order and the
    # canonical stage order cannot drift apart; unknown stages sort last.
    stage_rank = " ".join(f"WHEN '{stage}' THEN {rank}" for rank, stage in enumerate(STAGE_ORDER))

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        # "with conn" only commits or rolls back; closing is handled by the finally below.
        with conn:
            conn.executescript(schema_path.read_text(encoding="utf-8"))
            upsert_sync_run(conn, run_id, args.mode, as_of, "running", "Refreshing derived ticker sync state.")
            states = [state for ticker in load_tickers(conn) for state in build_states(conn, ticker, as_of_date)]
            replace_state(conn, states, run_id, as_of)
            rebuild_tasks(conn, states, run_id, as_of)
            upsert_sync_run(conn, run_id, args.mode, as_of, "complete", "Derived ticker sync state refreshed.")
            export_snapshot(conn, "sync_runs", "sync_run_id")
            export_snapshot(conn, "ticker_sync_state", f"ticker, CASE stage {stage_rank} ELSE 99 END")
            export_snapshot(conn, "sync_tasks", "task_status, due_date, ticker, stage")
            print_summary(states)
    finally:
        conn.close()
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)