08db218b6d
Request: Add archivist support for remembering which IPO archive stages have already been synced and which stages should be updated next. Changes: - Add sync_runs, ticker_sync_state, sync_tasks, and price_performance tables to the archive schema. - Add scripts/update_sync_state.py to derive per-ticker stage status and rebuild the next-sync task queue. - Export the new sync-state tables as Git-friendly CSV snapshots. - Document the incremental archive flow in the archivist skill and README. Verification: - Ran scripts/bootstrap_historical_data.py. - Ran scripts/update_sync_state.py with a deterministic as-of timestamp. - Checked SQLite integrity and DB-to-snapshot row counts with Python sqlite3. - Parsed Python scripts with ast.parse. - Ran git diff --check and checked for temporary SQLite/cache files. Next useful context: - Current derived queue has 2 open tasks for 06658 and 15 waiting_until_due tasks for future stages.
448 lines
14 KiB
Python
448 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""Update per-ticker archive sync state and pending sync tasks."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import sqlite3
|
|
from dataclasses import dataclass
|
|
from datetime import date, datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# Repo-relative default locations; --db and --schema override the first two.
DEFAULT_DB_PATH = Path("data", "hk_ipo.sqlite")
DEFAULT_SCHEMA_PATH = Path("schema", "hk_ipo.schema.sql")
# Directory that receives one CSV snapshot per exported table.
SNAPSHOT_DIR = Path("data", "snapshots")
|
|
|
|
|
|
@dataclass(frozen=True)
class Ticker:
    """Immutable view of the ``ipo_master`` columns used to schedule sync stages.

    All dates are kept as the raw strings read from the database (or ``None``
    when the column is empty); they are parsed only where a comparison is needed.
    """

    # Stock code identifying the IPO.
    ticker: str
    # Listing date; the due dates of the price stages are derived from it.
    listing_date: str | None
    # Start of the application window; used as the prospectus stage due date.
    application_start_date: str | None
    # Expected allotment announcement date; used as the allotment stage due date.
    allotment_results_expected_date: str | None
|
|
|
|
|
|
@dataclass(frozen=True)
class StageState:
    """Derived sync status of one archive stage for one ticker.

    One instance maps onto one row of ``ticker_sync_state``; instances that are
    not complete are also turned into ``sync_tasks`` rows.
    """

    # Stock code the state belongs to.
    ticker: str
    # Stage name, e.g. "T0_prospectus" or "D20".
    stage: str
    # "complete", "blocked", "pending_due" or "pending_not_due".
    status: str
    # 1 when the stage must be archived; a 0 keeps it out of the task queue.
    required: int
    # ISO date on which the stage becomes due, if known.
    due_date: str | None
    # Date of the archived evidence when the stage is complete.
    completed_at: str | None
    # Identifier of the most recent source backing the stage, if any.
    last_source_id: str | None
    # Identifier of the recorded data gap for the stage, if any.
    data_gap_id: str | None
    # Human-readable explanation of the status.
    notes: str
|
|
|
|
|
|
# Archive stages in chronological order: three pre-listing stages followed by
# the post-listing price checkpoints.
STAGE_ORDER = [
    "T0_prospectus", "T1_allotment", "T2_grey_market",
    "D1", "D5", "D20", "D60",
]
|
|
|
|
# Task type written to sync_tasks for each stage. Every post-listing price
# checkpoint shares the same task type.
TASK_TYPES = {
    "T0_prospectus": "archive_prospectus_and_terms",
    "T1_allotment": "archive_allotment_results",
    "T2_grey_market": "archive_grey_market_result",
    **dict.fromkeys(("D1", "D5", "D20", "D60"), "archive_price_performance"),
}
|
|
|
|
|
|
def parse_as_of(value: str | None) -> datetime:
    """Return the timezone-aware "as of" moment for this run.

    Args:
        value: ISO-8601 timestamp supplied on the command line. A trailing
            ``Z`` is accepted as UTC. ``None`` or an empty string selects the
            current UTC time truncated to whole seconds.

    Returns:
        An aware ``datetime``. An explicit offset in *value* is preserved; a
        timestamp without any offset is interpreted as UTC so that the result
        is never naive, matching the default branch.

    Raises:
        ValueError: if *value* is not a valid ISO-8601 timestamp.
    """
    if not value:
        return datetime.now(timezone.utc).replace(microsecond=0)

    text = value
    # Only the trailing designator means UTC; older fromisoformat() versions
    # reject "Z", so rewrite it as an explicit offset.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        # Without this, a naive input would be stored without its "Z" marker.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
|
|
|
|
|
|
def parse_date(value: str | None) -> date | None:
    """Parse an ISO-format date string; ``None`` or an empty string gives ``None``.

    Raises:
        ValueError: if a non-empty *value* is not a valid ISO date.
    """
    return date.fromisoformat(value) if value else None
|
|
|
|
|
|
def due_status(due_date: str | None, as_of_date: date) -> str:
    """Classify an unfinished stage as due now or not yet due.

    Args:
        due_date: ISO date string on which the stage becomes due, or ``None``
            or an empty string when no due date is known.
        as_of_date: Date the sync state is evaluated for.

    Returns:
        ``"pending_not_due"`` only when a due date is known and lies strictly
        after *as_of_date*; otherwise ``"pending_due"`` (an unknown due date
        counts as due).
    """
    deadline = date.fromisoformat(due_date) if due_date else None
    if deadline is not None and deadline > as_of_date:
        return "pending_not_due"
    return "pending_due"
|
|
|
|
|
|
def query_one(conn: sqlite3.Connection, sql: str, params: tuple[object, ...]) -> sqlite3.Row | None:
    """Execute *sql* with *params* and return the first result row, or ``None`` if there is none."""
    cursor = conn.execute(sql, params)
    first_row = cursor.fetchone()
    return first_row
|
|
|
|
|
|
def load_tickers(conn: sqlite3.Connection) -> list[Ticker]:
    """Read every row of ``ipo_master`` as a ``Ticker``, ordered by ticker.

    The selected column names must match the ``Ticker`` field names because
    each row is expanded into keyword arguments; this needs a row type that
    converts to a mapping (the script sets ``sqlite3.Row``).
    """
    cursor = conn.execute(
        """
        SELECT ticker, listing_date, application_start_date, allotment_results_expected_date
        FROM ipo_master
        ORDER BY ticker
        """
    )
    tickers = []
    for record in cursor:
        tickers.append(Ticker(**dict(record)))
    return tickers
|
|
|
|
|
|
def data_gap_for(conn: sqlite3.Connection, ticker: str, stage: str) -> sqlite3.Row | None:
    """Return the most recently created data gap recorded for a ticker stage.

    Returns:
        A row with ``gap_id``, ``reason`` and ``expected_resolution_date``, or
        ``None`` when no gap exists for the ticker/stage pair. Ties on
        ``created_at`` are broken by the highest ``gap_id``.
    """
    latest_gap_sql = """
        SELECT gap_id, reason, expected_resolution_date
        FROM data_gaps
        WHERE ticker = ? AND stage = ?
        ORDER BY created_at DESC, gap_id DESC
        LIMIT 1
        """
    return conn.execute(latest_gap_sql, (ticker, stage)).fetchone()
|
|
|
|
|
|
def prospectus_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState:
    """Derive the ``T0_prospectus`` stage state for one ticker.

    The stage is complete only when both a ``prospectus`` source reference and
    an ``offering_terms`` row exist. Otherwise it is ``blocked`` when the
    latest data gap has no expected resolution date, and pending (due or not
    yet due relative to the application start date) in every other case.
    """
    symbol = ticker.ticker
    due = ticker.application_start_date

    latest_source = query_one(
        conn,
        """
        SELECT source_id, source_date
        FROM source_refs
        WHERE ticker = ? AND source_type = 'prospectus'
        ORDER BY source_date DESC, source_id DESC
        LIMIT 1
        """,
        (symbol,),
    )
    terms_row = query_one(conn, "SELECT ticker FROM offering_terms WHERE ticker = ?", (symbol,))

    if latest_source and terms_row:
        return StageState(
            ticker=symbol,
            stage="T0_prospectus",
            status="complete",
            required=1,
            due_date=due,
            completed_at=latest_source["source_date"],
            last_source_id=latest_source["source_id"],
            data_gap_id=None,
            notes="Prospectus source and offering terms are archived.",
        )

    gap = data_gap_for(conn, symbol, "T0_prospectus")
    # A gap without an expected resolution date cannot be scheduled.
    if gap and gap["expected_resolution_date"] is None:
        status = "blocked"
    else:
        status = due_status(due, as_of_date)

    return StageState(
        ticker=symbol,
        stage="T0_prospectus",
        status=status,
        required=1,
        due_date=due,
        completed_at=None,
        # A source may exist even though the offering terms are still missing.
        last_source_id=latest_source["source_id"] if latest_source else None,
        data_gap_id=gap["gap_id"] if gap else None,
        notes="Missing prospectus source or offering terms.",
    )
|
|
|
|
|
|
def allotment_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState:
    """Derive the ``T1_allotment`` stage state for one ticker.

    The stage is complete when either an ``ipo_demand`` row or an
    ``allotment_results`` source reference exists; the demand row wins when
    both are present. Otherwise the stage is ``blocked`` when the latest data
    gap has no expected resolution date, or pending relative to the expected
    allotment results date.
    """
    symbol = ticker.ticker
    expected = ticker.allotment_results_expected_date

    demand_row = query_one(
        conn,
        """
        SELECT source_id, stage_date
        FROM ipo_demand
        WHERE ticker = ?
        ORDER BY stage_date DESC, demand_id DESC
        LIMIT 1
        """,
        (symbol,),
    )
    source_row = query_one(
        conn,
        """
        SELECT source_id, source_date
        FROM source_refs
        WHERE ticker = ? AND source_type = 'allotment_results'
        ORDER BY source_date DESC, source_id DESC
        LIMIT 1
        """,
        (symbol,),
    )

    # Pick the evidence row and the column that holds its date.
    if demand_row:
        evidence, date_column = demand_row, "stage_date"
    elif source_row:
        evidence, date_column = source_row, "source_date"
    else:
        evidence, date_column = None, None

    if evidence is not None:
        return StageState(
            ticker=symbol,
            stage="T1_allotment",
            status="complete",
            required=1,
            due_date=expected,
            completed_at=evidence[date_column],
            last_source_id=evidence["source_id"],
            data_gap_id=None,
            notes="Allotment result facts are archived.",
        )

    gap = data_gap_for(conn, symbol, "T1_allotment")
    # A gap without an expected resolution date cannot be scheduled.
    if gap and gap["expected_resolution_date"] is None:
        status = "blocked"
    else:
        status = due_status(expected, as_of_date)

    return StageState(
        ticker=symbol,
        stage="T1_allotment",
        status=status,
        required=1,
        due_date=expected,
        completed_at=None,
        last_source_id=None,
        data_gap_id=gap["gap_id"] if gap else None,
        notes="Allotment result facts are not archived yet.",
    )
|
|
|
|
|
|
def price_state(conn: sqlite3.Connection, ticker: Ticker, stage: str, due_date: str | None, as_of_date: date) -> StageState:
    """Derive the state of a price/performance stage for one ticker.

    Args:
        conn: Open connection to the archive database.
        ticker: Ticker being evaluated.
        stage: One of the price stages (``T2_grey_market``, ``D1``, ``D5``,
            ``D20``, ``D60``).
        due_date: ISO date on which the stage becomes due, or ``None``.
        as_of_date: Date the sync state is evaluated for.

    Returns:
        A ``complete`` state when a ``price_performance`` row exists for the
        stage or, for ``T2_grey_market`` only, when a grey-market source
        reference exists. Otherwise ``blocked`` when the latest data gap has
        no expected resolution date, or a pending state relative to
        *due_date*.
    """
    # Newest-first ordering keeps the chosen row deterministic when several
    # performance rows exist for the same ticker and stage.
    perf = query_one(
        conn,
        """
        SELECT source_id, as_of_date
        FROM price_performance
        WHERE ticker = ? AND stage = ?
        ORDER BY as_of_date DESC, source_id DESC
        LIMIT 1
        """,
        (ticker.ticker, stage),
    )

    # Only the grey-market stage can also be satisfied by a source reference.
    source_types = {
        "T2_grey_market": ("grey_market", "dark_market", "grey_market_performance"),
    }.get(stage, ())
    source = None
    if source_types:
        # Only "?" markers are interpolated; the values stay bound parameters.
        placeholders = ", ".join("?" for _ in source_types)
        source = query_one(
            conn,
            f"""
            SELECT source_id, source_date
            FROM source_refs
            WHERE ticker = ? AND source_type IN ({placeholders})
            ORDER BY source_date DESC, source_id DESC
            LIMIT 1
            """,
            (ticker.ticker, *source_types),
        )

    if perf or source:
        # A performance row takes precedence over a bare source reference.
        row = perf or source
        completed_key = "as_of_date" if perf else "source_date"
        return StageState(
            ticker.ticker,
            stage,
            "complete",
            1,
            due_date,
            row[completed_key],
            row["source_id"],
            None,
            "Performance source is archived.",
        )

    gap = data_gap_for(conn, ticker.ticker, stage)
    # A gap without an expected resolution date cannot be scheduled.
    status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(due_date, as_of_date)
    note = "Price/performance source is not archived yet."
    if stage in {"D5", "D20", "D60"}:
        note += " Due date uses calendar days until trading-calendar support is added."
    return StageState(
        ticker.ticker,
        stage,
        status,
        1,
        due_date,
        None,
        None,
        gap["gap_id"] if gap else None,
        note,
    )
|
|
|
|
|
|
def listing_offset_due(listing_date: str | None, days_after_listing: int) -> str | None:
    """Return the ISO date *days_after_listing* calendar days after the listing date.

    Returns ``None`` when *listing_date* is ``None`` or empty.

    Raises:
        ValueError: if a non-empty *listing_date* is not a valid ISO date.
    """
    if not listing_date:
        return None
    due = date.fromisoformat(listing_date) + timedelta(days=days_after_listing)
    return due.isoformat()
|
|
|
|
|
|
def build_states(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> list[StageState]:
    """Compute the state of every archive stage for one ticker, in stage order.

    The grey-market stage is due on the raw listing date. The D1/D5/D20/D60
    checkpoints are due 0/4/19/59 calendar days after the listing date.
    """
    states = [
        prospectus_state(conn, ticker, as_of_date),
        allotment_state(conn, ticker, as_of_date),
        price_state(conn, ticker, "T2_grey_market", ticker.listing_date, as_of_date),
    ]
    # (stage, calendar-day offset from the listing date)
    checkpoints = (("D1", 0), ("D5", 4), ("D20", 19), ("D60", 59))
    for stage, offset in checkpoints:
        due = listing_offset_due(ticker.listing_date, offset)
        states.append(price_state(conn, ticker, stage, due, as_of_date))
    return states
|
|
|
|
|
|
def upsert_sync_run(conn: sqlite3.Connection, run_id: str, mode: str, as_of: str, status: str, notes: str) -> None:
    """Insert the ``sync_runs`` row for *run_id*, or overwrite it if it already exists.

    *as_of* is stored in ``as_of``, ``started_at`` and ``finished_at`` alike,
    so re-running with the same timestamp produces the same row.
    """
    statement = """
        INSERT INTO sync_runs (sync_run_id, mode, as_of, started_at, finished_at, status, notes)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(sync_run_id) DO UPDATE SET
        mode = excluded.mode,
        as_of = excluded.as_of,
        started_at = excluded.started_at,
        finished_at = excluded.finished_at,
        status = excluded.status,
        notes = excluded.notes
        """
    started_at = finished_at = as_of
    values = (run_id, mode, as_of, started_at, finished_at, status, notes)
    conn.execute(statement, values)
|
|
|
|
|
|
def replace_state(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None:
    """Upsert one ``ticker_sync_state`` row per state, keyed by (ticker, stage).

    Every written row is stamped with *run_id* and *updated_at*. Rows for
    ticker/stage pairs that are not in *states* are left untouched.
    """
    statement = """
        INSERT INTO ticker_sync_state (
        ticker, stage, status, required, due_date, completed_at, last_source_id,
        data_gap_id, last_sync_run_id, updated_at, notes
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(ticker, stage) DO UPDATE SET
        status = excluded.status,
        required = excluded.required,
        due_date = excluded.due_date,
        completed_at = excluded.completed_at,
        last_source_id = excluded.last_source_id,
        data_gap_id = excluded.data_gap_id,
        last_sync_run_id = excluded.last_sync_run_id,
        updated_at = excluded.updated_at,
        notes = excluded.notes
        """
    records = []
    for item in states:
        # Column order must match the INSERT column list above.
        records.append(
            (
                item.ticker,
                item.stage,
                item.status,
                item.required,
                item.due_date,
                item.completed_at,
                item.last_source_id,
                item.data_gap_id,
                run_id,
                updated_at,
                item.notes,
            )
        )
    conn.executemany(statement, records)
|
|
|
|
|
|
def rebuild_tasks(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None:
    """Replace the whole ``sync_tasks`` queue with tasks derived from *states*.

    All existing tasks are deleted first. A task is then created for every
    state that is required, not complete, and has a status that maps to a
    task status; states with any other status are skipped. The task id is
    ``<ticker>_<stage>``.
    """
    # State status -> status of the queued task.
    status_to_task_status = {
        "pending_due": "open",
        "pending_not_due": "waiting_until_due",
        "blocked": "blocked",
    }

    conn.execute("DELETE FROM sync_tasks")

    queue = []
    for item in states:
        outstanding = item.status != "complete" and item.required != 0
        queue_status = status_to_task_status.get(item.status) if outstanding else None
        if queue_status is None:
            continue
        queue.append(
            (
                f"{item.ticker}_{item.stage}",
                item.ticker,
                item.stage,
                TASK_TYPES[item.stage],
                queue_status,
                item.due_date,
                item.data_gap_id,
                run_id,
                updated_at,
                item.notes,
            )
        )

    conn.executemany(
        """
        INSERT INTO sync_tasks (
        task_id, ticker, stage, task_type, task_status, due_date, data_gap_id,
        last_sync_run_id, updated_at, notes
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        queue,
    )
|
|
|
|
|
|
def export_snapshot(conn: sqlite3.Connection, table: str, order_by: str = "1") -> None:
    """Write all rows of *table* to ``SNAPSHOT_DIR/<table>.csv``.

    The file has a header row, UTF-8 encoding and ``\\n`` line endings, and its
    rows follow *order_by*. Both *table* and *order_by* are interpolated
    straight into the SQL text, so they must be trusted identifiers or
    expressions, never external input.
    """
    SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
    target = SNAPSHOT_DIR / f"{table}.csv"

    result = conn.execute(f"SELECT * FROM {table} ORDER BY {order_by}")
    header = [column[0] for column in result.description]

    # newline="" lets the csv writer control line endings itself.
    with target.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle, lineterminator="\n")
        out.writerow(header)
        out.writerows(result.fetchall())
|
|
|
|
|
|
def print_summary(states: list[StageState]) -> None:
    """Print per-status counts followed by the states that need attention.

    Counts are listed in alphabetical status order. States that are
    ``pending_due`` or ``blocked`` are then listed in their original order.
    """
    tally: dict[str, int] = {}
    for item in states:
        tally.setdefault(item.status, 0)
        tally[item.status] += 1

    print("sync state updated")
    for name, total in sorted(tally.items()):
        print(f"{name}: {total}")

    actionable = [item for item in states if item.status in ("pending_due", "blocked")]
    if not actionable:
        return
    print("actionable items:")
    for item in actionable:
        print(f"- {item.ticker} {item.stage}: {item.status} due={item.due_date or ''}")
|
|
|
|
|
|
def main() -> int:
    """Refresh the derived sync state, rebuild the task queue and export CSV snapshots.

    Command-line options are ``--db``, ``--schema``, ``--as-of``, ``--run-id``
    and ``--mode``. The schema script is applied first, then every ticker's
    stage states are recomputed and written, the task queue is rebuilt, and
    the three sync tables are exported as CSV files.

    Returns:
        0 on success; used as the process exit status.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
    parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH), help="Repo-relative schema path.")
    parser.add_argument("--as-of", help="ISO timestamp for deterministic sync-state snapshots.")
    parser.add_argument("--run-id", help="Stable sync run id. Defaults to sync_state_<timestamp>.")
    parser.add_argument("--mode", default="state_refresh", help="Sync run mode label.")
    args = parser.parse_args()

    as_of_dt = parse_as_of(args.as_of)
    as_of = as_of_dt.isoformat().replace("+00:00", "Z")
    as_of_date = as_of_dt.date()
    # Default run id: the timestamp with its ":", "-" and "+" punctuation removed.
    run_id = args.run_id or "sync_state_" + as_of.translate(str.maketrans("", "", ":-+"))

    db_path = Path(args.db)
    schema_path = Path(args.schema)

    # The connection's own context manager only commits or rolls back; it does
    # not close the connection, so close it explicitly.
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        with conn:
            conn.executescript(schema_path.read_text(encoding="utf-8"))
            upsert_sync_run(conn, run_id, args.mode, as_of, "running", "Refreshing derived ticker sync state.")
            states = [state for ticker in load_tickers(conn) for state in build_states(conn, ticker, as_of_date)]
            replace_state(conn, states, run_id, as_of)
            rebuild_tasks(conn, states, run_id, as_of)
            upsert_sync_run(conn, run_id, args.mode, as_of, "complete", "Derived ticker sync state refreshed.")
            export_snapshot(conn, "sync_runs", "sync_run_id")
            # Order stages chronologically within each ticker rather than alphabetically.
            export_snapshot(
                conn,
                "ticker_sync_state",
                """
                ticker,
                CASE stage
                WHEN 'T0_prospectus' THEN 0
                WHEN 'T1_allotment' THEN 1
                WHEN 'T2_grey_market' THEN 2
                WHEN 'D1' THEN 3
                WHEN 'D5' THEN 4
                WHEN 'D20' THEN 5
                WHEN 'D60' THEN 6
                ELSE 99
                END
                """,
            )
            export_snapshot(conn, "sync_tasks", "task_status, due_date, ticker, stage")
    finally:
        conn.close()

    print_summary(states)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|