diff --git a/.codex/skills/archivist/SKILL.md b/.codex/skills/archivist/SKILL.md index 33bdf37..d12d32a 100644 --- a/.codex/skills/archivist/SKILL.md +++ b/.codex/skills/archivist/SKILL.md @@ -42,6 +42,7 @@ references/ - Record source references, URLs, as-of timestamps, relative paths, and hashes. - Update embedded SQLite tables for IPO facts. - Export Git-friendly CSV snapshots after database updates. +- Maintain `sync_runs`, `ticker_sync_state`, and `sync_tasks` so repeated syncs know what is already archived and what remains pending. - Preserve raw source files; do not overwrite without first checking whether the contents changed. - Label missing, stale, inconsistent, or estimated fields explicitly. @@ -66,9 +67,40 @@ If a user asks for both data update and analysis, complete the archive/update st 4. Compute hashes for archived files. 5. Insert or update structured facts in `data/hk_ipo.sqlite`. 6. Record every source in the source reference table using repo-relative paths. -7. Export key tables to `data/snapshots/` for readable Git diffs. -8. Verify path rules, required fields, and snapshot generation. -9. Commit only the related archive/database/snapshot changes. +7. Refresh sync state with `scripts/update_sync_state.py` after fact updates. +8. Export key tables to `data/snapshots/` for readable Git diffs. +9. Verify path rules, required fields, hash checks, sync state, and snapshot generation. +10. Commit only the related archive/database/snapshot changes. + +## Incremental Sync State + +Use `ticker_sync_state` as the per-ticker stage ledger and `sync_tasks` as the next-sync queue. + +Stages: + +- `T0_prospectus` +- `T1_allotment` +- `T2_grey_market` +- `D1` +- `D5` +- `D20` +- `D60` + +Status values: + +- `complete`: required facts or source files are archived. +- `pending_not_due`: the stage is expected in the future. +- `pending_due`: the stage is due and should be updated on the next sync. +- `blocked`: the missing data has no known resolution date or needs manual intervention. +- `not_applicable`: the stage does not apply. + +Default incremental flow: + +```bash +python3 scripts/update_sync_state.py +``` + +Then update only rows in `sync_tasks` whose `task_status` is `open` or `blocked`. Do not re-download existing source files unless the upstream source changed or the stored hash no longer matches. ## Quality Checks @@ -79,4 +111,5 @@ Before finishing, confirm: - Raw files referenced by the database exist. - Source hashes match current file contents. - CSV snapshots reflect the database update. +- `sync_tasks` reflects only missing or future work, not completed stages. - Any unavailable field is marked as a data gap rather than invented. diff --git a/README.md b/README.md index ace2c1b..9c9fbdd 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,18 @@ python3 -m venv .venv The extractor reads PDF paths from `data/hk_ipo.sqlite`, writes derived text files under `data/extracted_text/`, and exports `data/snapshots/extracted_text_manifest.csv` with page counts, text hashes, and extraction status. +## Incremental Archive Sync + +The archivist keeps a per-ticker sync ledger so repeated updates can focus on missing stages: + +```bash +python3 scripts/update_sync_state.py +``` + +This writes `ticker_sync_state` and `sync_tasks` into `data/hk_ipo.sqlite`, then exports `data/snapshots/ticker_sync_state.csv`, `data/snapshots/sync_tasks.csv`, and `data/snapshots/sync_runs.csv`. + +Use `sync_tasks` as the next-sync queue. Tasks marked `open` are due now; tasks marked `waiting_until_due` are known future updates. + ## Git Discipline The repository uses automatic focused commits for completed project changes. diff --git a/data/hk_ipo.sqlite b/data/hk_ipo.sqlite index 248c3d6..9113f28 100644 Binary files a/data/hk_ipo.sqlite and b/data/hk_ipo.sqlite differ diff --git a/data/snapshots/price_performance.csv b/data/snapshots/price_performance.csv new file mode 100644 index 0000000..960b553 --- /dev/null +++ b/data/snapshots/price_performance.csv @@ -0,0 +1 @@ +performance_id,ticker,stage,source_id,as_of_date,open_price_hkd,high_price_hkd,low_price_hkd,close_price_hkd,return_pct,turnover_hkd_m,data_as_of,notes diff --git a/data/snapshots/sync_runs.csv b/data/snapshots/sync_runs.csv new file mode 100644 index 0000000..e3278e9 --- /dev/null +++ b/data/snapshots/sync_runs.csv @@ -0,0 +1,2 @@ +sync_run_id,mode,as_of,started_at,finished_at,status,notes +sync_state_seed_2026_06_15,bootstrap_state_refresh,2026-06-15T06:30:00Z,2026-06-15T06:30:00Z,2026-06-15T06:30:00Z,complete,Derived ticker sync state refreshed. diff --git a/data/snapshots/sync_tasks.csv b/data/snapshots/sync_tasks.csv new file mode 100644 index 0000000..afd6590 --- /dev/null +++ b/data/snapshots/sync_tasks.csv @@ -0,0 +1,18 @@ +task_id,ticker,stage,task_type,task_status,due_date,data_gap_id,last_sync_run_id,updated_at,notes +06658_D1,06658,D1,archive_price_performance,open,2026-06-15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658_T2_grey_market,06658,T2_grey_market,archive_grey_market_result,open,2026-06-15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675_T1_allotment,06675,T1_allotment,archive_allotment_results,waiting_until_due,2026-06-16,06675_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06675_D1,06675,D1,archive_price_performance,waiting_until_due,2026-06-17,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675_T2_grey_market,06675,T2_grey_market,archive_grey_market_result,waiting_until_due,2026-06-17,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658_D5,06658,D5,archive_price_performance,waiting_until_due,2026-06-19,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675_D5,06675,D5,archive_price_performance,waiting_until_due,2026-06-21,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106_T1_allotment,06106,T1_allotment,archive_allotment_results,waiting_until_due,2026-06-23,06106_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06106_D1,06106,D1,archive_price_performance,waiting_until_due,2026-06-24,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106_T2_grey_market,06106,T2_grey_market,archive_grey_market_result,waiting_until_due,2026-06-24,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106_D5,06106,D5,archive_price_performance,waiting_until_due,2026-06-28,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658_D20,06658,D20,archive_price_performance,waiting_until_due,2026-07-04,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675_D20,06675,D20,archive_price_performance,waiting_until_due,2026-07-06,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106_D20,06106,D20,archive_price_performance,waiting_until_due,2026-07-13,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658_D60,06658,D60,archive_price_performance,waiting_until_due,2026-08-13,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675_D60,06675,D60,archive_price_performance,waiting_until_due,2026-08-15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106_D60,06106,D60,archive_price_performance,waiting_until_due,2026-08-22,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. diff --git a/data/snapshots/ticker_sync_state.csv b/data/snapshots/ticker_sync_state.csv new file mode 100644 index 0000000..a72b4c9 --- /dev/null +++ b/data/snapshots/ticker_sync_state.csv @@ -0,0 +1,22 @@ +ticker,stage,status,required,due_date,completed_at,last_source_id,data_gap_id,last_sync_run_id,updated_at,notes +06106,T0_prospectus,complete,1,2026-06-15,2026-06-15,06106_prospectus_candidate_2026_06_15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Prospectus source and offering terms are archived. +06106,T1_allotment,pending_not_due,1,2026-06-23,,,06106_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06106,T2_grey_market,pending_not_due,1,2026-06-24,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106,D1,pending_not_due,1,2026-06-24,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106,D5,pending_not_due,1,2026-06-28,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106,D20,pending_not_due,1,2026-07-13,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106,D60,pending_not_due,1,2026-08-22,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658,T0_prospectus,complete,1,2026-06-05,2026-06-05,06658_prospectus_2026_06_05,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Prospectus source and offering terms are archived. +06658,T1_allotment,complete,1,2026-06-12,2026-06-12,06658_allotment_results_2026_06_12,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are archived. +06658,T2_grey_market,pending_due,1,2026-06-15,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658,D1,pending_due,1,2026-06-15,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658,D5,pending_not_due,1,2026-06-19,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658,D20,pending_not_due,1,2026-07-04,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658,D60,pending_not_due,1,2026-08-13,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675,T0_prospectus,complete,1,2026-06-09,2026-06-09,06675_prospectus_2026_06_09,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Prospectus source and offering terms are archived. +06675,T1_allotment,pending_not_due,1,2026-06-16,,,06675_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06675,T2_grey_market,pending_not_due,1,2026-06-17,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675,D1,pending_not_due,1,2026-06-17,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675,D5,pending_not_due,1,2026-06-21,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675,D20,pending_not_due,1,2026-07-06,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675,D60,pending_not_due,1,2026-08-15,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. diff --git a/schema/hk_ipo.schema.sql b/schema/hk_ipo.schema.sql index 6d040e6..cc7f8c8 100644 --- a/schema/hk_ipo.schema.sql +++ b/schema/hk_ipo.schema.sql @@ -53,6 +53,23 @@ CREATE TABLE IF NOT EXISTS ipo_demand ( notes TEXT ); +CREATE TABLE IF NOT EXISTS price_performance ( + performance_id TEXT PRIMARY KEY, + ticker TEXT NOT NULL REFERENCES ipo_master(ticker), + stage TEXT NOT NULL, + source_id TEXT, + as_of_date TEXT NOT NULL, + open_price_hkd REAL, + high_price_hkd REAL, + low_price_hkd REAL, + close_price_hkd REAL, + return_pct REAL, + turnover_hkd_m REAL, + data_as_of TEXT NOT NULL, + notes TEXT, + UNIQUE (ticker, stage) +); + CREATE TABLE IF NOT EXISTS source_refs ( source_id TEXT PRIMARY KEY, ticker TEXT NOT NULL REFERENCES ipo_master(ticker), @@ -81,3 +98,45 @@ CREATE TABLE IF NOT EXISTS data_gaps ( created_at TEXT NOT NULL, notes TEXT ); + +CREATE TABLE IF NOT EXISTS sync_runs ( + sync_run_id TEXT PRIMARY KEY, + mode TEXT NOT NULL, + as_of TEXT NOT NULL, + started_at TEXT NOT NULL, + finished_at TEXT, + status TEXT NOT NULL, + notes TEXT, + CHECK (status IN ('running', 'complete', 'failed')) +); + +CREATE TABLE IF NOT EXISTS ticker_sync_state ( + ticker TEXT NOT NULL REFERENCES ipo_master(ticker), + stage TEXT NOT NULL, + status TEXT NOT NULL, + required INTEGER NOT NULL DEFAULT 1, + due_date TEXT, + completed_at TEXT, + last_source_id TEXT, + data_gap_id TEXT, + last_sync_run_id TEXT REFERENCES sync_runs(sync_run_id), + updated_at TEXT NOT NULL, + notes TEXT, + PRIMARY KEY (ticker, stage), + CHECK (status IN ('complete', 'pending_not_due', 'pending_due', 'blocked', 'not_applicable')), + CHECK (required IN (0, 1)) +); + +CREATE TABLE IF NOT EXISTS sync_tasks ( + task_id TEXT PRIMARY KEY, + ticker TEXT NOT NULL REFERENCES ipo_master(ticker), + stage TEXT NOT NULL, + task_type TEXT NOT NULL, + task_status TEXT NOT NULL, + due_date TEXT, + data_gap_id TEXT, + last_sync_run_id TEXT REFERENCES sync_runs(sync_run_id), + updated_at TEXT NOT NULL, + notes TEXT, + CHECK (task_status IN ('open', 'waiting_until_due', 'blocked')) +); diff --git a/scripts/bootstrap_historical_data.py b/scripts/bootstrap_historical_data.py index b49c8e7..a582fd3 100644 --- a/scripts/bootstrap_historical_data.py +++ b/scripts/bootstrap_historical_data.py @@ -303,8 +303,12 @@ def main() -> None: "ipo_master", "offering_terms", "ipo_demand", + "price_performance", "source_refs", "data_gaps", + "sync_runs", + "ticker_sync_state", + "sync_tasks", ]: export_snapshot(conn, table) diff --git a/scripts/update_sync_state.py b/scripts/update_sync_state.py new file mode 100644 index 0000000..0ae36e6 --- /dev/null +++ b/scripts/update_sync_state.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 +"""Update per-ticker archive sync state and pending sync tasks.""" + +from __future__ import annotations + +import argparse +import csv +import sqlite3 +from dataclasses import dataclass +from datetime import date, datetime, timedelta, timezone +from pathlib import Path + + +DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite") +DEFAULT_SCHEMA_PATH = Path("schema/hk_ipo.schema.sql") +SNAPSHOT_DIR = Path("data/snapshots") + + +@dataclass(frozen=True) +class Ticker: + ticker: str + listing_date: str | None + application_start_date: str | None + allotment_results_expected_date: str | None + + +@dataclass(frozen=True) +class StageState: + ticker: str + stage: str + status: str + required: int + due_date: str | None + completed_at: str | None + last_source_id: str | None + data_gap_id: str | None + notes: str + + +STAGE_ORDER = [ + "T0_prospectus", + "T1_allotment", + "T2_grey_market", + "D1", + "D5", + "D20", + "D60", +] + +TASK_TYPES = { + "T0_prospectus": "archive_prospectus_and_terms", + "T1_allotment": "archive_allotment_results", + "T2_grey_market": "archive_grey_market_result", + "D1": "archive_price_performance", + "D5": "archive_price_performance", + "D20": "archive_price_performance", + "D60": "archive_price_performance", +} + + +def parse_as_of(value: str | None) -> datetime: + if value: + normalized = value.replace("Z", "+00:00") + return datetime.fromisoformat(normalized) + return datetime.now(timezone.utc).replace(microsecond=0) + + +def parse_date(value: str | None) -> date | None: + if not value: + return None + return date.fromisoformat(value) + + +def due_status(due_date: str | None, as_of_date: date) -> str: + if due_date is None: + return "pending_due" + due = parse_date(due_date) + if due is None or due <= as_of_date: + return "pending_due" + return "pending_not_due" + + +def query_one(conn: sqlite3.Connection, sql: str, params: tuple[object, ...]) -> sqlite3.Row | None: + return conn.execute(sql, params).fetchone() + + +def load_tickers(conn: sqlite3.Connection) -> list[Ticker]: + rows = conn.execute( + """ + SELECT ticker, listing_date, application_start_date, allotment_results_expected_date + FROM ipo_master + ORDER BY ticker + """ + ).fetchall() + return [Ticker(**dict(row)) for row in rows] + + +def data_gap_for(conn: sqlite3.Connection, ticker: str, stage: str) -> sqlite3.Row | None: + return query_one( + conn, + """ + SELECT gap_id, reason, expected_resolution_date + FROM data_gaps + WHERE ticker = ? AND stage = ? + ORDER BY created_at DESC, gap_id DESC + LIMIT 1 + """, + (ticker, stage), + ) + + +def prospectus_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState: + source = query_one( + conn, + """ + SELECT source_id, source_date + FROM source_refs + WHERE ticker = ? AND source_type = 'prospectus' + ORDER BY source_date DESC, source_id DESC + LIMIT 1 + """, + (ticker.ticker,), + ) + terms = query_one(conn, "SELECT ticker FROM offering_terms WHERE ticker = ?", (ticker.ticker,)) + if source and terms: + return StageState( + ticker.ticker, + "T0_prospectus", + "complete", + 1, + ticker.application_start_date, + source["source_date"], + source["source_id"], + None, + "Prospectus source and offering terms are archived.", + ) + gap = data_gap_for(conn, ticker.ticker, "T0_prospectus") + status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(ticker.application_start_date, as_of_date) + return StageState( + ticker.ticker, + "T0_prospectus", + status, + 1, + ticker.application_start_date, + None, + source["source_id"] if source else None, + gap["gap_id"] if gap else None, + "Missing prospectus source or offering terms.", + ) + + +def allotment_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState: + demand = query_one( + conn, + """ + SELECT source_id, stage_date + FROM ipo_demand + WHERE ticker = ? + ORDER BY stage_date DESC, demand_id DESC + LIMIT 1 + """, + (ticker.ticker,), + ) + source = query_one( + conn, + """ + SELECT source_id, source_date + FROM source_refs + WHERE ticker = ? AND source_type = 'allotment_results' + ORDER BY source_date DESC, source_id DESC + LIMIT 1 + """, + (ticker.ticker,), + ) + if demand or source: + return StageState( + ticker.ticker, + "T1_allotment", + "complete", + 1, + ticker.allotment_results_expected_date, + (demand or source)["stage_date" if demand else "source_date"], + (demand or source)["source_id"], + None, + "Allotment result facts are archived.", + ) + gap = data_gap_for(conn, ticker.ticker, "T1_allotment") + status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(ticker.allotment_results_expected_date, as_of_date) + return StageState( + ticker.ticker, + "T1_allotment", + status, + 1, + ticker.allotment_results_expected_date, + None, + None, + gap["gap_id"] if gap else None, + "Allotment result facts are not archived yet.", + ) + + +def price_state(conn: sqlite3.Connection, ticker: Ticker, stage: str, due_date: str | None, as_of_date: date) -> StageState: + perf = query_one( + conn, + """ + SELECT source_id, as_of_date + FROM price_performance + WHERE ticker = ? AND stage = ? + LIMIT 1 + """, + (ticker.ticker, stage), + ) + source_types = { + "T2_grey_market": ("grey_market", "dark_market", "grey_market_performance"), + }.get(stage, ()) + source = None + if source_types: + placeholders = ", ".join("?" for _ in source_types) + source = query_one( + conn, + f""" + SELECT source_id, source_date + FROM source_refs + WHERE ticker = ? AND source_type IN ({placeholders}) + ORDER BY source_date DESC, source_id DESC + LIMIT 1 + """, + (ticker.ticker, *source_types), + ) + if perf or source: + row = perf or source + completed_key = "as_of_date" if perf else "source_date" + return StageState( + ticker.ticker, + stage, + "complete", + 1, + due_date, + row[completed_key], + row["source_id"], + None, + "Performance source is archived.", + ) + gap = data_gap_for(conn, ticker.ticker, stage) + status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(due_date, as_of_date) + note = "Price/performance source is not archived yet." + if stage in {"D5", "D20", "D60"}: + note += " Due date uses calendar days until trading-calendar support is added." + return StageState( + ticker.ticker, + stage, + status, + 1, + due_date, + None, + None, + gap["gap_id"] if gap else None, + note, + ) + + +def listing_offset_due(listing_date: str | None, days_after_listing: int) -> str | None: + listed = parse_date(listing_date) + if listed is None: + return None + return (listed + timedelta(days=days_after_listing)).isoformat() + + +def build_states(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> list[StageState]: + return [ + prospectus_state(conn, ticker, as_of_date), + allotment_state(conn, ticker, as_of_date), + price_state(conn, ticker, "T2_grey_market", ticker.listing_date, as_of_date), + price_state(conn, ticker, "D1", listing_offset_due(ticker.listing_date, 0), as_of_date), + price_state(conn, ticker, "D5", listing_offset_due(ticker.listing_date, 4), as_of_date), + price_state(conn, ticker, "D20", listing_offset_due(ticker.listing_date, 19), as_of_date), + price_state(conn, ticker, "D60", listing_offset_due(ticker.listing_date, 59), as_of_date), + ] + + +def upsert_sync_run(conn: sqlite3.Connection, run_id: str, mode: str, as_of: str, status: str, notes: str) -> None: + conn.execute( + """ + INSERT INTO sync_runs (sync_run_id, mode, as_of, started_at, finished_at, status, notes) + VALUES (?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(sync_run_id) DO UPDATE SET + mode = excluded.mode, + as_of = excluded.as_of, + started_at = excluded.started_at, + finished_at = excluded.finished_at, + status = excluded.status, + notes = excluded.notes + """, + (run_id, mode, as_of, as_of, as_of, status, notes), + ) + + +def replace_state(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None: + conn.executemany( + """ + INSERT INTO ticker_sync_state ( + ticker, stage, status, required, due_date, completed_at, last_source_id, + data_gap_id, last_sync_run_id, updated_at, notes + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(ticker, stage) DO UPDATE SET + status = excluded.status, + required = excluded.required, + due_date = excluded.due_date, + completed_at = excluded.completed_at, + last_source_id = excluded.last_source_id, + data_gap_id = excluded.data_gap_id, + last_sync_run_id = excluded.last_sync_run_id, + updated_at = excluded.updated_at, + notes = excluded.notes + """, + [ + ( + state.ticker, + state.stage, + state.status, + state.required, + state.due_date, + state.completed_at, + state.last_source_id, + state.data_gap_id, + run_id, + updated_at, + state.notes, + ) + for state in states + ], + ) + + +def rebuild_tasks(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None: + conn.execute("DELETE FROM sync_tasks") + rows = [] + for state in states: + if state.status == "complete" or state.required == 0: + continue + task_status = { + "pending_due": "open", + "pending_not_due": "waiting_until_due", + "blocked": "blocked", + }.get(state.status) + if task_status is None: + continue + rows.append( + ( + f"{state.ticker}_{state.stage}", + state.ticker, + state.stage, + TASK_TYPES[state.stage], + task_status, + state.due_date, + state.data_gap_id, + run_id, + updated_at, + state.notes, + ) + ) + conn.executemany( + """ + INSERT INTO sync_tasks ( + task_id, ticker, stage, task_type, task_status, due_date, data_gap_id, + last_sync_run_id, updated_at, notes + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + rows, + ) + + +def export_snapshot(conn: sqlite3.Connection, table: str, order_by: str = "1") -> None: + SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True) + cursor = conn.execute(f"SELECT * FROM {table} ORDER BY {order_by}") + columns = [description[0] for description in cursor.description] + with (SNAPSHOT_DIR / f"{table}.csv").open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle, lineterminator="\n") + writer.writerow(columns) + writer.writerows(cursor.fetchall()) + + +def print_summary(states: list[StageState]) -> None: + counts: dict[str, int] = {} + for state in states: + counts[state.status] = counts.get(state.status, 0) + 1 + print("sync state updated") + for status in sorted(counts): + print(f"{status}: {counts[status]}") + open_items = [state for state in states if state.status in {"pending_due", "blocked"}] + if open_items: + print("actionable items:") + for state in open_items: + print(f"- {state.ticker} {state.stage}: {state.status} due={state.due_date or ''}") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.") + parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH), help="Repo-relative schema path.") + parser.add_argument("--as-of", help="ISO timestamp for deterministic sync-state snapshots.") + parser.add_argument("--run-id", help="Stable sync run id. Defaults to sync_state_.") + parser.add_argument("--mode", default="state_refresh", help="Sync run mode label.") + args = parser.parse_args() + + as_of_dt = parse_as_of(args.as_of) + as_of = as_of_dt.isoformat().replace("+00:00", "Z") + as_of_date = as_of_dt.date() + run_id = args.run_id or "sync_state_" + as_of.replace(":", "").replace("-", "").replace("+", "").replace("Z", "Z") + + db_path = Path(args.db) + schema_path = Path(args.schema) + with sqlite3.connect(db_path) as conn: + conn.row_factory = sqlite3.Row + conn.executescript(schema_path.read_text(encoding="utf-8")) + upsert_sync_run(conn, run_id, args.mode, as_of, "running", "Refreshing derived ticker sync state.") + states = [state for ticker in load_tickers(conn) for state in build_states(conn, ticker, as_of_date)] + replace_state(conn, states, run_id, as_of) + rebuild_tasks(conn, states, run_id, as_of) + upsert_sync_run(conn, run_id, args.mode, as_of, "complete", "Derived ticker sync state refreshed.") + export_snapshot(conn, "sync_runs", "sync_run_id") + export_snapshot( + conn, + "ticker_sync_state", + """ + ticker, + CASE stage + WHEN 'T0_prospectus' THEN 0 + WHEN 'T1_allotment' THEN 1 + WHEN 'T2_grey_market' THEN 2 + WHEN 'D1' THEN 3 + WHEN 'D5' THEN 4 + WHEN 'D20' THEN 5 + WHEN 'D60' THEN 6 + ELSE 99 + END + """, + ) + export_snapshot(conn, "sync_tasks", "task_status, due_date, ticker, stage") + print_summary(states) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())