5f9546b16c
Request:
- Rework archivist handling for stubborn T0/T1 HKEX document gaps and unresolved T2 grey-market gaps.

Changes:
- Query HKEXnews titleSearchServlet with IPO-date windows instead of only the latest title-search page.
- Recognize SHARE OFFER listing documents and archive official HTML allotment-result notices when no PDF is published.
- Mark source-only allotment completion clearly when structured demand parsing is not yet covered.
- Add a reusable grey-market gap marker and archivist source policy for T2 data.
- Archive newly discovered HKEX raw sources, update SQLite, and refresh CSV snapshots.
- Treat raw evidence files as binary in Git attributes.

Verification:
- Ran py_compile for archive_hkex_documents.py, update_sync_state.py, and mark_grey_market_gaps.py.
- Ran HKEX document archive backfill and grey-market gap marker.
- Checked SQLite integrity, foreign keys, source paths, source hashes, and DB-vs-snapshot row counts.
- Ran git diff --cached --check after marking raw archives binary.

Next useful context:
- T0 is now complete for 293 tickers.
- T1 has 291 complete and 2 pending_not_due tickers.
- T2 has 291 blocked gaps pending an approved grey-market source strategy.
130 lines
4.3 KiB
Python
130 lines
4.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Mark due T2 grey-market tasks as blocked until a reliable source is approved."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import sqlite3
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
|
|
# SQLite database path used when --db is not supplied.
DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite")
# Schema script path used when --schema is not supplied.
DEFAULT_SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
# Directory that receives one "<table>.csv" snapshot per exported table.
SNAPSHOT_DIR = Path("data/snapshots")
|
|
|
|
|
|
def parse_as_of(value: str | None) -> str:
|
|
if value:
|
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).isoformat().replace("+00:00", "Z")
|
|
return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
|
|
|
|
|
|
def export_snapshot(conn: sqlite3.Connection, table: str, order_by: str = "1") -> None:
    """Write every row of ``table`` to ``SNAPSHOT_DIR/<table>.csv``.

    The file is UTF-8 with ``\\n`` line endings and starts with a header row
    of column names taken from the query result. ``SNAPSHOT_DIR`` is created
    if it does not exist, and an existing snapshot file is overwritten.

    Args:
        conn: Open SQLite connection to read from.
        table: Name of the table to export; also used as the file stem.
        order_by: SQL ``ORDER BY`` expression; the default ``"1"`` sorts by
            the first selected column.
    """
    SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
    # Identifiers cannot be bound as SQL parameters, so both values are
    # interpolated into the statement: pass trusted, code-defined names only.
    cursor = conn.execute(f"SELECT * FROM {table} ORDER BY {order_by}")
    header = [column[0] for column in cursor.description]
    target = SNAPSHOT_DIR / f"{table}.csv"
    with target.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle, lineterminator="\n")
        writer.writerow(header)
        # Stream rows straight from the cursor instead of materializing the
        # whole table with fetchall() first.
        writer.writerows(cursor)
|
|
|
|
|
|
def refresh_sync_state(db_path: str, schema_path: str, as_of: str) -> None:
    """Run ``scripts/update_sync_state.py`` in ``grey_market_gap_review`` mode.

    The script is launched with the current interpreter and a path relative
    to the working directory.

    Args:
        db_path: Value forwarded as ``--db``.
        schema_path: Value forwarded as ``--schema``.
        as_of: Value forwarded as ``--as-of``.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    # Insertion order of the mapping fixes the order of the argv pairs.
    options = {
        "--db": db_path,
        "--schema": schema_path,
        "--as-of": as_of,
        "--mode": "grey_market_gap_review",
        "--summary-limit": "25",
    }
    command = [sys.executable, "scripts/update_sync_state.py"]
    for flag, setting in options.items():
        command += [flag, setting]
    subprocess.run(command, check=True)
|
|
|
|
|
|
def main() -> int:
    """Record a blocking data gap for every due T2 grey-market task.

    Steps, in order:

    1. Apply the schema script to the database.
    2. Select tickers in ``ticker_sync_state`` whose ``T2_grey_market`` stage
       has status ``pending_due``.
    3. Upsert one ``data_gaps`` row per ticker, keyed by
       ``<ticker>_T2_grey_market_source_strategy_required``.
    4. Rewrite the ``data_gaps`` CSV snapshot.
    5. Unless ``--skip-sync-state`` is given, call ``refresh_sync_state``.

    Returns:
        ``0`` on success. Failures propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
    parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH), help="Repo-relative schema path.")
    parser.add_argument("--as-of", help="Archive timestamp. Defaults to current UTC time.")
    parser.add_argument("--skip-sync-state", action="store_true", help="Do not refresh sync state after marking gaps.")
    args = parser.parse_args()

    as_of = parse_as_of(args.as_of)
    reason = (
        "No reproducible and redistribution-safe grey-market data source has been approved. "
        "HKEX does not publish an official grey-market feed; broker and third-party grey-market feeds "
        "are platform-specific or proprietary."
    )
    notes = (
        "Keep T2 blocked until the project has a licensed export, user-provided evidence file, "
        "or public historical source with clear reuse terms. Do not scrape proprietary grey-market "
        "feeds into the repo."
    )

    # Using a sqlite3 connection as a context manager only commits or rolls
    # back the transaction; it never closes the connection. Close it
    # explicitly so the handle is released before the child process is given
    # the same database path.
    conn = sqlite3.connect(args.db)
    try:
        with conn:
            conn.executescript(Path(args.schema).read_text(encoding="utf-8"))
            tickers = [
                row[0]
                for row in conn.execute(
                    """
                    SELECT ticker
                    FROM ticker_sync_state
                    WHERE stage = 'T2_grey_market'
                      AND status = 'pending_due'
                    ORDER BY ticker
                    """
                )
            ]
            rows = [
                (
                    f"{ticker}_T2_grey_market_source_strategy_required",
                    ticker,
                    "T2_grey_market",
                    "grey_market_price_performance",
                    reason,
                    None,  # expected_resolution_date: none is recorded
                    as_of,
                    notes,
                )
                for ticker in tickers
            ]
            # The gap_id is deterministic per ticker, so a re-run updates the
            # existing row instead of inserting a duplicate.
            conn.executemany(
                """
                INSERT INTO data_gaps (
                    gap_id, ticker, stage, field_name, reason, expected_resolution_date, created_at, notes
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(gap_id) DO UPDATE SET
                    field_name = excluded.field_name,
                    reason = excluded.reason,
                    expected_resolution_date = excluded.expected_resolution_date,
                    created_at = excluded.created_at,
                    notes = excluded.notes
                """,
                rows,
            )
            export_snapshot(conn, "data_gaps")
    finally:
        conn.close()

    if not args.skip_sync_state:
        refresh_sync_state(args.db, args.schema, as_of)

    print("grey-market gaps marked")
    print(f"tickers marked: {len(rows)}")
    return 0
|
|
|
|
|
|
# When run as a script, execute main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|