Improve IPO archive gap handling

Request:
- Rework archivist handling for stubborn T0/T1 HKEX document gaps and unresolved T2 grey-market gaps.

Changes:
- Query HKEXnews titleSearchServlet with IPO-date windows instead of only the latest title-search page.
- Recognize SHARE OFFER listing documents and archive official HTML allotment-result notices when no PDF is published.
- Mark source-only allotment completion clearly when structured demand parsing is not yet covered.
- Add a reusable grey-market gap marker and archivist source policy for T2 data.
- Archive newly discovered HKEX raw sources, update SQLite, and refresh CSV snapshots.
- Treat raw evidence files as binary in Git attributes.

Verification:
- Ran py_compile for archive_hkex_documents.py, update_sync_state.py, and mark_grey_market_gaps.py.
- Ran HKEX document archive backfill and grey-market gap marker.
- Checked SQLite integrity, foreign keys, source paths, source hashes, and DB-vs-snapshot row counts.
- Ran git diff --cached --check after marking raw archives binary.

Next useful context:
- T0 is now complete for 293 tickers.
- T1 has 291 complete and 2 pending_not_due tickers.
- T2 has 291 blocked gaps pending an approved grey-market source strategy.
This commit is contained in:
2026-06-15 09:47:36 +00:00
parent 078f56998b
commit 5f9546b16c
184 changed files with 1204626 additions and 2857 deletions
+85 -9
View File
@@ -14,9 +14,9 @@ import sqlite3
import subprocess
import sys
from dataclasses import dataclass
from datetime import date, datetime, timezone
from datetime import date, datetime, timedelta, timezone
from pathlib import Path
from urllib.parse import urljoin
from urllib.parse import urlencode, urljoin
from urllib.request import Request, urlopen
from pypdf import PdfReader
@@ -28,6 +28,7 @@ BASE_URL = "https://www1.hkexnews.hk"
ACTIVE_STOCK_URL = f"{BASE_URL}/ncms/script/eds/activestock_sehk_e.json"
INACTIVE_STOCK_URL = f"{BASE_URL}/ncms/script/eds/inactivestock_sehk_e.json"
TITLE_SEARCH_URL = f"{BASE_URL}/search/titlesearch.xhtml"
TITLE_SEARCH_SERVLET_URL = f"{BASE_URL}/search/titleSearchServlet.do"
DB_PATH = Path("data/hk_ipo.sqlite")
SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
SNAPSHOT_DIR = Path("data/snapshots")
@@ -130,7 +131,11 @@ def parse_release_date(value: str) -> str:
return datetime.strptime(value.split()[0], "%d/%m/%Y").date().isoformat()
def title_search_rows(stock_id: int) -> list[DocumentRow]:
def parse_release_datetime(value: str) -> str:
    """Return the ISO calendar date of a ``DD/MM/YYYY HH:MM`` release timestamp.

    Args:
        value: Release timestamp text such as ``"15/06/2026 09:47"``.
            Leading/trailing whitespace and repeated internal whitespace
            are tolerated.

    Returns:
        The date part formatted as ``YYYY-MM-DD``; the time of day is
        validated but discarded.

    Raises:
        ValueError: If ``value`` does not match ``%d/%m/%Y %H:%M``.
    """
    # strptime rejects trailing whitespace ("unconverted data remains"), so
    # collapse every whitespace run to a single space before parsing.
    normalized = " ".join(value.split())
    return datetime.strptime(normalized, "%d/%m/%Y %H:%M").date().isoformat()
def latest_title_search_rows(stock_id: int) -> list[DocumentRow]:
url = f"{TITLE_SEARCH_URL}?category=0&market=SEHK&stockId={stock_id}"
page = fetch_bytes(url).decode("utf-8", "replace")
rows: list[DocumentRow] = []
@@ -155,6 +160,65 @@ def title_search_rows(stock_id: int) -> list[DocumentRow]:
return rows
def window_title_search_rows(stock_id: int, from_date: date, to_date: date) -> list[DocumentRow]:
    """Fetch title-search rows for one stock within a release-date window.

    Sends a GET request to ``TITLE_SEARCH_SERVLET_URL`` and converts each
    returned item into a ``DocumentRow``.

    Args:
        stock_id: Stock identifier sent as the ``stockId`` query parameter.
        from_date: Start of the window, sent as ``fromDate`` (``YYYYMMDD``).
        to_date: End of the window, sent as ``toDate`` (``YYYYMMDD``).

    Returns:
        One ``DocumentRow`` per item that has both a file link and a release
        time; an empty list when the response carries no items.

    Raises:
        ValueError: If the response is not valid JSON or a release time does
            not match the format expected by ``parse_release_datetime``.
        OSError: Propagated from ``urlopen`` on network failures.
    """
    params = {
        "sortDir": "0",
        "sortByOptions": "DateTime",
        "category": "0",
        "market": "SEHK",
        "stockId": str(stock_id),
        "documentType": "-1",
        "fromDate": from_date.strftime("%Y%m%d"),
        "toDate": to_date.strftime("%Y%m%d"),
        "title": "",
        "searchType": "0",
        "t1code": "-2",
        "t2Gcode": "-2",
        "t2code": "-2",
        "rowRange": "500",
        "lang": "en",
    }
    url = f"{TITLE_SEARCH_SERVLET_URL}?{urlencode(params)}"
    request = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0",
            "Referer": f"{TITLE_SEARCH_URL}?category=0&market=SEHK&stockId={stock_id}",
        },
    )
    with urlopen(request, timeout=60) as response:
        payload = response.read().decode("utf-8", "replace")
    response_data = json.loads(payload)
    # "result" is itself a JSON document encoded as a string. When that string
    # is the JSON literal "null" (presumably how the servlet reports an empty
    # result set -- confirm against a live response), json.loads returns None,
    # which must not be iterated; treat it as "no rows".
    result = json.loads(response_data.get("result") or "[]") or []
    rows: list[DocumentRow] = []
    for item in result:
        href = html.unescape(item.get("FILE_LINK") or "")
        # Collapse whitespace runs so the timestamp parses as "DD/MM/YYYY HH:MM".
        release_time = " ".join((item.get("DATE_TIME") or "").split())
        if not href or not release_time:
            # Items without a link or timestamp cannot be archived or dated.
            continue
        rows.append(
            DocumentRow(
                release_time=release_time,
                release_date=parse_release_datetime(release_time),
                headline=clean_html(item.get("SHORT_TEXT") or ""),
                title=clean_html(item.get("TITLE") or ""),
                href=href,
                url=urljoin(BASE_URL, href),
            )
        )
    return rows
def title_search_rows(stock_id: int, listing_date: str | None, prospectus_date: str | None) -> list[DocumentRow]:
    """Return title-search rows for a stock, windowed around its IPO dates.

    The listing date takes precedence: when it parses, the window runs from
    90 days before to 14 days after it. Otherwise, when the prospectus date
    parses, the window runs from 14 days before to 60 days after it. With
    neither date available, the latest title-search page is used instead.

    Args:
        stock_id: Stock identifier passed through to the search helpers.
        listing_date: Listing date text, or ``None`` when unknown.
        prospectus_date: Prospectus date text, or ``None`` when unknown.

    Returns:
        The rows produced by the selected search helper.
    """
    # Both dates are parsed up front, before any branch is taken.
    listing_day = parse_iso_date(listing_date)
    prospectus_day = parse_iso_date(prospectus_date)

    if listing_day:
        window_start = listing_day - timedelta(days=90)
        window_end = listing_day + timedelta(days=14)
    elif prospectus_day:
        window_start = prospectus_day - timedelta(days=14)
        window_end = prospectus_day + timedelta(days=60)
    else:
        # No usable date: fall back to the un-windowed latest page.
        return latest_title_search_rows(stock_id)

    return window_title_search_rows(stock_id, window_start, window_end)
def parse_iso_date(value: str | None) -> date | None:
if not value:
return None
@@ -165,6 +229,10 @@ def date_distance(left: str, right: str) -> int:
return abs((date.fromisoformat(left) - date.fromisoformat(right)).days)
def archiveable_document(row: DocumentRow) -> bool:
    """Report whether a row links to a document type that is archived.

    A row qualifies when the path component of ``row.href`` ends in
    ``.pdf``, ``.htm`` or ``.html`` (case-insensitive).

    Args:
        row: Search row whose ``href`` attribute is inspected.

    Returns:
        ``True`` for PDF and HTML links, ``False`` otherwise.
    """
    # Local import keeps this block self-contained; the module-level imports
    # only bring in urlencode and urljoin from urllib.parse.
    from urllib.parse import urlsplit

    # Judge the extension on the URL path alone so a query string or fragment
    # (e.g. "notice.htm?lang=en") does not end up inside the suffix.
    link_path = urlsplit(row.href).path
    return Path(link_path.lower()).suffix in {".pdf", ".htm", ".html"}
def choose_prospectus(rows: list[DocumentRow], prospectus_date: str | None, listing_date: str | None) -> DocumentRow | None:
candidates = []
for row in rows:
@@ -174,7 +242,7 @@ def choose_prospectus(rows: list[DocumentRow], prospectus_date: str | None, list
continue
if "listing documents" not in headline:
continue
if "global offering" in title or "prospectus" in title:
if "global offering" in title or "prospectus" in title or title in {"share offer", "public offer"}:
candidates.append(row)
if not candidates:
return None
@@ -196,7 +264,7 @@ def choose_allotment(rows: list[DocumentRow], listing_date: str | None) -> Docum
candidates = [
row
for row in rows
if row.href.lower().endswith(".pdf")
if archiveable_document(row)
and ("allotment results" in row.headline.lower() or "allotment results" in row.title.lower())
]
if not candidates:
@@ -708,8 +776,8 @@ def main() -> int:
if stock_id is None:
missing_stock_ids.append(ticker)
continue
rows = title_search_rows(stock_id)
listing_date, prospectus_date = ticker_dates(conn, ticker)
rows = title_search_rows(stock_id, listing_date, prospectus_date)
prospectus_row = choose_prospectus(rows, prospectus_date, listing_date)
allotment_row = choose_allotment(rows, listing_date)
if not prospectus_row and not allotment_row:
@@ -733,9 +801,17 @@ def main() -> int:
if allotment_row:
allotment_source = download_document(ticker, "allotment_results", allotment_row)
sources_for_ticker.append(allotment_source)
allotment_facts = parse_allotment_facts(allotment_source.local_path)
update_terms_from_allotment(conn, ticker, allotment_facts, as_of)
upsert_demand(conn, ticker, allotment_source.source_id, allotment_source.source_date, allotment_facts, as_of)
if Path(allotment_source.local_path).suffix.lower() == ".pdf":
allotment_facts = parse_allotment_facts(allotment_source.local_path)
update_terms_from_allotment(conn, ticker, allotment_facts, as_of)
upsert_demand(
conn,
ticker,
allotment_source.source_id,
allotment_source.source_date,
allotment_facts,
as_of,
)
upsert_source_refs(conn, sources_for_ticker, as_of)
archived_sources.extend(sources_for_ticker)