Refresh HK IPO heat ranking

Request:
- Update the latest Hong Kong IPO candidate list and rescore it based on subscription multiples.

Changes:
- Archived the 2026-06-22 HKEX Main Board New Listing Information page, adding 02697, 03952, 06715, and 06915 to the current candidate set.
- Archived and extracted the four new prospectuses, refreshed current HKEX document facts, and rebuilt the v0 analysis dataset to 311 rows.
- Archived a 2026-06-22T08:55:00Z VBKR/Jieli market-heat snapshot and wrote only still-actionable T0.95 rows to avoid look-ahead leakage for already-closed IPOs.
- Improved prospectus date parsing for split weekday/month text, glued noon/commence phrases, and current new-listing expected listing-date updates.
- Added a Chinese 2026-06-22 latest IPO report ranking candidates after the subscription-multiple overlay.

Verification:
- Ran py_compile for archive_hkex_documents.py, archive_t0_5_market_heat.py, archive_hkex_current_new_listings.py, and build_analysis_dataset.py.
- Re-ran HKEX current-page seeding, document archiving, market-heat archiving, and analysis dataset build as of 2026-06-22T08:55:00Z.
- Ran git diff --check and git diff --cached --check.
- Ran SQLite integrity_check and foreign_key_check.
- Verified source_refs paths, file existence, and SHA-256 hashes.

Next useful context:
- 01956 is the only current candidate with both strong T0 structure and >100x actionable heat in this snapshot.
- Recheck 03952 and 06715 near the 2026-06-25 cutoff; their structure is strong but 2026-06-22 heat is below 10x.
- Official T1 allotment facts for 06067 and 06132 were still unavailable at this archive timestamp.
This commit is contained in:
2026-06-22 09:03:50 +00:00
parent e346690bb7
commit e746cae035
24 changed files with 96314 additions and 2979 deletions
+55 -7
View File
@@ -340,16 +340,54 @@ def first_pdf_text_with_lines(local_path: str, max_pages: int) -> str:
def normalize_pdf_text(text: str) -> str:
    """Repair letter-spaced and glued date phrases in extracted PDF text.

    The passes run in a fixed order, each working on the output of the
    previous one:

    1. Literal replacement of known letter-spaced phrases.
    2. Re-joining of the split short words "at", "on" and "from".
    3. Separation of a "commence" that is glued to a following "on".
    4. Collapsing of weekday and month names whose letters are separated
       by whitespace.  The match is case-insensitive and the replacement
       is the title-cased name, so e.g. a lower-case "may" is rewritten
       to "May" as well.
    5. Insertion of a space between a month name and a directly
       following digit.
    6. Re-joining of digit-spaced "D D , Y Y Y Y" day/year pairs.
    7. Normalisation of "noon on <Word>" phrases.

    Args:
        text: Text extracted from a PDF.

    Returns:
        The text with the above patterns normalised; everything else is
        returned unchanged.
    """
    replacements = {
        "H o n g K o n g P u b l i c O f f e r i n g c o m m e n c e s": "Hong Kong Public Offering commences",
        "H o n gK o n gP u b l i cO f f e r i n gc o m m e n c e s": "Hong Kong Public Offering commences",
        "a t o r b e f o r e": "at or before",
        "n o l a t e r": "no later",
        "o n o r b e f o r e": "on or before",
        "c o m m e n c e": "commence",
        "e x p e c t e d t o": "expected to",
        "e x p e c t e dt o": "expected to",
    }
    for source, target in replacements.items():
        text = text.replace(source, target)

    # Short words that were split into single letters.
    text = re.sub(r"\ba\s+t\b", "at", text)
    text = re.sub(r"\bo\s+n\b", "on", text)
    text = re.sub(r"\bf\s+r\s+o\s+m\b", "from", text)

    # "commence" glued to the following "on" (with or without "expected to").
    text = re.sub(r"\bexpected\s*tocommenceo\s*n\b", "expected to commence on", text, flags=re.I)
    text = re.sub(r"\bexpected\s+to\s+commenceo\s*n\b", "expected to commence on", text, flags=re.I)
    text = re.sub(r"\bcommenceo\s*n\b", "commence on", text, flags=re.I)

    calendar_words = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    )
    for word in calendar_words:
        # Allow optional whitespace between every letter, but do not match
        # inside a longer alphabetic run.
        pattern = r"(?<![A-Za-z])" + r"\s*".join(word) + r"(?![A-Za-z])"
        text = re.sub(pattern, word, text, flags=re.I)

    # "June25" -> "June 25".
    text = re.sub(
        r"\b(January|February|March|April|May|June|July|August|September|October|November|December)(\d)",
        r"\1 \2",
        text,
    )

    # "2 5 , 2 0 2 6" -> "25, 2026" and "25 , 2 0 2 6" -> "25, 2026".
    text = re.sub(r"\b(\d)\s+(\d)\s*,\s*(\d)\s+(\d)\s+(\d)\s+(\d)\b", r"\1\2, \3\4\5\6", text)
    text = re.sub(r"\b(\d{1,2})\s*,\s*(\d)\s+(\d)\s+(\d)\s+(\d)\b", r"\1, \2\3\4\5", text)

    # "n o o n o n Thursday" / "noonon Thursday" -> "noon on Thursday".
    text = re.sub(r"(?<![A-Za-z])n\s*o\s*o\s*n\s*o\s*n\s+([A-Z][a-z]+)", r"noon on \1", text, flags=re.I)
    # "noon Thursday" -> "noon on Thursday".  Under re.I the word class also
    # matches "on", so without the lookahead an already-correct "noon on X"
    # (including the output of the previous pass) would become "noon on on X".
    text = re.sub(r"\bno\s*o\s*n\s+(?!on\b)([A-Z][a-z]+)", r"noon on \1", text, flags=re.I)
    return text
@@ -395,8 +433,8 @@ def strict_money_m_after(pattern: str, text: str) -> float | None:
def date_after(label_pattern: str, text: str) -> str | None:
match = re.search(
label_pattern
+ r".{0,600}?(?:on|from|at or before)\s+(?:[.\s]+)?(?:[A-Z][a-z]+,\s+)?"
+ r"([A-Z][a-z]+ \d{1,2}, \d{4}|\d{1,2} [A-Z][a-z]+ \d{4})",
+ r".{0,600}?\b(?:on(?: or about)?|from|at or before)\b\s*(?:[.\s]+)?(?:[A-Z][a-z]+\s*,\s*)?"
+ r"([A-Z][a-z]+ \d{1,2},\s*\d{4}|\d{1,2} [A-Z][a-z]+ \d{4})",
text,
flags=re.I,
)
@@ -471,16 +509,22 @@ def parse_prospectus_facts(local_path: str) -> ProspectusFacts:
allotment_date = (
date_after(r"Announcement of the level of indications.*?basis of allocation", text)
or date_after(r"The results of allocations", text)
or date_after(r"allotment results announcement", text)
or date_after(r"Announcement of", text)
or date_after(r"Announcement of.*?Offer Price", text)
)
return ProspectusFacts(
application_start_date=date_after(r"Hong Kong Public Offering commences", text),
application_start_date=(
date_after(r"Hong Kong Public Offering commences", text)
or date_after(r"Application lists.*?open", text)
),
application_end_date=date_after(r"Application lists.*?close", text),
allotment_results_expected_date=allotment_date,
listing_date=date_after(
r"Dealings in (?:our\s+|the\s+)?(?:H\s+)?(?:Shares|HDRs).*?(?:expected to commence|to commence)",
text,
listing_date=(
date_after(
r"Dealings in (?:our\s+|the\s+)?(?:H\s+)?(?:Shares|HDRs).*?(?:expected to commence|to commence)",
text,
)
),
offer_price_hkd=offer_price,
board_lot=board_lot,
@@ -834,7 +878,10 @@ def update_master_from_prospectus(conn: sqlite3.Connection, ticker: str, facts:
SET application_start_date = COALESCE(?, application_start_date),
application_end_date = COALESCE(?, application_end_date),
allotment_results_expected_date = COALESCE(?, allotment_results_expected_date),
listing_date = COALESCE(listing_date, ?),
listing_date = CASE
WHEN status = 'listed' THEN COALESCE(listing_date, ?)
ELSE COALESCE(?, listing_date)
END,
data_as_of = ?
WHERE ticker = ?
""",
@@ -843,6 +890,7 @@ def update_master_from_prospectus(conn: sqlite3.Connection, ticker: str, facts:
facts.application_end_date,
facts.allotment_results_expected_date,
facts.listing_date,
facts.listing_date,
as_of,
ticker,
),
+13 -1
View File
@@ -11,7 +11,7 @@ import re
import sqlite3
import subprocess
from dataclasses import dataclass
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from pathlib import Path
from urllib.request import Request, urlopen
@@ -132,6 +132,16 @@ def parse_deadline(value: str) -> str:
return " ".join(lines)
def is_still_actionable(row: MarketHeatRow, as_of: str) -> bool:
    """Tell whether the row's subscription deadline is still ahead of ``as_of``.

    ``row.subscription_deadline`` is parsed as a naive ``YYYY-MM-DD HH:MM``
    value and compared with ``as_of`` after converting the latter to UTC+8
    wall-clock time.

    Args:
        row: Object exposing a ``subscription_deadline`` string.
        as_of: ISO-8601 observation timestamp; a trailing ``Z`` is accepted.

    Returns:
        ``True`` when the observation time is strictly before the deadline,
        ``False`` when it is at/after the deadline or when the deadline does
        not match the expected format.
    """
    utc_plus_8 = timezone(timedelta(hours=8))

    # An unparseable deadline means the row cannot be shown to be open.
    try:
        cutoff = datetime.strptime(row.subscription_deadline, "%Y-%m-%d %H:%M")
    except ValueError:
        return False

    # fromisoformat is fed an explicit offset instead of the "Z" suffix.
    snapshot = datetime.fromisoformat(as_of.replace("Z", "+00:00"))
    snapshot_local = snapshot.astimezone(utc_plus_8)

    # Drop tzinfo so both sides are naive UTC+8 wall-clock values.
    return snapshot_local.replace(tzinfo=None) < cutoff
def parse_rows(page: str) -> list[MarketHeatRow]:
rows: list[MarketHeatRow] = []
for tr in re.findall(r"<tr[^>]*>(.*?)</tr>", page, flags=re.S):
@@ -220,6 +230,8 @@ def upsert_rows(
for row in rows:
if row.ticker not in selected:
continue
if stage == T0_95_STAGE and not is_still_actionable(row, as_of):
continue
source_id = f"{row.ticker}_{slug}_vbkr_{compact_timestamp(as_of)}"
heat_id = source_id
conn.execute(