Refresh HK IPO heat ranking
Request: - Update the latest Hong Kong IPO candidate list and rescore it based on subscription multiples. Changes: - Archived the 2026-06-22 HKEX Main Board New Listing Information page, adding 02697, 03952, 06715, and 06915 to the current candidate set. - Archived and extracted the four new prospectuses, refreshed current HKEX document facts, and rebuilt the v0 analysis dataset to 311 rows. - Archived a 2026-06-22T08:55:00Z VBKR/Jieli market-heat snapshot and wrote only still-actionable T0.95 rows to avoid look-ahead leakage for already-closed IPOs. - Improved prospectus date parsing for split weekday/month text, glued noon/commence phrases, and current new-listing expected listing-date updates. - Added a Chinese 2026-06-22 latest IPO report ranking candidates after the subscription-multiple overlay. Verification: - Ran py_compile for archive_hkex_documents.py, archive_t0_5_market_heat.py, archive_hkex_current_new_listings.py, and build_analysis_dataset.py. - Re-ran HKEX current-page seeding, document archiving, market-heat archiving, and analysis dataset build as of 2026-06-22T08:55:00Z. - Ran git diff --check and git diff --cached --check. - Ran SQLite integrity_check and foreign_key_check. - Verified source_refs paths, file existence, and SHA-256 hashes. Next useful context: - 01956 is the only current candidate with both strong T0 structure and >100x actionable heat in this snapshot. - Recheck 03952 and 06715 near the 2026-06-25 cutoff; their structure is strong but 2026-06-22 heat is below 10x. - Official T1 allotment facts for 06067 and 06132 were still unavailable at this archive timestamp.
This commit is contained in:
@@ -340,16 +340,54 @@ def first_pdf_text_with_lines(local_path: str, max_pages: int) -> str:
|
||||
def normalize_pdf_text(text: str) -> str:
    """Repair letter-spaced and glued date phrases in extracted PDF text.

    The rules run in a fixed order, each on the output of the previous one:

    1. Literal replacement of known letter-spaced phrases.
    2. Re-joining of split "at", "on" and "from", and un-gluing of
       "commenceon" variants.
    3. Collapsing weekday and month names whose letters are separated by
       whitespace. The match is case-insensitive and rewrites to the
       canonical capitalised spelling, so the ordinary words "may" and
       "march" are also re-capitalised.
    4. Inserting a space between a month name and a digit glued to it.
    5. Re-joining digit-spaced day/year pairs such as "2 5, 2 0 2 6".
    6. Normalising "noon" phrases so that "on" separates "noon" from the
       word that follows.

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The text with the repairs above applied.
    """
    replacements = {
        "H o n g K o n g P u b l i c O f f e r i n g c o m m e n c e s": "Hong Kong Public Offering commences",
        "H o n gK o n gP u b l i cO f f e r i n gc o m m e n c e s": "Hong Kong Public Offering commences",
        "a t o r b e f o r e": "at or before",
        "n o l a t e r": "no later",
        "o n o r b e f o r e": "on or before",
        "c o m m e n c e": "commence",
        "e x p e c t e d t o": "expected to",
        "e x p e c t e dt o": "expected to",
    }
    # Dict order is significant: the long phrases must be consumed before the
    # shorter "c o m m e n c e" rule can touch their tail.
    for source, target in replacements.items():
        text = text.replace(source, target)

    text = re.sub(r"\ba\s+t\b", "at", text)
    text = re.sub(r"\bo\s+n\b", "on", text)
    text = re.sub(r"\bf\s+r\s+o\s+m\b", "from", text)
    text = re.sub(r"\bexpected\s*tocommenceo\s*n\b", "expected to commence on", text, flags=re.I)
    text = re.sub(r"\bexpected\s+to\s+commenceo\s*n\b", "expected to commence on", text, flags=re.I)
    text = re.sub(r"\bcommenceo\s*n\b", "commence on", text, flags=re.I)

    calendar_words = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    )
    for word in calendar_words:
        # Allow optional whitespace between every letter, but never match
        # inside a longer alphabetic token.
        pattern = r"(?<![A-Za-z])" + r"\s*".join(word) + r"(?![A-Za-z])"
        text = re.sub(pattern, word, text, flags=re.I)

    text = re.sub(
        r"\b(January|February|March|April|May|June|July|August|September|October|November|December)(\d)",
        r"\1 \2",
        text,
    )
    text = re.sub(r"\b(\d)\s+(\d)\s*,\s*(\d)\s+(\d)\s+(\d)\s+(\d)\b", r"\1\2, \3\4\5\6", text)
    text = re.sub(r"\b(\d{1,2})\s*,\s*(\d)\s+(\d)\s+(\d)\s+(\d)\b", r"\1, \2\3\4\5", text)
    text = re.sub(r"(?<![A-Za-z])n\s*o\s*o\s*n\s*o\s*n\s+([A-Z][a-z]+)", r"noon on \1", text, flags=re.I)
    # Insert the missing "on" in "noon <Word>". Only the "noon" letters are
    # matched case-insensitively: with a global re.I the capture group also
    # matched the word "on" itself, turning "noon on Monday" into
    # "noon on on Monday". Weekday and month names are already capitalised by
    # the loop above, so requiring a capitalised word loses no date phrases.
    text = re.sub(
        r"\b(?i:no\s*o\s*n)\s+(?!(?i:on)\b)([A-Z][a-z]+)",
        r"noon on \1",
        text,
    )
    return text
|
||||
|
||||
|
||||
@@ -395,8 +433,8 @@ def strict_money_m_after(pattern: str, text: str) -> float | None:
|
||||
def date_after(label_pattern: str, text: str) -> str | None:
|
||||
match = re.search(
|
||||
label_pattern
|
||||
+ r".{0,600}?(?:on|from|at or before)\s+(?:[.\s]+)?(?:[A-Z][a-z]+,\s+)?"
|
||||
+ r"([A-Z][a-z]+ \d{1,2}, \d{4}|\d{1,2} [A-Z][a-z]+ \d{4})",
|
||||
+ r".{0,600}?\b(?:on(?: or about)?|from|at or before)\b\s*(?:[.\s]+)?(?:[A-Z][a-z]+\s*,\s*)?"
|
||||
+ r"([A-Z][a-z]+ \d{1,2},\s*\d{4}|\d{1,2} [A-Z][a-z]+ \d{4})",
|
||||
text,
|
||||
flags=re.I,
|
||||
)
|
||||
@@ -471,16 +509,22 @@ def parse_prospectus_facts(local_path: str) -> ProspectusFacts:
|
||||
allotment_date = (
|
||||
date_after(r"Announcement of the level of indications.*?basis of allocation", text)
|
||||
or date_after(r"The results of allocations", text)
|
||||
or date_after(r"allotment results announcement", text)
|
||||
or date_after(r"Announcement of", text)
|
||||
or date_after(r"Announcement of.*?Offer Price", text)
|
||||
)
|
||||
return ProspectusFacts(
|
||||
application_start_date=date_after(r"Hong Kong Public Offering commences", text),
|
||||
application_start_date=(
|
||||
date_after(r"Hong Kong Public Offering commences", text)
|
||||
or date_after(r"Application lists.*?open", text)
|
||||
),
|
||||
application_end_date=date_after(r"Application lists.*?close", text),
|
||||
allotment_results_expected_date=allotment_date,
|
||||
listing_date=date_after(
|
||||
r"Dealings in (?:our\s+|the\s+)?(?:H\s+)?(?:Shares|HDRs).*?(?:expected to commence|to commence)",
|
||||
text,
|
||||
listing_date=(
|
||||
date_after(
|
||||
r"Dealings in (?:our\s+|the\s+)?(?:H\s+)?(?:Shares|HDRs).*?(?:expected to commence|to commence)",
|
||||
text,
|
||||
)
|
||||
),
|
||||
offer_price_hkd=offer_price,
|
||||
board_lot=board_lot,
|
||||
@@ -834,7 +878,10 @@ def update_master_from_prospectus(conn: sqlite3.Connection, ticker: str, facts:
|
||||
SET application_start_date = COALESCE(?, application_start_date),
|
||||
application_end_date = COALESCE(?, application_end_date),
|
||||
allotment_results_expected_date = COALESCE(?, allotment_results_expected_date),
|
||||
listing_date = COALESCE(listing_date, ?),
|
||||
listing_date = CASE
|
||||
WHEN status = 'listed' THEN COALESCE(listing_date, ?)
|
||||
ELSE COALESCE(?, listing_date)
|
||||
END,
|
||||
data_as_of = ?
|
||||
WHERE ticker = ?
|
||||
""",
|
||||
@@ -843,6 +890,7 @@ def update_master_from_prospectus(conn: sqlite3.Connection, ticker: str, facts:
|
||||
facts.application_end_date,
|
||||
facts.allotment_results_expected_date,
|
||||
facts.listing_date,
|
||||
facts.listing_date,
|
||||
as_of,
|
||||
ticker,
|
||||
),
|
||||
|
||||
@@ -11,7 +11,7 @@ import re
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
@@ -132,6 +132,16 @@ def parse_deadline(value: str) -> str:
|
||||
return " ".join(lines)
|
||||
|
||||
|
||||
def is_still_actionable(row: MarketHeatRow, as_of: str) -> bool:
    """Tell whether a row's subscription deadline lies after the snapshot time.

    ``row.subscription_deadline`` is parsed as a naive ``YYYY-MM-DD HH:MM``
    value and compared against ``as_of`` converted to UTC+8 wall-clock time.

    Args:
        row: Market-heat row; only ``subscription_deadline`` is read.
        as_of: ISO-8601 observation timestamp; a trailing ``Z`` is accepted.

    Returns:
        ``True`` only when the observation is strictly earlier than the
        deadline. ``False`` when the deadline string does not match the
        expected format.
    """
    utc_plus_8 = timezone(timedelta(hours=8))

    try:
        cutoff = datetime.strptime(row.subscription_deadline, "%Y-%m-%d %H:%M")
    except ValueError:
        # An unparseable deadline is treated as not actionable.
        return False

    # NOTE(review): an ``as_of`` without any UTC offset would be interpreted
    # by ``astimezone`` as system-local time; callers appear to pass "...Z"
    # timestamps, confirm before relying on other formats.
    snapshot = datetime.fromisoformat(as_of.replace("Z", "+00:00"))
    snapshot_wall_clock = snapshot.astimezone(utc_plus_8).replace(tzinfo=None)
    return cutoff > snapshot_wall_clock
|
||||
|
||||
|
||||
def parse_rows(page: str) -> list[MarketHeatRow]:
|
||||
rows: list[MarketHeatRow] = []
|
||||
for tr in re.findall(r"<tr[^>]*>(.*?)</tr>", page, flags=re.S):
|
||||
@@ -220,6 +230,8 @@ def upsert_rows(
|
||||
for row in rows:
|
||||
if row.ticker not in selected:
|
||||
continue
|
||||
if stage == T0_95_STAGE and not is_still_actionable(row, as_of):
|
||||
continue
|
||||
source_id = f"{row.ticker}_{slug}_vbkr_{compact_timestamp(as_of)}"
|
||||
heat_id = source_id
|
||||
conn.execute(
|
||||
|
||||
Reference in New Issue
Block a user