Private
Public Access
0
0

Add external IPO history to heat model

Request:
- Add historical data around T0.5 margin heat and rebuild the model.

Changes:
- Add an external_ipo_history table to store third-party historical IPO records separately from true T0.5 market-heat snapshots.
- Add scripts/archive_ipohk_history.py to archive ipohk structured listed IPO history.
- Archive 807 ipohk rows, including final oversubscription, one-lot win rate, grey-market return, and first-day return where available.
- Extend the v0 analysis dataset with true T0.5 market-heat columns and separate external final-heat columns.
- Rebuild reports/2026-06-15_analysis_model_v0.md with T0.5 coverage and external final-heat calibration.
- Add a Chinese report explaining why historical final oversubscription cannot be treated as T0.5 margin snapshots.
- Update analyst and archivist skills to keep T0.5 and external final history separate.

Verification:
- .venv/bin/python -m py_compile scripts/build_analysis_dataset.py scripts/archive_ipohk_history.py scripts/archive_t0_5_market_heat.py
- .venv/bin/python scripts/build_analysis_dataset.py --as-of 2026-06-15T19:20:00Z
- Ran PRAGMA integrity_check and PRAGMA foreign_key_check through Python sqlite3: integrity_check returned ok and foreign_key_check returned zero rows.
- Confirmed 807 external_ipo_history rows, 792 rows with external final oversubscription, 5 true T0.5 market-heat rows, and 297 analysis dataset rows.
- git diff --cached --check

Next useful context:
- True T0.5 historical backtesting still requires ongoing frozen margin-heat snapshots during each IPO subscription window.
This commit is contained in:
2026-06-15 16:06:56 +00:00
parent 222f55c140
commit 943eab27cb
12 changed files with 1589 additions and 299 deletions
+140
View File
@@ -223,6 +223,24 @@ def score_t1(row: sqlite3.Row) -> tuple[int, str]:
return score, "|".join(components)
def score_t0_5_market_heat(row: sqlite3.Row) -> tuple[int | None, str]:
    """Score the T0.5 margin-subscription multiple of one dataset row.

    The multiple is read from ``row["t0_5_margin_subscription_multiple"]``
    through ``as_float``. When that yields ``None`` the row carries no T0.5
    signal and ``(None, "")`` is returned. Otherwise exactly one
    ``margin_subscription`` component is registered via ``add_component`` and
    that call's return value becomes the score.

    Returns:
        A ``(score, breakdown)`` pair, where ``breakdown`` is the registered
        components joined with ``"|"``.
    """
    multiple = as_float(row["t0_5_margin_subscription_multiple"])
    if multiple is None:
        return None, ""

    # (inclusive lower bound, points, label), checked from the hottest tier down.
    tiers = (
        (5000, 8, "gte_5000x"),
        (1000, 6, "1000x_to_5000x"),
        (100, 3, "100x_to_1000x"),
        (10, 0, "10x_to_100x"),
    )
    # Anything that clears no tier lands in the penalised bottom bucket.
    points, label = -3, "lt_10x"
    for floor, tier_points, tier_label in tiers:
        if multiple >= floor:
            points, label = tier_points, tier_label
            break

    parts: list[str] = []
    total = add_component(parts, "margin_subscription", points, label)
    return total, "|".join(parts)
def has_structured_t1(row: sqlite3.Row) -> bool:
return any(
row[key] is not None
@@ -257,6 +275,33 @@ def total_bucket(score: int) -> str:
return "total_gte_26"
def t0_plus_t0_5_bucket(score: int | None) -> str | None:
if score is None:
return None
if score < 5:
return "t0_5_lt_5"
if score <= 7:
return "t0_5_5_to_7"
if score <= 11:
return "t0_5_8_to_11"
return "t0_5_gte_12"
def external_oversub_bucket(value: Any) -> str | None:
    """Bucket an external public-oversubscription multiple.

    ``value`` is converted with ``as_float``; a ``None`` result means there is
    nothing to bucket and ``None`` is returned. Otherwise the label of the
    highest tier whose inclusive lower bound the multiple reaches is returned,
    with ``"external_os_lt_10x"`` as the fallback.
    """
    multiple = as_float(value)
    if multiple is None:
        return None

    # (inclusive lower bound, label), ordered from the highest tier down.
    floors = (
        (5000, "external_os_gte_5000x"),
        (1000, "external_os_1000x_to_5000x"),
        (100, "external_os_100x_to_1000x"),
        (10, "external_os_10x_to_100x"),
    )
    return next(
        (label for floor, label in floors if multiple >= floor),
        "external_os_lt_10x",
    )
def decision_band(row: dict[str, Any]) -> str:
if not row["has_structured_t1"]:
score = row["t0_score"]
@@ -306,6 +351,28 @@ def fetch_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]:
MAX(CASE WHEN stage = 'D1' THEN turnover_hkd_m END) AS d1_turnover_hkd_m
FROM price_performance
GROUP BY ticker
),
latest_market_heat AS (
SELECT h.*
FROM ipo_market_heat h
JOIN (
SELECT ticker, MAX(observed_at) AS observed_at
FROM ipo_market_heat
GROUP BY ticker
) latest
ON latest.ticker = h.ticker AND latest.observed_at = h.observed_at
),
external_history AS (
SELECT e.*
FROM external_ipo_history e
JOIN (
SELECT ticker, MAX(listing_date) AS listing_date
FROM external_ipo_history
WHERE provider = 'ipohk'
GROUP BY ticker
) latest
ON latest.ticker = e.ticker AND latest.listing_date = e.listing_date
WHERE e.provider = 'ipohk'
)
SELECT
m.ticker,
@@ -347,6 +414,15 @@ def fetch_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]:
p.d20_return_pct,
p.d60_return_pct,
p.d1_turnover_hkd_m,
h.observed_at AS t0_5_observed_at,
h.provider AS t0_5_provider,
h.margin_subscription_multiple AS t0_5_margin_subscription_multiple,
h.source_id AS t0_5_source_id,
eh.one_hand_win_rate_pct AS external_one_hand_win_rate_pct,
eh.public_oversubscription_times AS external_public_oversubscription_times,
eh.grey_market_return_pct AS external_grey_market_return_pct,
eh.first_day_return_pct AS external_first_day_return_pct,
eh.local_path AS external_history_source_path,
(
SELECT local_path
FROM source_refs s
@@ -366,6 +442,8 @@ def fetch_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]:
LEFT JOIN listing_reports lr ON lr.ticker = m.ticker
LEFT JOIN ipo_demand d ON d.ticker = m.ticker
LEFT JOIN performance p ON p.ticker = m.ticker
LEFT JOIN latest_market_heat h ON h.ticker = m.ticker
LEFT JOIN external_history eh ON eh.ticker = m.ticker
ORDER BY m.listing_date, m.ticker
"""
).fetchall()
@@ -376,8 +454,10 @@ def build_records(rows: list[sqlite3.Row], as_of: str) -> list[dict[str, Any]]:
for row in rows:
t0_score_value, t0_breakdown = score_t0(row)
t1_score_value, t1_breakdown = score_t1(row)
t0_5_score_value, t0_5_breakdown = score_t0_5_market_heat(row)
structured_t1 = has_structured_t1(row)
total_score = t0_score_value + (t1_score_value if structured_t1 else 0)
t0_plus_t0_5_score = t0_score_value + t0_5_score_value if t0_5_score_value is not None else None
size = offer_size_hkd_m(row)
record: dict[str, Any] = {
"model_version": MODEL_VERSION,
@@ -406,6 +486,14 @@ def build_records(rows: list[sqlite3.Row], as_of: str) -> list[dict[str, Any]]:
"over_allotment_offer_shares": row["over_allotment_offer_shares"],
"public_oversubscription_times": row["public_oversubscription_times"],
"international_oversubscription_times": row["international_oversubscription_times"],
"t0_5_observed_at": row["t0_5_observed_at"],
"t0_5_provider": row["t0_5_provider"],
"t0_5_margin_subscription_multiple": row["t0_5_margin_subscription_multiple"],
"t0_5_source_id": row["t0_5_source_id"],
"t0_5_add_score": t0_5_score_value,
"t0_plus_t0_5_score": t0_plus_t0_5_score,
"t0_plus_t0_5_score_bucket": t0_plus_t0_5_bucket(t0_plus_t0_5_score),
"t0_5_score_breakdown": t0_5_breakdown,
"valid_applications": row["valid_applications"],
"successful_applications": row["successful_applications"],
"application_success_rate": success_rate(row),
@@ -426,6 +514,12 @@ def build_records(rows: list[sqlite3.Row], as_of: str) -> list[dict[str, Any]]:
"d1_turnover_hkd_m": row["d1_turnover_hkd_m"],
"d1_positive": as_float(row["d1_return_pct"]) is not None and as_float(row["d1_return_pct"]) > 0,
"d1_strong_10pct": as_float(row["d1_return_pct"]) is not None and as_float(row["d1_return_pct"]) >= 10,
"external_one_hand_win_rate_pct": row["external_one_hand_win_rate_pct"],
"external_public_oversubscription_times": row["external_public_oversubscription_times"],
"external_public_oversubscription_bucket": external_oversub_bucket(row["external_public_oversubscription_times"]),
"external_grey_market_return_pct": row["external_grey_market_return_pct"],
"external_first_day_return_pct": row["external_first_day_return_pct"],
"external_history_source_path": row["external_history_source_path"],
"prospectus_source_path": row["prospectus_source_path"],
"allotment_source_path": row["allotment_source_path"],
}
@@ -514,6 +608,13 @@ def write_dataset(records: list[dict[str, Any]], output_path: Path) -> None:
"over_allotment_offer_shares",
"public_oversubscription_times",
"international_oversubscription_times",
"t0_5_observed_at",
"t0_5_provider",
"t0_5_margin_subscription_multiple",
"t0_5_source_id",
"t0_5_add_score",
"t0_plus_t0_5_score",
"t0_plus_t0_5_score_bucket",
"valid_applications",
"successful_applications",
"application_success_rate",
@@ -534,9 +635,16 @@ def write_dataset(records: list[dict[str, Any]], output_path: Path) -> None:
"d1_turnover_hkd_m",
"d1_positive",
"d1_strong_10pct",
"external_one_hand_win_rate_pct",
"external_public_oversubscription_times",
"external_public_oversubscription_bucket",
"external_grey_market_return_pct",
"external_first_day_return_pct",
"external_history_source_path",
"prospectus_source_path",
"allotment_source_path",
"t0_score_breakdown",
"t0_5_score_breakdown",
"t1_score_breakdown",
]
with output_path.open("w", newline="", encoding="utf-8") as handle:
@@ -597,6 +705,16 @@ def write_report(
total = len(records)
d1_records = [record for record in records if record["d1_return_pct"] is not None]
structured_t1 = [record for record in records if record["has_structured_t1"]]
structured_t0_5 = [record for record in records if record["t0_5_margin_subscription_multiple"] is not None]
t0_5_with_d1 = [record for record in structured_t0_5 if record["d1_return_pct"] is not None]
external_history_rows = [record for record in records if record["external_history_source_path"]]
external_oversub_rows = [record for record in records if record["external_public_oversubscription_times"] is not None]
external_oversub_with_d1 = [
record
for record in records
if record["external_public_oversubscription_times"] is not None and record["d1_return_pct"] is not None
]
external_oversub_metrics = calibration(records, "external_public_oversubscription_bucket")
pending_t1_tickers = ", ".join(sorted(record["ticker"] for record in records if not record["has_structured_t1"]))
t1_public_os_missing = sum(record["public_oversubscription_times"] is None for record in structured_t1)
t1_international_os_missing = sum(record["international_oversubscription_times"] is None for record in structured_t1)
@@ -628,6 +746,11 @@ def write_report(
f"- Rows with offer size: {count_present(records, 'offer_size_hkd_m')}",
f"- Rows with public oversubscription: {count_present(records, 'public_oversubscription_times')}",
f"- Rows with international oversubscription: {count_present(records, 'international_oversubscription_times')}",
f"- Rows with T0.5 margin heat snapshots: {len(structured_t0_5)}",
f"- Rows with T0.5 margin heat and D1 labels: {len(t0_5_with_d1)}",
f"- Rows matched to external ipohk history: {len(external_history_rows)}",
f"- Rows with external final oversubscription: {len(external_oversub_rows)}",
f"- Rows with external final oversubscription and D1 labels: {len(external_oversub_with_d1)}",
f"- Rows pending T1 structure: {total - len(structured_t1)}"
+ (f" ({pending_t1_tickers})" if pending_t1_tickers else ""),
f"- T1 field-level blanks: public oversubscription {t1_public_os_missing}, international oversubscription {t1_international_os_missing}, valid applications {t1_valid_missing}, successful applications {t1_successful_missing}",
@@ -644,6 +767,23 @@ def write_report(
"",
metrics_table(total_metrics),
"",
"## T0.5 Market Heat",
"",
"T0.5 uses archived subscription-period margin heat snapshots. These are non-official live signals and are kept separate from T1 allotment demand. The current archive is not yet a historical training set: it has too few rows and no D1 labels for calibration.",
"",
f"- T0.5 margin rows: {len(structured_t0_5)}",
f"- T0.5 rows with D1 labels: {len(t0_5_with_d1)}",
"",
"## External Final Heat Proxy",
"",
"The ipohk history archive adds final public oversubscription, one-lot win rate, grey-market return, and first-day return where available. These fields are useful for coverage checks and post-hoc calibration, but they are not T0.5 inputs because they are final or near-final history.",
"",
f"- External history rows matched into this dataset: {len(external_history_rows)}",
f"- Matched rows with final oversubscription: {len(external_oversub_rows)}",
f"- Matched rows with final oversubscription and D1 labels: {len(external_oversub_with_d1)}",
"",
metrics_table(external_oversub_metrics),
"",
"## Current Read",
"",
f"After the T1 demand text backfill, the strongest v0 T1 bucket is `{best_bucket.bucket}` with {best_bucket.sample_size} historical D1 observations and a {fmt_pct(best_bucket.d1_positive_rate)} D1 positive rate. The model is most useful after allotment results are available; T0 is a watchlist filter rather than a final subscription call.",