Add A/H share-class mapping workflow

Request:
- Add a repeatable mechanism so HK IPO reports detect issuers that already have Mainland A shares.
- Include a third internet/official-exchange cross-check layer beyond structured history and prospectus scans.

Changes:
- Add listed_share_classes schema support for same-issuer A-share mappings and evidence links.
- Add scripts/archive_a_share_mappings.py to scan prospectus extracted text, reject sponsor/portfolio/cornerstone false positives, archive optional official web evidence and A-share/FX quote evidence, and export snapshots on write.
- Surface a_share_* fields in the analysis dataset and single-ticker report output.
- Update hk-ipo analyst/archivist skill rules and scheduled refresh prompt to require the three-layer A/H mapping check.

Verification:
- python3 -m py_compile scripts/archive_a_share_mappings.py scripts/build_analysis_dataset.py scripts/generate_ipo_report.py
- .venv/bin/python scripts/archive_a_share_mappings.py --as-of 2026-06-24T00:00:00Z --tickers 00668,01688,03661,09630 --dry-run
- .venv/bin/python scripts/build_analysis_dataset.py --db /tmp/hk_ipo_ah_dataset_test.sqlite --dataset /tmp/hk_ipo_ah_dataset_test.csv --report /tmp/hk_ipo_ah_model_test.md --as-of 2026-06-24T00:00:00Z
- .venv/bin/python scripts/generate_ipo_report.py 09630 --dataset /tmp/hk_ipo_ah_dataset_test.csv --stdout --as-of 2026-06-24T00:00:00Z
- git diff --check

Next useful context:
- Dry-run detected 00668->300866.SZ, 01688->002600.SZ, 03661->300661.SZ, and 09630->688630.SH.
- A false positive 01688->300476.SZ from a cornerstone investor parent was rejected by the issuer-context filter.
This commit is contained in:
2026-06-24 07:21:21 +00:00
parent d3b67fa473
commit 7cbdd533b0
7 changed files with 710 additions and 0 deletions
+567
View File
@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""Detect and archive A/H or onshore share-class mappings from prospectus text."""
from __future__ import annotations
import argparse
import csv
import hashlib
import html
import re
import sqlite3
import sys
from dataclasses import dataclass
from datetime import date, datetime, timedelta, timezone
from pathlib import Path
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
# Default repo-relative locations of the SQLite archive and its schema script;
# both can be overridden on the command line (--db / --schema).
DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite")
DEFAULT_SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
# Directory that receives the CSV table snapshots written by export_snapshot().
SNAPSHOT_DIR = Path("data/snapshots")
# CSV manifest read by load_manifest(): maps source_id to an extracted-text path.
TEXT_MANIFEST = SNAPSHOT_DIR / "extracted_text_manifest.csv"
# Directories for raw archived evidence: Yahoo chart responses and exchange web pages.
RAW_QUOTE_DIR = Path("data/raw/a_share_quotes")
RAW_WEB_DIR = Path("data/raw/a_share_mapping_web")
# Base endpoint used by yahoo_chart_url() to build chart requests.
YAHOO_CHART_BASE = "https://query1.finance.yahoo.com/v8/finance/chart"
@dataclass(frozen=True)
class ProspectusText:
    """Extracted prospectus text for one HK ticker, plus where it came from."""

    ticker: str  # HK ticker taken from source_refs.ticker
    source_id: str  # source_refs.source_id of the prospectus row
    local_path: str  # source_refs.local_path of the prospectus row
    text_path: Path  # extracted-text file resolved through the text manifest
    text: str  # full contents of text_path
@dataclass(frozen=True)
class ShareClassMapping:
    """One detected link from an HK ticker to a same-issuer onshore listing."""

    ticker: str  # HK ticker whose prospectus produced the match
    related_ticker: str  # onshore code with a ".SH" or ".SZ" suffix
    exchange: str  # "SSE" or "SZSE", as chosen by exchange_from_context()
    board: str | None  # "STAR Market", "ChiNext", or None when not mentioned
    company_name: str | None  # name parsed from the evidence context, if any
    listed_date: str | None  # ISO date parsed from the evidence context, if any
    prospectus_source_id: str  # source_id of the prospectus that was scanned
    evidence_text: str  # cleaned context around the matched code (at most 700 chars)
    confidence: str  # "high" or "medium", assigned by detect_mappings()
def parse_args() -> argparse.Namespace:
    """Build the command-line parser for this script and parse ``sys.argv``."""
    cli = argparse.ArgumentParser(description=__doc__)

    # Options that carry a value, registered in the order shown by --help.
    value_options = (
        ("--db", {"default": str(DEFAULT_DB_PATH), "help": "Repo-relative SQLite database path."}),
        ("--schema", {"default": str(DEFAULT_SCHEMA_PATH), "help": "Repo-relative schema path."}),
        ("--as-of", {"help": "Archive timestamp. Defaults to current UTC time."}),
        ("--tickers", {"help": "Comma-separated HK tickers to scan. Defaults to current prospectus rows."}),
    )
    for flag, options in value_options:
        cli.add_argument(flag, **options)

    # Boolean switches; each defaults to False.
    switches = (
        ("--archive-quotes", "Archive Yahoo A-share and HKD/CNY chart evidence."),
        ("--web-cross-check", "Archive supported public web cross-check pages."),
        ("--dry-run", "Print detected mappings without writing DB or files."),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    return cli.parse_args()
def parse_as_of(value: str | None) -> str:
    """Return the archive timestamp as a UTC ISO-8601 string ending in ``Z``.

    Args:
        value: ISO-8601 timestamp from ``--as-of``; a trailing ``Z`` is
            accepted. ``None`` or an empty string means "now".

    Returns:
        The timestamp normalized to UTC. A value without an offset is taken
        to be UTC already; a value with another offset is converted, so the
        result never carries a ``+HH:MM`` suffix into file names or IDs.

    Raises:
        ValueError: if ``value`` is not a valid ISO-8601 timestamp.
    """
    if value:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # The CLI documents UTC as the reference clock, so assume it.
            parsed = parsed.replace(tzinfo=timezone.utc)
        moment = parsed.astimezone(timezone.utc)
    else:
        moment = datetime.now(timezone.utc).replace(microsecond=0)
    return moment.isoformat().replace("+00:00", "Z")
def compact_timestamp(value: str) -> str:
    """Return ``value`` without ``-`` and ``:`` separators, for file names and IDs.

    A ``+00:00`` UTC offset is collapsed to ``Z``. That replacement must run
    before the colons are stripped, otherwise ``+00:00`` can never match.
    """
    return value.replace("+00:00", "Z").replace("-", "").replace(":", "")
def source_date(value: str) -> str:
    """Return the calendar date (``YYYY-MM-DD``) of an ISO-8601 timestamp."""
    # fromisoformat() is given an explicit offset instead of a trailing "Z".
    normalized = value.replace("Z", "+00:00")
    moment = datetime.fromisoformat(normalized)
    return moment.date().isoformat()
def selected_tickers(value: str | None) -> set[str] | None:
    """Parse the ``--tickers`` option into a set of zero-padded five-character tickers.

    Returns ``None`` when no value was given. Blank items between commas are
    ignored.
    """
    if not value:
        return None
    padded: set[str] = set()
    for piece in value.split(","):
        token = piece.strip()
        if token:
            padded.add(token.zfill(5))
    return padded
def load_manifest() -> dict[str, Path]:
    """Read the extracted-text manifest into a ``source_id -> text path`` mapping.

    Returns an empty mapping when the manifest file does not exist. When a
    ``source_id`` appears more than once, the last row wins.
    """
    if not TEXT_MANIFEST.exists():
        return {}
    lookup: dict[str, Path] = {}
    with TEXT_MANIFEST.open(newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            lookup[record["source_id"]] = Path(record["text_local_path"])
    return lookup
def load_prospectus_texts(conn: sqlite3.Connection, tickers: set[str] | None) -> list[ProspectusText]:
    """Load one extracted prospectus text per ticker.

    Args:
        conn: Open database connection. Rows are read by column name, so the
            connection must use ``sqlite3.Row`` as its row factory.
        tickers: Tickers to restrict the scan to; a falsy value scans every
            prospectus row.

    Returns:
        For each ticker, the first prospectus row (newest ``source_date``
        first, then highest ``source_id``) whose extracted text is listed in
        the manifest and is present on disk as a regular file.
    """
    ticker_filter = ""
    params: list[object] = []
    if tickers:
        # Only "?" placeholders are interpolated; the values are bound.
        placeholders = ",".join("?" for _ in tickers)
        ticker_filter = f"AND s.ticker IN ({placeholders})"
        params.extend(sorted(tickers))
    rows = conn.execute(
        f"""
        SELECT s.ticker, s.source_id, s.local_path
        FROM source_refs s
        WHERE s.source_type = 'prospectus'
        {ticker_filter}
        ORDER BY s.ticker, s.source_date DESC, s.source_id DESC
        """,
        params,
    ).fetchall()
    manifest = load_manifest()
    texts: list[ProspectusText] = []
    seen: set[str] = set()
    for row in rows:
        ticker = row["ticker"]
        if ticker in seen:
            continue
        text_path = manifest.get(row["source_id"])
        # is_file() rather than exists(): a blank manifest cell becomes
        # Path("."), which exists but cannot be read as text.
        if text_path is None or not text_path.is_file():
            continue
        texts.append(
            ProspectusText(
                ticker=ticker,
                source_id=row["source_id"],
                local_path=row["local_path"],
                text_path=text_path,
                text=text_path.read_text(encoding="utf-8", errors="replace"),
            )
        )
        # Marked only after a readable text was found, so an older prospectus
        # can still be used when the newest one has no extracted text.
        seen.add(ticker)
    return texts
# Candidate onshore stock codes: six digits starting with 0, 3 or 6, optionally
# preceded by "stock code" (with an optional colon), optionally wrapped in
# parentheses, and optionally followed by a ".SH", ".SZ" or ".SS" suffix.
# Matching is case-insensitive. Group 1 is the code, group 2 the suffix (or None).
CODE_RE = re.compile(
    r"(?:stock\s+code\s*[:]?\s*)?\(?\b([036]\d{5})(?:\.(SH|SZ|SS))?\b\)?",
    flags=re.I,
)
def clean_context(value: str) -> str:
    """Decode HTML entities and collapse every whitespace run to a single space."""
    decoded = html.unescape(value)
    words = decoded.split()
    return " ".join(words)
def has_issuer_context(context: str) -> bool:
    """Decide whether ``context`` describes the issuer's own A shares.

    The context is rejected when it mentions cornerstone investors, portfolio
    companies or similar third parties, when it mentions a sponsor without
    referring to the company, or when "A shares of which" appears without a
    quoted definition of the company. Otherwise it is accepted only if it
    contains one of the issuer phrases.

    Typographic apostrophes are normalized to ``'`` before matching, so
    "the Company’s A Shares" is recognised like "the Company's A Shares".
    """
    lowered = context.lower().replace("\u2019", "'").replace("\u2018", "'")

    excluded_phrases = (
        "cornerstone investment",
        "cornerstone investor",
        "portfolio companies",
        "portfolio company",
        "will subscribe for and hold",
        "wholly owned by",
    )
    if any(phrase in lowered for phrase in excluded_phrases):
        return False

    # A sponsor mention without any reference to the company is treated as
    # describing the sponsor's own listing.
    if "sponsor" in lowered and "our company" not in lowered and "the company" not in lowered:
        return False

    # "... the A shares of which ..." counts only when the surrounding text
    # contains a quoted definition of the company.
    if "a shares of which" in lowered and not re.search(
        r"[\"“]\s*(?:company|our company|the company)\s*[\"”]",
        context,
        flags=re.I,
    ):
        return False

    issuer_phrases = (
        "the a shares of which",
        "a shares of which",
        "our a shares",
        "the company's a shares",
        "the companys a shares",
        "our company has been listed",
        "our company became listed",
        "our company was listed",
        "we completed the listing of our a shares",
        "prior to the listing, our share capital comprises entirely a shares",
        "a shares listed on",
    )
    return any(phrase in lowered for phrase in issuer_phrases)
def exchange_from_context(code: str, suffix: str | None, context: str) -> tuple[str, str | None, str]:
    """Resolve exchange, board and suffixed ticker for an onshore stock code.

    The exchange is chosen from the most reliable signal available:

    1. an explicit ticker suffix (``SH``/``SS`` -> SSE, ``SZ`` -> SZSE);
    2. the code prefix (``6`` -> SSE, ``0``/``3`` -> SZSE);
    3. an exchange name appearing in ``context``;
    4. SZSE as the final default.

    A context window often names both exchanges, so the surrounding text must
    not override an explicit suffix or the code prefix.

    Args:
        code: Six-digit stock code.
        suffix: Ticker suffix captured next to the code, if any (any case).
        context: Text surrounding the code.

    Returns:
        ``(exchange, board, ticker)``. ``board`` is ``"STAR Market"``,
        ``"ChiNext"`` or ``None``, depending on what ``context`` mentions.
        ``ticker`` is ``code`` plus ``.SH`` or ``.SZ``.
    """
    lowered = context.lower()
    suffix = (suffix or "").upper()

    if suffix in {"SH", "SS"}:
        exchange = "SSE"
    elif suffix == "SZ":
        exchange = "SZSE"
    elif code.startswith("6"):
        exchange = "SSE"
    elif code.startswith(("0", "3")):
        exchange = "SZSE"
    elif "shanghai stock exchange" in lowered or "上海证券交易所" in context:
        exchange = "SSE"
    else:
        # Covers an explicit Shenzhen mention as well as the no-signal case.
        exchange = "SZSE"
    ticker = f"{code}.SH" if exchange == "SSE" else f"{code}.SZ"

    board = None
    if "star market" in lowered or "science and technology innovation board" in lowered:
        board = "STAR Market"
    elif "chinext" in lowered:
        board = "ChiNext"
    return exchange, board, ticker
def company_name_from_context(context: str) -> str | None:
    """Extract a company name from ``context``, or return ``None``.

    Two patterns are tried in order: the text following a quoted
    "Company" / "the Company" definition up to the next comma, then a
    capitalised name containing "Co.", "Company", "Corp" or "Inc.".
    """
    patterns = (
        (r"[\"“](?:the\s+Company|Company)[\"”]\s+([^,]+),", re.I),
        (r"([A-Z][A-Za-z0-9&.,'() -]+(?:Co\.|Company|Corp|Inc\.)[^,]*)", 0),
    )
    for pattern, flags in patterns:
        found = re.search(pattern, context, flags=flags)
        if found:
            return clean_context(found.group(1))
    return None
def listed_date_from_context(context: str) -> str | None:
    """Extract a listing date that follows "since" or "on" in ``context``.

    Month-first dates ("since August 12, 2020") are searched for first. Only
    when none is found are day-first dates ("on 12 August 2020") considered.
    The keyword must be a whole word, so the "on" at the end of
    "Corporation" does not count.

    Returns:
        The date as ``YYYY-MM-DD``, or ``None`` when no date is found or the
        first match is not a valid calendar date with a full English month
        name.
    """
    patterns = (
        r"\b(?:since|on)\s+(?P<month>[A-Z][a-z]+)\s+(?P<day>\d{1,2}),\s+(?P<year>\d{4})",
        r"\b(?:since|on)\s+(?P<day>\d{1,2})\s+(?P<month>[A-Z][a-z]+),?\s+(?P<year>\d{4})",
    )
    for pattern in patterns:
        match = re.search(pattern, context)
        if not match:
            continue
        stamp = f"{match.group('month')} {match.group('day')} {match.group('year')}"
        try:
            return datetime.strptime(stamp, "%B %d %Y").date().isoformat()
        except ValueError:
            # Abbreviated month name or impossible day: no usable date.
            return None
    return None
def detect_mappings(item: ProspectusText) -> list[ShareClassMapping]:
    """Scan one prospectus text for same-issuer onshore share-class mappings.

    Every stock-code match is judged on a window of up to 500 characters on
    either side. Matches that pass ``has_issuer_context`` become candidates,
    keyed by their suffixed onshore ticker.

    When the same ticker is found again, the two records are merged:

    * a high-confidence candidate replaces a non-high record and inherits any
      optional fields the candidate lacks;
    * otherwise the existing record is kept and only its missing optional
      fields (board, company name, listed date) are filled from the candidate.

    The evidence text therefore always comes from the context that justified
    the stored confidence.

    Returns:
        One mapping per distinct onshore ticker, in first-seen order.
    """
    mappings: dict[str, ShareClassMapping] = {}
    for match in CODE_RE.finditer(item.text):
        code, suffix = match.group(1), match.group(2)
        start = max(0, match.start() - 500)
        end = min(len(item.text), match.end() + 500)
        context = clean_context(item.text[start:end])
        if not has_issuer_context(context):
            continue
        exchange, board, related_ticker = exchange_from_context(code, suffix, context)
        lowered = context.lower()
        confidence = "high" if "a shares of which" in lowered or "our a shares" in lowered else "medium"
        candidate = ShareClassMapping(
            ticker=item.ticker,
            related_ticker=related_ticker,
            exchange=exchange,
            board=board,
            company_name=company_name_from_context(context),
            listed_date=listed_date_from_context(context),
            prospectus_source_id=item.source_id,
            evidence_text=context[:700],
            confidence=confidence,
        )
        existing = mappings.get(related_ticker)
        if existing is None:
            mappings[related_ticker] = candidate
            continue
        stronger = existing.confidence != "high" and candidate.confidence == "high"
        more_complete = bool(
            (not existing.board and candidate.board)
            or (not existing.company_name and candidate.company_name)
            or (not existing.listed_date and candidate.listed_date)
        )
        if not (stronger or more_complete):
            continue
        # The primary record supplies evidence and confidence; the secondary
        # one only fills gaps in the optional fields.
        primary, secondary = (candidate, existing) if stronger else (existing, candidate)
        mappings[related_ticker] = ShareClassMapping(
            ticker=primary.ticker,
            related_ticker=primary.related_ticker,
            exchange=primary.exchange,
            board=primary.board or secondary.board,
            company_name=primary.company_name or secondary.company_name,
            listed_date=primary.listed_date or secondary.listed_date,
            prospectus_source_id=primary.prospectus_source_id,
            evidence_text=primary.evidence_text,
            confidence=primary.confidence,
        )
    return list(mappings.values())
def fetch_bytes(url: str) -> bytes:
    """Fetch ``url`` with a browser-like User-Agent and return the raw body.

    The request uses a 60-second timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = urlopen(Request(url, headers=headers), timeout=60)
    with response:
        return response.read()
def sha256_bytes(payload: bytes) -> str:
    """Return the SHA-256 digest of ``payload`` as lowercase hex."""
    digest = hashlib.sha256()
    digest.update(payload)
    return digest.hexdigest()
def epoch(day: date) -> int:
    """Return the Unix timestamp of midnight UTC on ``day``."""
    # Whole days since 1970-01-01, times the number of seconds in a day.
    days_since_epoch = day.toordinal() - date(1970, 1, 1).toordinal()
    return days_since_epoch * 86400
def yahoo_symbol(related_ticker: str) -> str:
    """Convert a ``CODE.SH`` / ``CODE.SZ`` ticker into its Yahoo Finance symbol.

    An ``SH`` suffix becomes ``.SS``; every other suffix becomes ``.SZ``.
    """
    code, suffix = related_ticker.split(".", 1)
    if suffix == "SH":
        return code + ".SS"
    return code + ".SZ"
def yahoo_chart_url(symbol: str, start: date, end: date) -> str:
    """Build the Yahoo chart URL for daily bars from ``start`` through ``end``."""
    # period2 is set to midnight of the day after ``end``.
    day_after_end = end + timedelta(days=1)
    query = urlencode(
        [
            ("period1", epoch(start)),
            ("period2", epoch(day_after_end)),
            ("interval", "1d"),
            ("events", "history"),
            ("includeAdjustedClose", "true"),
        ]
    )
    return YAHOO_CHART_BASE + "/" + symbol + "?" + query
def source_row(
    source_id: str,
    ticker: str,
    source_type: str,
    title: str,
    local_path: str,
    url: str,
    payload: bytes,
    as_of: str,
    notes: str,
) -> dict[str, object]:
    """Assemble the named parameters for one ``source_refs`` upsert.

    ``file_sha256`` is computed from ``payload``; ``source_date`` is the
    calendar date of ``as_of``; ``path_base`` is always ``"repo_root"``.
    """
    checksum = sha256_bytes(payload)
    archive_day = source_date(as_of)
    return dict(
        source_id=source_id,
        ticker=ticker,
        source_type=source_type,
        title=title,
        path_base="repo_root",
        local_path=local_path,
        url=url,
        file_sha256=checksum,
        source_date=archive_day,
        archived_at=as_of,
        notes=notes,
    )
def archive_quote_sources(mappings: list[ShareClassMapping], as_of: str) -> list[dict[str, object]]:
    """Archive Yahoo chart responses for each mapped A share and for HKD/CNY.

    For every mapping, the 30 days of daily bars ending on the ``as_of`` date
    are fetched and written under ``RAW_QUOTE_DIR``. The HKD/CNY chart is
    fetched once and referenced by one ``source_refs`` row per HK ticker.

    Fetch failures are reported on stderr and skipped, so a network problem
    never aborts the archive run.

    Returns:
        ``source_refs`` rows for everything archived. FX rows are emitted
        once per HK ticker even when a ticker has several mappings, so no
        two rows share a ``source_id``.
    """
    if not mappings:
        return []
    RAW_QUOTE_DIR.mkdir(parents=True, exist_ok=True)
    as_of_date = datetime.fromisoformat(as_of.replace("Z", "+00:00")).date()
    start = as_of_date - timedelta(days=30)
    compact = compact_timestamp(as_of)
    rows: list[dict[str, object]] = []
    for mapping in mappings:
        symbol = yahoo_symbol(mapping.related_ticker)
        slug = mapping.related_ticker.lower().replace(".", "_")
        url = yahoo_chart_url(symbol, start, as_of_date)
        try:
            payload = fetch_bytes(url)
        except (HTTPError, URLError, TimeoutError, OSError) as exc:
            print(f"warning: quote archive failed for {mapping.related_ticker}: {exc}", file=sys.stderr)
            continue
        path = RAW_QUOTE_DIR / f"{slug}_yahoo_chart_{compact}.json"
        # Skip the write when an identical payload is already on disk.
        if not path.exists() or path.read_bytes() != payload:
            path.write_bytes(payload)
        rows.append(
            source_row(
                f"{mapping.ticker}_a_share_yahoo_chart_{slug}_{compact}",
                mapping.ticker,
                "a_share_price_history",
                f"Yahoo Finance daily chart for {mapping.related_ticker} A shares",
                path.as_posix(),
                url,
                payload,
                as_of,
                "Raw Yahoo Finance chart response archived for A/H dual-listed valuation overlay.",
            )
        )
    fx_symbol = "HKDCNY=X"
    fx_url = yahoo_chart_url(fx_symbol, start, as_of_date)
    try:
        fx_payload = fetch_bytes(fx_url)
    except (HTTPError, URLError, TimeoutError, OSError) as exc:
        print(f"warning: FX archive failed for {fx_symbol}: {exc}", file=sys.stderr)
        return rows
    fx_path = RAW_QUOTE_DIR / f"hkdcny_x_yahoo_chart_{compact}.json"
    if not fx_path.exists() or fx_path.read_bytes() != fx_payload:
        fx_path.write_bytes(fx_payload)
    # The FX source_id depends only on the HK ticker, so emit it once per ticker.
    fx_tickers: set[str] = set()
    for mapping in mappings:
        if mapping.ticker in fx_tickers:
            continue
        fx_tickers.add(mapping.ticker)
        rows.append(
            source_row(
                f"{mapping.ticker}_fx_hkdcny_yahoo_chart_{compact}",
                mapping.ticker,
                "fx_price_history",
                "Yahoo Finance daily chart for HKD/CNY exchange rate",
                fx_path.as_posix(),
                fx_url,
                fx_payload,
                as_of,
                "Raw Yahoo Finance chart response archived to convert H-share offer prices into RMB for A/H discount checks.",
            )
        )
    return rows
def official_web_url(mapping: ShareClassMapping) -> str | None:
    """Return the exchange company-page URL for ``mapping``, if one is supported.

    SSE listings use the STAR Market page when the board is "STAR Market" and
    the general SSE company page otherwise. SZSE listings use the SZSE lookup
    page. Any other exchange yields ``None``.
    """
    code, _, _ = mapping.related_ticker.partition(".")
    if mapping.exchange == "SSE":
        if mapping.board == "STAR Market":
            return f"https://www.sse.com.cn/star/market/stocklist/info/company/index.shtml?COMPANY_CODE={code}"
        return f"https://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE={code}"
    if mapping.exchange == "SZSE":
        return f"http://www.szse.cn/English/siteMarketData/siteMarketDatas/lookup/index.html?code={code}"
    return None
def archive_web_sources(mappings: list[ShareClassMapping], as_of: str) -> tuple[list[dict[str, object]], dict[str, str]]:
    """Archive the exchange company page for each mapping as supporting evidence.

    Mappings without a supported URL are skipped. Fetch failures are reported
    on stderr and skipped.

    Returns:
        ``(rows, source_ids)``: the ``source_refs`` rows for the archived
        pages, and a ``"<ticker>|<related_ticker>" -> source_id`` lookup.
    """
    RAW_WEB_DIR.mkdir(parents=True, exist_ok=True)
    stamp = compact_timestamp(as_of)
    archived: list[dict[str, object]] = []
    id_by_pair: dict[str, str] = {}
    for mapping in mappings:
        page_url = official_web_url(mapping)
        if not page_url:
            continue
        try:
            body = fetch_bytes(page_url)
        except (HTTPError, URLError, TimeoutError, OSError) as exc:
            print(f"warning: web cross-check failed for {mapping.related_ticker}: {exc}", file=sys.stderr)
            continue
        slug = mapping.related_ticker.lower().replace(".", "_")
        target = RAW_WEB_DIR / f"{mapping.ticker}_{slug}_official_{stamp}.html"
        # Rewrite the file only when its content would change.
        already_archived = target.exists() and target.read_bytes() == body
        if not already_archived:
            target.write_bytes(body)
        source_id = f"{mapping.ticker}_a_share_mapping_web_{slug}_{stamp}"
        archived.append(
            source_row(
                source_id,
                mapping.ticker,
                "a_share_mapping_web_evidence",
                f"Official exchange company page for {mapping.related_ticker}",
                target.as_posix(),
                page_url,
                body,
                as_of,
                "Public internet cross-check for A/H share-class mapping. Prospectus remains the primary source.",
            )
        )
        id_by_pair[mapping.ticker + "|" + mapping.related_ticker] = source_id
    return archived, id_by_pair
def upsert_source_refs(conn: sqlite3.Connection, rows: list[dict[str, object]]) -> None:
    """Insert ``rows`` into ``source_refs``, updating rows whose ``source_id`` exists.

    On conflict every column except ``source_id`` and ``ticker`` is
    overwritten. Nothing is executed for an empty ``rows`` list.
    """
    statement = """
        INSERT INTO source_refs (
            source_id, ticker, source_type, title, path_base, local_path, url,
            file_sha256, source_date, archived_at, notes
        )
        VALUES (
            :source_id, :ticker, :source_type, :title, :path_base, :local_path, :url,
            :file_sha256, :source_date, :archived_at, :notes
        )
        ON CONFLICT(source_id) DO UPDATE SET
            source_type = excluded.source_type,
            title = excluded.title,
            path_base = excluded.path_base,
            local_path = excluded.local_path,
            url = excluded.url,
            file_sha256 = excluded.file_sha256,
            source_date = excluded.source_date,
            archived_at = excluded.archived_at,
            notes = excluded.notes
        """
    if rows:
        conn.executemany(statement, rows)
def upsert_mappings(
    conn: sqlite3.Connection,
    mappings: list[ShareClassMapping],
    web_source_ids: dict[str, str],
    as_of: str,
) -> None:
    """Insert or refresh ``listed_share_classes`` rows for the detected mappings.

    Args:
        conn: Open database connection.
        mappings: Mappings detected from prospectus text.
        web_source_ids: ``"<ticker>|<related_ticker>" -> source_id`` lookup
            for web evidence archived in this run.
        as_of: Timestamp stored in ``data_as_of``.

    On conflict, optional fields (board, company name, listed date, web
    source) keep their stored value when this run has nothing for them.
    ``detection_method`` follows the web source that ends up stored, so a
    rerun without a web cross-check does not relabel a row that still links
    web evidence as ``prospectus_text``.
    """
    params = []
    for mapping in mappings:
        web_source_id = web_source_ids.get(mapping.ticker + "|" + mapping.related_ticker)
        params.append(
            (
                f"{mapping.ticker}_a_share_{mapping.related_ticker.lower().replace('.', '_')}",
                mapping.ticker,
                mapping.related_ticker,
                mapping.exchange,
                mapping.board,
                mapping.company_name,
                mapping.listed_date,
                "prospectus_text_plus_web" if web_source_id else "prospectus_text",
                mapping.confidence,
                mapping.prospectus_source_id,
                web_source_id,
                mapping.evidence_text,
                as_of,
                "Detected from issuer prospectus text. Internet cross-check is supporting evidence when web_source_id is present.",
            )
        )
    conn.executemany(
        """
        INSERT INTO listed_share_classes (
            share_class_id, ticker, share_class_type, related_ticker, exchange, board,
            relationship, company_name, listed_date, detection_method, confidence,
            prospectus_source_id, web_source_id, evidence_text, data_as_of, notes
        )
        VALUES (?, ?, 'A_share', ?, ?, ?, 'same_issuer', ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(ticker, share_class_type, related_ticker) DO UPDATE SET
            exchange = excluded.exchange,
            board = COALESCE(excluded.board, listed_share_classes.board),
            relationship = excluded.relationship,
            company_name = COALESCE(excluded.company_name, listed_share_classes.company_name),
            listed_date = COALESCE(excluded.listed_date, listed_share_classes.listed_date),
            detection_method = CASE
                WHEN COALESCE(excluded.web_source_id, listed_share_classes.web_source_id) IS NOT NULL
                THEN 'prospectus_text_plus_web'
                ELSE excluded.detection_method
            END,
            confidence = excluded.confidence,
            prospectus_source_id = excluded.prospectus_source_id,
            web_source_id = COALESCE(excluded.web_source_id, listed_share_classes.web_source_id),
            evidence_text = excluded.evidence_text,
            data_as_of = excluded.data_as_of,
            notes = excluded.notes
        """,
        params,
    )
def export_snapshot(conn: sqlite3.Connection, table: str, order_by: str = "1") -> None:
    """Dump ``table`` to ``SNAPSHOT_DIR/<table>.csv`` with a header row.

    ``table`` and ``order_by`` are interpolated into the SQL text, so they
    must be trusted identifiers supplied by the script itself.
    """
    SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True)
    result = conn.execute(f"SELECT * FROM {table} ORDER BY {order_by}")
    header = [column[0] for column in result.description]
    target = SNAPSHOT_DIR / f"{table}.csv"
    with target.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle, lineterminator="\n")
        out.writerow(header)
        for record in result:
            out.writerow(record)
def main() -> int:
    """Run the A/H mapping archive workflow and return the process exit code.

    With ``--dry-run`` the detected mappings are printed and nothing is
    written. Otherwise the schema script is applied, optional web and quote
    evidence is archived, the mappings are upserted, and the
    ``listed_share_classes`` and ``source_refs`` snapshots are exported.
    """
    args = parse_args()
    as_of = parse_as_of(args.as_of)
    tickers = selected_tickers(args.tickers)
    conn = sqlite3.connect(args.db)
    try:
        # ``with conn`` only commits (or rolls back on error); it does not
        # close the connection, hence the surrounding try/finally.
        with conn:
            conn.row_factory = sqlite3.Row
            if not args.dry_run:
                conn.executescript(Path(args.schema).read_text(encoding="utf-8"))
            texts = load_prospectus_texts(conn, tickers)
            mappings = [mapping for item in texts for mapping in detect_mappings(item)]
            if args.dry_run:
                for mapping in mappings:
                    print(
                        f"{mapping.ticker}: {mapping.related_ticker} {mapping.exchange} "
                        f"{mapping.board or ''} confidence={mapping.confidence} source={mapping.prospectus_source_id}"
                    )
                print(f"detected mappings: {len(mappings)}")
                return 0
            web_rows: list[dict[str, object]] = []
            web_source_ids: dict[str, str] = {}
            if args.web_cross_check:
                web_rows, web_source_ids = archive_web_sources(mappings, as_of)
            quote_rows = archive_quote_sources(mappings, as_of) if args.archive_quotes else []
            # Source rows go in before the mappings that reference them.
            upsert_source_refs(conn, web_rows + quote_rows)
            upsert_mappings(conn, mappings, web_source_ids, as_of)
            export_snapshot(conn, "listed_share_classes", "ticker, related_ticker")
            export_snapshot(conn, "source_refs", "source_id")
    finally:
        conn.close()
    print("A/H share-class mappings archived")
    print(f"as_of: {as_of}")
    print(f"prospectuses scanned: {len(texts)}")
    print(f"mappings detected: {len(mappings)}")
    print(f"web sources archived: {len(web_rows)}")
    print(f"quote/fx sources archived: {len(quote_rows)}")
    return 0
# Run the CLI when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
+47
View File
@@ -16,6 +16,7 @@ from typing import Any
MODEL_VERSION = "ipo_score_v0"
RULE_PATH = Path("rules/ipo_score_v0.yaml")
DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite")
DEFAULT_SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
DEFAULT_DATASET_PATH = Path("data/snapshots/analysis_model_v0_dataset.csv")
DEFAULT_REPORT_PATH = Path("reports/2026-06-15_analysis_model_v0.md")
@@ -33,6 +34,7 @@ class Metric:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH), help="Repo-relative schema path.")
parser.add_argument("--dataset", default=str(DEFAULT_DATASET_PATH), help="Output CSV dataset path.")
parser.add_argument("--report", default=str(DEFAULT_REPORT_PATH), help="Output Markdown report path.")
parser.add_argument("--as-of", help="Analysis timestamp. Defaults to current UTC time.")
@@ -435,6 +437,16 @@ def fetch_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]:
eh.grey_market_return_pct AS external_grey_market_return_pct,
eh.first_day_return_pct AS external_first_day_return_pct,
eh.local_path AS external_history_source_path,
ah.related_ticker AS a_share_ticker,
ah.exchange AS a_share_exchange,
ah.board AS a_share_board,
ah.relationship AS a_share_relationship,
ah.company_name AS a_share_company_name,
ah.listed_date AS a_share_listed_date,
ah.detection_method AS a_share_detection_method,
ah.confidence AS a_share_mapping_confidence,
ahp.local_path AS a_share_prospectus_source_path,
ahw.local_path AS a_share_web_source_path,
(
SELECT local_path
FROM source_refs s
@@ -456,6 +468,19 @@ def fetch_rows(conn: sqlite3.Connection) -> list[sqlite3.Row]:
LEFT JOIN performance p ON p.ticker = m.ticker
LEFT JOIN latest_market_heat h ON h.ticker = m.ticker
LEFT JOIN external_history eh ON eh.ticker = m.ticker
LEFT JOIN listed_share_classes ah
ON ah.share_class_id = (
SELECT l.share_class_id
FROM listed_share_classes l
WHERE l.ticker = m.ticker AND l.share_class_type = 'A_share'
ORDER BY
l.data_as_of DESC,
CASE l.confidence WHEN 'high' THEN 0 WHEN 'medium' THEN 1 ELSE 2 END,
l.related_ticker
LIMIT 1
)
LEFT JOIN source_refs ahp ON ahp.source_id = ah.prospectus_source_id
LEFT JOIN source_refs ahw ON ahw.source_id = ah.web_source_id
ORDER BY m.listing_date, m.ticker
"""
).fetchall()
@@ -483,6 +508,14 @@ def build_records(rows: list[sqlite3.Row], as_of: str) -> list[dict[str, Any]]:
"company_name_en": row["company_name_en"],
"company_name_zh": row["company_name_zh"],
"stock_short_name": stock_short_name,
"a_share_ticker": row["a_share_ticker"],
"a_share_exchange": row["a_share_exchange"],
"a_share_board": row["a_share_board"],
"a_share_relationship": row["a_share_relationship"],
"a_share_company_name": row["a_share_company_name"],
"a_share_listed_date": row["a_share_listed_date"],
"a_share_detection_method": row["a_share_detection_method"],
"a_share_mapping_confidence": row["a_share_mapping_confidence"],
"board": row["board"],
"status": row["status"],
"listing_date": row["listing_date"],
@@ -539,6 +572,8 @@ def build_records(rows: list[sqlite3.Row], as_of: str) -> list[dict[str, Any]]:
"external_grey_market_return_pct": row["external_grey_market_return_pct"],
"external_first_day_return_pct": row["external_first_day_return_pct"],
"external_history_source_path": row["external_history_source_path"],
"a_share_prospectus_source_path": row["a_share_prospectus_source_path"],
"a_share_web_source_path": row["a_share_web_source_path"],
"prospectus_source_path": row["prospectus_source_path"],
"allotment_source_path": row["allotment_source_path"],
}
@@ -608,6 +643,14 @@ def write_dataset(records: list[dict[str, Any]], output_path: Path) -> None:
"company_name_en",
"company_name_zh",
"stock_short_name",
"a_share_ticker",
"a_share_exchange",
"a_share_board",
"a_share_relationship",
"a_share_company_name",
"a_share_listed_date",
"a_share_detection_method",
"a_share_mapping_confidence",
"board",
"status",
"listing_date",
@@ -662,6 +705,8 @@ def write_dataset(records: list[dict[str, Any]], output_path: Path) -> None:
"external_grey_market_return_pct",
"external_first_day_return_pct",
"external_history_source_path",
"a_share_prospectus_source_path",
"a_share_web_source_path",
"prospectus_source_path",
"allotment_source_path",
"t0_score_breakdown",
@@ -846,10 +891,12 @@ def main() -> int:
args = parse_args()
as_of = parse_as_of(args.as_of)
db_path = Path(args.db)
schema_path = Path(args.schema)
dataset_path = Path(args.dataset)
report_path = Path(args.report)
with sqlite3.connect(db_path) as conn:
conn.executescript(schema_path.read_text(encoding="utf-8"))
rows = fetch_rows(conn)
records = build_records(rows, as_of)
+39
View File
@@ -320,6 +320,37 @@ def facts_table(record: dict[str, str], stage: str) -> str:
return "\n".join(lines)
def ah_overlay(record: dict[str, str]) -> str:
    """Render the A/H (onshore share-class) section of a single-ticker report.

    Returns a one-line notice when the record has no ``a_share_ticker``.
    Otherwise returns a Markdown table of the ``a_share_*`` fields followed by
    two guidance bullets. Missing evidence paths are shown as ``data_gap``.
    """
    if not record.get("a_share_ticker"):
        return "- 未识别到同一发行人的 A 股或其他内地上市股本。"

    prospectus_path = record.get("a_share_prospectus_source_path") or "data_gap"
    web_path = record.get("a_share_web_source_path") or "data_gap"

    # (label, record key) pairs rendered through fmt_value, in display order.
    formatted_fields = (
        ("A 股代码", "a_share_ticker"),
        ("交易所", "a_share_exchange"),
        ("板块", "a_share_board"),
        ("关系", "a_share_relationship"),
        ("A 股公司名", "a_share_company_name"),
        ("A 股上市日", "a_share_listed_date"),
        ("识别方法", "a_share_detection_method"),
        ("映射置信度", "a_share_mapping_confidence"),
    )
    table_rows = [(label, fmt_value(record.get(key))) for label, key in formatted_fields]
    # Evidence paths are wrapped in backticks whether present or "data_gap".
    table_rows.append(("招股书证据", f"`{prospectus_path}`"))
    table_rows.append(("互联网交叉验证", f"`{web_path}`"))

    output = ["| 字段 | 数值 |", "| --- | --- |"]
    output += [f"| {label} | {value} |" for label, value in table_rows]
    output += [
        "",
        "- 这是 A/H 或内地上市股本定价场景,不应按纯首次上市 IPO 处理。",
        "- A 股价格可作为估值锚,但 A 股和 H 股通常不能互换或直接套利;短线收益仍取决于香港侧认购热度、流动性、供给和 T2/D1 出口。",
    ]
    return "\n".join(output)
def stage_calendar_table(record: dict[str, str]) -> str:
application_start = fmt_value(record["application_start_date"])
application_end = fmt_value(record["application_end_date"])
@@ -372,6 +403,10 @@ def source_paths(record: dict[str, str], stage: str) -> list[str]:
paths.append(record["prospectus_source_path"])
if stage == T1_STAGE and record["allotment_source_path"]:
paths.append(record["allotment_source_path"])
if record.get("a_share_prospectus_source_path"):
paths.append(record["a_share_prospectus_source_path"])
if record.get("a_share_web_source_path"):
paths.append(record["a_share_web_source_path"])
return paths
@@ -455,6 +490,10 @@ def build_report(record: dict[str, str], rows: list[dict[str, str]], stage: str,
"",
facts_table(record, stage),
"",
"## A/H 或内地上市股本检查",
"",
ah_overlay(record),
"",
"## 短线退出模型推断",
"",
f"- D1 正收益概率:{fmt_pct_rate(metric.d1_positive_rate)}",
+1
View File
@@ -24,6 +24,7 @@ Goals:
- Inspect the current worktree and recent git history first.
- Refresh the latest IPO candidate universe from online sources through `hk-ipo-archivist` before analysis.
- Update all relevant fresh network facts for the latest candidate report, especially live subscription-period market heat / margin subscription multiples, official T1 allotment demand when published, prospectus documents, listing calendars, and recent D1 review data.
- Refresh A/H or other onshore share-class mappings before rebuilding the report: use the structured `listed_share_classes` archive, scan prospectus extracted text with `scripts/archive_a_share_mappings.py`, and add internet / official-exchange cross-check evidence when supported.
- Keep unofficial subscription multiples in `ipo_market_heat` with their provider and `observed_at`; do not copy them into official T1 public oversubscription fields.
- Rebuild the analysis dataset after any archive refresh.
- Produce a complete latest broad IPO candidate report for actionable subscriptions, including ranking, fundamentals, break-risk/risk-reward, per-IPO notes, closed/waiting names, recent 30-day review, guardrails, and sources.