Add PDF text extraction workflow
Request:
- Provide a way to install or develop a PDF extraction tool for archived HK IPO documents.

Changes:
- Add requirements.txt with pypdf as the lightweight PDF text extraction dependency.
- Add scripts/extract_pdf_text.py to extract text from PDF source_refs into repo-relative data/extracted_text files.
- Add extracted text outputs and an extracted_text_manifest snapshot for the six archived HKEXnews PDFs.
- Document the extraction workflow in README.md.
- Ignore .venv and keep generated SQLite/Python transient files out of git.
- Use extracted text to verify the 06106 full prospectus, update source_refs, remove the related data gap, and fill 06106 offering terms.

Verification:
- Installed python3.14-venv system support, created a local .venv, and installed requirements.txt.
- Re-ran scripts/bootstrap_historical_data.py and scripts/extract_pdf_text.py.
- Verified extracted text paths and hashes against data/snapshots/extracted_text_manifest.csv.
- Verified SQLite integrity and snapshot row counts.
- Ran git diff --cached --check and searched durable files for machine-specific absolute paths.
This commit is contained in:
@@ -13,6 +13,9 @@ ARCHIVE_AS_OF = "2026-06-15T06:15:00Z"
|
||||
DB_PATH = Path("data/hk_ipo.sqlite")
|
||||
SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
|
||||
SNAPSHOT_DIR = Path("data/snapshots")
|
||||
STALE_GAP_IDS = [
|
||||
"06106_full_prospectus_classification_2026_06_15",
|
||||
]
|
||||
|
||||
|
||||
IPO_MASTER = [
|
||||
@@ -102,7 +105,7 @@ OFFERING_TERMS = [
|
||||
},
|
||||
{
|
||||
"ticker": "06106",
|
||||
"source_id": "06106_prospectus_notice_2026_06_15",
|
||||
"source_id": "06106_prospectus_candidate_2026_06_15",
|
||||
"prospectus_date": "2026-06-15",
|
||||
"offer_price_hkd": 101.60,
|
||||
"board_lot": 50,
|
||||
@@ -113,10 +116,10 @@ OFFERING_TERMS = [
|
||||
"public_offer_pct_initial": 0.05,
|
||||
"over_allotment_offer_shares": 1574550,
|
||||
"offer_size_adjustment_offer_shares": 1574550,
|
||||
"market_cap_hkd_m": None,
|
||||
"gross_proceeds_hkd_m": None,
|
||||
"net_proceeds_hkd_m": None,
|
||||
"issued_shares_upon_listing": None,
|
||||
"market_cap_hkd_m": 11226.52568,
|
||||
"gross_proceeds_hkd_m": 1066.52568,
|
||||
"net_proceeds_hkd_m": 995.4,
|
||||
"issued_shares_upon_listing": 110497300,
|
||||
"data_as_of": ARCHIVE_AS_OF,
|
||||
},
|
||||
]
|
||||
@@ -195,12 +198,12 @@ SOURCES = [
|
||||
{
|
||||
"source_id": "06106_prospectus_candidate_2026_06_15",
|
||||
"ticker": "06106",
|
||||
"source_type": "prospectus_candidate_pending_verification",
|
||||
"title": "Shanghai Seer Intelligent Technology Co., Ltd. Prospectus Candidate",
|
||||
"source_type": "prospectus",
|
||||
"title": "Shanghai Seer Intelligent Technology Co., Ltd. Prospectus",
|
||||
"local_path": "data/raw/06106/prospectus_candidate_2026-06-15.pdf",
|
||||
"url": "https://www1.hkexnews.hk/listedco/listconews/sehk/2026/0615/2026061500013.pdf",
|
||||
"source_date": "2026-06-15",
|
||||
"notes": "Downloaded from HKEXnews; document role should be verified before using for detailed fact extraction.",
|
||||
"notes": "HKEXnews prospectus; verified by text extraction as a 424-page GLOBAL OFFERING document.",
|
||||
},
|
||||
]
|
||||
|
||||
@@ -226,16 +229,6 @@ DATA_GAPS = [
|
||||
"created_at": ARCHIVE_AS_OF,
|
||||
"notes": "Update after the HKEXnews allotment results announcement is published.",
|
||||
},
|
||||
{
|
||||
"gap_id": "06106_full_prospectus_classification_2026_06_15",
|
||||
"ticker": "06106",
|
||||
"stage": "T0_prospectus",
|
||||
"field_name": "full_prospectus_local_path",
|
||||
"reason": "The archived 2026061500011 PDF is an offering announcement/notice; the separately archived 2026061500013 PDF needs document-role verification before detailed extraction.",
|
||||
"expected_resolution_date": None,
|
||||
"created_at": ARCHIVE_AS_OF,
|
||||
"notes": "Keep both official files in raw archive until classification is confirmed.",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -278,7 +271,7 @@ def export_snapshot(conn: sqlite3.Connection, table: str) -> None:
|
||||
cursor = conn.execute(f"SELECT * FROM {table} ORDER BY 1")
|
||||
columns = [description[0] for description in cursor.description]
|
||||
with (SNAPSHOT_DIR / f"{table}.csv").open("w", newline="", encoding="utf-8") as handle:
|
||||
writer = csv.writer(handle)
|
||||
writer = csv.writer(handle, lineterminator="\n")
|
||||
writer.writerow(columns)
|
||||
writer.writerows(cursor.fetchall())
|
||||
|
||||
@@ -287,6 +280,7 @@ def main() -> None:
|
||||
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
with sqlite3.connect(DB_PATH) as conn:
|
||||
conn.executescript(SCHEMA_PATH.read_text(encoding="utf-8"))
|
||||
conn.executemany("DELETE FROM data_gaps WHERE gap_id = ?", [(gap_id,) for gap_id in STALE_GAP_IDS])
|
||||
upsert_rows(conn, "ipo_master", IPO_MASTER)
|
||||
upsert_rows(conn, "offering_terms", OFFERING_TERMS)
|
||||
upsert_rows(conn, "ipo_demand", IPO_DEMAND)
|
||||
|
||||
Reference in New Issue
Block a user