Add PDF text extraction workflow

Request:
- Provide a way to install or develop a PDF extraction tool for archived HK IPO documents.

Changes:
- Add requirements.txt with pypdf as the lightweight PDF text extraction dependency.
- Add scripts/extract_pdf_text.py to extract text from PDF source_refs into repo-relative data/extracted_text files.
- Add extracted text outputs and an extracted_text_manifest snapshot for the six archived HKEXnews PDFs.
- Document the extraction workflow in README.md.
- Ignore .venv and keep generated SQLite/Python transient files out of git.
- Use extracted text to verify the 06106 full prospectus, update source_refs, remove the related data gap, and fill 06106 offering terms.

Verification:
- Installed python3.14-venv system support, created a local .venv, and installed requirements.txt.
- Re-ran scripts/bootstrap_historical_data.py and scripts/extract_pdf_text.py.
- Verified extracted text paths and hashes against data/snapshots/extracted_text_manifest.csv.
- Verified SQLite integrity and snapshot row counts.
- Ran git diff --cached --check and searched durable files for machine-specific absolute paths.
2026-06-15 06:21:16 +00:00
parent 7a8c648d87
commit eae427d85b
18 changed files with 65850 additions and 41 deletions
@@ -13,6 +13,9 @@ ARCHIVE_AS_OF = "2026-06-15T06:15:00Z"
 DB_PATH = Path("data/hk_ipo.sqlite")
 SCHEMA_PATH = Path("schema/hk_ipo.schema.sql")
 SNAPSHOT_DIR = Path("data/snapshots")
+# gap_id values of data_gaps rows that have been resolved and removed from
+# DATA_GAPS. main() deletes these rows from the database before the upserts run.
+# NOTE(review): presumably needed because upsert_rows never deletes rows that
+# are no longer listed - confirm against upsert_rows.
+STALE_GAP_IDS = [
+    "06106_full_prospectus_classification_2026_06_15",
+]


 IPO_MASTER = [
@@ -102,7 +105,7 @@ OFFERING_TERMS = [
    },
    {
        "ticker": "06106",
-        "source_id": "06106_prospectus_notice_2026_06_15",
+        "source_id": "06106_prospectus_candidate_2026_06_15",
        "prospectus_date": "2026-06-15",
        "offer_price_hkd": 101.60,
        "board_lot": 50,
@@ -113,10 +116,10 @@ OFFERING_TERMS = [
        "public_offer_pct_initial": 0.05,
        "over_allotment_offer_shares": 1574550,
        "offer_size_adjustment_offer_shares": 1574550,
-        "market_cap_hkd_m": None,
-        "gross_proceeds_hkd_m": None,
-        "net_proceeds_hkd_m": None,
-        "issued_shares_upon_listing": None,
+        "market_cap_hkd_m": 11226.52568,
+        "gross_proceeds_hkd_m": 1066.52568,
+        "net_proceeds_hkd_m": 995.4,
+        "issued_shares_upon_listing": 110497300,
        "data_as_of": ARCHIVE_AS_OF,
    },
 ]
@@ -195,12 +198,12 @@ SOURCES = [
    {
        "source_id": "06106_prospectus_candidate_2026_06_15",
        "ticker": "06106",
-        "source_type": "prospectus_candidate_pending_verification",
-        "title": "Shanghai Seer Intelligent Technology Co., Ltd. Prospectus Candidate",
+        "source_type": "prospectus",
+        "title": "Shanghai Seer Intelligent Technology Co., Ltd. Prospectus",
        "local_path": "data/raw/06106/prospectus_candidate_2026-06-15.pdf",
        "url": "https://www1.hkexnews.hk/listedco/listconews/sehk/2026/0615/2026061500013.pdf",
        "source_date": "2026-06-15",
-        "notes": "Downloaded from HKEXnews; document role should be verified before using for detailed fact extraction.",
+        "notes": "HKEXnews prospectus; verified by text extraction as a 424-page GLOBAL OFFERING document.",
    },
 ]

@@ -226,16 +229,6 @@ DATA_GAPS = [
        "created_at": ARCHIVE_AS_OF,
        "notes": "Update after the HKEXnews allotment results announcement is published.",
    },
-    {
-        "gap_id": "06106_full_prospectus_classification_2026_06_15",
-        "ticker": "06106",
-        "stage": "T0_prospectus",
-        "field_name": "full_prospectus_local_path",
-        "reason": "The archived 2026061500011 PDF is an offering announcement/notice; the separately archived 2026061500013 PDF needs document-role verification before detailed extraction.",
-        "expected_resolution_date": None,
-        "created_at": ARCHIVE_AS_OF,
-        "notes": "Keep both official files in raw archive until classification is confirmed.",
-    },
 ]


@@ -278,7 +271,7 @@ def export_snapshot(conn: sqlite3.Connection, table: str) -> None:
    cursor = conn.execute(f"SELECT * FROM {table} ORDER BY 1")
    columns = [description[0] for description in cursor.description]
    with (SNAPSHOT_DIR / f"{table}.csv").open("w", newline="", encoding="utf-8") as handle:
-        writer = csv.writer(handle)
+        writer = csv.writer(handle, lineterminator="\n")
        writer.writerow(columns)
        writer.writerows(cursor.fetchall())

@@ -287,6 +280,7 @@ def main() -> None:
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    with sqlite3.connect(DB_PATH) as conn:
        conn.executescript(SCHEMA_PATH.read_text(encoding="utf-8"))
+        conn.executemany("DELETE FROM data_gaps WHERE gap_id = ?", [(gap_id,) for gap_id in STALE_GAP_IDS])
        upsert_rows(conn, "ipo_master", IPO_MASTER)
        upsert_rows(conn, "offering_terms", OFFERING_TERMS)
        upsert_rows(conn, "ipo_demand", IPO_DEMAND)
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""Extract text from archived IPO PDFs into repo-relative derived text files."""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import json
+import sqlite3
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+
+
+# Repo-relative defaults for the --db, --output-root, and --manifest options.
+DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite")
+DEFAULT_OUTPUT_ROOT = Path("data/extracted_text")
+DEFAULT_MANIFEST = Path("data/snapshots/extracted_text_manifest.csv")
+
+
+# One row selected from the source_refs table by load_sources(); the field
+# names match the selected column names because rows are unpacked by keyword.
+@dataclass(frozen=True)
+class SourceDocument:
+    source_id: str
+    ticker: str  # also used as the output subdirectory name by text_output_path()
+    source_type: str
+    local_path: str  # checked with require_repo_relative() in main() before use
+    file_sha256: str | None  # when set, main() compares it with the PDF's actual SHA-256
+
+
+def repo_root() -> Path:
+    return Path.cwd()
+
+
+def require_repo_relative(relative_path: str) -> Path:
+    path = Path(relative_path)
+    if path.is_absolute() or relative_path.startswith("./") or "\\" in relative_path:
+        raise ValueError(f"Path must be repo-relative POSIX style: {relative_path}")
+    full_path = repo_root() / path
+    if not full_path.exists():
+        raise FileNotFoundError(relative_path)
+    return full_path
+
+
+def sha256_file(path: Path) -> str:
+    digest = hashlib.sha256()
+    with path.open("rb") as handle:
+        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def load_sources(db_path: Path, requested_sources: list[str]) -> list[SourceDocument]:
+    with sqlite3.connect(db_path) as conn:
+        conn.row_factory = sqlite3.Row
+        if requested_sources:
+            placeholders = ", ".join("?" for _ in requested_sources)
+            rows = conn.execute(
+                f"""
+                SELECT source_id, ticker, source_type, local_path, file_sha256
+                FROM source_refs
+                WHERE source_id IN ({placeholders})
+                ORDER BY ticker, source_id
+                """,
+                requested_sources,
+            ).fetchall()
+        else:
+            rows = conn.execute(
+                """
+                SELECT source_id, ticker, source_type, local_path, file_sha256
+                FROM source_refs
+                WHERE local_path LIKE '%.pdf'
+                ORDER BY ticker, source_id
+                """
+            ).fetchall()
+    return [SourceDocument(**dict(row)) for row in rows]
+
+
+def import_pypdf():
+    try:
+        from pypdf import PdfReader
+    except ModuleNotFoundError as exc:
+        raise SystemExit(
+            "Missing dependency: pypdf. Install with `python3 -m pip install -r requirements.txt`."
+        ) from exc
+    return PdfReader
+
+
+def extract_text(pdf_path: Path) -> tuple[str, int, int]:
+    PdfReader = import_pypdf()
+    reader = PdfReader(str(pdf_path))
+    chunks: list[str] = []
+    pages_with_text = 0
+    for index, page in enumerate(reader.pages, start=1):
+        text = page.extract_text() or ""
+        if text.strip():
+            pages_with_text += 1
+        cleaned_text = "\n".join(line.rstrip() for line in text.strip().splitlines())
+        chunks.append(f"\n\n--- page {index} ---\n{cleaned_text}\n")
+    return "".join(chunks).strip() + "\n", len(reader.pages), pages_with_text
+
+
+def text_output_path(output_root: Path, source: SourceDocument) -> Path:
+    pdf_stem = Path(source.local_path).stem
+    return output_root / source.ticker / f"{pdf_stem}.txt"
+
+
+def write_manifest(rows: list[dict[str, object]], manifest_path: Path) -> None:
+    manifest_path.parent.mkdir(parents=True, exist_ok=True)
+    fieldnames = [
+        "source_id",
+        "ticker",
+        "source_type",
+        "pdf_local_path",
+        "pdf_sha256",
+        "text_local_path",
+        "text_sha256",
+        "page_count",
+        "pages_with_text",
+        "char_count",
+        "status",
+        "notes",
+    ]
+    with manifest_path.open("w", newline="", encoding="utf-8") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames, lineterminator="\n")
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
+    parser.add_argument(
+        "--output-root",
+        default=str(DEFAULT_OUTPUT_ROOT),
+        help="Repo-relative output directory for extracted text.",
+    )
+    parser.add_argument(
+        "--manifest",
+        default=str(DEFAULT_MANIFEST),
+        help="Repo-relative CSV manifest path.",
+    )
+    parser.add_argument(
+        "--source-id",
+        action="append",
+        default=[],
+        help="Specific source_id to extract. May be passed multiple times. Defaults to all PDF source_refs.",
+    )
+    parser.add_argument("--json", action="store_true", help="Print a JSON summary.")
+    args = parser.parse_args()
+
+    db_path = require_repo_relative(args.db)
+    output_root = Path(args.output_root)
+    if output_root.is_absolute() or args.output_root.startswith("./") or "\\" in args.output_root:
+        raise ValueError(f"Output root must be repo-relative POSIX style: {args.output_root}")
+    manifest_path = Path(args.manifest)
+    if manifest_path.is_absolute() or args.manifest.startswith("./") or "\\" in args.manifest:
+        raise ValueError(f"Manifest path must be repo-relative POSIX style: {args.manifest}")
+
+    rows: list[dict[str, object]] = []
+    for source in load_sources(db_path, args.source_id):
+        pdf_path = require_repo_relative(source.local_path)
+        actual_pdf_hash = sha256_file(pdf_path)
+        if source.file_sha256 and source.file_sha256 != actual_pdf_hash:
+            raise ValueError(f"PDF hash mismatch for {source.source_id}")
+
+        output_path = text_output_path(output_root, source)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            text, page_count, pages_with_text = extract_text(pdf_path)
+            output_path.write_text(text, encoding="utf-8")
+            text_hash = sha256_file(output_path)
+            char_count = len(text)
+            status = "ok" if pages_with_text else "no_text_extracted"
+            notes = ""
+        except Exception as exc:
+            output_path.write_text("", encoding="utf-8")
+            text_hash = sha256_file(output_path)
+            page_count = 0
+            pages_with_text = 0
+            char_count = 0
+            status = "error"
+            notes = f"{type(exc).__name__}: {exc}"
+
+        rows.append(
+            {
+                "source_id": source.source_id,
+                "ticker": source.ticker,
+                "source_type": source.source_type,
+                "pdf_local_path": source.local_path,
+                "pdf_sha256": actual_pdf_hash,
+                "text_local_path": output_path.as_posix(),
+                "text_sha256": text_hash,
+                "page_count": page_count,
+                "pages_with_text": pages_with_text,
+                "char_count": char_count,
+                "status": status,
+                "notes": notes,
+            }
+        )
+
+    write_manifest(rows, manifest_path)
+    if args.json:
+        print(json.dumps(rows, ensure_ascii=False, indent=2))
+    else:
+        print(f"extracted {len(rows)} PDF source(s); manifest: {manifest_path.as_posix()}")
+        for row in rows:
+            print(
+                f"{row['source_id']}: {row['status']} "
+                f"pages={row['pages_with_text']}/{row['page_count']} "
+                f"chars={row['char_count']}"
+            )
+    return 0
+
+
+# Run the CLI only when executed as a script; main()'s return value becomes
+# the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())