#!/usr/bin/env python3 """Extract text from archived IPO PDFs into repo-relative derived text files.""" from __future__ import annotations import argparse import csv import hashlib import json import sqlite3 import sys from dataclasses import dataclass from pathlib import Path DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite") DEFAULT_OUTPUT_ROOT = Path("data/extracted_text") DEFAULT_MANIFEST = Path("data/snapshots/extracted_text_manifest.csv") @dataclass(frozen=True) class SourceDocument: source_id: str ticker: str source_type: str local_path: str file_sha256: str | None def repo_root() -> Path: return Path.cwd() def require_repo_relative(relative_path: str) -> Path: path = Path(relative_path) if path.is_absolute() or relative_path.startswith("./") or "\\" in relative_path: raise ValueError(f"Path must be repo-relative POSIX style: {relative_path}") full_path = repo_root() / path if not full_path.exists(): raise FileNotFoundError(relative_path) return full_path def sha256_file(path: Path) -> str: digest = hashlib.sha256() with path.open("rb") as handle: for chunk in iter(lambda: handle.read(1024 * 1024), b""): digest.update(chunk) return digest.hexdigest() def load_sources(db_path: Path, requested_sources: list[str]) -> list[SourceDocument]: with sqlite3.connect(db_path) as conn: conn.row_factory = sqlite3.Row if requested_sources: placeholders = ", ".join("?" for _ in requested_sources) rows = conn.execute( f""" SELECT source_id, ticker, source_type, local_path, file_sha256 FROM source_refs WHERE source_id IN ({placeholders}) ORDER BY ticker, source_id """, requested_sources, ).fetchall() else: rows = conn.execute( """ SELECT source_id, ticker, source_type, local_path, file_sha256 FROM source_refs WHERE local_path LIKE '%.pdf' ORDER BY ticker, source_id """ ).fetchall() return [SourceDocument(**dict(row)) for row in rows] def import_pypdf(): try: from pypdf import PdfReader except ModuleNotFoundError as exc: raise SystemExit( "Missing dependency: pypdf. Install with `python3 -m pip install -r requirements.txt`." ) from exc return PdfReader def extract_text(pdf_path: Path) -> tuple[str, int, int]: PdfReader = import_pypdf() reader = PdfReader(str(pdf_path)) chunks: list[str] = [] pages_with_text = 0 for index, page in enumerate(reader.pages, start=1): text = page.extract_text() or "" if text.strip(): pages_with_text += 1 cleaned_text = "\n".join(line.rstrip() for line in text.strip().splitlines()) chunks.append(f"\n\n--- page {index} ---\n{cleaned_text}\n") return "".join(chunks).strip() + "\n", len(reader.pages), pages_with_text def text_output_path(output_root: Path, source: SourceDocument) -> Path: pdf_stem = Path(source.local_path).stem return output_root / source.ticker / f"{pdf_stem}.txt" def write_manifest(rows: list[dict[str, object]], manifest_path: Path) -> None: manifest_path.parent.mkdir(parents=True, exist_ok=True) fieldnames = [ "source_id", "ticker", "source_type", "pdf_local_path", "pdf_sha256", "text_local_path", "text_sha256", "page_count", "pages_with_text", "char_count", "status", "notes", ] with manifest_path.open("w", newline="", encoding="utf-8") as handle: writer = csv.DictWriter(handle, fieldnames=fieldnames, lineterminator="\n") writer.writeheader() writer.writerows(rows) def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.") parser.add_argument( "--output-root", default=str(DEFAULT_OUTPUT_ROOT), help="Repo-relative output directory for extracted text.", ) parser.add_argument( "--manifest", default=str(DEFAULT_MANIFEST), help="Repo-relative CSV manifest path.", ) parser.add_argument( "--source-id", action="append", default=[], help="Specific source_id to extract. May be passed multiple times. Defaults to all PDF source_refs.", ) parser.add_argument("--json", action="store_true", help="Print a JSON summary.") args = parser.parse_args() db_path = require_repo_relative(args.db) output_root = Path(args.output_root) if output_root.is_absolute() or args.output_root.startswith("./") or "\\" in args.output_root: raise ValueError(f"Output root must be repo-relative POSIX style: {args.output_root}") manifest_path = Path(args.manifest) if manifest_path.is_absolute() or args.manifest.startswith("./") or "\\" in args.manifest: raise ValueError(f"Manifest path must be repo-relative POSIX style: {args.manifest}") rows: list[dict[str, object]] = [] for source in load_sources(db_path, args.source_id): pdf_path = require_repo_relative(source.local_path) actual_pdf_hash = sha256_file(pdf_path) if source.file_sha256 and source.file_sha256 != actual_pdf_hash: raise ValueError(f"PDF hash mismatch for {source.source_id}") output_path = text_output_path(output_root, source) output_path.parent.mkdir(parents=True, exist_ok=True) try: text, page_count, pages_with_text = extract_text(pdf_path) output_path.write_text(text, encoding="utf-8") text_hash = sha256_file(output_path) char_count = len(text) status = "ok" if pages_with_text else "no_text_extracted" notes = "" except Exception as exc: output_path.write_text("", encoding="utf-8") text_hash = sha256_file(output_path) page_count = 0 pages_with_text = 0 char_count = 0 status = "error" notes = f"{type(exc).__name__}: {exc}" rows.append( { "source_id": source.source_id, "ticker": source.ticker, "source_type": source.source_type, "pdf_local_path": source.local_path, "pdf_sha256": actual_pdf_hash, "text_local_path": output_path.as_posix(), "text_sha256": text_hash, "page_count": page_count, "pages_with_text": pages_with_text, "char_count": char_count, "status": status, "notes": notes, } ) write_manifest(rows, manifest_path) if args.json: print(json.dumps(rows, ensure_ascii=False, indent=2)) else: print(f"extracted {len(rows)} PDF source(s); manifest: {manifest_path.as_posix()}") for row in rows: print( f"{row['source_id']}: {row['status']} " f"pages={row['pages_with_text']}/{row['page_count']} " f"chars={row['char_count']}" ) return 0 if __name__ == "__main__": raise SystemExit(main())