8a0dfd88f0
Request: - Add extracted PDF text generation to the archivist workflow as a standard step. Changes: - Run PDF text extraction automatically for newly archived HKEX PDF sources. - Make the PDF text extractor incremental and manifest-preserving. - Document extracted-text handling in the archivist skill and README. - Mark generated extracted text as no-diff data evidence. - Backfill extracted text for all archived PDF source references. Verification: - Ran git diff --cached --check. - Ran .venv/bin/python -m py_compile scripts/extract_pdf_text.py scripts/archive_hkex_documents.py. - Ran full PDF extraction, then confirmed an incremental rerun skips unchanged files. - Verified 557 PDF source_refs, 557 manifest rows, all status ok, and zero missing text/hash/path issues. Next useful context: - HKEX HTML notices and Yahoo JSON market data remain under data/raw and are not expected in data/extracted_text.
256 lines
9.0 KiB
Python
256 lines
9.0 KiB
Python
#!/usr/bin/env python3
|
|
"""Extract text from archived IPO PDFs into repo-relative derived text files."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import sqlite3
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
# Default locations used by the command-line options; all are repo-relative.
DEFAULT_DB_PATH = Path("data", "hk_ipo.sqlite")
DEFAULT_OUTPUT_ROOT = Path("data", "extracted_text")
DEFAULT_MANIFEST = Path("data", "snapshots", "extracted_text_manifest.csv")

# Only records at ERROR or above from the "pypdf" logger are let through.
logging.getLogger("pypdf").setLevel(logging.ERROR)
|
|
|
|
|
|
@dataclass(frozen=True)
class SourceDocument:
    """Immutable record built from one ``source_refs`` row."""

    # Row identifier; manifest rows are keyed by this value.
    source_id: str
    # Ticker; also used as the sub-directory name for the extracted text file.
    ticker: str
    # Copied into the manifest unchanged.
    source_type: str
    # Repo-relative path of the archived source file.
    local_path: str
    # Hash stored for the source file, or None when the row has none.
    file_sha256: str | None
|
|
|
|
|
|
def repo_root() -> Path:
    """Return the current working directory, which this script uses as the repo root."""
    working_directory = Path.cwd()
    return working_directory
|
|
|
|
|
|
def require_repo_relative(relative_path: str) -> Path:
    """Join a repo-relative POSIX path onto the repo root and require that it exists.

    Args:
        relative_path: Path written relative to the repository root with "/" separators.

    Returns:
        ``repo_root() / relative_path``.

    Raises:
        ValueError: If the path is absolute, starts with "./", contains a backslash,
            or contains a ".." component.
        FileNotFoundError: If the joined path does not exist.
    """
    path = Path(relative_path)
    if (
        path.is_absolute()
        or relative_path.startswith("./")
        or "\\" in relative_path
        # A ".." component can climb out of the repository, so it is not repo-relative.
        or ".." in path.parts
    ):
        raise ValueError(f"Path must be repo-relative POSIX style: {relative_path}")
    full_path = repo_root() / path
    if not full_path.exists():
        raise FileNotFoundError(relative_path)
    return full_path
|
|
|
|
|
|
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read in 1 MiB chunks."""
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                # An empty read means end of file.
                break
            hasher.update(block)
    return hasher.hexdigest()
|
|
|
|
|
|
def load_sources(db_path: Path, requested_sources: list[str]) -> list[SourceDocument]:
    """Load ``source_refs`` rows to extract, ordered by ticker and then source_id.

    Args:
        db_path: SQLite database that contains the ``source_refs`` table.
        requested_sources: ``source_id`` values to load. When empty, every row whose
            ``local_path`` matches ``%.pdf`` is loaded instead.

    Returns:
        One ``SourceDocument`` per matching row. A requested id with no matching row
        is simply absent from the result.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        if requested_sources:
            # Only the "?" list is interpolated; the ids themselves are bound parameters.
            placeholders = ", ".join("?" for _ in requested_sources)
            rows = conn.execute(
                f"""
                SELECT source_id, ticker, source_type, local_path, file_sha256
                FROM source_refs
                WHERE source_id IN ({placeholders})
                ORDER BY ticker, source_id
                """,
                requested_sources,
            ).fetchall()
        else:
            rows = conn.execute(
                """
                SELECT source_id, ticker, source_type, local_path, file_sha256
                FROM source_refs
                WHERE local_path LIKE '%.pdf'
                ORDER BY ticker, source_id
                """
            ).fetchall()
        # Copy the rows into plain dicts before the connection goes away.
        records = [dict(row) for row in rows]
    finally:
        # Using a sqlite3 connection as a context manager only commits or rolls back;
        # it does not close the connection, so close it explicitly.
        conn.close()
    return [SourceDocument(**record) for record in records]
|
|
|
|
|
|
def import_pypdf():
    """Import pypdf on demand and return its ``PdfReader`` class.

    Raises:
        SystemExit: With an install hint when the ``pypdf`` module cannot be found.
    """
    install_hint = "Missing dependency: pypdf. Install with `python3 -m pip install -r requirements.txt`."
    try:
        from pypdf import PdfReader
    except ModuleNotFoundError as exc:
        raise SystemExit(install_hint) from exc
    else:
        return PdfReader
|
|
|
|
|
|
def extract_text(pdf_path: Path) -> tuple[str, int, int]:
    """Extract the text of a PDF into a single string with a marker before each page.

    Returns:
        A ``(text, page_count, pages_with_text)`` tuple. ``text`` ends with exactly one
        newline; each page contributes a ``--- page N ---`` marker (N is 1-based)
        followed by the page text with trailing whitespace removed from every line.
    """
    # NOTE(review): assumes a marker is appended for every page, including pages that
    # yield no text — confirm against the intended output format.
    reader_cls = import_pypdf()
    document = reader_cls(str(pdf_path))
    sections: list[str] = []
    non_empty_pages = 0
    for page_number, page in enumerate(document.pages, start=1):
        # A falsy extraction result is treated as an empty page.
        raw_text = page.extract_text() or ""
        if raw_text.strip():
            non_empty_pages += 1
        tidy_lines = [line.rstrip() for line in raw_text.strip().splitlines()]
        page_body = "\n".join(tidy_lines)
        sections.append(f"\n\n--- page {page_number} ---\n{page_body}\n")
    combined = "".join(sections).strip() + "\n"
    return combined, len(document.pages), non_empty_pages
|
|
|
|
|
|
def text_output_path(output_root: Path, source: SourceDocument) -> Path:
    """Return ``<output_root>/<ticker>/<stem of the source file>.txt`` for ``source``."""
    text_name = Path(source.local_path).stem + ".txt"
    return output_root.joinpath(source.ticker, text_name)
|
|
|
|
|
|
def write_manifest(rows: list[dict[str, object]], manifest_path: Path) -> None:
    """Write ``rows`` to ``manifest_path`` as a UTF-8 CSV with a fixed column order.

    Missing parent directories are created and an existing file is overwritten.
    Lines end with "\\n" regardless of platform.
    """
    columns = [
        "source_id",
        "ticker",
        "source_type",
        "pdf_local_path",
        "pdf_sha256",
        "text_local_path",
        "text_sha256",
        "page_count",
        "pages_with_text",
        "char_count",
        "status",
        "notes",
    ]
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    with manifest_path.open("w", newline="", encoding="utf-8") as stream:
        manifest_writer = csv.DictWriter(stream, fieldnames=columns, lineterminator="\n")
        manifest_writer.writeheader()
        for record in rows:
            manifest_writer.writerow(record)
|
|
|
|
|
|
def load_manifest(manifest_path: Path) -> dict[str, dict[str, object]]:
    """Read the CSV manifest into a dict keyed by each row's ``source_id``.

    Returns an empty dict when the manifest file does not exist. When a ``source_id``
    occurs more than once, the last row replaces the earlier ones.
    """
    if not manifest_path.exists():
        return {}
    by_source_id: dict[str, dict[str, object]] = {}
    with manifest_path.open(newline="", encoding="utf-8") as stream:
        for record in csv.DictReader(stream):
            by_source_id[record["source_id"]] = record
    return by_source_id
|
|
|
|
|
|
def can_reuse_existing_manifest_row(
    existing_row: dict[str, object] | None,
    output_path: Path,
    actual_pdf_hash: str,
) -> bool:
    """Report whether a previous manifest row still matches the PDF and its text file.

    Reuse requires all of the following: a non-empty row, a ``pdf_sha256`` equal to
    ``actual_pdf_hash``, a ``text_local_path`` equal to ``output_path`` in POSIX form,
    an existing output file, and a non-empty ``text_sha256`` equal to the current
    hash of that file.
    """
    if not existing_row:
        return False
    pdf_unchanged = existing_row.get("pdf_sha256") == actual_pdf_hash
    same_destination = existing_row.get("text_local_path") == output_path.as_posix()
    if not (pdf_unchanged and same_destination and output_path.exists()):
        return False
    recorded_text_hash = existing_row.get("text_sha256")
    if not recorded_text_hash:
        return False
    # Hashing comes last because it is the only check that reads the text file.
    return sha256_file(output_path) == recorded_text_hash
|
|
|
|
|
|
def main() -> int:
    """Run the extraction command line and return 0 once the manifest is written.

    For each selected source the PDF is hashed and checked against the stored hash.
    The text is then either reused from an unchanged manifest row or re-extracted and
    written under the output root. The manifest is rewritten at the end and a summary
    is printed as JSON or as plain text.

    Raises:
        ValueError: If ``--output-root`` or ``--manifest`` is not a repo-relative POSIX
            path, or if a PDF's hash differs from the hash stored for its source.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
    parser.add_argument(
        "--output-root",
        default=str(DEFAULT_OUTPUT_ROOT),
        help="Repo-relative output directory for extracted text.",
    )
    parser.add_argument(
        "--manifest",
        default=str(DEFAULT_MANIFEST),
        help="Repo-relative CSV manifest path.",
    )
    parser.add_argument(
        "--source-id",
        action="append",
        default=[],
        help="Specific source_id to extract. May be passed multiple times. Defaults to all PDF source_refs.",
    )
    parser.add_argument("--force", action="store_true", help="Re-extract even when the manifest row is unchanged.")
    parser.add_argument("--json", action="store_true", help="Print a JSON summary.")
    options = parser.parse_args()

    db_path = require_repo_relative(options.db)

    # The output root and manifest may not exist yet, so only their form is checked.
    output_root = Path(options.output_root)
    if output_root.is_absolute() or options.output_root.startswith("./") or "\\" in options.output_root:
        raise ValueError(f"Output root must be repo-relative POSIX style: {options.output_root}")
    manifest_path = Path(options.manifest)
    if manifest_path.is_absolute() or options.manifest.startswith("./") or "\\" in options.manifest:
        raise ValueError(f"Manifest path must be repo-relative POSIX style: {options.manifest}")

    previous_rows = load_manifest(manifest_path)
    # A run limited by --source-id keeps all other manifest rows; a full run starts
    # from an empty manifest and keeps only rows for the sources it loads.
    merged_rows = dict(previous_rows) if options.source_id else {}
    handled_rows: list[dict[str, object]] = []
    reused_count = 0

    for source in load_sources(db_path, options.source_id):
        pdf_path = require_repo_relative(source.local_path)
        pdf_digest = sha256_file(pdf_path)
        if source.file_sha256 and source.file_sha256 != pdf_digest:
            raise ValueError(f"PDF hash mismatch for {source.source_id}")

        text_path = text_output_path(output_root, source)
        text_path.parent.mkdir(parents=True, exist_ok=True)
        prior_row = previous_rows.get(source.source_id)
        reusable = not options.force and can_reuse_existing_manifest_row(prior_row, text_path, pdf_digest)

        if reusable:
            row = dict(prior_row or {})
            reused_count += 1
        else:
            outcome: dict[str, object]
            try:
                text, page_count, pages_with_text = extract_text(pdf_path)
                text_path.write_text(text, encoding="utf-8")
                outcome = {
                    "text_sha256": sha256_file(text_path),
                    "page_count": page_count,
                    "pages_with_text": pages_with_text,
                    "char_count": len(text),
                    "status": "ok" if pages_with_text else "no_text_extracted",
                    "notes": "",
                }
            except Exception as exc:
                # Best effort: leave an empty text file and record the failure in the
                # manifest instead of aborting the whole run.
                text_path.write_text("", encoding="utf-8")
                outcome = {
                    "text_sha256": sha256_file(text_path),
                    "page_count": 0,
                    "pages_with_text": 0,
                    "char_count": 0,
                    "status": "error",
                    "notes": f"{type(exc).__name__}: {exc}",
                }
            # Key order is kept stable because it is visible in the JSON summary.
            row = {
                "source_id": source.source_id,
                "ticker": source.ticker,
                "source_type": source.source_type,
                "pdf_local_path": source.local_path,
                "pdf_sha256": pdf_digest,
                "text_local_path": text_path.as_posix(),
                **outcome,
            }

        merged_rows[source.source_id] = row
        handled_rows.append(row)

    manifest_rows = sorted(
        merged_rows.values(),
        key=lambda entry: (str(entry["ticker"]), str(entry["source_id"])),
    )
    write_manifest(manifest_rows, manifest_path)

    if options.json:
        print(json.dumps(handled_rows, ensure_ascii=False, indent=2))
        return 0

    print(f"processed {len(handled_rows)} PDF source(s); skipped unchanged: {reused_count}")
    print(f"manifest rows: {len(manifest_rows)}; manifest: {manifest_path.as_posix()}")
    for row in handled_rows:
        print(
            f"{row['source_id']}: {row['status']} "
            f"pages={row['pages_with_text']}/{row['page_count']} "
            f"chars={row['char_count']}"
        )
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|