Make PDF text extraction a standard archive step

Request: - Add extracted PDF text generation to the archivist workflow as a standard step. Changes: - Run PDF text extraction automatically for newly archived HKEX PDF sources. - Make the PDF text extractor incremental and manifest-preserving. - Document extracted-text handling in the archivist skill and README. - Mark generated extracted text as no-diff data evidence. - Backfill extracted text for all archived PDF source references. Verification: - Ran git diff --cached --check. - Ran .venv/bin/python -m py_compile scripts/extract_pdf_text.py scripts/archive_hkex_documents.py. - Ran full PDF extraction, then confirmed an incremental rerun skips unchanged files. - Verified 557 PDF source_refs, 557 manifest rows, all status ok, and zero missing text/hash/path issues. Next useful context: - HKEX HTML notices and Yahoo JSON market data remain under data/raw and are not expected in data/extracted_text.
2026-06-15 13:27:41 +00:00
parent 48b89552fe
commit 8a0dfd88f0
557 changed files with 8721479 additions and 28 deletions
@@ -95,6 +95,7 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--limit", type=int, help="Optional maximum tickers to process. Omit to process all open T0/T1 tasks.")
    parser.add_argument("--tickers", help="Comma-separated tickers to process instead of selecting from sync_tasks.")
    parser.add_argument("--skip-sync-state", action="store_true", help="Do not refresh sync state after updating facts.")
+    parser.add_argument("--skip-text-extraction", action="store_true", help="Do not extract text for newly archived PDFs.")
    return parser.parse_args()


@@ -756,6 +757,25 @@ def refresh_sync_state(db_path: str, schema_path: str, as_of: str) -> None:
    )


+def refresh_extracted_text(db_path: str, sources: list[ArchivedSource]) -> None:
+    pdf_source_ids = [
+        source.source_id
+        for source in sources
+        if Path(source.local_path).suffix.lower() == ".pdf"
+    ]
+    if not pdf_source_ids:
+        return
+    command = [
+        sys.executable,
+        "scripts/extract_pdf_text.py",
+        "--db",
+        db_path,
+    ]
+    for source_id in sorted(set(pdf_source_ids)):
+        command.extend(["--source-id", source_id])
+    subprocess.run(command, check=True)
+
+
 def main() -> int:
    args = parse_args()
    as_of = parse_as_of(args.as_of)
@@ -830,6 +850,8 @@ def main() -> int:

    if not args.skip_sync_state:
        refresh_sync_state(args.db, args.schema, as_of)
+    if not args.skip_text_extraction:
+        refresh_extracted_text(args.db, archived_sources)

    print("hkex documents archived")
    print(f"tickers selected: {len(tickers)}")
@@ -7,6 +7,7 @@ import argparse
 import csv
 import hashlib
 import json
+import logging
 import sqlite3
 import sys
 from dataclasses import dataclass
@@ -18,6 +19,9 @@ DEFAULT_OUTPUT_ROOT = Path("data/extracted_text")
 DEFAULT_MANIFEST = Path("data/snapshots/extracted_text_manifest.csv")


+logging.getLogger("pypdf").setLevel(logging.ERROR)
+
+
@dataclass(frozen=True)
 class SourceDocument:
    source_id: str
@@ -126,6 +130,30 @@ def write_manifest(rows: list[dict[str, object]], manifest_path: Path) -> None:
        writer.writerows(rows)


+def load_manifest(manifest_path: Path) -> dict[str, dict[str, object]]:
+    if not manifest_path.exists():
+        return {}
+    with manifest_path.open(newline="", encoding="utf-8") as handle:
+        return {row["source_id"]: row for row in csv.DictReader(handle)}
+
+
+def can_reuse_existing_manifest_row(
+    existing_row: dict[str, object] | None,
+    output_path: Path,
+    actual_pdf_hash: str,
+) -> bool:
+    if not existing_row:
+        return False
+    if existing_row.get("pdf_sha256") != actual_pdf_hash:
+        return False
+    if existing_row.get("text_local_path") != output_path.as_posix():
+        return False
+    if not output_path.exists():
+        return False
+    text_hash = existing_row.get("text_sha256")
+    return bool(text_hash) and sha256_file(output_path) == text_hash
+
+
 def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
@@ -145,6 +173,7 @@ def main() -> int:
        default=[],
        help="Specific source_id to extract. May be passed multiple times. Defaults to all PDF source_refs.",
    )
+    parser.add_argument("--force", action="store_true", help="Re-extract even when the manifest row is unchanged.")
    parser.add_argument("--json", action="store_true", help="Print a JSON summary.")
    args = parser.parse_args()

@@ -156,7 +185,10 @@ def main() -> int:
    if manifest_path.is_absolute() or args.manifest.startswith("./") or "\\" in args.manifest:
        raise ValueError(f"Manifest path must be repo-relative POSIX style: {args.manifest}")

-    rows: list[dict[str, object]] = []
+    existing_rows = load_manifest(manifest_path)
+    rows_by_source_id = dict(existing_rows) if args.source_id else {}
+    processed_rows: list[dict[str, object]] = []
+    skipped = 0
    for source in load_sources(db_path, args.source_id):
        pdf_path = require_repo_relative(source.local_path)
        actual_pdf_hash = sha256_file(pdf_path)
@@ -165,24 +197,28 @@ def main() -> int:

        output_path = text_output_path(output_root, source)
        output_path.parent.mkdir(parents=True, exist_ok=True)
-        try:
-            text, page_count, pages_with_text = extract_text(pdf_path)
-            output_path.write_text(text, encoding="utf-8")
-            text_hash = sha256_file(output_path)
-            char_count = len(text)
-            status = "ok" if pages_with_text else "no_text_extracted"
-            notes = ""
-        except Exception as exc:
-            output_path.write_text("", encoding="utf-8")
-            text_hash = sha256_file(output_path)
-            page_count = 0
-            pages_with_text = 0
-            char_count = 0
-            status = "error"
-            notes = f"{type(exc).__name__}: {exc}"
+        existing_row = existing_rows.get(source.source_id)
+        if not args.force and can_reuse_existing_manifest_row(existing_row, output_path, actual_pdf_hash):
+            row = dict(existing_row or {})
+            skipped += 1
+        else:
+            try:
+                text, page_count, pages_with_text = extract_text(pdf_path)
+                output_path.write_text(text, encoding="utf-8")
+                text_hash = sha256_file(output_path)
+                char_count = len(text)
+                status = "ok" if pages_with_text else "no_text_extracted"
+                notes = ""
+            except Exception as exc:
+                output_path.write_text("", encoding="utf-8")
+                text_hash = sha256_file(output_path)
+                page_count = 0
+                pages_with_text = 0
+                char_count = 0
+                status = "error"
+                notes = f"{type(exc).__name__}: {exc}"

-        rows.append(
-            {
+            row = {
                "source_id": source.source_id,
                "ticker": source.ticker,
                "source_type": source.source_type,
@@ -196,14 +232,17 @@ def main() -> int:
                "status": status,
                "notes": notes,
            }
-        )
+        rows_by_source_id[source.source_id] = row
+        processed_rows.append(row)

-    write_manifest(rows, manifest_path)
+    manifest_rows = sorted(rows_by_source_id.values(), key=lambda row: (str(row["ticker"]), str(row["source_id"])))
+    write_manifest(manifest_rows, manifest_path)
    if args.json:
-        print(json.dumps(rows, ensure_ascii=False, indent=2))
+        print(json.dumps(processed_rows, ensure_ascii=False, indent=2))
    else:
-        print(f"extracted {len(rows)} PDF source(s); manifest: {manifest_path.as_posix()}")
-        for row in rows:
+        print(f"processed {len(processed_rows)} PDF source(s); skipped unchanged: {skipped}")
+        print(f"manifest rows: {len(manifest_rows)}; manifest: {manifest_path.as_posix()}")
+        for row in processed_rows:
            print(
                f"{row['source_id']}: {row['status']} "
                f"pages={row['pages_with_text']}/{row['page_count']} "