Make PDF text extraction a standard archive step

Request:
- Add extracted PDF text generation to the archivist workflow as a standard step.

Changes:
- Run PDF text extraction automatically for newly archived HKEX PDF sources.
- Make the PDF text extractor incremental and manifest-preserving.
- Document extracted-text handling in the archivist skill and README.
- Mark generated extracted text as no-diff data evidence.
- Backfill extracted text for all archived PDF source references.

Verification:
- Ran git diff --cached --check.
- Ran .venv/bin/python -m py_compile scripts/extract_pdf_text.py scripts/archive_hkex_documents.py.
- Ran full PDF extraction, then confirmed an incremental rerun skips unchanged files.
- Verified 557 PDF source_refs, 557 manifest rows, all status ok, and zero missing text/hash/path issues.

Next useful context:
- HKEX HTML notices and Yahoo JSON market data remain under data/raw and are not expected in data/extracted_text.
This commit is contained in:
2026-06-15 13:27:41 +00:00
parent 48b89552fe
commit 8a0dfd88f0
557 changed files with 8721479 additions and 28 deletions
+22
View File
@@ -95,6 +95,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--limit", type=int, help="Optional maximum tickers to process. Omit to process all open T0/T1 tasks.")
parser.add_argument("--tickers", help="Comma-separated tickers to process instead of selecting from sync_tasks.")
parser.add_argument("--skip-sync-state", action="store_true", help="Do not refresh sync state after updating facts.")
parser.add_argument("--skip-text-extraction", action="store_true", help="Do not extract text for newly archived PDFs.")
return parser.parse_args()
@@ -756,6 +757,25 @@ def refresh_sync_state(db_path: str, schema_path: str, as_of: str) -> None:
)
def refresh_extracted_text(db_path: str, sources: list[ArchivedSource]) -> None:
pdf_source_ids = [
source.source_id
for source in sources
if Path(source.local_path).suffix.lower() == ".pdf"
]
if not pdf_source_ids:
return
command = [
sys.executable,
"scripts/extract_pdf_text.py",
"--db",
db_path,
]
for source_id in sorted(set(pdf_source_ids)):
command.extend(["--source-id", source_id])
subprocess.run(command, check=True)
def main() -> int:
args = parse_args()
as_of = parse_as_of(args.as_of)
@@ -830,6 +850,8 @@ def main() -> int:
if not args.skip_sync_state:
refresh_sync_state(args.db, args.schema, as_of)
if not args.skip_text_extraction:
refresh_extracted_text(args.db, archived_sources)
print("hkex documents archived")
print(f"tickers selected: {len(tickers)}")
+62 -23
View File
@@ -7,6 +7,7 @@ import argparse
import csv
import hashlib
import json
import logging
import sqlite3
import sys
from dataclasses import dataclass
@@ -18,6 +19,9 @@ DEFAULT_OUTPUT_ROOT = Path("data/extracted_text")
DEFAULT_MANIFEST = Path("data/snapshots/extracted_text_manifest.csv")
logging.getLogger("pypdf").setLevel(logging.ERROR)
@dataclass(frozen=True)
class SourceDocument:
source_id: str
@@ -126,6 +130,30 @@ def write_manifest(rows: list[dict[str, object]], manifest_path: Path) -> None:
writer.writerows(rows)
def load_manifest(manifest_path: Path) -> dict[str, dict[str, object]]:
if not manifest_path.exists():
return {}
with manifest_path.open(newline="", encoding="utf-8") as handle:
return {row["source_id"]: row for row in csv.DictReader(handle)}
def can_reuse_existing_manifest_row(
existing_row: dict[str, object] | None,
output_path: Path,
actual_pdf_hash: str,
) -> bool:
if not existing_row:
return False
if existing_row.get("pdf_sha256") != actual_pdf_hash:
return False
if existing_row.get("text_local_path") != output_path.as_posix():
return False
if not output_path.exists():
return False
text_hash = existing_row.get("text_sha256")
return bool(text_hash) and sha256_file(output_path) == text_hash
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
@@ -145,6 +173,7 @@ def main() -> int:
default=[],
help="Specific source_id to extract. May be passed multiple times. Defaults to all PDF source_refs.",
)
parser.add_argument("--force", action="store_true", help="Re-extract even when the manifest row is unchanged.")
parser.add_argument("--json", action="store_true", help="Print a JSON summary.")
args = parser.parse_args()
@@ -156,7 +185,10 @@ def main() -> int:
if manifest_path.is_absolute() or args.manifest.startswith("./") or "\\" in args.manifest:
raise ValueError(f"Manifest path must be repo-relative POSIX style: {args.manifest}")
rows: list[dict[str, object]] = []
existing_rows = load_manifest(manifest_path)
rows_by_source_id = dict(existing_rows) if args.source_id else {}
processed_rows: list[dict[str, object]] = []
skipped = 0
for source in load_sources(db_path, args.source_id):
pdf_path = require_repo_relative(source.local_path)
actual_pdf_hash = sha256_file(pdf_path)
@@ -165,24 +197,28 @@ def main() -> int:
output_path = text_output_path(output_root, source)
output_path.parent.mkdir(parents=True, exist_ok=True)
try:
text, page_count, pages_with_text = extract_text(pdf_path)
output_path.write_text(text, encoding="utf-8")
text_hash = sha256_file(output_path)
char_count = len(text)
status = "ok" if pages_with_text else "no_text_extracted"
notes = ""
except Exception as exc:
output_path.write_text("", encoding="utf-8")
text_hash = sha256_file(output_path)
page_count = 0
pages_with_text = 0
char_count = 0
status = "error"
notes = f"{type(exc).__name__}: {exc}"
existing_row = existing_rows.get(source.source_id)
if not args.force and can_reuse_existing_manifest_row(existing_row, output_path, actual_pdf_hash):
row = dict(existing_row or {})
skipped += 1
else:
try:
text, page_count, pages_with_text = extract_text(pdf_path)
output_path.write_text(text, encoding="utf-8")
text_hash = sha256_file(output_path)
char_count = len(text)
status = "ok" if pages_with_text else "no_text_extracted"
notes = ""
except Exception as exc:
output_path.write_text("", encoding="utf-8")
text_hash = sha256_file(output_path)
page_count = 0
pages_with_text = 0
char_count = 0
status = "error"
notes = f"{type(exc).__name__}: {exc}"
rows.append(
{
row = {
"source_id": source.source_id,
"ticker": source.ticker,
"source_type": source.source_type,
@@ -196,14 +232,17 @@ def main() -> int:
"status": status,
"notes": notes,
}
)
rows_by_source_id[source.source_id] = row
processed_rows.append(row)
write_manifest(rows, manifest_path)
manifest_rows = sorted(rows_by_source_id.values(), key=lambda row: (str(row["ticker"]), str(row["source_id"])))
write_manifest(manifest_rows, manifest_path)
if args.json:
print(json.dumps(rows, ensure_ascii=False, indent=2))
print(json.dumps(processed_rows, ensure_ascii=False, indent=2))
else:
print(f"extracted {len(rows)} PDF source(s); manifest: {manifest_path.as_posix()}")
for row in rows:
print(f"processed {len(processed_rows)} PDF source(s); skipped unchanged: {skipped}")
print(f"manifest rows: {len(manifest_rows)}; manifest: {manifest_path.as_posix()}")
for row in processed_rows:
print(
f"{row['source_id']}: {row['status']} "
f"pages={row['pages_with_text']}/{row['page_count']} "