Make PDF text extraction a standard archive step
Request: - Add extracted PDF text generation to the archivist workflow as a standard step. Changes: - Run PDF text extraction automatically for newly archived HKEX PDF sources. - Make the PDF text extractor incremental and manifest-preserving. - Document extracted-text handling in the archivist skill and README. - Mark generated extracted text as no-diff data evidence. - Backfill extracted text for all archived PDF source references. Verification: - Ran git diff --cached --check. - Ran .venv/bin/python -m py_compile scripts/extract_pdf_text.py scripts/archive_hkex_documents.py. - Ran full PDF extraction, then confirmed an incremental rerun skips unchanged files. - Verified 557 PDF source_refs, 557 manifest rows, all status ok, and zero missing text/hash/path issues. Next useful context: - HKEX HTML notices and Yahoo JSON market data remain under data/raw and are not expected in data/extracted_text.
This commit is contained in:
@@ -95,6 +95,7 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument("--limit", type=int, help="Optional maximum tickers to process. Omit to process all open T0/T1 tasks.")
|
||||
parser.add_argument("--tickers", help="Comma-separated tickers to process instead of selecting from sync_tasks.")
|
||||
parser.add_argument("--skip-sync-state", action="store_true", help="Do not refresh sync state after updating facts.")
|
||||
parser.add_argument("--skip-text-extraction", action="store_true", help="Do not extract text for newly archived PDFs.")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -756,6 +757,25 @@ def refresh_sync_state(db_path: str, schema_path: str, as_of: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
def refresh_extracted_text(db_path: str, sources: list[ArchivedSource]) -> None:
|
||||
pdf_source_ids = [
|
||||
source.source_id
|
||||
for source in sources
|
||||
if Path(source.local_path).suffix.lower() == ".pdf"
|
||||
]
|
||||
if not pdf_source_ids:
|
||||
return
|
||||
command = [
|
||||
sys.executable,
|
||||
"scripts/extract_pdf_text.py",
|
||||
"--db",
|
||||
db_path,
|
||||
]
|
||||
for source_id in sorted(set(pdf_source_ids)):
|
||||
command.extend(["--source-id", source_id])
|
||||
subprocess.run(command, check=True)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
as_of = parse_as_of(args.as_of)
|
||||
@@ -830,6 +850,8 @@ def main() -> int:
|
||||
|
||||
if not args.skip_sync_state:
|
||||
refresh_sync_state(args.db, args.schema, as_of)
|
||||
if not args.skip_text_extraction:
|
||||
refresh_extracted_text(args.db, archived_sources)
|
||||
|
||||
print("hkex documents archived")
|
||||
print(f"tickers selected: {len(tickers)}")
|
||||
|
||||
+62
-23
@@ -7,6 +7,7 @@ import argparse
|
||||
import csv
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
@@ -18,6 +19,9 @@ DEFAULT_OUTPUT_ROOT = Path("data/extracted_text")
|
||||
DEFAULT_MANIFEST = Path("data/snapshots/extracted_text_manifest.csv")
|
||||
|
||||
|
||||
logging.getLogger("pypdf").setLevel(logging.ERROR)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SourceDocument:
|
||||
source_id: str
|
||||
@@ -126,6 +130,30 @@ def write_manifest(rows: list[dict[str, object]], manifest_path: Path) -> None:
|
||||
writer.writerows(rows)
|
||||
|
||||
|
||||
def load_manifest(manifest_path: Path) -> dict[str, dict[str, object]]:
|
||||
if not manifest_path.exists():
|
||||
return {}
|
||||
with manifest_path.open(newline="", encoding="utf-8") as handle:
|
||||
return {row["source_id"]: row for row in csv.DictReader(handle)}
|
||||
|
||||
|
||||
def can_reuse_existing_manifest_row(
|
||||
existing_row: dict[str, object] | None,
|
||||
output_path: Path,
|
||||
actual_pdf_hash: str,
|
||||
) -> bool:
|
||||
if not existing_row:
|
||||
return False
|
||||
if existing_row.get("pdf_sha256") != actual_pdf_hash:
|
||||
return False
|
||||
if existing_row.get("text_local_path") != output_path.as_posix():
|
||||
return False
|
||||
if not output_path.exists():
|
||||
return False
|
||||
text_hash = existing_row.get("text_sha256")
|
||||
return bool(text_hash) and sha256_file(output_path) == text_hash
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.")
|
||||
@@ -145,6 +173,7 @@ def main() -> int:
|
||||
default=[],
|
||||
help="Specific source_id to extract. May be passed multiple times. Defaults to all PDF source_refs.",
|
||||
)
|
||||
parser.add_argument("--force", action="store_true", help="Re-extract even when the manifest row is unchanged.")
|
||||
parser.add_argument("--json", action="store_true", help="Print a JSON summary.")
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -156,7 +185,10 @@ def main() -> int:
|
||||
if manifest_path.is_absolute() or args.manifest.startswith("./") or "\\" in args.manifest:
|
||||
raise ValueError(f"Manifest path must be repo-relative POSIX style: {args.manifest}")
|
||||
|
||||
rows: list[dict[str, object]] = []
|
||||
existing_rows = load_manifest(manifest_path)
|
||||
rows_by_source_id = dict(existing_rows) if args.source_id else {}
|
||||
processed_rows: list[dict[str, object]] = []
|
||||
skipped = 0
|
||||
for source in load_sources(db_path, args.source_id):
|
||||
pdf_path = require_repo_relative(source.local_path)
|
||||
actual_pdf_hash = sha256_file(pdf_path)
|
||||
@@ -165,24 +197,28 @@ def main() -> int:
|
||||
|
||||
output_path = text_output_path(output_root, source)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
text, page_count, pages_with_text = extract_text(pdf_path)
|
||||
output_path.write_text(text, encoding="utf-8")
|
||||
text_hash = sha256_file(output_path)
|
||||
char_count = len(text)
|
||||
status = "ok" if pages_with_text else "no_text_extracted"
|
||||
notes = ""
|
||||
except Exception as exc:
|
||||
output_path.write_text("", encoding="utf-8")
|
||||
text_hash = sha256_file(output_path)
|
||||
page_count = 0
|
||||
pages_with_text = 0
|
||||
char_count = 0
|
||||
status = "error"
|
||||
notes = f"{type(exc).__name__}: {exc}"
|
||||
existing_row = existing_rows.get(source.source_id)
|
||||
if not args.force and can_reuse_existing_manifest_row(existing_row, output_path, actual_pdf_hash):
|
||||
row = dict(existing_row or {})
|
||||
skipped += 1
|
||||
else:
|
||||
try:
|
||||
text, page_count, pages_with_text = extract_text(pdf_path)
|
||||
output_path.write_text(text, encoding="utf-8")
|
||||
text_hash = sha256_file(output_path)
|
||||
char_count = len(text)
|
||||
status = "ok" if pages_with_text else "no_text_extracted"
|
||||
notes = ""
|
||||
except Exception as exc:
|
||||
output_path.write_text("", encoding="utf-8")
|
||||
text_hash = sha256_file(output_path)
|
||||
page_count = 0
|
||||
pages_with_text = 0
|
||||
char_count = 0
|
||||
status = "error"
|
||||
notes = f"{type(exc).__name__}: {exc}"
|
||||
|
||||
rows.append(
|
||||
{
|
||||
row = {
|
||||
"source_id": source.source_id,
|
||||
"ticker": source.ticker,
|
||||
"source_type": source.source_type,
|
||||
@@ -196,14 +232,17 @@ def main() -> int:
|
||||
"status": status,
|
||||
"notes": notes,
|
||||
}
|
||||
)
|
||||
rows_by_source_id[source.source_id] = row
|
||||
processed_rows.append(row)
|
||||
|
||||
write_manifest(rows, manifest_path)
|
||||
manifest_rows = sorted(rows_by_source_id.values(), key=lambda row: (str(row["ticker"]), str(row["source_id"])))
|
||||
write_manifest(manifest_rows, manifest_path)
|
||||
if args.json:
|
||||
print(json.dumps(rows, ensure_ascii=False, indent=2))
|
||||
print(json.dumps(processed_rows, ensure_ascii=False, indent=2))
|
||||
else:
|
||||
print(f"extracted {len(rows)} PDF source(s); manifest: {manifest_path.as_posix()}")
|
||||
for row in rows:
|
||||
print(f"processed {len(processed_rows)} PDF source(s); skipped unchanged: {skipped}")
|
||||
print(f"manifest rows: {len(manifest_rows)}; manifest: {manifest_path.as_posix()}")
|
||||
for row in processed_rows:
|
||||
print(
|
||||
f"{row['source_id']}: {row['status']} "
|
||||
f"pages={row['pages_with_text']}/{row['page_count']} "
|
||||
|
||||
Reference in New Issue
Block a user