Add 02335 T0 analyst report
Request:
- Generate an analyst report for HK IPO ticker 02335.

Changes:
- Archived the official HKEXnews 02335 prospectus PDF and extracted text under project-relative data paths.
- Seeded 02335 T0 prospectus facts, source references, sync state, and analysis snapshots.
- Generated reports/2026-06-15_02335_T0_prospectus_analysis.md in Simplified Chinese with concrete T0/T1/T2/D1 dates and short-exit T2/D1 discipline.
- Made PDF text extraction tolerant of invalid Unicode surrogate characters emitted by pypdf.

Verification:
- Compiled archive_hkex_documents.py, generate_ipo_report.py, build_analysis_dataset.py, extract_pdf_text.py, and update_sync_state.py.
- Ran SQLite integrity_check and foreign_key_check.
- Verified the archived 02335 PDF hash, extracted-text manifest row, and analysis dataset row.
- Ran git diff --check.

Next useful context:
- 02335 is currently T0_prospectus; T1_allotment is pending for 2026-06-23.
This commit is contained in:
@@ -89,6 +89,10 @@ def import_pypdf():
|
||||
return PdfReader
|
||||
|
||||
|
||||
def utf8_safe_text(value: str) -> str:
    """Return *value* with every code point that UTF-8 cannot encode replaced.

    The string is round-tripped through UTF-8 with the ``replace`` error
    handler, so anything the codec rejects (for example a lone surrogate
    such as ``"\\ud800"``) comes back as ``"?"`` while all encodable text is
    returned unchanged. The result can therefore always be encoded as UTF-8.
    """
    # On encoding, the "replace" handler substitutes "?" for each
    # unencodable character; the decode step then cannot fail.
    encoded = value.encode("utf-8", errors="replace")
    return encoded.decode("utf-8")
|
||||
|
||||
|
||||
def extract_text(pdf_path: Path) -> tuple[str, int, int]:
|
||||
PdfReader = import_pypdf()
|
||||
reader = PdfReader(str(pdf_path))
|
||||
@@ -98,7 +102,7 @@ def extract_text(pdf_path: Path) -> tuple[str, int, int]:
|
||||
text = page.extract_text() or ""
|
||||
if text.strip():
|
||||
pages_with_text += 1
|
||||
cleaned_text = "\n".join(line.rstrip() for line in text.strip().splitlines())
|
||||
cleaned_text = utf8_safe_text("\n".join(line.rstrip() for line in text.strip().splitlines()))
|
||||
chunks.append(f"\n\n--- page {index} ---\n{cleaned_text}\n")
|
||||
return "".join(chunks).strip() + "\n", len(reader.pages), pages_with_text
|
||||
|
||||
|
||||
Reference in New Issue
Block a user