From 08db218b6d5559cdb43ae59d69f1b577a49e7640 Mon Sep 17 00:00:00 2001 From: geometrybase Date: Mon, 15 Jun 2026 06:29:54 +0000 Subject: [PATCH] Add archivist incremental sync state Request: Add archivist support for remembering which IPO archive stages have already been synced and which stages should be updated next. Changes: - Add sync_runs, ticker_sync_state, sync_tasks, and price_performance tables to the archive schema. - Add scripts/update_sync_state.py to derive per-ticker stage status and rebuild the next-sync task queue. - Export the new sync-state tables as Git-friendly CSV snapshots. - Document the incremental archive flow in the archivist skill and README. Verification: - Ran scripts/bootstrap_historical_data.py. - Ran scripts/update_sync_state.py with a deterministic as-of timestamp. - Checked SQLite integrity and DB-to-snapshot row counts with Python sqlite3. - Parsed Python scripts with ast.parse. - Ran git diff --check and checked for temporary SQLite/cache files. Next useful context: - Current derived queue has 2 open tasks for 06658 and 15 waiting_until_due tasks for future stages. --- .codex/skills/archivist/SKILL.md | 39 ++- README.md | 12 + data/hk_ipo.sqlite | Bin 45056 -> 90112 bytes data/snapshots/price_performance.csv | 1 + data/snapshots/sync_runs.csv | 2 + data/snapshots/sync_tasks.csv | 18 ++ data/snapshots/ticker_sync_state.csv | 22 ++ schema/hk_ipo.schema.sql | 59 ++++ scripts/bootstrap_historical_data.py | 4 + scripts/update_sync_state.py | 447 +++++++++++++++++++++++++++ 10 files changed, 601 insertions(+), 3 deletions(-) create mode 100644 data/snapshots/price_performance.csv create mode 100644 data/snapshots/sync_runs.csv create mode 100644 data/snapshots/sync_tasks.csv create mode 100644 data/snapshots/ticker_sync_state.csv create mode 100644 scripts/update_sync_state.py diff --git a/.codex/skills/archivist/SKILL.md b/.codex/skills/archivist/SKILL.md index 33bdf37..d12d32a 100644 --- a/.codex/skills/archivist/SKILL.md +++ b/.codex/skills/archivist/SKILL.md @@ -42,6 +42,7 @@ references/ - Record source references, URLs, as-of timestamps, relative paths, and hashes. - Update embedded SQLite tables for IPO facts. - Export Git-friendly CSV snapshots after database updates. +- Maintain `sync_runs`, `ticker_sync_state`, and `sync_tasks` so repeated syncs know what is already archived and what remains pending. - Preserve raw source files; do not overwrite without first checking whether the contents changed. - Label missing, stale, inconsistent, or estimated fields explicitly. @@ -66,9 +67,40 @@ If a user asks for both data update and analysis, complete the archive/update st 4. Compute hashes for archived files. 5. Insert or update structured facts in `data/hk_ipo.sqlite`. 6. Record every source in the source reference table using repo-relative paths. -7. Export key tables to `data/snapshots/` for readable Git diffs. -8. Verify path rules, required fields, and snapshot generation. -9. Commit only the related archive/database/snapshot changes. +7. Refresh sync state with `scripts/update_sync_state.py` after fact updates. +8. Export key tables to `data/snapshots/` for readable Git diffs. +9. Verify path rules, required fields, hash checks, sync state, and snapshot generation. +10. Commit only the related archive/database/snapshot changes. + +## Incremental Sync State + +Use `ticker_sync_state` as the per-ticker stage ledger and `sync_tasks` as the next-sync queue. + +Stages: + +- `T0_prospectus` +- `T1_allotment` +- `T2_grey_market` +- `D1` +- `D5` +- `D20` +- `D60` + +Status values: + +- `complete`: required facts or source files are archived. +- `pending_not_due`: the stage is expected in the future. +- `pending_due`: the stage is due and should be updated on the next sync. +- `blocked`: the missing data has no known resolution date or needs manual intervention. +- `not_applicable`: the stage does not apply. + +Default incremental flow: + +```bash +python3 scripts/update_sync_state.py +``` + +Then update only rows in `sync_tasks` whose `task_status` is `open` or `blocked`. Do not re-download existing source files unless the upstream source changed or the stored hash no longer matches. ## Quality Checks @@ -79,4 +111,5 @@ Before finishing, confirm: - Raw files referenced by the database exist. - Source hashes match current file contents. - CSV snapshots reflect the database update. +- `sync_tasks` reflects only missing or future work, not completed stages. - Any unavailable field is marked as a data gap rather than invented. diff --git a/README.md b/README.md index ace2c1b..9c9fbdd 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,18 @@ python3 -m venv .venv The extractor reads PDF paths from `data/hk_ipo.sqlite`, writes derived text files under `data/extracted_text/`, and exports `data/snapshots/extracted_text_manifest.csv` with page counts, text hashes, and extraction status. +## Incremental Archive Sync + +The archivist keeps a per-ticker sync ledger so repeated updates can focus on missing stages: + +```bash +python3 scripts/update_sync_state.py +``` + +This writes `ticker_sync_state` and `sync_tasks` into `data/hk_ipo.sqlite`, then exports `data/snapshots/ticker_sync_state.csv`, `data/snapshots/sync_tasks.csv`, and `data/snapshots/sync_runs.csv`. + +Use `sync_tasks` as the next-sync queue. Tasks marked `open` are due now; tasks marked `waiting_until_due` are known future updates. + ## Git Discipline The repository uses automatic focused commits for completed project changes. diff --git a/data/hk_ipo.sqlite b/data/hk_ipo.sqlite index 248c3d6e4fc4549ae6ce44a1406eb00b56ccddc8..9113f28be557b40246bb359ca2e598637ac0d1bd 100644 GIT binary patch literal 90112 zcmeI5du$xXdBE>@H@3clU0OcXCscrq4EEO0pyhjvWV!z1>-I ztK;oycTcn&G|5R$oCGQCplE6*DU#N8g9L7n1Wx@zfIicv2nsYo+dqm1$is?L!w6b5 zZO|fUznQ&#+>0U=QxE4moW$Lkx!IZDeDm!*zd3bc*>nkSSoXT=^10YVEFO=|^L#88 z+YSG2f`9&J1TF^s8~7WKT=%=&9h-Ud><|si{Vxr(*Zb2iIJzSNB!C2v01`j~NB{{S z0VIF~kN^@u0&hNnJ*hNAnixMC<0jI73V-l}1dsp{Kmter2_OL^fCP{L64)An3r1oh zYpSt*Q*%@BvE-&A6hxtLYE7-JuBj$}iV&M$s=1_EHCIW^Un_quFNNHLaQ?NQJ{f;jonVuS7YxD*9Ja zZmFg>9M?A0DsM*8XImA^b@*AHUGSP!b8V|yCAQ=LRFqeQ!VytEA_xyb_H?52hHb6$ zG;KAq;qa?fs{*lFhCytzw#uuun$@UjWF7KB~U|#yaFeshW-pe*+40{*_Sj zuS6-hDl9ZyYu$3qbA;b8ZBiu;WR=t$3gncv2!I{3ZUSDtp}E#J=tj_~HT^QMRp94R z?z~N$MimOSX1V-%;_|AktwG}SndW+&%jOm*_f5Txk}b>U0jykHHXBv5u};iTOfS9s z8T$Rh4|OA3UOElmqvfS`0j11{9zY4?PzbI>A@G$>)l}^al)0q?ojN3#*pNUFu7>Fb zp)MzU9eBC`%#1?KPH;bnaX;Yxn|otx)f_`10VIF~kN^@u0!RP}AOR$R1dsp{Kmykt zfssTu&W7O<-W7}!4Gbr;BWz%U&HrbnW88Y?Tkr=zNB{{S0VIF~kN^@u0!RP}AOR%s z%TA!lCw6Bam`cS{gNeaZB9Tg^24&W*WdAAQOtYfFOuVV9E>Y+_H7o&9B5N+>`3DX5 zk>;NtdsqFLe;$o}?(p8>0X_EAuYLKBFFZ{@pSn5uN%+qH^&5*z?{oLfKl#O9x$j>- z_w4)=Z_m+B@{^gpy=K6hw+!sgu1=)}yd=g_F_uK=@3O@ainkI&@mFFf(UllMQp5EN zLTO%l_|Q-P;)#c2e>?x&|LlGAd-r@PG4saFC*J?Tfq$(%b;nCyS|9!OpUuBA%Mv#K z@zX5%H~rp=@93GpiTLj9qR%iEYcYiN1*|U~W$2R&*9HjkeEIJfFn#_jB2CtG>`QO_ z(XRJD7W;jR_fs?f#ebZEl-@i4ff7r~{QH{h@8DOTyth}9aQ^?7y<%hjkpL1v0!RP} zAOR$R1dsp{KmthMEkfY3^Z#+~NDRLCK>|ns2_OL^fCP{L5Gp?@@uio z3+da(zB_tg*oGVYAOR$R1dsp{xE={yJd)%d$WDD=@@(^9Gd(3Y-?gPZh%8XJPjP!z zsrmZ@Bh5CcTZ(O2F6?nsXKi(3mc=|9M%}Rmk{X~E<(ax}thsL8IWjxDv9Te}teqi` zP!rK`G3(g|)+}}bc96k&)*;|QNBe2~GP|M|f*^=F8W|~8M5!c|3UVPw^0~@fF_*89 zk}j82Eiaa&f>BXPC2t7DqOM9>Nr3Q%kgw!LSu7RiD&a&zcoiD64h&d_*}9!S2R5t? z6U-1-&hsvL#O2|it!m6jHOx7vd|t{QuB)qrzi0XQoeRtS@w@LXpIkb253gGq*iWvx zGZ%%Si4)n`W_t2WtMsxAUv&2)m}NG#==M<-B20+l1-Rl8}fr^#e)PR|jwm{Sd5PE@6QK`3cbQPXoqu~ZWCL@eY>6~hQ^ zM@O!Dlwx$VE3;z`cGIc#?N0OLF0t34AiX}$vcQ||zTK+ba{eCAnEG+wnEFD*n0l|$ghbA6_F&c$rUfgMI;!WkYNt?Y z)lQ)VwbP4BEmQv32qWDQ+!8A~V67E$`MDzSCUd%4l0;oET} z@C$+I`jejN`kvA*7iYU-515Nmhlp?U4J7$*AdneR5NKqTN=}e9LzXqII9C)(MAM6T znW%Yq4dk@EkrxfkP}Q8MSByNor$FK*q8ANKDClxt4WLJk3W!W<2QnVyDVVmtaCe*v z_7Vs-YXMNfmW$C>NFdmrS6FVnXkSK#0c-gCfT!?_z0n2#jV{nPy5zmlOHz^Qg`z}? zs%l6@Q7)+Yf>Fq8rDBDUoKVz@AjP^Us3jR*3wb#wsyU;i77c+wYoK-7MQAmkr_Km7 z!{+~!TsFpKxdrZJ?t9!C_o?f#$%kn|0!RP}AOR$R1dsp{Kmter2_OL^umc396AySc z^P@uwz3dt$e>HKUJ7i?k*fW1PBqmlN*fz#Y2PYGYtzU%QbvE|k4c8^S2h&lHV0c97 z-cvAeOJcD#{ogAZna=+Yq`w=>{3QL|@!AfM#3+#f5mCOhh;$v!9s*q zXOrI`Hi30ub)v!P_PkZ&{aI)!$JP-sf9MhO>N(Y{(gj7{zI4;!9Rj~lZEejwM`psG z`B%}&>(h05q5!WNut4cDo7&;2HLFq62wiW+!(-}=3aqbNBl^sROuP*U?T~K`2!#BF zfry1G0wNZ64hXgXKfpZ`oHQks|#B!C2v01`j~NB{{S0VIF~kN^@u0=)@f|GzgEbVUM400|%gB!C2v z01`j~NB{{S0VMENB0$&w$GL}N@Wl@jKmter2_OL^fCP{L5wRspZwOE03gdI3Tx8Cx$i2;Tc*V--q<%A=G@X%J>G;6(H zB`&!O4(mM!1{UEg-7{b`k*0xXX@FxC*+ak}BAx$_bKj4_7e7b<2_OL^fCP{L5`k3 z{{7I;hsuK=N&H7*ZQ!Z+-$2l}z^8d^Upl+zog-6_mj$_bmf1_Zt04F%VZdn>irygJ zfJ^bPDE9~0Yf$EX+tkQxo!EwD!)Djdh5+Te8F;TtvAtnx5CL=WgV9QZkndfl?w*%qp!wb($F%dD8{SYg*MY-oNOu)(%G`MN z;Qo=RsYOsHo;Fz#B1gyiBeK4-5K7^dmw{e~vzE4(;`?pOp@;A^;9MO~-N11@yk!_L z3|w2~;oO{cFa50pcc|9LbZQN`xQQCfZ8t$zUk#>vWT?Ly%wP8zIy&B|2BC!dsX=ku z5_8oHC@}_V@YdFgmnt8~PM`*ZS8icvU%5?g6jWe)DtfEHtCpg}BcT4M1~~y58w(ly9*W$EgRQ620f)AyCt6RZ9+9UHfb&5(7mzwf z^PV35PArxDMlAjB++U96)6b8Lj{aZr8^hljI-C5RgfKKS_L<66T z1dsp{KmtghAA!e4Q>pB&x5giTkE>Ry#9RKLz%Dl@ayh(svb?ZT=2sT(TrRhkNU$*c zp;6wyH+6oc{O%S0{*y~b7fwFN-&cO{Ht=n2p<`ahj<4{?PA@O>C(C!2PnM4@mQV3! z-Gaq44y>^_`x=)l7e)95t!v)EieLD|Ip zqCJ$#PEExhpJg&cOH?PYdQT6pl5G+cx-$Xt0xLTqKz^K2@qvw;t3(NkjF0-AQ>&(~ zsP%f))Ku5BY7W12Y^8ip`6MgthNcn6F&fprLF$bP{8X{d5!-209L+ZCG#bS<*P+~@ z8ZFa^%o@mhjXi+MU{%!!_(q=89^f*u8DEl|m6@dY zE36c@!m9mluuWpx{g~856-z|tY+1MT5{<~piat>TiL7Y7OX~X`{8QUsz8}}@pczjM_DRn&2&w*8huLt((D!57fV*?b(FSVDdcbWJ7Th^>0vF_7cRD<4#O>#0 zI!*)jfP27{Mja|v)pT6CNH0=(`ib!9^zDPm?D6S-Ld&uNae3mXi0$`77mvr8n)|RB zs5$S&MFrfu8fX*R(0wz3rSe4=DXF#dN=;oS3aNG1ytV+3dT%VIkz7kV13X|!Gy8Gy zW+9JgYiez^@4N8c`^xWTuR$il?GFs!m`7E!#@|VWoQ0)Z;9-qq+q;~mqE!Z=PNj|! zB}zBwLjv2XRMZ=mZFhK9p-wT;0!`Max{g8~p$jyj?Asy%Wdky3pI54Cg?O)Q`sjWF zK>flOFzB#>%jqz$SUnx)T_@m0I80xL>EZR*c=F>h?wgsvXefY)U zM)Kps#Z6wA=!FE301`j~NZ?H&aB+IK*EHGh_2Q|`99MPDIBXFdyV}>*Gzs0krU`_7 zllxz^lhxf)ph;2s^y;ZszZM3pCU1Z9vse5oG%Q=qc-9PoVFZQc%x=_tk7$3u_j3;U zR}hNzpc{4Cu7IB3R=eyhBdydf9peujv|zdA!Q1$Q8!9x_p$*CQpV6&w)Ug8H_cKI4 zI1Qcb+@8%E9bv&J-p(vmcsGHRutvwiu4MM~Y`+Gq*Cp~0(6kXn>~h-um@tvb&d$a^ z!ZBIt3ee@St4(scyt~@Aet$YjK5PoNjq>ysl72#9ld}!e2D419wtmoa4;ri@YyEY~ zioXRQDBe(5*lr(RRM?Mc_xD=JRkRD+Vssm|IkifnLp)K5^Rq<-tYfWj)Tu)RAuwH} z+-iQqkRJ#o&3J2^DZ-dPB;3Y}RAfainJrKDlRCdHIW38W%I8d6*qupbCnw_QWZEeY4Y#C%=y{f;E-mLa|!wZqUuZq;=lvxOvnkr`)Q|KJNSur zod^{%xE#AMIl?O9V`)~}fk$vL+E%)OGb`1IBJ8Y#Z)#~8CaLP-07hHc*+g}$9z!C* zhyV-`Sk(p_Wrfsxih)AZ;FtX>eps1Dr{&>fcIno>+RYO*IwJ!5ExZ|_`~P>vS7Tf@ zb29zM*3SkFc^y z-foU>2*U=`nXw?po;3mGE?8wmGc0aD!#ivG;Lh}zoSj}^wXl`e_SI7?-R(|(9wG)3 zVAhz9=k}?auz7h$?Kc?XV`BCIh*)4KL5Z&P^0E+F`sHUp?HrEopZ@E^s>0c^4*dbU zhCQ9h7V7!=>T1gM$rtF%>wov_+0h<(W;zq{d__8QC%2ToIks!8kq(_n%h}r{ekD5d z2C#zgj$x)V_tDmXX9U61nNY_4btY~8`ea7}-D>jc zrTc8Mz4E|o$Ws__0gA&<;?@<$4Y2?J>`;$9Q-wkEd?gCw22hgLV7tO#ceg?MQ$htl(7|{LyyE2zz++T5D;9lT9&HWqq4_t$Lh+E^9xeFYZ`A=>j zbBU9=cU*br7k`NakN^@u0!RP}AOR$R1dsp{KmthM>JUgLCcSGxEv=j{h delta 120 zcmZoTz}oPDX@ayM7Xt$WHxR=B>qH%6MJ@)tszP4=9}HZqg$%qO`44lO@NVJc;`+~a zjN>wEA=~7QjUQRIFSlb`QD*uiJ4Vs%$J7{knb=G@*u*`Rr+>6!l$`#; To>5{umnP$Pw#|$J|K|e$r`{jn diff --git a/data/snapshots/price_performance.csv b/data/snapshots/price_performance.csv new file mode 100644 index 0000000..960b553 --- /dev/null +++ b/data/snapshots/price_performance.csv @@ -0,0 +1 @@ +performance_id,ticker,stage,source_id,as_of_date,open_price_hkd,high_price_hkd,low_price_hkd,close_price_hkd,return_pct,turnover_hkd_m,data_as_of,notes diff --git a/data/snapshots/sync_runs.csv b/data/snapshots/sync_runs.csv new file mode 100644 index 0000000..e3278e9 --- /dev/null +++ b/data/snapshots/sync_runs.csv @@ -0,0 +1,2 @@ +sync_run_id,mode,as_of,started_at,finished_at,status,notes +sync_state_seed_2026_06_15,bootstrap_state_refresh,2026-06-15T06:30:00Z,2026-06-15T06:30:00Z,2026-06-15T06:30:00Z,complete,Derived ticker sync state refreshed. diff --git a/data/snapshots/sync_tasks.csv b/data/snapshots/sync_tasks.csv new file mode 100644 index 0000000..afd6590 --- /dev/null +++ b/data/snapshots/sync_tasks.csv @@ -0,0 +1,18 @@ +task_id,ticker,stage,task_type,task_status,due_date,data_gap_id,last_sync_run_id,updated_at,notes +06658_D1,06658,D1,archive_price_performance,open,2026-06-15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658_T2_grey_market,06658,T2_grey_market,archive_grey_market_result,open,2026-06-15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675_T1_allotment,06675,T1_allotment,archive_allotment_results,waiting_until_due,2026-06-16,06675_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06675_D1,06675,D1,archive_price_performance,waiting_until_due,2026-06-17,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675_T2_grey_market,06675,T2_grey_market,archive_grey_market_result,waiting_until_due,2026-06-17,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658_D5,06658,D5,archive_price_performance,waiting_until_due,2026-06-19,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675_D5,06675,D5,archive_price_performance,waiting_until_due,2026-06-21,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106_T1_allotment,06106,T1_allotment,archive_allotment_results,waiting_until_due,2026-06-23,06106_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06106_D1,06106,D1,archive_price_performance,waiting_until_due,2026-06-24,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106_T2_grey_market,06106,T2_grey_market,archive_grey_market_result,waiting_until_due,2026-06-24,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106_D5,06106,D5,archive_price_performance,waiting_until_due,2026-06-28,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658_D20,06658,D20,archive_price_performance,waiting_until_due,2026-07-04,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675_D20,06675,D20,archive_price_performance,waiting_until_due,2026-07-06,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106_D20,06106,D20,archive_price_performance,waiting_until_due,2026-07-13,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658_D60,06658,D60,archive_price_performance,waiting_until_due,2026-08-13,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675_D60,06675,D60,archive_price_performance,waiting_until_due,2026-08-15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106_D60,06106,D60,archive_price_performance,waiting_until_due,2026-08-22,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. diff --git a/data/snapshots/ticker_sync_state.csv b/data/snapshots/ticker_sync_state.csv new file mode 100644 index 0000000..a72b4c9 --- /dev/null +++ b/data/snapshots/ticker_sync_state.csv @@ -0,0 +1,22 @@ +ticker,stage,status,required,due_date,completed_at,last_source_id,data_gap_id,last_sync_run_id,updated_at,notes +06106,T0_prospectus,complete,1,2026-06-15,2026-06-15,06106_prospectus_candidate_2026_06_15,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Prospectus source and offering terms are archived. +06106,T1_allotment,pending_not_due,1,2026-06-23,,,06106_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06106,T2_grey_market,pending_not_due,1,2026-06-24,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106,D1,pending_not_due,1,2026-06-24,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06106,D5,pending_not_due,1,2026-06-28,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106,D20,pending_not_due,1,2026-07-13,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06106,D60,pending_not_due,1,2026-08-22,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658,T0_prospectus,complete,1,2026-06-05,2026-06-05,06658_prospectus_2026_06_05,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Prospectus source and offering terms are archived. +06658,T1_allotment,complete,1,2026-06-12,2026-06-12,06658_allotment_results_2026_06_12,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are archived. +06658,T2_grey_market,pending_due,1,2026-06-15,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658,D1,pending_due,1,2026-06-15,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06658,D5,pending_not_due,1,2026-06-19,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658,D20,pending_not_due,1,2026-07-04,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06658,D60,pending_not_due,1,2026-08-13,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675,T0_prospectus,complete,1,2026-06-09,2026-06-09,06675_prospectus_2026_06_09,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Prospectus source and offering terms are archived. +06675,T1_allotment,pending_not_due,1,2026-06-16,,,06675_allotment_results_pending_2026_06_15,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Allotment result facts are not archived yet. +06675,T2_grey_market,pending_not_due,1,2026-06-17,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675,D1,pending_not_due,1,2026-06-17,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. +06675,D5,pending_not_due,1,2026-06-21,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675,D20,pending_not_due,1,2026-07-06,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. +06675,D60,pending_not_due,1,2026-08-15,,,,sync_state_seed_2026_06_15,2026-06-15T06:30:00Z,Price/performance source is not archived yet. Due date uses calendar days until trading-calendar support is added. diff --git a/schema/hk_ipo.schema.sql b/schema/hk_ipo.schema.sql index 6d040e6..cc7f8c8 100644 --- a/schema/hk_ipo.schema.sql +++ b/schema/hk_ipo.schema.sql @@ -53,6 +53,23 @@ CREATE TABLE IF NOT EXISTS ipo_demand ( notes TEXT ); +CREATE TABLE IF NOT EXISTS price_performance ( + performance_id TEXT PRIMARY KEY, + ticker TEXT NOT NULL REFERENCES ipo_master(ticker), + stage TEXT NOT NULL, + source_id TEXT, + as_of_date TEXT NOT NULL, + open_price_hkd REAL, + high_price_hkd REAL, + low_price_hkd REAL, + close_price_hkd REAL, + return_pct REAL, + turnover_hkd_m REAL, + data_as_of TEXT NOT NULL, + notes TEXT, + UNIQUE (ticker, stage) +); + CREATE TABLE IF NOT EXISTS source_refs ( source_id TEXT PRIMARY KEY, ticker TEXT NOT NULL REFERENCES ipo_master(ticker), @@ -81,3 +98,45 @@ CREATE TABLE IF NOT EXISTS data_gaps ( created_at TEXT NOT NULL, notes TEXT ); + +CREATE TABLE IF NOT EXISTS sync_runs ( + sync_run_id TEXT PRIMARY KEY, + mode TEXT NOT NULL, + as_of TEXT NOT NULL, + started_at TEXT NOT NULL, + finished_at TEXT, + status TEXT NOT NULL, + notes TEXT, + CHECK (status IN ('running', 'complete', 'failed')) +); + +CREATE TABLE IF NOT EXISTS ticker_sync_state ( + ticker TEXT NOT NULL REFERENCES ipo_master(ticker), + stage TEXT NOT NULL, + status TEXT NOT NULL, + required INTEGER NOT NULL DEFAULT 1, + due_date TEXT, + completed_at TEXT, + last_source_id TEXT, + data_gap_id TEXT, + last_sync_run_id TEXT REFERENCES sync_runs(sync_run_id), + updated_at TEXT NOT NULL, + notes TEXT, + PRIMARY KEY (ticker, stage), + CHECK (status IN ('complete', 'pending_not_due', 'pending_due', 'blocked', 'not_applicable')), + CHECK (required IN (0, 1)) +); + +CREATE TABLE IF NOT EXISTS sync_tasks ( + task_id TEXT PRIMARY KEY, + ticker TEXT NOT NULL REFERENCES ipo_master(ticker), + stage TEXT NOT NULL, + task_type TEXT NOT NULL, + task_status TEXT NOT NULL, + due_date TEXT, + data_gap_id TEXT, + last_sync_run_id TEXT REFERENCES sync_runs(sync_run_id), + updated_at TEXT NOT NULL, + notes TEXT, + CHECK (task_status IN ('open', 'waiting_until_due', 'blocked')) +); diff --git a/scripts/bootstrap_historical_data.py b/scripts/bootstrap_historical_data.py index b49c8e7..a582fd3 100644 --- a/scripts/bootstrap_historical_data.py +++ b/scripts/bootstrap_historical_data.py @@ -303,8 +303,12 @@ def main() -> None: "ipo_master", "offering_terms", "ipo_demand", + "price_performance", "source_refs", "data_gaps", + "sync_runs", + "ticker_sync_state", + "sync_tasks", ]: export_snapshot(conn, table) diff --git a/scripts/update_sync_state.py b/scripts/update_sync_state.py new file mode 100644 index 0000000..0ae36e6 --- /dev/null +++ b/scripts/update_sync_state.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python3 +"""Update per-ticker archive sync state and pending sync tasks.""" + +from __future__ import annotations + +import argparse +import csv +import sqlite3 +from dataclasses import dataclass +from datetime import date, datetime, timedelta, timezone +from pathlib import Path + + +DEFAULT_DB_PATH = Path("data/hk_ipo.sqlite") +DEFAULT_SCHEMA_PATH = Path("schema/hk_ipo.schema.sql") +SNAPSHOT_DIR = Path("data/snapshots") + + +@dataclass(frozen=True) +class Ticker: + ticker: str + listing_date: str | None + application_start_date: str | None + allotment_results_expected_date: str | None + + +@dataclass(frozen=True) +class StageState: + ticker: str + stage: str + status: str + required: int + due_date: str | None + completed_at: str | None + last_source_id: str | None + data_gap_id: str | None + notes: str + + +STAGE_ORDER = [ + "T0_prospectus", + "T1_allotment", + "T2_grey_market", + "D1", + "D5", + "D20", + "D60", +] + +TASK_TYPES = { + "T0_prospectus": "archive_prospectus_and_terms", + "T1_allotment": "archive_allotment_results", + "T2_grey_market": "archive_grey_market_result", + "D1": "archive_price_performance", + "D5": "archive_price_performance", + "D20": "archive_price_performance", + "D60": "archive_price_performance", +} + + +def parse_as_of(value: str | None) -> datetime: + if value: + normalized = value.replace("Z", "+00:00") + return datetime.fromisoformat(normalized) + return datetime.now(timezone.utc).replace(microsecond=0) + + +def parse_date(value: str | None) -> date | None: + if not value: + return None + return date.fromisoformat(value) + + +def due_status(due_date: str | None, as_of_date: date) -> str: + if due_date is None: + return "pending_due" + due = parse_date(due_date) + if due is None or due <= as_of_date: + return "pending_due" + return "pending_not_due" + + +def query_one(conn: sqlite3.Connection, sql: str, params: tuple[object, ...]) -> sqlite3.Row | None: + return conn.execute(sql, params).fetchone() + + +def load_tickers(conn: sqlite3.Connection) -> list[Ticker]: + rows = conn.execute( + """ + SELECT ticker, listing_date, application_start_date, allotment_results_expected_date + FROM ipo_master + ORDER BY ticker + """ + ).fetchall() + return [Ticker(**dict(row)) for row in rows] + + +def data_gap_for(conn: sqlite3.Connection, ticker: str, stage: str) -> sqlite3.Row | None: + return query_one( + conn, + """ + SELECT gap_id, reason, expected_resolution_date + FROM data_gaps + WHERE ticker = ? AND stage = ? + ORDER BY created_at DESC, gap_id DESC + LIMIT 1 + """, + (ticker, stage), + ) + + +def prospectus_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState: + source = query_one( + conn, + """ + SELECT source_id, source_date + FROM source_refs + WHERE ticker = ? AND source_type = 'prospectus' + ORDER BY source_date DESC, source_id DESC + LIMIT 1 + """, + (ticker.ticker,), + ) + terms = query_one(conn, "SELECT ticker FROM offering_terms WHERE ticker = ?", (ticker.ticker,)) + if source and terms: + return StageState( + ticker.ticker, + "T0_prospectus", + "complete", + 1, + ticker.application_start_date, + source["source_date"], + source["source_id"], + None, + "Prospectus source and offering terms are archived.", + ) + gap = data_gap_for(conn, ticker.ticker, "T0_prospectus") + status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(ticker.application_start_date, as_of_date) + return StageState( + ticker.ticker, + "T0_prospectus", + status, + 1, + ticker.application_start_date, + None, + source["source_id"] if source else None, + gap["gap_id"] if gap else None, + "Missing prospectus source or offering terms.", + ) + + +def allotment_state(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> StageState: + demand = query_one( + conn, + """ + SELECT source_id, stage_date + FROM ipo_demand + WHERE ticker = ? + ORDER BY stage_date DESC, demand_id DESC + LIMIT 1 + """, + (ticker.ticker,), + ) + source = query_one( + conn, + """ + SELECT source_id, source_date + FROM source_refs + WHERE ticker = ? AND source_type = 'allotment_results' + ORDER BY source_date DESC, source_id DESC + LIMIT 1 + """, + (ticker.ticker,), + ) + if demand or source: + return StageState( + ticker.ticker, + "T1_allotment", + "complete", + 1, + ticker.allotment_results_expected_date, + (demand or source)["stage_date" if demand else "source_date"], + (demand or source)["source_id"], + None, + "Allotment result facts are archived.", + ) + gap = data_gap_for(conn, ticker.ticker, "T1_allotment") + status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(ticker.allotment_results_expected_date, as_of_date) + return StageState( + ticker.ticker, + "T1_allotment", + status, + 1, + ticker.allotment_results_expected_date, + None, + None, + gap["gap_id"] if gap else None, + "Allotment result facts are not archived yet.", + ) + + +def price_state(conn: sqlite3.Connection, ticker: Ticker, stage: str, due_date: str | None, as_of_date: date) -> StageState: + perf = query_one( + conn, + """ + SELECT source_id, as_of_date + FROM price_performance + WHERE ticker = ? AND stage = ? + LIMIT 1 + """, + (ticker.ticker, stage), + ) + source_types = { + "T2_grey_market": ("grey_market", "dark_market", "grey_market_performance"), + }.get(stage, ()) + source = None + if source_types: + placeholders = ", ".join("?" for _ in source_types) + source = query_one( + conn, + f""" + SELECT source_id, source_date + FROM source_refs + WHERE ticker = ? AND source_type IN ({placeholders}) + ORDER BY source_date DESC, source_id DESC + LIMIT 1 + """, + (ticker.ticker, *source_types), + ) + if perf or source: + row = perf or source + completed_key = "as_of_date" if perf else "source_date" + return StageState( + ticker.ticker, + stage, + "complete", + 1, + due_date, + row[completed_key], + row["source_id"], + None, + "Performance source is archived.", + ) + gap = data_gap_for(conn, ticker.ticker, stage) + status = "blocked" if gap and gap["expected_resolution_date"] is None else due_status(due_date, as_of_date) + note = "Price/performance source is not archived yet." + if stage in {"D5", "D20", "D60"}: + note += " Due date uses calendar days until trading-calendar support is added." + return StageState( + ticker.ticker, + stage, + status, + 1, + due_date, + None, + None, + gap["gap_id"] if gap else None, + note, + ) + + +def listing_offset_due(listing_date: str | None, days_after_listing: int) -> str | None: + listed = parse_date(listing_date) + if listed is None: + return None + return (listed + timedelta(days=days_after_listing)).isoformat() + + +def build_states(conn: sqlite3.Connection, ticker: Ticker, as_of_date: date) -> list[StageState]: + return [ + prospectus_state(conn, ticker, as_of_date), + allotment_state(conn, ticker, as_of_date), + price_state(conn, ticker, "T2_grey_market", ticker.listing_date, as_of_date), + price_state(conn, ticker, "D1", listing_offset_due(ticker.listing_date, 0), as_of_date), + price_state(conn, ticker, "D5", listing_offset_due(ticker.listing_date, 4), as_of_date), + price_state(conn, ticker, "D20", listing_offset_due(ticker.listing_date, 19), as_of_date), + price_state(conn, ticker, "D60", listing_offset_due(ticker.listing_date, 59), as_of_date), + ] + + +def upsert_sync_run(conn: sqlite3.Connection, run_id: str, mode: str, as_of: str, status: str, notes: str) -> None: + conn.execute( + """ + INSERT INTO sync_runs (sync_run_id, mode, as_of, started_at, finished_at, status, notes) + VALUES (?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(sync_run_id) DO UPDATE SET + mode = excluded.mode, + as_of = excluded.as_of, + started_at = excluded.started_at, + finished_at = excluded.finished_at, + status = excluded.status, + notes = excluded.notes + """, + (run_id, mode, as_of, as_of, as_of, status, notes), + ) + + +def replace_state(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None: + conn.executemany( + """ + INSERT INTO ticker_sync_state ( + ticker, stage, status, required, due_date, completed_at, last_source_id, + data_gap_id, last_sync_run_id, updated_at, notes + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(ticker, stage) DO UPDATE SET + status = excluded.status, + required = excluded.required, + due_date = excluded.due_date, + completed_at = excluded.completed_at, + last_source_id = excluded.last_source_id, + data_gap_id = excluded.data_gap_id, + last_sync_run_id = excluded.last_sync_run_id, + updated_at = excluded.updated_at, + notes = excluded.notes + """, + [ + ( + state.ticker, + state.stage, + state.status, + state.required, + state.due_date, + state.completed_at, + state.last_source_id, + state.data_gap_id, + run_id, + updated_at, + state.notes, + ) + for state in states + ], + ) + + +def rebuild_tasks(conn: sqlite3.Connection, states: list[StageState], run_id: str, updated_at: str) -> None: + conn.execute("DELETE FROM sync_tasks") + rows = [] + for state in states: + if state.status == "complete" or state.required == 0: + continue + task_status = { + "pending_due": "open", + "pending_not_due": "waiting_until_due", + "blocked": "blocked", + }.get(state.status) + if task_status is None: + continue + rows.append( + ( + f"{state.ticker}_{state.stage}", + state.ticker, + state.stage, + TASK_TYPES[state.stage], + task_status, + state.due_date, + state.data_gap_id, + run_id, + updated_at, + state.notes, + ) + ) + conn.executemany( + """ + INSERT INTO sync_tasks ( + task_id, ticker, stage, task_type, task_status, due_date, data_gap_id, + last_sync_run_id, updated_at, notes + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + rows, + ) + + +def export_snapshot(conn: sqlite3.Connection, table: str, order_by: str = "1") -> None: + SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True) + cursor = conn.execute(f"SELECT * FROM {table} ORDER BY {order_by}") + columns = [description[0] for description in cursor.description] + with (SNAPSHOT_DIR / f"{table}.csv").open("w", newline="", encoding="utf-8") as handle: + writer = csv.writer(handle, lineterminator="\n") + writer.writerow(columns) + writer.writerows(cursor.fetchall()) + + +def print_summary(states: list[StageState]) -> None: + counts: dict[str, int] = {} + for state in states: + counts[state.status] = counts.get(state.status, 0) + 1 + print("sync state updated") + for status in sorted(counts): + print(f"{status}: {counts[status]}") + open_items = [state for state in states if state.status in {"pending_due", "blocked"}] + if open_items: + print("actionable items:") + for state in open_items: + print(f"- {state.ticker} {state.stage}: {state.status} due={state.due_date or ''}") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--db", default=str(DEFAULT_DB_PATH), help="Repo-relative SQLite database path.") + parser.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH), help="Repo-relative schema path.") + parser.add_argument("--as-of", help="ISO timestamp for deterministic sync-state snapshots.") + parser.add_argument("--run-id", help="Stable sync run id. Defaults to sync_state_.") + parser.add_argument("--mode", default="state_refresh", help="Sync run mode label.") + args = parser.parse_args() + + as_of_dt = parse_as_of(args.as_of) + as_of = as_of_dt.isoformat().replace("+00:00", "Z") + as_of_date = as_of_dt.date() + run_id = args.run_id or "sync_state_" + as_of.replace(":", "").replace("-", "").replace("+", "").replace("Z", "Z") + + db_path = Path(args.db) + schema_path = Path(args.schema) + with sqlite3.connect(db_path) as conn: + conn.row_factory = sqlite3.Row + conn.executescript(schema_path.read_text(encoding="utf-8")) + upsert_sync_run(conn, run_id, args.mode, as_of, "running", "Refreshing derived ticker sync state.") + states = [state for ticker in load_tickers(conn) for state in build_states(conn, ticker, as_of_date)] + replace_state(conn, states, run_id, as_of) + rebuild_tasks(conn, states, run_id, as_of) + upsert_sync_run(conn, run_id, args.mode, as_of, "complete", "Derived ticker sync state refreshed.") + export_snapshot(conn, "sync_runs", "sync_run_id") + export_snapshot( + conn, + "ticker_sync_state", + """ + ticker, + CASE stage + WHEN 'T0_prospectus' THEN 0 + WHEN 'T1_allotment' THEN 1 + WHEN 'T2_grey_market' THEN 2 + WHEN 'D1' THEN 3 + WHEN 'D5' THEN 4 + WHEN 'D20' THEN 5 + WHEN 'D60' THEN 6 + ELSE 99 + END + """, + ) + export_snapshot(conn, "sync_tasks", "task_status, due_date, ticker, stage") + print_summary(states) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())