From 719c8e6ee717db5be8691d7a213339f0957e7d87 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Thu, 12 Mar 2026 13:09:48 -0400 Subject: [PATCH 1/6] update cpdb version --- products/cpdb/recipe.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/products/cpdb/recipe.yml b/products/cpdb/recipe.yml index 67b3a21c23..5c1b3d33a3 100644 --- a/products/cpdb/recipe.yml +++ b/products/cpdb/recipe.yml @@ -1,6 +1,6 @@ name: CPDB product: db-cpdb -version: 25adopt +version: 26prelim inputs: missing_versions_strategy: find_latest datasets: From a99887139979a94a2c2283855aedbf12d02a4bc8 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Wed, 15 Apr 2026 15:12:15 -0400 Subject: [PATCH 2/6] fix FISA source data a double-quote character at the beginning of description column for the RDAMPSIP project causes the rest of the file to be inserted into the downstream description for that project, breaking CPDB csvs --- ingest_templates/fisa_capitalcommitments.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest_templates/fisa_capitalcommitments.yml b/ingest_templates/fisa_capitalcommitments.yml index a1bc0b8af1..712a5195ca 100644 --- a/ingest_templates/fisa_capitalcommitments.yml +++ b/ingest_templates/fisa_capitalcommitments.yml @@ -10,7 +10,7 @@ ingestion: source: type: s3 bucket: edm-recipes - key: inbox/fisa_capitalcommitments/AICP_OREQ_CAPPLN_PJCP_20260218121103.asc + key: inbox/fisa_capitalcommitments/AICP_OREQ_CAPPLN_PJCP_20260218121103_fixed.asc file_format: type: csv encoding: ISO-8859-1 From d3959f1bc2b8bb96e41bbf8c8665bb5a86ad803d Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 24 Apr 2026 11:33:21 -0400 Subject: [PATCH 3/6] remove manual mapping for capital project 801SANDBCT --- products/cpdb/data/sprints.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/products/cpdb/data/sprints.sql b/products/cpdb/data/sprints.sql index 22dcaec590..9241fa21ce 100644 --- a/products/cpdb/data/sprints.sql +++ 
b/products/cpdb/data/sprints.sql @@ -4592,7 +4592,6 @@ INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND1106' INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND1108', '1009620100', NULL, 'AD Sprint', '0106000020E6100000010000000103000000010000004D00000015D4E4114C7E52C0249925ADB35E4440FD5E77114C7E52C0353FDFB0B35E4440A114F8DF4B7E52C00673D0EDB45E4440B9129F954B7E52C0BB85501FB65E44409F1F84334B7E52C000CFE140B75E44409F6918BB4A7E52C0619B424EB85E4440AC2E7B624A7E52C043494DE9B95E44403E3E932D4A7E52C04BA95492BB5E44403257441D4A7E52C068453342BD5E4440CA1080FA497E52C0F4E92C62C05E4440A2F707F3497E52C05F26070EC15E4440E715E6EB497E52C0DF4F1AC5C15E444054BEE3EB497E52C02C5E70C5C15E44406E618408487E52C0A8293AEDBF5E4440CBEB3D0F487E52C06C8A9140BF5E4440A9D96019487E52C0B2935657BE5E444034818524487E52C047FE7F74BD5E444016452D31487E52C0381E9E89BC5E444094C48B3D487E52C02DE2E5B5BB5E4440D3A37044487E52C0E8F67A46BB5E44407EB06B51487E52C0DF717A81BA5E444036CF21C5487E52C019227A1CB55E44405DF51306497E52C0B68157B8B25E4440BD4C7F80497E52C01199ECE6AE5E444065DA66AD497E52C087DFDAA8AD5E4440326428D0497E52C063DE03BEAC5E4440D411B0004A7E52C022C86E84AB5E4440E6F6FD3A4A7E52C0BF20871EAA5E4440B946C3754B7E52C0B0CA659DA35E444011D7EB3D4C7E52C0FE10781BA05E44401A3089524D7E52C0D8F6A6CB9B5E444069C1B75A4F7E52C093376FDA945E4440615148F6507E52C0586BB81E905E444061C0B150527E52C0BC48E2848C5E44409D11F676527E52C0F2E83F2F8C5E44401F50B190527E52C0B33AC9EA8B5E4440A09948AE527E52C0A61354878B5E4440A544CFC0527E52C0B68747388B5E4440C0778BCE527E52C0008C5DE48A5E4440012A29DB527E52C03981A77C8A5E44405D068EDF527E52C09D9AE9268A5E444063EA3CDF527E52C0B51660C5895E44400FE5FBE0527E52C02B30096C895E444036526FE7527E52C0ACD921FF885E4440D7F982F9527E52C0C138C178885E44407D325CFD527E52C04FB19E63885E4440E50DC527537E52C05546B3B7875E4440E6E2BF4F537E52C08FF98144875E444065A03BB6537E52C00D853D23865E4440AFC558E85C7E52C09BDC802E6C5E4440B3F1CA72627E52C08858B48A5C5E444056D12DBC627E52C0EA2DBDFC5B5E4440FD89EFCE627E52C01A542AE55B5E4440
385892FD627E52C0EE478EAA5B5E44406C105047637E52C06E3191685B5E44408BF400AC637E52C0C24DC0335B5E444089C7A21F647E52C0DBC68D1F5B5E444049372886647E52C042BA86305B5E4440EC81CAD2647E52C03500B2535B5E444043CEE016657E52C08514957F5B5E44400F276BC7687E52C0265687915E5E4440706E667A8D7E52C05263901A7D5E444079ED8296717E52C095C35DAFC95E44405C1DE6B1697E52C0301D6008C35E4440D08E2BA46C7E52C02A4E5CF1BA5E444061959539597E52C021740294AA5E4440FB0599BF5A7E52C0B5662B65A65E444065F4BC335A7E52C0B30A49EFA55E444090D48CBA517E52C01143FACA9E5E44405B233C80517E52C0BD60166B9F5E4440F9EDFEDC507E52C02A1E80E19E5E44404C1DF47E507E52C0B055B1E39F5E444058F7FB4E4F7E52C017B77CE39E5E4440FE4444AE4E7E52C04757D464A15E444084750C774E7E52C0A0438A5CA25E4440DCFF46804D7E52C006CF95AFA65E444015D4E4114C7E52C0249925ADB35E4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND6703', '1013730050', NULL, 'AD Sprint', '0106000020E61000000100000001030000000100000005000000C28A25F72D7C52C0E02BA79EBA624440D50101284D7C52C06269D09467624440691F3B6C657C52C04D29BA6C7D6244404943893B467C52C08C1CC676D0624440C28A25F72D7C52C0E02BA79EBA624440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND6706', '1013730050', NULL, 'AD Sprint', '0106000020E61000000100000001030000000100000005000000C28A25F72D7C52C0E02BA79EBA624440D50101284D7C52C06269D09467624440691F3B6C657C52C04D29BA6C7D6244404943893B467C52C08C1CC676D0624440C28A25F72D7C52C0E02BA79EBA624440'); -INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SANDBCT', '3052570045', NULL, 'AD Sprint', 
'0106000020E61000000100000001030000000100000010000000E8933364567E52C0909BEA62E953444071469547577E52C0E6246F02EC53444091890BB9577E52C05D25BB46ED53444015D08C23587E52C05D0BF77DEE53444099140C81587E52C008D32D8FEF534440CF8182E9587E52C07CD272C0F0534440EC891A46597E52C0B2FB05CFF153444078CB9FB0597E52C03BA04D06F3534440196AB40D5A7E52C077EA4B16F45344406317226D5A7E52C0C82A282DF55344408E65E0D05A7E52C08632A450F65344403CB3852D5B7E52C0CA52585FF753444057C9CAF55B7E52C0039B95A8F953444028690732547E52C0AA9562B5FF534440689F157A517E52C0A1083137ED534440E8933364567E52C0909BEA62E9534440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('841TFD503M24', NULL, NULL, 'DCP Sprint', '0106000020E6100000010000000103000000010000003700000098EB5B8E077F52C02A428843A45E444063EE5C0A227F52C08CAD3C93BA5E4440BBA5640C227F52C0B01AF494BA5E444063888E3F3E7F52C0AD291883D25E444053AFF4413E7F52C0A4932385D25E444082115D08427F52C08D4754C0D55E44402B5758444F7F52C09D1A2DE6E05E4440B7A4F9564F7F52C0A12068F6E05E4440EACCA431527F52C033996888E35E44408DB99C95567F52C0C1E15A2FE75E4440452FB497567F52C02BFC1931E75E4440829CC9818B7F52C084DD4C8A135F4440623D33FA8B7F52C03D861408145F44409DED2A608C7F52C0E45D2DA7145F4440AF86C5AF8C7F52C05E367A61155F444052E4F3E58C7F52C0F641D22F165F444021FDA0008D7F52C02E82470A175F4440D55EC6FE8C7F52C080CA74E8175F4440804476E08C7F52C0BD57D0C1185F44408AE3DAA68C7F52C076CFFF8D195F44403DF62A548C7F52C00B6D2A451A5F44408FF493EB8B7F52C00D3446E01A5F444069D11A718B7F52C03C305D591B5F4440E06F74E98A7F52C0101AC8AB1B5F44408255D7598A7F52C0AD1E5CD41B5F44401B61C8C7897F52C0DB088AD11B5F4440B47EE438897F52C089986DA31B5F4440DD6EA9B2887F52C0BF71CC4B1B5F4440862697C9537F52C046616FF3EE5E444083C0835B4F7F52C04C7A1444EB5E4440282C6D454F7F52C0986EF430EB5E44408CBE0E694C7F52C087C56B9DE85E4440E3A63D343F7F52C04E119B7DDD5E444058A6CF2F3F7F52C04B25D879DD5E4440758563683B7F52C002DDC83DDA5E44404C466C371F7F52C015648051C25E444018684CBC047F52C0DCE98702AC5E44400E7C977CEA7E52C0C3AC62E0955E4440DE506604EA7E52C0C45D2262955E44407743B59EE97E52C04564A
3C2945E444043C36C4FE97E52C075DE0608945E444001CB9819E97E52C0D9AC7839935E44408BE74AFFE87E52C0DCE4E85E925E444062DE8501E97E52C014B9BD80915E444037BC3320E97E52C0B8D780A7905E444038AD265AE97E52C00A6C8BDB8F5E4440B49624ADE97E52C01FFDB3248F5E4440FEFFFC15EA7E52C0F551018A8E5E4440ED72A890EA7E52C06D5165118E5E44404D1F7018EB7E52C0338682BF8D5E4440493B1CA8EB7E52C0D1857E978D5E44408F58273AEC7E52C091FBE29A8D5E444045B5F4C8EC7E52C0B6878EC98D5E44406472074FED7E52C0C507B6218E5E444098EB5B8E077F52C02A428843A45E4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('806BVIEW', '2028610001', NULL, 'AD Sprint', '0104000020E6100000010000000101000000420EAE76817A52C0D6AAC71C426C4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('841TFD503X21', NULL, NULL, 'DCP Sprint', '0106000020E610000001000000010300000001000000260000007A865E0E657A52C05935ED1D2C6A4440947FD2D0567A52C0F1A9F2875F6A4440F800288B567A52C0E903504B606A4440064EBD2D567A52C045D546F6606A4440A96D29BC557A52C0382D4582616A4440FEBEC93A557A52C0CDCAE9E9616A4440A00897AE547A52C07C0A3929626A4440F98EF41C547A52C08915C43D626A4440D4127B8B537A52C093D1C026626A4440B4C1C1FF527A52C0E6A411E5616A4440AA35277F527A52C017C33C7B616A4440CEA19C0E527A52C0A25753ED606A4440213375B2517A52C0AD82C940606A444028843B6E517A52C0D4B1407C5F6A4440E5C58E44517A52C0B66446A75E6A4440E7F40837517A52C01EDF09CA5D6A444020182F46517A52C0CBA30BED5C6A44408B246C71517A52C00AD0C9185C6A44406619A4AF5F7A52C00E4575AC286A44403E47F0B05F7A52C06061CDA7286A4440594E46436B7A52C036502064FF694440BA3082896B7A52C0C6DC3BA1FE69444075A46BE76B7A52C0C9EEE6F6FD694440ECC266596C7A52C0233BAD6BFD6944404C3612DB6C7A52C00571E804FD694440265172676D7A52C04E978BC6FC6944403D1422F96D7A52C05532FCB2FC6944406740888A6E7A52C04CB0FACAFC6944406C6A0E166F7A52C029049B0DFD6944400EF357966F7A52C0D7B74D78FD69444087C67606707A52C07A1CF906FE694440A1DC1B62707A52C0E2A021B4FE694440289CC1A5707A52C086C11F79FF694440EE80CECE707A52C0587F614E006A44403BAFAEDB707A52C06BD9B42B016A44404679E3CB707A52C0E56C9808026A4440C23D08
A0707A52C0E4218FDC026A44407A865E0E657A52C05935ED1D2C6A4440'); From 8306153a88f7ad9a397c0b1ab35b8daa39ed7690 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 24 Apr 2026 15:21:14 -0400 Subject: [PATCH 4/6] manually assign a project to a bin --- products/cpdb/seeds/dcp_id_bin_map.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/products/cpdb/seeds/dcp_id_bin_map.csv b/products/cpdb/seeds/dcp_id_bin_map.csv index a1ec62f91c..f74627a181 100644 --- a/products/cpdb/seeds/dcp_id_bin_map.csv +++ b/products/cpdb/seeds/dcp_id_bin_map.csv @@ -102,3 +102,4 @@ maprojid,bin 126PV176-REP,2116695 850PV176-WHC,2116709 846P-312DMRR,1075616 +801SANDBCT,3398756 From ab8791a2488f353f408faf8180a0d5ff2a03b39c Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Thu, 30 Apr 2026 13:43:57 -0400 Subject: [PATCH 5/6] manually assign a project to a bin --- products/cpdb/data/sprints.sql | 1 - products/cpdb/seeds/dcp_id_bin_map.csv | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/products/cpdb/data/sprints.sql b/products/cpdb/data/sprints.sql index 9241fa21ce..8e33059719 100644 --- a/products/cpdb/data/sprints.sql +++ b/products/cpdb/data/sprints.sql @@ -4570,7 +4570,6 @@ INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801RICONNECT INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801ROCKBLVD', NULL, NULL, 'DCP Sprint', '0104000020E61000000100000001010000003DB5856F387052C01D3A803AFC524440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('056PO7990LOC', '3024650100', NULL, 'AD Sprint', '0104000020E610000001000000010100000000000099D77C52C0C8EB567C675A4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('126PV256DIMM', '1010340022', NULL, 'AD Sprint', '0106000020E610000001000000010300000001000000050000004B4D5B65577F52C072765B1B1661444026391C535A7F52C0EEB477140E614440A8B341D8607F52C01D236F91136144405A6B83EA5D7F52C03DBE57981B6144404B4D5B65577F52C072765B1B16614440'); -INSERT INTO sprints 
(maprojid, bbl, bin, geomsource, geom) VALUES ('801SANDRBCT', '3052570045', NULL, 'AD Sprint', '0106000020E61000000100000001030000000100000010000000E8933364567E52C0909BEA62E953444071469547577E52C0E6246F02EC53444091890BB9577E52C05D25BB46ED53444015D08C23587E52C05D0BF77DEE53444099140C81587E52C008D32D8FEF534440CF8182E9587E52C07CD272C0F0534440EC891A46597E52C0B2FB05CFF153444078CB9FB0597E52C03BA04D06F3534440196AB40D5A7E52C077EA4B16F45344406317226D5A7E52C0C82A282DF55344408E65E0D05A7E52C08632A450F65344403CB3852D5B7E52C0CA52585FF753444057C9CAF55B7E52C0039B95A8F953444028690732547E52C0AA9562B5FF534440689F157A517E52C0A1083137ED534440E8933364567E52C0909BEA62E9534440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801WATERSIDE', '1009910035', NULL, 'AD Sprint', '0106000020E610000001000000010300000001000000200000001B7DC498527E52C0A6E1E8C0D45D444012D82C41537E52C046D3556AD45D4440BDA7CFFD537E52C044B2B0AED95D44404B2DD3B8547E52C0629970E7DE5D44408A599E99547E52C0A5A6EAB7DE5D4440F0799249547E52C02A97E636DE5D444070239D28547E52C05DF2BC04DE5D4440F109C833537E52C0C2595EC9DC5D4440211785A6527E52C056D2010FDC5D4440D45F9185527E52C0E81128EBDB5D44409F6AE65F527E52C05FA953C7DB5D4440D26BE1EE517E52C0868CA07FDB5D444047C0E8CD517E52C0EA414971DB5D44409F9A6F74517E52C063D99054DB5D44404B79ED3B517E52C0EFBC8E29DB5D44404FDFC705517E52C007969FECDA5D44401ECCC1DD507E52C08ACCB6AFDA5D4440DBA638C6507E52C02D55887DDA5D4440B04B7192507E52C09F9737EED95D4440FD2F6163507E52C05BDDE35ED95D4440B1852547507E52C0475792CFD85D44402D4B0B39507E52C043291839D85D4440E2662740507E52C0F86C6482D75D4440231CEF4B507E52C02A2F2737D75D444064A76F5C507E52C09C5E47FAD65D444021DB76A5507E52C0B9853D60D65D4440C2F9B9C1507E52C0E1301C2ED65D44409F4BB6E2507E52C0475D2303D65D444015EE180D517E52C0DAC9FDD0D55D44404BBA9F45517E52C0EC0ADB9ED55D44402C520FE1517E52C04E407733D55D44401B7DC498527E52C0A6E1E8C0D45D4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('81948201040', NULL, NULL, 'DCP Sprint', NULL); INSERT INTO sprints 
(maprojid, bbl, bin, geomsource, geom) VALUES ('81948201102', NULL, NULL, 'AD Sprint', NULL); diff --git a/products/cpdb/seeds/dcp_id_bin_map.csv b/products/cpdb/seeds/dcp_id_bin_map.csv index f74627a181..aaf534a69a 100644 --- a/products/cpdb/seeds/dcp_id_bin_map.csv +++ b/products/cpdb/seeds/dcp_id_bin_map.csv @@ -103,3 +103,4 @@ maprojid,bin 850PV176-WHC,2116709 846P-312DMRR,1075616 801SANDBCT,3398756 +801SANDRBCT,3398756 From 6f9dc2022f1475b3ef2370732a79f117dcb1b728 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 24 Apr 2026 15:21:36 -0400 Subject: [PATCH 6/6] add notebook to compare build files in S3 --- .../marimo/lifecycle/build_qa/s3_compare.py | 410 ++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 notebooks/marimo/lifecycle/build_qa/s3_compare.py diff --git a/notebooks/marimo/lifecycle/build_qa/s3_compare.py b/notebooks/marimo/lifecycle/build_qa/s3_compare.py new file mode 100644 index 0000000000..7cbdd76147 --- /dev/null +++ b/notebooks/marimo/lifecycle/build_qa/s3_compare.py @@ -0,0 +1,410 @@ +import marimo + +__generated_with = "0.23.1" +app = marimo.App(width="full") + +with app.setup: + import marimo as mo + import pandas as pd + + from dcpy.utils import s3 + + +@app.cell(hide_code=True) +def _(): + mo.md(r""" + # S3 Build QA + """) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(r""" + ## Directory Comparison + + Explore and compare files across two S3 paths (e.g. a published draft vs. a new build). + """) + return + + +@app.cell +def _(): + bucket_input = mo.ui.text(value="edm-publishing", label="Bucket") + path_a_input = mo.ui.text( + value="db-cpdb/draft/26prelim/2/", + label="Path A — baseline (e.g. 
published draft)", + full_width=True, + ) + path_b_input = mo.ui.text( + value="db-cpdb/build/dm-cpdb-26prelim/", + label="Path B — new build", + full_width=True, + ) + mo.vstack( + [ + bucket_input, + mo.hstack([path_a_input, path_b_input], gap=2), + ], + align="center", + ) + return bucket_input, path_a_input, path_b_input + + +@app.cell +def _(bucket_input, path_a_input, path_b_input): + def _fetch(prefix: str) -> pd.DataFrame: + objs = s3.list_objects(bucket_input.value, prefix) + if not objs: + return pd.DataFrame(columns=["filename", "size_bytes", "last_modified"]) + return pd.DataFrame( + [ + { + "filename": o["Key"].removeprefix(prefix), + "size_bytes": o["Size"], + "last_modified": o["LastModified"], + } + for o in objs + if not o["Key"].endswith("/") + ] + ) + + with mo.status.spinner(title="Fetching objects from S3…"): + df_a = _fetch(path_a_input.value) + df_b = _fetch(path_b_input.value) + return df_a, df_b + + +@app.cell +def _(df_a, df_b, path_a_input, path_b_input): + _merged = pd.merge( + df_a[["filename", "size_bytes"]].rename(columns={"size_bytes": "size_a"}), + df_b[["filename", "size_bytes"]].rename(columns={"size_bytes": "size_b"}), + on="filename", + how="outer", + indicator=True, + ) + _merged["in_a"] = _merged["_merge"].isin(["left_only", "both"]) + _merged["in_b"] = _merged["_merge"].isin(["right_only", "both"]) + _merged["size_diff_bytes"] = _merged["size_b"] - _merged["size_a"] + _merged["size_diff_pct"] = ( + _merged["size_diff_bytes"] / _merged["size_a"] * 100 + ).round(1) + comparison = _merged.drop(columns=["_merge"]) + + _n_only_a = comparison[~comparison["in_b"]].shape[0] + _n_only_b = comparison[~comparison["in_a"]].shape[0] + _n_both = comparison[comparison["in_a"] & comparison["in_b"]].shape[0] + + mo.md( + f""" + ## Summary + + | | Count | + |---|---| + | Files in **A** (`{path_a_input.value}`) | {len(df_a)} | + | Files in **B** (`{path_b_input.value}`) | {len(df_b)} | + | Only in A | {_n_only_a} | + | Only in B | {_n_only_b} | 
+ | In both | {_n_both} | + """ + ) + return (comparison,) + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Only in A — missing from new build + """) + return + + +@app.cell +def _(comparison): + _df = comparison[~comparison["in_b"]][["filename", "size_a"]].reset_index(drop=True) + mo.ui.table(_df, selection=None) if len(_df) else mo.md( + "_None — all files from A are present in B._" + ) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Only in B — new files + """) + return + + +@app.cell +def _(comparison): + _df = comparison[~comparison["in_a"]][["filename", "size_b"]].reset_index(drop=True) + mo.ui.table(_df, selection=None) if len(_df) else mo.md( + "_None — no new files in B._" + ) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Files in both — size comparison + """) + return + + +@app.cell +def _(comparison): + _df = ( + comparison[comparison["in_a"] & comparison["in_b"]][ + ["filename", "size_a", "size_b", "size_diff_bytes", "size_diff_pct"] + ] + .sort_values("size_diff_pct", key=abs, ascending=False, na_position="last") + .reset_index(drop=True) + ) + mo.ui.table(_df, selection=None) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Table Comparison + """) + return + + +@app.cell(hide_code=True) +def _(comparison): + _files_in_both = sorted( + comparison[comparison["in_a"] & comparison["in_b"]]["filename"].tolist() + ) + table_selector = mo.ui.dropdown( + options=_files_in_both, + label="Select a file to compare", + searchable=True, + ) + mo.vstack([table_selector], align="center") + return (table_selector,) + + +@app.cell(hide_code=True) +def _(bucket_input, path_a_input, path_b_input, table_selector): + from io import BytesIO + + def _load_file(prefix: str, filename: str) -> pd.DataFrame: + key = prefix + filename + body = s3.get_file(bucket_input.value, key) + data = BytesIO(body.read()) + ext = filename.rsplit(".", 1)[-1].lower() + if ext == "csv": + return pd.read_csv(data) + elif ext == 
"parquet": + return pd.read_parquet(data) + elif ext == "json": + return pd.read_json(data) + else: + raise ValueError(f"Unsupported file extension: .{ext}") + + if table_selector.value: + with mo.status.spinner(title=f"Loading {table_selector.value}…"): + tbl_a = _load_file(path_a_input.value, table_selector.value) + tbl_b = _load_file(path_b_input.value, table_selector.value) + else: + tbl_a = tbl_b = None + return tbl_a, tbl_b + + +@app.cell(hide_code=True) +def _(table_selector, tbl_a, tbl_b): + mo.stop( + tbl_a is None or tbl_b is None, + mo.md("_Select a file above to load and compare._"), + ) + _cols_only_a = sorted(set(tbl_a.columns) - set(tbl_b.columns)) + _cols_only_b = sorted(set(tbl_b.columns) - set(tbl_a.columns)) + _col_status = ( + "Column sets match." + if not _cols_only_a and not _cols_only_b + else f"Only in A: `{'`, `'.join(_cols_only_a) or '—'}` · Only in B: `{'`, `'.join(_cols_only_b) or '—'}`" + ) + mo.vstack( + [ + mo.md(f""" + | | A | B | + |---|---|---| + | Rows | {len(tbl_a):,} | {len(tbl_b):,} | + | Columns | {len(tbl_a.columns)} | {len(tbl_b.columns)} | + + **Columns:** {_col_status} + """), + mo.ui.tabs( + { + f"A — {table_selector.value}": mo.ui.table(tbl_a, selection=None), + f"B — {table_selector.value}": mo.ui.table(tbl_b, selection=None), + } + ), + ] + ) + return + + +@app.cell(hide_code=True) +def _(): + _df = mo.sql( + """ + select * from + """ + ) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Geospatial Comparison + """) + return + + +@app.cell +def _(): + import openlayers as ol + from openlayers.basemaps import Carto, CartoBasemapLayer + + return Carto, CartoBasemapLayer, ol + + +@app.cell(hide_code=True) +def _(comparison): + _zip_files = sorted( + comparison.loc[ + comparison["in_a"] + & comparison["in_b"] + & comparison["filename"].str.endswith(".zip"), + "filename", + ].tolist() + ) + mo.stop(not _zip_files, mo.md("_No shared `.zip` files found in both paths._")) + geo_selector = mo.ui.dropdown( + 
options=_zip_files, + label="Select a shapefile (.zip) to compare", + searchable=True, + ) + mo.vstack([geo_selector], align="center") + return (geo_selector,) + + +@app.cell(hide_code=True) +def _(bucket_input, geo_selector, path_a_input, path_b_input): + import os + import tempfile + + import geopandas as gpd + + mo.stop(geo_selector.value is None, mo.md("_Select a shapefile above._")) + + def _load_shapefile(prefix: str, filename: str) -> gpd.GeoDataFrame: + body = s3.get_file(bucket_input.value, prefix + filename) + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as _tmp: + _tmp.write(body.read()) + _tmp_path = _tmp.name + try: + _gdf = gpd.read_file(f"zip://{_tmp_path}") + finally: + os.unlink(_tmp_path) + return _gdf + + with mo.status.spinner(title=f"Loading {geo_selector.value} from both paths…"): + geo_gdf_a = ( + _load_shapefile(path_a_input.value, geo_selector.value) + .to_crs(4326) + .reset_index(drop=True) + ) + geo_gdf_b = ( + _load_shapefile(path_b_input.value, geo_selector.value) + .to_crs(4326) + .reset_index(drop=True) + ) + return geo_gdf_a, geo_gdf_b + + +@app.cell(hide_code=True) +def _(geo_gdf_a, geo_selector, path_a_input): + _geom_col = geo_gdf_a.geometry.name + _attr_cols = [c for c in geo_gdf_a.columns if c != _geom_col] + geo_table_a = mo.ui.table( + geo_gdf_a[_attr_cols], + selection="multi", + page_size=20, + label=f"A — {path_a_input.value}{geo_selector.value} ({len(geo_gdf_a):,} features) · check rows to highlight on map", + ) + geo_table_a + return (geo_table_a,) + + +@app.cell(hide_code=True) +def _(Carto, CartoBasemapLayer, geo_gdf_a, geo_table_a, ol): + _sel_idx = geo_table_a.value.index.tolist() + _plot_a = geo_gdf_a.loc[_sel_idx] if _sel_idx else geo_gdf_a + _style_a = ol.FlatStyle( + fill_color="rgba(70, 130, 180, 0.35)", + stroke_color="#4682b4", + stroke_width=2, + circle_radius=6, + circle_fill_color="rgba(70, 130, 180, 0.7)", + circle_stroke_color="#4682b4", + circle_stroke_width=1.5, + ) + _layer_a = 
_plot_a.ol.to_layer(style=_style_a, fit_bounds=False) + _map_a = ol.MapWidget( + view=ol.View(center=[0, 0], zoom=1), + layers=[CartoBasemapLayer(Carto.LIGHT_ALL), _layer_a], + ) + _map_a.fit_bounds(tuple(_plot_a.geometry.total_bounds)) + _map_a.add_tooltip() + _map_a + return + + +@app.cell(hide_code=True) +def _(geo_gdf_b, geo_selector, path_b_input): + _geom_col = geo_gdf_b.geometry.name + _attr_cols = [c for c in geo_gdf_b.columns if c != _geom_col] + geo_table_b = mo.ui.table( + geo_gdf_b[_attr_cols], + selection="multi", + page_size=20, + label=f"B — {path_b_input.value}{geo_selector.value} ({len(geo_gdf_b):,} features) · check rows to highlight on map", + ) + geo_table_b + return (geo_table_b,) + + +@app.cell(hide_code=True) +def _(Carto, CartoBasemapLayer, geo_gdf_b, geo_table_b, ol): + _sel_idx = geo_table_b.value.index.tolist() + _plot_b = geo_gdf_b.loc[_sel_idx] if _sel_idx else geo_gdf_b + _style_b = ol.FlatStyle( + fill_color="rgba(230, 57, 70, 0.35)", + stroke_color="#e63946", + stroke_width=2, + circle_radius=6, + circle_fill_color="rgba(230, 57, 70, 0.7)", + circle_stroke_color="#e63946", + circle_stroke_width=1.5, + ) + _layer_b = _plot_b.ol.to_layer(style=_style_b, fit_bounds=False) + _map_b = ol.MapWidget( + view=ol.View(center=[0, 0], zoom=1), + layers=[CartoBasemapLayer(Carto.LIGHT_ALL), _layer_b], + ) + _map_b.fit_bounds(tuple(_plot_b.geometry.total_bounds)) + _map_b.add_tooltip() + _map_b + return + + +if __name__ == "__main__": + app.run()