slacgismo · bmeyers · Nov 12, 2025 · Nov 12, 2025 · Nov 13, 2025 · Nov 19, 2025
diff --git a/data/DEV_load_canadian_renewables_data.py b/data/DEV_load_canadian_renewables_data.py
@@ -0,0 +1,182 @@
+import marimo
+
+__generated_with = "0.17.8"
+app = marimo.App(width="full")
+
+
+@app.cell
+def _():
+    # Shared imports for this notebook. The names are returned so that other
+    # cells can receive them as parameters (e.g. the loader cell takes
+    # BytesIO, np, pd, pdfplumber and requests).
+    import marimo as mo
+    import numpy as np
+    import pandas as pd
+    import pdfplumber
+    import requests
+    from io import BytesIO
+    return BytesIO, mo, np, pd, pdfplumber, requests
+
+
+@app.function
+def levenshtein_distance(s1, s2):
+    m, n = len(s1), len(s2)
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        dp[i][0] = i
+    for j in range(n + 1):
+        dp[0][j] = j
+
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            cost = 0 if s1[i - 1] == s2[j - 1] else 1
+            dp[i][j] = min(dp[i - 1][j] + 1,  # Deletion
+                           dp[i][j - 1] + 1,  # Insertion
+                           dp[i - 1][j - 1] + cost) # Substitution
+
+    return dp[m][n]
+
+
+@app.cell
+def _(BytesIO, np, pd, pdfplumber, requests):
+    def load_canadian_renewables_data(pdf_location="https://renewablesassociation.ca/wp-content/uploads/2025/01/New-Project-List.pdf"):
+        tech_labels = ['Wind', 'Solar', 'Energy Storage', "Solar-Storage", "Wind-Storage"]
+        provinces = ['NL', 'PE' ,'NS', 'NB', 'QC', 'ON', 'MB', 'SK', 'AB', 'BC', 'YT', 'NT', 'NU']
+        pdf_response = requests.get(pdf_location)
+        def my_float(_x):
+            try:
+                _o = float(_x)
+            except ValueError:
+                _o = np.nan
+            return _o
+        def my_int(_x):
+            try:
+                _o = int(_x)
+            except ValueError:
+                _o = np.nan
+            return _o
+        all_tables = []
+        with BytesIO(pdf_response.content) as pdf_file:
+            with pdfplumber.open(pdf_file) as pdf:
+                for _page in pdf.pages:
+                    _tables = _page.extract_tables()
+                    all_tables.append(_tables)
+        columns = all_tables[0][0][0]
+        data = pd.DataFrame(columns=columns)
+        _ix = 0
+        for _p in range(len(all_tables)):
+            for _r in range(len(all_tables[_p][0])):
+                if _p == 0 and _r == 0:
+                    continue
+                else:
+                    _v = all_tables[_p][0][_r]
+                    _label = _v[1]
+                    _distances = [levenshtein_distance(_label, _t) for _t in tech_labels]
+                    _new_label = tech_labels[np.argmin(_distances)]
+                    _province = _v[2]
+                    _distances = [levenshtein_distance(_province, _t) for _t in provinces]
+                    _new_p = provinces[np.argmin(_distances)]
+                    data.loc[_ix] = [_v[0], _new_label, _new_p, my_int(_v[3]), my_float(_v[4]), 
+                                      my_float(_v[5]), my_float(_v[6]), 
+                                      my_float(_v[7]), _v[8]]
+                    _ix += 1
+        return data
+    return (load_canadian_renewables_data,)
+
+
+@app.cell
+def _(load_canadian_renewables_data):
+    # Run the loader with its default PDF URL and hand the resulting
+    # DataFrame to the cells that take `canadian_renewables`.
+    canadian_renewables = load_canadian_renewables_data()
+    return (canadian_renewables,)
+
+
+@app.cell
+def _(canadian_renewables):
+    # Bare expression so the loaded table is shown as this cell's output.
+    canadian_renewables
+    return
+
+
+@app.cell
+def _():
+    # URL of the project-list PDF used by the step-by-step cells below; it is
+    # the same string as the default argument of load_canadian_renewables_data.
+    pdf_location = "https://renewablesassociation.ca/wp-content/uploads/2025/01/New-Project-List.pdf"
+    return (pdf_location,)
+
+
+@app.cell
+def _(pdf_location, requests):
+    pdf_response = requests.get(pdf_location)
+    return (pdf_response,)
+
+
+@app.cell
+def _(BytesIO, np, pd, pdf_response, pdfplumber):
+    def my_float(_x):
+        try:
+            _o = float(_x)
+        except ValueError:
+            _o = np.nan
+        return _o
+    def my_int(_x):
+        try:
+            _o = int(_x)
+        except ValueError:
+            _o = np.nan
+        return _o
+    all_tables = []
+    with BytesIO(pdf_response.content) as pdf_file:
+        with pdfplumber.open(pdf_file) as pdf:
+            for _page in pdf.pages:
+                _tables = _page.extract_tables()
+                all_tables.append(_tables)
+    columns = all_tables[0][0][0]
+    data = pd.DataFrame(columns=columns)
+    _ix = 0
+    for _p in range(len(all_tables)):
+        for _r in range(len(all_tables[_p][0])):
+            if _p == 0 and _r == 0:
+                continue
+            else:
+                _v = all_tables[_p][0][_r]
+                data.loc[_ix] = [_v[0], _v[1], _v[2], my_int(_v[3]), my_float(_v[4]), 
+                                  my_float(_v[5]), my_float(_v[6]), 
+                                  my_float(_v[7]), _v[8]]
+                _ix += 1
+    return (data,)
+
+
+@app.cell
+def _():
+    # Empty cell: defines nothing and returns nothing.
+    return
+
+
+@app.cell
+def _(data):
+    # Bare expression so the raw (un-normalised) table is shown as output.
+    data
+    return
+
+
+@app.cell
+def _(data):
+    data.groupby('Technology')['Project Name'].count()
+    return
+
+
+@app.cell
+def _(data):
+    techs = list(set(data['Technology']))
+    techs
+    return (techs,)
+
+
+@app.cell
+def _(mo, np, techs):
+    targets = ['Wind', 'Solar', 'Energy Storage', "Solar-Storage", "Wind-Storage"]
+    test = techs[10]
+    distances = [levenshtein_distance(test, _t) for _t in targets]
+    _text = f"""
+    - test string: {test}
+    - closest match: {targets[np.argmin(distances)]}"""
+    mo.md(_text)
+    return
+
+
+if __name__ == "__main__":
+    # Run the marimo app when this file is executed as a script.
+    app.run()
diff --git a/data/README.md b/data/README.md
@@ -4,6 +4,8 @@ To update the GLM file, run the Makefile.
 
 # Description
 
+The original WECC 240 model files are in the `wecc240` subdirectory, and the main file is `wecc240/240busWECC_2018_PSS.raw`. The bus information is extracted in `wecc240_gis.csv`. The file `nodes.csv` contains the reduced set of nodes with unique locations, since many of the nodes in the original model are co-located at the same physical location (see `nodes.py` for the node reduction methodology).
+
 # Validation
 
 To review the load model, run the following marimo app:
@@ -13,9 +15,6 @@ To review the load model, run the following marimo app:
 Bob is the subject matter expert who can tell whether your solution is any good
 just by looking at it.
 
-# Notes
-
-The `nodes.csv` contains a list of all the WECC 240 bus model locations with duplicate locations removed (see `nodes.py` for node reduction methodology).
 
 # Data Sources
 1. `powerplants.csv`: https://hifld-geoplatform.hub.arcgis.com/datasets/9dd630378fcf439999094a56c352670d_0/explore

diff --git a/data/WECC_DG_solar_analysis.py b/data/WECC_DG_solar_analysis.py
@@ -0,0 +1,182 @@
+import marimo
+
+__generated_with = "0.17.8"
+app = marimo.App(width="full")
+
+
+@app.cell
+def _():
+    # Shared imports. Only np and pd are returned for use by other cells;
+    # mo is imported here but not returned.
+    import marimo as mo
+    import pandas as pd
+    import numpy as np
+    return np, pd
+
+
+@app.cell
+def _():
+    short_to_long = {
+        'NM': 'New Mexico', 
+        'CA': 'California', 
+        'WY': 'Wyoming', 
+        'OR': 'Oregon', 
+        'UT': 'Utah', 
+        'WA': 'Washington', 
+        'AZ': 'Arizona', 
+        'NV': 'Nevada', 
+        'ID': 'Idaho', 
+        'MT': 'Montana', 
+        'CO': 'Colorado'
+    }
+    long_to_short = {value: key for key, value in short_to_long.items()}
+    return (long_to_short,)
+
+
+@app.cell
+def _():
+    import requests
+    import geopandas as gpd
+    from shapely.geometry import Point
+    import json
+    import time
+
+    # Load US states shapefile. This is a sample URL; you can find others or download and point to your local file.
+    STATES_URL = "https://raw.githubusercontent.com/PublicaMundi/MappingAPI/refs/heads/master/data/geojson/us-states.json"
+
+    # Load GeoDataFrame with US states
+    states = gpd.read_file(STATES_URL)
+
+    def get_location(lat, lon):
+        point = Point(lon, lat)  # Note: (lon, lat) order is used for Point
+        state_found = states[states['geometry'].contains(point)]
+
+        if not state_found.empty:
+            # If a state is found, return the name of the state
+            return state_found.iloc[0]['name']  # Adjust this key based on the GeoJSON properties
+        else:
+            # If no state is found, call an API to get the country
+            return get_country(lat, lon)
+
+    def get_country(lat, lon):
+        # Using a free API for country lookup
+        response = requests.get(f'http://geocode.xyz/{lat},{lon}?json=1')
+        if response.status_code == 200:
+            data = response.content
+            response_json = json.loads(response.content.decode('utf-8'))
+            provence = response_json['prov']
+            if provence == 'Throttled! See geocode.xyz/pricing':
+                # oops, wait a second for free API to unlock...
+                time.sleep(1)
+                response = requests.get(f'http://geocode.xyz/{lat},{lon}?json=1')
+                response_json = json.loads(response.content.decode('utf-8'))
+                provence = response_json['prov']
+            if provence == 'MX':
+                country = 'Mexico'
+            elif provence == 'CA':
+                country = 'Canada'
+            elif provence == 'US':
+                # If in the US, return the state
+                country = response_json['statename']
+            else:
+                country = provence
+            return country
+        else:
+            return 'Unable to find country'
+    return (get_location,)
+
+
+@app.cell
+def _(get_location):
+    ## Test State/Country lookup
+    # Uncomment one of the other coordinate pairs to try a different point.
+    # lat = 51.5074  # Latitude for London
+    # lon = -0.1278  # Longitude for London
+    lat = 47.6321 # Spokane, WA
+    lon = -117.478965 # Spokane, WA
+    # NOTE(review): 12.35 N, 108.47 E looks like southern/central Vietnam
+    # rather than Hanoi (roughly 21.0 N, 105.9 E) - confirm the intended point.
+    # lat = 12.3548 # Hanoi, Vietnam
+    # lon = 108.4654 # Hanoi, Vietnam
+    location = get_location(lat, lon)
+    print(location)  
+    return
+
+
+@app.cell
+def _(pd):
+    wecc_dg_data = pd.read_csv('wecc_dg_solar.csv')
+    # numbers have commas which causes data to be read as strings instead of floats
+    for _col in wecc_dg_data.columns:
+        if _col not in ['State', 'Data Status']:
+            try:
+                wecc_dg_data[_col] = wecc_dg_data[_col].str.replace(',', '').astype(float)
+            except AttributeError:
+                pass
+    return (wecc_dg_data,)
+
+
+@app.cell
+def _(wecc_dg_data):
+    # Bare expression so the cleaned DG solar table is shown as output.
+    wecc_dg_data
+    return
+
+
+@app.cell
+def _(get_location, np, pd):
+    wecc_bus_summary = pd.read_csv('test_wecc240_2020m_gis.csv')
+    wecc_bus_summary['state'] = [get_location(_row['LAT'], _row['LON']) for _, _row in wecc_bus_summary.iterrows()]
+    wecc_bus_summary['load_fraction'] = 0.0
+    grouped = wecc_bus_summary[wecc_bus_summary['LOAD'] > 0].groupby('state')
+    for _state in set(wecc_bus_summary['state']):
+        load_frac = grouped.get_group(_state)['LOAD'] / np.sum(grouped.get_group(_state)['LOAD'])
+        wecc_bus_summary.loc[grouped.get_group(_state).index, 'load_fraction'] = load_frac
+    return (wecc_bus_summary,)
+
+
+@app.cell
+def _(wecc_bus_summary):
+    # Bare expression so the bus table with state and load_fraction is shown.
+    wecc_bus_summary
+    return
+
+
+@app.cell
+def _(long_to_short, pd, wecc_bus_summary, wecc_dg_data):
+    dataframe_list = []
+    grouped_dg = wecc_dg_data.groupby('State')
+    for _ix, _row in wecc_bus_summary.iterrows():
+        include = _row['load_fraction'] > 0 and _row['state'] not in ['Canada', 'Mexico']
+        if include:
+            new_df = pd.DataFrame(columns=['Year', 'Month', 'State', 'bus_id', 'bus_name', 'geohash', 'lat', 'lon', 'Capacity [MW]', 'Generation [MWh]'])
+            for _ix2, _row2 in grouped_dg.get_group(long_to_short[_row['state']]).iterrows():
+                _e = [_row2['Year'], _row2['Month'], _row2['State'], _row['BUS_I'], _row['NAME'], 
+                      _row['GEOHASH'], _row['LAT'], _row['LON'], 
+                      float(_row['load_fraction']) * float(_row2['Total Capacity (MW)']), 
+                      float(_row['load_fraction']) * float(_row2['Total Generation (MWh)'])]
+                new_df.loc[_ix2] = _e
+            dataframe_list.append(new_df)
+    return (dataframe_list,)
+
+
+@app.cell
+def _(dataframe_list):
+    # Bare expression so the list of per-bus DataFrames is shown as output.
+    dataframe_list
+    return
+
+
+@app.cell
+def _(dataframe_list, np, pd):
+    wecc_bus_dg_cap_and_gen_by_month = pd.concat(dataframe_list)
+    wecc_bus_dg_cap_and_gen_by_month.index = np.arange(len(wecc_bus_dg_cap_and_gen_by_month))
+    return (wecc_bus_dg_cap_and_gen_by_month,)
+
+
+@app.cell
+def _(wecc_bus_dg_cap_and_gen_by_month):
+    # Bare expression so the combined per-bus table is shown as output.
+    wecc_bus_dg_cap_and_gen_by_month
+    return
+
+
+@app.cell
+def _(wecc_bus_dg_cap_and_gen_by_month):
+    # Write the combined table to a CSV file in the current working
+    # directory. No `index` argument is passed, so pandas' default applies
+    # and the row index is written as the first column.
+    wecc_bus_dg_cap_and_gen_by_month.to_csv('wecc_bus_dg_cap_and_gen_by_month.csv')
+    return
+
+
+if __name__ == "__main__":
+    # Run the marimo app when this file is executed as a script.
+    app.run()
diff --git a/data/WECC_node_generation_mix_estimation.pptx b/data/WECC_node_generation_mix_estimation.pptx