fedderw · fedderw · Mar 16, 2023 · Mar 14, 2023 · Mar 15, 2023 · Mar 16, 2023
diff --git a/streamlit_app.py → Home.py b/streamlit_app.py → Home.py
@@ -1,58 +1,55 @@
+# import os,sys
+# sys.path.append(os.getcwd())
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 import streamlit as st
 from datetime import datetime
-from viz import plot_ridership_average, map_bus_routes
+from app.viz import plot_ridership_average, map_bus_routes, plot_recovery_over_this_quarter, plot_bar_top_n_for_daterange
+from app.load_data import get_rides,get_rides_quarterly, get_route_linestrings
 import geopandas as gpd
 from streamlit_extras.badges import badge
-
-
+from streamlit_extras.dataframe_explorer import dataframe_explorer
 st.set_page_config(
 layout="wide", 
-page_title="MTA Bus Ridership"
+page_icon="🚌",
+page_title="Compare MTA Bus Routes by Ridership",
 )
 
-@st.cache_data
-def get_rides(file_path="data/mta_bus_ridership.parquet"):
-    """Get the MTA bus ridership data"""
-    rides = pd.read_parquet(file_path)
-    return rides[[ "route","date", "ridership_weekday",'ridership']]
 
-@st.cache_data
-def get_rides_quarterly(file_path="data/mta_bus_ridership_quarterly.parquet"):
-    """Get the MTA bus ridership data"""
-    rides = pd.read_parquet(file_path)
-    return rides[[ "route","date", "ridership_weekday",'quarter_year','ridership']]
 
 @st.cache_data
-def get_route_linestrings(file_path="data/mta_bus_route_linestring.geojson"):
-    """Get the MTA bus ridership data"""
-    gdf = gpd.read_file(file_path)
-    # The geometry column contains many multiline strings	, so we need to convert them to single linestrings
-
-    return gdf
-@st.cache_data
 def convert_df(df):
     # IMPORTANT: Cache the conversion to prevent computation on every rerun
     return df.to_csv().encode('utf-8')
 
 
 
 
-rides = get_rides_quarterly()
 route_linestrings = get_route_linestrings()
-csv = convert_df(rides)
 # Streamlit app
 st.title("MTA Bus Ridership")
 st.sidebar.title("TransitScope Baltimore")
+# The frequency selector is only created further down the sidebar, so
+# the default route ranking below is always computed from quarterly
+# data; `freq`, `rides` and `csv` are rebound once the choice is known.
+# A branch on `freq` here could never take the monthly path.
+freq = "Quarterly"
+rides = get_rides_quarterly()
+csv = convert_df(rides)
 # Get the top 5 routes from 2022, group by route number and sum the ridership
 top_5_routes = rides[rides["date"] >= datetime(2022, 1, 1)].groupby("route")["ridership"].sum().sort_values(ascending=False).head(5).reset_index()["route"].tolist()
 print(type(top_5_routes))
 route_numbers = st.sidebar.multiselect(
     "Select routes", list(rides["route"].unique()), default=top_5_routes,
 )
-
+freq = st.sidebar.selectbox("Choose frequency", ["Quarterly", "Monthly"])
+# Bind `rides` to the dataset for the chosen frequency: the quarterly
+# loader for "Quarterly", the monthly loader for any other choice.
+rides = get_rides_quarterly() if freq == "Quarterly" else get_rides()
+# Re-encode the export so `csv` always matches the frame now bound to
+# `rides`.
+csv = convert_df(rides)
 highlight_routes=st.sidebar.checkbox("Show unselected bus routes on map", value=False)
 
 if route_numbers:
@@ -66,6 +63,12 @@ def convert_df(df):
                                 end_date=datetime(2022, 12, 31), 
                                 y_axis_zero = True
                                 )
+    # Build the recovery figure for the routes picked in the sidebar
+    fig2 = plot_recovery_over_this_quarter(rides, 
+                                # User-selected routes (default: top 5 since 2022)
+                                route_numbers=route_numbers, )
+
+
     col1, col2 = st.columns([3,2])
     with col1:
         st.plotly_chart(
@@ -76,30 +79,47 @@ def convert_df(df):
             )
         y_axis_zero = st.sidebar.checkbox("Y-axis starts at 0", value=True)
 
+
     with col2:
-
+        # Add 3 blank lines 
+        st.markdown("### ")
+        st.markdown("### ")
+        st.markdown("### ")
         map_bus_routes(route_linestrings, route_numbers,highlight_routes=highlight_routes)
-    # st.markdown("### Ridership Data")
-    # dataframe = (rides[rides["route"].isin(route_numbers)])
-    # filtered_dataframe = dataframe_explorer(dataframe)
-    # st.dataframe(filtered_dataframe, use_container_width=True)
-    st.write("NOTE: This is quarterly data. The quarterly data is calculated by taking the sum of the total ridership in each quarter, and dividing it by the number of weekdays in that quarter.")
-    # Add a download link for the data
-    st.download_button(
-        label="Download full dataset as CSV",
-        data=csv,
-        file_name='mta_bus_ridership_quarterly.csv',
-        mime='text/csv',
-    )
-
+
+
+    # Recovery chart, followed by the date-range explorer for top routes
+    st.plotly_chart(fig2, use_container_width=True, )
+    st.markdown("### Explore the top routes over a date range")
+    start_date = st.date_input("Start date", datetime(2022, 1, 1))
+    end_date = st.date_input("End date", datetime(2022, 12, 31))
+
+    fig3 = plot_bar_top_n_for_daterange(rides,top_n=5,col='ridership',daterange=(start_date,end_date))
+    st.plotly_chart(fig3, use_container_width=True, )
+    # Caveats about how the data and the map were prepared
+
+    with st.expander("Data details"):
+        if freq == "Quarterly":  # this note would be false for the monthly dataset
+            st.write("NOTE: This is quarterly data. The quarterly data is calculated by taking the sum of the total ridership in each quarter, and dividing it by the number of weekdays in that quarter.")
+        st.write("The routes displayed on the map do not include the supplemental services that provide service to Baltimore City Schools. These riders **are** included in the ridership data.")
+        st.markdown(":red[Maps may not reflect service changes. These should be considered as a guide to the general service area only.]")
+
+
 
 else:
     # Show a message if no routes are selected
     st.warning("Please select at least one route.")
+
+
+
 badge(type="twitter", name="willfedder")
 badge(type="github", name="fedderw/transitscope-baltimore")
 st.sidebar.write("App created by [Will Fedder](https://linkedin.com/in/fedderw).")
 st.sidebar.write("Data provided by [MDOT MTA](https://www.arcgis.com/apps/dashboards/1bbc19f2abfe4fde94e4c563f5e8371c). To view a geographic system map in PDF format, visit the [MTA's website](https://s3.amazonaws.com/mta-website-staging/mta-website-staging/files/System%20Maps/Geographic_System_Map_08_2022.pdf).") 
 st.sidebar.write("Data extracted using this [script](https://github.com/jamespizzurro/mta-bus-ridership-scraper) authored by James Pizzurro.")
 
+# NOTE(review): sleeps 20 s, then reruns the script; no button is awaited
+import time
+time.sleep(20)
+st.experimental_rerun()
 
diff --git a/app/constants.py b/app/constants.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+
+data_raw_dir = Path("data/raw")
+data_dir = Path("data")
+
+# Hex colour code for each CityLink route name, plus an "Other" entry.
+CITYLINK_COLORS = {
+    "CityLink Red": "#FF0000",
+    "CityLink Blue": "#4169E1",
+    "CityLink Green": "#008000",
+    "CityLink Yellow": "#FFFF00",
+    "CityLink Brown": "#A52A2A",
+    "CityLink Orange": "#FF6600",
+    "CityLink Purple": "#800080",
+    "CityLink Pink": "#FFC0CB",
+    "CityLink Lime": "#00FF00",
+    # Navy shades tried earlier: #3A1078, #00337C, #13005A, #0F3460.
+    # Listed once: a dict literal keeps only the last value of a
+    # repeated key, and #486581 was the one in effect.
+    "CityLink Navy": "#486581",
+    "CityLink Silver": "#C0C0C0",
+    "CityLink Gold": "#FFD700",
+    "Other": "#FFFFFF",
+}
diff --git a/app/load_data.py b/app/load_data.py
@@ -0,0 +1,76 @@
+import pandas as pd
+import geopandas as gpd
+import streamlit as st
+import numpy as np
+from datetime import datetime
+
+
+def add_ridership_weekday_2019(
+    df: pd.DataFrame, freq: str = "quarter"
+) -> pd.DataFrame:
+    """Add each row's weekday ridership relative to the same period of 2019.
+
+    Args:
+        df (pd.DataFrame): Needs "route", "date" and "ridership_weekday"
+            columns. It is copied, never modified in place.
+        freq (str, optional): "quarter" groups by calendar quarter; any
+            other value groups by calendar month. Defaults to "quarter".
+
+    Returns:
+        pd.DataFrame: Copy of ``df`` plus ``<freq>``, ``year``,
+            ``ridership_weekday_2019`` and ``recovery_over_2019`` columns.
+    """
+    # Work on a copy so the caller's frame does not grow helper columns.
+    out = df.copy()
+    out["date"] = pd.to_datetime(out["date"])
+    dates = out["date"]
+    out[freq] = dates.dt.quarter if freq == "quarter" else dates.dt.month
+    out["year"] = dates.dt.year
+
+    # 2019 baseline: weekday ridership summed per route and period.
+    baseline = (
+        out.loc[out["year"] == 2019]
+        .groupby(["route", freq])[["ridership_weekday"]]
+        .sum()
+        .reset_index()
+    )
+
+    # Left merge keeps every row; rows without a 2019 baseline get NaN.
+    merged_df = out.merge(
+        baseline, on=["route", freq], how="left", suffixes=("", "_2019")
+    )
+    # Recovery is only defined from 2020 onwards; earlier rows stay NaN.
+    merged_df["recovery_over_2019"] = np.where(
+        merged_df["date"].dt.year < 2020,
+        np.nan,
+        merged_df["ridership_weekday"] / merged_df["ridership_weekday_2019"],
+    )
+    return merged_df
+
+
+cols = ["route", "date", "ridership_weekday", "ridership", ]  # NOTE(review): not referenced in this module; confirm use
+
+
+@st.cache_data
+def get_rides(file_path="data/mta_bus_ridership.parquet"):
+    """Load monthly ridership from parquet and add 2019 recovery columns."""
+    # Read the raw frame, then derive the month-level 2019 comparison.
+    monthly = pd.read_parquet(file_path)
+    return add_ridership_weekday_2019(monthly, freq="month")
+
+
+@st.cache_data
+def get_rides_quarterly(file_path="data/mta_bus_ridership_quarterly.parquet"):
+    """Load quarterly ridership from parquet and add 2019 recovery columns."""
+    # Read the raw frame, then derive the quarter-level 2019 comparison.
+    quarterly = pd.read_parquet(file_path)
+    return add_ridership_weekday_2019(quarterly, freq="quarter")
+
+
+@st.cache_data
+def get_route_linestrings(file_path="data/mta_bus_route_linestring.geojson"):
+    """Read the bus route geometries stored at ``file_path``."""
+    # NOTE: the frame is returned exactly as geopandas read it; any
+    # MultiLineString geometries are not split into single LineStrings
+    # here.
+    return gpd.read_file(file_path)