From cbbc725ebec55b6c238457e64c4a9362a10eaa6f Mon Sep 17 00:00:00 2001 From: Lilah Rosenfield Date: Thu, 14 Aug 2025 13:34:21 -0600 Subject: [PATCH 1/2] Test for Loops Initial commit, just includes testing for loops. Logic for fixing will be implemented in subsequent commits. --- gtfs_functions/aux_functions.py | 19 +++++++++++++++++++ gtfs_functions/gtfs_functions.py | 18 +++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/gtfs_functions/aux_functions.py b/gtfs_functions/aux_functions.py index 6db210c..1e960a4 100644 --- a/gtfs_functions/aux_functions.py +++ b/gtfs_functions/aux_functions.py @@ -72,6 +72,25 @@ def add_speed(speeds): return speeds +def detect_loops(df): + """ + Runs on a shape_stop df, grouped by patterns and warns on the number of + potential loops detected + """ + df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(1).fillna(0) + df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1) + df['prev_wrong'] = (df['cdsp_prev_stop'] > df['cut_distance_stop_point']) + df['next_wrong'] = (df['cdsp_next_stop'] < df['cut_distance_stop_point']) + #df['cdsp_next_ordered'] + if df['next_wrong'].any() or df['prev_wrong'].any(): + num_loops = (df['prev_wrong'].sum() + df['next_wrong'].sum())//2 + logging.warning(f"Identified {num_loops} loops in Pattern " + f"{df['shape_id'].iloc[0]} of Route " + f"{df['route_name'].iloc[0]}") + return False + else: + return True + def fix_outliers(speeds): # Calculate average speed to modify outliers diff --git a/gtfs_functions/gtfs_functions.py b/gtfs_functions/gtfs_functions.py index 122c4a2..453fa19 100644 --- a/gtfs_functions/gtfs_functions.py +++ b/gtfs_functions/gtfs_functions.py @@ -14,6 +14,7 @@ add_runtime, add_distance, add_speed, + detect_loops, code, fix_outliers, num_to_letters, @@ -52,6 +53,7 @@ def __init__( patterns: bool = True, start_date: str = None, end_date: str = None, + fix_loops: bool = False ): self._gtfs_path = gtfs_path @@ -61,6 +63,7 @@ def __init__( self._patterns = patterns self._start_date = start_date self._end_date = end_date + self._fix_loops = fix_loops self._dates = None self._routes_patterns = None self._trips_patterns = None @@ -871,6 +874,9 @@ def get_segments(self): """Splits each route's shape into stop-stop LineString called segments Returns the segment geometry as well as additional segment information + + If fix_loops is set to true on init, will attempt experimental + loop-resolution recursion. """ logging.info("Getting segments...") stop_times = self.stop_times @@ -886,9 +892,15 @@ def get_segments(self): .merge(shapes, on="shape_id", suffixes=("_stop", "_shape")) ) logging.info("Projecting stops onto shape...") - df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stop", "geometry_shape"]].apply( - lambda x: x[1].project(x[0], normalized=True), axis=1 - ) + df_shape_stop["cut_distance_stop_point"] = \ + df_shape_stop['geometry_shape'].project( + df_shape_stop['geometry_stop'], normalized=True + ) + df_shape_stop_group = df_shape_stop.groupby("shape_id") + test = df_shape_stop_group.apply(detect_loops) + if test.any(): + logging.warning("Detected looped routes! It is recommended you " \ + "re-initialize the feed with fix_loops enabled") logging.info("Interpolating stops onto shape...") df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( lambda x: x[0].interpolate(x[1], normalized=True), axis=1 From 54b50c19e7d04f4663da293c3776ca3b20c0e653 Mon Sep 17 00:00:00 2001 From: Lilah Rosenfield Date: Fri, 15 Aug 2025 15:41:28 -0600 Subject: [PATCH 2/2] Refactored get_segments() Implemented an alternative method for getting segments, hidden behind a flag because I'm not 100% confident in it. In most (in spec) cases, this allows for correct segmentation of loops. It's also more likely to provide more notice for unexpected behaviors vs the current implementation. Unexpected behaviors are just less predictable, unfortunately. --- .gitignore | 4 +- gtfs_functions/aux_functions.py | 122 +++++++++++++++++++++++++++++++ gtfs_functions/gtfs_functions.py | 100 +++++++++++++++++++++---- 3 files changed, 211 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 4340cdf..32474de 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ data example notebooks *.pyc -*.sh \ No newline at end of file +*.sh +Segments_Debugging.ipynb +segments.zip diff --git a/gtfs_functions/aux_functions.py b/gtfs_functions/aux_functions.py index 1e960a4..6c2cfab 100644 --- a/gtfs_functions/aux_functions.py +++ b/gtfs_functions/aux_functions.py @@ -4,6 +4,9 @@ import geopandas as gpd import logging import numpy as np +from shapely import Point +from shapely import LineString +from shapely.ops import substring def add_runtime(st): @@ -76,6 +79,8 @@ def detect_loops(df): """ Runs on a shape_stop df, grouped by patterns and warns on the number of potential loops detected + + Returns false if there are loops detected, True if not """ df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(1).fillna(0) df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1) @@ -91,6 +96,123 @@ def detect_loops(df): else: return True +def loops_fixer(df:pd.DataFrame): + """ + Fixes loops using a relaxaation algorithm + + Parameter: df, a gtfs combined stop & shape dataframe + Returns a corrected dataframe + + Lilah Rosenfield | Utah Transit Authority + lilah.rosenfield@rideuta.com + """ + df = df.copy() + df['sub_string'] = np.nan + df.loc[df.index[0], 'cut_distance_stop_point'] = 0 # First stop + df.loc[df.index[-1], 'cut_distance_stop_point'] = 1 # Last stop + df = _diff_and_reproject(df) + df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1) + df['end_stop_name'] = df['stop_name'].shift(-1) + df['end_stop_id'] = df['stop_id'].shift(-1) + df = df.iloc[:-1] + df['geometry'] = df.apply(_final_substring, axis=1) + return df + +def _diff_and_reproject(df:pd.DataFrame, + max_iterations:int = 50): + """ + Iterates outwards from identified discontinuities in the linestring from + the sequence, looping until it finds a fix. + + Returns a corrected df. + + My initial implementation of this was recursive, which I liked, but + this version uses a while loop, which is more memory efficient. + + Lilah Rosenfield | Utah Transit Authority + (with the support of a large pile of linear algebra) + lilah.rosenfield@rideuta.com + """ + df = df.sort_index(ascending=True) + iteration = 0 + offset = 1 + while iteration < max_iterations: + iteration += 1 + df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(offset).fillna(0) + df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-offset).fillna(1) + df['prev_wrong'] = (df['cdsp_prev_stop'] > df['cut_distance_stop_point']) + df['next_wrong'] = (df['cdsp_next_stop'] < df['cut_distance_stop_point']) + if not (df['next_wrong'].any() or df['prev_wrong'].any()): + break + df['prev_cdsp'] = df['cut_distance_stop_point'].copy() + + # Fix Pattern + df['sub_string'] = df[df['next_wrong'] | df['prev_wrong']].apply( + _substring_general, axis=1 + ) + df['substproj'] = df['sub_string'].project( + df['geometry_stop'], normalized = True) + + # Convert back to full line coordinates + df['substproj'] = ( + df['substproj']* + (df['cdsp_next_stop']-df['cdsp_prev_stop'])+ + df['cdsp_prev_stop']) + valid_update = ( + (df['substproj'] < df['cdsp_next_stop']) & + (df['substproj'] > df['cdsp_prev_stop']) + ) + df['cut_distance_stop_point'][valid_update] = df['substproj'] + tester = df['prev_cdsp'] != df['cut_distance_stop_point'] + if tester.any(): + logging.info("Made corrections, re-setting offset to 1") + offset=1 + else: + logging.info("No corrections make, increasing offset") + offset = offset+1 + if offset > len(df) // 2: + raise pd.errors.InvalidIndexError("Offset got too large!") + if iteration >= max_iterations: + logging.warning("Maximum iteration depth reached for shape id " + f"{df['shape_id'].iloc[0]}. You are likely to" + "encounter further errors") + return df + + +def _final_substring(ser:pd.Series): + subst = substring(ser['geometry_shape'], + ser['cut_distance_stop_point'], + ser['cdsp_next_stop'], + normalized = True) + if type(subst) == Point: + logging.warning(f"Segment between {ser['stop_name']} and " + f"{ser['end_stop_name']} on {ser['route_name']}" + "is a point! This is likely due to a out-of-spec " + "GTFS implementation! We're returning the segment with " + "an empty linestring instead.") + return LineString() + return subst + +def _substring_general(ser:pd.Series): + """ + Applied substringification function for _diff_and_reproject + """ + logging.info(f'Trying {ser['stop_name']} of line {ser['route_name']}') + subst = substring(ser['geometry_shape'], + ser['cdsp_prev_stop'], + ser['cdsp_next_stop'], + normalized=True) + if type(subst) == Point: + logging.warning("Got a point from the substring function!") + logging.warning("This is probably a single-stop deviation, " + "in which case you are safe to ignore.") + logging.warning("However, if the loop fix fails, " \ + "your feed may have a corner case") + return ser['geometry_shape'] + else: + return subst + + def fix_outliers(speeds): # Calculate average speed to modify outliers diff --git a/gtfs_functions/gtfs_functions.py b/gtfs_functions/gtfs_functions.py index 453fa19..42f1aaf 100644 --- a/gtfs_functions/gtfs_functions.py +++ b/gtfs_functions/gtfs_functions.py @@ -9,6 +9,7 @@ import pendulum as pl import hashlib from shapely.geometry import LineString, MultiPoint +from shapely.ops import substring from gtfs_functions.aux_functions import ( add_all_lines, add_runtime, @@ -16,6 +17,7 @@ add_speed, detect_loops, code, + loops_fixer, fix_outliers, num_to_letters, add_route_name, @@ -283,6 +285,21 @@ def dates_service_id(self): if self._dates_service_id is None: self._dates_service_id = self.get_dates_service_id() return self._dates_service_id + + @property + def loop_log(self): + """ + A dataframe logging changes that have been made if fix_loops is enabled + + Warns and returns false if disabled + """ + if self._fix_loops: + self.get_segments() + return self._loop_log.copy() + else: + logging.warning("fix_loops is set to false, " + "so loop_log does not exist!") + return False @trips.setter def trips(self, value): @@ -869,20 +886,26 @@ def get_lines_freq(self): line_frequencies = line_frequencies.loc[~line_frequencies.geometry.isnull(), keep_these] return line_frequencies + - def get_segments(self): - """Splits each route's shape into stop-stop LineString called segments + def _segments_helper(self): + """ + Splits each route's shape into stop-stop LineString called segments - Returns the segment geometry as well as additional segment information + This part just gets the segment geometry, and returns. Addtional + segment information is added in the main get_segments function. - If fix_loops is set to true on init, will attempt experimental - loop-resolution recursion. + Lilah Rosenfield | Utah Transit Authority + lilah.rosenfield@rideuta.com """ logging.info("Getting segments...") stop_times = self.stop_times shapes = self.shapes - req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry"] + # added route_pattern to handle the corner case two patterns can + # Have the same linestring but different stops + req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry", + "route_pattern"] add_columns = ["route_id", "route_name", "direction_id", "stop_name"] # merge stop_times and shapes to calculate cut distance and interpolated point @@ -896,17 +919,25 @@ def get_segments(self): df_shape_stop['geometry_shape'].project( df_shape_stop['geometry_stop'], normalized=True ) - df_shape_stop_group = df_shape_stop.groupby("shape_id") + df_shape_stop_group = df_shape_stop.groupby(["shape_id","route_pattern"]) test = df_shape_stop_group.apply(detect_loops) - if test.any(): - logging.warning("Detected looped routes! It is recommended you " \ + if not self._fix_loops and test.any(): + logging.warning("Detected looped routes! It is recommended you " "re-initialize the feed with fix_loops enabled") - logging.info("Interpolating stops onto shape...") - df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( - lambda x: x[0].interpolate(x[1], normalized=True), axis=1 - ) + segment_df = self._segments_helper_noloopfix(df_shape_stop) + if test.any() and self._fix_loops: + segment_df = self._segments_helper_with_loopfix(df_shape_stop_group, df_shape_stop) + + return segment_df + def _segments_helper_noloopfix(self, df_shape_stop:pd.DataFrame): + stop_times = self.stop_times + shapes = self.shapes + logging.info("Interpolating stops onto shape...") # calculate cut distance for + df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( + lambda x: x[0].interpolate(x[1], normalized=True), axis=1) + logging.info("Sorting shape points and stops...") df_shape = shapes[shapes.shape_id.isin(stop_times.shape_id.unique())] df_shape["list_of_points"] = df_shape.geometry.apply(lambda x: list(MultiPoint(x.coords).geoms)) @@ -969,7 +1000,47 @@ def get_segments(self): ], axis=1, inplace=True, - ) + ) + return segment_df + + def _segments_helper_with_loopfix(self, df_shape_stop_group, df_shape_stop): + old_cdsp = df_shape_stop["cut_distance_stop_point"] + fixed_loops = df_shape_stop_group.apply(loops_fixer) + df_shape_stop = fixed_loops.droplevel([0,1]) + + # Double checking may add a bit of inefficiency + # but we'll do it anyway + df_shape_stop_group = df_shape_stop.groupby(["shape_id","route_pattern"]) + logging.info("Finished fixing loops!") + test = df_shape_stop_group.apply(detect_loops) + if test.any(): + logging.warning("Warning we were not able to fix all loops!") + new_cdsp = df_shape_stop["cut_distance_stop_point"] + #changes_made = new_cdsp != old_cdsp + self._loop_log = pd.concat([ + df_shape_stop[['stop_id', 'stop_sequence']], + old_cdsp.rename('old_cut_distance'), + new_cdsp.rename('new_cut_distance'), + #changes_made.rename('changed') + ], axis=1) + df_shape_stop = df_shape_stop.drop( + ['cdsp_prev_stop','cdsp_next_stop','prev_wrong', + 'next_wrong','prev_cdsp','substproj', 'sub_string'], axis=1) + return df_shape_stop + + + def get_segments(self): + """Splits each route's shape into stop-stop LineString called segments + + Returns the segment geometry as well as additional segment information + + If fix_loops is set to true on init, will attempt experimental + loop-resolution loop. + """ + #stop_times = self.stop_times + #shapes = self.shapes + segment_gdf = self._segments_helper() + segment_gdf.crs = "EPSG:4326" # Add segment length in meters @@ -984,6 +1055,7 @@ def get_segments(self): "shape_id", "route_id", "route_name", + "route_pattern", "direction_id", "stop_sequence", "segment_name",