Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ data
example
notebooks
*.pyc
*.sh
*.sh
Segments_Debugging.ipynb
segments.zip
141 changes: 141 additions & 0 deletions gtfs_functions/aux_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import geopandas as gpd
import logging
import numpy as np
from shapely import Point
from shapely import LineString
from shapely.ops import substring


def add_runtime(st):
Expand Down Expand Up @@ -72,6 +75,144 @@ def add_speed(speeds):

return speeds

def detect_loops(df):
"""
Runs on a shape_stop df, grouped by patterns and warns on the number of
potential loops detected

Returns false if there are loops detected, True if not
"""
df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(1).fillna(0)
df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1)
df['prev_wrong'] = (df['cdsp_prev_stop'] > df['cut_distance_stop_point'])
df['next_wrong'] = (df['cdsp_next_stop'] < df['cut_distance_stop_point'])
#df['cdsp_next_ordered']
if df['next_wrong'].any() or df['prev_wrong'].any():
num_loops = (df['prev_wrong'].sum() + df['next_wrong'].sum())//2
logging.warning(f"Identified {num_loops} loops in Pattern "
f"{df['shape_id'].iloc[0]} of Route "
f"{df['route_name'].iloc[0]}")
return False
else:
return True

def loops_fixer(df:pd.DataFrame):
"""
Fixes loops using a relaxaation algorithm

Parameter: df, a gtfs combined stop & shape dataframe
Returns a corrected dataframe

Lilah Rosenfield | Utah Transit Authority
[email protected]
"""
df = df.copy()
df['sub_string'] = np.nan
df.loc[df.index[0], 'cut_distance_stop_point'] = 0 # First stop
df.loc[df.index[-1], 'cut_distance_stop_point'] = 1 # Last stop
df = _diff_and_reproject(df)
df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1)
df['end_stop_name'] = df['stop_name'].shift(-1)
df['end_stop_id'] = df['stop_id'].shift(-1)
df = df.iloc[:-1]
df['geometry'] = df.apply(_final_substring, axis=1)
return df

def _diff_and_reproject(df:pd.DataFrame,
max_iterations:int = 50):
"""
Iterates outwards from identified discontinuities in the linestring from
the sequence, looping until it finds a fix.

Returns a corrected df.

My initial implementation of this was recursive, which I liked, but
this version uses a while loop, which is more memory efficient.

Lilah Rosenfield | Utah Transit Authority
(with the support of a large pile of linear algebra)
[email protected]
"""
df = df.sort_index(ascending=True)
iteration = 0
offset = 1
while iteration < max_iterations:
iteration += 1
df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(offset).fillna(0)
df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-offset).fillna(1)
df['prev_wrong'] = (df['cdsp_prev_stop'] > df['cut_distance_stop_point'])
df['next_wrong'] = (df['cdsp_next_stop'] < df['cut_distance_stop_point'])
if not (df['next_wrong'].any() or df['prev_wrong'].any()):
break
df['prev_cdsp'] = df['cut_distance_stop_point'].copy()

# Fix Pattern
df['sub_string'] = df[df['next_wrong'] | df['prev_wrong']].apply(
_substring_general, axis=1
)
df['substproj'] = df['sub_string'].project(
df['geometry_stop'], normalized = True)

# Convert back to full line coordinates
df['substproj'] = (
df['substproj']*
(df['cdsp_next_stop']-df['cdsp_prev_stop'])+
df['cdsp_prev_stop'])
valid_update = (
(df['substproj'] < df['cdsp_next_stop']) &
(df['substproj'] > df['cdsp_prev_stop'])
)
df['cut_distance_stop_point'][valid_update] = df['substproj']
tester = df['prev_cdsp'] != df['cut_distance_stop_point']
if tester.any():
logging.info("Made corrections, re-setting offset to 1")
offset=1
else:
logging.info("No corrections make, increasing offset")
offset = offset+1
if offset > len(df) // 2:
raise pd.errors.InvalidIndexError("Offset got too large!")
if iteration >= max_iterations:
logging.warning("Maximum iteration depth reached for shape id "
f"{df['shape_id'].iloc[0]}. You are likely to"
"encounter further errors")
return df


def _final_substring(ser:pd.Series):
subst = substring(ser['geometry_shape'],
ser['cut_distance_stop_point'],
ser['cdsp_next_stop'],
normalized = True)
if type(subst) == Point:
logging.warning(f"Segment between {ser['stop_name']} and "
f"{ser['end_stop_name']} on {ser['route_name']}"
"is a point! This is likely due to a out-of-spec "
"GTFS implementation! We're returning the segment with "
"an empty linestring instead.")
return LineString()
return subst

def _substring_general(ser:pd.Series):
"""
Applied substringification function for _diff_and_reproject
"""
logging.info(f'Trying {ser['stop_name']} of line {ser['route_name']}')
subst = substring(ser['geometry_shape'],
ser['cdsp_prev_stop'],
ser['cdsp_next_stop'],
normalized=True)
if type(subst) == Point:
logging.warning("Got a point from the substring function!")
logging.warning("This is probably a single-stop deviation, "
"in which case you are safe to ignore.")
logging.warning("However, if the loop fix fails, " \
"your feed may have a corner case")
return ser['geometry_shape']
else:
return subst



def fix_outliers(speeds):
# Calculate average speed to modify outliers
Expand Down
108 changes: 96 additions & 12 deletions gtfs_functions/gtfs_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
import pendulum as pl
import hashlib
from shapely.geometry import LineString, MultiPoint
from shapely.ops import substring
from gtfs_functions.aux_functions import (
add_all_lines,
add_runtime,
add_distance,
add_speed,
detect_loops,
code,
loops_fixer,
fix_outliers,
num_to_letters,
add_route_name,
Expand Down Expand Up @@ -52,6 +55,7 @@ def __init__(
patterns: bool = True,
start_date: str = None,
end_date: str = None,
fix_loops: bool = False
):

self._gtfs_path = gtfs_path
Expand All @@ -61,6 +65,7 @@ def __init__(
self._patterns = patterns
self._start_date = start_date
self._end_date = end_date
self._fix_loops = fix_loops
self._dates = None
self._routes_patterns = None
self._trips_patterns = None
Expand Down Expand Up @@ -280,6 +285,21 @@ def dates_service_id(self):
if self._dates_service_id is None:
self._dates_service_id = self.get_dates_service_id()
return self._dates_service_id

@property
def loop_log(self):
"""
A dataframe logging changes that have been made if fix_loops is enabled

Warns and returns false if disabled
"""
if self._fix_loops:
self.get_segments()
return self._loop_log.copy()
else:
logging.warning("fix_loops is set to false, "
"so loop_log does not exist!")
return False

@trips.setter
def trips(self, value):
Expand Down Expand Up @@ -866,17 +886,26 @@ def get_lines_freq(self):
line_frequencies = line_frequencies.loc[~line_frequencies.geometry.isnull(), keep_these]

return line_frequencies


def get_segments(self):
"""Splits each route's shape into stop-stop LineString called segments
def _segments_helper(self):
"""
Splits each route's shape into stop-stop LineString called segments

Returns the segment geometry as well as additional segment information
This part just gets the segment geometry, and returns. Addtional
segment information is added in the main get_segments function.

Lilah Rosenfield | Utah Transit Authority
[email protected]
"""
logging.info("Getting segments...")
stop_times = self.stop_times
shapes = self.shapes

req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry"]
# added route_pattern to handle the corner case two patterns can
# Have the same linestring but different stops
req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry",
"route_pattern"]
add_columns = ["route_id", "route_name", "direction_id", "stop_name"]

# merge stop_times and shapes to calculate cut distance and interpolated point
Expand All @@ -886,15 +915,29 @@ def get_segments(self):
.merge(shapes, on="shape_id", suffixes=("_stop", "_shape"))
)
logging.info("Projecting stops onto shape...")
df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stop", "geometry_shape"]].apply(
lambda x: x[1].project(x[0], normalized=True), axis=1
)
df_shape_stop["cut_distance_stop_point"] = \
df_shape_stop['geometry_shape'].project(
df_shape_stop['geometry_stop'], normalized=True
)
df_shape_stop_group = df_shape_stop.groupby(["shape_id","route_pattern"])
test = df_shape_stop_group.apply(detect_loops)
if not self._fix_loops and test.any():
logging.warning("Detected looped routes! It is recommended you "
"re-initialize the feed with fix_loops enabled")
segment_df = self._segments_helper_noloopfix(df_shape_stop)
if test.any() and self._fix_loops:
segment_df = self._segments_helper_with_loopfix(df_shape_stop_group, df_shape_stop)

return segment_df

def _segments_helper_noloopfix(self, df_shape_stop:pd.DataFrame):
stop_times = self.stop_times
shapes = self.shapes
logging.info("Interpolating stops onto shape...")
df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply(
lambda x: x[0].interpolate(x[1], normalized=True), axis=1
)

# calculate cut distance for
df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply(
lambda x: x[0].interpolate(x[1], normalized=True), axis=1)

logging.info("Sorting shape points and stops...")
df_shape = shapes[shapes.shape_id.isin(stop_times.shape_id.unique())]
df_shape["list_of_points"] = df_shape.geometry.apply(lambda x: list(MultiPoint(x.coords).geoms))
Expand Down Expand Up @@ -957,7 +1000,47 @@ def get_segments(self):
],
axis=1,
inplace=True,
)
)
return segment_df

def _segments_helper_with_loopfix(self, df_shape_stop_group, df_shape_stop):
old_cdsp = df_shape_stop["cut_distance_stop_point"]
fixed_loops = df_shape_stop_group.apply(loops_fixer)
df_shape_stop = fixed_loops.droplevel([0,1])

# Double checking may add a bit of inefficiency
# but we'll do it anyway
df_shape_stop_group = df_shape_stop.groupby(["shape_id","route_pattern"])
logging.info("Finished fixing loops!")
test = df_shape_stop_group.apply(detect_loops)
if test.any():
logging.warning("Warning we were not able to fix all loops!")
new_cdsp = df_shape_stop["cut_distance_stop_point"]
#changes_made = new_cdsp != old_cdsp
self._loop_log = pd.concat([
df_shape_stop[['stop_id', 'stop_sequence']],
old_cdsp.rename('old_cut_distance'),
new_cdsp.rename('new_cut_distance'),
#changes_made.rename('changed')
], axis=1)
df_shape_stop = df_shape_stop.drop(
['cdsp_prev_stop','cdsp_next_stop','prev_wrong',
'next_wrong','prev_cdsp','substproj', 'sub_string'], axis=1)
return df_shape_stop


def get_segments(self):
"""Splits each route's shape into stop-stop LineString called segments

Returns the segment geometry as well as additional segment information

If fix_loops is set to true on init, will attempt experimental
loop-resolution loop.
"""
#stop_times = self.stop_times
#shapes = self.shapes
segment_gdf = self._segments_helper()

segment_gdf.crs = "EPSG:4326"

# Add segment length in meters
Expand All @@ -972,6 +1055,7 @@ def get_segments(self):
"shape_id",
"route_id",
"route_name",
"route_pattern",
"direction_id",
"stop_sequence",
"segment_name",
Expand Down