Skip to content

Commit 49471d9

Browse files
committed
Adding join on pandas dataframes and placekeying of pandas dataframes
1 parent bb1a9ac commit 49471d9

File tree

3 files changed

+125
-1
lines changed

3 files changed

+125
-1
lines changed

placekey/api.py

+94
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from json import JSONDecodeError
55

66
import backoff
7+
import pandas as pd
78
import requests
89
from ratelimit import limits, RateLimitException
910
from .general import _post_request_function
@@ -88,6 +89,26 @@ class PlacekeyAPI:
8889

8990
DEFAULT_QUERY_ID_PREFIX = "place_"
9091

92+
# Alternative combinations of query fields. _has_minimum_inputs accepts a
# set of inputs when every field of at least one combination is present.
MIN_INPUTS = [
    ['latitude', 'longitude'],
    ['street_address', 'city', 'region', 'postal_code'],
    ['street_address', 'region', 'postal_code'],
    ['street_address', 'region', 'city'],
]

# Column names that _join_pandas_df treats as obtainable through a placekey
# lookup when a DataFrame does not already carry the join column.
PLACEKEY_OUTPUTS = {
    "placekey",
    "address_placekey",
    "building_placekey",
    "gers",
    "confidence_score",
    "upi",
    "geoid",
    "parcel",
    "address_confidence_score",
}
111+
91112
def __init__(self, api_key=None, max_retries=DEFAULT_MAX_RETRIES, logger=log,
92113
user_agent_comment=None):
93114
self.api_key = api_key
@@ -120,6 +141,79 @@ def __init__(self, api_key=None, max_retries=DEFAULT_MAX_RETRIES, logger=log,
120141
calls=self.BULK_REQUEST_LIMIT,
121142
period=self.BULK_REQUEST_WINDOW,
122143
max_tries=self.max_retries)
144+
145+
def _has_minimum_inputs(self, user_inputs) -> bool:
    """
    Check whether the supplied field names cover at least one minimum-input combination.

    Args:
    :param user_inputs: Iterable of query field names, for example the keys of a column mapping.

    Returns:
    - bool: True when every field of at least one entry of MIN_INPUTS is present, False otherwise.
    """
    # Materialise once: callers pass dict views or arbitrary iterables, and
    # repeated membership tests would exhaust a one-shot iterator after the
    # first combination was checked.
    provided = set(user_inputs)
    return any(provided.issuperset(required) for required in self.MIN_INPUTS)
155+
156+
def _join_pandas_df(self, df1: pd.DataFrame, column_mapping_1: dict, df2: pd.DataFrame, column_mapping_2: dict, how: str = 'inner', on: str = "placekey", fields=None, batch_size=MAX_BATCH_SIZE, verbose=False):
    """
    Join two DataFrames on a column, looking that column up via placekey first for any frame that lacks it.

    Args:
    :param df1 (pd.DataFrame): The first (left) DataFrame.
    :param column_mapping_1 (dict): Column mapping for df1, passed to _placekey_pandas_df.
        Only used when df1 does not already contain the join column.
    :param df2 (pd.DataFrame): The second (right) DataFrame.
    :param column_mapping_2 (dict): Column mapping for df2, passed to _placekey_pandas_df.
        Only used when df2 does not already contain the join column.
    :param how: Merge strategy handed to pd.merge.
        Defaults to 'inner'
    :param on: Name of the column to join on.
        Defaults to "placekey"
    :param fields: Additional fields to request when a lookup is needed. The join column is
        added to the request automatically.
        Defaults to None
    :param batch_size: Batch size forwarded to _placekey_pandas_df.
    :param verbose: Verbosity flag forwarded to _placekey_pandas_df.
        Defaults to False

    Returns:
    - pd.DataFrame: The result of pd.merge on the (possibly placekey'd) frames.

    Raises:
    - ValueError: A frame lacks the join column and the column is not one of PLACEKEY_OUTPUTS.
    """
    # Copy so the caller's sequence is never mutated and tuples are accepted.
    requested = [] if fields is None else list(fields)
    # The lookup docstring describes `fields` as parameters other than
    # placekey, so "placekey" itself is not requested; any other join
    # column is requested exactly once.
    if on != "placekey" and on not in requested:
        requested.append(on)
    lookup_fields = requested or None

    frames = []
    for label, frame, mapping in (("first", df1, column_mapping_1),
                                  ("second", df2, column_mapping_2)):
        if on not in frame:
            if on not in self.PLACEKEY_OUTPUTS:
                raise ValueError("The {} dataset does not contain the join key {}".format(label, on))
            frame = self._placekey_pandas_df(frame, column_mapping=mapping, fields=lookup_fields, batch_size=batch_size, verbose=verbose, return_original_values=True)
        frames.append(frame)

    left, right = frames
    return pd.merge(left, right, how=how, on=on)
170+
171+
172+
def _placekey_pandas_df(self, df: pd.DataFrame, column_mapping: dict, fields=None, batch_size=MAX_BATCH_SIZE, verbose=False, return_original_values=True):
    """
    Look up placekeys for every row of a DataFrame and return a placekey'd DataFrame.

    Args:
    :param df (pd.DataFrame): The input DataFrame. It is left unmodified.
    :param column_mapping (dict): Maps placekey input field names to the names of the
        columns of df that hold those values. Mapped columns missing from df are skipped.
    :param fields: A list of requested parameters other than placekey. For example: address_placekey, building_placekey
        Defaults to None
    :param batch_size: Integer for the number of places to lookup in a single batch.
        Defaults to 100, and cannot exceed 100.
    :param verbose: Boolean for whether or not to log additional information.
        Defaults to False
    :param return_original_values: When True, the lookup results are merged onto the
        original columns. When False, only the lookup results are returned, with the
        query id in a 'temp_query_id' column.
        Defaults to True

    Returns:
    - pd.DataFrame: The DataFrame with new columns for placekey outputs

    Raises:
    - ValueError: column_mapping fails _validate_query, or its keys do not cover any
      minimum-input combination.
    """
    if not self._validate_query(column_mapping):
        raise ValueError(
            "Some queries contain keys other than: {}".format(self.QUERY_PARAMETERS))

    if not self._has_minimum_inputs(column_mapping.keys()):
        raise ValueError(
            "The inputted DataFrame doesn't have enough information. Refer to minimum inputs documentation here: https://docs.placekey.io/documentation/placekey-api/input-parameters/minimum-inputs")

    temp_query_id = 'temp_query_id'
    # Whether a mapped column exists does not change per row; decide it once.
    available = {field: column for field, column in column_mapping.items()
                 if column in df.columns}
    # Index labels only identify rows when they are unique. With duplicate
    # labels fall back to row positions so every row gets its own query id
    # and the merge below cannot fan out.
    use_labels = df.index.is_unique

    places = []
    query_ids = []
    for position, (label, row) in enumerate(df.iterrows()):
        # Missing values are left out of the query entirely.
        place = {field: row[column] for field, column in available.items()
                 if pd.notna(row[column])}
        query_id = self.DEFAULT_QUERY_ID_PREFIX + str(label if use_labels else position)
        place['query_id'] = query_id
        query_ids.append(query_id)
        places.append(place)

    result = self.lookup_placekeys(places=places, fields=fields, batch_size=batch_size, verbose=verbose)
    result_df = pd.DataFrame(result).rename(columns={"query_id": temp_query_id})

    if not return_original_values:
        return result_df

    # assign() builds a new frame, so the helper column never leaks into
    # the caller's DataFrame.
    keyed_df = df.assign(**{temp_query_id: query_ids})
    merged_df = pd.merge(keyed_df, result_df, how='inner', on=temp_query_id)
    return merged_df.drop([temp_query_id], axis=1)
216+
123217

124218
def lookup_placekey(self,
125219
fields=None,

placekey/tests/test_api.py

+30
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os
77
import random
88
import unittest
9+
import pandas as pd
910

1011
import pytest
1112

@@ -121,5 +122,34 @@ def test_lookup_placekeys_slow(self):
121122
self.assertEqual(len(results), num_samples)
122123
self.assertTrue(all(['placekey' in r for r in results]))
123124

125+
def test_pandas_placekey_and_join(self):
    """
    Placekey a small DataFrame, then join the result with an un-placekeyed copy.
    """
    # Two address-only rows and one coordinate-only row.
    df = pd.DataFrame({
        "address": ["1543 Mission Street, Floor 3", "598 Portola Dr", None],
        "city": ["San Francisco", "San Francisco", None],
        "region": ["CA", "CA", None],
        "postal": ["94105", "94131", None],
        "country": ["US", "US", None],
        "latitude": [None, None, 37.7371],
        "longitude": [None, None, -122.44283],
    })

    column_mappings = {
        "street_address": "address",
        "city": "city",
        "region": "region",
        "postal_code": "postal",
        "iso_country_code": "country",
        "latitude": "latitude",
        "longitude": "longitude",
    }

    df_with_placekeys = self.pk_api._placekey_pandas_df(
        df, column_mappings, fields=['address_placekey', 'address_confidence_score'])
    for column in ('address_placekey', 'address_confidence_score', 'placekey'):
        self.assertIn(column, df_with_placekeys.columns)

    # The right-hand frame has no address_placekey column, so the join has
    # to look it up before merging; shared columns then get _x/_y suffixes.
    double_join = self.pk_api._join_pandas_df(
        df_with_placekeys, {}, df.copy(deep=True), column_mappings, on='address_placekey')
    self.assertIn('city_x', double_join.columns)
    self.assertIn('city_y', double_join.columns)
153+
124154

125155

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def get_version():
2828
long_description_content_type="text/markdown",
2929
url="https://github.com/Placekey/placekey-py",
3030
packages=setuptools.find_packages(),
31-
install_requires=['h3>=4.2.1,<5', 'shapely', 'requests', 'ratelimit', 'backoff', 'boto3'],
31+
install_requires=['h3>=4.2.1,<5', 'shapely', 'requests', 'ratelimit', 'backoff', 'boto3', 'pandas'],
3232
classifiers=[
3333
"Programming Language :: Python :: 3",
3434
"License :: OSI Approved :: Apache Software License",

0 commit comments

Comments
 (0)