Skip to content

Commit

Permalink
Merge pull request #66 from nickolasteixeira/master
Browse files Browse the repository at this point in the history
Incrementally improve data pipeline process
  • Loading branch information
RocioSNg authored Mar 5, 2020
2 parents 50a5ad6 + 330efe2 commit a07f948
Show file tree
Hide file tree
Showing 14 changed files with 94,199 additions and 85,988 deletions.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

6,970 changes: 3,551 additions & 3,419 deletions data/cleaned/all_quarters__one_record_per_project.csv

Large diffs are not rendered by default.

83,973 changes: 42,866 additions & 41,107 deletions data/cleaned/all_quarters_merged.csv

Large diffs are not rendered by default.

83,973 changes: 42,866 additions & 41,107 deletions data/cleaned/all_quarters_merged_PRECLEAN.csv

Large diffs are not rendered by default.

174 changes: 90 additions & 84 deletions data/cleaner.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,8 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
Expand All @@ -29,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 65,
"metadata": {
"collapsed": true
},
Expand All @@ -47,10 +45,8 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"#Define Functions\n",
Expand Down Expand Up @@ -122,11 +118,13 @@
" Use advanced date parsing (python-dateutil) to parse\n",
" the first 10 chars of each line.\n",
" \"\"\"\n",
" try:\n",
" return dateutil.parser.parse(s.strip()[:10]).strftime(\"%Y-%m-%d\")\n",
" except Exception as e:\n",
" logging.exception(\"Date formatting failed for {}\".format(s))\n",
" return s\n",
" if len(s) > 0:\n",
" try:\n",
" return dateutil.parser.parse(s.strip()[:10]).strftime(\"%Y-%m-%d\")\n",
" except Exception as e:\n",
" logging.exception(\"Date formatting failed for {}\".format(s))\n",
" return s\n",
" return s\n",
" \n",
"def get_coords_tuple_from_address_lat_long_glob(s):\n",
" \"\"\"\n",
Expand All @@ -135,25 +133,32 @@
"\n",
" Returns: tuple\n",
" \"\"\"\n",
" lat_long_tuple = s.split('\\n')[-1]\n",
" # Dirty hack, the lat long tuple happens to be valid python syntax so.. YOLO\n",
" return eval(lat_long_tuple)\n",
"\n",
" if len(s)>0:\n",
" lat_long_tuple = s.split('\\n')[-1]\n",
" # Dirty hack, the lat long tuple happens to be valid python syntax so.. YOLO\n",
" return eval(lat_long_tuple)\n",
" else:\n",
" return s\n",
"\n",
"def get_lat_from_glob(s):\n",
" try:\n",
" return get_coords_tuple_from_address_lat_long_glob(s)[0]\n",
" except Exception as e:\n",
" logging.exception(\"Lat long glob parsing failed for {}\".format(s))\n",
" if len(s) > 0:\n",
" try:\n",
" return get_coords_tuple_from_address_lat_long_glob(s)[0]\n",
" except Exception as e:\n",
" logging.exception(\"Lat long glob parsing failed for {}\".format(s))\n",
" return np.nan\n",
" else:\n",
" return np.nan\n",
"\n",
"\n",
"def get_long_from_glob(s):\n",
" try:\n",
" return get_coords_tuple_from_address_lat_long_glob(s)[1]\n",
" except Exception as e:\n",
" logging.exception(\"Lat long glob parsing failed for {}\".format(s))\n",
" return np.nan\n",
" if len(s) > 0:\n",
" try:\n",
" return get_coords_tuple_from_address_lat_long_glob(s)[1]\n",
" except Exception as e:\n",
" logging.exception(\"Lat long glob parsing failed for {}\".format(s))\n",
" return np.nan\n",
" else:\n",
" return np.nan\n",
" \n",
"def get_address_from_glob(s):\n",
" return s.split('\\n')[0]\n",
Expand Down Expand Up @@ -226,97 +231,98 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 67,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading raw/San_Francisco_Development_Pipeline_2009_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2009_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2009_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2009_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2015_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2017_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2017_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2015_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2015_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2015_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2017_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2017_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2015_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2015_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2011_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2011_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2013_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2013_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2016_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2016_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2016_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2016_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2009_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2009_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2016_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2016_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2016_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2016_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2014_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2014_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_3.json\n",
"loading raw/San_Francisco_Development_Pipeline_2012_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2012_Quarter_1.json\n",
"loading raw/San_Francisco_Development_Pipeline_2010_Quarter_2.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2010_Quarter_2.json\n",
"loading raw/San_Francisco_Development_Pipeline_2016_Quarter_4.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2016_Quarter_4.json\n",
"loading raw/San_Francisco_Development_Pipeline_2017_Quarter_1.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2017_Quarter_1.json\n"
"loading raw/San_Francisco_Development_Pipeline_2009_Quarter_3.csv\n",
"writing cleaned/San_Francisco_Development_Pipeline_2009_Quarter_3.json\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/briangoggin/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:43: SettingWithCopyWarning: \n",
"/Users/RSW/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:43: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/Users/briangoggin/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: \n",
"/Users/RSW/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/Users/briangoggin/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:53: SettingWithCopyWarning: \n",
"/Users/RSW/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:53: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
"/Users/briangoggin/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:55: SettingWithCopyWarning: \n",
"/Users/RSW/anaconda3/lib/python3.6/site-packages/ipykernel/__main__.py:55: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n"
Expand Down Expand Up @@ -368,9 +374,9 @@
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [Root]",
"display_name": "Python [conda root]",
"language": "python",
"name": "Python [Root]"
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -382,9 +388,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}
74 changes: 74 additions & 0 deletions data/create_column_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
## Create a dictionary from the uniq_header_mappings file produced by previous runs
def create_dict_object_from_uniq_headers(filename):
    """Return a dict mapping raw CSV headers to their normalized names.

    Reads *filename*, a "key,value" mappings file, skipping the header
    row. Double quotes are stripped from keys, and trailing newlines are
    stripped from both keys and values.
    """
    header_map = {}
    with open(filename, "r") as fd:
        fd.readline()  # skip the leading "key,value" row
        for raw_line in fd:
            fields = raw_line.split(',')
            key = fields[0].replace("\"", "").replace("\n", "")
            value = fields[1].replace("\n", "")
            header_map[key] = value
    return header_map


def create_list_of_column_headers(filename):
    """Return the column headers from the first line of ./raw/<filename>.

    The header row is split on commas and the trailing newline is removed
    from the last column name.
    """
    # Bug fix: the path previously contained a broken placeholder instead of
    # interpolating *filename*, so the argument was never used.
    path = f'./raw/{filename}'
    with open(path, "r") as fd:
        new_headers = fd.readline().split(",")
    # strip the \n from the last header column
    new_headers[-1] = new_headers[-1].replace("\n", "")
    return new_headers


def create_new_unique_header_mappings(headers, unique_headers_dict, filename):
    """Append mappings for previously-unseen headers to the mappings file.

    Args:
        headers: list of raw header strings found in a new CSV.
        unique_headers_dict: existing {raw header: normalized} mappings;
            updated in place with any new headers so callers see them too.
        filename: path of the mappings file to append to.
    """
    # Bug fix: the original body read the module-level global
    # `uniq_headers_dict` instead of the `unique_headers_dict` parameter,
    # making the function unusable with any other dict.
    new_unique_headers = {}
    for item in headers:
        if item not in unique_headers_dict:
            new_unique_headers[item] = item.lower()

    # if a new item is added to the unique_header_dict, then update the file
    if len(new_unique_headers) > 0:
        with open(filename, "a") as fd:
            for idx in new_unique_headers.keys():
                print(f'Adding new header: {idx}')
                fd.write(f'"{idx}",{new_unique_headers[idx]}\n')
        # Bug fix: also record the new mappings in memory; previously they
        # were written to disk only, so a subsequent
        # create_new_columnnames_file call emitted "None" for them.
        unique_headers_dict.update(new_unique_headers)


def create_new_columnnames_filename(filename):
    """Derive a "YYYYQ#.txt" name from a pipeline CSV filename.

    Example: "San_Francisco_Development_Pipeline_2017_Quarter_3.csv"
    becomes "2017Q3.txt". Uses the last three underscore-separated tokens
    of the stem: the year, the first letter of the quarter word, and the
    quarter number.
    """
    stem_tokens = filename.split(".")[0].split("_")
    year = stem_tokens[-3]
    quarter_letter = stem_tokens[-2][0]
    quarter_number = stem_tokens[-1]
    return f'{year}{quarter_letter}{quarter_number}.txt'


def create_new_columnnames_file(filename, new_headers, uniq_headers_dict):
    """Write a "key,value" column-name mapping file to ./raw/columnnames/.

    Args:
        filename: name of the output file (e.g. "2017Q3.txt").
        new_headers: raw header strings to write, in order.
        uniq_headers_dict: {raw header: normalized} lookup; headers missing
            from it are written with the value "None".
    """
    # Bug fix: path and log message previously contained a broken
    # placeholder instead of interpolating *filename*.
    path = f'./raw/columnnames/{filename}'
    with open(path, "w") as fd:
        fd.write('key,value\n')
        for item in new_headers:
            fd.write(f'"{item}",{uniq_headers_dict.get(item)}\n')

    print(f'Created file: {filename}')

# Script entry point: read the existing header mappings, scan one new quarterly
# CSV, append any new header mappings, and emit a per-quarter columnnames file.
if __name__ == "__main__":
    uniq_header_mappings_file_name = "./uniq_header_mappings.txt"
    # Load the accumulated {raw header: normalized header} mappings.
    # NOTE(review): create_new_unique_header_mappings reads the global name
    # `uniq_headers_dict` rather than its parameter — do not rename this
    # variable without fixing that function.
    uniq_headers_dict = create_dict_object_from_uniq_headers(uniq_header_mappings_file_name)

    # The new quarterly CSV to process (expected under ./raw/).
    new_csv_to_parse = "San_Francisco_Development_Pipeline_2017_Quarter_3.csv"
    # Read the header row of the new CSV.
    new_headers = create_list_of_column_headers(new_csv_to_parse)

    # Append any headers not already in the mappings file.
    create_new_unique_header_mappings(new_headers, uniq_headers_dict, uniq_header_mappings_file_name)
    # Derive the per-quarter output name, e.g. "2017Q3.txt".
    new_file_name = create_new_columnnames_filename(new_csv_to_parse)

    # Write the columnnames file for this quarter under ./raw/columnnames/.
    create_new_columnnames_file(new_file_name, new_headers, uniq_headers_dict)
Loading

0 comments on commit a07f948

Please sign in to comment.