forked from thfield/sf-planning-pipeline-report
-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #66 from nickolasteixeira/master
Incrementally improve data pipeline process
- Loading branch information
Showing
14 changed files
with
94,199 additions
and
85,988 deletions.
There are no files selected for viewing
1 change: 1 addition & 0 deletions
1
data/cleaned/San_Francisco_Development_Pipeline_2017_Quarter_2.json
Large diffs are not rendered by default.
Oops, something went wrong.
1 change: 1 addition & 0 deletions
1
data/cleaned/San_Francisco_Development_Pipeline_2017_Quarter_3.json
Large diffs are not rendered by default.
Oops, something went wrong.
6,970 changes: 3,551 additions & 3,419 deletions
6,970
data/cleaned/all_quarters__one_record_per_project.csv
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
83,973 changes: 42,866 additions & 41,107 deletions
83,973
data/cleaned/all_quarters_merged_PRECLEAN.csv
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
## Create a dictionary from the uniq_header_mappings file produced by previous runs.
def create_dict_object_from_uniq_headers(filename):
    """Read a two-column "key,value" mapping file into a dict.

    The first line of the file is the ``key,value`` header and is skipped.
    Each subsequent row maps a (possibly quoted) raw column header to its
    normalized name.

    :param filename: path to the mappings file
    :return: dict of raw header -> normalized header
    """
    # csv is stdlib; imported locally so this block stays self-contained.
    import csv

    uniq_header_dict = {}
    with open(filename, "r") as fd:
        reader = csv.reader(fd)
        next(reader, None)  # skip the key,value header row
        for row in reader:
            # csv.reader handles the surrounding quotes and embedded commas
            # that the previous manual split(',') mishandled.
            if len(row) >= 2:
                uniq_header_dict[row[0]] = row[1]
    return uniq_header_dict
|
||
|
||
def create_list_of_column_headers(filename):
    """Return the column headers from the first line of a raw CSV file.

    :param filename: CSV file name, resolved relative to the ./raw/ directory
    :return: list of header strings, trailing newline stripped from the last
    """
    # BUG FIX: the path previously did not interpolate the filename argument,
    # so the parameter was silently ignored.
    path = f'./raw/{filename}'
    with open(path, "r") as fd:
        new_headers = fd.readline().split(",")
    # strip the \n from the last header column
    new_headers[-1] = new_headers[-1].replace("\n", "")
    return new_headers
|
||
|
||
def create_new_unique_header_mappings(headers, unique_headers_dict, filename):
    """Record any headers missing from unique_headers_dict.

    Headers not yet known are mapped to their lowercased form, appended to
    the mappings file, and added to the in-memory dict so later lookups
    (e.g. by create_new_columnnames_file) see them.

    :param headers: list of raw column headers from the new CSV
    :param unique_headers_dict: existing raw -> normalized header mapping
                                (mutated in place when new headers appear)
    :param filename: path of the mappings file to append to
    """
    new_unique_headers = {}
    for item in headers:
        # BUG FIX: the original tested the module-level global
        # `uniq_headers_dict` instead of this parameter, which only worked
        # by accident when run as a script.
        if item not in unique_headers_dict:
            new_unique_headers[item] = item.lower()

    # if a new item is added, persist it and update the in-memory mapping
    if new_unique_headers:
        with open(filename, "a") as fd:
            for key, value in new_unique_headers.items():
                print(f'Adding new header: {key}')
                fd.write(f'"{key}",{value}\n')
        # Previously the dict was never updated, so downstream lookups for
        # brand-new headers returned None.
        unique_headers_dict.update(new_unique_headers)
|
||
|
||
def create_new_columnnames_filename(filename):
    """Derive the per-quarter column-names file name from a CSV file name.

    Takes the third-from-last, second-from-last (first character only), and
    last underscore-separated parts of the stem, e.g.
    "San_Francisco_Development_Pipeline_2017_Quarter_3.csv" -> "2017Q3.txt".
    """
    stem = filename.split(".")[0]
    parts = stem.split("_")
    year, quarter_word, number = parts[-3], parts[-2], parts[-1]
    return f'{year}{quarter_word[0]}{number}.txt'
|
||
|
||
def create_new_columnnames_file(filename, new_headers, uniq_headers_dict):
    """Write a key,value column-name mapping file under ./raw/columnnames/.

    Each raw header in new_headers is written as a quoted key with its
    normalized name from uniq_headers_dict (the literal ``None`` is written
    when a header has no mapping).

    :param filename: output file name (placed under ./raw/columnnames/)
    :param new_headers: list of raw column headers to write
    :param uniq_headers_dict: raw -> normalized header mapping
    """
    # BUG FIX: both f-strings previously failed to interpolate, producing a
    # literal, useless path and log message.
    path = f'./raw/columnnames/{filename}'
    with open(path, "w") as fd:
        fd.write('key,value\n')
        for item in new_headers:
            fd.write(f'"{item}",{uniq_headers_dict.get(item)}\n')

    print(f'Created file: {path}')
|
||
if __name__ == "__main__":
    # Load the accumulated raw -> normalized header mapping from disk.
    uniq_header_mappings_file_name = "./uniq_header_mappings.txt"
    uniq_headers_dict = create_dict_object_from_uniq_headers(uniq_header_mappings_file_name)

    # Read the column headers of the new quarterly CSV to register.
    new_csv_to_parse = "San_Francisco_Development_Pipeline_2017_Quarter_3.csv"
    new_headers = create_list_of_column_headers(new_csv_to_parse)

    # Append any previously-unseen headers to the master mappings file.
    create_new_unique_header_mappings(new_headers, uniq_headers_dict, uniq_header_mappings_file_name)

    # Emit the per-quarter column-names file (e.g. 2017Q3.txt).
    new_file_name = create_new_columnnames_filename(new_csv_to_parse)
    create_new_columnnames_file(new_file_name, new_headers, uniq_headers_dict)
Oops, something went wrong.