-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_index_md.py
131 lines (119 loc) · 5.39 KB
/
make_index_md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import xlrd
import glob
import os
import time
# Generate a Markdown file with links to the reports.
# Read all the XLS files in the reports dirrectory
# The XLS file contains these columns:
# EPA Facility ID, Facility Name, Facility Address, City, State, County, Zip, Facility DUNS, Latitude, Longitude, Chemical(s), NAICS Code(s)
# For each XLS file, get the state name from the filename, and use that as a second-level heading.
# Then, create a Markdown table with the following columns:
# EPA Facility ID, Facility Name, Facility Address, City, County, Zip, Facility DUNS, Latitude, Longitude, Chemical(s), NAICS Code(s)
# Create a link to the PDF report in the EPA Facility ID column, if the PDF file exists.
def state_name_to_directory_name(state: str) -> str:
return state.replace(" ", "_").lower()
def create_state_index_md(state: str, xls_file: str, data_dict: dict):
state_directory = state_name_to_directory_name(state)
full_state_directory = f"reports/{state_directory}"
if not os.path.exists(full_state_directory):
return ""
xls_creation_time = os.path.getctime(xls_file)
xls_creation_date = time.ctime(xls_creation_time)
book = xlrd.open_workbook(xls_file)
sheet = book.sheet_by_index(0)
with open(f"{full_state_directory}/index.md", "w") as f:
f.write(f"# {state}\n\n")
f.write(f"The state data was downloaded on ({xls_creation_date})\n\n")
n, completed = data_dict[state_directory]
percent_completed = float(completed) / n if n > 0 else 0
f.write(
f"Of the {n} reports, {completed} were downloaded ({percent_completed:.2%})\n\n"
)
f.write(
"| EPA Facility ID | Facility Name | Facility Address | City | County | Zip | Facility DUNS | Latitude | Longitude | Chemical(s) | NAICS Code(s) | Update Date |\n"
)
f.write(
"|------------------|---------------|------------------|------|--------|-----|--------------|----------|-----------|--------------|---------------|------------|\n"
)
for row in range(1, sheet.nrows):
row_data = sheet.row_values(row)
for row in range(1, sheet.nrows):
row_data = sheet.row_values(row)
facility_id = row_data[0]
facility_name = row_data[1]
facility_address = row_data[2]
city = row_data[3]
county = row_data[5]
zip = row_data[6]
facility_duns = row_data[7]
latitude = row_data[8]
longitude = row_data[9]
chemicals = row_data[10]
naics_codes = row_data[11]
pdf_file = f"reports/{facility_id}.pdf"
fac_creation_date = "N/A"
fac_text = facility_id
if os.path.exists(pdf_file):
fac_text = f" [{facility_id}](./{pdf_file})"
fac_creation_time = os.path.getctime(pdf_file)
fac_creation_date = time.ctime(fac_creation_time)
f.write(
f"| {fac_text} | {facility_name} | {facility_address} | {city} | {county} | {zip} | {facility_duns} | {latitude} | {longitude} | {chemicals} | {naics_codes} | {fac_creation_date}\n"
)
def count_completed_reports(
state: str, verbose: bool = False
) -> (str, int, int, float):
state_directory = state_name_to_directory_name(state)
full_state_directory = f"reports/{state_directory}"
if not os.path.exists(full_state_directory):
return 0
xls_file = f"reports/{state}.xls"
n = 0
completed = 0
book = xlrd.open_workbook(xls_file)
sheet = book.sheet_by_index(0)
for row in range(1, sheet.nrows):
n += 1
row_data = sheet.row_values(row)
facility_id = row_data[0]
pdf_file = f"{full_state_directory}/{facility_id}.pdf"
if os.path.exists(pdf_file):
completed += 1
else:
if verbose:
print(f"Missing report for {state} facility {facility_id}")
if n == 0:
return (state_directory, 0, 0, 0.0)
return (state_directory, completed, n)
def report_stats():
# get a list of 'xls' files from reports directory
xls_files = glob.glob("reports/*.xls")
xls_files.sort()
total = 0
total_completed = 0
data_dict = {}
for xls_file in xls_files:
state = os.path.basename(xls_file).split(".")[0]
directory = state_name_to_directory_name(state)
_, completed, n = count_completed_reports(state)
total += n
total_completed += completed
data_dict[directory] = (completed, n)
return data_dict, total, total_completed
if __name__ == "__main__":
# get a list of 'xlsx' files from reports directory
xls_files = glob.glob("reports/*.xls")
# sort by state name
xls_files.sort()
data_dict, total, total_completed = report_stats()
with open("index.md", "w") as f:
f.write("# EPA RMP Data\n\n")
f.write(
f"Of the {total} reports, {total_completed} were downloaded ({total_completed/total:.2%})\n\n"
)
for xls_file in xls_files:
state = os.path.basename(xls_file).split(".")[0]
print(f"Creating index for {state} ... ", end="", flush=True)
create_state_index_md(state, xls_file, data_dict)
f.write(f"- [{state}](./reports/{state}/index.md)\n")
print("done.")