-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxlsx_to_sqlite.py
146 lines (134 loc) · 4.1 KB
/
xlsx_to_sqlite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import pandas as pd
import sqlite3
# pd.options.display.max_columns = None
# pd.options.display.max_rows = None
# pd.options.display.width = 1000
# https://stackoverflow.com/a/61473956/4656035
conn = sqlite3.connect("data/nsorg.sqlite3")
df1 = pd.read_excel('data/public_records_request/Grant Application.xlsx', sheet_name=0)
print(df1.head())
print('----------------------')
# print(df1.columns.tolist())
# print(df1.loc[0, :])
# Drop row 2
df1.drop(df1.index[0], inplace=True)
# Provide new column names to be database friendly
df1.columns = [
"ID",
"OrgName",
"PhysicalAddress",
"MailingAddress",
"Website",
"SocialMediaAccounts",
"Name",
"Title",
"EmailAddress",
"Phone",
"Team",
"TeamExplanation",
"OrganizationalChart",
"OtherCompletedProjects",
"ProposalTitle",
"TotalBudget",
"LB1024GrantFundingRequest",
"ProposalType",
"BriefProposalSummary",
"Timeline",
"PercentageCompletedByJuly2025",
"FundingGoals",
"Community Needs",
"OtherExplanation",
"ProposalDescriptionAndNeedsAlignment",
"VisioningWorkshopFindingsAlignment",
"PrioritiesAlignment",
"EconomicImpact",
"EconomicImpactPermanentJobsCreated",
"EconomicImpactTemporaryJobsCreated",
"EconomicImpactWageLevels",
"EconomicImpactAlignProposedJobs",
"CommunityBenefit",
"CommunityBenefitSustainability",
"BestPracticesInnovation",
"OutcomeMeasurement",
"OutcomeMeasurementHow",
"OutcomeMeasurementCoinvestment",
"Partnerships",
"PartnershipsOrgs",
"PartnershipsMOU",
"Displacement",
"DisplacementExplanation",
"PhysicalLocation",
"QualifiedCensusTract",
"AdditionalLocationDocuments",
"PropertyZoning",
"ConnectedToUtilities",
"ConnectedToUtilitiesConnected",
"ConnectedToUtilitiesUpgradesNeeded",
"DesignEstimatingBidding",
"DesignEstimatingBiddingPackageDeveloped",
"DesignEstimatingBiddingCostsDetermined",
"GeneralContractor",
"GeneralContractorPublicCompetitiveBid",
"GeneralContractorPublicCompetitiveBidWhyNot",
"RequestRationale",
"GrantFundsUsage",
"ProposalFinancialSustainability",
"ProposalFinancialSustainabilityOperations",
"FundingSources",
"FundingSourcesPendingDecisions",
"FundingSourcesCannotContinue",
"Scalability",
"ScalabilityComponents",
"FinancialCommitment",
"ARPAComplianceAcknowledgment",
"ARPAReportingMonitoringProcessAck",
"LB1024FundingSourcesAck",
"PublicInformation",
"FileUploads"
]
# Change to Pandas int to get rid of all the ".0"s
df1["ID"] = df1["ID"].astype('Int64') # capital I
print(df1.head())
# df2 = df2[~df2["school"].str.contains("Total", na=False)]
# Create a database table and write all the dataframe data into it
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
df1.to_sql("applications", conn, if_exists="replace", index=False)
conn.execute('CREATE INDEX "ix_applications_index" ON "applications" ("ID")')
"""
df2 = pd.read_excel(
'data/SchoolLevel_RaceGenderGradeMembership_1718to1920.xlsx',
sheet_name=1,
usecols="A:W",
header=None,
skiprows=3, # Drop the 3 header rows, the human-friendly formatting is confusing
skipfooter=1, # Also drop grand total row at the bottom
)
"""
df2 = pd.read_excel('data/public_records_request/NSORG Awards Data.xlsx', sheet_name=0)
print(df2.head())
# Provide new column names to be database friendly
df2.columns = [
"DropMe",
"Category",
"ProposalName",
"OrganizationName",
"NSORGID",
"FundingAmount",
]
# Drop column A
df2 = df2.drop(["DropMe"], axis=1)
# Drop row 1
df2.drop(df2.index[0], inplace=True)
# Drop all rows with "Total" in the Category column
df2 = df2[~df2["Category"].str.contains("Total", na=False)]
# They didn't re-state the Category every time, which is convenient for humans, but terrible
# for data processing. Luckily Pandas can fill the missing data back in for us:
df2["Category"] = df2["Category"].ffill()
# Change to Pandas float so we don't end up with SQLite TEXT
df2["FundingAmount"] = df2["FundingAmount"].astype('float')
print("Final dataframe:")
print(df2.head())
# Create a database table and write all the dataframe data into it
df2.to_sql("awards", conn, if_exists="replace")
conn.commit()
conn.close()