Skip to content

Commit 53f0836

Browse files
committed
Pulled out transforms into bash files
1 parent 59234ec commit 53f0836

14 files changed

+39
-31
lines changed

.gitignore

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,2 @@
11
node_modules
2-
data/**/*.fec
3-
data/**/*.zip
4-
bin/*.sh
2+
data/**

bin/rss2psql

-11
This file was deleted.

bin/zip2psql

-13
This file was deleted.

pipelines/download.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ transform:
66
cmd:
77
- bash
88
stdin:
9-
- for file in $(find /pfs/check/ -name "*.fec"); do curl -s "http://docquery.fec.gov/dcdev/posted/$(basename $file)" | gzip -9 > "/pfs/out/$(basename $file).gz"; done
9+
- ./transforms/download.sh /pfs/check/ /pfs/out/
1010
parallelism_spec:
1111
constant: 2
1212
datum_tries: 2

pipelines/load.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ transform:
66
cmd:
77
- bash
88
stdin:
9-
- for file in $(find /pfs/filings/ -name "*.fec.gz"); do gunzip < $file | (echo "DELETE FROM fec_filings WHERE filing_id = $(echo $file | tr -dc '0-9');" && cat) | ./bin/fec convert --format=psql $(echo $file | tr -dc '0-9') | psql -v ON_ERROR_STOP=on --single-transaction 2> "/pfs/out/"$(echo $file | tr -dc '0-9')".log"; done
9+
- ./transforms/load.sh /pfs/filings/ /pfs/out/
1010
secrets:
1111
- name: pachyderm-postgres-politics-auth
1212
env_var: PGHOST

pipelines/notify.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ transform:
66
cmd:
77
- bash
88
stdin:
9-
- ./pipelines/notify.sh
9+
- ./transforms/notify.sh /pfs/filings/ /pfs/out/
1010
secrets:
1111
- name: pachyderm-postgres-politics-auth
1212
env_var: PGHOST

pipelines/rss.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ transform:
66
cmd:
77
- bash
88
stdin:
9-
- ./bin/fec list --rss --headers=false --format=tsv --columns=fec_url | head -n 100 | while read url; do touch "/pfs/out/$(echo $url | tr -dc '0-9').fec"; done
9+
- ./transforms/rss.sh /pfs/out/
1010
parallelism_spec:
1111
constant: 1
1212
input:

transforms/all.sh

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#!/bin/bash
#
# Run every transform once, in pipeline order, against a local ./data tree.
# Each stage reads the directory the previous stage wrote to:
#   rss -> ./data/rss -> download -> ./data/download -> load -> ./data/load -> notify
#
# Paths are relative, so this is meant to be started from the directory that
# contains ./transforms.

# One working directory per stage; -p keeps reruns quiet when they exist.
mkdir -p ./data/rss ./data/download ./data/load

./transforms/rss.sh      ./data/rss/
./transforms/download.sh ./data/rss/      ./data/download/
./transforms/load.sh     ./data/download/ ./data/load/
# NOTE(review): pipelines/notify.yml passes two arguments to notify.sh
# (input and output directory); only one is passed here - confirm.
./transforms/notify.sh   ./data/load/

transforms/clean.sh

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
#
# Delete the local ./data working tree (resolved against the current
# directory). -f keeps this silent when the tree does not exist.

rm -r -f ./data

transforms/download.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
#
# Fetch the filing named by each "*.fec" file found under an input directory
# and store it gzip-compressed in an output directory.
#
# Usage: download.sh <input_dir> <output_dir>
#   <input_dir>   searched recursively for "*.fec" files; only each file's
#                 name is used, its content is never read
#   <output_dir>  receives one "<name>.gz" per input file (trailing slash
#                 optional)

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <input_dir> <output_dir>" >&2
  exit 1
fi

in_dir=$1
# Drop one trailing slash so "out" and "out/" both produce "out/<name>.gz".
out_dir=${2%/}

# NUL-delimited traversal keeps paths containing whitespace or glob
# characters intact.
find "$in_dir" -name "*.fec" -print0 | while IFS= read -r -d '' file; do
  name=$(basename "$file")
  # NOTE(review): curl runs without --fail, so an HTTP error body would be
  # compressed and stored like a real filing - confirm that is acceptable.
  curl -s "http://docquery.fec.gov/dcdev/posted/$name" | gzip -9 > "$out_dir/$name.gz"
done

transforms/load.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
#
# Convert every "*.fec.gz" filing found under an input directory to SQL and
# feed it to psql.
#
# Usage: load.sh <input_dir> <log_dir>
#   <input_dir>  searched recursively for "*.fec.gz" files
#   <log_dir>    receives "<filing_id>.log" holding psql's stderr for each
#                filing (trailing slash optional)
#
# For each filing a DELETE for its filing_id is prepended to the converter's
# output and the whole stream is run by psql in a single transaction.

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <input_dir> <log_dir>" >&2
  exit 1
fi

in_dir=$1
# Drop one trailing slash so "logs" and "logs/" both produce "logs/<id>.log".
log_dir=${2%/}

# NUL-delimited traversal keeps paths containing whitespace or glob
# characters intact.
find "$in_dir" -name "*.fec.gz" -print0 | while IFS= read -r -d '' file; do
  # The id is the digits of the file name only; taking digits from the whole
  # path would fold any digit in a parent directory into the id.
  filing_id=$(basename "$file" | tr -dc '0-9')
  if [ -z "$filing_id" ]; then
    echo "skipping $file: no digits in file name to use as filing id" >&2
    continue
  fi

  gunzip < "$file" \
    | ./bin/fec convert --format=psql "$filing_id" \
    | (echo "DELETE FROM fec_filings WHERE filing_id = $filing_id;" && cat) \
    | psql -v ON_ERROR_STOP=on --single-transaction 2> "$log_dir/$filing_id.log"
done
File renamed without changes.

transforms/rss.sh

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
#
# Create one empty "<digits>.fec" marker file per URL listed by
# "./bin/fec list --rss", looking at the first 100 lines only.
#
# Usage: rss.sh <output_dir>
#   <output_dir>  directory the marker files are created in (trailing slash
#                 optional)
#
# The marker name is every digit in the listed URL, in order.

if [ "$#" -lt 1 ]; then
  echo "usage: $0 <output_dir>" >&2
  exit 1
fi

# Drop one trailing slash so "out" and "out/" both produce "out/<id>.fec".
out_dir=${1%/}

./bin/fec list --rss --headers=false --format=tsv --columns=fec_url | head -n 100 | while IFS= read -r url; do
  filing_id=$(printf '%s' "$url" | tr -dc '0-9')
  # A blank or digit-free line would otherwise create a bare ".fec" marker.
  if [ -n "$filing_id" ]; then
    touch "$out_dir/$filing_id.fec"
  fi
done

transforms/unzip.sh

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
#
# Extract every "*.zip" archive found under an input directory into an
# output directory.
#
# Usage: unzip.sh <input_dir> <output_dir>
#   <input_dir>   searched recursively for "*.zip" files
#   <output_dir>  directory passed to "unzip -d"

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <input_dir> <output_dir>" >&2
  exit 1
fi

# The original for-loop had no "do", so the script failed to parse. Letting
# find run unzip once per archive also avoids word-splitting the paths.
find "$1" -name "*.zip" -exec unzip -qq -d "$2" {} \;

0 commit comments

Comments
 (0)