Skip to content

Commit a0d5e08

Browse files
authored
Add 2021 data with some updates to infra (#64)
Took 25 minutes
1 parent 72c7bc0 commit a0d5e08

File tree

8 files changed

+17
-14
lines changed

8 files changed

+17
-14
lines changed

.env

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
CHADWICK_VERSION=v0.9.0
2-
BASEBALLDATABANK_VERSION=36b88bb6abebddc5b1f584fab40b5655adc1ba70
3-
RETROSHEET_VERSION=c76e2b0aef6bfe821c48f440ba94c9cad68202b8
1+
CHADWICK_VERSION=v0.9.3
2+
BASEBALLDATABANK_VERSION=dd1a4503b9d6ec2bdda5e345ba06c867e368dd13
3+
RETROSHEET_VERSION=e540755f22b65d2f85f4da9180d1a31754c331f9
44

55
EXTRACT_DIR=extract
66
REPO=doublewick/boxball
7-
VERSION=2021.0.0
7+
VERSION=2022.0.0

extract/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ ARG BUILD_ENV
22
ARG RETROSHEET_IMAGE=get-retrosheet-${BUILD_ENV}
33
ARG BASEBALLDATABANK_IMAGE=get-baseballdatabank-${BUILD_ENV}
44

5-
FROM python:3.7.3-alpine3.9 AS build-common
5+
FROM python:3.10.4-alpine3.15 AS build-common
66
RUN apk add --no-cache \
77
parallel \
88
libtool \
@@ -29,7 +29,7 @@ COPY fixtures/raw/retrosheet.zip .
2929

3030
FROM build-common as get-baseballdatabank-prod
3131
ARG BASEBALLDATABANK_VERSION
32-
RUN wget https://github.com/droher/baseballdatabank/archive/${BASEBALLDATABANK_VERSION}.zip -O baseballdatabank.zip
32+
RUN wget https://github.com/chadwickbureau/baseballdatabank/archive/${BASEBALLDATABANK_VERSION}.zip -O baseballdatabank.zip
3333

3434
FROM build-common as get-baseballdatabank-test
3535
COPY fixtures/raw/baseballdatabank.zip .

extract/parsers/baseballdatabank.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@
55
from parsers.util import compress, OUTPUT_PATH
66

77
DOS_EOF = chr(26)
8-
BASEBALLDATABANK_PATH = Path("baseballdatabank/core")
8+
BASEBALLDATABANK_PATHS = Path("baseballdatabank/core"), Path("baseballdatabank/contrib")
99

1010

1111
def get_baseballdatabank_files():
12-
for file in BASEBALLDATABANK_PATH.glob("*.csv"):
12+
files = [f for path in BASEBALLDATABANK_PATHS
13+
for f in path.glob("*.csv")]
14+
print("Processing Baseball Databank files:", files)
15+
for file in files:
1316
# Just need to change from PascalCase to snake_case to match table names
1417
# Editing OF fielding files to get PascalCasev conformity for all databank filenames
1518
file_name = file.name.replace("OFs", "OfS").replace("OF", "Of")

load/mysql/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ ARG VERSION
22
FROM doublewick/boxball:ddl-${VERSION} as ddl
33
FROM doublewick/boxball:csv-${VERSION} as csv
44

5-
FROM mysql:8.0.23 as mysql-build
5+
FROM mysql:8.0.28-debian as mysql-build
66
ENV MYSQL_ALLOW_EMPTY_PASSWORD=yes
77
COPY my.cnf /etc/mysql/conf.d/
88
COPY A_unzip_csvs.sh z_remove_csvs.sh /docker-entrypoint-initdb.d/

transform/ddl.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM python:3.7.3-slim-stretch AS build-common
1+
FROM python:3.10-slim-bullseye AS build-common
22
COPY requirements.txt .
33
RUN pip install -r requirements.txt
44
ENV PYTHONPATH="/"

transform/parquet.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
ARG VERSION
22
FROM doublewick/boxball:extract-${VERSION} as extract
33

4-
FROM python:3.7.3-slim-stretch AS build-common
4+
FROM python:3.10-slim-bullseye AS build-common
55
COPY requirements.txt .
66
RUN pip install -r requirements.txt
77
ENV PYTHONPATH="/"

transform/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
SQLAlchemy==1.3.23
22
sqlalchemy-fdw==0.3.0
33
clickhouse-sqlalchemy==0.1.5
4-
pyarrow==3.0.0
5-
zstandard==0.15.2
4+
pyarrow==7.0.0
5+
zstandard==0.17.0

transform/src/schemas/retrosheet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -863,7 +863,7 @@ class Event(Base):
863863
base1_run_id = Column(CHAR(8), doc="ID of runner on first")
864864
base2_run_id = Column(CHAR(8), doc="ID of runner on second")
865865
base3_run_id = Column(CHAR(8), doc="ID of runner on third")
866-
event_tx = Column(String(58), doc="Event text (in scoring shorthand")
866+
event_tx = Column(String(128), doc="Event text (in scoring shorthand")
867867
leadoff_fl = Column(Boolean, doc="Batter is leading off the inning")
868868
ph_fl = Column(Boolean, doc="Batter is pinch-hitting")
869869
bat_fld_cd = Column(SmallInteger, doc="Defensive position of batter (10 for DH, 11 for PH, 12 for PR")

0 commit comments

Comments
 (0)