Skip to content

Commit ded0c09

Browse files
committed
Update source data handling
1 parent 624992c commit ded0c09

File tree

3 files changed

+6
-15
lines changed

3 files changed

+6
-15
lines changed

.gitignore

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -49,19 +49,10 @@ ehthumbs.db
4949
Thumbs.db
5050

5151
# Data files (these be large treasures that shouldn't go in git)
52-
data/kaggle_so_2023/
53-
# But keep the zip file for distribution
52+
# Ignore everything in data/ directory
53+
data/*
54+
# But keep the zip file for distribution, arrr!
5455
!data/kaggle_so_2023_data.zip
55-
data/*.csv
56-
data/*.json
57-
data/*.xlsx
58-
59-
# Large data files - too big for GitHub's hold
60-
data/kaggle_so_2023_data/
61-
data/kaggle_so_2023_data.zip
62-
data/kaggle_so_2023/
63-
*.csv
64-
*.zip
6556

6657
# Test coverage
6758
.coverage

app/data_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,8 @@ def _setup_data_sources(self):
138138
discovered = self._discover_data_sources()
139139

140140
# Configure Stack Overflow 2023 Survey if found
141-
if "kaggle_so_2023" in discovered:
142-
so_2023 = discovered["kaggle_so_2023"]
141+
if "kaggle_so_2023_data" in discovered:
142+
so_2023 = discovered["kaggle_so_2023_data"]
143143
self.register_data_source(
144144
DataSource(
145145
name="stackoverflow_2023",
@@ -172,7 +172,7 @@ def _setup_data_sources(self):
172172

173173
# Auto-configure other discovered data sources with generic settings
174174
for dir_name, files_info in discovered.items():
175-
if dir_name != "kaggle_so_2023": # Skip already configured ones
175+
if dir_name not in ["kaggle_so_2023_data"]: # Skip already configured ones
176176
# Try to detect common column patterns by loading a sample
177177
try:
178178
sample_df = pd.read_csv(files_info["data_file"], nrows=1)

data/kaggle_so_2023_data.zip

19.6 MB
Binary file not shown.

0 commit comments

Comments
 (0)