Guess schema with DuckDB #177

Draft pull request: wants to merge 2 commits into main.

4 changes: 2 additions & 2 deletions data/.gitignore
@@ -12,12 +12,12 @@
/nullified.csv
/numericalized.csv
/subsampled.csv
/summarized.csv
/validated.csv

# schema artifacts
/cgpm-schema.edn
/mapping-table.edn
/schema.edn
/schema.csv

# loom artifacts
/loom-schema.json
25 changes: 16 additions & 9 deletions dvc-cgpm.yaml
@@ -36,17 +36,25 @@ stages:
outs:
- data/nullified.csv

guess-schema:
summarize:
cmd: >
clojure -X inferenceql.structure-learning.main/guess-schema
duckdb -csv :memory: "SUMMARIZE SELECT * FROM read_csv_auto('/dev/stdin', header=true)"
< data/nullified.csv
> data/schema.edn
> data/summarized.csv
deps:
- data/nullified.csv
params:
- schema
outs:
- data/schema.edn
- data/summarized.csv

guess-schema:
cmd: >
duckdb -csv :memory: "$(cat scripts/guess_schema.sql)"
< data/summarized.csv
> data/schema.csv
deps:
- data/summarized.csv
outs:
- data/schema.csv

cgpm-schema:
cmd: >
Expand All @@ -60,13 +68,12 @@ stages:

ignore:
cmd: >
clojure -X inferenceql.structure-learning.main/ignore
:schema '"data/schema.edn"'
./scripts/ignore.sh
< data/nullified.csv
> data/ignored.csv
deps:
- data/nullified.csv
- data/schema.edn
- data/schema.csv
outs:
- data/ignored.csv

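The new `summarize` and `guess-schema` stages above replace the Clojure schema guesser: `summarize` profiles the nullified CSV with DuckDB's `SUMMARIZE`, and `guess-schema` turns that profile into `data/schema.csv` via `scripts/guess_schema.sql`. A minimal sketch of what the intermediate summary looks like, assuming only that the `duckdb` CLI is on the PATH (the columns named in the comment are the ones the SQL script keys on):

```bash
# Sketch: SUMMARIZE emits one row per input column with profiling stats.
duckdb -csv :memory: \
  "SUMMARIZE SELECT * FROM (VALUES (1, 'a'), (2, 'b')) t(x, y)"
# The output should include column_name, column_type, min, max,
# approx_unique and count, which guess_schema.sql reads.
```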
40 changes: 20 additions & 20 deletions dvc.yaml
@@ -36,37 +36,34 @@ stages:
outs:
- data/nullified.csv

guess-schema:
summarize:
cmd: >
clojure -X inferenceql.structure-learning.main/guess-schema
duckdb -csv :memory: "SUMMARIZE SELECT * FROM read_csv_auto('/dev/stdin', header=true)"
< data/nullified.csv
> data/schema.edn
> data/summarized.csv
deps:
- data/nullified.csv
params:
- schema
outs:
- data/schema.edn
- data/summarized.csv

cgpm-schema:
guess-schema:
cmd: >
clojure -X inferenceql.structure-learning.main/cgpm-schema
< data/schema.edn
> data/cgpm-schema.edn
duckdb -csv :memory: "$(cat scripts/guess_schema.sql)"
< data/summarized.csv
> data/schema.csv
deps:
- data/schema.edn
- data/summarized.csv
outs:
- data/cgpm-schema.edn
- data/schema.csv

ignore:
cmd: >
clojure -X inferenceql.structure-learning.main/ignore
:schema '"data/schema.edn"'
./scripts/ignore.sh
< data/nullified.csv
> data/ignored.csv
deps:
- data/nullified.csv
- data/schema.edn
- data/schema.csv
outs:
- data/ignored.csv

@@ -86,11 +83,14 @@

loom-schema:
cmd: >
clojure -X inferenceql.structure-learning.main/loom-schema
< data/schema.edn
duckdb -json :memory: '
SELECT column_name, loom_statistical_type
FROM read_csv_auto("data/schema.csv", header=true)
WHERE loom_statistical_type IS NOT NULL'
| jq 'map({(.column_name): .loom_statistical_type}) | add'
> data/loom-schema.json
deps:
- data/schema.edn
- data/schema.csv
outs:
- data/loom-schema.json

@@ -153,14 +153,14 @@
--metadata {}
--output data/cgpm/hydrated/{/}
--data data/numericalized.csv
--schema data/cgpm-schema.edn
--schema data/schema.csv
--mapping-table data/mapping-table.edn
--seed $((${seed} + {#} - 1))'
params:
- parallel.flags
- seed
deps:
- data/cgpm-schema.edn
- data/schema.csv
- data/cgpm/raw
- data/mapping-table.edn
- data/numericalized.csv
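The reworked `loom-schema` stage above builds `data/loom-schema.json` straight from `data/schema.csv` with `duckdb -json` and `jq`. A sketch of that transformation on a hypothetical two-column schema file (the path and values below are made up for illustration; the real file comes from `guess_schema.sql`):

```bash
# Hypothetical schema.csv with one numerical and one nominal column.
printf 'column_name,loom_statistical_type\nage,nich\ncolor,dd\n' > /tmp/schema.csv

duckdb -json :memory: '
  SELECT column_name, loom_statistical_type
  FROM read_csv_auto("/tmp/schema.csv", header=true)
  WHERE loom_statistical_type IS NOT NULL' \
  | jq 'map({(.column_name): .loom_statistical_type}) | add'
# Expected output, roughly: {"age": "nich", "color": "dd"}
```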
27 changes: 27 additions & 0 deletions flake.lock

Some generated files are not rendered by default.

12 changes: 12 additions & 0 deletions flake.nix
@@ -0,0 +1,12 @@
{
inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-23.11";

outputs = { self, nixpkgs, ... }: let
system = "aarch64-darwin";
pkgs = nixpkgs.legacyPackages."${system}";
in {
devShells."${system}".default = pkgs.mkShell {
buildInputs = with pkgs; [ (python3.withPackages (ps: with ps; [ duckdb ])) ];
};
};
}
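For completeness, the dev shell declared above can be entered in the usual way with flakes enabled; note that `system` is currently pinned to `aarch64-darwin`, so other platforms would need to adjust it:

```bash
# Enter the development shell (python3 with the duckdb package).
nix develop
```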
10 changes: 6 additions & 4 deletions scripts/cgpm_hydrate.py
@@ -2,6 +2,7 @@

import argparse
import cgpm.utils.general as general
import duckdb
import edn_format
import json
import pandas
@@ -25,7 +26,7 @@ def main():
"--data", type=argparse.FileType("r"), help="Path to numericalized CSV."
)
parser.add_argument(
"--schema", type=argparse.FileType("r"), help="Path to CGPM schema."
"--schema", help="Path to schema."
)
parser.add_argument(
"--mapping-table",
@@ -51,16 +52,17 @@ def main():
sys.exit(1)

df = pandas.read_csv(args.data)
schema = edn_format.loads(args.schema.read(), write_ply_tables=False)
schema = duckdb.read_csv(args.schema, header=True)
stattypes = dict(duckdb.sql("SELECT column_name, column_cgpm_statistical_type FROM schema").fetchall())
mapping_table = edn_format.loads(args.mapping_table.read(), write_ply_tables=False)

def n_categories(column):
return len(mapping_table[column])

def distarg(column):
return {"k": n_categories(column)} if schema[column] == "categorical" else None
return {"k": n_categories(column)} if stattypes[column] == "categorical" else None

cctypes = [schema[column] for column in df.columns]
cctypes = [stattypes[column] for column in df.columns]
distargs = [distarg(column) for column in df.columns]

if args.metadata is not None:
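A quick way to eyeball the contract the updated `cgpm_hydrate.py` now depends on is to query the two columns it reads from the generated schema (a sketch, assuming `data/schema.csv` has already been produced by the `guess-schema` stage):

```bash
# Show the per-column CGPM statistical types that cgpm_hydrate.py will read.
duckdb -csv :memory: \
  "SELECT column_name, column_cgpm_statistical_type
   FROM read_csv_auto('data/schema.csv', header=true)"
```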
45 changes: 45 additions & 0 deletions scripts/guess_schema.sql
@@ -0,0 +1,45 @@
SELECT
column_name,
column_type,
(
CASE
WHEN column_type = 'VARCHAR' THEN
(
CASE
WHEN approx_unique > 1 AND approx_unique <= 50
THEN 'NOMINAL'
ELSE 'IGNORE'
END
)
WHEN column_type = 'DOUBLE' THEN
(
-- Not enough unique values. Better off modeling as nominal.
CASE
WHEN approx_unique < 20
OR (approx_unique / count) < 0.02
THEN 'NOMINAL'

-- Consecutive numbers -- probably an ID?
WHEN approx_unique = count
AND count = TRY_CAST(max AS INTEGER) - TRY_CAST(min AS INTEGER) + 1
THEN 'IGNORE'

ELSE 'NUMERICAL'
END
)
END
) AS column_statistical_type,
(
CASE
WHEN column_statistical_type = 'IGNORE' THEN 'ignore'
WHEN column_statistical_type = 'NOMINAL' THEN 'categorical'
WHEN column_statistical_type = 'NUMERICAL' THEN 'normal'
END
) AS column_cgpm_statistical_type,
(
CASE
WHEN column_statistical_type = 'NOMINAL' THEN 'dd'
WHEN column_statistical_type = 'NUMERICAL' THEN 'nich'
END
) AS loom_statistical_type,
FROM read_csv_auto('/dev/stdin', header=true)
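For reference, this script is invoked by the `guess-schema` stage above; run by hand, the equivalent command is (assuming `data/summarized.csv` already exists):

```bash
duckdb -csv :memory: "$(cat scripts/guess_schema.sql)" \
  < data/summarized.csv \
  > data/schema.csv
```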
9 changes: 9 additions & 0 deletions scripts/ignore.sh
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
columns=$(
duckdb -noheader -list :memory: <<-EOF | paste -s -d , -
SELECT column_name
FROM read_csv_auto('data/schema.csv', header=true)
WHERE column_statistical_type != 'IGNORE'
EOF
)
duckdb -csv :memory: "SELECT $columns FROM read_csv_auto('/dev/stdin', header=true)"
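The `ignore` stage runs this script as follows; note that it reads the schema from the hard-coded path `data/schema.csv`, which is why the stage lists that file as a dependency:

```bash
./scripts/ignore.sh < data/nullified.csv > data/ignored.csv
```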
37 changes: 0 additions & 37 deletions src/clojure/inferenceql/structure_learning/csv.clj
@@ -116,43 +116,6 @@
(recur csv (first ks) (next ks))
csv))))

(defn heuristic-coerce
[& coll]
(try (into []
(map #(if (nil? %)
%
(Long/parseLong %)))
coll)
(catch java.lang.NumberFormatException _
(try (into []
(map #(if (nil? %)
%
(Double/parseDouble %)))
coll)
(catch java.lang.NumberFormatException _
coll)))))

(defn apply-column
[k f coll]
(let [new-column (->> coll
(map #(get % k))
(apply f))]
(map-indexed (fn [index m]
(let [v (get new-column index)]
(cond-> m
(some? v) (assoc k v))))
coll)))

(defn heuristic-coerce-all
[coll]
(let [ks (into #{}
(mapcat keys)
coll)]
(reduce (fn [acc k]
(apply-column k heuristic-coerce acc))
coll
ks)))

(defn update-by-key
"For each key k in coll if (f k) returns a function update the value for k in
coll with that function."
40 changes: 0 additions & 40 deletions src/clojure/inferenceql/structure_learning/main.clj
@@ -9,7 +9,6 @@
[clojure.string :as string]
[inferenceql.structure-learning.csv :as iql.csv]
[inferenceql.structure-learning.dvc :as dvc]
[inferenceql.structure-learning.schema :as schema]
[inferenceql.inference.gpm :as gpm]
[inferenceql.query.db :as db]
[inferenceql.query.io :as query.io]
@@ -26,45 +25,6 @@
headers)
(csv/write-csv *out*))))

(defn guess-schema
[_]
(let [params (dvc/yaml)
params-schema (:schema params)
default-stattype (get params :default-stat-type :ignore)
guessed-schema (->> (csv/read-csv *in*)
(sequence (comp (iql.csv/as-maps)
(map #(medley/remove-vals (every-pred string? string/blank?) %))
(map #(medley/remove-keys (set (keys params-schema)) %))))
(iql.csv/heuristic-coerce-all)
(schema/guess default-stattype))
schema (merge guessed-schema params-schema)]
(assert (not (every? #{:ignore} (vals schema)))
"The statistical types of the columns in data.csv can't be guessed confidently.\nAll columns are ignored. Set statistical types manually in params.yaml to fix this")
(schema/print-ignored schema)
(prn schema)))

(defn loom-schema
[_]
(-> (edn/read *in*)
(schema/loom)
(json/generate-stream *out*)))

(defn cgpm-schema
[_]
(-> (edn/read *in*)
(schema/cgpm)
(pr)))

(defn ignore
[{:keys [schema]}]
(let [ignored (into #{}
(comp (filter (comp #{:ignore} val))
(map key))
(edn/read-string (slurp schema)))
csv (csv/read-csv *in*)
ignored-csv (apply iql.csv/dissoc csv ignored)]
(csv/write-csv *out* ignored-csv)))

(defn numericalize
[{table-path :table schema-path :schema}]
(let [schema (edn/read-string (slurp (str schema-path)))