From 59abe3218d14a8198ae5405f785503f4151312c2 Mon Sep 17 00:00:00 2001 From: "eve n.u" Date: Tue, 2 Apr 2024 18:14:20 -0400 Subject: [PATCH] feat: Add poisson primitives for :count types --- deps.edn | 2 +- dvc.yaml | 67 +------------------ scripts/ast_export.py | 2 +- scripts/predict.py | 4 +- .../inferenceql/structure_learning/schema.clj | 2 + .../inferenceql/structure_learning/xcat.clj | 14 ++-- .../structure_learning/schema_test.clj | 37 +++++----- 7 files changed, 32 insertions(+), 96 deletions(-) diff --git a/deps.edn b/deps.edn index ca4e8ad0..a9c05cf5 100644 --- a/deps.edn +++ b/deps.edn @@ -6,7 +6,7 @@ com.cognitect/transit-clj {:mvn/version "1.0.324"} com.github.haifengl/smile-core {:mvn/version "3.0.1"} io.github.inferenceql/inferenceql.gpm.sppl {:git/sha "52f8316e094b3644709dccde8f0a935f9b55f187"} - io.github.inferenceql/inferenceql.inference {:git/sha "40e77dedf680b7936ce988b66186a86f5c4db6a5"} + io.github.inferenceql/inferenceql.inference {:git/sha "78b72a4f0356b705311fc003ee94bc48e5c1c142"} io.github.inferenceql/inferenceql.query {:git/sha "933a1d1b620bb227ddcf6f649187c8759b46ac27"} lambdaisland/regal {:mvn/version "0.0.143"} medley/medley {:mvn/version "1.4.0"} diff --git a/dvc.yaml b/dvc.yaml index f4f29fc6..b31d05f5 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -182,14 +182,12 @@ stages: --kernel alpha --kernel view_alphas --kernel column_hypers - --kernel rows - --kernel columns --output data/cgpm/complete/{/} --data data/numericalized.csv --params params.yaml --seed $((${seed} + {#} - 1)) --minutes ${cgpm.minutes}' - #--iterations ${cgpm.iterations} + # --iterations ${cgpm.iterations} params: - parallel.flags - seed @@ -259,69 +257,6 @@ stages: outs: - data/dep-prob.svg - save-linear-stats: - cmd: > - python scripts/linear_stats.py - --data data/ignored.csv - --schema data/schema.edn - --output data/linear-stats.json - deps: - - data/ignored.csv - - data/schema.edn - - scripts/linear_stats.py - outs: - - data/linear-stats.json - - linear-stats-vl: - cmd: > - clojure -X inferenceql.structure-learning.heatmap/vega-lite - :stats-path '"data/linear-stats.json"' - :sort-path '"data/dep-prob.json"' - :domain '[1.0 0.0]' - :default 0.0 - :name '"statistics"' - :field '"p-value"' - :scheme '"oranges"' - > data/linear-stats.vl.json - deps: - - data/linear-stats.json - - data/dep-prob.json - - src/clojure/inferenceql/structure_learning/heatmap.clj - outs: - - data/linear-stats.vl.json - - linear-stats-vg: - cmd: > - pnpm vl2vg - < data/linear-stats.vl.json - > data/linear-stats.vg.json - deps: - - data/linear-stats.vl.json - outs: - - data/linear-stats.vg.json - - linear-stats-svg: - cmd: > - pnpm vg2svg - < data/linear-stats.vg.json - > data/linear-stats.svg - deps: - - data/linear-stats.vg.json - outs: - - data/linear-stats.svg - compare-dep-prob-with-linear: - desc: "Compares results from dependency probability with standard statistical tests" - cmd: > - python scripts/compare_deps.py - --deps data/dep-prob.json - --linear data/linear-stats.json - >> data/qc-statistical-tests.txt - deps: - - data/dep-prob.json - - data/linear-stats.json - outs: - - data/qc-statistical-tests.txt - ast-export: desc: "Exports ASTs of the parametric model programs resulting from truncating CGPM-CrossCat models." cmd: diff --git a/scripts/ast_export.py b/scripts/ast_export.py index bfecff6c..6be2dbc4 100644 --- a/scripts/ast_export.py +++ b/scripts/ast_export.py @@ -192,7 +192,7 @@ def export_primitive(output, cctype, hypers, suffstats, distargs, categorical_ma a = hypers["a"] b = hypers["b"] N = suffstats["N"] - x_sum = suffstats["x_sum"] + sum_x = suffstats["sum_x"] # Compute the distribution. # The implementation of Poisson.logpdf in CGPM is rather suspicious: # https://github.com/probcomp/cgpm/issues/251 diff --git a/scripts/predict.py b/scripts/predict.py index 6aab64da..574d82c7 100644 --- a/scripts/predict.py +++ b/scripts/predict.py @@ -29,6 +29,8 @@ def impute_missing_features(train_dataset, test_dataset, schema): replacements[c] = train_dataset[c].median() elif schema[c] == "nominal": replacements[c] = train_dataset[c].mode()[0] + elif schema[c] == "count": + replacements[c] = int(train_dataset[c].median()) else: raise ValueError(error_message_stat_type(schema[c])) train_dataset = train_dataset.fillna(replacements) @@ -62,7 +64,7 @@ def recode_categoricals(train_dataset, test_dataset, schema, target): ) # Add new cols to df col_names = [ - c for c in train_dataset.columns if (schema[c] == "numerical") and (c != target) + c for c in train_dataset.columns if (schema[c] not in ["nominal", "ignored"] ) and (c != target) ] for i in range(X_transformed.shape[1]): col_name = f"c_{i}" diff --git a/src/clojure/inferenceql/structure_learning/schema.clj b/src/clojure/inferenceql/structure_learning/schema.clj index ec4c5c3c..48c3f01c 100644 --- a/src/clojure/inferenceql/structure_learning/schema.clj +++ b/src/clojure/inferenceql/structure_learning/schema.clj @@ -68,6 +68,7 @@ "Returns the Loom schema for an InferenceQL schema." [schema] (let [replacements {:nominal "dd" ; discrete dirichlet + :count "gp" ; gamma-poisson :numerical "nich"}] ; normal inverse chi squared (into {} (comp (remove (comp #{:ignore} val)) @@ -78,6 +79,7 @@ "Returns the CGPM schema for an InferenceQL schema." [schema] (let [replacements {:nominal "categorical" ; discrete dirichlet + :count "poisson" :numerical "normal"}] (into {} (comp (remove (comp #{:ignore} val)) diff --git a/src/clojure/inferenceql/structure_learning/xcat.clj b/src/clojure/inferenceql/structure_learning/xcat.clj index bee13eec..c8cb5168 100644 --- a/src/clojure/inferenceql/structure_learning/xcat.clj +++ b/src/clojure/inferenceql/structure_learning/xcat.clj @@ -11,12 +11,12 @@ (defn ^:private view-name "Returns a cluster name for view index n." [n] - (str "view_" n)) + (keyword (str "view_" n))) (defn ^:private cluster-name "Returns a cluster name for cluster index n." [n] - (str "cluster_" n)) + (keyword (str "cluster_" n))) (defn ^:private map-invert "Returns m with its vals as keys and its keys grouped into a vector as vals." @@ -35,8 +35,9 @@ (defn ^:private data [data-cells schema num-rows] - (let [headers (map name (first data-cells)) + (let [headers (map keyword (first data-cells)) column->f (comp {:numerical am.csv/parse-number + :count am.csv/parse-number :nominal am.csv/parse-str} schema name) @@ -58,15 +59,16 @@ (defn ^:private col-names [numericalized cgpm-model] - (mapv name (get cgpm-model :col_names (first numericalized)))) + (mapv keyword (get cgpm-model :col_names (first numericalized)))) (defn ^:private spec [numericalized schema cgpm-model] (let [columns (col-names numericalized cgpm-model) views (views columns cgpm-model) types (->> schema - (medley/map-keys name) + (medley/map-keys keyword) (medley/map-vals {:nominal :categorical + :count :poisson :numerical :gaussian}))] {:views views :types types})) @@ -93,7 +95,7 @@ (defn options [mapping-table] (->> mapping-table - (medley/map-keys name) + (medley/map-keys keyword) (medley/map-vals #(->> % (sort-by val) (map key) (into []))))) (defn xcat-model diff --git a/test/clojure/inferenceql/structure_learning/schema_test.clj b/test/clojure/inferenceql/structure_learning/schema_test.clj index 77c45aa1..a8add0d8 100644 --- a/test/clojure/inferenceql/structure_learning/schema_test.clj +++ b/test/clojure/inferenceql/structure_learning/schema_test.clj @@ -13,11 +13,7 @@ (is (= [0 1 2] (schema/column :x [{:x 0 :y "a"} {:x 1 :y "b"} - {:x 2 :y "c"}]))) - (is (= [0 1 2] - (schema/column "x" [{"x" 0 "y" "a"} - {"x" 1 "y" "b"} - {"x" 2 "y" "c"}])))) + {:x 2 :y "c"}])))) (deftest guess-stattype (are [stattype coll] (= stattype (schema/guess-stattype :ignore coll)) @@ -36,24 +32,23 @@ :ignore [{:id 0 :x 0.0 :y "a"} {:id 1 :x 1.0 :y "b"} - {:id 2 :x 2.0 :y "c"}]))) - (is (= {"id" :ignore - "x" :numerical - "y" :nominal} - (schema/guess - :ignore - [{"id" 0 "x" 0.0 "y" "a"} - {"id" 1 "x" 1.0 "y" "b"} - {"id" 2 "x" 2.0 "y" "c"}])))) + {:id 2 :x 2.0 :y "c"}])))) (deftest loom (is (= {:x "nich" - :y "dd"} + :y "dd" + :z "gp"} (schema/loom {:id :ignore :x :numerical - :y :nominal}))) - (is (= {"x" "nich" - "y" "dd"} - (schema/loom {"id" :ignore - "x" :numerical - "y" :nominal})))) + :y :nominal + :z :count})))) +(deftest cgpm + (is (= {"x" "normal" + "y" "categorical" + "z" "poisson" + } + (schema/cgpm {:id :ignore + :x :numerical + :y :nominal + :z :count + }))))