Add README; minor changes

alex-lew · alex-lew · commit 8b5046a4a437 · 2021-03-29T09:22:41.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -18,5 +18,6 @@ Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
 PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
diff --git a/README.md b/README.md
@@ -0,0 +1,105 @@
+# PClean
+
+[![Build Status](https://travis-ci.com/probcomp/PClean.svg?branch=master)](https://travis-ci.com/probcomp/PClean)
+
+PClean: A Domain-Specific Probabilistic Programming Language for Bayesian Data Cleaning
+
+*Warning: This is a rapidly evolving research prototype.*
+
+PClean was created at the [MIT Probabilistic Computing Project](http://probcomp.csail.mit.edu/).
+
+If you use PClean in your research, please cite our 2021 AISTATS paper:
+
+PClean: Bayesian Data Cleaning at Scale with Domain-Specific Probabilistic Programming. Lew, A. K.; Agrawal, M.; Sontag, D.; and Mansinghka, V. K. (2021, March).
+In International Conference on Artificial Intelligence and Statistics (pp. 1927-1935). PMLR. ([pdf](http://proceedings.mlr.press/v130/lew21a/lew21a.pdf))
+
+## Using PClean
+
+
+To use PClean, create a Julia file with the following structure:
+
+```julia
+using PClean
+using DataFrames: DataFrame
+import CSV
+
+# Load data
+data = CSV.File(filepath) |> DataFrame
+
+# Define PClean model
+PClean.@model MyModel begin
+    @class ClassName1 begin
+        ...
+    end
+
+    ...
+    
+    @class ClassNameN begin
+        ...
+    end
+end
+
+# Align column names of CSV with variables in the model.
+# Format is ColumnName CleanVariable DirtyVariable, or, if
+# there is no corruption for a certain variable, one can omit
+# the DirtyVariable.
+query = @query MyModel.ClassNameN [
+  HospitalName hosp.name             observed_hosp_name
+  Condition    metric.condition.desc observed_condition
+  ...
+]
+
+# Configure observed dataset
+observations = [ObservedDataset(query, data)]
+
+# Configuration
+config = PClean.InferenceConfig(1, 2; use_mh_instead_of_pg=true)
+
+# SMC initialization
+state = initialize_trace(observations, config)
+
+# Rejuvenation sweeps
+run_inference!(state, config)
+
+# Evaluate accuracy, if ground truth is available
+ground_truth = CSV.File(filepath) |> CSV.DataFrame
+results = evaluate_accuracy(data, ground_truth, state, query)
+
+# Can print results.f1, results.precision, results.accuracy, etc.
+println(results)
+
+# Even without ground truth, can save the entire latent database to CSV files:
+PClean.save_results(dir, dataset_name, state, observations)
+```
+
+Then, from this directory, run the Julia file.
+
+```
+JULIA_PROJECT=. julia my_file.jl
+```
+
+To learn to write a PClean model, see [our paper](http://proceedings.mlr.press/v130/lew21a/lew21a.pdf), but note
+the surface syntax changes described below.
+
+## Differences from the paper
+
+As a DSL embedded into Julia, our implementation of the PClean language has some differences, in terms of surface syntax,
+from the stand-alone syntax presented in our paper:
+
+(1) Instead of `latent class C ... end`, we write `@class C begin ... end`.
+
+(2) Instead of `subproblem begin ... end`, inference hints are given using ordinary
+    Julia `begin ... end` blocks.
+
+(3) Instead of `parameter x ~ d(...)`, we use `@learned x :: D{...}`. The set of
+    distributions D for parameters is somewhat restricted.
+
+(4) Instead of `x ~ d(...) preferring E`, we write `x ~ d(..., E)`.
+
+(5) Instead of `observe x as y, ... from C`, we write `@query ModelName.C [x y; ...]`.
+    Clauses of the form `x z y` are also allowed, and tell PClean that the model variable
+    `C.z` represents a clean version of `x`, whose observed (dirty) version is modeled
+    as `C.y`. This is used when automatically reconstructing a clean, flat dataset.
+
+The names of built-in distributions may also be different, e.g. `AddTypos` instead of `typos`,
+and `ProportionsParameter` instead of `dirichlet`.
diff --git a/experiments/flights/run.jl b/experiments/flights/run.jl
@@ -5,11 +5,10 @@ include("load_data.jl")
 websites = unique(dirty_table.src)
 
 PClean.@model FlightsModel begin
-
   @class TrackingWebsite begin
     name ~ StringPrior(2, 30, websites)
   end
-
+  
   @class Flight begin
     begin 
       flight_id ~ StringPrior(10, 20, flight_ids); @guaranteed flight_id
@@ -24,10 +23,10 @@ PClean.@model FlightsModel begin
     @learned error_probs::Dict{String, ProbParameter{10.0, 50.0}}
     begin 
       flight ~ Flight; 
-      src ~ TrackingWebsite 
     end
+    src ~ TrackingWebsite 
+    error_prob = lowercase(src.name) == lowercase(flight.flight_id[1:2]) ? 1e-5 : error_probs[src.name]
     begin
-      error_prob = lowercase(src.name) == lowercase(flight.flight_id[1:2]) ? 1e-5 : error_probs[src.name]
       sdt ~ MaybeSwap(flight.sdt, times_for_flight["$(flight.flight_id)-sched_dep_time"], error_prob)
       sat ~ MaybeSwap(flight.sat, times_for_flight["$(flight.flight_id)-sched_arr_time"], error_prob)
       adt ~ MaybeSwap(flight.adt, times_for_flight["$(flight.flight_id)-act_dep_time"],   error_prob)
diff --git a/experiments/hospital/load_data.jl b/experiments/hospital/load_data.jl
@@ -5,9 +5,15 @@ dataset = "hospital"
 dirty_table = CSV.File("datasets/$(dataset)_dirty.csv") |> DataFrame
 clean_table = CSV.File("datasets/$(dataset)_clean.csv") |> DataFrame
 
+# In the dirty data, CSV.jl infers that PhoneNumber, ZipCode, and ProviderNumber
+# are strings. Our PClean script also models these columns as string-valued.
+# However, for the clean CSV file (which has no typos), CSV.jl infers that they
+# are numbers. To facilitate comparison of PClean's results (strings) with the
+# ground truth, we preprocess the clean values to convert them into strings.
 clean_table[!, :PhoneNumber] = map(x -> "$x", clean_table[!, :PhoneNumber])
 clean_table[!, :ZipCode] = map(x -> "$x", clean_table[!, :ZipCode])
 clean_table[!, :ProviderNumber] = map(x -> "$x", clean_table[!, :ProviderNumber])
 
+# Stores sets of unique observed values of each attribute.
 possibilities = Dict(col => remove_missing(unique(collect(dirty_table[!, col])))
                      for col in propertynames(dirty_table))
diff --git a/experiments/hospital/run.jl b/experiments/hospital/run.jl
@@ -26,8 +26,7 @@ PClean.@model HospitalModel begin
     @class Hospital begin
         @learned owner_dist::ProportionsParameter
         @learned service_dist::ProportionsParameter
-        loc ~ Place        
-        type ~ HospitalType
+        loc ~ Place; type ~ HospitalType
         provider ~ ChooseUniformly(possibilities[:ProviderNumber])
         name ~ StringPrior(3, 50, possibilities[:HospitalName])
         addr ~ StringPrior(10, 30, possibilities[:Address1])
@@ -36,7 +35,7 @@ PClean.@model HospitalModel begin
         zip ~ ChooseUniformly(possibilities[:ZipCode])
         service ~ ChooseProportionally(possibilities[:EmergencyService], service_dist)
     end;
-    @class Obs begin
+    @class Record begin
         begin
             hosp     ~ Hospital;                         service ~ AddTypos(hosp.service)
             provider ~ AddTypos(hosp.provider);          name    ~ AddTypos(hosp.name)
@@ -56,7 +55,7 @@ PClean.@model HospitalModel begin
     end;
 end;
 
-query = @query HospitalModel.Obs [
+query = @query HospitalModel.Record [
     ProviderNumber   hosp.provider          provider
     HospitalName     hosp.name              name
     HospitalType     hosp.type.desc         type
@@ -81,6 +80,6 @@ observations = [ObservedDataset(query, dirty_table)];
     run_inference!(trace, config);
 end
 
-results = evaluate_accuracy(dirty_table, clean_table, trace.tables[:Obs], query)
+results = evaluate_accuracy(dirty_table, clean_table, trace.tables[:Record], query)
 PClean.save_results("results", "hospital", trace, observations)
 println(results)
diff --git a/src/PClean.jl b/src/PClean.jl
@@ -3,6 +3,7 @@ module PClean
 using Distributions
 using LightGraphs
 using CSV
+using DataFrames: DataFrame
 
 include("utils.jl")
 
diff --git a/src/analysis.jl b/src/analysis.jl
@@ -1,5 +1,4 @@
 using Dates
-using DataFrames: DataFrame
 
 is_saveable(::RandomChoiceNode) = true
 is_saveable(::JuliaNode) = true
@@ -34,7 +33,7 @@ function save_results(dir, name, trace, observed_datasets, timestamp=true)
 end
 
 
-function evaluate_accuracy(dirty_data, clean_data, table, query)
+function evaluate_accuracy(dirty_data, clean_data, table, query; verbose=false)
   total_errors = 0
   total_changed = 0 # not including imputed
   total_cleaned = 0 # correct repairs; total_changed - total_cleaned gives incorrect repairs
@@ -71,13 +70,11 @@ function evaluate_accuracy(dirty_data, clean_data, table, query)
           total_changed += 1
           if our_version == clean[colname]
             total_cleaned += 1
-          else
+          elseif verbose
             println("Changed: $(dirty[colname]) -> $our_version instead of $(clean[colname])")
           end
-        else
-          if dirty[colname] != clean[colname]
-            println("Left unchanged: $(dirty[colname]) (should be $(clean[colname]))")
-          end
+        elseif verbose && dirty[colname] != clean[colname]
+          println("Left unchanged: $(dirty[colname]) (should be $(clean[colname]))")
         end
       end
     end
@@ -132,12 +129,6 @@ function evaluate_accuracy_up_to(dirty_data, clean_data, table, query, N)
           total_changed += 1
           if our_version == clean[colname]
             total_cleaned += 1
-          else
-           # println("Changed: $(dirty[colname]) -> $our_version instead of $(clean[colname])")
-          end
-        else
-          if dirty[colname] != clean[colname]
-           # println("Left unchanged: $(dirty[colname]) (should be $(clean[colname]))")
           end
         end
       end
diff --git a/src/dsl/query.jl b/src/dsl/query.jl
@@ -1,5 +1,3 @@
-using DataFrames: DataFrame
-
 struct Query
     model::PCleanModel
     class::ClassID

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,3 @@`
`1`		`-using DataFrames: DataFrame`
`2`		`-`
`3`	`1`	`struct Query`
`4`	`2`	`model::PCleanModel`
`5`	`3`	`class::ClassID`