Skip to content

Commit 8b5046a

Browse files
committed
Add README; minor changes
1 parent f51c948 commit 8b5046a

File tree

8 files changed

+124
-24
lines changed

8 files changed

+124
-24
lines changed

Project.toml

+1
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,6 @@ Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826"
1818
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
1919
Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
2020
PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"
21+
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
2122
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
2223
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"

README.md

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# PClean
2+
3+
[![Build Status](https://travis-ci.com/probcomp/PClean.svg?branch=master)](https://travis-ci.com/probcomp/PClean)
4+
5+
PClean: A Domain-Specific Probabilistic Programming Language for Bayesian Data Cleaning
6+
7+
*Warning: This is a rapidly evolving research prototype.*
8+
9+
PClean was created at the [MIT Probabilistic Computing Project](http://probcomp.csail.mit.edu/).
10+
11+
If you use PClean in your research, please cite our 2021 AISTATS paper:
12+
13+
PClean: Bayesian Data Cleaning at Scale with Domain-Specific Probabilistic Programming. Lew, A. K.; Agrawal, M.; Sontag, D.; and Mansinghka, V. K. (2021, March).
14+
In International Conference on Artificial Intelligence and Statistics (pp. 1927-1935). PMLR. ([pdf](http://proceedings.mlr.press/v130/lew21a/lew21a.pdf))
15+
16+
## Using PClean
17+
18+
19+
To use PClean, create a Julia file with the following structure:
20+
21+
```julia
22+
using PClean
23+
using DataFrames: DataFrame
24+
import CSV
25+
26+
# Load data
27+
data = CSV.File(filepath) |> DataFrame
28+
29+
# Define PClean model
30+
PClean.@model MyModel begin
31+
@class ClassName1 begin
32+
...
33+
end
34+
35+
...
36+
37+
@class ClassNameN begin
38+
...
39+
end
40+
end
41+
42+
# Align column names of CSV with variables in the model.
43+
# Format is ColumnName CleanVariable DirtyVariable, or, if
44+
# there is no corruption for a certain variable, one can omit
45+
# the DirtyVariable.
46+
query = @query MyModel.ClassNameN [
47+
HospitalName hosp.name observed_hosp_name
48+
Condition metric.condition.desc observed_condition
49+
...
50+
]
51+
52+
# Configure observed dataset
53+
observations = [ObservedDataset(query, data)]
54+
55+
# Configuration
56+
config = PClean.InferenceConfig(1, 2; use_mh_instead_of_pg=true)
57+
58+
# SMC initialization
59+
state = initialize_trace(observations, config)
60+
61+
# Rejuvenation sweeps
62+
run_inference!(state, config)
63+
64+
# Evaluate accuracy, if ground truth is available
65+
ground_truth = CSV.File(filepath) |> DataFrame
66+
results = evaluate_accuracy(data, ground_truth, state, query)
67+
68+
# Can print results.f1, results.precision, results.accuracy, etc.
69+
println(results)
70+
71+
# Even without ground truth, can save the entire latent database to CSV files:
72+
PClean.save_results(dir, dataset_name, state, observations)
73+
```
74+
75+
Then, from this directory, run the Julia file.
76+
77+
```
78+
JULIA_PROJECT=. julia my_file.jl
79+
```
80+
81+
To learn to write a PClean model, see [our paper](http://proceedings.mlr.press/v130/lew21a/lew21a.pdf), but note
82+
the surface syntax changes described below.
83+
84+
## Differences from the paper
85+
86+
As a DSL embedded into Julia, our implementation of the PClean language has some differences, in terms of surface syntax,
87+
from the stand-alone syntax presented in our paper:
88+
89+
(1) Instead of `latent class C ... end`, we write `@class C begin ... end`.
90+
91+
(2) Instead of `subproblem begin ... end`, inference hints are given using ordinary
92+
Julia `begin ... end` blocks.
93+
94+
(3) Instead of `parameter x ~ d(...)`, we use `@learned x :: D{...}`. The set of
95+
distributions D for parameters is somewhat restricted.
96+
97+
(4) Instead of `x ~ d(...) preferring E`, we write `x ~ d(..., E)`.
98+
99+
(5) Instead of `observe x as y, ... from C`, we write `@query ModelName.C [x y; ...]`.
100+
Clauses of the form `x z y` are also allowed, and tell PClean that the model variable
101+
`C.z` represents a clean version of `x`, whose observed (dirty) version is modeled
102+
as `C.y`. This is used when automatically reconstructing a clean, flat dataset.
103+
104+
The names of built-in distributions may also be different, e.g. `AddTypos` instead of `typos`,
105+
and `ProportionsParameter` instead of `dirichlet`.

experiments/flights/run.jl

+3-4
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@ include("load_data.jl")
55
websites = unique(dirty_table.src)
66

77
PClean.@model FlightsModel begin
8-
98
@class TrackingWebsite begin
109
name ~ StringPrior(2, 30, websites)
1110
end
12-
11+
1312
@class Flight begin
1413
begin
1514
flight_id ~ StringPrior(10, 20, flight_ids); @guaranteed flight_id
@@ -24,10 +23,10 @@ PClean.@model FlightsModel begin
2423
@learned error_probs::Dict{String, ProbParameter{10.0, 50.0}}
2524
begin
2625
flight ~ Flight;
27-
src ~ TrackingWebsite
2826
end
27+
src ~ TrackingWebsite
28+
error_prob = lowercase(src.name) == lowercase(flight.flight_id[1:2]) ? 1e-5 : error_probs[src.name]
2929
begin
30-
error_prob = lowercase(src.name) == lowercase(flight.flight_id[1:2]) ? 1e-5 : error_probs[src.name]
3130
sdt ~ MaybeSwap(flight.sdt, times_for_flight["$(flight.flight_id)-sched_dep_time"], error_prob)
3231
sat ~ MaybeSwap(flight.sat, times_for_flight["$(flight.flight_id)-sched_arr_time"], error_prob)
3332
adt ~ MaybeSwap(flight.adt, times_for_flight["$(flight.flight_id)-act_dep_time"], error_prob)

experiments/hospital/load_data.jl

+6
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,15 @@ dataset = "hospital"
55
dirty_table = CSV.File("datasets/$(dataset)_dirty.csv") |> DataFrame
66
clean_table = CSV.File("datasets/$(dataset)_clean.csv") |> DataFrame
77

8+
# In the dirty data, CSV.jl infers that PhoneNumber, ZipCode, and ProviderNumber
9+
# are strings. Our PClean script also models these columns as string-valued.
10+
# However, in the clean CSV file (without typos) it infers they are
11+
# numbers. To facilitate comparison of PClean's results (strings) with
12+
# ground-truth, we preprocess the clean values to convert them into strings.
813
clean_table[!, :PhoneNumber] = map(x -> "$x", clean_table[!, :PhoneNumber])
914
clean_table[!, :ZipCode] = map(x -> "$x", clean_table[!, :ZipCode])
1015
clean_table[!, :ProviderNumber] = map(x -> "$x", clean_table[!, :ProviderNumber])
1116

17+
# Stores sets of unique observed values of each attribute.
1218
possibilities = Dict(col => remove_missing(unique(collect(dirty_table[!, col])))
1319
for col in propertynames(dirty_table))

experiments/hospital/run.jl

+4-5
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ PClean.@model HospitalModel begin
2626
@class Hospital begin
2727
@learned owner_dist::ProportionsParameter
2828
@learned service_dist::ProportionsParameter
29-
loc ~ Place
30-
type ~ HospitalType
29+
loc ~ Place; type ~ HospitalType
3130
provider ~ ChooseUniformly(possibilities[:ProviderNumber])
3231
name ~ StringPrior(3, 50, possibilities[:HospitalName])
3332
addr ~ StringPrior(10, 30, possibilities[:Address1])
@@ -36,7 +35,7 @@ PClean.@model HospitalModel begin
3635
zip ~ ChooseUniformly(possibilities[:ZipCode])
3736
service ~ ChooseProportionally(possibilities[:EmergencyService], service_dist)
3837
end;
39-
@class Obs begin
38+
@class Record begin
4039
begin
4140
hosp ~ Hospital; service ~ AddTypos(hosp.service)
4241
provider ~ AddTypos(hosp.provider); name ~ AddTypos(hosp.name)
@@ -56,7 +55,7 @@ PClean.@model HospitalModel begin
5655
end;
5756
end;
5857

59-
query = @query HospitalModel.Obs [
58+
query = @query HospitalModel.Record [
6059
ProviderNumber hosp.provider provider
6160
HospitalName hosp.name name
6261
HospitalType hosp.type.desc type
@@ -81,6 +80,6 @@ observations = [ObservedDataset(query, dirty_table)];
8180
run_inference!(trace, config);
8281
end
8382

84-
results = evaluate_accuracy(dirty_table, clean_table, trace.tables[:Obs], query)
83+
results = evaluate_accuracy(dirty_table, clean_table, trace.tables[:Record], query)
8584
PClean.save_results("results", "hospital", trace, observations)
8685
println(results)

src/PClean.jl

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ module PClean
33
using Distributions
44
using LightGraphs
55
using CSV
6+
using DataFrames: DataFrame
67

78
include("utils.jl")
89

src/analysis.jl

+4-13
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
using Dates
2-
using DataFrames: DataFrame
32

43
is_saveable(::RandomChoiceNode) = true
54
is_saveable(::JuliaNode) = true
@@ -34,7 +33,7 @@ function save_results(dir, name, trace, observed_datasets, timestamp=true)
3433
end
3534

3635

37-
function evaluate_accuracy(dirty_data, clean_data, table, query)
36+
function evaluate_accuracy(dirty_data, clean_data, table, query; verbose=false)
3837
total_errors = 0
3938
total_changed = 0 # not including imputed
4039
total_cleaned = 0 # correct repairs; total_changed - total_cleaned gives incorrect repairs
@@ -71,13 +70,11 @@ function evaluate_accuracy(dirty_data, clean_data, table, query)
7170
total_changed += 1
7271
if our_version == clean[colname]
7372
total_cleaned += 1
74-
else
73+
elseif verbose
7574
println("Changed: $(dirty[colname]) -> $our_version instead of $(clean[colname])")
7675
end
77-
else
78-
if dirty[colname] != clean[colname]
79-
println("Left unchanged: $(dirty[colname]) (should be $(clean[colname]))")
80-
end
76+
elseif verbose && dirty[colname] != clean[colname]
77+
println("Left unchanged: $(dirty[colname]) (should be $(clean[colname]))")
8178
end
8279
end
8380
end
@@ -132,12 +129,6 @@ function evaluate_accuracy_up_to(dirty_data, clean_data, table, query, N)
132129
total_changed += 1
133130
if our_version == clean[colname]
134131
total_cleaned += 1
135-
else
136-
# println("Changed: $(dirty[colname]) -> $our_version instead of $(clean[colname])")
137-
end
138-
else
139-
if dirty[colname] != clean[colname]
140-
# println("Left unchanged: $(dirty[colname]) (should be $(clean[colname]))")
141132
end
142133
end
143134
end

src/dsl/query.jl

-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
using DataFrames: DataFrame
2-
31
struct Query
42
model::PCleanModel
53
class::ClassID

0 commit comments

Comments
 (0)