Parallel #21

Open

SpeedyRagou wants to merge 69 commits into master

Commits (69)
6d6b38d
created abstract classes for detection and correction. added parallel…
SpeedyRagou Nov 9, 2023
5f2e499
fixed issues with detection
SpeedyRagou Nov 9, 2023
ae20554
updated correction example
SpeedyRagou Nov 9, 2023
8e67bb9
updated gitignore to include results folder of detection and correction
SpeedyRagou Nov 9, 2023
c7626e8
updated parallel correction and detection to mirror interface and fix…
SpeedyRagou Nov 9, 2023
b396674
fixed DetectionParallel signature to match interface
SpeedyRagou Nov 9, 2023
7e7f270
added parallel example
SpeedyRagou Nov 9, 2023
adcda92
optimized imports
SpeedyRagou Nov 9, 2023
deed560
fixed bugs
SpeedyRagou Nov 9, 2023
8c84c33
fixed bug at parallel detection
SpeedyRagou Nov 9, 2023
6f3d5e7
updated init
SpeedyRagou Nov 9, 2023
03e0474
fixed bug
SpeedyRagou Nov 9, 2023
e69bab3
added parallel to init
SpeedyRagou Nov 9, 2023
57840fb
fixed import issue
SpeedyRagou Nov 9, 2023
f20feaa
updated parallel detection example
SpeedyRagou Nov 10, 2023
36229b1
added try catch for shutting down client
SpeedyRagou Nov 10, 2023
d1a7d2e
matched signature for CorrectionParallel to abstract parent class
SpeedyRagou Nov 10, 2023
1b7901e
fixed bug
SpeedyRagou Nov 10, 2023
bd6d436
added comment
SpeedyRagou Nov 10, 2023
a09aa5b
updated parallel detection run to use dataset instead of dictionary
SpeedyRagou Nov 10, 2023
d58a47e
added new comment
SpeedyRagou Nov 10, 2023
b7bac5b
updated baselines.py to use sequential dataset
SpeedyRagou Nov 10, 2023
8c67ba5
fixed a few import
SpeedyRagou Nov 10, 2023
761ad42
updated utilities.py to work with sequential raha
SpeedyRagou Nov 10, 2023
8c9a3a9
updated pipeline_1 to work with sequential raha and baran
SpeedyRagou Nov 10, 2023
8226d75
updated pipeline_2 to work with sequential raha and baran
SpeedyRagou Nov 10, 2023
7602d2e
updated pipeline_3 to work with sequential raha and baran
SpeedyRagou Nov 10, 2023
75c53fe
updated benchmark to work with sequential raha and baran
SpeedyRagou Nov 10, 2023
e716da2
added stub for dataset interface
SpeedyRagou Nov 10, 2023
a259d4b
added dataset interface and made dataset_parallel signatures match th…
SpeedyRagou Nov 12, 2023
d5012d9
updated requirements to include dask
SpeedyRagou Nov 12, 2023
ddec072
updated README.md
SpeedyRagou Nov 13, 2023
d399314
updated interface
SpeedyRagou Nov 13, 2023
823e273
fixed arguments
SpeedyRagou Nov 13, 2023
a4a2edd
fixed dataframe
SpeedyRagou Nov 13, 2023
67547f1
updated requirements.txt
SpeedyRagou Nov 13, 2023
3aaa28b
updated detection interface with comments
SpeedyRagou Nov 13, 2023
c3f4897
renamed folders. made sure parallel detection mirrors original detection
SpeedyRagou Nov 15, 2023
bf99289
added reference for dask version
SpeedyRagou Nov 15, 2023
d5fd1b2
fixed import issues after changing folder structure and class name
SpeedyRagou Nov 15, 2023
f30cb3e
fixed import issues after changing folder structure and class name
SpeedyRagou Nov 15, 2023
ed15e12
updated imports
SpeedyRagou Nov 15, 2023
9947649
updated dask folder to dask_version to stop name collision
SpeedyRagou Nov 15, 2023
022ab8c
fixed imports
SpeedyRagou Nov 15, 2023
eb053f8
reshuffled files
SpeedyRagou Nov 15, 2023
c4490e3
updated setup to have optional install for dask version
SpeedyRagou Nov 15, 2023
9fe30f0
updated MANIFEST.in
SpeedyRagou Nov 15, 2023
327c45b
added verbose support
SpeedyRagou Nov 16, 2023
5c552f2
updated README.md
SpeedyRagou Nov 16, 2023
df93c1b
added verbose
SpeedyRagou Nov 16, 2023
6e26a7b
fixed double assignment
SpeedyRagou Nov 16, 2023
d88d776
fixed correct path
SpeedyRagou Nov 16, 2023
af0f76d
added comment
SpeedyRagou Nov 16, 2023
385c82d
windows fix for original version
SpeedyRagou Nov 23, 2023
7cd013e
fixed too long sharedmemory names
SpeedyRagou Nov 24, 2023
6b023aa
quick fix
SpeedyRagou Nov 24, 2023
962da0d
quick fix
SpeedyRagou Nov 24, 2023
b94cba5
added constant.py usage for detection.py; added version of pandas to …
SpeedyRagou Dec 1, 2023
31c684d
fixed pre-loading
SpeedyRagou Jan 8, 2024
43b979c
added result_storing for dask_version
SpeedyRagou Jan 8, 2024
253ffa2
fixed strategy filtering
SpeedyRagou Jan 8, 2024
8278258
removed flags
SpeedyRagou Jan 12, 2024
49bc6c9
fixed sequential raha executions
SpeedyRagou Jan 14, 2024
2f6b5e4
baran tiny fix
SpeedyRagou Jan 14, 2024
b73dc81
raha dask_version -> does not try to create new folder if folder alre…
SpeedyRagou Jan 16, 2024
61a08c0
removed idea files from git
SpeedyRagou Jan 16, 2024
f0a876c
updated gitignore
SpeedyRagou Jan 16, 2024
df2f176
tiny baran fix
SpeedyRagou Jan 18, 2024
ac7e0be
tiny baran fix
SpeedyRagou Jan 18, 2024
.gitignore: 4 changes (3 additions & 1 deletion)
@@ -152,9 +152,11 @@ dmypy.json
# Cython debug symbols
cython_debug/

+# Raha
+raha-baran-results-*
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
MANIFEST.in: 9 changes (6 additions & 3 deletions)
@@ -1,3 +1,6 @@
-recursive-include raha/tools/dBoost *
-recursive-include raha/tools/KATARA *
-include raha/tools/KATARA/knowledge-base/*
+recursive-include raha/original/tools/dBoost *
+recursive-include raha/original/tools/KATARA *
+include raha/original/tools/KATARA/knowledge-base/*
+recursive-include raha/dask_version/tools/dBoost *
+recursive-include raha/dask_version/tools/KATARA *
+include raha/dask_version/tools/KATARA/knowledge-base/*
README.md: 17 changes (14 additions & 3 deletions)
@@ -9,22 +9,32 @@ To install Raha and Baran, you can run:
```console
pip3 install raha
```
+To install Raha and Baran with Dask, you can run:
+```console
+pip3 install raha[dask]
+```

To install Raha and Baran using the github repository:
```console
git clone [email protected]:BigDaMa/raha.git
pip3 install -e raha
```

+To install Raha and Baran with Dask using the github repository:
+```console
+git clone [email protected]:BigDaMa/raha.git
+pip3 install -e raha[dask]
+```

To uninstall them, you can run:
```console
pip3 uninstall raha
```

## Usage
Running Raha and Baran is simple!
-- **Benchmarking**: If you have a dirty dataset and its corresponding clean dataset and you want to benchmark Raha and Baran, please check the sample code in `raha/benchmark.py`, `raha/detection.py`, and `raha/correction.py`.
-- **Interactive data cleaning with Raha and Baran**: If you have a dirty dataset and you want to interactively detect and correct data errors, please check our interactive Jupyter notebooks in the `raha` folder. The Jupyter notebooks provide graphical user interfaces.
+- **Benchmarking**: If you have a dirty dataset and its corresponding clean dataset and you want to benchmark Raha and Baran, please check the sample code in `raha/original/benchmark.py`, `raha/original/detection.py`, and `raha/original/correction.py`.
+- **Interactive data cleaning with Raha and Baran**: If you have a dirty dataset and you want to interactively detect and correct data errors, please check our interactive Jupyter notebooks in the `raha/original` folder. The Jupyter notebooks provide graphical user interfaces.
![Data Annotation](pictures/ui.png)
![Promising Strategies](pictures/ui_strategies.png)
![Drill Down](pictures/ui_clusters.png)
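For orientation, here is a minimal benchmarking sketch. It is an assumption-laden illustration, not code from this PR: it presumes the entry points under `raha/original` keep the interface of the published `raha` package (a `Detection` class whose `run` method takes a dataset dictionary), and the dataset name and paths are placeholders.

```python
import raha

# Placeholder dataset dictionary; the name and paths are hypothetical.
dataset_dictionary = {
    "name": "flights",
    "path": "datasets/flights/dirty.csv",
    "clean_path": "datasets/flights/clean.csv"
}

# Run error detection; the result is a dictionary keyed by the
# (row index, column index) positions of the detected cells.
detector = raha.Detection()
detection_dictionary = detector.run(dataset_dictionary)
print("{} cells flagged as potentially erroneous.".format(len(detection_dictionary)))
```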
@@ -61,7 +71,8 @@ You can find more information about this project and the authors [here](https://
publisher={VLDB Endowment}
}
```

+### Dask Version
+The implementation for Raha and Baran with Dask was created by Yusuf Mandirali. The original code can be found [here](https://github.com/yimlyim/DaskRaha).

## A Note on the Naming
Raha and Baran are Persian feminine names that are conceptually related to their corresponding error detection/correction systems. Raha (which means "free" in Persian) is assigned to our "configuration-free" error detection system. Baran (which means "rain" in Persian and rain washes/cleans everything) is assigned to our error correction system that "cleans" data.
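The parallel entry point described in the Dask Version note can be sketched the same way. This is a hypothetical illustration: the `DetectionParallel` class name comes from the commit messages, and its `run` signature is assumed to mirror the sequential `Detection` interface, as the commits matching both classes to a shared abstract parent suggest.

```python
import raha

# Placeholder dataset dictionary, as in the sequential sketch above.
dataset_dictionary = {
    "name": "flights",
    "path": "datasets/flights/dirty.csv",
    "clean_path": "datasets/flights/clean.csv"
}

# Assumed interface: DetectionParallel.run() accepts the same dataset
# dictionary as Detection.run() and distributes the strategy execution
# with Dask instead of running it sequentially.
parallel_detector = raha.DetectionParallel()
detection_dictionary = parallel_detector.run(dataset_dictionary)
```

The commit adding a try/except around client shutdown (36229b1) suggests the class manages its own `dask.distributed` client internally, so local runs should not require separate cluster setup.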
raha/__init__.py: 29 changes (25 additions & 4 deletions)
@@ -1,8 +1,29 @@
from .dataset import *
from .detection import *
from .correction import *
from .baselines import *
from .utilities import *
from .constants import *
from .dataset import *
from .benchmark import *
from .tools.KATARA.katara import *
from .tools.dBoost.dboost.imported_dboost import *

from .original.utilities import *

from .original.tools.KATARA import *
from .original.tools.KATARA.katara import *
from .original.tools.dBoost import *
from .original.tools.dBoost.dboost import *
from .original.tools.dBoost.dboost.imported_dboost import *
from .original.dataset import *
from .original.detection import *
from .original.correction import *
from .original import *

from .dask_version import *
from .dask_version.dataset_parallel import *
from .dask_version.tools.KATARA import *
from .dask_version.tools.KATARA.katara import *
from .dask_version.tools.dBoost import *
from .dask_version.tools.dBoost.dboost import *
from .dask_version.tools.dBoost.dboost.imported_dboost import *
from .dask_version.container import *
from .dask_version.detection_parallel import *
from .dask_version.correction_parallel import *
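If these wildcard re-exports resolve the way they do in the published package, both implementations become reachable from the package root. A minimal sketch, assuming `Detection` and `DetectionParallel` are exported by `raha.original.detection` and `raha.dask_version.detection_parallel` respectively:

```python
import raha

# Choose an implementation at the package root; the wildcard imports
# above re-export both the sequential and the Dask-based classes.
sequential_detector = raha.Detection()        # assumed from raha.original.detection
parallel_detector = raha.DetectionParallel()  # assumed from raha.dask_version.detection_parallel
```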
raha/baselines.py: 14 changes (7 additions & 7 deletions)
@@ -107,7 +107,7 @@ def run_dboost(self, dd):
print("------------------------------------------------------------------------\n"
"------------------------------Running dBoost----------------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
sp_folder_path = os.path.join(os.path.dirname(dd["path"]), "raha-baran-results-" + d.name, "strategy-profiling")
strategy_profiles_list = [pickle.load(open(os.path.join(sp_folder_path, strategy_file), "rb"))
for strategy_file in os.listdir(sp_folder_path)]
@@ -135,7 +135,7 @@ def run_nadeef(self, dd):
print("------------------------------------------------------------------------\n"
"------------------------------Running NADEEF----------------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
detection_dictionary = {}
for fd in self.DATASET_CONSTRAINTS[d.name]["functions"]:
l_attribute, r_attribute = fd
@@ -171,7 +171,7 @@ def run_katara(self, dd):
print("------------------------------------------------------------------------\n"
"------------------------------Running KATARA----------------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
sp_folder_path = os.path.join(os.path.dirname(dd["path"]), "raha-baran-results-" + d.name, "strategy-profiling")
strategy_profiles_list = [pickle.load(open(os.path.join(sp_folder_path, strategy_file), "rb"))
for strategy_file in os.listdir(sp_folder_path)]
@@ -190,7 +190,7 @@ def run_activeclean(self, dd, sampling_budget=20):
print("------------------------------------------------------------------------\n"
"----------------------------Running ActiveClean-------------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
actual_errors_dictionary = d.get_actual_errors_dictionary()
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(min_df=1, stop_words="english")
text = [" ".join(row) for row in d.dataframe.values.tolist()]
@@ -238,7 +238,7 @@ def run_min_k(self, dd):
print("------------------------------------------------------------------------\n"
"------------------------------Running Min-k-----------------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
sp_folder_path = os.path.join(os.path.dirname(dd["path"]), "raha-baran-results-" + d.name, "strategy-profiling")
strategy_profiles_list = [pickle.load(open(os.path.join(sp_folder_path, strategy_file), "rb"))
for strategy_file in os.listdir(sp_folder_path)]
@@ -272,7 +272,7 @@ def run_maximum_entropy(self, dd, sampling_budget=20):
print("------------------------------------------------------------------------\n"
"--------------------------Running Maximum Entropy-----------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
actual_errors_dictionary = d.get_actual_errors_dictionary()
sp_folder_path = os.path.join(os.path.dirname(dd["path"]), "raha-baran-results-" + d.name, "strategy-profiling")
strategy_profiles_list = [pickle.load(open(os.path.join(sp_folder_path, strategy_file), "rb"))
@@ -306,7 +306,7 @@ def run_metadata_driven(self, dd, sampling_budget=20):
print("------------------------------------------------------------------------\n"
"--------------------------Running Metadata Driven-----------------------\n"
"------------------------------------------------------------------------")
-d = raha.dataset.Dataset(dd)
+d = raha.original.dataset.Dataset(dd)
actual_errors_dictionary = d.get_actual_errors_dictionary()
dboost_output = self.run_dboost(dd)
nadeef_output = self.run_nadeef(dd)