diff --git a/.github/workflows/eval-overhead-e2e.yml b/.github/workflows/eval-overhead-e2e.yml index 8e40cb91..2b28020d 100644 --- a/.github/workflows/eval-overhead-e2e.yml +++ b/.github/workflows/eval-overhead-e2e.yml @@ -6,16 +6,16 @@ on: paths: - '.github/workflows/**' - 'eval_scripts/perf_benchmark/**' - - 'mldaikon/instrumentor/**' - - 'mldaikon/proxy_wrapper/**' - - 'mldaikon/collect_trace.py' + - 'traincheck/instrumentor/**' + - 'traincheck/proxy_wrapper/**' + - 'traincheck/collect_trace.py' pull_request: paths: - '.github/workflows/**' - 'eval_scripts/perf_benchmark/**' - - 'mldaikon/instrumentor/**' - - 'mldaikon/proxy_wrapper/**' - - 'mldaikon/collect_trace.py' + - 'traincheck/instrumentor/**' + - 'traincheck/proxy_wrapper/**' + - 'traincheck/collect_trace.py' permissions: diff --git a/.github/workflows/pre-commit-checks.yml b/.github/workflows/pre-commit-checks.yml index d3943689..41a3f4c4 100644 --- a/.github/workflows/pre-commit-checks.yml +++ b/.github/workflows/pre-commit-checks.yml @@ -6,12 +6,12 @@ on: - main paths: - '.github/workflows/**' - - 'mldaikon/**' + - 'traincheck/**' - 'tests/**' pull_request: paths: - '.github/workflows/**' - - 'mldaikon/**' + - 'traincheck/**' - 'tests/**' jobs: @@ -42,19 +42,19 @@ jobs: - name: Run black id: black - run: black --check mldaikon --exclude tests + run: black --check traincheck --exclude tests - name: Run mypy on main source code folder id: mypy - run: mypy mldaikon --install-types --non-interactive --ignore-missing-imports + run: mypy traincheck --install-types --non-interactive --ignore-missing-imports - name: Run isort id: isort - run: isort --check --profile=black mldaikon --skip tests + run: isort --check --profile=black traincheck --skip tests - name: Run ruff id: ruff - run: ruff check mldaikon + run: ruff check traincheck - name: Check if any checks failed if: failure() diff --git a/.gitignore b/.gitignore index 810d90a9..7036fc5b 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,7 @@ experiments/* *.pt *.pstats -_ml_daikon_* +_traincheck_* test_meta_hypothesis_combination.py !call_graph.json @@ -44,7 +44,7 @@ torch_wrapper.py trace-analyzer.ipynb instrumented_84911_watch.py results/1_0.01/case_1_confusion_matrix.csv -!/mldaikon/static_analyzer/func_level/*.log +!/traincheck/static_analyzer/func_level/*.log *.json *.prof @@ -67,6 +67,5 @@ eval_scripts/perf_benchmark/overhead-e2e/*/traincheck* *.pth* *ubyte* eval_scripts/**/*.png -traincheck -mldaikon_run* +traincheck_run* trace_* \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3cd1ab8e..54abfa91 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: rev: v1.9.0 hooks: - id: mypy - files: ^mldaikon/ + files: ^traincheck/ types: [python] args: [--install-types, --non-interactive, --ignore-missing-imports] # args: [--strict] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..115d4479 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2025 OrderLab and University of Michigan + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index ab225a20..46d341d6 100644 --- a/README.md +++ b/README.md @@ -1,48 +1,40 @@ -# ML-DAIKON -[![Pre-commit checks](https://github.com/OrderLab/ml-daikon/actions/workflows/pre-commit-checks.yml/badge.svg)](https://github.com/OrderLab/ml-daikon/actions/workflows/pre-commit-checks.yml) - -Instrumentor Performance Benchmark Results: http://orderlab.io/ml-daikon/dev/bench/ - -## Instrumentator Usage -ML-Daikon performs automatic instrumentation of programs and supports out-of-tree execution. To use the instrumentor, please install mldaikon as a pip package in the desired python environment where the example pipeline should be run in. - -To install the instrumentor: -```shell -git clone git@github.com:OrderLab/ml-daikon.git -cd ml-daikon -pip3 install -e . -conda install cudatoolkit -``` - -A typical instrumentor invocation looks like -```bash -python3 -m mldaikon.collect_trace \ - -p \ - -s \ - -t [names of the module to be instrumented, e.g. torch, megatron] \ # `torch` is the default value here so you probably don't need to set it - --scan_proxy_in_args \ # dynamic analysis for APIContainRelation in 84911, keep it on - --allow_disable_dump \ # skip instrumentation for functions in modules specified in config.WRAP_WITHOUT_DUMP, keep it on for instrumentor overhead, inform @Essoz if you need those functions for invariant inference - -d # enabling debug logging, if you are not debugging the trace collector, you probably don't need it -``` - -The instrumentor will dump the collected trace to the folder where you invoked the command. There should be one trace per thread and the names of trace files follow the pattern: -```bash -_ml_daikon__mldaikon_trace_API___.log -``` -After execution completion, you can also look at `program_output.txt` for the stdout and stderr of the pipeline being executed. - -## Infer Engine Usage - -```bash -python3 -m mldaikon.infer_engine \ - -t \ - -d \ # enable debug logging - -o invariant.json \ # name of the file to dump the inferred invariants to -``` - -There are two other arguments that you might need. -```bash ---disable_precond_sampling \ # by default we enable sampling of examples to be used in precondition inference when the number of examples exceeds 10000. Sampling might cause us to lose information and you can disable this behavior by setting this flag. ---precond_sampling_threshold \ # the default threshold to sample examples is 10000, change this if you need to -``` +[![format and types](https://github.com/OrderLab/traincheck/actions/workflows/pre-commit-checks.yml/badge.svg)](https://github.com/OrderLab/traincheck/actions/workflows/pre-commit-checks.yml) +[![Chat on Discord](https://img.shields.io/discord/1362661016760090736?label=Discord&logo=discord&style=flat)](https://discord.gg/DPEd7Xeg) + +# TrainCheck + +TrainCheck is a lightweight, extensible tool for runtime monitoring of “silent” bugs in deep‑learning training pipelines. Instead of waiting for a crash or a bad model, TrainCheck: +1. **Automatically instruments** your existing training scripts (e.g., from [pytorch/examples](https://github.com/pytorch/examples) or [huggingface/transformers/examples](https://github.com/huggingface/transformers/tree/main/examples)), inserting tracing hooks with minimal code changes. +2. 
**Learns precise invariants**–precise properties that should hold during training across API calls and model updates-by analyzing executions of known-good runs. +3. **Catches silent issues early**–by checking invariants on new or modified training jobs, alerting you immediately if something didn't happen as expected (e.g., model weight inconsistency, mixed precision not applied successfully, unexpected tensor shapes). On violation, TrainCheck flags the point of divergence—so users can diagnose silent issues before they derail your model. + +![Workflow](docs/assets/images/workflow.png) + +Under the hood, TrainCheck decomposes into three CLI tools: +- **Instrumentor** (`traincheck-collect`) + Wraps target training programs with lightweight tracing logic. It produces an instrumented version of the target program that logs API calls and model states without altering training semantics. +- **Inference Engine** (`traincheck-infer`) + Consumes one or more trace logs from successful runs to infer low‑level invariants. +- **Checker** (`traincheck-check`) + Runs alongside or after new training jobs to verify that each recorded event satisfies the inferred invariants. + +## Status + +TrainCheck is under active development. Features may be incomplete and the documentation is evolving—if you give it a try, please join our 💬 [Discord server](https://discord.gg/DPEd7Xeg) or file a GitHub issue for support. Currently, the **Checker** operates in a semi‑online mode: you invoke it against the live, growing trace output to catch silent bugs as they appear. Fully automatic monitoring is on the roadmap, and we welcome feedback and contributions from early adopters. + +## Try TrainCheck + +1. **Install** + Follow the [Installation Guide](./docs/installation-guide.md) to get TrainCheck set up on your machine. + +2. **Explore** + Work through our "[5‑Minute Experience with TrainCheck](./docs/5-min-tutorial.md)" tutorial. You’ll learn how to: + - Instrument a training script and collect a trace + - Automatically infer low‑level invariants + - Run the Checker in semi‑online mode to uncover silent bugs + +## Documentation +Please visit [TrainCheck Technical Doc](./docs/technical-doc.md). + +🕵️‍♀️ OSDI AE members, please see [TrainCheck AE Guide](./docs/ae.md). diff --git a/docs/5-min-tutorial.md b/docs/5-min-tutorial.md new file mode 100644 index 00000000..fe951b8c --- /dev/null +++ b/docs/5-min-tutorial.md @@ -0,0 +1,294 @@ +# Quick Start: TrainCheck Tutorial + +In this tutorial, you will use TrainCheck to detect & diagnose the real‑world silent issue in [PyTorch‑Forum‑84911: Obtaining abnormal changes in loss and accuracy](https://discuss.pytorch.org/t/obtaining-abnormal-changes-in-loss-and-accuracy/84911), with invariants inferred from PyTorch’s official MNIST example. We’ll refer to this buggy pipeline simply as '84911'. + +**Estimated time**: ~5 minutes (plus model/inference overhead) + +**Prerequisites** +- [A working TrainCheck installation](./installation-guide.md) +- `efficientnet_pytorch` (install via `pip3 install efficientnet_pytorch`) +- A Linux machine with a CUDA‑enabled GPU + - 💡 Tip: If you don’t have a CUDA GPU, you can still run this tutorial on CPU—it’ll just take longer. + +## Background: What’s wrong with 84911? +The author attempts to finetune a pretrained `EfficientNet_b0` model for image classification but notices that—even after many epochs—the training loss barely improves (x‑axis = epoch, y‑axis = loss): + +
+*Figure: loss curve vs. epochs*
+
+The plot suggests the model is still being trained, yet it is simply not improving in any meaningful way.
+The original issue thread discussed adjusting the learning rate and training for more epochs, but the issue remained unresolved.
+
+We have diagnosed the root cause for you. You can read it now or arrive at it yourself with the help of TrainCheck.
+
+<details>
+<summary>Click here to reveal the root cause.</summary>
+
+The developer freezes all parameters except the batch normalization layers, then also freezes the final fully-connected layer, yet initializes the optimizer with exactly those (frozen) fully-connected parameters.
+
+```python
+for name,param in model_transfer.module.named_parameters():
+    if("bn" not in name):
+        param.requires_grad = False
+
+for param in model_transfer.module._fc.parameters():
+    param.requires_grad = False
+
+...
+optimizer_transfer = optim.Adam(model_transfer.module._fc.parameters(), lr=0.001)
+```
+
+This freeze logic leaves the optimizer with no trainable parameters at all. Since batch normalization layers still update their running mean/variance on each forward pass, the loss/accuracy curves drift slightly instead of staying flat, masking the lack of actual learning. Logging metrics only once per epoch further hides the anomaly, so the initialization bug only becomes apparent after several epochs have already run.
+
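+If you want to confirm this on your own copy of the pipeline, a quick sanity check (not part of the original script; it assumes the variable names `model_transfer` and `optimizer_transfer` from `84911.py`) is to count how many parameters the optimizer can actually update:
+
+```python
+# Ad-hoc sanity check: how many tensors can the optimizer actually update?
+trainable_in_model = sum(1 for p in model_transfer.parameters() if p.requires_grad)
+trainable_in_optim = sum(
+    1 for group in optimizer_transfer.param_groups
+    for p in group["params"] if p.requires_grad
+)
+print(f"trainable tensors in the model:     {trainable_in_model}")  # only the BN layers
+print(f"trainable tensors in the optimizer: {trainable_in_optim}")  # 0, so nothing can be learned
+```
+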
+</details>
+
+## Detecting & Diagnosing 84911
+
+We will infer invariants from `mnist.py`, a simple official PyTorch example that trains a 2-layer CNN on MNIST, to showcase TrainCheck's capability.
+
+### 1. Download example scripts
+
+```bash
+cd ~
+mkdir traincheck-tutorial && cd traincheck-tutorial
+wget https://raw.githubusercontent.com/OrderLab/traincheck/main/docs/assets/code/mnist.py
+wget https://raw.githubusercontent.com/OrderLab/traincheck/main/docs/assets/code/84911.py
+```
+
+💡 If the wget links above fail (e.g. due to branch changes or access issues), you can also download the files manually from:
+- [mnist.py](assets/code/mnist.py)
+- [84911.py](assets/code/84911.py)
+
+### 2. **Instrument & collect a trace from `mnist.py`** (~1 minute)
+
+```bash
+traincheck-collect \
+    --pyscript mnist.py \
+    --models-to-track model \
+    --output-dir traincheck_mnist_trace
+```
+
+This instruments `torch` and the `model` object in `mnist.py`, runs the script with its default arguments, and writes JSON trace files into `traincheck_mnist_trace/` (≈ 1 minute). You’ll see the training logs and any benign PyTorch warnings on stdout.
+
+### 3. **Infer Invariants from `mnist.py`** (~1-4 minutes)
+
+We will infer invariants from the trace we just collected using the command below.
+
+```bash
+traincheck-infer -f ./traincheck_mnist_trace
+```
+This produces an `invariants.json` file (one JSON line per invariant). Verify the count:
+
+```bash
+wc -l invariants.json # should output ~913
+```
+
+The generated invariants capture API invocation order, event expectations, and input-output relationships. Since the trace comes from a single, simple script, some invariants may overfit; we’ll cover filtering in the next steps.
+
+### 4. Check for silent issues in **84911.py** with the invariants (~5-10 minutes)
+
+> **Note**: For this quickstart, we do offline checking for simplicity.
+
+```bash
+# trace the buggy pipeline (~5 minutes)
+traincheck-collect \
+    --pyscript 84911.py \
+    --models-to-track model_transfer \
+    --output-dir traincheck_84911_trace
+
+# run the checker (~2–6 minutes)
+traincheck-check \
+    --trace-dir traincheck_84911_trace \
+    --invariants invariants.json
+```
+
+The output of the `traincheck-check` command will end with a summary like this:
+```bash
+Checking finished. 913 invariants checked
+Total failed invariants: 25/913
+Total passed invariants: 888/913 # number here includes both passed and not triggered invariants
+Total invariants that are not triggered: 552/913
+```
+
+In other words, 361 invariants were actually triggered on `84911.py`, and 25 of them were violated.
+
+The checker writes the full results to a folder named `traincheck_checker_results_`, containing the results (`failed_*.log`, `not_triggered_*.log`, `passed_*.log`, depending on whether an invariant was violated, never triggered, or checked and passed) and a copy of `invariants.json`.
+
+### 5. Detection & Diagnosis
+
+Ready to play detective? 🔍 TrainCheck flagged **25 invariant violations** right at the start of training, well before the fluctuating loss/accuracy pattern could be observed. Let’s interpret the results first; the collapsible section further below explains how to inspect the raw checker output.
+
+**1. Quick filter**
+- **Event-order invariant noise** (20/25 failures):
+  - `FunctionCoverRelation` and `FunctionLeadRelation` invariants (which specify API invocation orders) overfit our single demo trace.
+  - Examples: strict ordering of `torch.distributed.is_initialized` (6 invariants violated, but we are not even doing distributed training in 84911!)
or `torch.cuda.is_initialized` (another 7 violated invariants that should not matter for training at all).
+  - **Ignore these**.
+
+**2. Spot the real issues**
+- **APIContainRelation** invariant violations (5/25):
+  1. `Optimizer.zero_grad` did **not** reset `.grad` from non-zero to zero/None.
+     - Implies either no gradients were ever populated or zeroing silently failed.
+  2. `Adadelta.step` did **not** update `.data` of any parameters.
+     - Indicates the optimizer had **no trainable parameters** to touch.
+
+**🧩 Putting it all together: the optimizer wasn’t updating anything because the parameters it received had `requires_grad=False`. [Go to Background: What’s wrong with 84911?](#background-whats-wrong-with-84911) to see the full root cause confirmed and explained.**
+
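+For reference, here is a minimal sketch of one way to repair the freeze logic, assuming the variable names from `84911.py` and that you only intend to finetune the final fully-connected layer (this fix is ours, not taken from the original forum thread):
+
+```python
+# Sketch of a fix: freeze the backbone but keep the classifier head trainable,
+# and hand exactly the trainable parameters to the optimizer.
+for param in model_transfer.parameters():
+    param.requires_grad = False
+for param in model_transfer._fc.parameters():
+    param.requires_grad = True
+
+optimizer_transfer = optim.Adadelta(
+    [p for p in model_transfer.parameters() if p.requires_grad], lr=1
+)
+```
+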
+<details>
+<summary>🙋 Click here to learn how to inspect the raw results</summary>
+ +Open the `failed_*.log` file—TrainCheck writes each violated invariant as a standalone JSON object. For example: + +```json +{ + "invariant": { … }, + "check_passed": false, + "triggered": true, + "detection_time": 18343040207314524, + "detection_time_percentage": 0.1805434802294184, + "trace": [ + { + "func_call_id": "...", + "meta_vars.step": 1, + "function": "torch.optim.optimizer.Optimizer.zero_grad", + … + } + ... + ] +} +``` + +- `"invariant"` shows the invariant that this result correspond to, and +- `"trace"` corresponds to the specific trace that caused the violation. +- `"check_passed": false` means that the invariant has been violated. +- `"triggered": true` means that the invariant has been checked at least once, which is always the case if the invariant is violated. +- `"detection_time"` is the timestamp when the violation happened. +- `"detection_percentage"` is the percentage of this timestamp in the entire duration of the training, and gives a rough impression of how early the detection is. We are working on providing a field `"detection_step"` that pinpoints on which step the issue is detected. For now, to get "step", you can look at the `"trace"` field and look for step numbers in `"meta_vars"`. + +For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero to zero/None" is represented as: + +```json +{ + "invariant": { + "relation": "APIContainRelation", + "params": [ + { + "param_type": "APIParam", + "api_full_name": "torch.optim.optimizer.Optimizer.zero_grad" + }, + { + "param_type": "VarTypeParam", + "var_type": "torch.nn.Parameter", + "attr_name": "grad", + "pre_value": "non_zero", + "post_value": null + } + ], + "precondition": { + "parent_func_call_pre": { + "inverted": true, + "preconditions": [ + { + "clauses": [ + { + "type": "constant", + "prop_name": "meta_vars.step", + "additional_path": "None", + "prop_dtype": "int", + "values": [ + 0 + ] + } + ] + }, + { + "clauses": [ + { + "type": "constant", + "prop_name": "meta_vars.stage", + "additional_path": "None", + "prop_dtype": "str", + "values": [ + "testing", + "init" + ] + } + ] + } + ] + } + }, + "num_positive_examples": 20, + "num_negative_examples": 1 + }, + "check_passed": false, + "triggered": true, + "detection_time": 18343039144178123, + "detection_time_percentage": 0.16245728748900484, + "trace": [ + { + "func_call_id": "3f7265b362c34725b412cf693ceea8f3_18343039144122325", + "thread_id": 140156043466560, + "process_id": 1263911, + "meta_vars.step": 1, + "type": "function_call (pre)", + "function": "torch.optim.optimizer.Optimizer.zero_grad", + "is_bound_method": true, + "obj_id": 140152527083248, + "args": { + "0": { + "torch.optim.adadelta.Adadelta": {} + } + }, + "kwargs": {}, + "time": 18343039144178123, + "return_values": NaN, + "var_name": NaN, + "var_type": NaN, + "mode": NaN, + "dump_loc": NaN, + "attributes._ML_DAIKON_data_ID": NaN, + "attributes.data": NaN, + "attributes.dtype": NaN, + "attributes.grad": NaN, + "attributes.grad_fn": NaN, + "attributes.is_cpu": NaN, + "attributes.is_cuda": NaN, + "attributes.is_ipu": NaN, + "attributes.is_leaf": NaN, + "attributes.is_meta": NaN, + "attributes.is_mkldnn": NaN, + "attributes.is_mps": NaN, + "attributes.is_mtia": NaN, + "attributes.is_nested": NaN, + "attributes.is_ort": NaN, + "attributes.is_quantized": NaN, + "attributes.is_sparse": NaN, + "attributes.is_sparse_csr": NaN, + "attributes.is_vulkan": NaN, + "attributes.is_xla": NaN, + "attributes.is_xpu": NaN, + "attributes.itemsize": NaN, + "attributes.name": NaN, + 
"attributes.nbytes": NaN, + "attributes.ndim": NaN, + "attributes.requires_grad": NaN, + "attributes.retains_grad": NaN, + "attributes.shape": NaN, + "attributes._ML_DAIKON_grad_ID": NaN, + "exception": NaN, + "exception_msg": NaN, + "proxy_obj_names": NaN + } + ] +} +``` + +The invariant specifies that `torch.optim.optimizer.Optimizer.zero_grad` (*the first invariant parameter*) invocations must change `.grad` from a non-zero value to `null` (*the second invariant parameter*), except during the very first iteration (*i.e. before any backward pass when no `.grad` exists, as per the invariant precondition*). We then inspect the trace record where the invariant is violated: `meta_vars.step` is 1, indicating detection occurred in the second training iteration. You can review the other results in the same way. + +The `NaN` values denote missing fields and can be safely ignored. + +
+</details>
+
+---
+
+🎉 You just used TrainCheck to catch a real-world silent bug before it impacted training!
\ No newline at end of file
diff --git a/docs/ae.md b/docs/ae.md
new file mode 100644
index 00000000..65fe3c0a
--- /dev/null
+++ b/docs/ae.md
@@ -0,0 +1,247 @@
+# TrainCheck Artifact Evaluation Guide
+
+Welcome to the artifact evaluation guide for **TrainCheck** (OSDI'25). This document outlines the procedures needed to reproduce our results and guides you through the key experiments presented in the paper.
+
+> **Note:** We may update both the main TrainCheck repository and the evaluation workloads repository during the evaluation period.
+> Please make sure to **pull the latest version** of each repository before proceeding.
+
+## ✅ Checklist
+
+- [ ] Environment set up (Python, dependencies, 2 CUDA GPUs with ≥ 12 GiB memory each)
+- [ ] Ran **[Silent Issue Detection](#eval-silent-issue-detection)** experiment
+- [ ] Ran **[Invariant Transferability](#eval-transferability)** evaluation
+- [ ] Ran **[False Positive Rate](#eval-false-positive-rate)** evaluation
+- [ ] Ran **[Performance Overhead](#eval-performance-overhead)** measurement
+- [ ] Verified outputs match expected results (tolerances noted per experiment)
+
+## 📎 Additional Resources
+
+In addition to this guide, you will need the following resources throughout the evaluation process:
+
+1. [**5-Minute Tutorial**](./5-min-tutorial.md) — A quick walkthrough that introduces TrainCheck’s workflow using a real-world bug.
+2. [**TrainCheck Installation Guide**](./installation-guide.md) — Step-by-step instructions for setting up TrainCheck.
+3. [**Technical Usage Guide**](./technical-doc.md) — Detailed documentation on how to use TrainCheck, configure instrumentation, and interpret outputs.
+4. [**Evaluation Workloads Repository**](https://github.com/OrderLab/TrainCheck-Evaluation-Workloads) — Contains all evaluation workloads used in the experiments.
+
+## 1. Overview
+
+**TrainCheck** is an invariant-based tool for detecting silent correctness issues in PyTorch training pipelines.
+
+This artifact enables reproduction of the four main evaluation results from the paper:
+
+- **[Silent Issue Detection (Section 5.1)](#eval-silent-issue-detection)**
+- **[Invariant Transferability (Section 5.3)](#eval-transferability)**
+- **[False Positive Rate (Section 5.4)](#eval-false-positive-rate)**
+- **[Performance Overhead (Section 5.5)](#eval-performance-overhead)**
+
+To get familiar with TrainCheck, we recommend starting with the [**5-Minute Tutorial**](./5-min-tutorial.md), which walks you through detecting a real-world bug from Section 5.1.
+
+### ⏱️ Recommended Evaluation Order
+
+We suggest running the evaluations in the following order, based on automation level and runtime requirements:
+1. Performance Overhead (~10 minutes)
+2. False Positive Rate (~1.5 hours)
+3. Transferability (~30 minutes)
+4. Silent Issue Detection (time varies; it should be possible to finish within a day)
+
+Before starting the evaluation, we encourage you to go through the [**5-minute tutorial**](./5-min-tutorial.md); it introduces TrainCheck's basic concepts, walks you through the core workflow, and makes you more familiar with our artifact.
+
+## 2. Environment Requirements
+
+For a full and efficient AE experience, we recommend the following setup:
+- 🖥 1 machine with 2× CUDA-enabled GPUs
+- Each GPU should have at least 12 GiB memory.
+- Compatible with CUDA 11.8 or 12.1
+- 🧠 32 GiB host memory (recommended)
+
+### 🔧 Recommended Hardware: Chameleon Cloud
+
+Most experiments require **2× CUDA-enabled GPUs** with support for **CUDA 11.8+**. While some workloads can run on GPUs with as little as 2 GiB memory, the main experiments (e.g., Section 5.1) benefit from higher-capacity GPUs.
+
+We recommend using the `compute_liqid` node type on [Chameleon Cloud](https://www.chameleoncloud.org):
+
+- ✅ `liqid01` and `liqid02`:
+  These nodes each have **2× A100 GPUs (40 GiB)** and allow you to reproduce **all results** in the paper.
+
+- 🆗 Other `compute_liqid` nodes with **1× A100 GPU**:
+  These are sufficient for all **single-GPU experiments** and let you reproduce **~90%** of results.
+
+Please consult the estimated runtimes in each evaluation section before making reservations.
+⏱️ If working full-time on the artifact, **2 days should be sufficient**, but we recommend reserving **at least 5 days** to allow for possible setup delays or debugging.
+
+### Software Notes
+
+1. If you’re using Chameleon instances:
+   - Please start your machine with an Ubuntu 22.04 image that includes recent GPU drivers.
+   - (TBD: we will provide the specific image ID or setup instructions once finalized.)
+
+2. Follow the [Installation Guide](./installation-guide.md) to install TrainCheck.
+
+⏭️ Once your environment is set up, we recommend starting with the [5-Minute Tutorial with TrainCheck](./5-min-tutorial.md).
+It will help you get familiar with the workflow and also verify that your installation is working correctly.
+
+## Eval: Silent Issue Detection
+
+## Eval: False Positive Rate
+
+⏳ Estimated Completion Time: 2 hours.
+- Trace Collection: ~10 minutes
+- Invariant Inference & Checking: ~1.5 hours
+
+### 🎯 Goal
+
+This evaluation measures the false positive rate of the alarms raised by TrainCheck's invariants.
+
+### 📂 Resources & Scripts
+
+- Automation Scripts:
+  1. `traincheck-ae-resources/fp_rate/ae_fp.py`
+- Workloads:
+  1. The official PyTorch pipelines used in this evaluation have been prepared at `traincheck-ae-resources/fp_rate/workloads`. We have shortened training for these scripts so that the experiments finish quickly. For AE purposes, you won't need to know their details; the automation script handles running them for you.
+
+### 🛠 How to Run
+
+0. Make sure you have a working TrainCheck installation by following the [TrainCheck Installation Guide](./installation-guide.md).
+
+1. Install the necessary dependencies for the false positive evaluation workloads.
+```bash
+conda activate traincheck # change this if you installed TrainCheck in a different environment.
+cd fp_rate
+pip3 install -r requirements.txt
+```
+
+2. Execute `ae_fp.py`.
+The workload `ddp-multigpu` needs 2 GPUs. We have provided a pre-collected trace for `ddp-multigpu` in case you do not have two GPUs.
+
+If you need to use our pre-computed trace for `ddp-multigpu`, run the command without the `--overwrite-existing-results` argument:
+```bash
+python3 ae_fp.py --bench workloads
+```
+
+Or, if you have a machine with 2 GPUs, execute the command below so that the original results are recomputed:
+```bash
+python3 ae_fp.py --bench workloads --overwrite-existing-results
+```
+
+### What to Expect During Execution
+
+The script is long-running. It performs three tasks:
+1. It collects traces for all the workloads.
+2. It infers invariants for the three setups in Section 5.4.
+3. It checks the inferred invariants on the validation workloads.
+
+The experiments might fail due to environment installation issues or interruptions. If you run into problems, please refer to [⚠️ Notes & Troubleshooting](#️-notes--troubleshooting).
+
+### ⚠️ Notes & Troubleshooting
+
+The script automatically detects errors in (1) trace collection, (2) inference tasks, and (3) checking tasks. If you encounter trace collection issues, please check for missing environment dependencies.
+
+If you encounter issues in the invariant inference or invariant checking tasks, please rerun the experiment with `--overwrite-existing-results`, or delete all `trace_*` folders except `trace_ddp-multigpu` and rerun.
+
+If you see persistent issues, it is likely an environment issue or a software bug. Please contact us for help.
+
+### How to verify the results?
+
+The `ae_fp.py` script generates a file called `fp.csv` in the current directory that looks like this:
+
+```csv
+setup,fp_rate
+1-input,0.3105
+4-input,0.1127
+6-input,0.1066
+```
+
+These values correspond to the results reported in Section 5.4 of the paper.
+You should verify that the false positive rates are similar (within a 5% margin of error) or lower.
+
+## Eval: Transferability
+
+⏳ Estimated Completion Time: TBD.
+- Trace Collection: x hours
+- Invariant Inference: x hours
+- Invariant Checking: x hours
+
+### 🎯 Goal
+
+This evaluation measures the transferability of invariants inferred by TrainCheck.
+
+## Eval: Performance Overhead
+
+⏳ Estimated Completion Time: 1.5 hours.
+
+### 🎯 Goal
+
+This evaluation measures the runtime overhead introduced by TrainCheck’s instrumentation during the invariant checking stage, compared to uninstrumented runs, across a set of representative ML workloads. The results correspond to Section 5.5 of the paper.
+
+### 📂 Resources & Scripts
+
+- Automation Scripts:
+  - `eval_scripts/perf_benchmark/run_all.xsh`: run the experiments and collect data.
+  - `eval_scripts/perf_benchmark/analysis.xsh`: analyze raw data and produce input for the plot scripts.
+  - `eval_scripts/perf_benchmark/plot_e2e.py` and `eval_scripts/perf_benchmark/plot_micro.py`: plot the figures in Section 5.5.
+
+- Workloads (you probably won't need to touch these):
+  - Located in [overhead-e2e](../eval_scripts/perf_benchmark/overhead-e2e) and [overhead-micro](../eval_scripts/perf_benchmark/overhead-micro)
+  - No pre-collected data is required; this evaluation runs end-to-end automatically and is fairly lightweight.
+
+- Deployed 100 invariants:
+  [eval_scripts/perf_benchmark/overhead-e2e/sampled_100_invariants.json](../eval_scripts/perf_benchmark/overhead-e2e/sampled_100_invariants.json)
+
+### 🛠 How to Run
+
+1. Navigate to the performance benchmark directory:
+   ```bash
+   cd eval_scripts/perf_benchmark/
+   ```
+
+2. Run the full benchmark suite:
+   ```bash
+   xonsh run_all.xsh
+   ```
+This script will:
+- Execute each workload in three modes:
+  - No instrumentation
+  - TrainCheck selective instrumentation with 100 invariants deployed
+  - Python settrace baseline (a lightweight instrumentation baseline)
+- Measure per-iteration training time.
+- Save raw results in a folder named: `perf_eval_res_`
+
+Then execute the commands below to analyze the data and produce the plots.
+```bash +xonsh analysis.xsh --res_folder perf_eval_res_ + +python3 plot_e2e.py -o perf_eval_res_/macro.pdf -i perf_eval_res_/overhead_e2e.csv -t + +python3 plot_micro.py -o perf_eval_res_/micro.pdf -i perf_eval_res_/wrapper_overhead_micro.csv -t +``` + +### Expected Output +Key files in `perf_eval_res_`: +- `overhead_e2e.csv` and `marco.pdf` data and plot for benchmarks presented in Section 5.5. +- `wrapper_overhead_micro.csv` and `micro.pdf`: data and plot for the pure wrapper overhead on individual APIs. + +### ✅ How to Verify + • Check that the overhead percentages in overhead_results.csv are consistent with those reported in Section 5.5. + • Variations (within ±15% TODO confirm) are expected due to runtime and hardware differences. + + +### ⚠️ Notes & Troubleshooting +1. **Do Not Run Other GPU Tasks in Parallel** + + For stable performance measurements, the evaluation scripts will periodically terminate all CUDA processes to ensure a clean environment. + Please avoid running any other GPU workloads during this evaluation. + +2. **Handling Failed Workloads** + + If an end-to-end workload fails: + - Navigate to the corresponding workload folder. + - Manually rerun it using: + ```bash + traincheck-collect --use-config --config md-config-var.yml -i ../sampled_100_invariants.json + ``` + - If the issue does not reproduce consistently, simply delete the result folder and rerun the full benchmark. + - If the failure is consistent, please contact us for support. diff --git a/docs/assets/code/84911.py b/docs/assets/code/84911.py new file mode 100644 index 00000000..6c52cd04 --- /dev/null +++ b/docs/assets/code/84911.py @@ -0,0 +1,209 @@ +import json +import os +import random + +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision.transforms as transforms +from efficientnet_pytorch import EfficientNet +from PIL import ImageFile +from torchvision import datasets +from tqdm import tqdm + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + +# Deterministic Behaviour +seed = 786 +os.environ["PYTHONHASHSEED"] = str(seed) +## Torch RNG +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +torch.cuda.manual_seed_all(seed) +## Python RNG +np.random.seed(seed) +random.seed(seed) + +## CuDNN determinsim +torch.backends.cudnn.deterministic = True +torch.backends.cudnn.benchmark = False + + +### TODO: Write data loaders for training, validation, and test sets +## Specify appropriate transforms, and batch_sizes +data_transform = { + "train": transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.RandomHorizontalFlip(p=0.5), + transforms.RandomVerticalFlip(p=0.5), + transforms.RandomRotation(30), + transforms.ColorJitter(brightness=0, contrast=0.5, saturation=0.5, hue=0.5), + transforms.ToTensor(), + transforms.Normalize([0.2829, 0.2034, 0.1512], [0.2577, 0.1834, 0.1411]), + ] + ), + "valid": transforms.Compose( + [ + transforms.Resize((224, 224)), + transforms.ToTensor(), + transforms.Normalize([0.2829, 0.2034, 0.1512], [0.2577, 0.1834, 0.1411]), + ] + ), +} + + +dir_file = "dataset" +train_dir = os.path.join(dir_file, "train") +valid_dir = os.path.join(dir_file, "dev") + +train_set = datasets.CIFAR100( + root="./data", train=True, download=True, transform=data_transform["train"] +) +valid_set = datasets.CIFAR100( + root="./data", train=False, download=True, transform=data_transform["valid"] +) + +batch_size = 64 +train_loader = torch.utils.data.DataLoader( + train_set, batch_size=batch_size, pin_memory=False, num_workers=0, shuffle=False +) 
+valid_loader = torch.utils.data.DataLoader( + valid_set, batch_size=1, pin_memory=False, num_workers=0, shuffle=False +) + +data_transfer = {"train": train_loader, "valid": valid_loader} + + +# %% + +model_transfer = EfficientNet.from_pretrained("efficientnet-b0") +n_inputs = model_transfer._fc.in_features + +num_classes = 100 +model_transfer._fc = nn.Linear(n_inputs, num_classes) + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +criterion_transfer = nn.CrossEntropyLoss() +optimizer_transfer = optim.Adadelta(model_transfer._fc.parameters(), lr=1) +model_transfer.to(device) + +for name, param in model_transfer.named_parameters(): + if "bn" not in name: + param.requires_grad = False + +for param in model_transfer._fc.parameters(): + param.requires_grad = False + +print(model_transfer._fc.in_features) + + +use_cuda = torch.cuda.is_available() + + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path): + """returns trained model""" + # initialize tracker for minimum validation loss + _tc_stats = { # collecting stats for TrainCheck + "granularity": "epoch", + "train_loss": [], + "valid_loss": [], + "valid_acc": [], + } + + valid_loss_min = np.inf + for epoch in tqdm(range(1, n_epochs + 1), desc="Epochs"): + # initialize variables to monitor training and validation loss + train_loss = 0.0 + valid_loss = 0.0 + correct = 0.0 + total = 0.0 + accuracy = 0.0 + + model.train() + for batch_idx, (data, target) in enumerate( + tqdm(loaders["train"], desc="Training") + ): + # move to GPU + if use_cuda: + data, target = data.to("cuda", non_blocking=True), target.to( + "cuda", non_blocking=True + ) + optimizer.zero_grad() + output = model(data) + loss = criterion(output, target) + loss.backward() + optimizer.step() + train_loss += (1 / (batch_idx + 1)) * (float(loss) - train_loss) + if batch_idx == 10: + break + + ###################### + # validate the model # + ###################### + model.eval() + for batch_idx, (data, target) in enumerate( + tqdm(loaders["valid"], desc="Validation") + ): + # move to GPU + if use_cuda: + data, target = data.cuda(), target.cuda() + ## update the average validation loss + output = model(data) + loss = criterion(output, target) + valid_loss += (1 / (batch_idx + 1)) * (float(loss) - valid_loss) + del loss + pred = output.data.max(1, keepdim=True)[1] + correct += np.sum( + np.squeeze(pred.eq(target.data.view_as(pred))).cpu().numpy() + ) + total += data.size(0) + + if batch_idx == 5: + break + + # print training/validation statistics + print( + "Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}".format( + epoch, train_loss, valid_loss + ) + ) + + accuracy = 100.0 * (correct / total) + print("\nValid Accuracy: %2d%% (%2d/%2d)" % (accuracy, correct, total)) + + _tc_stats["train_loss"].append(train_loss) + _tc_stats["valid_loss"].append(valid_loss) + _tc_stats["valid_acc"].append(accuracy) + + ## TODO: save the model if validation loss has decreased + if valid_loss <= valid_loss_min: + print( + "Validation loss decreased ({:.6f} --> {:.6f}). 
Saving model ...".format( + valid_loss_min, valid_loss + ) + ) + torch.save(model.state_dict(), "case_3_model.pt") + valid_loss_min = valid_loss + + # save the stats + with open("result_stats.json", "w") as f: + json.dump(_tc_stats, f, indent=4) + + return model + + +model_transfer = train( + 2, + data_transfer, + model_transfer, + optimizer_transfer, + criterion_transfer, + use_cuda, + "model_transfer.pt", +) diff --git a/docs/assets/code/mnist.py b/docs/assets/code/mnist.py new file mode 100644 index 00000000..58c0254a --- /dev/null +++ b/docs/assets/code/mnist.py @@ -0,0 +1,224 @@ +import argparse + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms + +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars + +meta_vars["step"] = -1 + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +def train(args, model, device, train_loader, optimizer, epoch): + annotate_stage("training") # ML_DAIKON: stage annotation + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + meta_vars["step"] += 1 + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + if args.dry_run: + break + + # ML_DAIKON: break after 100 batches + if batch_idx == 50: + break + + +def test(model, device, test_loader): + annotate_stage("testing") # ML_DAIKON: stage annotation + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + data_idx = 0 + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + data_idx += 1 + # ML_DAIKON: break after 10 batches + if data_idx == 10: + break + + test_loss /= len(test_loader.dataset) + + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=2, 
+ metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) + args = parser.parse_args() + + annotate_stage("init") # ML_DAIKON: stage annotation + use_cuda = not args.no_cuda and torch.cuda.is_available() + use_mps = not args.no_mps and torch.backends.mps.is_available() + + torch.manual_seed(args.seed) + + if use_cuda: + device = torch.device("cuda") + elif use_mps: + device = torch.device("mps") + else: + device = torch.device("cpu") + + train_kwargs = {"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} + if use_cuda: + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} + # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants + # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform) + dataset2 = datasets.MNIST("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) + test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) + + model = Net().to(device) + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(model, device, test_loader) + + annotate_stage("training") # ML_DAIKON: stage annotation + scheduler.step() + + if args.save_model: + annotate_stage("checkpointing") # ML_DAIKON: stage annotation + torch.save(model.state_dict(), "mnist_cnn.pt") + + +if __name__ == "__main__": + main() diff --git a/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/README.md b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/README.md new file mode 100644 index 00000000..64a6a4f7 --- /dev/null +++ b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/README.md @@ -0,0 +1,7 @@ +Language model pretraining script from the official examples of the transformers library. +Trains GPT-2 on + +Modifications: +1. 10 steps per training/testing epoch. +2. stage annotations +3. 
skip instrumentation for the tokenization step \ No newline at end of file diff --git a/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/config.yml b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/config.yml new file mode 100644 index 00000000..628b21b3 --- /dev/null +++ b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/config.yml @@ -0,0 +1,10 @@ +modules_to_instr: +- torch +- transformers +- accelerate +pyscript: run_clm_no_trainer.py +shscript: run.sh +copy_all_files: true +models_to_track: +- model +model_tracker_style: proxy \ No newline at end of file diff --git a/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/run.sh b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/run.sh new file mode 100644 index 00000000..de707967 --- /dev/null +++ b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/run.sh @@ -0,0 +1,8 @@ +python run_clm_no_trainer.py \ + --dataset_name wikitext \ + --dataset_config_name wikitext-2-raw-v1 \ + --model_name_or_path distilbert/distilgpt2 \ + --output_dir /tmp/test-clm \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 2 \ + --num_train_epochs 1 \ diff --git a/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/run_clm_no_trainer.py b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/run_clm_no_trainer.py new file mode 100755 index 00000000..9ecd841d --- /dev/null +++ b/docs/assets/examples/traincheck-collect/gpt2-pretrain-config/run_clm_no_trainer.py @@ -0,0 +1,855 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) +on a text file or a dataset without using HuggingFace Trainer. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 
+ +import argparse +import json +import logging +import math +import os +import random +from itertools import chain +from pathlib import Path + +import datasets +import torch +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import HfApi +from torch.utils.data import DataLoader +from tqdm.auto import tqdm +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + SchedulerType, + default_data_collator, + get_scheduler, +) +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage + +annotate_stage("init") + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.45.0") + +logger = get_logger(__name__) + +require_version( + "datasets>=2.14.0", + "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt", +) + +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a causal language modeling task" + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--train_file", + type=str, + default=None, + help="A csv, txt or a json file containing the training data.", + ) + parser.add_argument( + "--validation_file", + type=str, + default=None, + help="A csv, txt or a json file containing the validation data.", + ) + parser.add_argument( + "--validation_split_percentage", + default=5, + help="The percentage of the train set used as validation set in case there's no validation split", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=False, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--weight_decay", type=float, default=0.0, help="Weight decay to use." 
+ ) + parser.add_argument( + "--num_train_epochs", + type=int, + default=3, + help="Total number of training epochs to perform.", + ) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.", + ) + parser.add_argument( + "--output_dir", type=str, default=None, help="Where to store the final model." + ) + parser.add_argument( + "--seed", type=int, default=None, help="A seed for reproducible training." + ) + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + parser.add_argument( + "--block_size", + type=int, + default=None, + help=( + "Optional input sequence length after tokenization. The training dataset will be truncated in block of" + " this size for training. Default to the model max input length for single sentence inputs (take into" + " account special tokens)." + ), + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--overwrite_cache", + action="store_true", + help="Overwrite the cached training and evaluation sets", + ) + parser.add_argument( + "--no_keep_linebreaks", + action="store_true", + help="Do not keep line breaks when using TXT files.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the model to the Hub.", + ) + parser.add_argument( + "--hub_model_id", + type=str, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--hub_token", type=str, help="The token to use to push to the Model Hub." + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ), + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations. ' + "Only applicable when `--with_tracking` is passed." 
+ ), + ) + parser.add_argument( + "--low_cpu_mem_usage", + action="store_true", + help=( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "If passed, LLM loading time and RAM consumption will be benefited." + ), + ) + args = parser.parse_args() + + # Sanity checks + if ( + args.dataset_name is None + and args.train_file is None + and args.validation_file is None + ): + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + if extension not in ["csv", "json", "txt"]: + raise ValueError("`train_file` should be a csv, json or txt file.") + if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + if extension not in ["csv", "json", "txt"]: + raise ValueError("`validation_file` should be a csv, json or txt file.") + + if args.push_to_hub: + if args.output_dir is None: + raise ValueError( + "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + ) + + return args + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["project_dir"] = args.output_dir + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + **accelerator_log_kwargs, + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. 
+ if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + # Retrieve of infer repo_name + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + # Create repo and retrieve repo_id + api = HfApi() + repo_id = api.create_repo( + repo_name, exist_ok=True, token=args.hub_token + ).repo_id + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + trust_remote_code=args.trust_remote_code, + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + trust_remote_code=args.trust_remote_code, + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + trust_remote_code=args.trust_remote_code, + ) + else: + data_files = {} + dataset_args = {} + if args.train_file is not None: + data_files["train"] = args.train_file + extension = args.train_file.split(".")[-1] + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.validation_file.split(".")[-1] + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ if args.config_name: + config = AutoConfig.from_pretrained( + args.config_name, + trust_remote_code=args.trust_remote_code, + ) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained( + args.model_name_or_path, + trust_remote_code=args.trust_remote_code, + ) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, + use_fast=not args.use_slow_tokenizer, + trust_remote_code=args.trust_remote_code, + ) + elif args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, + use_fast=not args.use_slow_tokenizer, + trust_remote_code=args.trust_remote_code, + ) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + low_cpu_mem_usage=args.low_cpu_mem_usage, + trust_remote_code=args.trust_remote_code, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForCausalLM.from_config( + config, trust_remote_code=args.trust_remote_code + ) + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + column_names = raw_datasets["train"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + def tokenize_function(examples): + return tokenizer(examples[text_column_name]) + + md_tracer.DISABLE_WRAPPER = True + with accelerator.main_process_first(): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + md_tracer.DISABLE_WRAPPER = False + + if args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > config.max_position_embeddings: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, config.max_position_embeddings)} instead. You can change that default value by passing --block_size xxx." + ) + block_size = min(1024, config.max_position_embeddings) + else: + if args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({args.block_size}) is larger than the maximum length for the model " + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. 
+ concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/process#map + + with accelerator.main_process_first(): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=args.preprocessing_num_workers, + load_from_cache_file=not args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + + train_dataset = lm_datasets["train"] + eval_dataset = lm_datasets["validation"] + + # Log a few random samples from the training set: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + collate_fn=default_data_collator, + batch_size=args.per_device_train_batch_size, + ) + eval_dataloader = DataLoader( + eval_dataset, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size, + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args.weight_decay, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * accelerator.num_processes, + num_training_steps=( + args.max_train_steps + if overrode_max_train_steps + else args.max_train_steps * accelerator.num_processes + ), + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = ( + accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + ) + + # # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. 
+ # if accelerator.distributed_type == DistributedType.TPU: + # model.tie_weights() + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config[ + "lr_scheduler_type" + ].value + accelerator.init_trackers("clm_no_trainer", experiment_config) + + # Train! + total_batch_size = ( + args.per_device_train_batch_size + * accelerator.num_processes + * args.gradient_accumulation_steps + ) + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info( + f" Instantaneous batch size per device = {args.per_device_train_batch_size}" + ) + logger.info( + f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" + ) + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm( + range(args.max_train_steps), disable=not accelerator.is_local_main_process + ) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + checkpoint_path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[ + -1 + ] # Sorts folders by date modified, most recent checkpoint is the last + checkpoint_path = path + path = os.path.basename(checkpoint_path) + + accelerator.print(f"Resumed from checkpoint: {checkpoint_path}") + accelerator.load_state(checkpoint_path) + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = ( + int(training_difference.replace("step_", "")) + * args.gradient_accumulation_steps + ) + starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps + resume_step -= starting_epoch * len(train_dataloader) + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) + + for epoch in range(starting_epoch, args.num_train_epochs): + annotate_stage("training") + model.train() + if args.with_tracking: + total_loss = 0 + if ( + args.resume_from_checkpoint + and epoch == starting_epoch + and resume_step is not None + ): + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches( + train_dataloader, resume_step + ) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if ( + completed_steps % checkpointing_steps == 0 + and accelerator.sync_gradients + ): + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: + break + if step == 10: + break + + annotate_stage("testing") + model.eval() + losses = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + outputs = model(**batch) + + loss = outputs.loss + losses.append( + accelerator.gather_for_metrics( + loss.repeat(args.per_device_eval_batch_size) + ) + ) + if step == 10: + break + + losses = torch.cat(losses) + try: + eval_loss = torch.mean(losses) + perplexity = math.exp(eval_loss) + except OverflowError: + perplexity = float("inf") + + logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}") + + if args.with_tracking: + 
accelerator.log( + { + "perplexity": perplexity, + "eval_loss": eval_loss, + "train_loss": total_loss.item() / len(train_dataloader), + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + annotate_stage("checkpointing") + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + api.upload_folder( + commit_message=f"Training in progress epoch {epoch}", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + if args.push_to_hub: + api.upload_folder( + commit_message="End of training", + folder_path=args.output_dir, + repo_id=repo_id, + repo_type="model", + token=args.hub_token, + ) + with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: + json.dump({"perplexity": perplexity}, f) + + +if __name__ == "__main__": + main() diff --git a/docs/assets/examples/traincheck-collect/mnist-config/config.yml b/docs/assets/examples/traincheck-collect/mnist-config/config.yml new file mode 100644 index 00000000..34119a94 --- /dev/null +++ b/docs/assets/examples/traincheck-collect/mnist-config/config.yml @@ -0,0 +1,10 @@ +modules_to_instr: # Libraries to instrument. Defaults to ['torch'] if not specified. + - torch +pyscript: mnist.py # The Python entry point of your training program. +shscript: run.sh # [Optional] Shell script to launch the program with custom arguments or environment setup. +models_to_track: # [Optional] List of variable names for models you want to track. If omitted, model tracking is disabled. + - model +model_tracker_style: proxy # [Optional] Method for model tracking. Choose between "proxy" (default) or "sampler". +copy_all_files: false # [Optional] Set to true if your code uses relative paths (e.g., loading local datasets or configs). + # This ensures TrainCheck copies the entire working directory before execution. + # Note: TrainCheck automatically handles PYTHONPATH. Default is false. 
\ No newline at end of file diff --git a/docs/assets/examples/traincheck-collect/mnist-config/mnist.py b/docs/assets/examples/traincheck-collect/mnist-config/mnist.py new file mode 100644 index 00000000..58c0254a --- /dev/null +++ b/docs/assets/examples/traincheck-collect/mnist-config/mnist.py @@ -0,0 +1,224 @@ +import argparse + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms + +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars + +meta_vars["step"] = -1 + + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +def train(args, model, device, train_loader, optimizer, epoch): + annotate_stage("training") # ML_DAIKON: stage annotation + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + meta_vars["step"] += 1 + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + if args.dry_run: + break + + # ML_DAIKON: break after 100 batches + if batch_idx == 50: + break + + +def test(model, device, test_loader): + annotate_stage("testing") # ML_DAIKON: stage annotation + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + data_idx = 0 + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + data_idx += 1 + # ML_DAIKON: break after 10 batches + if data_idx == 10: + break + + test_loss /= len(test_loader.dataset) + + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) + + +def main(): + # Training settings + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=2, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + 
help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) + args = parser.parse_args() + + annotate_stage("init") # ML_DAIKON: stage annotation + use_cuda = not args.no_cuda and torch.cuda.is_available() + use_mps = not args.no_mps and torch.backends.mps.is_available() + + torch.manual_seed(args.seed) + + if use_cuda: + device = torch.device("cuda") + elif use_mps: + device = torch.device("mps") + else: + device = torch.device("cpu") + + train_kwargs = {"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} + if use_cuda: + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} + # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants + # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} + train_kwargs.update(cuda_kwargs) + test_kwargs.update(cuda_kwargs) + + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform) + dataset2 = datasets.MNIST("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) + test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) + + model = Net().to(device) + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + + scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma) + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(model, device, test_loader) + + annotate_stage("training") # ML_DAIKON: stage annotation + scheduler.step() + + if args.save_model: + annotate_stage("checkpointing") # ML_DAIKON: stage annotation + torch.save(model.state_dict(), "mnist_cnn.pt") + + +if __name__ == "__main__": + main() diff --git a/docs/assets/examples/traincheck-collect/mnist-config/run.sh b/docs/assets/examples/traincheck-collect/mnist-config/run.sh new file mode 100644 index 00000000..57d2738f --- /dev/null +++ b/docs/assets/examples/traincheck-collect/mnist-config/run.sh @@ -0,0 +1 @@ +python3 mnist.py --epochs 3 \ No newline at end of file diff --git a/docs/assets/images/workflow.png b/docs/assets/images/workflow.png new file mode 100644 index 00000000..df239148 Binary files /dev/null and b/docs/assets/images/workflow.png differ diff --git a/docs/check.md b/docs/check.md new file mode 100644 index 00000000..f2d418c3 --- /dev/null +++ b/docs/check.md @@ -0,0 +1,35 @@ +# TrainCheck Checker Usage Guide + +`traincheck-check` is the **final stage** of the TrainCheck workflow. It verifies a set of invariants against trace files or streams from target programs, reporting any detected violations—helping you catch silent issues in your ML training pipelines. 
+ +## 🔧 Current Status + +`traincheck-check` is designed to support two modes: + +- **Offline Checking**: + Perform invariant checking on completed trace files after the training job finishes. ✅ *[Fully Supported]* + +- **Online Checking**: + Perform real-time checking while the target training job is running. 🚧 *[In Development]* + +At present, only **offline checking** is available. Support for online mode is actively being developed. + +## How to Use: Offline Checking + +Run the following command: + +```bash +traincheck-check -f -i +``` + +- `-f `: Path to the folder containing traces collected by `traincheck-collect`. +- `-i `: Path to the JSON file containing inferred invariants. + +For details on result format and interpretation, refer to [5. Detection & Diagnosis)](./5-min-tutorial.md#5-detection--diagnosis) in the **5-Minute Tutorial**. + +## How to Use: Online Checking + +**🚧 Coming Soon** +Support for real-time, online checking is under construction. This mode will allow TrainCheck to monitor running training jobs and surface invariant violations as they happen. + +Stay tuned for updates in future releases. diff --git a/docs/infer.md b/docs/infer.md new file mode 100644 index 00000000..9aebdaa7 --- /dev/null +++ b/docs/infer.md @@ -0,0 +1,208 @@ +# Invariant Inference & Representation + +`traincheck-infer` is part of the **inference stage** of the TrainCheck workflow. It consumes trace files collected from correct training runs and infers behavioral invariants that describe expected runtime behavior. These invariants are later used by `traincheck-check` to detect violations in other training pipelines. + +## 📚 Table of Contents +- [🔧 Basic Usage](#-basic-usage) +- [⚙️ Advanced Usage](#️-advanced-usage) +- [📘 Invariant Concepts](#-invariant-concepts) +- [🧪 Guidelines: Choosing Input Pipelines](#-practical-guidelines-choosing-input-pipelines) +- [🧠 Tips: Performance and Stability](#-tips-performance-and-stability) +- [🔗 Next Step](TODO) + +## 🔧 Basic Usage + +In most cases, you only need to specify one or more folders (generated by `traincheck-collect`) containing trace files using the `-f` or `--trace-folders` flag: + +```bash +traincheck-infer -f ./traincheck_mnist_trace ./traincheck_84911_trace .. +``` + +You can provide multiple folders to aggregate traces from different correct runs or programs. This helps TrainCheck generalize better and avoid overfitting to any single pipeline, reducing false positives during checking—especially when the inferred invariants are applied to unrelated or structurally different pipelines. + +This command will infer invariants from all trace folders provided, and output invariants into `invariants.json`. + +## ⚙️ Advanced Usage +traincheck-infer provides additional flags for customization and debugging. Some concepts such as "relation" will be explained later. + +1. `-o, --output`: Specify a custom file name for the invariants. +2. `--disable-relation` / `--enable-relation`: Control which types of invariants to infer. This is useful for reducing noise or targeting specific checks. + ```bash + # Disable ordering-based invariants + traincheck-infer -f ./traces --disable-relation FunctionLeadRelation FunctionCoverRelation + + # Enable only contain and variable consistency invariants + traincheck-infer -f ./traces --enable-relation APIContainRelation ConsistencyRelation + ``` + > See [traincheck.invariant.relation_pool](../traincheck/invariant/relation_pool.py) for a complete list of invariants. +3. 
`-b, --backend`: Select the data processing engine for trace handling. + - `pandas` (default): stable and well-tested. + - `polars`: faster for large traces (experimental) + - `dict`: pure Python dictionary backend (experimental) + +> Other flags (e.g. `--debug`, `-t --traces`) are available via traincheck-infer --help, but are rarely needed unless you are debugging or developing TrainCheck itself. + + +## 📘 Invariant Concepts + +TrainCheck infers **invariants** — logical properties that are consistently held during correct training runs. These invariants are used to define the *expected* behavior of a training pipeline, and later help detect silent issues when applied to other runs. + +Each invariant describes a specific pattern of behavior observed in the trace, such as: +- Attribute changes during a function call (e.g., `.grad` becomes `None` in `zero_grad()`) +- Ordering relationships between API calls (e.g., `zero_grad()` should occur before `step()`) +- Consistency among values across different parameters (e.g., shared parameters should have the same value across devices during distributed training) + +### Invariant Representation + +An invariant is defined by three things: +1. **relation**: the relationship this invariant encodes, can be viewed as an invariant template. Each relation has a separate inference algorithm defined (e.g., [ConsistencyRelation.infer](../traincheck/invariant/consistency_relation.py)) +2. **params**: descriptors for entities that should obey the relationship. +3. **precondition**: a logical predicate defining the context when an invariant can be applied. + +In the actual json representation of invariants in the `traincheck-infer` output, an invariant looks like this. + +```json +{ + "text_description": "torch.optim.optimizer.Optimizer.zero_grad contains VarChangeEvent torch.nn.Parameter, pre_value: non_zero, post_value: None", + "relation": "APIContainRelation", + "params": [ + { + "param_type": "APIParam", + "api_full_name": "torch.optim.optimizer.Optimizer.zero_grad" + }, + { + "param_type": "VarTypeParam", + "var_type": "torch.nn.Parameter", + "attr_name": "grad", + "pre_value": "non_zero", + "post_value": null + } + ], + "precondition": { + "parent_func_call_pre": { + "inverted": true, + "preconditions": [ + { + "clauses": [ + { + "type": "constant", + "prop_name": "meta_vars.step", + "additional_path": "None", + "prop_dtype": "int", + "values": [ + 0 + ] + } + ] + }, + { + "clauses": [ + { + "type": "constant", + "prop_name": "meta_vars.stage", + "additional_path": "None", + "prop_dtype": "str", + "values": [ + "init", + "testing" + ] + } + ] + } + ] + } + }, + "num_positive_examples": 200, + "num_negative_examples": 1 +} +``` + +This invariant encodes the expectation that calling torch.optim.optimizer.Optimizer.zero_grad() should reset gradients — that is, the .grad attribute of torch.nn.Parameter objects should transition from a non-zero value to null (i.e., None or missing). +- **text_description:** + + A human-readable summary of the invariant. + > Note: This field is generated using a best-effort strategy and may not fully reflect the invariant’s semantics. In some cases, it may be missing or incomplete. 📆 We are planning to further formalize this field in the future. + +- **relation: "APIContainRelation"** + + An event is expected to happen within the duration of an API invocation. 
+ +- **params:** + - An API call: `zero_grad()` on a PyTorch optimizer + - An attribute: `.grad` on a `torch.nn.Parameter`, which should change from a non-zero value (`"pre_value": "non_zero"`) to null (`"post_value": null`) during the call + +- **precondition:** + This invariant only applies **outside** the following contexts: + - The first step of training (`meta_vars.step == 0`) + - The init or testing stages (`meta_vars.stage in {"init", "testing"}`) + > These are specified as inverted preconditions, meaning the invariant does not apply during those times (e.g., it’s okay to not clear .grad on the first step when nothing has been backpropagated yet). + +- **num_positive_examples: 20** + This behavior was observed and confirmed 200 times in the reference traces. + +- **num_negative_examples: 1** + The invariant failed once — in this case, during the first training iteration, when .grad had not yet been populated before the zero_grad() call. + > **🎯 This behavior is expected and correctly handled by the precondition, which excludes step 0.** + +### Invariant Inference Workflow + +At a high level, TrainCheck performs invariant inference in three stages: + +1. Hypothesis Generation + + For each supported relation type, TrainCheck scans the provided traces and generates hypotheses by identifying patterns where a potential invariant could exist (i.e., when matching examples are observed). + +2. Example Collection + + For every hypothesis, TrainCheck performs a full scan across all provided traces to gather positive examples (where the hypothesized invariant holds) and negative examples (where it does not). + +3. Precondition Deduction + + TrainCheck analyzes the collected examples to infer a distinguishing predicate—a logical condition that holds true for all positive examples and false for negative ones. This predicate becomes the invariant’s precondition, reducing false positives during checking. + +⚙️ For full details on the inference algorithms, please refer to our OSDI’25 paper (documentation is in progress). + +## 🧪 Practical Guidelines: Choosing Input Pipelines + +When selecting input pipelines for invariant inference, there are two main considerations: + +1. **Representativeness** + + You want your input pipelines to be diverse enough to infer a representative set of invariants. This helps: + - Avoid overfitting to specific patterns. + - Ensure that inferred invariants and preconditions remain accurate across varying scenarios. + + For example, if none of your input pipelines use mixed precision, TrainCheck might infer invariants like: + + > "For mathematical operations, the output dtype must equal the input dtype." + + However, if mixed precision pipelines are included, TrainCheck will refine such invariants by adding preconditions like: + > "This applies only when a torch.autocast context manager is not active." + + **⚡ How many pipelines should you include?** It depends on how different your target pipeline is from available reference pipelines: + - If the target is a minor variant of a known-good pipeline, using just that reference may suffice. + - If the target pipeline introduces new frameworks, tasks, or architectures, include a broader set of inputs to improve generalization. + + +2. **Inference Time** + + Inference time is generally not a major concern, since inference happens offline. However, due to the repetitive nature of training loops, you can safely shorten reference runs without sacrificing invariant quality. 
+ + In practice: + - For all bugs detected by TrainCheck so far, we limited inference traces to at most 100 iterations. + - Shortened runs have shown no significant impact on the usefulness or accuracy of inferred invariants. + +### Core Principles – A Summary +- Focus on the diversity of input traces — capturing different configurations, behaviors, or modes of operation. +- The length or size of traces matters far less. +- Efficient inference is achievable with short, representative runs. + +## Implementation Limitations + +TrainCheck operates on large traces with a dynamic schema, where variable types and fields can change over time. This, combined with the need for cross-trace comparisons, limits the use of typical data storage solutions like SQL databases or optimized DataFrame libraries (e.g., Polars), which require fixed schemas. + +To handle this, we use in-process Pandas DataFrames backed by NumPy. While effective, this approach is currently single-threaded due to Python’s GIL, leaving room for future performance improvements. + +We are exploring options such as shared-memory DataFrames, schema standardization, or schemaless databases (e.g., MongoDB) if data transmission overhead proves manageable. + +> Note: While data sharding could improve parallelism, it would overcomplicate cross-trace and cross-time analysis and is better handled at the storage layer rather than within inference logic. \ No newline at end of file diff --git a/docs/installation-guide.md b/docs/installation-guide.md new file mode 100644 index 00000000..9772f61e --- /dev/null +++ b/docs/installation-guide.md @@ -0,0 +1,71 @@ +## Compatibility + +- **Python**: 3.10+ (due to reliance on type annotations) +- **PyTorch**: 1.7.0–2.5.0 (other versions have not been tested.) +- **CUDA**: 11.2–12.1 (also supports MPS on macOS; see Performance note below) +- **Operating Systems**: Ubuntu 20.04+, macOS. Windows is untested but may work—please file an issue if you hit a problem. + +> **Performance note:** +> On non‑CUDA backends (e.g., MPS), runtime overhead can vary due to differences in tensor‑hashing efficiency. We’re actively measuring and tuning across platforms. + + + +## Installation Steps + +> **Note:** Example workloads are verified on Python 3.10 and PyTorch 2.2.2 + CUDA 12.1. If you’re not reproducing our benchmarks, feel free to install any supported versions. + +> **AEC note:** For full artifact evaluation, we recommend Ubuntu 22.04 with two Nvidia Ampere‑class GPUs (≥ 12 GiB GPU memory each). For the 5‑minute tutorial, any Linux or macOS (Apple Silicon) laptop will do. + +1. **Install Conda** + Install Miniconda by following the [official Miniconda guide](https://www.anaconda.com/docs/getting-started/miniconda/install#quickstart-install-instructions). + +2. **Create & activate a Python 3.10 Conda Env** + ```bash + conda create -n traincheck python=3.10 -y + conda activate traincheck + ``` + +3. **Install PyTorch 2.2.2 with CUDA support** + ```bash + pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121 + ``` + + If your GPU does not support CUDA12, CUDA11.8 is also acceptable. + + ```bash + pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu118 + ``` + + If you don't have a CUDA-enabled GPU, just install the CPU version and skip step 4. + + ```bash + pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu + ``` + +4. 
**(CUDA platforms only) Install cudatoolkit** + ```bash + conda install cudatoolkit + ``` + +5. **Clone & install TrainCheck** + ```bash + git clone https://github.com/OrderLab/TrainCheck.git + cd TrainCheck + pip3 install . + ``` + +6. **Verify Installation** + You should now have three clis installed in your system. Do a quick test to see of these commands are available and functional. + ```bash + traincheck-collect --help + traincheck-infer --help + traincheck-check --help + ``` + +## Next Steps + +- **5‑Minute TrainCheck Experience** + Follow the [5‑Minute Tutorial](./5-min-tutorial.md) to instrument a script, infer invariants, and catch silent bugs in under five minutes. + +- **Technical Documentation** + Explore the [TrainCheck Technical Doc](./technical-doc.md) for a comprehensive guide to features, configuration, and advanced workflows. \ No newline at end of file diff --git a/docs/instr.md b/docs/instr.md new file mode 100644 index 00000000..d2781316 --- /dev/null +++ b/docs/instr.md @@ -0,0 +1,143 @@ +# Instrumentation & Trace Representation + +`traincheck-collect` is the starting point of TrainCheck's workflow. It instruments your PyTorch training script to capture runtime behavior, generating detailed execution traces for later invariant inference and issue detection. + +This document explains how to use `traincheck-collect` effectively. +TrainCheck dynamically wraps key PyTorch APIs and monitors model states—**no modifications to your original training code are required**. + +Use `traincheck-collect` when you need to: +- Generate traces from **reference pipelines** for invariant inference. +- Collect traces from **target pipelines** to detect silent issues using pre-inferred invariants. + +## Table of Contents + +1. [Introduction](#instrumentation--trace-representation) +2. [🔧 Basic Usage](#-basic-usage) + - [Configuration File Example](#configuration-file-example) + - [Running traincheck-collect](#running-traincheck-collect) + - [Selective Instrumentation for Checking](#selective-instrumentation-for-checking) + - [Output Structure](#output-structure) + - [Overriding Configuration via CLI](#overriding-configuration-via-cli) +3. [Adding Meta Variables to Traces](#adding-meta-variables-to-traces) + - [How Meta Variables Improve Inference](#learn-how-meta-variables-improve-invariant-inference) + - [Examples of Useful Meta Variables](#-examples-of-useful-meta-variables) + - [How to Annotate Meta Variables](#how-to-annotate-meta-variables) +4. [Trace Representation](#trace-representation) +5. [Instrumentation Mechanisms](#instrumentation-mechanisms) +6. [Advanced Usage](#advanced-usage) +7. [Algorithms Overview](#algorithms-overview) +8. [Troubleshooting & FAQs](#troubleshooting--faqs) + +## 🔧 Basic Usage + +`traincheck-collect` requires three types of input: + +1. **Python script** to instrument. +2. **Launch arguments** (if any) for executing the script. +3. **Instrumentation-specific configurations**. + +You can provide these inputs either directly via the command line or through a configuration file. +▶️ **Recommendation**: Use a configuration file for clarity and reusability. + +Here’s an example configuration: + +```yaml +pyscript: ./mnist.py # Python entry point of your training program. +shscript: ./run.sh # [Optional] Shell script to launch with custom arguments or environment setup. +modules_to_instr: # Libraries to instrument. Defaults to ['torch'] if omitted. + - torch +models_to_track: # [Optional] Variable names of models to track. 
Leave empty to disable model tracking. + - model +model_tracker_style: proxy # [Optional] Tracking method: "proxy" (default) or "sampler". +copy_all_files: false # [Optional] Set true if your code relies on relative paths (e.g., local datasets/configs). +``` + +You can find example configurations and training programs in: + • [MNIST Example](./assets/examples/traincheck-collect/mnist-config/) + • [GPT-2 Pretrain Example](./assets/examples/traincheck-collect/gpt2-pretrain-config/) + +Run TrainCheck trace collection with: + +```bash +traincheck-collect --use-config --config +``` + +This command instruments the specified libraries and model variables, then executes your program. +(Details on instrumentation mechanisms and limitations will follow in the next section. TODO) + +### Selective Instrumentation for Checking + +When checking for silent issues, `traincheck-collect` supports selective instrumentation to improve efficiency. +Simply provide the invariants file: + +```bash +traincheck-collect --use-config --config --invariants +``` + +TrainCheck will automatically adjust instrumentation granularity based on the provided invariants. + +### Output Structure +By default, TrainCheck creates a folder named: + +```bash +traincheck_run___ +``` + +This folder contains: +- Collected traces +- Instrumented scripts and execution logs (if the program completes successfully) + +You can also provide any additional arguments not specified in the configuration through the commandline interface, such as + +### Overriding Configuration via CLI + +You can override or supplement configuration settings by providing additional arguments directly via the command line. For example: + +```bash +# Write trace files to ./trace_training instead of using the default auto-generated folder name +traincheck-collect --use-config --config --output-dir trace_training +``` + +To view all available command-line arguments and configuration options, run: + +```bash +traincheck-collect --help +``` + +**Note**: When using a configuration file, replace hyphens (-) in argument names with underscores (_). +For example: +- Command-line: `--output-dir trace_training` +- Configuration file: `output_dir: trace_training` + +## Adding Meta Variables to Traces + +You can enhance your traces by providing **custom meta variables**—semantic information about your program's execution. These annotations improve the **quality and precision** of inferred invariants by offering context that might not be directly observable from raw traces. + +
+
+### Learn how meta variables improve invariant inference
+
+TrainCheck infers **preconditions** for each invariant—these are predicates that distinguish between positive and negative examples in the trace.
+- A **positive example** is a trace segment where the invariant holds.
+- A **negative example** is a trace segment where it is violated.
+
+Many invariants are inherently **conditional**, meaning they hold only in certain contexts (e.g., during training but not during initialization). TrainCheck tries to discover such conditions automatically.
+
+However, trace data alone may lack sufficient context. This is where **meta variables** come in—they inject semantic hints (such as the execution phase or step number) that guide smarter inference.
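+
+As a concrete illustration, the MNIST example bundled with this change (`docs/assets/examples/traincheck-collect/mnist-config/mnist.py`) annotates a step counter and the current stage directly in the training script. The sketch below follows that same pattern; the loop bound and the elided model code are placeholders:
+
+```python
+from traincheck import annotate_stage
+from traincheck.instrumentor import meta_vars
+
+meta_vars["step"] = -1     # custom meta variable: the current training step
+
+annotate_stage("init")     # setup code (model, optimizer, data) runs in the "init" stage
+# ... build model, optimizer, dataloaders ...
+
+annotate_stage("training")           # trace records from here on belong to "training"
+for batch_idx in range(100):         # placeholder loop bound
+    meta_vars["step"] += 1           # advance the step meta variable once per batch
+    # ... forward pass, loss.backward(), optimizer.step() ...
+
+annotate_stage("testing")  # evaluation code is annotated as "testing"
+```
+
+The annotated values surface in traces as `meta_vars.step` and `meta_vars.stage`, matching the precondition fields shown in the invariant example in [infer.md](infer.md).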
+ +### ✨ Examples of Useful Meta Variables +1. **`stage`** — Indicates whether a trace record belongs to initialization, training, or evaluation. +2. **`step_id`** — The current training step or iteration number. +3. **Custom arguments** — Any domain-specific flags or parameters relevant to your training logic. + +### How to Annotate Meta Variables +📌 **[To Be Documented]** +Instructions for defining and injecting meta variables into traces will be provided in a future update. + +## Trace Representation +📌 **[To Be Documented]** + +## Instrumentation Mechanisms +📌 **[To Be Documented]** +Details about TrainCheck’s instrumentation strategies, supported APIs, and limitations will be covered here later. \ No newline at end of file diff --git a/docs/technical-doc.md b/docs/technical-doc.md new file mode 100644 index 00000000..c4923738 --- /dev/null +++ b/docs/technical-doc.md @@ -0,0 +1,46 @@ +# TrainCheck Documentation + +🚜 This documentation is under construction. We welcome any feedback or questions through GitHub Issues or [our Discord server](https://discord.gg/DPEd7Xeg). + + +TrainCheck is a lightweight, invariant-based instrumentation and analysis tool for identifying silent correctness issues in PyTorch training pipelines. It infers behavioral invariants from correct reference runs (e.g., official examples or clean configurations), then checks other scripts for behavioral violations. TrainCheck is designed to be minimally intrusive—requiring no code modifications or rewrites of training logic. + +## 🔧 System Overview + +TrainCheck consists of three core command-line utilities: + +1. **traincheck-collect** – Instruments a training pipeline and collects trace logs. +2. **traincheck-infer** – Infers behavioral invariants from the collected traces. +3. **traincheck-check** – Checks new traces against a set of inferred invariants to detect silent issues. + +TrainCheck workflows are organized into two stages: + +1. **🧪 Inference Stage** + - **traincheck-collect** collects execution traces from reference training pipelines. + - **traincheck-infer** analyzes traces and produces invariants that describe correct/expected runtime behavior. + +2. **🚨 Checking Stage** + - **traincheck-collect** is used again to trace the target (possibly buggy) pipeline. + - **traincheck-check** verifies whether the collected trace violates any of the known invariants. + +### 📦 Pre-Inferred Invariants (On the Roadmap) + +In common use cases, users typically do not need to infer invariants manually. TrainCheck provides a high-quality set of pre-inferred invariants that work out-of-the-box with popular libraries such as PyTorch, HuggingFace Transformers, and DeepSpeed. + +You may still want to run inference in the following cases: +- When using certain niche or uncommon features not covered by the default invariants. +- When working with custom training stacks outside supported libraries. +- When you want to increase specificity by inferring invariants from a set of related, known-good pipelines (e.g. in industrial settings). + +## 📚 Component Documentation + +Each utility is documented separately: + +- [Collecting Traces with traincheck-collect](instr.md) + Usage, instrumentation caveats, and trace file format. + +- [Inferring Invariants with traincheck-infer](infer.md) +CLI usage, performance considerations, invariant format, and the inference algorithm (relations, preconditions, etc.). 
+ +- [Checking Violations with traincheck-check](check.md) +How to apply invariants to new traces, result interpretation, and result file formats. diff --git a/eval_scripts/false_positive/CNN/trainset/MNIST-1epo/main.py b/eval_scripts/false_positive/CNN/trainset/MNIST-1epo/main.py index 21e31818..0468a08c 100644 --- a/eval_scripts/false_positive/CNN/trainset/MNIST-1epo/main.py +++ b/eval_scripts/false_positive/CNN/trainset/MNIST-1epo/main.py @@ -1,15 +1,16 @@ import argparse + import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): @@ -38,10 +39,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -49,9 +50,15 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break @@ -61,7 +68,7 @@ def train(args, model, device, train_loader, optimizer, epoch): def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -70,8 +77,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -79,42 +90,90 @@ def test(model, device, test_loader): if data_idx == 10: break - test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -127,26 +186,21 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = 
{"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) test_kwargs.update(cuda_kwargs) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('../data', train=False, - transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform) + dataset2 = datasets.MNIST("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -157,13 +211,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-CIFAR10/main.py b/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-CIFAR10/main.py index c9d19824..97f7f53d 100644 --- a/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-CIFAR10/main.py +++ b/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-CIFAR10/main.py @@ -1,15 +1,16 @@ import argparse + import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): @@ -39,10 +40,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -50,17 +51,24 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. 
* batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break if batch_idx == 50: break + def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -69,8 +77,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -79,39 +91,88 @@ def test(model, device, test_loader): test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + 
parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -124,12 +185,10 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = {"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) @@ -144,15 +203,19 @@ def main(): # dataset2 = datasets.MNIST('../data', train=False, # transform=transform) - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # Normalize for CIFAR-10 - ]) - dataset1 = datasets.CIFAR10('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.CIFAR10('../data', train=False, - transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) + ), # Normalize for CIFAR-10 + ] + ) + dataset1 = datasets.CIFAR10( + "../data", train=True, download=True, transform=transform + ) + dataset2 = datasets.CIFAR10("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -163,13 +226,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-larger-model/main.py b/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-larger-model/main.py index 7a18b337..ac710038 100644 --- a/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-larger-model/main.py +++ 
b/eval_scripts/false_positive/CNN/trainset/MNIST-3epo-larger-model/main.py @@ -1,22 +1,23 @@ import argparse + import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 64, 3, 1) # More filters - self.conv2 = nn.Conv2d(64, 128, 3, 1) # More filters - self.conv3 = nn.Conv2d(128, 256, 3, 1) # Extra conv layer + self.conv1 = nn.Conv2d(1, 64, 3, 1) # More filters + self.conv2 = nn.Conv2d(64, 128, 3, 1) # More filters + self.conv3 = nn.Conv2d(128, 256, 3, 1) # Extra conv layer self.dropout1 = nn.Dropout(0.3) self.dropout2 = nn.Dropout(0.5) self.fc1 = nn.Linear(30976, 256) # Larger FC layer @@ -38,10 +39,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -49,9 +50,15 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break @@ -61,7 +68,7 @@ def train(args, model, device, train_loader, optimizer, epoch): def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -70,8 +77,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -79,42 +90,90 @@ def test(model, device, test_loader): if data_idx == 10: break - test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -127,26 +186,21 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = 
{"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) test_kwargs.update(cuda_kwargs) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('../data', train=False, - transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform) + dataset2 = datasets.MNIST("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -157,13 +211,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/trainset/MNIST-3epo/main.py b/eval_scripts/false_positive/CNN/trainset/MNIST-3epo/main.py index 21e31818..0468a08c 100644 --- a/eval_scripts/false_positive/CNN/trainset/MNIST-3epo/main.py +++ b/eval_scripts/false_positive/CNN/trainset/MNIST-3epo/main.py @@ -1,15 +1,16 @@ import argparse + import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): @@ -38,10 +39,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -49,9 +50,15 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. 
* batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break @@ -61,7 +68,7 @@ def train(args, model, device, train_loader, optimizer, epoch): def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -70,8 +77,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -79,42 +90,90 @@ def test(model, device, test_loader): if data_idx == 10: break - test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + 
default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -127,26 +186,21 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = {"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) test_kwargs.update(cuda_kwargs) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('../data', train=False, - transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform) + dataset2 = datasets.MNIST("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -157,13 +211,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/validset/MNIST-FashionMNIST/main.py b/eval_scripts/false_positive/CNN/validset/MNIST-FashionMNIST/main.py index dfbe45a8..e1173668 100644 --- a/eval_scripts/false_positive/CNN/validset/MNIST-FashionMNIST/main.py +++ b/eval_scripts/false_positive/CNN/validset/MNIST-FashionMNIST/main.py @@ -1,15 +1,16 @@ import argparse + import torch import torch.nn as nn import 
torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): @@ -38,10 +39,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -49,17 +50,24 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break if batch_idx == 50: break + def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -68,8 +76,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -78,39 +90,88 @@ def test(model, device, test_loader): test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -123,27 +184,24 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = 
{"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) test_kwargs.update(cuda_kwargs) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.FashionMNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.FashionMNIST('../data', train=False, - transform=transform) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.FashionMNIST( + "../data", train=True, download=True, transform=transform + ) + dataset2 = datasets.FashionMNIST("../data", train=False, transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -154,13 +212,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/validset/MNIST-HighRes-CIFAR10/main.py b/eval_scripts/false_positive/CNN/validset/MNIST-HighRes-CIFAR10/main.py index 84ee9f17..34dcdccf 100644 --- a/eval_scripts/false_positive/CNN/validset/MNIST-HighRes-CIFAR10/main.py +++ b/eval_scripts/false_positive/CNN/validset/MNIST-HighRes-CIFAR10/main.py @@ -1,15 +1,16 @@ import argparse + import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): @@ -39,10 +40,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -50,17 +51,24 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. 
* batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break if batch_idx == 50: break + def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -69,8 +77,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -79,39 +91,88 @@ def test(model, device, test_loader): test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + 
parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -124,12 +185,10 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = {"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) @@ -144,16 +203,22 @@ def main(): # dataset2 = datasets.MNIST('../data', train=False, # transform=transform) - transform = transforms.Compose([ - transforms.Resize((32, 32)), # Rescale STL10 images to 32x32 - transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # Normalize for CIFAR-10 - ]) - dataset1 = datasets.STL10('../data', split="train", download=True, - transform=transform) - dataset2 = datasets.STL10('../data', split="test", download=True, - transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + transform = transforms.Compose( + [ + transforms.Resize((32, 32)), # Rescale STL10 images to 32x32 + transforms.ToTensor(), + transforms.Normalize( + (0.5, 0.5, 0.5), (0.5, 0.5, 0.5) + ), # Normalize for CIFAR-10 + ] + ) + dataset1 = datasets.STL10( + "../data", split="train", download=True, transform=transform + ) + dataset2 = datasets.STL10( + "../data", split="test", download=True, transform=transform + ) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -164,13 +229,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/validset/MNIST-Longer/main.py b/eval_scripts/false_positive/CNN/validset/MNIST-Longer/main.py index 
188d2be9..a5f7d03b 100644 --- a/eval_scripts/false_positive/CNN/validset/MNIST-Longer/main.py +++ b/eval_scripts/false_positive/CNN/validset/MNIST-Longer/main.py @@ -1,15 +1,16 @@ import argparse + import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR +from torchvision import datasets, transforms +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +meta_vars["step"] = -1 -meta_vars['step'] = -1 class Net(nn.Module): def __init__(self): @@ -38,10 +39,10 @@ def forward(self, x): def train(args, model, device, train_loader, optimizer, epoch): - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation model.train() for batch_idx, (data, target) in enumerate(train_loader): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) @@ -49,9 +50,15 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. * batch_idx / len(train_loader), loss.item())) + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) if args.dry_run: break if batch_idx == 50: @@ -59,7 +66,7 @@ def train(args, model, device, train_loader, optimizer, epoch): def test(model, device, test_loader): - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation model.eval() test_loss = 0 correct = 0 @@ -68,8 +75,12 @@ def test(model, device, test_loader): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + test_loss += F.nll_loss( + output, target, reduction="sum" + ).item() # sum up batch loss + pred = output.argmax( + dim=1, keepdim=True + ) # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() data_idx += 1 @@ -79,39 +90,88 @@ def test(model, device, test_loader): test_loss /= len(test_loader.dataset) - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) + print( + "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format( + test_loss, + correct, + len(test_loader.dataset), + 100.0 * correct / len(test_loader.dataset), + ) + ) def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=14, metavar='N', - help='number of epochs to train (default: 14)') - parser.add_argument('--lr', type=float, default=1.0, metavar='LR', - help='learning rate (default: 1.0)') - parser.add_argument('--gamma', type=float, default=0.7, metavar='M', - help='Learning rate step gamma (default: 0.7)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--no-mps', action='store_true', default=False, - help='disables macOS GPU training') - parser.add_argument('--dry-run', action='store_true', default=False, - help='quickly check a single pass') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument( + "--batch-size", + type=int, + default=64, + metavar="N", + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + metavar="N", + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", + type=int, + default=14, + metavar="N", + help="number of epochs to train (default: 14)", + ) + parser.add_argument( + "--lr", + type=float, + default=1.0, + metavar="LR", + help="learning rate (default: 1.0)", + ) + parser.add_argument( + "--gamma", + type=float, + default=0.7, + metavar="M", + help="Learning rate step gamma (default: 0.7)", + ) + parser.add_argument( + "--no-cuda", action="store_true", default=False, help="disables CUDA training" + ) + parser.add_argument( + "--no-mps", + action="store_true", + default=False, + help="disables macOS GPU training", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="quickly check a single pass", + ) + parser.add_argument( + "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)" + ) + parser.add_argument( + "--log-interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--save-model", + action="store_true", + default=False, + help="For Saving the current Model", + ) args = parser.parse_args() - annotate_stage("init") # ML_DAIKON: stage annotation + annotate_stage("init") # ML_DAIKON: stage annotation use_cuda = not args.no_cuda and torch.cuda.is_available() use_mps = not args.no_mps and torch.backends.mps.is_available() @@ -124,26 +184,21 @@ def main(): else: device = torch.device("cpu") - train_kwargs = {'batch_size': args.batch_size} - test_kwargs = {'batch_size': args.test_batch_size} + train_kwargs = 
{"batch_size": args.batch_size} + test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: - cuda_kwargs = {'num_workers': 2, - 'pin_memory': True, - 'shuffle': True} + cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True} # ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants # cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True} train_kwargs.update(cuda_kwargs) test_kwargs.update(cuda_kwargs) - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) - dataset1 = datasets.MNIST('../data', train=True, download=True, - transform=transform) - dataset2 = datasets.MNIST('../data', train=False, - transform=transform) - train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) + dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform) + dataset2 = datasets.MNIST("../data", train=False, transform=transform) + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs) model = Net().to(device) @@ -154,13 +209,13 @@ def main(): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) - annotate_stage("training") # ML_DAIKON: stage annotation + annotate_stage("training") # ML_DAIKON: stage annotation scheduler.step() if args.save_model: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation + annotate_stage("checkpointing") # ML_DAIKON: stage annotation torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/eval_scripts/false_positive/CNN/validset/pytorch-cifar-EfficientNet/main.py b/eval_scripts/false_positive/CNN/validset/pytorch-cifar-EfficientNet/main.py index ec6a7a8a..85d6148a 100644 --- a/eval_scripts/false_positive/CNN/validset/pytorch-cifar-EfficientNet/main.py +++ b/eval_scripts/false_positive/CNN/validset/pytorch-cifar-EfficientNet/main.py @@ -1,62 +1,80 @@ -'''Train CIFAR10 with PyTorch.''' +"""Train CIFAR10 with PyTorch.""" + +import argparse +import os + import torch +import torch.backends.cudnn as cudnn import torch.nn as nn -import torch.optim as optim import torch.nn.functional as F -import torch.backends.cudnn as cudnn - +import torch.optim as optim import torchvision import torchvision.transforms as transforms - -import os -import argparse - from models import * +from traincheck import annotate_stage from utils import progress_bar -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') -parser.add_argument('--lr', default=0.1, type=float, help='learning rate') -parser.add_argument('--resume', '-r', action='store_true', - help='resume from checkpoint') +parser = argparse.ArgumentParser(description="PyTorch CIFAR10 Training") +parser.add_argument("--lr", default=0.1, type=float, help="learning rate") +parser.add_argument( + "--resume", "-r", action="store_true", help="resume from checkpoint" +) args = parser.parse_args() -device = 'cuda' if torch.cuda.is_available() else 'cpu' +device = "cuda" if torch.cuda.is_available() else "cpu" best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch # Data -print('==> Preparing data..') -transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - 
transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) +print("==> Preparing data..") +transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] +) + +transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] +) trainset = torchvision.datasets.CIFAR10( - root='./data', train=True, download=True, transform=transform_train) + root="./data", train=True, download=True, transform=transform_train +) trainloader = torch.utils.data.DataLoader( - trainset, batch_size=128, shuffle=True, num_workers=2) + trainset, batch_size=128, shuffle=True, num_workers=2 +) testset = torchvision.datasets.CIFAR10( - root='./data', train=False, download=True, transform=transform_test) + root="./data", train=False, download=True, transform=transform_test +) testloader = torch.utils.data.DataLoader( - testset, batch_size=100, shuffle=False, num_workers=2) - -classes = ('plane', 'car', 'bird', 'cat', 'deer', - 'dog', 'frog', 'horse', 'ship', 'truck') + testset, batch_size=100, shuffle=False, num_workers=2 +) + +classes = ( + "plane", + "car", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", +) # Model -print('==> Building model..') +print("==> Building model..") # net = VGG('VGG11') # net = ResNet18() # net = PreActResNet18() @@ -73,29 +91,28 @@ # net = RegNetX_200MF() # net = SimpleDLA() net = net.to(device) -if device == 'cuda': +if device == "cuda": net = torch.nn.DataParallel(net) cudnn.benchmark = True if args.resume: # Load checkpoint. - print('==> Resuming from checkpoint..') - assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' - checkpoint = torch.load('./checkpoint/ckpt.pth') - net.load_state_dict(checkpoint['net']) - best_acc = checkpoint['acc'] - start_epoch = checkpoint['epoch'] + print("==> Resuming from checkpoint..") + assert os.path.isdir("checkpoint"), "Error: no checkpoint directory found!" 
+ checkpoint = torch.load("./checkpoint/ckpt.pth") + net.load_state_dict(checkpoint["net"]) + best_acc = checkpoint["acc"] + start_epoch = checkpoint["epoch"] criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(net.parameters(), lr=args.lr, - momentum=0.9, weight_decay=5e-4) +optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200) # Training def train(epoch): - annotate_stage("training") # ML_DAIKON: stage annotation - print('\nEpoch: %d' % epoch) + annotate_stage("training") # ML_DAIKON: stage annotation + print("\nEpoch: %d" % epoch) net.train() train_loss = 0 correct = 0 @@ -113,8 +130,12 @@ def train(epoch): total += targets.size(0) correct += predicted.eq(targets).sum().item() - progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) + progress_bar( + batch_idx, + len(trainloader), + "Loss: %.3f | Acc: %.3f%% (%d/%d)" + % (train_loss / (batch_idx + 1), 100.0 * correct / total, correct, total), + ) if batch_idx == 50: break @@ -122,7 +143,7 @@ def train(epoch): def test(epoch): global best_acc - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation net.eval() test_loss = 0 correct = 0 @@ -138,29 +159,38 @@ def test(epoch): total += targets.size(0) correct += predicted.eq(targets).sum().item() - progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) + progress_bar( + batch_idx, + len(testloader), + "Loss: %.3f | Acc: %.3f%% (%d/%d)" + % ( + test_loss / (batch_idx + 1), + 100.0 * correct / total, + correct, + total, + ), + ) if batch_idx == 10: break # Save checkpoint. 
- acc = 100.*correct/total + acc = 100.0 * correct / total if acc > best_acc: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation - print('Saving..') + annotate_stage("checkpointing") # ML_DAIKON: stage annotation + print("Saving..") state = { - 'net': net.state_dict(), - 'acc': acc, - 'epoch': epoch, + "net": net.state_dict(), + "acc": acc, + "epoch": epoch, } - if not os.path.isdir('checkpoint'): - os.mkdir('checkpoint') - torch.save(state, './checkpoint/ckpt.pth') + if not os.path.isdir("checkpoint"): + os.mkdir("checkpoint") + torch.save(state, "./checkpoint/ckpt.pth") best_acc = acc # for epoch in range(start_epoch, start_epoch+200): -for epoch in range(start_epoch, start_epoch+1): # only 3 epochs +for epoch in range(start_epoch, start_epoch + 1): # only 3 epochs train(epoch) test(epoch) scheduler.step() diff --git a/eval_scripts/false_positive/CNN/validset/pytorch-cifar-ResNet18/main.py b/eval_scripts/false_positive/CNN/validset/pytorch-cifar-ResNet18/main.py index 3c26ca7b..1f5ee4b6 100644 --- a/eval_scripts/false_positive/CNN/validset/pytorch-cifar-ResNet18/main.py +++ b/eval_scripts/false_positive/CNN/validset/pytorch-cifar-ResNet18/main.py @@ -1,62 +1,80 @@ -'''Train CIFAR10 with PyTorch.''' +"""Train CIFAR10 with PyTorch.""" + +import argparse +import os + import torch +import torch.backends.cudnn as cudnn import torch.nn as nn -import torch.optim as optim import torch.nn.functional as F -import torch.backends.cudnn as cudnn - +import torch.optim as optim import torchvision import torchvision.transforms as transforms - -import os -import argparse - from models import * +from traincheck import annotate_stage from utils import progress_bar -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') -parser.add_argument('--lr', default=0.1, type=float, help='learning rate') -parser.add_argument('--resume', '-r', action='store_true', - help='resume from checkpoint') +parser = argparse.ArgumentParser(description="PyTorch CIFAR10 Training") +parser.add_argument("--lr", default=0.1, type=float, help="learning rate") +parser.add_argument( + "--resume", "-r", action="store_true", help="resume from checkpoint" +) args = parser.parse_args() -device = 'cuda' if torch.cuda.is_available() else 'cpu' +device = "cuda" if torch.cuda.is_available() else "cpu" best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch # Data -print('==> Preparing data..') -transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) +print("==> Preparing data..") +transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] +) + +transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] +) trainset = torchvision.datasets.CIFAR10( - root='./data', train=True, download=True, transform=transform_train) + root="./data", train=True, download=True, transform=transform_train +) trainloader = torch.utils.data.DataLoader( - trainset, batch_size=128, 
shuffle=True, num_workers=2) + trainset, batch_size=128, shuffle=True, num_workers=2 +) testset = torchvision.datasets.CIFAR10( - root='./data', train=False, download=True, transform=transform_test) + root="./data", train=False, download=True, transform=transform_test +) testloader = torch.utils.data.DataLoader( - testset, batch_size=100, shuffle=False, num_workers=2) - -classes = ('plane', 'car', 'bird', 'cat', 'deer', - 'dog', 'frog', 'horse', 'ship', 'truck') + testset, batch_size=100, shuffle=False, num_workers=2 +) + +classes = ( + "plane", + "car", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", +) # Model -print('==> Building model..') +print("==> Building model..") # net = VGG('VGG11') net = ResNet18() # net = PreActResNet18() @@ -73,29 +91,28 @@ # net = RegNetX_200MF() # net = SimpleDLA() net = net.to(device) -if device == 'cuda': +if device == "cuda": net = torch.nn.DataParallel(net) cudnn.benchmark = True if args.resume: # Load checkpoint. - print('==> Resuming from checkpoint..') - assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' - checkpoint = torch.load('./checkpoint/ckpt.pth') - net.load_state_dict(checkpoint['net']) - best_acc = checkpoint['acc'] - start_epoch = checkpoint['epoch'] + print("==> Resuming from checkpoint..") + assert os.path.isdir("checkpoint"), "Error: no checkpoint directory found!" + checkpoint = torch.load("./checkpoint/ckpt.pth") + net.load_state_dict(checkpoint["net"]) + best_acc = checkpoint["acc"] + start_epoch = checkpoint["epoch"] criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(net.parameters(), lr=args.lr, - momentum=0.9, weight_decay=5e-4) +optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200) # Training def train(epoch): - annotate_stage("training") # ML_DAIKON: stage annotation - print('\nEpoch: %d' % epoch) + annotate_stage("training") # ML_DAIKON: stage annotation + print("\nEpoch: %d" % epoch) net.train() train_loss = 0 correct = 0 @@ -113,9 +130,13 @@ def train(epoch): total += targets.size(0) correct += predicted.eq(targets).sum().item() - progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) - + progress_bar( + batch_idx, + len(trainloader), + "Loss: %.3f | Acc: %.3f%% (%d/%d)" + % (train_loss / (batch_idx + 1), 100.0 * correct / total, correct, total), + ) + if batch_idx == 50: break @@ -123,7 +144,7 @@ def train(epoch): def test(epoch): global best_acc - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation net.eval() test_loss = 0 correct = 0 @@ -139,29 +160,38 @@ def test(epoch): total += targets.size(0) correct += predicted.eq(targets).sum().item() - progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) + progress_bar( + batch_idx, + len(testloader), + "Loss: %.3f | Acc: %.3f%% (%d/%d)" + % ( + test_loss / (batch_idx + 1), + 100.0 * correct / total, + correct, + total, + ), + ) if batch_idx == 10: break # Save checkpoint. 
- acc = 100.*correct/total + acc = 100.0 * correct / total if acc > best_acc: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation - print('Saving..') + annotate_stage("checkpointing") # ML_DAIKON: stage annotation + print("Saving..") state = { - 'net': net.state_dict(), - 'acc': acc, - 'epoch': epoch, + "net": net.state_dict(), + "acc": acc, + "epoch": epoch, } - if not os.path.isdir('checkpoint'): - os.mkdir('checkpoint') - torch.save(state, './checkpoint/ckpt.pth') + if not os.path.isdir("checkpoint"): + os.mkdir("checkpoint") + torch.save(state, "./checkpoint/ckpt.pth") best_acc = acc # for epoch in range(start_epoch, start_epoch+200): -for epoch in range(start_epoch, start_epoch+1): # only 3 epochs +for epoch in range(start_epoch, start_epoch + 1): # only 3 epochs train(epoch) test(epoch) scheduler.step() diff --git a/eval_scripts/false_positive/CNN/validset/pytorch-cifar-VGG11/main.py b/eval_scripts/false_positive/CNN/validset/pytorch-cifar-VGG11/main.py index 3c538943..de428b6e 100644 --- a/eval_scripts/false_positive/CNN/validset/pytorch-cifar-VGG11/main.py +++ b/eval_scripts/false_positive/CNN/validset/pytorch-cifar-VGG11/main.py @@ -1,63 +1,81 @@ -'''Train CIFAR10 with PyTorch.''' +"""Train CIFAR10 with PyTorch.""" + +import argparse +import os + import torch +import torch.backends.cudnn as cudnn import torch.nn as nn -import torch.optim as optim import torch.nn.functional as F -import torch.backends.cudnn as cudnn - +import torch.optim as optim import torchvision import torchvision.transforms as transforms - -import os -import argparse - from models import * +from traincheck import annotate_stage from utils import progress_bar -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') -parser.add_argument('--lr', default=0.1, type=float, help='learning rate') -parser.add_argument('--resume', '-r', action='store_true', - help='resume from checkpoint') +parser = argparse.ArgumentParser(description="PyTorch CIFAR10 Training") +parser.add_argument("--lr", default=0.1, type=float, help="learning rate") +parser.add_argument( + "--resume", "-r", action="store_true", help="resume from checkpoint" +) args = parser.parse_args() -device = 'cuda' if torch.cuda.is_available() else 'cpu' +device = "cuda" if torch.cuda.is_available() else "cpu" best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch # Data -print('==> Preparing data..') -transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) +print("==> Preparing data..") +transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] +) + +transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] +) trainset = torchvision.datasets.CIFAR10( - root='./data', train=True, download=True, transform=transform_train) + root="./data", train=True, download=True, transform=transform_train +) trainloader = torch.utils.data.DataLoader( - trainset, batch_size=128, shuffle=True, 
num_workers=2) + trainset, batch_size=128, shuffle=True, num_workers=2 +) testset = torchvision.datasets.CIFAR10( - root='./data', train=False, download=True, transform=transform_test) + root="./data", train=False, download=True, transform=transform_test +) testloader = torch.utils.data.DataLoader( - testset, batch_size=100, shuffle=False, num_workers=2) - -classes = ('plane', 'car', 'bird', 'cat', 'deer', - 'dog', 'frog', 'horse', 'ship', 'truck') + testset, batch_size=100, shuffle=False, num_workers=2 +) + +classes = ( + "plane", + "car", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", +) # Model -print('==> Building model..') -net = VGG('VGG11') +print("==> Building model..") +net = VGG("VGG11") # net = ResNet18() # net = PreActResNet18() # net = GoogLeNet() @@ -73,29 +91,28 @@ # net = RegNetX_200MF() # net = SimpleDLA() net = net.to(device) -if device == 'cuda': +if device == "cuda": net = torch.nn.DataParallel(net) cudnn.benchmark = True if args.resume: # Load checkpoint. - print('==> Resuming from checkpoint..') - assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' - checkpoint = torch.load('./checkpoint/ckpt.pth') - net.load_state_dict(checkpoint['net']) - best_acc = checkpoint['acc'] - start_epoch = checkpoint['epoch'] + print("==> Resuming from checkpoint..") + assert os.path.isdir("checkpoint"), "Error: no checkpoint directory found!" + checkpoint = torch.load("./checkpoint/ckpt.pth") + net.load_state_dict(checkpoint["net"]) + best_acc = checkpoint["acc"] + start_epoch = checkpoint["epoch"] criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(net.parameters(), lr=args.lr, - momentum=0.9, weight_decay=5e-4) +optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200) # Training def train(epoch): - annotate_stage("training") # ML_DAIKON: stage annotation - print('\nEpoch: %d' % epoch) + annotate_stage("training") # ML_DAIKON: stage annotation + print("\nEpoch: %d" % epoch) net.train() train_loss = 0 correct = 0 @@ -113,8 +130,12 @@ def train(epoch): total += targets.size(0) correct += predicted.eq(targets).sum().item() - progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) + progress_bar( + batch_idx, + len(trainloader), + "Loss: %.3f | Acc: %.3f%% (%d/%d)" + % (train_loss / (batch_idx + 1), 100.0 * correct / total, correct, total), + ) if batch_idx == 50: break @@ -122,7 +143,7 @@ def train(epoch): def test(epoch): global best_acc - annotate_stage("testing") # ML_DAIKON: stage annotation + annotate_stage("testing") # ML_DAIKON: stage annotation net.eval() test_loss = 0 correct = 0 @@ -138,30 +159,39 @@ def test(epoch): total += targets.size(0) correct += predicted.eq(targets).sum().item() - progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) - + progress_bar( + batch_idx, + len(testloader), + "Loss: %.3f | Acc: %.3f%% (%d/%d)" + % ( + test_loss / (batch_idx + 1), + 100.0 * correct / total, + correct, + total, + ), + ) + if batch_idx == 10: break # Save checkpoint. 
- acc = 100.*correct/total + acc = 100.0 * correct / total if acc > best_acc: - annotate_stage("checkpointing") # ML_DAIKON: stage annotation - print('Saving..') + annotate_stage("checkpointing") # ML_DAIKON: stage annotation + print("Saving..") state = { - 'net': net.state_dict(), - 'acc': acc, - 'epoch': epoch, + "net": net.state_dict(), + "acc": acc, + "epoch": epoch, } - if not os.path.isdir('checkpoint'): - os.mkdir('checkpoint') - torch.save(state, './checkpoint/ckpt.pth') + if not os.path.isdir("checkpoint"): + os.mkdir("checkpoint") + torch.save(state, "./checkpoint/ckpt.pth") best_acc = acc # for epoch in range(start_epoch, start_epoch+200): -for epoch in range(start_epoch, start_epoch+1): # only 3 epochs +for epoch in range(start_epoch, start_epoch + 1): # only 3 epochs train(epoch) test(epoch) scheduler.step() diff --git a/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-1epo/main.py b/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-1epo/main.py index 9678dc2b..f5894353 100644 --- a/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-1epo/main.py +++ b/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-1epo/main.py @@ -1,69 +1,89 @@ # coding: utf-8 import argparse -import time import math import os +import time + +import model import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import model - -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon.instrumentor import meta_vars -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', type=str, default='./data/wikitext-2', - help='location of the data corpus') -parser.add_argument('--model', type=str, default='LSTM', - help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', type=int, default=200, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, - help='number of layers') -parser.add_argument('--lr', type=float, default=20, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=20, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', action='store_true', default=False, - help='use CUDA') -parser.add_argument('--mps', action='store_true', default=False, - help='enables macOS GPU training') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', type=str, default='', - help='path to export the final model in onnx format') 
-parser.add_argument('--nhead', type=int, default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', action='store_true', - help='verify the code and the model') +parser = argparse.ArgumentParser( + description="PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model" +) +parser.add_argument( + "--data", type=str, default="./data/wikitext-2", help="location of the data corpus" +) +parser.add_argument( + "--model", + type=str, + default="LSTM", + help="type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)", +) +parser.add_argument("--emsize", type=int, default=200, help="size of word embeddings") +parser.add_argument( + "--nhid", type=int, default=200, help="number of hidden units per layer" +) +parser.add_argument("--nlayers", type=int, default=2, help="number of layers") +parser.add_argument("--lr", type=float, default=20, help="initial learning rate") +parser.add_argument("--clip", type=float, default=0.25, help="gradient clipping") +parser.add_argument("--epochs", type=int, default=40, help="upper epoch limit") +parser.add_argument( + "--batch_size", type=int, default=20, metavar="N", help="batch size" +) +parser.add_argument("--bptt", type=int, default=35, help="sequence length") +parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="dropout applied to layers (0 = no dropout)", +) +parser.add_argument( + "--tied", action="store_true", help="tie the word embedding and softmax weights" +) +parser.add_argument("--seed", type=int, default=1111, help="random seed") +parser.add_argument("--cuda", action="store_true", default=False, help="use CUDA") +parser.add_argument( + "--mps", action="store_true", default=False, help="enables macOS GPU training" +) +parser.add_argument( + "--log-interval", type=int, default=200, metavar="N", help="report interval" +) +parser.add_argument( + "--save", type=str, default="model.pt", help="path to save the final model" +) +parser.add_argument( + "--onnx-export", + type=str, + default="", + help="path to export the final model in onnx format", +) +parser.add_argument( + "--nhead", + type=int, + default=2, + help="the number of heads in the encoder/decoder of the transformer model", +) +parser.add_argument( + "--dry-run", action="store_true", help="verify the code and the model" +) args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda.") + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda." + ) if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if not args.mps: print("WARNING: You have mps device, to enable macOS GPU run with --mps.") @@ -94,6 +114,7 @@ # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. + def batchify(data, bsz): # Work out how cleanly we can divide the dataset into bsz parts. 
nbatch = data.size(0) // bsz @@ -103,6 +124,7 @@ def batchify(data, bsz): data = data.view(bsz, -1).t().contiguous() return data.to(device) + eval_batch_size = 10 train_data = batchify(corpus.train, args.batch_size) val_data = batchify(corpus.valid, eval_batch_size) @@ -113,7 +135,9 @@ def batchify(data, bsz): ############################################################################### ntokens = len(corpus.dictionary) -model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) +model = model.RNNModel( + args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied +).to(device) criterion = nn.NLLLoss() @@ -121,6 +145,7 @@ def batchify(data, bsz): # Training code ############################################################################### + def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" @@ -140,10 +165,11 @@ def repackage_hidden(h): # by the batchify function. The chunks are along dimension 0, corresponding # to the seq_len dimension in the LSTM. + def get_batch(source, i): seq_len = min(args.bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len].view(-1) return data, target @@ -151,9 +177,9 @@ def evaluate(data_source): # Turn on evaluation mode which disables dropout. annotate_stage("testing") model.eval() - total_loss = 0. + total_loss = 0.0 ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(eval_batch_size) with torch.no_grad(): batch_id = 0 @@ -172,13 +198,13 @@ def train(): # Turn on training mode which enables dropout. annotate_stage("training") model.train() - total_loss = 0. + total_loss = 0.0 start_time = time.time() ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. 
@@ -198,23 +224,37 @@ def train(): if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // args.bptt, lr, - elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) + print( + "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | " + "loss {:5.2f} | ppl {:8.2f}".format( + epoch, + batch, + len(train_data) // args.bptt, + lr, + elapsed * 1000 / args.log_interval, + cur_loss, + math.exp(cur_loss), + ) + ) total_loss = 0 start_time = time.time() if args.dry_run: break - + if batch == 10: break def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export))) + print( + "The model is also exported in ONNX format at {}.".format( + os.path.realpath(args.onnx_export) + ) + ) model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + dummy_input = ( + torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + ) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) @@ -225,43 +265,49 @@ def export_onnx(path, batch_size, seq_len): # At any point you can hit Ctrl + C to break out of training early. try: - for epoch in range(1, args.epochs+1): + for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) + print("-" * 89) + print( + "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | " + "valid ppl {:8.2f}".format( + epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss) + ) + ) + print("-" * 89) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: + with open(args.save, "wb") as f: torch.save(model, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. lr /= 4.0 except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') + print("-" * 89) + print("Exiting from training early") # Load the best saved model. annotate_stage("checkpointing") -with open(args.save, 'rb') as f: +with open(args.save, "rb") as f: model = torch.load(f) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]: model.rnn.flatten_parameters() # Run on test data. test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) +print("=" * 89) +print( + "| End of training | test loss {:5.2f} | test ppl {:8.2f}".format( + test_loss, math.exp(test_loss) + ) +) +print("=" * 89) if len(args.onnx_export) > 0: # Export the model in ONNX format. 
diff --git a/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-3epo/main.py b/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-3epo/main.py index 9678dc2b..f5894353 100644 --- a/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-3epo/main.py +++ b/eval_scripts/false_positive/RNN/trainset/word_language_model-RNN_TANH-3epo/main.py @@ -1,69 +1,89 @@ # coding: utf-8 import argparse -import time import math import os +import time + +import model import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import model - -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon.instrumentor import meta_vars -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', type=str, default='./data/wikitext-2', - help='location of the data corpus') -parser.add_argument('--model', type=str, default='LSTM', - help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', type=int, default=200, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, - help='number of layers') -parser.add_argument('--lr', type=float, default=20, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=20, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', action='store_true', default=False, - help='use CUDA') -parser.add_argument('--mps', action='store_true', default=False, - help='enables macOS GPU training') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', type=str, default='', - help='path to export the final model in onnx format') -parser.add_argument('--nhead', type=int, default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', action='store_true', - help='verify the code and the model') +parser = argparse.ArgumentParser( + description="PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model" +) +parser.add_argument( + "--data", type=str, default="./data/wikitext-2", help="location of the data corpus" +) +parser.add_argument( + "--model", + type=str, + default="LSTM", + help="type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)", +) +parser.add_argument("--emsize", type=int, default=200, help="size of word embeddings") +parser.add_argument( + "--nhid", type=int, default=200, help="number of hidden units per layer" +) +parser.add_argument("--nlayers", type=int, default=2, 
help="number of layers") +parser.add_argument("--lr", type=float, default=20, help="initial learning rate") +parser.add_argument("--clip", type=float, default=0.25, help="gradient clipping") +parser.add_argument("--epochs", type=int, default=40, help="upper epoch limit") +parser.add_argument( + "--batch_size", type=int, default=20, metavar="N", help="batch size" +) +parser.add_argument("--bptt", type=int, default=35, help="sequence length") +parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="dropout applied to layers (0 = no dropout)", +) +parser.add_argument( + "--tied", action="store_true", help="tie the word embedding and softmax weights" +) +parser.add_argument("--seed", type=int, default=1111, help="random seed") +parser.add_argument("--cuda", action="store_true", default=False, help="use CUDA") +parser.add_argument( + "--mps", action="store_true", default=False, help="enables macOS GPU training" +) +parser.add_argument( + "--log-interval", type=int, default=200, metavar="N", help="report interval" +) +parser.add_argument( + "--save", type=str, default="model.pt", help="path to save the final model" +) +parser.add_argument( + "--onnx-export", + type=str, + default="", + help="path to export the final model in onnx format", +) +parser.add_argument( + "--nhead", + type=int, + default=2, + help="the number of heads in the encoder/decoder of the transformer model", +) +parser.add_argument( + "--dry-run", action="store_true", help="verify the code and the model" +) args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda.") + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda." + ) if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if not args.mps: print("WARNING: You have mps device, to enable macOS GPU run with --mps.") @@ -94,6 +114,7 @@ # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. + def batchify(data, bsz): # Work out how cleanly we can divide the dataset into bsz parts. nbatch = data.size(0) // bsz @@ -103,6 +124,7 @@ def batchify(data, bsz): data = data.view(bsz, -1).t().contiguous() return data.to(device) + eval_batch_size = 10 train_data = batchify(corpus.train, args.batch_size) val_data = batchify(corpus.valid, eval_batch_size) @@ -113,7 +135,9 @@ def batchify(data, bsz): ############################################################################### ntokens = len(corpus.dictionary) -model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) +model = model.RNNModel( + args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied +).to(device) criterion = nn.NLLLoss() @@ -121,6 +145,7 @@ def batchify(data, bsz): # Training code ############################################################################### + def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" @@ -140,10 +165,11 @@ def repackage_hidden(h): # by the batchify function. The chunks are along dimension 0, corresponding # to the seq_len dimension in the LSTM. 
+ def get_batch(source, i): seq_len = min(args.bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len].view(-1) return data, target @@ -151,9 +177,9 @@ def evaluate(data_source): # Turn on evaluation mode which disables dropout. annotate_stage("testing") model.eval() - total_loss = 0. + total_loss = 0.0 ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(eval_batch_size) with torch.no_grad(): batch_id = 0 @@ -172,13 +198,13 @@ def train(): # Turn on training mode which enables dropout. annotate_stage("training") model.train() - total_loss = 0. + total_loss = 0.0 start_time = time.time() ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. @@ -198,23 +224,37 @@ def train(): if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // args.bptt, lr, - elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) + print( + "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | " + "loss {:5.2f} | ppl {:8.2f}".format( + epoch, + batch, + len(train_data) // args.bptt, + lr, + elapsed * 1000 / args.log_interval, + cur_loss, + math.exp(cur_loss), + ) + ) total_loss = 0 start_time = time.time() if args.dry_run: break - + if batch == 10: break def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export))) + print( + "The model is also exported in ONNX format at {}.".format( + os.path.realpath(args.onnx_export) + ) + ) model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + dummy_input = ( + torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + ) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) @@ -225,43 +265,49 @@ def export_onnx(path, batch_size, seq_len): # At any point you can hit Ctrl + C to break out of training early. try: - for epoch in range(1, args.epochs+1): + for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) + print("-" * 89) + print( + "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | " + "valid ppl {:8.2f}".format( + epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss) + ) + ) + print("-" * 89) # Save the model if the validation loss is the best we've seen so far. 
if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: + with open(args.save, "wb") as f: torch.save(model, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. lr /= 4.0 except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') + print("-" * 89) + print("Exiting from training early") # Load the best saved model. annotate_stage("checkpointing") -with open(args.save, 'rb') as f: +with open(args.save, "rb") as f: model = torch.load(f) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]: model.rnn.flatten_parameters() # Run on test data. test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) +print("=" * 89) +print( + "| End of training | test loss {:5.2f} | test ppl {:8.2f}".format( + test_loss, math.exp(test_loss) + ) +) +print("=" * 89) if len(args.onnx_export) > 0: # Export the model in ONNX format. diff --git a/eval_scripts/false_positive/RNN/validset/word_language_model-GRU-3epo/main.py b/eval_scripts/false_positive/RNN/validset/word_language_model-GRU-3epo/main.py index 9678dc2b..f5894353 100644 --- a/eval_scripts/false_positive/RNN/validset/word_language_model-GRU-3epo/main.py +++ b/eval_scripts/false_positive/RNN/validset/word_language_model-GRU-3epo/main.py @@ -1,69 +1,89 @@ # coding: utf-8 import argparse -import time import math import os +import time + +import model import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import model - -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon.instrumentor import meta_vars -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', type=str, default='./data/wikitext-2', - help='location of the data corpus') -parser.add_argument('--model', type=str, default='LSTM', - help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', type=int, default=200, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, - help='number of layers') -parser.add_argument('--lr', type=float, default=20, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=20, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', 
action='store_true', default=False, - help='use CUDA') -parser.add_argument('--mps', action='store_true', default=False, - help='enables macOS GPU training') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', type=str, default='', - help='path to export the final model in onnx format') -parser.add_argument('--nhead', type=int, default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', action='store_true', - help='verify the code and the model') +parser = argparse.ArgumentParser( + description="PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model" +) +parser.add_argument( + "--data", type=str, default="./data/wikitext-2", help="location of the data corpus" +) +parser.add_argument( + "--model", + type=str, + default="LSTM", + help="type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)", +) +parser.add_argument("--emsize", type=int, default=200, help="size of word embeddings") +parser.add_argument( + "--nhid", type=int, default=200, help="number of hidden units per layer" +) +parser.add_argument("--nlayers", type=int, default=2, help="number of layers") +parser.add_argument("--lr", type=float, default=20, help="initial learning rate") +parser.add_argument("--clip", type=float, default=0.25, help="gradient clipping") +parser.add_argument("--epochs", type=int, default=40, help="upper epoch limit") +parser.add_argument( + "--batch_size", type=int, default=20, metavar="N", help="batch size" +) +parser.add_argument("--bptt", type=int, default=35, help="sequence length") +parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="dropout applied to layers (0 = no dropout)", +) +parser.add_argument( + "--tied", action="store_true", help="tie the word embedding and softmax weights" +) +parser.add_argument("--seed", type=int, default=1111, help="random seed") +parser.add_argument("--cuda", action="store_true", default=False, help="use CUDA") +parser.add_argument( + "--mps", action="store_true", default=False, help="enables macOS GPU training" +) +parser.add_argument( + "--log-interval", type=int, default=200, metavar="N", help="report interval" +) +parser.add_argument( + "--save", type=str, default="model.pt", help="path to save the final model" +) +parser.add_argument( + "--onnx-export", + type=str, + default="", + help="path to export the final model in onnx format", +) +parser.add_argument( + "--nhead", + type=int, + default=2, + help="the number of heads in the encoder/decoder of the transformer model", +) +parser.add_argument( + "--dry-run", action="store_true", help="verify the code and the model" +) args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda.") + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda." + ) if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if not args.mps: print("WARNING: You have mps device, to enable macOS GPU run with --mps.") @@ -94,6 +114,7 @@ # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. + def batchify(data, bsz): # Work out how cleanly we can divide the dataset into bsz parts. 
nbatch = data.size(0) // bsz @@ -103,6 +124,7 @@ def batchify(data, bsz): data = data.view(bsz, -1).t().contiguous() return data.to(device) + eval_batch_size = 10 train_data = batchify(corpus.train, args.batch_size) val_data = batchify(corpus.valid, eval_batch_size) @@ -113,7 +135,9 @@ def batchify(data, bsz): ############################################################################### ntokens = len(corpus.dictionary) -model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) +model = model.RNNModel( + args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied +).to(device) criterion = nn.NLLLoss() @@ -121,6 +145,7 @@ def batchify(data, bsz): # Training code ############################################################################### + def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" @@ -140,10 +165,11 @@ def repackage_hidden(h): # by the batchify function. The chunks are along dimension 0, corresponding # to the seq_len dimension in the LSTM. + def get_batch(source, i): seq_len = min(args.bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len].view(-1) return data, target @@ -151,9 +177,9 @@ def evaluate(data_source): # Turn on evaluation mode which disables dropout. annotate_stage("testing") model.eval() - total_loss = 0. + total_loss = 0.0 ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(eval_batch_size) with torch.no_grad(): batch_id = 0 @@ -172,13 +198,13 @@ def train(): # Turn on training mode which enables dropout. annotate_stage("training") model.train() - total_loss = 0. + total_loss = 0.0 start_time = time.time() ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. 
@@ -198,23 +224,37 @@ def train(): if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // args.bptt, lr, - elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) + print( + "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | " + "loss {:5.2f} | ppl {:8.2f}".format( + epoch, + batch, + len(train_data) // args.bptt, + lr, + elapsed * 1000 / args.log_interval, + cur_loss, + math.exp(cur_loss), + ) + ) total_loss = 0 start_time = time.time() if args.dry_run: break - + if batch == 10: break def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export))) + print( + "The model is also exported in ONNX format at {}.".format( + os.path.realpath(args.onnx_export) + ) + ) model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + dummy_input = ( + torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + ) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) @@ -225,43 +265,49 @@ def export_onnx(path, batch_size, seq_len): # At any point you can hit Ctrl + C to break out of training early. try: - for epoch in range(1, args.epochs+1): + for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) + print("-" * 89) + print( + "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | " + "valid ppl {:8.2f}".format( + epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss) + ) + ) + print("-" * 89) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: + with open(args.save, "wb") as f: torch.save(model, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. lr /= 4.0 except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') + print("-" * 89) + print("Exiting from training early") # Load the best saved model. annotate_stage("checkpointing") -with open(args.save, 'rb') as f: +with open(args.save, "rb") as f: model = torch.load(f) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]: model.rnn.flatten_parameters() # Run on test data. test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) +print("=" * 89) +print( + "| End of training | test loss {:5.2f} | test ppl {:8.2f}".format( + test_loss, math.exp(test_loss) + ) +) +print("=" * 89) if len(args.onnx_export) > 0: # Export the model in ONNX format. 
diff --git a/eval_scripts/false_positive/RNN/validset/word_language_model-LSTM-3epo/main.py b/eval_scripts/false_positive/RNN/validset/word_language_model-LSTM-3epo/main.py index 8eee7e2e..f5894353 100644 --- a/eval_scripts/false_positive/RNN/validset/word_language_model-LSTM-3epo/main.py +++ b/eval_scripts/false_positive/RNN/validset/word_language_model-LSTM-3epo/main.py @@ -8,11 +8,11 @@ import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars annotate_stage("init") diff --git a/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_RELU-3epo/main.py b/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_RELU-3epo/main.py index 9678dc2b..f5894353 100644 --- a/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_RELU-3epo/main.py +++ b/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_RELU-3epo/main.py @@ -1,69 +1,89 @@ # coding: utf-8 import argparse -import time import math import os +import time + +import model import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import model - -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon.instrumentor import meta_vars -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', type=str, default='./data/wikitext-2', - help='location of the data corpus') -parser.add_argument('--model', type=str, default='LSTM', - help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', type=int, default=200, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, - help='number of layers') -parser.add_argument('--lr', type=float, default=20, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=20, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', action='store_true', default=False, - help='use CUDA') -parser.add_argument('--mps', action='store_true', default=False, - help='enables macOS GPU training') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', type=str, default='', - help='path to export the final model in onnx format') -parser.add_argument('--nhead', type=int, default=2, - help='the number of heads in 
the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', action='store_true', - help='verify the code and the model') +parser = argparse.ArgumentParser( + description="PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model" +) +parser.add_argument( + "--data", type=str, default="./data/wikitext-2", help="location of the data corpus" +) +parser.add_argument( + "--model", + type=str, + default="LSTM", + help="type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)", +) +parser.add_argument("--emsize", type=int, default=200, help="size of word embeddings") +parser.add_argument( + "--nhid", type=int, default=200, help="number of hidden units per layer" +) +parser.add_argument("--nlayers", type=int, default=2, help="number of layers") +parser.add_argument("--lr", type=float, default=20, help="initial learning rate") +parser.add_argument("--clip", type=float, default=0.25, help="gradient clipping") +parser.add_argument("--epochs", type=int, default=40, help="upper epoch limit") +parser.add_argument( + "--batch_size", type=int, default=20, metavar="N", help="batch size" +) +parser.add_argument("--bptt", type=int, default=35, help="sequence length") +parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="dropout applied to layers (0 = no dropout)", +) +parser.add_argument( + "--tied", action="store_true", help="tie the word embedding and softmax weights" +) +parser.add_argument("--seed", type=int, default=1111, help="random seed") +parser.add_argument("--cuda", action="store_true", default=False, help="use CUDA") +parser.add_argument( + "--mps", action="store_true", default=False, help="enables macOS GPU training" +) +parser.add_argument( + "--log-interval", type=int, default=200, metavar="N", help="report interval" +) +parser.add_argument( + "--save", type=str, default="model.pt", help="path to save the final model" +) +parser.add_argument( + "--onnx-export", + type=str, + default="", + help="path to export the final model in onnx format", +) +parser.add_argument( + "--nhead", + type=int, + default=2, + help="the number of heads in the encoder/decoder of the transformer model", +) +parser.add_argument( + "--dry-run", action="store_true", help="verify the code and the model" +) args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda.") + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda." + ) if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if not args.mps: print("WARNING: You have mps device, to enable macOS GPU run with --mps.") @@ -94,6 +114,7 @@ # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. + def batchify(data, bsz): # Work out how cleanly we can divide the dataset into bsz parts. 
nbatch = data.size(0) // bsz @@ -103,6 +124,7 @@ def batchify(data, bsz): data = data.view(bsz, -1).t().contiguous() return data.to(device) + eval_batch_size = 10 train_data = batchify(corpus.train, args.batch_size) val_data = batchify(corpus.valid, eval_batch_size) @@ -113,7 +135,9 @@ def batchify(data, bsz): ############################################################################### ntokens = len(corpus.dictionary) -model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) +model = model.RNNModel( + args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied +).to(device) criterion = nn.NLLLoss() @@ -121,6 +145,7 @@ def batchify(data, bsz): # Training code ############################################################################### + def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" @@ -140,10 +165,11 @@ def repackage_hidden(h): # by the batchify function. The chunks are along dimension 0, corresponding # to the seq_len dimension in the LSTM. + def get_batch(source, i): seq_len = min(args.bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len].view(-1) return data, target @@ -151,9 +177,9 @@ def evaluate(data_source): # Turn on evaluation mode which disables dropout. annotate_stage("testing") model.eval() - total_loss = 0. + total_loss = 0.0 ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(eval_batch_size) with torch.no_grad(): batch_id = 0 @@ -172,13 +198,13 @@ def train(): # Turn on training mode which enables dropout. annotate_stage("training") model.train() - total_loss = 0. + total_loss = 0.0 start_time = time.time() ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. 
@@ -198,23 +224,37 @@ def train(): if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // args.bptt, lr, - elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) + print( + "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | " + "loss {:5.2f} | ppl {:8.2f}".format( + epoch, + batch, + len(train_data) // args.bptt, + lr, + elapsed * 1000 / args.log_interval, + cur_loss, + math.exp(cur_loss), + ) + ) total_loss = 0 start_time = time.time() if args.dry_run: break - + if batch == 10: break def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export))) + print( + "The model is also exported in ONNX format at {}.".format( + os.path.realpath(args.onnx_export) + ) + ) model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + dummy_input = ( + torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + ) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) @@ -225,43 +265,49 @@ def export_onnx(path, batch_size, seq_len): # At any point you can hit Ctrl + C to break out of training early. try: - for epoch in range(1, args.epochs+1): + for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) + print("-" * 89) + print( + "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | " + "valid ppl {:8.2f}".format( + epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss) + ) + ) + print("-" * 89) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: + with open(args.save, "wb") as f: torch.save(model, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. lr /= 4.0 except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') + print("-" * 89) + print("Exiting from training early") # Load the best saved model. annotate_stage("checkpointing") -with open(args.save, 'rb') as f: +with open(args.save, "rb") as f: model = torch.load(f) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]: model.rnn.flatten_parameters() # Run on test data. test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) +print("=" * 89) +print( + "| End of training | test loss {:5.2f} | test ppl {:8.2f}".format( + test_loss, math.exp(test_loss) + ) +) +print("=" * 89) if len(args.onnx_export) > 0: # Export the model in ONNX format. 
diff --git a/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-Longer/main.py b/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-Longer/main.py index 9678dc2b..f5894353 100644 --- a/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-Longer/main.py +++ b/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-Longer/main.py @@ -1,69 +1,89 @@ # coding: utf-8 import argparse -import time import math import os +import time + +import model import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import model - -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon.instrumentor import meta_vars -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', type=str, default='./data/wikitext-2', - help='location of the data corpus') -parser.add_argument('--model', type=str, default='LSTM', - help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', type=int, default=200, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, - help='number of layers') -parser.add_argument('--lr', type=float, default=20, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=20, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', action='store_true', default=False, - help='use CUDA') -parser.add_argument('--mps', action='store_true', default=False, - help='enables macOS GPU training') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', type=str, default='', - help='path to export the final model in onnx format') -parser.add_argument('--nhead', type=int, default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', action='store_true', - help='verify the code and the model') +parser = argparse.ArgumentParser( + description="PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model" +) +parser.add_argument( + "--data", type=str, default="./data/wikitext-2", help="location of the data corpus" +) +parser.add_argument( + "--model", + type=str, + default="LSTM", + help="type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)", +) +parser.add_argument("--emsize", type=int, default=200, help="size of word embeddings") +parser.add_argument( + "--nhid", type=int, default=200, help="number of hidden units per layer" +) +parser.add_argument("--nlayers", type=int, 
default=2, help="number of layers") +parser.add_argument("--lr", type=float, default=20, help="initial learning rate") +parser.add_argument("--clip", type=float, default=0.25, help="gradient clipping") +parser.add_argument("--epochs", type=int, default=40, help="upper epoch limit") +parser.add_argument( + "--batch_size", type=int, default=20, metavar="N", help="batch size" +) +parser.add_argument("--bptt", type=int, default=35, help="sequence length") +parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="dropout applied to layers (0 = no dropout)", +) +parser.add_argument( + "--tied", action="store_true", help="tie the word embedding and softmax weights" +) +parser.add_argument("--seed", type=int, default=1111, help="random seed") +parser.add_argument("--cuda", action="store_true", default=False, help="use CUDA") +parser.add_argument( + "--mps", action="store_true", default=False, help="enables macOS GPU training" +) +parser.add_argument( + "--log-interval", type=int, default=200, metavar="N", help="report interval" +) +parser.add_argument( + "--save", type=str, default="model.pt", help="path to save the final model" +) +parser.add_argument( + "--onnx-export", + type=str, + default="", + help="path to export the final model in onnx format", +) +parser.add_argument( + "--nhead", + type=int, + default=2, + help="the number of heads in the encoder/decoder of the transformer model", +) +parser.add_argument( + "--dry-run", action="store_true", help="verify the code and the model" +) args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda.") + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda." + ) if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if not args.mps: print("WARNING: You have mps device, to enable macOS GPU run with --mps.") @@ -94,6 +114,7 @@ # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. + def batchify(data, bsz): # Work out how cleanly we can divide the dataset into bsz parts. nbatch = data.size(0) // bsz @@ -103,6 +124,7 @@ def batchify(data, bsz): data = data.view(bsz, -1).t().contiguous() return data.to(device) + eval_batch_size = 10 train_data = batchify(corpus.train, args.batch_size) val_data = batchify(corpus.valid, eval_batch_size) @@ -113,7 +135,9 @@ def batchify(data, bsz): ############################################################################### ntokens = len(corpus.dictionary) -model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) +model = model.RNNModel( + args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied +).to(device) criterion = nn.NLLLoss() @@ -121,6 +145,7 @@ def batchify(data, bsz): # Training code ############################################################################### + def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" @@ -140,10 +165,11 @@ def repackage_hidden(h): # by the batchify function. The chunks are along dimension 0, corresponding # to the seq_len dimension in the LSTM. 
+ def get_batch(source, i): seq_len = min(args.bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len].view(-1) return data, target @@ -151,9 +177,9 @@ def evaluate(data_source): # Turn on evaluation mode which disables dropout. annotate_stage("testing") model.eval() - total_loss = 0. + total_loss = 0.0 ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(eval_batch_size) with torch.no_grad(): batch_id = 0 @@ -172,13 +198,13 @@ def train(): # Turn on training mode which enables dropout. annotate_stage("training") model.train() - total_loss = 0. + total_loss = 0.0 start_time = time.time() ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. @@ -198,23 +224,37 @@ def train(): if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // args.bptt, lr, - elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) + print( + "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | " + "loss {:5.2f} | ppl {:8.2f}".format( + epoch, + batch, + len(train_data) // args.bptt, + lr, + elapsed * 1000 / args.log_interval, + cur_loss, + math.exp(cur_loss), + ) + ) total_loss = 0 start_time = time.time() if args.dry_run: break - + if batch == 10: break def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export))) + print( + "The model is also exported in ONNX format at {}.".format( + os.path.realpath(args.onnx_export) + ) + ) model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + dummy_input = ( + torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + ) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) @@ -225,43 +265,49 @@ def export_onnx(path, batch_size, seq_len): # At any point you can hit Ctrl + C to break out of training early. try: - for epoch in range(1, args.epochs+1): + for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) + print("-" * 89) + print( + "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | " + "valid ppl {:8.2f}".format( + epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss) + ) + ) + print("-" * 89) # Save the model if the validation loss is the best we've seen so far. 
if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: + with open(args.save, "wb") as f: torch.save(model, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. lr /= 4.0 except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') + print("-" * 89) + print("Exiting from training early") # Load the best saved model. annotate_stage("checkpointing") -with open(args.save, 'rb') as f: +with open(args.save, "rb") as f: model = torch.load(f) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]: model.rnn.flatten_parameters() # Run on test data. test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) +print("=" * 89) +print( + "| End of training | test loss {:5.2f} | test ppl {:8.2f}".format( + test_loss, math.exp(test_loss) + ) +) +print("=" * 89) if len(args.onnx_export) > 0: # Export the model in ONNX format. diff --git a/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-different-hyperparam/main.py b/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-different-hyperparam/main.py index 9678dc2b..f5894353 100644 --- a/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-different-hyperparam/main.py +++ b/eval_scripts/false_positive/RNN/validset/word_language_model-RNN_TANH-different-hyperparam/main.py @@ -1,69 +1,89 @@ # coding: utf-8 import argparse -import time import math import os +import time + +import model import torch import torch.nn as nn import torch.onnx +import traincheck.instrumentor.tracer as md_tracer +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars import data -import model - -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon.instrumentor import meta_vars -from mldaikon import annotate_stage annotate_stage("init") -parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model') -parser.add_argument('--data', type=str, default='./data/wikitext-2', - help='location of the data corpus') -parser.add_argument('--model', type=str, default='LSTM', - help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)') -parser.add_argument('--emsize', type=int, default=200, - help='size of word embeddings') -parser.add_argument('--nhid', type=int, default=200, - help='number of hidden units per layer') -parser.add_argument('--nlayers', type=int, default=2, - help='number of layers') -parser.add_argument('--lr', type=float, default=20, - help='initial learning rate') -parser.add_argument('--clip', type=float, default=0.25, - help='gradient clipping') -parser.add_argument('--epochs', type=int, default=40, - help='upper epoch limit') -parser.add_argument('--batch_size', type=int, default=20, metavar='N', - help='batch size') -parser.add_argument('--bptt', type=int, default=35, - help='sequence length') -parser.add_argument('--dropout', type=float, default=0.2, - help='dropout applied to layers (0 = no dropout)') -parser.add_argument('--tied', action='store_true', - help='tie the word embedding and softmax weights') -parser.add_argument('--seed', 
type=int, default=1111, - help='random seed') -parser.add_argument('--cuda', action='store_true', default=False, - help='use CUDA') -parser.add_argument('--mps', action='store_true', default=False, - help='enables macOS GPU training') -parser.add_argument('--log-interval', type=int, default=200, metavar='N', - help='report interval') -parser.add_argument('--save', type=str, default='model.pt', - help='path to save the final model') -parser.add_argument('--onnx-export', type=str, default='', - help='path to export the final model in onnx format') -parser.add_argument('--nhead', type=int, default=2, - help='the number of heads in the encoder/decoder of the transformer model') -parser.add_argument('--dry-run', action='store_true', - help='verify the code and the model') +parser = argparse.ArgumentParser( + description="PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model" +) +parser.add_argument( + "--data", type=str, default="./data/wikitext-2", help="location of the data corpus" +) +parser.add_argument( + "--model", + type=str, + default="LSTM", + help="type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)", +) +parser.add_argument("--emsize", type=int, default=200, help="size of word embeddings") +parser.add_argument( + "--nhid", type=int, default=200, help="number of hidden units per layer" +) +parser.add_argument("--nlayers", type=int, default=2, help="number of layers") +parser.add_argument("--lr", type=float, default=20, help="initial learning rate") +parser.add_argument("--clip", type=float, default=0.25, help="gradient clipping") +parser.add_argument("--epochs", type=int, default=40, help="upper epoch limit") +parser.add_argument( + "--batch_size", type=int, default=20, metavar="N", help="batch size" +) +parser.add_argument("--bptt", type=int, default=35, help="sequence length") +parser.add_argument( + "--dropout", + type=float, + default=0.2, + help="dropout applied to layers (0 = no dropout)", +) +parser.add_argument( + "--tied", action="store_true", help="tie the word embedding and softmax weights" +) +parser.add_argument("--seed", type=int, default=1111, help="random seed") +parser.add_argument("--cuda", action="store_true", default=False, help="use CUDA") +parser.add_argument( + "--mps", action="store_true", default=False, help="enables macOS GPU training" +) +parser.add_argument( + "--log-interval", type=int, default=200, metavar="N", help="report interval" +) +parser.add_argument( + "--save", type=str, default="model.pt", help="path to save the final model" +) +parser.add_argument( + "--onnx-export", + type=str, + default="", + help="path to export the final model in onnx format", +) +parser.add_argument( + "--nhead", + type=int, + default=2, + help="the number of heads in the encoder/decoder of the transformer model", +) +parser.add_argument( + "--dry-run", action="store_true", help="verify the code and the model" +) args = parser.parse_args() # Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: - print("WARNING: You have a CUDA device, so you should probably run with --cuda.") + print( + "WARNING: You have a CUDA device, so you should probably run with --cuda." + ) if hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): if not args.mps: print("WARNING: You have mps device, to enable macOS GPU run with --mps.") @@ -94,6 +114,7 @@ # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient # batch processing. 
+ def batchify(data, bsz): # Work out how cleanly we can divide the dataset into bsz parts. nbatch = data.size(0) // bsz @@ -103,6 +124,7 @@ def batchify(data, bsz): data = data.view(bsz, -1).t().contiguous() return data.to(device) + eval_batch_size = 10 train_data = batchify(corpus.train, args.batch_size) val_data = batchify(corpus.valid, eval_batch_size) @@ -113,7 +135,9 @@ def batchify(data, bsz): ############################################################################### ntokens = len(corpus.dictionary) -model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device) +model = model.RNNModel( + args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied +).to(device) criterion = nn.NLLLoss() @@ -121,6 +145,7 @@ def batchify(data, bsz): # Training code ############################################################################### + def repackage_hidden(h): """Wraps hidden states in new Tensors, to detach them from their history.""" @@ -140,10 +165,11 @@ def repackage_hidden(h): # by the batchify function. The chunks are along dimension 0, corresponding # to the seq_len dimension in the LSTM. + def get_batch(source, i): seq_len = min(args.bptt, len(source) - 1 - i) - data = source[i:i+seq_len] - target = source[i+1:i+1+seq_len].view(-1) + data = source[i : i + seq_len] + target = source[i + 1 : i + 1 + seq_len].view(-1) return data, target @@ -151,9 +177,9 @@ def evaluate(data_source): # Turn on evaluation mode which disables dropout. annotate_stage("testing") model.eval() - total_loss = 0. + total_loss = 0.0 ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(eval_batch_size) with torch.no_grad(): batch_id = 0 @@ -172,13 +198,13 @@ def train(): # Turn on training mode which enables dropout. annotate_stage("training") model.train() - total_loss = 0. + total_loss = 0.0 start_time = time.time() ntokens = len(corpus.dictionary) - if args.model != 'Transformer': + if args.model != "Transformer": hidden = model.init_hidden(args.batch_size) for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)): - meta_vars['step'] += 1 + meta_vars["step"] += 1 data, targets = get_batch(train_data, i) # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. 
@@ -198,23 +224,37 @@ def train(): if batch % args.log_interval == 0 and batch > 0: cur_loss = total_loss / args.log_interval elapsed = time.time() - start_time - print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | ' - 'loss {:5.2f} | ppl {:8.2f}'.format( - epoch, batch, len(train_data) // args.bptt, lr, - elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) + print( + "| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | " + "loss {:5.2f} | ppl {:8.2f}".format( + epoch, + batch, + len(train_data) // args.bptt, + lr, + elapsed * 1000 / args.log_interval, + cur_loss, + math.exp(cur_loss), + ) + ) total_loss = 0 start_time = time.time() if args.dry_run: break - + if batch == 10: break def export_onnx(path, batch_size, seq_len): - print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export))) + print( + "The model is also exported in ONNX format at {}.".format( + os.path.realpath(args.onnx_export) + ) + ) model.eval() - dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + dummy_input = ( + torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) + ) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) @@ -225,43 +265,49 @@ def export_onnx(path, batch_size, seq_len): # At any point you can hit Ctrl + C to break out of training early. try: - for epoch in range(1, args.epochs+1): + for epoch in range(1, args.epochs + 1): epoch_start_time = time.time() train() val_loss = evaluate(val_data) - print('-' * 89) - print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' - 'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time), - val_loss, math.exp(val_loss))) - print('-' * 89) + print("-" * 89) + print( + "| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | " + "valid ppl {:8.2f}".format( + epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss) + ) + ) + print("-" * 89) # Save the model if the validation loss is the best we've seen so far. if not best_val_loss or val_loss < best_val_loss: - with open(args.save, 'wb') as f: + with open(args.save, "wb") as f: torch.save(model, f) best_val_loss = val_loss else: # Anneal the learning rate if no improvement has been seen in the validation dataset. lr /= 4.0 except KeyboardInterrupt: - print('-' * 89) - print('Exiting from training early') + print("-" * 89) + print("Exiting from training early") # Load the best saved model. annotate_stage("checkpointing") -with open(args.save, 'rb') as f: +with open(args.save, "rb") as f: model = torch.load(f) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass # Currently, only rnn model supports flatten_parameters function. - if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']: + if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]: model.rnn.flatten_parameters() # Run on test data. test_loss = evaluate(test_data) -print('=' * 89) -print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format( - test_loss, math.exp(test_loss))) -print('=' * 89) +print("=" * 89) +print( + "| End of training | test loss {:5.2f} | test ppl {:8.2f}".format( + test_loss, math.exp(test_loss) + ) +) +print("=" * 89) if len(args.onnx_export) > 0: # Export the model in ONNX format. 
diff --git a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-1epo/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-1epo/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-1epo/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-1epo/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. diff --git a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-accelerate-naive/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-accelerate-naive/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-accelerate-naive/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-accelerate-naive/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-fp16/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-fp16/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-fp16/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo-fp16/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. diff --git a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/trainset/language-modeling-gpt2-88m-3epo/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/eval_scripts/false_positive/Transformers/validset/language-modeling-fim-gpt2-88m-3epo/run_fim_no_trainer.py b/eval_scripts/false_positive/Transformers/validset/language-modeling-fim-gpt2-88m-3epo/run_fim_no_trainer.py index e04dc277..ca452490 100644 --- a/eval_scripts/false_positive/Transformers/validset/language-modeling-fim-gpt2-88m-3epo/run_fim_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/validset/language-modeling-fim-gpt2-88m-3epo/run_fim_no_trainer.py @@ -34,6 +34,8 @@ import datasets import numpy as np import torch +import traincheck.instrumentor.tracer as md_tracer +import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger from accelerate.utils import set_seed @@ -41,8 +43,7 @@ from huggingface_hub import Repository, create_repo from torch.utils.data import DataLoader from tqdm.auto import tqdm - -import transformers +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -58,9 +59,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. diff --git a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-bf16/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-bf16/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-bf16/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-bf16/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-distributed/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-distributed/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-distributed/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo-distributed/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. diff --git a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-137m-3epo/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-88m-10epo/run_clm_no_trainer.py b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-88m-10epo/run_clm_no_trainer.py index b2dcbf4a..f80a3393 100755 --- a/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-88m-10epo/run_clm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/validset/language-modeling-gpt2-88m-10epo/run_clm_no_trainer.py @@ -33,6 +33,7 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger @@ -41,6 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -54,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. diff --git a/eval_scripts/false_positive/Transformers/validset/language-modeling-mlm-roberta-base-3epo/run_mlm_no_trainer.py b/eval_scripts/false_positive/Transformers/validset/language-modeling-mlm-roberta-base-3epo/run_mlm_no_trainer.py index c9c9431f..5b7fd1d8 100644 --- a/eval_scripts/false_positive/Transformers/validset/language-modeling-mlm-roberta-base-3epo/run_mlm_no_trainer.py +++ b/eval_scripts/false_positive/Transformers/validset/language-modeling-mlm-roberta-base-3epo/run_mlm_no_trainer.py @@ -33,6 +33,8 @@ import datasets import torch +import traincheck.instrumentor.tracer as md_tracer +import transformers from accelerate import Accelerator, DistributedType from accelerate.logging import get_logger from accelerate.utils import set_seed @@ -40,8 +42,7 @@ from huggingface_hub import HfApi from torch.utils.data import DataLoader from tqdm.auto import tqdm - -import transformers +from traincheck import annotate_stage from transformers import ( CONFIG_MAPPING, MODEL_MAPPING, @@ -55,9 +56,6 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as md_tracer -from mldaikon import annotate_stage - annotate_stage("init") # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
@@ -600,7 +598,7 @@ def group_texts(examples): load_from_cache_file=not args.overwrite_cache, desc=f"Grouping texts in chunks of {max_seq_length}", ) - + md_tracer.DISABLE_WRAPPER = False train_dataset = tokenized_datasets["train"] eval_dataset = tokenized_datasets["validation"] diff --git a/eval_scripts/false_positive/analyze_results.py b/eval_scripts/false_positive/analyze_results.py index 36160c68..e867b664 100644 --- a/eval_scripts/false_positive/analyze_results.py +++ b/eval_scripts/false_positive/analyze_results.py @@ -5,8 +5,8 @@ import yaml from run_exp_for_class import EXPS, get_checker_output_dir, get_setup_key -from mldaikon.checker import parse_checker_results -from mldaikon.invariant.base_cls import Invariant, read_inv_file +from traincheck.checker import parse_checker_results +from traincheck.invariant.base_cls import Invariant, read_inv_file def discover_checker_results() -> dict: diff --git a/eval_scripts/false_positive/run_exp_for_class.py b/eval_scripts/false_positive/run_exp_for_class.py index 1763f77f..8812ae99 100644 --- a/eval_scripts/false_positive/run_exp_for_class.py +++ b/eval_scripts/false_positive/run_exp_for_class.py @@ -38,7 +38,7 @@ def get_trace_collection_command(program) -> list[str]: return [ "python", "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "--use-config", "--config", f"{PROGRAM_TO_PATH[program]}/md-config-var.yml", @@ -48,7 +48,7 @@ def get_trace_collection_command(program) -> list[str]: def get_inv_inference_command(setup) -> list[str]: - cmd = ["python", "-m", "mldaikon.infer_engine", "-f"] + cmd = ["python", "-m", "traincheck.infer_engine", "-f"] for program in setup["inputs"]: cmd.append(get_trace_collection_dir(program)) cmd.append("-o") @@ -57,7 +57,7 @@ def get_inv_inference_command(setup) -> list[str]: def get_inv_checking_command(setup, program) -> list[str]: - cmd = ["python", "-m", "mldaikon.checker", "-f"] + cmd = ["python", "-m", "traincheck.checker", "-f"] cmd.append(get_trace_collection_dir(program)) cmd.append("-i") cmd.append(get_inv_file_name(setup)) @@ -211,10 +211,10 @@ def cleanup_trace_files(): print(f"Removing {trace_dir}") os.system(f"rm -rf {trace_dir}") - # remove all mldaikon logs + # remove all traincheck logs files = os.listdir(".") - mldaikon_logs = [file for file in files if file.startswith("mldaikon_")] - for log in mldaikon_logs: + traincheck_logs = [file for file in files if file.startswith("traincheck_")] + for log in traincheck_logs: print(f"Removing {log}") os.system(f"rm {log}") diff --git a/eval_scripts/perf_benchmark/overhead-e2e/README.md b/eval_scripts/perf_benchmark/overhead-e2e/README.md index 19aa604b..bb9393df 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/README.md +++ b/eval_scripts/perf_benchmark/overhead-e2e/README.md @@ -11,9 +11,9 @@ 2. sys.settrace: run xxx_settrace.py with the same command, observe per iter time from STDOUT 3. monkey patch: - run python3 -m mldaikon.collect_trace -p xxx.py, kill the program after a few iterations if you don't want it to complete + run python3 -m traincheck.collect_trace -p xxx.py, kill the program after a few iterations if you don't want it to complete 4. selective benchmarking: use invariants at sampled_100_invariants.json for instrumentation. Note that proxy class handling and args dumping incur large overhead but we have only implemented - selective dumping at the API level. 
To mimic the overhead data wit selective proxy class handling and args dumping, go to mldaikon.instrumentor.tracer and uncomment the commented out + selective dumping at the API level. To mimic the overhead data with selective proxy class handling and args dumping, go to traincheck.instrumentor.tracer and uncomment the commented out global wrapper and core wrapper. diff --git a/eval_scripts/perf_benchmark/overhead-e2e/ac_bert/README.md b/eval_scripts/perf_benchmark/overhead-e2e/ac_bert/README.md index 238859be..1d5e10a8 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/ac_bert/README.md +++ b/eval_scripts/perf_benchmark/overhead-e2e/ac_bert/README.md @@ -1 +1 @@ -Adapted from ml-daikon-input-programs/accelerate/complete_nlp_example \ No newline at end of file +Adapted from traincheck-input-programs/accelerate/complete_nlp_example \ No newline at end of file diff --git a/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main.py b/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main.py index 9dc45b33..6bf17798 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main.py @@ -14,8 +14,7 @@ import torchvision.datasets as dset import torchvision.transforms as transforms import torchvision.utils as vutils - -from mldaikon import annotate_stage +from traincheck import annotate_stage MD_BATCH_FILE_NAME = "iteration_times.txt" diff --git a/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main_settrace.py index f8273977..06816f22 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/dcgan/main_settrace.py @@ -15,8 +15,7 @@ import torchvision.datasets as dset import torchvision.transforms as transforms import torchvision.utils as vutils - -from mldaikon import annotate_stage +from traincheck import annotate_stage MD_BATCH_FILE_NAME = "iteration_times.txt" diff --git a/eval_scripts/perf_benchmark/overhead-e2e/gcn/main.py b/eval_scripts/perf_benchmark/overhead-e2e/gcn/main.py index 687f395d..f71e1a78 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/gcn/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/gcn/main.py @@ -9,9 +9,8 @@ import torch.nn.functional as F from torch import nn from torch.optim import Adam - -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars meta_vars["step"] = 0 annotate_stage("init") diff --git a/eval_scripts/perf_benchmark/overhead-e2e/gcn/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/gcn/main_settrace.py index 4aaf54d9..4c4bf7b6 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/gcn/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/gcn/main_settrace.py @@ -10,9 +10,8 @@ import torch.nn.functional as F from torch import nn from torch.optim import Adam - -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars meta_vars["step"] = 0 annotate_stage("init") diff --git a/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main.py b/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main.py index e03c9a75..c6c98523 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main.py @@ -18,11 +18,10 @@ import torchvision.datasets as datasets import 
torchvision.models as models import torchvision.transforms as transforms +import traincheck.instrumentor.tracer as tc_tracer from torch.optim.lr_scheduler import StepLR from torch.utils.data import Subset -import mldaikon.instrumentor.tracer as tc_tracer - tc_tracer.DISABLE_WRAPPER = True model_names = sorted( diff --git a/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main_settrace.py index 500cb8d2..38d695bc 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/resnet18/main_settrace.py @@ -19,11 +19,10 @@ import torchvision.datasets as datasets import torchvision.models as models import torchvision.transforms as transforms +import traincheck.instrumentor.tracer as tc_tracer from torch.optim.lr_scheduler import StepLR from torch.utils.data import Subset -import mldaikon.instrumentor.tracer as tc_tracer - tc_tracer.DISABLE_WRAPPER = True model_names = sorted( diff --git a/eval_scripts/perf_benchmark/overhead-e2e/siamese/main.py b/eval_scripts/perf_benchmark/overhead-e2e/siamese/main.py index ee5978bc..d592ad46 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/siamese/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/siamese/main.py @@ -12,8 +12,7 @@ from torch.optim.lr_scheduler import StepLR from torch.utils.data import Dataset from torchvision import datasets - -from mldaikon import annotate_stage +from traincheck import annotate_stage MD_BATCH_FILE_NAME = "iteration_times.txt" diff --git a/eval_scripts/perf_benchmark/overhead-e2e/siamese/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/siamese/main_settrace.py index 7c505fd7..b9c4635c 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/siamese/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/siamese/main_settrace.py @@ -13,8 +13,7 @@ from torch.optim.lr_scheduler import StepLR from torch.utils.data import Dataset from torchvision import datasets - -from mldaikon import annotate_stage +from traincheck import annotate_stage MD_BATCH_FILE_NAME = "iteration_times.txt" diff --git a/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main.py b/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main.py index fb23d243..93a7cd5a 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main.py @@ -53,7 +53,7 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -# import mldaikon.instrumentor.tracer as tc_tracer +# import traincheck.instrumentor.tracer as tc_tracer MD_BATCH_FILE_NAME = "iteration_times.txt" with open(MD_BATCH_FILE_NAME, "w") as f: diff --git a/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main_settrace.py index 87ad3966..e3f2ecfa 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/tf_img_cls/main_settrace.py @@ -54,7 +54,7 @@ from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version -# import mldaikon.instrumentor.tracer as tc_tracer +# import traincheck.instrumentor.tracer as tc_tracer MD_BATCH_FILE_NAME = "iteration_times.txt" with open(MD_BATCH_FILE_NAME, "w") as f: diff --git a/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main.py 
b/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main.py index b1f5263f..a244a17a 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main.py @@ -32,6 +32,7 @@ import nltk import numpy as np import torch +import traincheck.instrumentor.tracer as tc_tracer import transformers from accelerate import Accelerator from accelerate.logging import get_logger @@ -58,8 +59,6 @@ ) from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as tc_tracer - MD_BATCH_FILE_NAME = "iteration_times.txt" with open(MD_BATCH_FILE_NAME, "w") as f: f.write("") diff --git a/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main_settrace.py index ee9457ee..587d05fa 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/tf_sum/main_settrace.py @@ -33,6 +33,7 @@ import nltk import numpy as np import torch +import traincheck.instrumentor.tracer as tc_tracer import transformers from accelerate import Accelerator from accelerate.logging import get_logger @@ -59,8 +60,6 @@ ) from transformers.utils.versions import require_version -import mldaikon.instrumentor.tracer as tc_tracer - MD_BATCH_FILE_NAME = "iteration_times.txt" with open(MD_BATCH_FILE_NAME, "w") as f: f.write("") diff --git a/eval_scripts/perf_benchmark/overhead-e2e/vae/main.py b/eval_scripts/perf_benchmark/overhead-e2e/vae/main.py index 0a85222a..cb002925 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/vae/main.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/vae/main.py @@ -9,8 +9,7 @@ from torch.nn import functional as F from torchvision import datasets, transforms from torchvision.utils import save_image - -from mldaikon import annotate_stage +from traincheck import annotate_stage MD_BATCH_FILE_NAME = "iteration_times.txt" with open(MD_BATCH_FILE_NAME, "w") as f: diff --git a/eval_scripts/perf_benchmark/overhead-e2e/vae/main_settrace.py b/eval_scripts/perf_benchmark/overhead-e2e/vae/main_settrace.py index af1ab878..c641172b 100644 --- a/eval_scripts/perf_benchmark/overhead-e2e/vae/main_settrace.py +++ b/eval_scripts/perf_benchmark/overhead-e2e/vae/main_settrace.py @@ -10,8 +10,7 @@ from torch.nn import functional as F from torchvision import datasets, transforms from torchvision.utils import save_image - -from mldaikon import annotate_stage +from traincheck import annotate_stage MD_BATCH_FILE_NAME = "iteration_times.txt" with open(MD_BATCH_FILE_NAME, "w") as f: diff --git a/eval_scripts/perf_benchmark/overhead-micro/workload.py b/eval_scripts/perf_benchmark/overhead-micro/workload.py index 3ea45ced..d195fb76 100644 --- a/eval_scripts/perf_benchmark/overhead-micro/workload.py +++ b/eval_scripts/perf_benchmark/overhead-micro/workload.py @@ -3,7 +3,7 @@ import torch os.environ["ML_DAIKON_OUTPUT_DIR"] = "." 
-from mldaikon.instrumentor.tracer import Instrumentor # noqa +from traincheck.instrumentor.tracer import Instrumentor # noqa Instrumentor( torch, diff --git a/eval_scripts/perf_benchmark/run_all.xsh b/eval_scripts/perf_benchmark/run_all.xsh index e4a4ee30..8eecddd7 100644 --- a/eval_scripts/perf_benchmark/run_all.xsh +++ b/eval_scripts/perf_benchmark/run_all.xsh @@ -75,8 +75,8 @@ def run_exp(kill_sec: int = 100, workload: str = "mnist", use_proxy: bool = Fals SETTRACE_PY = "main_settrace.py" RUN_SH = "run.sh" MD_CONFIG_YML = "md-config.yml" if not use_proxy else "md-config-var.yml" - CMD_TRAINCHECK = f"python -m mldaikon.collect_trace --use-config --config {MD_CONFIG_YML} --output-dir traincheck" - CMD_TRAINCHECK_SELECTIVE = f"python -m mldaikon.collect_trace --use-config --config {MD_CONFIG_YML} --output-dir traincheck-selective -i ../{SELC_INV_FILE}" + CMD_TRAINCHECK = f"python -m traincheck.collect_trace --use-config --config {MD_CONFIG_YML} --output-dir traincheck" + CMD_TRAINCHECK_SELECTIVE = f"python -m traincheck.collect_trace --use-config --config {MD_CONFIG_YML} --output-dir traincheck-selective -i ../{SELC_INV_FILE}" if not os.path.exists(f"{E2E_FOLDER}/{workload}/{RUN_SH}"): cmd = "python3 main.py" diff --git a/mldaikon/invariant/relation_pool.py b/mldaikon/invariant/relation_pool.py deleted file mode 100644 index 2e04dc15..00000000 --- a/mldaikon/invariant/relation_pool.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import Type - -from mldaikon.invariant.consistency_relation import ConsistencyRelation -from mldaikon.invariant.consistency_transient_vars import ( - ConsistentInputOutputRelation, - ConsistentOutputRelation, - ThresholdRelation, -) -from mldaikon.invariant.contain_relation import APIContainRelation -from mldaikon.invariant.cover_relation import FunctionCoverRelation -from mldaikon.invariant.DistinctArgumentRelation import DistinctArgumentRelation -from mldaikon.invariant.lead_relation import FunctionLeadRelation - -# from mldaikon.invariant.var_periodic_change_relation import VarPeriodicChangeRelation - -relation_pool: list[Type] = [ - APIContainRelation, - ConsistencyRelation, - ConsistentOutputRelation, - ConsistentInputOutputRelation, - # VarPeriodicChangeRelation, - FunctionCoverRelation, - FunctionLeadRelation, - DistinctArgumentRelation, - ThresholdRelation, -] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..19dc505e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["."] +include = ["traincheck", "traincheck.*"] + +[project] +requires-python = ">= 3.10" +name = "traincheck" +version = "0.1.0" +description = "Training with Confidence: Catching Silent DL Training Bugs with Automated Proactive Checks" +authors = [ + { name = "Yuxuan Jiang", email = "jyuxuan@umich.edu" }, + { name = "Ziming Zhou", email = "zimingzh@umich.edu" }, + { name = "Boyu Xu", email = "xuboyu@umich.edu" }, + { name = "Beijie Liu", email = "kekeliu@umich.edu" }, + { name = "Yijun Wang", email = "yijunw@umich.edu" }, + { name = "Ryan Huang", email = "ryanph@umich.edu" }, +] +maintainers = [ + {name = "Yuxuan Jiang", email = "jyuxuan@umich.edu" }, +] +dependencies = [ + "astor", + "deepdiff", + "polars>=1.0.0", + "torch", + "tqdm", + "numba", + "pandas", + "pyyaml", + "orjson", +] +readme = "README.md" +classifiers = [ + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + 
"Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Intended Audience :: Developers", + "Development Status :: 4 - Beta", +] + +[project.urls] +homepage = "https://github.com/OrderLab/TrainCheck" + +[project.scripts] +traincheck-collect = "traincheck:collect_trace.main" +traincheck-infer = "traincheck:infer_engine.main" +traincheck-check = "traincheck:checker.main" diff --git a/regression_test.py b/regression_test.py index 483e0235..ace14752 100644 --- a/regression_test.py +++ b/regression_test.py @@ -4,9 +4,8 @@ import subprocess import torch.cuda - -import mldaikon.instrumentor as instrumentor -import mldaikon.runner as runner +import traincheck.instrumentor as instrumentor +import traincheck.runner as runner """ Run default mnist.py and instrumented mnist.py, compare the accuracy of the two. diff --git a/scripts/dump_api_stats.py b/scripts/dump_api_stats.py index 7eb47105..c096893c 100644 --- a/scripts/dump_api_stats.py +++ b/scripts/dump_api_stats.py @@ -10,9 +10,8 @@ import os import pandas as pd - -from mldaikon.trace import read_trace_file_Pandas -from mldaikon.utils import register_custom_excepthook +from traincheck.trace import read_trace_file_Pandas +from traincheck.utils import register_custom_excepthook def main(trace, instr_opts, iters: None | int = None): @@ -51,7 +50,7 @@ def main(trace, instr_opts, iters: None | int = None): parser.add_argument( "--instr-opts", "-o", - help="Instrumentation options file generated by MLDAIKON when doing selective instrumentation", + help="Instrumentation options file generated by traincheck when doing selective instrumentation", ) parser.add_argument( "--iters", "-i", type=int, help="Number of iterations of the experiment" diff --git a/scripts/dump_top10_freq_API.py b/scripts/dump_top10_freq_API.py new file mode 100644 index 00000000..8d5099b3 --- /dev/null +++ b/scripts/dump_top10_freq_API.py @@ -0,0 +1,40 @@ +import argparse +import logging +import os + +from traincheck.trace import read_trace_file_Pandas + +if __name__ == "__main__": + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--trace-folder", "-f", help="Folder containing the trace files" + ) + parser.add_argument( + "--instr-opts", + "-o", + help="Instrumentation options file generated by traincheck when doing selective instrumentation", + ) + parser.add_argument( + "--iters", "-i", type=int, help="Number of iterations of the experiment" + ) + parser.add_argument( + "--debug", "-d", action="store_true", help="Enable debug logging" + ) + args = parser.parse_args() + if args.debug: + logging.basicConfig(level=logging.DEBUG) + + trace_files = [ + f"{args.trace_folder}/{file}" + for file in os.listdir(args.trace_folder) + if file.startswith("trace_") or file.startswith("proxy_log.json") + ] + logger.info("Reading traces from %s", "\n".join(trace_files)) + trace = read_trace_file_Pandas(trace_files) + + trace_df = trace.events + # dump the most frequent API calls + top10_freq_API = trace_df["function"].value_counts().head(50) + print(top10_freq_API) diff --git a/scripts/dump_top_10_apis.py b/scripts/dump_top_10_apis.py new file mode 100644 index 00000000..8d5099b3 --- /dev/null +++ b/scripts/dump_top_10_apis.py @@ -0,0 +1,40 @@ +import argparse +import logging +import os + +from traincheck.trace import read_trace_file_Pandas + +if __name__ == "__main__": + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--trace-folder", "-f", help="Folder 
containing the trace files" + ) + parser.add_argument( + "--instr-opts", + "-o", + help="Instrumentation options file generated by traincheck when doing selective instrumentation", + ) + parser.add_argument( + "--iters", "-i", type=int, help="Number of iterations of the experiment" + ) + parser.add_argument( + "--debug", "-d", action="store_true", help="Enable debug logging" + ) + args = parser.parse_args() + if args.debug: + logging.basicConfig(level=logging.DEBUG) + + trace_files = [ + f"{args.trace_folder}/{file}" + for file in os.listdir(args.trace_folder) + if file.startswith("trace_") or file.startswith("proxy_log.json") + ] + logger.info("Reading traces from %s", "\n".join(trace_files)) + trace = read_trace_file_Pandas(trace_files) + + trace_df = trace.events + # dump the most frequent API calls + top10_freq_API = trace_df["function"].value_counts().head(50) + print(top10_freq_API) diff --git a/scripts/run_infer_parallel.xsh b/scripts/run_infer_parallel.xsh new file mode 100644 index 00000000..7c8e446d --- /dev/null +++ b/scripts/run_infer_parallel.xsh @@ -0,0 +1,47 @@ +import argparse +import os +import signal +import subprocess +import time + +# configs +$RAISE_SUBPROC_ERROR = True +os.environ["PYTHONUNBUFFERED"] = "1" + +args = $ARGS +args = args[1:] + +if "-o" in args or "--output" in args: + idx = args.index("-o") if "-o" in args else args.index("--output") + del args[idx:idx+2] + +from traincheck.invariant import relation_pool +relation_names = [c.__name__ for c in relation_pool] + +TMUX_SESSION_NAME = "run_infer_parallel" + +def create_tmux_session(): + """If the tmux session does not exist, create it.""" + try: + tmux has-session -t @(TMUX_SESSION_NAME) + except subprocess.CalledProcessError: + tmux new-session -d -s @(TMUX_SESSION_NAME) + +largest_window_id = int($(tmux list-windows -t @(TMUX_SESSION_NAME) | awk '{print $1}' | sed 's/://g' | sort -n | tail -1).strip() or 0) + +def run_cmd(cmd): + global largest_window_id + largest_window_id += 1 + tmux new-window -t @(TMUX_SESSION_NAME) -n @(largest_window_id) + + command = f"conda activate fp_torch222; python3 -m traincheck.infer_engine " + command += " ".join(cmd) + tmux send-keys -t @(TMUX_SESSION_NAME):@(largest_window_id) @(command) Enter + +create_tmux_session() +# for relation in relation_names: +# run_cmd(args + ["-o", f"inv_{relation}.json", "--enable-relation", relation]) + +run_cmd(args + ["-o", f"inv_FunctionCoverRelation.json", "--enable-relation", "FunctionCoverRelation"]) +run_cmd(args + ["-o", f"inv_FunctionLeadRelation.json", "--enable-relation", "FunctionLeadRelation"]) +run_cmd(args + ["-o", f"inv_other_relation.json", "--disable-relation", "FunctionCoverRelation", "FunctionLeadRelation"]) \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index ffea6bd7..00000000 --- a/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -from setuptools import find_packages, setup - -# print(find_packages()) - -setup( - name="mldaikon", - version="0.1", - python_requires=">=3.8", - packages=find_packages(), - description="ML-DAIKON in development.", - author="Yuxuan Jiang", - author_email="jyuxuan@umich.edu", - url="https://github.com/yourusername/your-project-name", - install_requires=[ - "astor", - "deepdiff", - "polars>=1.0.0", - "torch", - "tqdm", - "numba", - "pandas", - "pyyaml", - "orjson", - ], -) diff --git a/tests/bench_instrumentor/bench.py b/tests/bench_instrumentor/bench.py index 183502ff..f31c1f07 100644 --- a/tests/bench_instrumentor/bench.py +++ b/tests/bench_instrumentor/bench.py 
@@ -24,7 +24,7 @@ def run_naive_instrumented(): [ "python", "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "-p", f"{get_file_parent_dir()}/workloads/84911_efficientnet_b0_1_epochs_naive.py", "--use-full-instr", @@ -38,7 +38,7 @@ def run_naive_instrumented_with_jit_and_c_tracing_disabled(): [ "python", "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "-p", f"{get_file_parent_dir()}/workloads/84911_efficientnet_b0_1_epochs_naive.py", ] @@ -51,7 +51,7 @@ def run_naive_instrumented_with_cond_dump_jit_and_c_tracing_disabled(): [ "python", "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "-p", f"{get_file_parent_dir()}/workloads/84911_efficientnet_b0_1_epochs_naive.py", "--cond-dump", @@ -65,7 +65,7 @@ def run_sampler_instrumented(): [ "python", "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "-p", f"{get_file_parent_dir()}/workloads/84911_efficientnet_b0_1_epochs_sampler.py", ] @@ -78,7 +78,7 @@ def run_proxy_instrumented(): [ "python", "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "-p", f"{get_file_parent_dir()}/workloads/84911_efficientnet_b0_1_epochs_proxy.py", "--proxy-module", @@ -93,7 +93,7 @@ def run_proxy_instrumented(): # [ # "python", # "-m", -# "mldaikon.collect_trace", +# "traincheck.collect_trace", # "-p", # f"{get_file_parent_dir()}/workloads/84911_efficientnet_b0_1_epochs_proxy.py", # "--scan_proxy_in_args", @@ -107,7 +107,7 @@ def cleanup(): subprocess.run(["rm", "-rf", f"{get_file_parent_dir()}/workloads/*.log"]) subprocess.run(["rm", "-rf", f"{get_file_parent_dir()}/workloads/*.csv"]) subprocess.run(["rm", "-rf", f"{get_file_parent_dir()}/workloads/*.pt"]) - subprocess.run(["rm", "-rf", f"{get_file_parent_dir()}/workloads/_ml_daikon*.py"]) + subprocess.run(["rm", "-rf", f"{get_file_parent_dir()}/workloads/_traincheck*.py"]) subprocess.run(["rm", "-rf", "*.json"]) subprocess.run(["rm", "-rf", "*.log"]) diff --git a/tests/bench_instrumentor/workloads/84911_efficientnet_b0_1_epochs_sampler.py b/tests/bench_instrumentor/workloads/84911_efficientnet_b0_1_epochs_sampler.py index 940bade5..51ab6855 100644 --- a/tests/bench_instrumentor/workloads/84911_efficientnet_b0_1_epochs_sampler.py +++ b/tests/bench_instrumentor/workloads/84911_efficientnet_b0_1_epochs_sampler.py @@ -15,8 +15,7 @@ from PIL import ImageFile from torchvision import datasets from tqdm import tqdm - -from mldaikon.instrumentor.tracer import StatelessVarObserver +from traincheck.instrumentor.tracer import StatelessVarObserver shape = (224, 224) log_dir = f"runs/{shape[0]}" diff --git a/tests/test_serialization_deserialization.py b/tests/test_serialization_deserialization.py index 3ff75fca..4102f04f 100644 --- a/tests/test_serialization_deserialization.py +++ b/tests/test_serialization_deserialization.py @@ -1,49 +1,49 @@ import json -from mldaikon.invariant import Invariant +from traincheck.invariant import Invariant -invariants_json_str = """{"text_description": "torch.optim.adam.Adam.step contains torch.is_grad_enabled of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.is_grad_enabled"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.autograd.grad_mode.set_grad_enabled.__init__ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": 
"APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.autograd.grad_mode.set_grad_enabled.__init__"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._C._set_grad_enabled of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._set_grad_enabled"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer.Optimizer._cuda_graph_capture_health_check of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer.Optimizer._cuda_graph_capture_health_check"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._utils.is_compiling of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._utils.is_compiling"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.backends.cuda.is_built of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.backends.cuda.is_built"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.cuda.is_available of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda.is_available"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.cuda._is_compiled of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda._is_compiled"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.cuda._nvml_based_avail of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda._nvml_based_avail"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._C._cuda_getDeviceCount of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._cuda_getDeviceCount"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": 
"torch.optim.adam.Adam.step contains torch.cuda.graphs.is_current_stream_capturing of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda.graphs.is_current_stream_capturing"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._C._cuda_isCurrentStreamCapturing of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._cuda_isCurrentStreamCapturing"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.adam.Adam._init_group of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam._init_group"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.adam.adam of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.adam.adam"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._C._has_torch_function_unary of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._has_torch_function_unary"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._default_to_fused_or_foreach of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._default_to_fused_or_foreach"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._jit_internal.is_scripting of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._jit_internal.is_scripting"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.utils._foreach_utils._get_fused_kernels_supported_devices of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.utils._foreach_utils._get_fused_kernels_supported_devices"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._C._get_privateuse1_backend_name of type mldaikon.trace.types.FuncCallEvent", 
"relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._get_privateuse1_backend_name"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.utils._foreach_utils._get_foreach_kernels_supported_devices of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.utils._foreach_utils._get_foreach_kernels_supported_devices"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.adam._multi_tensor_adam of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.adam._multi_tensor_adam"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.utils._foreach_utils._group_tensors_by_device_and_dtype of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.utils._foreach_utils._group_tensors_by_device_and_dtype"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.autograd.grad_mode.no_grad.__init__ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.autograd.grad_mode.no_grad.__init__"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._C.PyCapsule._group_tensors_by_device_and_dtype of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C.PyCapsule._group_tensors_by_device_and_dtype"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass.is_complex of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass.is_complex"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_add_ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_add_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_lerp_ of type mldaikon.trace.types.FuncCallEvent", 
"relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_lerp_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_mul_ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_mul_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_addcmul_ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_addcmul_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._get_value of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._get_value"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._stack_if_compiling of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._stack_if_compiling"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._dispatch_sqrt of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._dispatch_sqrt"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_sqrt of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_sqrt"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_div_ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_div_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_addcdiv_ of type mldaikon.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": 
"torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_addcdiv_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} -{"text_description": "torch.optim.adam.Adam.step contains ('Parameter', 'data') of type mldaikon.trace.types.VarChangeEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "VarTypeParam", "var_type": "Parameter", "attr_name": "data"}], "precondition": {"parent_func_call_pre": [], "var_events": [{"clauses": [{"type": "constant", "prop_name": "attributes.requires_grad", "prop_dtype": "bool", "values": [true]}]}]}} -{"text_description": "torch.optim.adam.Adam.step contains ('Parameter', 'grad') of type mldaikon.trace.types.VarChangeEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "VarTypeParam", "var_type": "Parameter", "attr_name": "grad"}], "precondition": {"parent_func_call_pre": [], "var_events": [{"clauses": [{"type": "constant", "prop_name": "attributes.requires_grad", "prop_dtype": "bool", "values": [true]}]}]}} -{"text_description": "torch.optim.adam.Adam.step contains ('Tensor', 'data') of type mldaikon.trace.types.VarChangeEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "VarTypeParam", "var_type": "Tensor", "attr_name": "data"}], "precondition": {"parent_func_call_pre": [], "var_events": [{"clauses": "Unconditional"}]}}""" +invariants_json_str = """{"text_description": "torch.optim.adam.Adam.step contains torch.is_grad_enabled of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.is_grad_enabled"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.autograd.grad_mode.set_grad_enabled.__init__ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.autograd.grad_mode.set_grad_enabled.__init__"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._C._set_grad_enabled of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._set_grad_enabled"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer.Optimizer._cuda_graph_capture_health_check of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer.Optimizer._cuda_graph_capture_health_check"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._utils.is_compiling of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", 
"params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._utils.is_compiling"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.backends.cuda.is_built of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.backends.cuda.is_built"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.cuda.is_available of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda.is_available"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.cuda._is_compiled of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda._is_compiled"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.cuda._nvml_based_avail of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda._nvml_based_avail"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._C._cuda_getDeviceCount of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._cuda_getDeviceCount"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.cuda.graphs.is_current_stream_capturing of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.cuda.graphs.is_current_stream_capturing"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._C._cuda_isCurrentStreamCapturing of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._cuda_isCurrentStreamCapturing"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.optim.adam.Adam._init_group of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam._init_group"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} 
+{"text_description": "torch.optim.adam.Adam.step contains torch.optim.adam.adam of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.adam.adam"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._C._has_torch_function_unary of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._has_torch_function_unary"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._default_to_fused_or_foreach of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._default_to_fused_or_foreach"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._jit_internal.is_scripting of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._jit_internal.is_scripting"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.utils._foreach_utils._get_fused_kernels_supported_devices of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.utils._foreach_utils._get_fused_kernels_supported_devices"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._C._get_privateuse1_backend_name of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C._get_privateuse1_backend_name"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.utils._foreach_utils._get_foreach_kernels_supported_devices of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.utils._foreach_utils._get_foreach_kernels_supported_devices"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.optim.adam._multi_tensor_adam of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.adam._multi_tensor_adam"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains 
torch.utils._foreach_utils._group_tensors_by_device_and_dtype of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.utils._foreach_utils._group_tensors_by_device_and_dtype"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.autograd.grad_mode.no_grad.__init__ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.autograd.grad_mode.no_grad.__init__"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._C.PyCapsule._group_tensors_by_device_and_dtype of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._C.PyCapsule._group_tensors_by_device_and_dtype"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass.is_complex of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass.is_complex"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_add_ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_add_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_lerp_ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_lerp_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_mul_ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_mul_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_addcmul_ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_addcmul_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains 
torch.optim.optimizer._get_value of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._get_value"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._stack_if_compiling of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._stack_if_compiling"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch.optim.optimizer._dispatch_sqrt of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch.optim.optimizer._dispatch_sqrt"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_sqrt of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_sqrt"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_div_ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_div_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains torch._VariableFunctionsClass._foreach_addcdiv_ of type traincheck.trace.types.FuncCallEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "APIParam", "api_full_name": "torch._VariableFunctionsClass._foreach_addcdiv_"}], "precondition": {"parent_func_call_pre": [{"clauses": "Unconditional"}]}} +{"text_description": "torch.optim.adam.Adam.step contains ('Parameter', 'data') of type traincheck.trace.types.VarChangeEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "VarTypeParam", "var_type": "Parameter", "attr_name": "data"}], "precondition": {"parent_func_call_pre": [], "var_events": [{"clauses": [{"type": "constant", "prop_name": "attributes.requires_grad", "prop_dtype": "bool", "values": [true]}]}]}} +{"text_description": "torch.optim.adam.Adam.step contains ('Parameter', 'grad') of type traincheck.trace.types.VarChangeEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "VarTypeParam", "var_type": "Parameter", "attr_name": "grad"}], "precondition": {"parent_func_call_pre": [], "var_events": [{"clauses": [{"type": "constant", "prop_name": "attributes.requires_grad", "prop_dtype": "bool", "values": [true]}]}]}} 
+{"text_description": "torch.optim.adam.Adam.step contains ('Tensor', 'data') of type traincheck.trace.types.VarChangeEvent", "relation": "APIContainRelation", "params": [{"param_type": "APIParam", "api_full_name": "torch.optim.adam.Adam.step"}, {"param_type": "VarTypeParam", "var_type": "Tensor", "attr_name": "data"}], "precondition": {"parent_func_call_pre": [], "var_events": [{"clauses": "Unconditional"}]}}""" def test_serialization_deserialization(): - """Exercise all serialization and deserialization methods of classes in mldaikon.invariants.base_cls module""" + """Exercise all serialization and deserialization methods of classes in traincheck.invariants.base_cls module""" for line in invariants_json_str.split("\n"): inv_dict = json.loads(line) diff --git a/tests/test_tensor_hash/test_cpu_gpu_hashing_result.py b/tests/test_tensor_hash/test_cpu_gpu_hashing_result.py index ad7348bb..24e9905e 100644 --- a/tests/test_tensor_hash/test_cpu_gpu_hashing_result.py +++ b/tests/test_tensor_hash/test_cpu_gpu_hashing_result.py @@ -1,6 +1,5 @@ import torch - -from mldaikon.proxy_wrapper.hash import ( +from traincheck.proxy_wrapper.hash import ( _reduce_last_axis, hash_tensor_cpu, hash_tensor_cuda, diff --git a/tests/test_tensor_hash/test_hash.py b/tests/test_tensor_hash/test_hash.py index 8e34dbc6..bca8a450 100644 --- a/tests/test_tensor_hash/test_hash.py +++ b/tests/test_tensor_hash/test_hash.py @@ -6,9 +6,8 @@ import torch.nn as nn import torch.optim as optim from torch.nn.parallel import DistributedDataParallel as DDP - -from mldaikon import annotate_stage -from mldaikon.instrumentor import meta_vars +from traincheck import annotate_stage +from traincheck.instrumentor import meta_vars class ToyModel(nn.Module): @@ -17,7 +16,7 @@ def __init__(self): self.net1 = torch.nn.Linear(10, 10) self.relu = torch.nn.ReLU() self.net2 = torch.nn.Linear(10, 5) - from mldaikon.proxy_wrapper.hash import tensor_hash + from traincheck.proxy_wrapper.hash import tensor_hash print("Perform tensor hash in init") result = tensor_hash(self.net1.weight) @@ -25,7 +24,7 @@ def __init__(self): def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.relu(self.net1(x)) - # from mldaikon.proxy_wrapper.hash import tensor_hash + # from traincheck.proxy_wrapper.hash import tensor_hash # print("Perform tensor hash in forward pass") # result = tensor_hash(x) # print("Tensor hash result: ", result) @@ -58,7 +57,7 @@ def _ddp_run_single_step( # create model and move it to GPU with id rank model = ToyModel() - from mldaikon.proxy_wrapper.hash import tensor_hash + from traincheck.proxy_wrapper.hash import tensor_hash print("1. Perform tensor hash in init (before moving to GPU)") result = tensor_hash(model.net1.weight) @@ -66,7 +65,7 @@ def _ddp_run_single_step( model.to(rank) # dump hash of the model - from mldaikon.proxy_wrapper.hash import tensor_hash + from traincheck.proxy_wrapper.hash import tensor_hash print("2. 
Perform tensor hash in init (after moving to GPU)") result = tensor_hash(model.net1.weight) diff --git a/tests/test_tensor_hash/test_hash_tensor_cuda.py b/tests/test_tensor_hash/test_hash_tensor_cuda.py index 2f161a81..a347f0c5 100644 --- a/tests/test_tensor_hash/test_hash_tensor_cuda.py +++ b/tests/test_tensor_hash/test_hash_tensor_cuda.py @@ -1,6 +1,5 @@ import torch - -from mldaikon.proxy_wrapper.hash import tensor_hash +from traincheck.proxy_wrapper.hash import tensor_hash def test_model_hash(device): diff --git a/tests/test_trace_analyzer/test_trace_analyzer.py b/tests/test_trace_analyzer/test_trace_analyzer.py index a086447c..41d3d39d 100644 --- a/tests/test_trace_analyzer/test_trace_analyzer.py +++ b/tests/test_trace_analyzer/test_trace_analyzer.py @@ -1,4 +1,4 @@ -from mldaikon.toolkit.analyze_trace import diff_lists_of_dicts +from traincheck.toolkit.analyze_trace import diff_lists_of_dicts # Example usage: list1 = [ diff --git a/tests/test_trace_pandas.py b/tests/test_trace_pandas.py index 97487b9e..e51bcf36 100644 --- a/tests/test_trace_pandas.py +++ b/tests/test_trace_pandas.py @@ -3,8 +3,8 @@ import unittest # Import the module to test -from mldaikon.trace.trace_pandas import TracePandas, read_trace_file_Pandas -from mldaikon.trace.trace_polars import TracePolars, read_trace_file_polars +from traincheck.trace.trace_pandas import TracePandas, read_trace_file_Pandas +from traincheck.trace.trace_polars import TracePolars, read_trace_file_polars # import modin.pandas as pd # import polars diff --git a/tests/test_var_tracker/test_type_2_isinstance.py b/tests/test_var_tracker/test_type_2_isinstance.py index d3d8db47..d56e6a16 100644 --- a/tests/test_var_tracker/test_type_2_isinstance.py +++ b/tests/test_var_tracker/test_type_2_isinstance.py @@ -4,8 +4,8 @@ import astor -def type_handle_mldaikon_proxy(x): - if hasattr(x, "is_ml_daikon_proxied_obj"): +def type_handle_traincheck_proxy(x): + if hasattr(x, "is_traincheck_proxied_obj"): return type(x._obj) return type(x) @@ -20,9 +20,9 @@ def visit_Call(self, node): and node.func.id == "type" and len(node.args) == 1 ): - # Replace type(xxx) with type_handle_mldaikon_proxy(xxx) + # Replace type(xxx) with type_handle_traincheck_proxy(xxx) new_node = ast.Call( - func=ast.Name(id="type_handle_mldaikon_proxy", ctx=ast.Load()), + func=ast.Name(id="type_handle_traincheck_proxy", ctx=ast.Load()), args=node.args, keywords=[], ) diff --git a/tests/test_wrap_torchVF/test_wrap_torchVF.py b/tests/test_wrap_torchVF/test_wrap_torchVF.py index 04e9c7f4..c2815952 100644 --- a/tests/test_wrap_torchVF/test_wrap_torchVF.py +++ b/tests/test_wrap_torchVF/test_wrap_torchVF.py @@ -5,7 +5,7 @@ def is_proxied(obj): try: - if obj is not None and "is_ml_daikon_proxied_obj" in obj.__dict__: + if obj is not None and "is_traincheck_proxied_obj" in obj.__dict__: return True except Exception: return False diff --git a/mldaikon/__init__.py b/traincheck/__init__.py similarity index 86% rename from mldaikon/__init__.py rename to traincheck/__init__.py index 5a6e42f2..e8556f53 100644 --- a/mldaikon/__init__.py +++ b/traincheck/__init__.py @@ -1,4 +1,4 @@ -from mldaikon.developer.annotations import ( +from traincheck.developer.annotations import ( annotate_answer_start_token_ids, annotate_stage, ) diff --git a/mldaikon/checker.py b/traincheck/checker.py similarity index 93% rename from mldaikon/checker.py rename to traincheck/checker.py index 1c7a7418..ac56c907 100644 --- a/mldaikon/checker.py +++ b/traincheck/checker.py @@ -6,9 +6,9 @@ from tqdm import tqdm -from 
mldaikon.invariant import CheckerResult, Invariant, read_inv_file -from mldaikon.trace import MDNONEJSONEncoder, Trace, select_trace_implementation -from mldaikon.utils import register_custom_excepthook +from traincheck.invariant import CheckerResult, Invariant, read_inv_file +from traincheck.trace import MDNONEJSONEncoder, Trace, select_trace_implementation +from traincheck.utils import register_custom_excepthook register_custom_excepthook() @@ -50,7 +50,7 @@ def check_engine( return results -if __name__ == "__main__": +def main(): parser = argparse.ArgumentParser( description="(Offline) Invariant Checker for ML Pipelines in Python" ) @@ -101,7 +101,7 @@ def check_engine( "-o", "--output-dir", type=str, - help="Output folder to store the results, defaulted to mldaikon_checker_results_{timestamp}/", + help="Output folder to store the results, defaulted to traincheck_checker_results_{timestamp}/", ) args = parser.parse_args() @@ -127,7 +127,7 @@ def check_engine( time_now = f"{time_now}_relation_first_{args.check_relation_first}" # set logging to a file logging.basicConfig( - filename=f"mldaikon_checker_{time_now}.log", + filename=f"traincheck_checker_{time_now}.log", level=log_level, ) @@ -140,7 +140,7 @@ def check_engine( # create the output folder if not exists if not args.output_dir: - args.output_dir = f"mldaikon_checker_results_{time_now}" + args.output_dir = f"traincheck_checker_results_{time_now}" os.makedirs(args.output_dir, exist_ok=True) # copy the invariants to the output folder @@ -197,7 +197,7 @@ def check_engine( len(results_per_trace), ) logger.info( - "Total invariants that's not triggered: %d/%d", + "Total invariants that are not triggered: %d/%d", len(results_per_trace_not_triggered), len(results_per_trace), ) @@ -238,3 +238,7 @@ def check_engine( if res.check_passed and res.triggered: json.dump(res.to_dict(), f, indent=4, cls=MDNONEJSONEncoder) f.write("\n") + + +if __name__ == "__main__": + main() diff --git a/mldaikon/collect_trace.py b/traincheck/collect_trace.py similarity index 95% rename from mldaikon/collect_trace.py rename to traincheck/collect_trace.py index 20bb37db..48bcfe87 100644 --- a/mldaikon/collect_trace.py +++ b/traincheck/collect_trace.py @@ -5,12 +5,12 @@ import yaml -import mldaikon.config.config as config -import mldaikon.instrumentor as instrumentor -import mldaikon.proxy_wrapper.proxy_config as proxy_config -import mldaikon.runner as runner -from mldaikon.config.config import InstrOpt -from mldaikon.invariant.base_cls import ( +import traincheck.config.config as config +import traincheck.instrumentor as instrumentor +import traincheck.proxy_wrapper.proxy_config as proxy_config +import traincheck.runner as runner +from traincheck.config.config import InstrOpt +from traincheck.invariant.base_cls import ( APIParam, Arguments, InputOutputParam, @@ -19,8 +19,8 @@ VarTypeParam, read_inv_file, ) -from mldaikon.invariant.consistency_relation import ConsistencyRelation -from mldaikon.invariant.contain_relation import VAR_GROUP_NAME, APIContainRelation +from traincheck.invariant.consistency_relation import ConsistencyRelation +from traincheck.invariant.contain_relation import VAR_GROUP_NAME, APIContainRelation def get_list_of_funcs_from_invariants(invariants: list[Invariant]) -> list[str]: @@ -43,7 +43,7 @@ def get_per_func_instr_opts( """ # TODO: for APIContainRelation that describes a variable, if the precondition is not unconditional on the variable and the API belongs to a class, then all class methods should be instrumented with `scan_proxy_in_args` 
set to True - + logger = logging.getLogger(__name__) func_instr_opts: dict[str, dict[str, bool | dict]] = {} for inv in invariants: for param in inv.params: @@ -170,7 +170,7 @@ def get_disable_proxy_dumping(invariants: list[Invariant]) -> bool: return True -def dump_env(output_dir: str): +def dump_env(args, output_dir: str): with open(os.path.join(output_dir, "env_dump.txt"), "w") as f: f.write("Arguments:\n") for arg in vars(args): @@ -189,10 +189,11 @@ def dump_env(output_dir: str): ) # FIXME: conda list here doesn't work in OSX, >>> import os; >>> os.popen('conda list').read(); /bin/sh: conda: command not found -def get_default_output_folder(args: argparse.Namespace) -> str: +def get_default_output_folder(args: argparse.Namespace, start_time) -> str: """Get the default output directory for the trace collection Note that the output is only the folder name, not an absolute path """ + logger = logging.getLogger(__name__) pyfile_basename = os.path.basename(args.pyscript).split(".")[0] # get also the versions of the modules specified in `-t` modules = args.modules_to_instr @@ -212,7 +213,7 @@ def get_default_output_folder(args: argparse.Namespace) -> str: modules_and_versions.append(f"{module}_{version}") # sort the modules and versions modules_and_versions.sort() - output_folder = f"mldaikon_run_{pyfile_basename}_{'_'.join(modules_and_versions)}_{START_TIME.strftime('%Y-%m-%d_%H-%M-%S')}" + output_folder = f"traincheck_run_{pyfile_basename}_{'_'.join(modules_and_versions)}_{start_time.strftime('%Y-%m-%d_%H-%M-%S')}" return output_folder @@ -230,7 +231,7 @@ def is_path_md_output_dir(output_dir: str) -> bool: return False -if __name__ == "__main__": +def main(): # First parse the deciding arguments. use_config_args_parser = argparse.ArgumentParser(add_help=False) use_config_args_parser.add_argument( @@ -282,7 +283,7 @@ def is_path_md_output_dir(output_dir: str) -> bool: type=str, default="", help="""Directory to store the output files, if not provided, it will be - defaulted to mldaikon_run_{pyscript_name}_{timestamp}""", + defaulted to traincheck_run_{pyscript_name}_{timestamp}""", ) parser.add_argument( "--only-instr", @@ -403,11 +404,11 @@ def is_path_md_output_dir(output_dir: str) -> bool: output_dir = args.output_dir if not output_dir: - output_dir = get_default_output_folder(args) + output_dir = get_default_output_folder(args, START_TIME) output_dir = os.path.abspath(output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir) - dump_env(output_dir) + dump_env(args, output_dir) # set up adjusted proxy_config proxy_basic_config: dict[str, int | bool | str] = {} @@ -533,3 +534,7 @@ def is_path_md_output_dir(output_dir: str) -> bool: exit(return_code) logger.info("Trace collection done.") + + +if __name__ == "__main__": + main() diff --git a/mldaikon/config/__init__.py b/traincheck/config/__init__.py similarity index 100% rename from mldaikon/config/__init__.py rename to traincheck/config/__init__.py diff --git a/mldaikon/config/config.py b/traincheck/config/config.py similarity index 99% rename from mldaikon/config/config.py rename to traincheck/config/config.py index a4531e87..b8027988 100644 --- a/mldaikon/config/config.py +++ b/traincheck/config/config.py @@ -14,7 +14,7 @@ } # tracer + instrumentor configs -TMP_FILE_PREFIX = "_ml_daikon_" +TMP_FILE_PREFIX = "_traincheck_" INSTR_OPTS_FILE = "instr_opts.json" INSTR_MODULES_TO_INSTR = ["torch"] INSTR_MODULES_TO_SKIP = [ @@ -214,7 +214,7 @@ def should_disable_proxy_dumping() -> bool: "thread_id", "dumped_frame_array", 
"func_call_id", - "mldaikon_folder", + "traincheck_folder", "enable_auto_observer_depth", "neglect_hidden_func", "neglect_hidden_module", diff --git a/mldaikon/developer/README.md b/traincheck/developer/README.md similarity index 100% rename from mldaikon/developer/README.md rename to traincheck/developer/README.md diff --git a/mldaikon/developer/annotations.py b/traincheck/developer/annotations.py similarity index 92% rename from mldaikon/developer/annotations.py rename to traincheck/developer/annotations.py index df2cb33e..b89b1827 100644 --- a/mldaikon/developer/annotations.py +++ b/traincheck/developer/annotations.py @@ -1,6 +1,6 @@ -import mldaikon.instrumentor.tracer as tracer -from mldaikon.config.config import ALL_STAGE_NAMES -from mldaikon.instrumentor import meta_vars +import traincheck.instrumentor.tracer as tracer +from traincheck.config.config import ALL_STAGE_NAMES +from traincheck.instrumentor import meta_vars def annotate_stage(stage_name: str): diff --git a/mldaikon/e2e/Changelog.md b/traincheck/e2e/Changelog.md similarity index 100% rename from mldaikon/e2e/Changelog.md rename to traincheck/e2e/Changelog.md diff --git a/mldaikon/e2e/README.md b/traincheck/e2e/README.md similarity index 100% rename from mldaikon/e2e/README.md rename to traincheck/e2e/README.md diff --git a/mldaikon/e2e/__init__.py b/traincheck/e2e/__init__.py similarity index 100% rename from mldaikon/e2e/__init__.py rename to traincheck/e2e/__init__.py diff --git a/mldaikon/e2e/config.py b/traincheck/e2e/config.py similarity index 80% rename from mldaikon/e2e/config.py rename to traincheck/e2e/config.py index d21f79ff..2a730833 100644 --- a/mldaikon/e2e/config.py +++ b/traincheck/e2e/config.py @@ -7,4 +7,4 @@ EXAMPLE_PIPELINES_DIR = os.path.join(current_dir, "../../machine-learning-issues") input_env = { "PYTORCH_JIT": "0" -} # should appear at the start of the mldaikon.collect_trace running command +} # should appear at the start of the traincheck.collect_trace running command diff --git a/mldaikon/e2e/runner.py b/traincheck/e2e/runner.py similarity index 93% rename from mldaikon/e2e/runner.py rename to traincheck/e2e/runner.py index 256d5c56..9926e57f 100644 --- a/mldaikon/e2e/runner.py +++ b/traincheck/e2e/runner.py @@ -46,9 +46,9 @@ def run_e2e( input_config: dict[str, str], input_env: dict[str, str], ) -> int: - # this is the end to end invariant generation pipeline for mldaikon project + # this is the end to end invariant generation pipeline for traincheck project # input_program: the path to the python script to be run (should be uninstrumented user's script) - # input_config: the configuration for the mldaikon project + # input_config: the configuration for the traincheck project # input_env: the environment variables for the script input_program: str = input_config["input_program"] # with -p flag modules_to_instrument: str = input_config["modules_to_instrument"] # with -t flag @@ -59,7 +59,7 @@ def run_e2e( # run the script with the given arguments and environment variables trace_collector_script_args: list[str] = [ "-m", - "mldaikon.collect_trace", + "traincheck.collect_trace", "-p", input_program, "-t", @@ -107,7 +107,7 @@ def run_e2e( return return_code ## Activate the Infer Engine - # example: python -m mldaikon.infer_engine -t /proxy_trace_processed_* / + # example: python -m traincheck.infer_engine -t /proxy_trace_processed_* / # trace_folder = os.path.join(output_dir, "trace_log") proxy_folder = os.path.join(output_dir, "processed_proxy_traces") @@ -120,7 +120,7 @@ def run_e2e( 
infer_engine_script_args: list[str] = [ "-m", - "mldaikon.infer_engine", + "traincheck.infer_engine", "-o", f"{output_dir}/invariants.json", "-t", diff --git a/mldaikon/e2e_runner.py b/traincheck/e2e_runner.py similarity index 94% rename from mldaikon/e2e_runner.py rename to traincheck/e2e_runner.py index 47c6a6d0..bd6e446f 100644 --- a/mldaikon/e2e_runner.py +++ b/traincheck/e2e_runner.py @@ -3,8 +3,8 @@ import os import sys -import mldaikon.e2e.config as e2e_config -from mldaikon.e2e.runner import find_files, run_e2e +import traincheck.e2e.config as e2e_config +from traincheck.e2e.runner import find_files, run_e2e def read_config(config_path: str) -> dict[str, str]: @@ -99,12 +99,12 @@ def parse_input_env(input_env: str) -> dict[str, str]: if not os.path.isfile(input_program): input_program_dir = os.path.join(example_pipelines_dir, script_name) input_program_list = find_files(input_program_dir, prefix="", suffix=".py") - # input program should not include _ml_daikon at the beginning of the name - # e.g. '../../example_pipelines/LT-725/_ml_daikon_LT725.py' is not a valid input program + # input program should not include _traincheck at the beginning of the name + # e.g. '../../example_pipelines/LT-725/_traincheck_LT725.py' is not a valid input program input_program_list = [ file for file in input_program_list - if not os.path.basename(file).startswith("_ml_daikon") + if not os.path.basename(file).startswith("_traincheck") ] input_bash_script_list = find_files(input_program_dir, prefix="", suffix=".sh") input_bash_script_list = [ diff --git a/mldaikon/infer_engine.py b/traincheck/infer_engine.py similarity index 83% rename from mldaikon/infer_engine.py rename to traincheck/infer_engine.py index 7d35b8cb..5a7dcdc5 100644 --- a/mldaikon/infer_engine.py +++ b/traincheck/infer_engine.py @@ -8,8 +8,8 @@ from tqdm import tqdm -import mldaikon.config.config as config -from mldaikon.invariant import ( +import traincheck.config.config as config +from traincheck.invariant import ( FailedHypothesis, Hypothesis, Invariant, @@ -17,8 +17,8 @@ find_precondition, relation_pool, ) -from mldaikon.trace import MDNONEJSONEncoder, select_trace_implementation -from mldaikon.utils import register_custom_excepthook +from traincheck.trace import MDNONEJSONEncoder, select_trace_implementation +from traincheck.utils import register_custom_excepthook register_custom_excepthook() @@ -52,11 +52,15 @@ def generate_hypothesis(self) -> dict[Hypothesis, list[int]]: dict[Hypothesis, list[int]]: A dictionary mapping hypotheses to the indices of traces that support them """ - logger.info("Generating hypotheses") + logger.info("============= GENERATING HYPOTHESIS =============") hypotheses_and_trace_idxs: dict[Hypothesis, list[int]] = {} hypo_lookup = {} # Dictionary for O(1) lookup of hypotheses - for trace_idx, trace in enumerate(tqdm(self.traces, desc="Scanning Traces")): - for relation in relation_pool: + for trace_idx, trace in enumerate(self.traces): + logger.info(f"Processing trace {trace_idx + 1}/{len(self.traces)}") + for relation_idx, relation in enumerate(relation_pool): + logger.info( + f"Processing relation {relation_idx + 1}/{len(relation_pool)}: {relation.__name__}" + ) if self.disabled_relations and relation in self.disabled_relations: logger.info( f"Skipping relation {relation.__name__} as it is disabled" @@ -65,7 +69,10 @@ def generate_hypothesis(self) -> dict[Hypothesis, list[int]]: logger.info(f"Generating hypotheses for relation: {relation.__name__}") inferred_hypos = relation.generate_hypothesis(trace) 
                logger.info(
-                    f"Found {len(inferred_hypos)} hypotheses for relation: {relation.__name__}"
+                    f"Found {len(inferred_hypos)} hypotheses for relation: {relation.__name__} on trace {trace_idx + 1}/{len(self.traces)}"
+                )
+                logger.info(
+                    f"Merging hypotheses with existing ones, number of existing ones: {len(hypotheses_and_trace_idxs)}"
                 )
                 for hypo in tqdm(
                     inferred_hypos, desc="Merging Hypotheses with existing ones"
@@ -95,16 +102,25 @@
                     ) == orig_num_neg_exps + len(
                         hypo.negative_examples
                     ), f"Expected {orig_num_neg_exps} + {len(hypo.negative_examples)} negative examples, got {len(hypo_lookup[hypo].negative_examples)}"
-
+            logger.info(f"Finished processing trace {trace_idx + 1}/{len(self.traces)}")
+        logger.info(
+            f"Finished generating hypotheses, found {len(hypotheses_and_trace_idxs)} hypotheses"
+        )
         return hypotheses_and_trace_idxs

     def collect_examples(self, hypotheses: dict[Hypothesis, list[int]]):
-        logger.info("Collecting examples")
+        logger.info("============= COLLECTING EXAMPLES =============")
+        logger.info(f"Start collecting examples for {len(hypotheses)} hypotheses")
         for hypo, trace_idxs in hypotheses.items():
+            logger.info(
+                f"Collecting examples for hypothesis: {hypo.invariant.text_description}"
+            )
             for trace_idx, trace in enumerate(self.traces):
                 if trace_idx in trace_idxs:
                     continue
-                logger.info(f"Collecting examples for hypothesis: {hypo}")
+                logger.info(
+                    f"Collecting examples for hypothesis: {hypo} on trace {trace_idx + 1}/{len(self.traces)}"
+                )
                 hypo.invariant.relation.collect_examples(trace, hypo)

     def prune_incorrect_hypos(self, hypotheses: dict[Hypothesis, list[int]]):
@@ -121,14 +137,18 @@
     def infer_precondition(self, hypotheses: dict[Hypothesis, list[int]]):
         """TODO: move the precondition inference driving code into Hypothesis.get_invariant()"""
-
+        logger.info("============= INFERRING PRECONDITIONS =============")
+        logger.info(f"Inferring preconditions for {len(hypotheses)} hypotheses")
         all_hypotheses: list[Hypothesis] = []
         for hypo in hypotheses:
             all_hypotheses.append(hypo)

         invariants = []
         failed_hypos = []
-        for hypothesis in all_hypotheses:
+        for hypo_idx, hypothesis in enumerate(all_hypotheses):
+            logger.info(
+                f"Inferring precondition for hypothesis {hypo_idx + 1}/{len(all_hypotheses)}: {hypothesis.invariant.text_description}"
+            )
             precondition = find_precondition(hypothesis, self.traces)
             if precondition is None:
                 failed_hypos.append(FailedHypothesis(hypothesis))
@@ -152,7 +172,7 @@ def save_failed_hypos(failed_hypos: list[FailedHypothesis], output_file: str):
             f.write("\n")

-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser(
         description="Invariant Finder for ML Pipelines in Python"
     )
@@ -227,9 +247,11 @@ def save_failed_hypos(failed_hypos: list[FailedHypothesis], output_file: str):
     else:
         log_level = logging.INFO

+    # get current process ID
+    pid = os.getpid()
     # set logging to a file
     logging.basicConfig(
-        filename=f'mldaikon_infer_engine_{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log',
+        filename=f'traincheck_infer_engine_{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_{pid}.log',
         level=log_level,
         format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)s - %(funcName)20s()] - %(message)s",
     )
@@ -292,3 +314,7 @@ def save_failed_hypos(failed_hypos: list[FailedHypothesis], output_file: str):
     save_invs(invs, args.output)
     save_failed_hypos(failed_hypos, args.output + ".failed")
+
+
+if
__name__ == "__main__": + main() diff --git a/mldaikon/instrumentor/VFProxy.py b/traincheck/instrumentor/VFProxy.py similarity index 95% rename from mldaikon/instrumentor/VFProxy.py rename to traincheck/instrumentor/VFProxy.py index 8c5d6b8c..aeb9bb72 100644 --- a/mldaikon/instrumentor/VFProxy.py +++ b/traincheck/instrumentor/VFProxy.py @@ -3,7 +3,7 @@ def is_proxied(obj): try: - if obj is not None and "is_ml_daikon_proxied_obj" in obj.__dict__: + if obj is not None and "is_traincheck_proxied_obj" in obj.__dict__: return True except Exception: return False diff --git a/mldaikon/instrumentor/__init__.py b/traincheck/instrumentor/__init__.py similarity index 100% rename from mldaikon/instrumentor/__init__.py rename to traincheck/instrumentor/__init__.py diff --git a/mldaikon/instrumentor/caches.py b/traincheck/instrumentor/caches.py similarity index 78% rename from mldaikon/instrumentor/caches.py rename to traincheck/instrumentor/caches.py index 1ce0b2db..cf07b195 100644 --- a/mldaikon/instrumentor/caches.py +++ b/traincheck/instrumentor/caches.py @@ -1,6 +1,6 @@ from collections import defaultdict -from mldaikon.instrumentor.types import PTID +from traincheck.instrumentor.types import PTID cache_meta_vars: dict[PTID, dict[str, dict]] = defaultdict(lambda: defaultdict(dict)) meta_vars: dict[str, object] = { diff --git a/mldaikon/instrumentor/dumper.py b/traincheck/instrumentor/dumper.py similarity index 94% rename from mldaikon/instrumentor/dumper.py rename to traincheck/instrumentor/dumper.py index 9b3aef53..05c1192d 100644 --- a/mldaikon/instrumentor/dumper.py +++ b/traincheck/instrumentor/dumper.py @@ -9,23 +9,21 @@ import orjson import torch -from mldaikon.config.config import ( +from traincheck.config.config import ( BUFFER_SIZE, FLUSH_INTERVAL, RECURSION_ERR_THRESHOLD, TYPE_ERR_THRESHOLD, ) -from mldaikon.proxy_wrapper.proxy_config import ( + +# if torch.cuda.is_available(): +from traincheck.proxy_wrapper.hash import tensor_hash +from traincheck.proxy_wrapper.proxy_config import ( attribute_black_list, primitive_types, tensor_dump_format, ) - -if torch.cuda.is_available(): - from mldaikon.proxy_wrapper.hash import tensor_hash - -from mldaikon.proxy_wrapper.utils import print_debug -from mldaikon.utils import get_timestamp_ns, typename +from traincheck.utils import get_timestamp_ns, typename DEBUG = os.environ.get("ML_DAIKON_DEBUG", False) THREAD_DATA = threading.local() @@ -229,8 +227,8 @@ def get_instrumentation_logger_for_process(): def tensor_stats(tensor: torch.Tensor): - if hasattr(tensor, "mldaikon_tensor_stats"): - return tensor.mldaikon_tensor_stats + if hasattr(tensor, "traincheck_tensor_stats"): + return tensor.traincheck_tensor_stats min = float(tensor.min().item()) max = float(tensor.max().item()) mean = float(tensor.mean().item()) @@ -243,7 +241,7 @@ def tensor_stats(tensor: torch.Tensor): "std": std, "shape": shape, } - tensor.mldaikon_tensor_stats = result # type: ignore + tensor.traincheck_tensor_stats = result # type: ignore return result @@ -253,18 +251,12 @@ def dump_tensor(value): if tensor_dump_format["dump_tensor_stats"]: param_list = tensor_stats(value) elif tensor_dump_format["dump_tensor_hash"]: - if not IS_CUDA_AVAILABLE: - raise Exception( - "CUDA is not available, cannot dump tensor hash, please set '--tensor-dump-format' to 'full' or 'stats'." 
- ) - try: - # perform tensor hash a deep copy of the tensor + if value.is_cuda: param_list = tensor_hash(value, with_parallel=True, with_cuda=True) - except Exception as e: - print_debug( - f"Failed to dump tensor hash, error: {e}, fullback to cpu hashing." - ) + else: + # TODO: support quick hashing methods for MPS tensors param_list = tensor_hash(value, with_parallel=True, with_cuda=False) + elif tensor_dump_format["dump_tensor_full"]: param_list = value.detach().flatten().tolist() else: diff --git a/mldaikon/instrumentor/replace_functions.py b/traincheck/instrumentor/replace_functions.py similarity index 92% rename from mldaikon/instrumentor/replace_functions.py rename to traincheck/instrumentor/replace_functions.py index a1cb93cc..2d343519 100644 --- a/mldaikon/instrumentor/replace_functions.py +++ b/traincheck/instrumentor/replace_functions.py @@ -2,8 +2,8 @@ import torch.optim.optimizer as optimizer_ -from mldaikon.proxy_wrapper.proxy_basics import adapt_func_for_proxy -from mldaikon.utils import typename +from traincheck.proxy_wrapper.proxy_basics import adapt_func_for_proxy +from traincheck.utils import typename def is_funcs_to_be_unproxied(original_func): diff --git a/mldaikon/instrumentor/source_file.py b/traincheck/instrumentor/source_file.py similarity index 94% rename from mldaikon/instrumentor/source_file.py rename to traincheck/instrumentor/source_file.py index f9cfb0e6..03662302 100644 --- a/mldaikon/instrumentor/source_file.py +++ b/traincheck/instrumentor/source_file.py @@ -2,7 +2,7 @@ import logging import re -from mldaikon.config.config import INSTR_MODULES_TO_INSTR +from traincheck.config.config import INSTR_MODULES_TO_INSTR logger = logging.getLogger(__name__) @@ -43,7 +43,7 @@ def __init__( def get_instrument_node(self, module_name: str): return ast.parse( - f"from mldaikon.instrumentor.tracer import Instrumentor; Instrumentor({module_name}, scan_proxy_in_args={self.scan_proxy_in_args}, use_full_instr={self.use_full_instr}, funcs_to_instr={str(self.funcs_to_instr)}, API_dump_stack_trace={self.API_dump_stack_trace}).instrument()" + f"from traincheck.instrumentor.tracer import Instrumentor; Instrumentor({module_name}, scan_proxy_in_args={self.scan_proxy_in_args}, use_full_instr={self.use_full_instr}, funcs_to_instr={str(self.funcs_to_instr)}, API_dump_stack_trace={self.API_dump_stack_trace}).instrument()" ).body def visit_Import(self, node): @@ -353,35 +353,35 @@ def instrument_model_tracker_proxy( if proxy_basic_config: if "proxy_log_dir" not in proxy_basic_config: - from mldaikon.proxy_wrapper.proxy_config import proxy_log_dir + from traincheck.proxy_wrapper.proxy_config import proxy_log_dir proxy_basic_config["proxy_log_dir"] = proxy_log_dir proxy_start_code += f""" -import mldaikon.proxy_wrapper.proxy_config as proxy_config +import traincheck.proxy_wrapper.proxy_config as proxy_config proxy_config.__dict__.update({proxy_basic_config}) """ if tensor_dump_format: proxy_start_code += f""" -from mldaikon.proxy_wrapper.proxy_config import tensor_dump_format +from traincheck.proxy_wrapper.proxy_config import tensor_dump_format tensor_dump_format.update({tensor_dump_format}) """ proxy_start_code += """ -from mldaikon.proxy_wrapper.proxy import Proxy +from traincheck.proxy_wrapper.proxy import Proxy """ if auto_observer_config["enable_auto_observer"]: auto_observer_code = """ import glob import importlib -from mldaikon.proxy_wrapper.proxy_config import auto_observer_config -spec = importlib.util.find_spec('mldaikon') +from traincheck.proxy_wrapper.proxy_config import 
auto_observer_config +spec = importlib.util.find_spec('traincheck') if spec and spec.origin: - mldaikon_folder = os.path.dirname(spec.origin) - print("mldaikon folder: ", mldaikon_folder) + traincheck_folder = os.path.dirname(spec.origin) + print("traincheck folder: ", traincheck_folder) else: - raise Exception("mldaikon is not installed properly") + raise Exception("traincheck is not installed properly") print("auto observer enabled with observing depth: ", auto_observer_config["enable_auto_observer_depth"]) enable_auto_observer_depth = auto_observer_config["enable_auto_observer_depth"] neglect_hidden_func = auto_observer_config["neglect_hidden_func"] @@ -392,10 +392,10 @@ def instrument_model_tracker_proxy( print("observe up to the depth of the function call") else: print("observe only the function call at the depth") -from mldaikon.static_analyzer.graph_generator.call_graph_parser import add_observer_given_call_graph +from traincheck.static_analyzer.graph_generator.call_graph_parser import add_observer_given_call_graph log_files = glob.glob( - os.path.join(mldaikon_folder, "static_analyzer", "func_level", "*.log") + os.path.join(traincheck_folder, "static_analyzer", "func_level", "*.log") ) print("log_files: ", log_files) for log_file in log_files: @@ -494,7 +494,7 @@ def instrument_model_tracker_sampler( ) code_head, code_tail = get_code_head_and_tail(source) - sampler_import_code = "from mldaikon.instrumentor import VarSampler" + sampler_import_code = "from traincheck.instrumentor import VarSampler" source = code_head + "\n" + sampler_import_code + "\n" + code_tail return source @@ -538,7 +538,7 @@ def instrument_file( """ debug_hook_code = """ -from mldaikon.utils import register_custom_excepthook +from traincheck.utils import register_custom_excepthook if os.environ.get("ML_DAIKON_DEBUG") == "1": print("ML_DAIKON_DEBUG is set to 1, registering custom excepthook") register_custom_excepthook(True) @@ -546,7 +546,7 @@ def instrument_file( # general config update general_config_update = f""" -import mldaikon.config.config as general_config +import traincheck.config.config as general_config general_config.INSTR_DESCRIPTORS = {instr_descriptors} """ # TODO: move the INSTR_DESCRIPTORS to the instr_opts file diff --git a/mldaikon/instrumentor/tracer.py b/traincheck/instrumentor/tracer.py similarity index 97% rename from mldaikon/instrumentor/tracer.py rename to traincheck/instrumentor/tracer.py index 5ed8ccaa..127e899d 100644 --- a/mldaikon/instrumentor/tracer.py +++ b/traincheck/instrumentor/tracer.py @@ -11,28 +11,28 @@ import torch -import mldaikon.config.config as config # needed to allow for change of values after import -from mldaikon.config.config import ( +import traincheck.config.config as config # needed to allow for change of values after import +from traincheck.config.config import ( INSTR_MODULES_TO_SKIP, SKIP_INSTR_APIS, WRAP_WITHOUT_DUMP, ) -from mldaikon.instrumentor.caches import meta_vars -from mldaikon.instrumentor.dumper import ( +from traincheck.instrumentor.caches import meta_vars +from traincheck.instrumentor.dumper import ( convert_var_to_dict, dump_trace_API, dump_trace_VAR, get_instrumentation_logger_for_process, var_to_serializable, ) -from mldaikon.instrumentor.replace_functions import ( +from traincheck.instrumentor.replace_functions import ( funcs_to_be_replaced, is_funcs_to_be_unproxied, ) -from mldaikon.proxy_wrapper.proxy_basics import is_proxied, unproxy_func -from mldaikon.proxy_wrapper.proxy_config import enable_C_level_observer -from 
mldaikon.proxy_wrapper.proxy_registry import get_global_registry -from mldaikon.utils import get_timestamp_ns, get_unique_id, typename +from traincheck.proxy_wrapper.proxy_basics import is_proxied, unproxy_func +from traincheck.proxy_wrapper.proxy_config import enable_C_level_observer +from traincheck.proxy_wrapper.proxy_registry import get_global_registry +from traincheck.utils import get_timestamp_ns, get_unique_id, typename _instancemethod_t = type(torch._C._distributed_c10d.ProcessGroup.broadcast) @@ -42,7 +42,7 @@ DISABLE_WRAPPER = False -# for prompt generation tasks using the transformers library (see mldaikon/developer/instr_stage_annotation.py:annotate_answer_start_token_ids) +# for prompt generation tasks using the transformers library (see traincheck/developer/instr_stage_annotation.py:annotate_answer_start_token_ids) GENERATE_START_TOKEN_ID: None | int = None GENERATE_START_TOKEN_ID_INCLUDE_START_TOKEN = False @@ -51,7 +51,7 @@ THREAD_DATA = threading.local() -logger = logging.getLogger("mldaikon.instrumentor.tracer") +logger = logging.getLogger("traincheck.instrumentor.tracer") class TraceLineType: @@ -257,7 +257,7 @@ def find_proxy_in_args(args): if handle_proxy: if enable_C_level_observer and is_builtin: - from mldaikon.proxy_wrapper.proxy_observer import ( + from traincheck.proxy_wrapper.proxy_observer import ( add_observer_to_func, # import here to avoid circular import ) @@ -465,8 +465,8 @@ def wrapped(*args, **kwargs): else: return original_function - wrapped._ml_daikon_original_function = original_function - wrapped._ml_daikon_instrumented = True + wrapped._traincheck_original_function = original_function + wrapped._traincheck_instrumented = True return wrapped @@ -514,7 +514,7 @@ def is_API_instrumented(obj: Callable) -> bool: # APIs has to be marked with a flag as ids will be changed after instrumentation, and also having the same id would mean that the object is not instrumented (e.g. 
multiple references to the same object) try: # we cannot use hasattr as it would trigger the __getattr__ method of the object, and can lead to exceptions at https://github.com/pytorch/pytorch/blob/main/torch/_ops.py#L1029-L1031 - return obj.__dict__.get("_ml_daikon_instrumented", False) + return obj.__dict__.get("_traincheck_instrumented", False) except Exception: # a wrapped API would have __dict__ and have the flag return False diff --git a/mldaikon/instrumentor/types.py b/traincheck/instrumentor/types.py similarity index 100% rename from mldaikon/instrumentor/types.py rename to traincheck/instrumentor/types.py diff --git a/mldaikon/instrumentor/variable.py b/traincheck/instrumentor/variable.py similarity index 100% rename from mldaikon/instrumentor/variable.py rename to traincheck/instrumentor/variable.py diff --git a/mldaikon/invariant/DistinctArgumentRelation.py b/traincheck/invariant/DistinctArgumentRelation.py similarity index 98% rename from mldaikon/invariant/DistinctArgumentRelation.py rename to traincheck/invariant/DistinctArgumentRelation.py index 170ba509..71719d99 100644 --- a/mldaikon/invariant/DistinctArgumentRelation.py +++ b/traincheck/invariant/DistinctArgumentRelation.py @@ -3,7 +3,7 @@ from tqdm import tqdm -from mldaikon.invariant.base_cls import ( # GroupedPreconditions, +from traincheck.invariant.base_cls import ( # GroupedPreconditions, APIParam, CheckerResult, Example, @@ -13,9 +13,9 @@ Invariant, Relation, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.trace.trace import Trace -from mldaikon.utils import safe_isnan +from traincheck.invariant.precondition import find_precondition +from traincheck.trace.trace import Trace +from traincheck.utils import safe_isnan EXP_GROUP_NAME = "distinct_arg" MAX_FUNC_NUM_CONSECUTIVE_CALL = 6 diff --git a/mldaikon/invariant/__init__.py b/traincheck/invariant/__init__.py similarity index 100% rename from mldaikon/invariant/__init__.py rename to traincheck/invariant/__init__.py diff --git a/mldaikon/invariant/base_cls.py b/traincheck/invariant/base_cls.py similarity index 99% rename from mldaikon/invariant/base_cls.py rename to traincheck/invariant/base_cls.py index eabb54a9..d17f961c 100644 --- a/mldaikon/invariant/base_cls.py +++ b/traincheck/invariant/base_cls.py @@ -9,14 +9,14 @@ from enum import Enum from typing import Any, Hashable, Iterable, Optional, Type -import mldaikon.config.config as config -from mldaikon.instrumentor.dumper import var_to_serializable -from mldaikon.invariant.symbolic_value import ( +import traincheck.config.config as config +from traincheck.instrumentor.dumper import var_to_serializable +from traincheck.invariant.symbolic_value import ( GENERALIZED_TYPES, check_generalized_value_match, ) -from mldaikon.trace.trace import Trace, VarInstId -from mldaikon.trace.types import ( +from traincheck.trace.trace import Trace, VarInstId +from traincheck.trace.types import ( MD_NONE, FuncCallEvent, FuncCallExceptionEvent, @@ -25,7 +25,7 @@ MDNONEJSONDecoder, VarChangeEvent, ) -from mldaikon.utils import safe_isnan +from traincheck.utils import safe_isnan class Meta_NOT_SET(type): diff --git a/mldaikon/invariant/consistency_relation.py b/traincheck/invariant/consistency_relation.py similarity index 99% rename from mldaikon/invariant/consistency_relation.py rename to traincheck/invariant/consistency_relation.py index 7e83e68b..812c6bcc 100644 --- a/mldaikon/invariant/consistency_relation.py +++ b/traincheck/invariant/consistency_relation.py @@ -4,8 +4,8 @@ from tqdm import tqdm -from 
mldaikon.config import config -from mldaikon.invariant.base_cls import ( +from traincheck.config import config +from traincheck.invariant.base_cls import ( CheckerResult, Example, ExampleList, @@ -15,9 +15,9 @@ Relation, VarTypeParam, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.trace.trace import Trace -from mldaikon.trace.types import Liveness +from traincheck.invariant.precondition import find_precondition +from traincheck.trace.trace import Trace +from traincheck.trace.types import Liveness tracker_var_field_prefix = "attributes." diff --git a/mldaikon/invariant/consistency_transient_vars.py b/traincheck/invariant/consistency_transient_vars.py similarity index 95% rename from mldaikon/invariant/consistency_transient_vars.py rename to traincheck/invariant/consistency_transient_vars.py index d3be0beb..ae3afed0 100644 --- a/mldaikon/invariant/consistency_transient_vars.py +++ b/traincheck/invariant/consistency_transient_vars.py @@ -2,9 +2,10 @@ import re from typing import Hashable +import pandas as pd from tqdm import tqdm -from mldaikon.invariant.base_cls import ( +from traincheck.invariant.base_cls import ( APIParam, Arguments, CheckerResult, @@ -18,14 +19,14 @@ VarTypeParam, make_hashable, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.trace.trace import Trace -from mldaikon.trace.types import ( +from traincheck.invariant.precondition import find_precondition +from traincheck.trace.trace import Trace +from traincheck.trace.types import ( FuncCallEvent, FuncCallExceptionEvent, IncompleteFuncCallEvent, ) -from mldaikon.utils import safe_isnan +from traincheck.utils import safe_isnan TENSOR_PATTERN = r"torch\..*Tensor" PARAMETER_KEYWORD = "Parameter" @@ -34,6 +35,19 @@ # _CACHE_PATH = "func_with_tensors.pkl" +def safe_equality(obj1: object, obj2: object) -> bool: + """ + Check if two objects are equal, handling NaN values. 
+ """ + if safe_isnan(obj1) and safe_isnan(obj2): + return True + + if safe_isnan(obj1) or safe_isnan(obj2): + return False + + return obj1 == obj2 + + # for each value observed, form a dict of {value: path to access the value} def _get_tensor_value_paths(tensors: list[dict]) -> dict: logger = logging.getLogger(__name__) @@ -246,6 +260,18 @@ def get_events_of_funcs_with_tensors( func_name: trace.get_func_call_ids(func_name) for func_name in all_func_names } + # sampling 1000 if more than 1000 + import random + + all_func_call_ids = { + func_name: ( + random.sample(func_call_ids, 1000) + if len(func_call_ids) > 1000 + else func_call_ids + ) + for func_name, func_call_ids in all_func_call_ids.items() + } + all_func_call_events = { func_name: { func_call_id: trace.query_func_call_event(func_call_id) @@ -329,6 +355,16 @@ def generate_hypothesis(trace) -> list[Hypothesis]: func_call_event ) + for prop in properties_occur_num: + if prop not in returned_tensor: + if pd.NA not in properties_occur_num[prop]: + properties_occur_num[prop][pd.NA] = 0 + properties_corresponding_func_call[prop][pd.NA] = [] + properties_occur_num[prop][pd.NA] += 1 + properties_corresponding_func_call[prop][pd.NA].append( + func_call_event + ) + hypotheses_for_func: list[Hypothesis] = [] # generate a hypothesis for each property for prop, prop_values in properties_occur_num.items(): @@ -363,8 +399,14 @@ def generate_hypothesis(trace) -> list[Hypothesis]: hypothesis.positive_examples.add_example(example) for prop_val_other, prop_val_count_other in prop_values.items(): - if prop_val_other == prop_val: - continue + try: + if safe_equality(prop_val, prop_val_other): + continue + except TypeError: + print( + f"TypeError: {prop_val} {safe_isnan(prop_val)} {type(prop_val)} and {prop_val_other} {safe_isnan(prop_val_other)} are not comparable, skipping this property." 
+ ) + raise for func_call_event in properties_corresponding_func_call[prop][ prop_val_other ]: @@ -398,6 +440,12 @@ def collect_examples(trace, hypothesis): # get all the function calls for the function func_call_ids = trace.get_func_call_ids(func_name) + # down sample to 1000 + import random + + if len(func_call_ids) > 1000: + func_call_ids = random.sample(func_call_ids, 1000) + for func_call_id in tqdm( func_call_ids, desc=f"Adding examples for {inv.text_description}" ): @@ -710,6 +758,11 @@ def collect_examples(trace, hypothesis): func_name = api_param.api_full_name func_call_ids = trace.get_func_call_ids(func_name) + import random + + if len(func_call_ids) > 1000: + func_call_ids = random.sample(func_call_ids, 1000) + for func_call_id in tqdm( func_call_ids, desc=f"Checking invariant {inv.text_description}" ): @@ -1067,6 +1120,12 @@ def collect_examples(trace, hypothesis): func_name = api_param.api_full_name # get all function calls for the function func_call_ids = trace.get_func_call_ids(func_name) + + import random + + if len(func_call_ids) > 1000: + func_call_ids = random.sample(func_call_ids, 1000) + for func_call_id in tqdm( func_call_ids, desc=f"Checking invariant {inv.text_description}" ): diff --git a/mldaikon/invariant/contain_relation.py b/traincheck/invariant/contain_relation.py similarity index 96% rename from mldaikon/invariant/contain_relation.py rename to traincheck/invariant/contain_relation.py index 114fd1a0..9b2550ac 100644 --- a/mldaikon/invariant/contain_relation.py +++ b/traincheck/invariant/contain_relation.py @@ -6,9 +6,9 @@ import numpy as np from tqdm import tqdm -from mldaikon.config.config import ANALYSIS_SKIP_FUNC_NAMES -from mldaikon.instrumentor.tracer import TraceLineType -from mldaikon.invariant.base_cls import ( +from traincheck.config.config import ANALYSIS_SKIP_FUNC_NAMES +from traincheck.instrumentor.tracer import TraceLineType +from traincheck.invariant.base_cls import ( APIParam, Arguments, CheckerResult, @@ -26,17 +26,17 @@ construct_var_param_from_var_change, is_signature_empty, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.invariant.symbolic_value import generalize_values -from mldaikon.trace.trace import Trace -from mldaikon.trace.types import ( +from traincheck.invariant.precondition import find_precondition +from traincheck.invariant.symbolic_value import generalize_values +from traincheck.trace.trace import Trace +from traincheck.trace.types import ( ALL_EVENT_TYPES, FuncCallEvent, FuncCallExceptionEvent, IncompleteFuncCallEvent, VarChangeEvent, ) -from mldaikon.utils import typename +from traincheck.utils import typename PARENT_GROUP_NAME = "parent_func_call_pre" VAR_GROUP_NAME = "var_events" @@ -252,15 +252,6 @@ def _merge_hypotheses(hypotheses: list[Hypothesis]) -> list[Hypothesis]: generalized_value = generalize_values(values) setattr(merged_child_param, field, generalized_value) - # if the negative examples are empty, log it out!!! 
- if len(all_negative_examples) == 0: - logger = logging.getLogger(__name__) - logger.error("The negative examples are empty, this is not expected") - for hypo in hypotheses: - logger.error( - f"{hypo.invariant.text_description}, positive examples: {len(hypo.positive_examples)}, negative examples: {len(hypo.negative_examples)}" - ) - # construct the merged hypotheses merged_hypothesis = Hypothesis( invariant=Invariant( @@ -390,10 +381,24 @@ def _collect_examples(trace, hypothesis): if VAR_GROUP_NAME in hypothesis.negative_examples.get_group_names(): check_for_unchanged_vars = True - for parent_func_call_id in tqdm( - parent_func_call_ids, desc=f"Collecting examples for {inv.text_description}" + nums_contained_events = [] + kind_of_parent_events = [] + for i, parent_func_call_id in tqdm( + enumerate(parent_func_call_ids), desc="Collecting examples" ): + parent_event_type = _get_parent_type(trace, parent_func_call_id) + if not (i < 10 or i > len(parent_func_call_ids) - 10) and prune_func_call( + len(parent_func_call_ids), + parent_event_type, + nums_contained_events, + kind_of_parent_events, + ): + continue + contained_events = events_scanner(trace, parent_func_call_id) + nums_contained_events.append(len(contained_events)) + kind_of_parent_events.append(parent_event_type) + grouped_events = _group_events_by_type(contained_events) if isinstance(child_param, APIParam): contained_events = ( @@ -579,6 +584,16 @@ def _infer( logger.debug( f"Found {len(parent_func_call_ids)} invocations for the function: {parent}" ) + + # down sampling the function calls + if len(parent_func_call_ids) > 1000: + # down sample the function calls, keep the first 10, last 10, and randomly sample 100 from the rest + parent_func_call_ids = ( + parent_func_call_ids[:10] + + random.sample(parent_func_call_ids[10:-10], 100) # type: ignore + + parent_func_call_ids[-10:] + ) + all_contained_events: dict[ str, list[FuncCallEvent | FuncCallExceptionEvent | VarChangeEvent] ] = {} @@ -587,8 +602,18 @@ def _infer( kind_of_parent_events: list[ Type[FuncCallEvent | FuncCallExceptionEvent | IncompleteFuncCallEvent] ] = [] - for parent_func_call_id in parent_func_call_ids: + for i, parent_func_call_id in enumerate(parent_func_call_ids): parent_event_type = _get_parent_type(trace, parent_func_call_id) + + if not ( + i < 10 or i > len(parent_func_call_ids) - 10 + ) and prune_func_call( + len(parent_func_call_ids), + parent_event_type, + nums_contained_events, + kind_of_parent_events, + ): + continue contained_events = events_scanner( trace=trace, func_call_id=parent_func_call_id ) diff --git a/mldaikon/invariant/cover_relation.py b/traincheck/invariant/cover_relation.py similarity index 98% rename from mldaikon/invariant/cover_relation.py rename to traincheck/invariant/cover_relation.py index 6dce6955..8c80bac9 100644 --- a/mldaikon/invariant/cover_relation.py +++ b/traincheck/invariant/cover_relation.py @@ -4,7 +4,7 @@ from tqdm import tqdm -from mldaikon.invariant.base_cls import ( +from traincheck.invariant.base_cls import ( APIParam, CheckerResult, Example, @@ -15,14 +15,14 @@ Invariant, Relation, ) -from mldaikon.invariant.lead_relation import ( +from traincheck.invariant.lead_relation import ( check_same_level, get_func_data_per_PT, get_func_names_to_deal_with, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.trace.trace import Trace -from mldaikon.trace.trace_pandas import TracePandas +from traincheck.invariant.precondition import find_precondition +from traincheck.trace.trace import Trace +from 
traincheck.trace.trace_pandas import TracePandas EXP_GROUP_NAME = "func_cover" diff --git a/mldaikon/invariant/lead_relation.py b/traincheck/invariant/lead_relation.py similarity index 99% rename from mldaikon/invariant/lead_relation.py rename to traincheck/invariant/lead_relation.py index f134cdf9..04d3bb94 100644 --- a/mldaikon/invariant/lead_relation.py +++ b/traincheck/invariant/lead_relation.py @@ -4,7 +4,7 @@ from tqdm import tqdm -from mldaikon.invariant.base_cls import ( +from traincheck.invariant.base_cls import ( APIParam, CheckerResult, Example, @@ -15,9 +15,9 @@ Invariant, Relation, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.trace.trace import Trace -from mldaikon.trace.trace_pandas import TracePandas +from traincheck.invariant.precondition import find_precondition +from traincheck.trace.trace import Trace +from traincheck.trace.trace_pandas import TracePandas EXP_GROUP_NAME = "func_lead" MAX_FUNC_NUM_CONSECUTIVE_CALL = 4 # ideally this should be proportional to the number of training and testing iterations in the trace diff --git a/mldaikon/invariant/precondition.py b/traincheck/invariant/precondition.py similarity index 97% rename from mldaikon/invariant/precondition.py rename to traincheck/invariant/precondition.py index cac1b402..6970fd70 100644 --- a/mldaikon/invariant/precondition.py +++ b/traincheck/invariant/precondition.py @@ -4,8 +4,8 @@ from tqdm import tqdm -import mldaikon.config.config as config -from mldaikon.invariant.base_cls import ( +import traincheck.config.config as config +from traincheck.invariant.base_cls import ( PT, GroupedPreconditions, Hypothesis, @@ -14,9 +14,9 @@ Preconditions, UnconditionalPrecondition, ) -from mldaikon.trace.trace import Trace -from mldaikon.trace.types import MD_NONE -from mldaikon.utils import safe_isnan +from traincheck.trace.trace import Trace +from traincheck.trace.types import MD_NONE +from traincheck.utils import safe_isnan logger = logging.getLogger("Precondition") @@ -299,6 +299,20 @@ def find_precondition( # the negative examples are not found, assign an unconditional precondition (to be handled in find_precondition_from_single_group) negative_examples = [] + import random + + random.seed(42) + if len(positive_examples) > 5000: + logger.warning( + f"Too many positive examples found for group {group_name}, downsampling to 5000" + ) + positive_examples = random.sample(positive_examples, 5000) + if len(negative_examples) > 5000: + logger.warning( + f"Too many negative examples found for group {group_name}, downsampling to 5000" + ) + negative_examples = random.sample(negative_examples, 5000) + grouped_preconditions[group_name] = Preconditions( find_precondition_from_single_group( positive_examples, negative_examples, traces, keys_to_skip @@ -311,7 +325,7 @@ def find_precondition( and len(negative_examples) > 0 ): # try doing inverted precondition inference - logger.warning( + logger.debug( f"Empty preconditions found for group {group_name}, trying to infer the preconditions by inverting the negative examples." ) # i.e. use negative examples to infer the preconditions, and then invert the final precondition @@ -323,11 +337,6 @@ def find_precondition( inverted=True, ) - # TODO: add wrapper indicating that the precondition is inverted!!!
- - if len(grouped_preconditions[group_name]) == 0: - logger.warning(f"No preconditions found for group {group_name}") - # if any group's precondition is of length 0, return None if all( len(grouped_preconditions[group_name]) == 0 @@ -446,9 +455,7 @@ def find_precondition_from_single_group( assert ( len(positive_examples) > 0 ), "No negative examples found, but no positive examples found either" - logger.warning( - "No negative examples found, assigning unconditional precondition" - ) + logger.debug("No negative examples found, assigning unconditional precondition") return [UnconditionalPrecondition()] # if ( diff --git a/traincheck/invariant/relation_pool.py b/traincheck/invariant/relation_pool.py new file mode 100644 index 00000000..781c8a97 --- /dev/null +++ b/traincheck/invariant/relation_pool.py @@ -0,0 +1,26 @@ +from typing import Type + +from traincheck.invariant.consistency_relation import ConsistencyRelation +from traincheck.invariant.consistency_transient_vars import ( + ConsistentInputOutputRelation, + ConsistentOutputRelation, + ThresholdRelation, +) +from traincheck.invariant.contain_relation import APIContainRelation +from traincheck.invariant.cover_relation import FunctionCoverRelation +from traincheck.invariant.DistinctArgumentRelation import DistinctArgumentRelation +from traincheck.invariant.lead_relation import FunctionLeadRelation + +# from traincheck.invariant.var_periodic_change_relation import VarPeriodicChangeRelation + +relation_pool: list[Type] = [ + APIContainRelation, + ConsistencyRelation, + ConsistentOutputRelation, + ConsistentInputOutputRelation, + # VarPeriodicChangeRelation, + FunctionCoverRelation, + FunctionLeadRelation, + DistinctArgumentRelation, + ThresholdRelation, +] diff --git a/mldaikon/invariant/symbolic_value.py b/traincheck/invariant/symbolic_value.py similarity index 99% rename from mldaikon/invariant/symbolic_value.py rename to traincheck/invariant/symbolic_value.py index eb3d57ce..2cdbb174 100644 --- a/mldaikon/invariant/symbolic_value.py +++ b/traincheck/invariant/symbolic_value.py @@ -2,7 +2,7 @@ import pandas as pd -from mldaikon.trace.types import MD_NONE +from traincheck.trace.types import MD_NONE ABOVE_ZERO = "above_zero" BELOW_ZERO = "below_zero" diff --git a/mldaikon/invariant/var_periodic_change_relation.py b/traincheck/invariant/var_periodic_change_relation.py similarity index 98% rename from mldaikon/invariant/var_periodic_change_relation.py rename to traincheck/invariant/var_periodic_change_relation.py index e42e6d68..943d30d9 100644 --- a/mldaikon/invariant/var_periodic_change_relation.py +++ b/traincheck/invariant/var_periodic_change_relation.py @@ -3,8 +3,8 @@ import numpy as np -import mldaikon.config.config as config -from mldaikon.invariant.base_cls import ( +import traincheck.config.config as config +from traincheck.invariant.base_cls import ( CheckerResult, Example, ExampleList, @@ -16,8 +16,8 @@ VarTypeParam, make_hashable, ) -from mldaikon.invariant.precondition import find_precondition -from mldaikon.trace.trace import Trace, VarInstId +from traincheck.invariant.precondition import find_precondition +from traincheck.trace.trace import Trace, VarInstId def count_num_justification(count: int): diff --git a/mldaikon/proxy_wrapper/Changelog.md b/traincheck/proxy_wrapper/Changelog.md similarity index 92% rename from mldaikon/proxy_wrapper/Changelog.md rename to traincheck/proxy_wrapper/Changelog.md index f4804942..ecc55135 100644 --- a/mldaikon/proxy_wrapper/Changelog.md +++ b/traincheck/proxy_wrapper/Changelog.md @@ -13,7 
+13,7 @@ and this project adheres to [Semantic Versioning]. ### Added -- Maintain global registry to proxied objects (to access the vars, use `from mldaikon.proxy_wrapper.proxy import get_registered_object`) +- Maintain global registry to proxied objects (to access the vars, use `from traincheck.proxy_wrapper.proxy import get_registered_object`) - Bypass tensor stats/hash computation if it has already been calculated ### Fixed @@ -56,7 +56,7 @@ and this project adheres to [Semantic Versioning]. ### Added - support automated function observing based on function call graph level -- add function depth info of `torch/optim` library into `mldaikon/static_analyzer/func_level` (file generated from modified pyan lib) +- add function depth info of `torch/optim` library into `traincheck/static_analyzer/func_level` (file generated from modified pyan lib) - add the following switches to control the behavior of automated function observer ```python @@ -80,7 +80,7 @@ and this project adheres to [Semantic Versioning]. - get rid of rubbish trace info due to flawed dumping logic handling -- fix type(Proxy) object handling by replacing with type_handle_mldaikon_proxy(..) function using ast +- fix type(Proxy) object handling by replacing with type_handle_traincheck_proxy(..) function using ast ## [0.3.4] - 2024-06-19 @@ -138,7 +138,7 @@ Solves: #22 ### Removed -- remove the deprecated torch.Tensor handling logic from `mldaikon/proxy_wrapper/proxy.py:#L231-L295`, since the handling logic for Tensor type is basically the same compared with other types after redesign +- remove the deprecated torch.Tensor handling logic from `traincheck/proxy_wrapper/proxy.py:#L231-L295`, since the handling logic for Tensor type is basically the same compared with other types after redesign ## [0.3.2] - 2024-06-09 @@ -180,7 +180,7 @@ The `DS-1801` precondition inference is fully supported in this version. - dump whole tensor instead of {`min`, `max`, `shape`} attributes - - update meta_var dumper to filter out files inside the ml-daikon folder + - update meta_var dumper to filter out files inside the traincheck folder ### Fixed @@ -229,12 +229,12 @@ by iterating through `module.named_parameters()` ### Changed -- move the proxy_wrapper configurations from `mldaikon.config.config` to `mldaikon.proxywrapper.config` +- move the proxy_wrapper configurations from `traincheck.config.config` to `traincheck.proxywrapper.config` ### Deprecated - Remove the argument unproxying functionality in `tracer.global_wrapper`, make proxy_wrapper transparent -to the ml-daikon code +to the traincheck code ### Fixed diff --git a/mldaikon/proxy_wrapper/README.md b/traincheck/proxy_wrapper/README.md similarity index 98% rename from mldaikon/proxy_wrapper/README.md rename to traincheck/proxy_wrapper/README.md index d1e2bf06..d7a6b1a8 100644 --- a/mldaikon/proxy_wrapper/README.md +++ b/traincheck/proxy_wrapper/README.md @@ -98,7 +98,7 @@ The `_try_get_data` function is responsible for fetching data from the DataLoade For now, turn on debug_mode in config.py to see full proxy_trace debug output. 
Tracing log for DS-1801: -run `PYTORCH_JIT=0 python -m mldaikon.collect_trace -p Megatron-DeepSpeed/pretrain_gpt.py -s ./pretrain_gpt2_codeparrot_short.sh -t megatron deepspeed torch` +run `PYTORCH_JIT=0 python -m traincheck.collect_trace -p Megatron-DeepSpeed/pretrain_gpt.py -s ./pretrain_gpt2_codeparrot_short.sh -t megatron deepspeed torch` ``` found the root object diff --git a/mldaikon/proxy_wrapper/__init__.py b/traincheck/proxy_wrapper/__init__.py similarity index 68% rename from mldaikon/proxy_wrapper/__init__.py rename to traincheck/proxy_wrapper/__init__.py index 8c600673..d3bf780a 100644 --- a/mldaikon/proxy_wrapper/__init__.py +++ b/traincheck/proxy_wrapper/__init__.py @@ -1,4 +1,4 @@ # This import is necessary to make the observer utility inside torch_proxy.py executed before the instrumented code. This would ensure the observer function is successfully registred before the instrumented code is executed. -import mldaikon.proxy_wrapper.proxy_config # noqa -import mldaikon.proxy_wrapper.torch_proxy # noqa +import traincheck.proxy_wrapper.proxy_config # noqa +import traincheck.proxy_wrapper.torch_proxy # noqa diff --git a/mldaikon/proxy_wrapper/dumper.py b/traincheck/proxy_wrapper/dumper.py similarity index 86% rename from mldaikon/proxy_wrapper/dumper.py rename to traincheck/proxy_wrapper/dumper.py index e6c47e84..cd7cf3d9 100644 --- a/mldaikon/proxy_wrapper/dumper.py +++ b/traincheck/proxy_wrapper/dumper.py @@ -1,11 +1,11 @@ import json from typing import Dict -from mldaikon.instrumentor.dumper import convert_var_to_dict -from mldaikon.instrumentor.tracer import TraceLineType -from mldaikon.instrumentor.tracer import get_meta_vars as tracer_get_meta_vars -from mldaikon.proxy_wrapper.proxy_basics import is_proxied -from mldaikon.proxy_wrapper.proxy_config import primitive_types +from traincheck.instrumentor.dumper import convert_var_to_dict +from traincheck.instrumentor.tracer import TraceLineType +from traincheck.instrumentor.tracer import get_meta_vars as tracer_get_meta_vars +from traincheck.proxy_wrapper.proxy_basics import is_proxied +from traincheck.proxy_wrapper.proxy_config import primitive_types class Singleton(type): diff --git a/mldaikon/proxy_wrapper/hash.py b/traincheck/proxy_wrapper/hash.py similarity index 78% rename from mldaikon/proxy_wrapper/hash.py rename to traincheck/proxy_wrapper/hash.py index 310cf288..1c1b3088 100644 --- a/mldaikon/proxy_wrapper/hash.py +++ b/traincheck/proxy_wrapper/hash.py @@ -10,15 +10,16 @@ # Define a fixed constant tensor FIXED_CONSTANT = torch.tensor([42], dtype=torch.int64) # Example fixed constant +if torch.cuda.is_available(): -@cuda.jit("void(int64[:, :], int64[:], int64, int64)") -def cuda_hash_kernel(data, hash_values, multiplier, increment): - idx = cuda.grid(1) - if idx < data.shape[0]: - hash_value = 0 - for i in range(data.shape[1]): - hash_value = hash_value * multiplier + data[idx, i] + increment - hash_values[idx] = hash_value + @cuda.jit("void(int64[:, :], int64[:], int64, int64)") + def cuda_hash_kernel(data, hash_values, multiplier, increment): + idx = cuda.grid(1) + if idx < data.shape[0]: + hash_value = 0 + for i in range(data.shape[1]): + hash_value = hash_value * multiplier + data[idx, i] + increment + hash_values[idx] = hash_value def hash_tensor_cuda(x): @@ -53,7 +54,14 @@ def hash_tensor_cpu(x: torch.Tensor) -> int: x_np = x.cpu().numpy().astype(np.int64) # Ensure conversion to NumPy int64 # Compute cumulative multiplication just like in the loop version - hash_values = np.zeros(x_np.shape[0], dtype=np.int64) # 
Hash storage per row + if x_np.shape == (): # Handle scalar case + hash_values = np.zeros(1, dtype=np.int64) + x_np = np.expand_dims(x_np, axis=(0, 1)) # Convert scalar to 2D array + elif x_np.ndim == 1: # Handle vector case + hash_values = np.zeros(1, dtype=np.int64) # Single hash value for the vector + x_np = np.expand_dims(x_np, axis=0) # Convert vector to 2D array with one row + else: + hash_values = np.zeros(x_np.shape[0], dtype=np.int64) # Hash storage per row # Accumulate the hash row-wise, matching the loop behavior for i in range(x_np.shape[1]): @@ -85,8 +93,8 @@ def _reduce_last_axis(x: Tensor) -> Tensor: def tensor_hash(x: Tensor, with_parallel: bool = True, with_cuda: bool = True) -> int: - if hasattr(x, "_mldaikon_tensor_hash"): - return x._mldaikon_tensor_hash + if hasattr(x, "_traincheck_tensor_hash"): + return x._traincheck_tensor_hash if with_parallel: if x.dtype in [ torch.float32, diff --git a/mldaikon/proxy_wrapper/proxy.py b/traincheck/proxy_wrapper/proxy.py similarity index 96% rename from mldaikon/proxy_wrapper/proxy.py rename to traincheck/proxy_wrapper/proxy.py index 8c549796..69c82277 100644 --- a/mldaikon/proxy_wrapper/proxy.py +++ b/traincheck/proxy_wrapper/proxy.py @@ -8,11 +8,11 @@ import torch -import mldaikon.config.config as general_config -import mldaikon.proxy_wrapper.proxy_config as proxy_config # HACK: cannot directly import config variables as then they would be local variables -import mldaikon.proxy_wrapper.proxy_methods as proxy_methods -from mldaikon.proxy_wrapper.dumper import dump_attributes, get_meta_vars -from mldaikon.utils import get_timestamp_ns, typename +import traincheck.config.config as general_config +import traincheck.proxy_wrapper.proxy_config as proxy_config # HACK: cannot directly import config variables as then they would be local variables +import traincheck.proxy_wrapper.proxy_methods as proxy_methods +from traincheck.proxy_wrapper.dumper import dump_attributes, get_meta_vars +from traincheck.utils import get_timestamp_ns, typename from .dumper import json_dumper as dumper from .proxy_basics import unproxy_arg, unproxy_args_kwargs @@ -204,7 +204,7 @@ def __init__( self.__dict__["logdir"] = logdir self.__dict__["log_level"] = log_level self.__dict__["meta_vars"] = {} - self.__dict__["is_ml_daikon_proxied_obj"] = True + self.__dict__["is_traincheck_proxied_obj"] = True self.__dict__["recurse"] = recurse self.__dict__["var_name"] = var_name self.__dict__["old_value"] = None @@ -221,8 +221,8 @@ def __init__( self.__dict__["last_update_timestamp"] = obj.__dict__[ "last_update_timestamp" ] - self.__dict__["is_ml_daikon_proxied_obj"] = obj.__dict__[ - "is_ml_daikon_proxied_obj" + self.__dict__["is_traincheck_proxied_obj"] = obj.__dict__[ + "is_traincheck_proxied_obj" ] self.__dict__["recurse"] = obj.__dict__["recurse"] self.__dict__["var_name"] = obj.__dict__["var_name"] diff --git a/mldaikon/proxy_wrapper/proxy_basics.py b/traincheck/proxy_wrapper/proxy_basics.py similarity index 84% rename from mldaikon/proxy_wrapper/proxy_basics.py rename to traincheck/proxy_wrapper/proxy_basics.py index 7fd9d30a..11f8162b 100644 --- a/mldaikon/proxy_wrapper/proxy_basics.py +++ b/traincheck/proxy_wrapper/proxy_basics.py @@ -7,7 +7,7 @@ def is_proxied(obj): try: - if obj is not None and "is_ml_daikon_proxied_obj" in obj.__dict__: + if obj is not None and "is_traincheck_proxied_obj" in obj.__dict__: return True except Exception: return False @@ -57,19 +57,19 @@ def unproxy_args_kwargs(args, kwargs, inspect_torch_module=False): return args, kwargs -def 
type_handle_mldaikon_proxy(x): - if hasattr(x, "is_ml_daikon_proxied_obj"): +def type_handle_traincheck_proxy(x): + if hasattr(x, "is_traincheck_proxied_obj"): return type(x._obj) return type(x) class TypeToIsInstanceTransformer(ast.NodeTransformer): - # add from mldaiokn.proxy_wrapper.proxy_basics import type_handle_mldaikon_proxy after function definition + # add from traincheck.proxy_wrapper.proxy_basics import type_handle_traincheck_proxy after function definition def visit_FunctionDef(self, node): self.generic_visit(node) # Inject code right after the def statement inject_code = """ -from mldaikon.proxy_wrapper.proxy_basics import type_handle_mldaikon_proxy +from traincheck.proxy_wrapper.proxy_basics import type_handle_traincheck_proxy """ inject_node = ast.parse(inject_code).body node.body = inject_node + node.body @@ -85,9 +85,9 @@ def visit_Call(self, node): and len(node.args) == 1 ): - # Replace type(xxx) with type_handle_mldaikon_proxy(xxx) + # Replace type(xxx) with type_handle_traincheck_proxy(xxx) new_node = ast.Call( - func=ast.Name(id="type_handle_mldaikon_proxy", ctx=ast.Load()), + func=ast.Name(id="type_handle_traincheck_proxy", ctx=ast.Load()), args=node.args, keywords=[], ) @@ -98,7 +98,7 @@ def visit_Call(self, node): def adapt_func_for_proxy(func): """Adapt a function to work with proxied objects. - - Replace type() calls with type_handle_mldaikon_proxy() so that type(ProxyObj) returns type(ProxyObj._obj) instead of Proxy + - Replace type() calls with type_handle_traincheck_proxy() so that type(ProxyObj) returns type(ProxyObj._obj) instead of Proxy """ source = inspect.getsource(func) diff --git a/mldaikon/proxy_wrapper/proxy_config.py b/traincheck/proxy_wrapper/proxy_config.py similarity index 100% rename from mldaikon/proxy_wrapper/proxy_config.py rename to traincheck/proxy_wrapper/proxy_config.py diff --git a/mldaikon/proxy_wrapper/proxy_handler.py b/traincheck/proxy_wrapper/proxy_handler.py similarity index 100% rename from mldaikon/proxy_wrapper/proxy_handler.py rename to traincheck/proxy_wrapper/proxy_handler.py diff --git a/mldaikon/proxy_wrapper/proxy_methods.py b/traincheck/proxy_wrapper/proxy_methods.py similarity index 87% rename from mldaikon/proxy_wrapper/proxy_methods.py rename to traincheck/proxy_wrapper/proxy_methods.py index 933d85dd..aedb6945 100644 --- a/mldaikon/proxy_wrapper/proxy_methods.py +++ b/traincheck/proxy_wrapper/proxy_methods.py @@ -27,7 +27,7 @@ def __add__(self, other): lambda: "logger_proxy: " + f"Calling __add__ for object '{self.__class__.__name__}'" ) - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other if isinstance(other, str): # If the other operand is a string, concatenate it with the string representation of the Proxy object return str(self._obj) + other @@ -64,7 +64,7 @@ def __ror__(self, other): lambda: "logger_proxy: " + f"Calling __ror__ for object '{self.__class__.__name__}'" ) - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other if isinstance(other, bool): return other | bool(self._obj) return self._obj.__ror__(other) @@ -76,7 +76,7 @@ def __radd__(self, other): + f"Calling __radd__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other if
isinstance(other, str): # If the other operand is a string, concatenate it with the string representation of the Proxy object return other + str(self._obj) @@ -89,7 +89,7 @@ def __iadd__(self, other): + f"Calling __iadd__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other self._obj.__iadd__(other) return self @@ -100,7 +100,7 @@ def __sub__(self, other): + f"Calling __sub__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return self._obj - other @@ -110,7 +110,7 @@ def __mul__(self, other): + f"Calling __mul__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return self._obj * other @@ -120,7 +120,7 @@ def __rmul__(self, other): + f"Calling __rmul__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return other * self._obj @@ -130,7 +130,7 @@ def __truediv__(self, other): + f"Calling __truediv__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return self._obj / other @@ -140,7 +140,7 @@ def __floatdiv__(self, other): + f"Calling __floatdiv__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return self._obj // other @@ -150,7 +150,7 @@ def __intdiv__(self, other): + f"Calling __intdiv__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return self._obj // other @@ -160,7 +160,7 @@ def __rfloordiv__(self, other): + f"Calling __ifloordiv__ for object '{self.__class__.__name__}'" ) # Unwrap other if it's a Proxy - other = other._obj if hasattr(other, "is_ml_daikon_proxied_obj") else other + other = other._obj if hasattr(other, "is_traincheck_proxied_obj") else other return other // self._obj diff --git a/mldaikon/proxy_wrapper/proxy_observer.py b/traincheck/proxy_wrapper/proxy_observer.py similarity index 91% rename from mldaikon/proxy_wrapper/proxy_observer.py rename to traincheck/proxy_wrapper/proxy_observer.py index 16009617..06afcc2b 100644 --- a/mldaikon/proxy_wrapper/proxy_observer.py +++ b/traincheck/proxy_wrapper/proxy_observer.py @@ -1,11 +1,11 @@ import functools import typing -from mldaikon.config.config import should_disable_proxy_dumping -from mldaikon.utils import typename +from traincheck.config.config import should_disable_proxy_dumping +from traincheck.utils import typename if typing.TYPE_CHECKING: - from mldaikon.proxy_wrapper.proxy import Proxy + from traincheck.proxy_wrapper.proxy import Proxy from .proxy_basics import is_proxied, 
unproxy_func diff --git a/mldaikon/proxy_wrapper/proxy_registry.py b/traincheck/proxy_wrapper/proxy_registry.py similarity index 98% rename from mldaikon/proxy_wrapper/proxy_registry.py rename to traincheck/proxy_wrapper/proxy_registry.py index 687a5bd6..ebd93116 100644 --- a/mldaikon/proxy_wrapper/proxy_registry.py +++ b/traincheck/proxy_wrapper/proxy_registry.py @@ -1,7 +1,7 @@ import threading import typing -from mldaikon.utils import typename +from traincheck.utils import typename if typing.TYPE_CHECKING: from .proxy import Proxy diff --git a/mldaikon/proxy_wrapper/torch_proxy.py b/traincheck/proxy_wrapper/torch_proxy.py similarity index 93% rename from mldaikon/proxy_wrapper/torch_proxy.py rename to traincheck/proxy_wrapper/torch_proxy.py index 746b358c..09e14736 100644 --- a/mldaikon/proxy_wrapper/torch_proxy.py +++ b/traincheck/proxy_wrapper/torch_proxy.py @@ -6,7 +6,7 @@ pass from torch._C._distributed_c10d import ProcessGroup -from mldaikon.proxy_wrapper.proxy_basics import unproxy_func +from traincheck.proxy_wrapper.proxy_basics import unproxy_func ################################################# ### Proxied Torch functions diff --git a/mldaikon/proxy_wrapper/utils.py b/traincheck/proxy_wrapper/utils.py similarity index 57% rename from mldaikon/proxy_wrapper/utils.py rename to traincheck/proxy_wrapper/utils.py index ab0913f3..e736d360 100644 --- a/mldaikon/proxy_wrapper/utils.py +++ b/traincheck/proxy_wrapper/utils.py @@ -1,4 +1,4 @@ -from mldaikon.proxy_wrapper.proxy_config import debug_mode +from traincheck.proxy_wrapper.proxy_config import debug_mode def print_debug(message_func): diff --git a/mldaikon/runner/__init__.py b/traincheck/runner/__init__.py similarity index 100% rename from mldaikon/runner/__init__.py rename to traincheck/runner/__init__.py diff --git a/mldaikon/runner/runner.py b/traincheck/runner/runner.py similarity index 99% rename from mldaikon/runner/runner.py rename to traincheck/runner/runner.py index 5ca2abd1..da41fc8b 100644 --- a/mldaikon/runner/runner.py +++ b/traincheck/runner/runner.py @@ -4,7 +4,7 @@ import subprocess import sys -from mldaikon.config.config import RUNNER_DEFAULT_ENV, TMP_FILE_PREFIX +from traincheck.config.config import RUNNER_DEFAULT_ENV, TMP_FILE_PREFIX def program_print(program_output: str): diff --git a/mldaikon/script/num_invs_each_rel.py b/traincheck/script/num_invs_each_rel.py similarity index 95% rename from mldaikon/script/num_invs_each_rel.py rename to traincheck/script/num_invs_each_rel.py index b1aedcdb..1c52971e 100644 --- a/mldaikon/script/num_invs_each_rel.py +++ b/traincheck/script/num_invs_each_rel.py @@ -1,6 +1,6 @@ import argparse -from mldaikon.invariant import read_inv_file +from traincheck.invariant import read_inv_file if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/mldaikon/static_analyzer/README.md b/traincheck/static_analyzer/README.md similarity index 81% rename from mldaikon/static_analyzer/README.md rename to traincheck/static_analyzer/README.md index 0a2fa7db..7ae9f0d2 100644 --- a/mldaikon/static_analyzer/README.md +++ b/traincheck/static_analyzer/README.md @@ -13,20 +13,20 @@ Edit the `config.py` or add compile flags to modify the behavior of the static a ## To run the internal pytorch library: ```bash # Output all functions in torch.nn -python -m mldaikon.static_analyzer.graph_generator.main --lib nn +python -m traincheck.static_analyzer.graph_generator.main --lib nn # Or equivalently -python -m mldaikon.static_analyzer.graph_generator.main --lib torch.nn +python -m 
traincheck.static_analyzer.graph_generator.main --lib torch.nn # Only output functions with namespace torch.nn.modules.padding -python -m mldaikon.static_analyzer.graph_generator.main --lib nn --namespace torch.nn.modules.padding +python -m traincheck.static_analyzer.graph_generator.main --lib nn --namespace torch.nn.modules.padding # Only output functions with namespace torch.nn.modules.padding and used in torch.nn.modules.padding -python -m mldaikon.static_analyzer.graph_generator.main --lib nn --namespace torch.nn.modules.padding --function torch.nn.modules.padding.ConstantPad3d +python -m traincheck.static_analyzer.graph_generator.main --lib nn --namespace torch.nn.modules.padding --function torch.nn.modules.padding.ConstantPad3d ``` ## To run the external library: ```bash -python -m mldaikon.static_analyzer.graph_generator.main --ext ../../../example_pipelines/bug_84911_ml_daikon.py +python -m traincheck.static_analyzer.graph_generator.main --ext ../../../example_pipelines/bug_84911_traincheck.py ``` # Thoughts diff --git a/mldaikon/static_analyzer/__init__.py b/traincheck/static_analyzer/__init__.py similarity index 100% rename from mldaikon/static_analyzer/__init__.py rename to traincheck/static_analyzer/__init__.py diff --git a/mldaikon/static_analyzer/config.py b/traincheck/static_analyzer/config.py similarity index 96% rename from mldaikon/static_analyzer/config.py rename to traincheck/static_analyzer/config.py index 06e54542..d32a58f4 100644 --- a/mldaikon/static_analyzer/config.py +++ b/traincheck/static_analyzer/config.py @@ -3,7 +3,7 @@ INTERNAL_LIBS = None # traverse the user script. Change to the target path. -EXTERNAL_LIBS = "example_pipelines/bug_84911_ml_daikon.py" +EXTERNAL_LIBS = "example_pipelines/bug_84911_traincheck.py" # ============================== USED IN INTERNAL ============================== diff --git a/mldaikon/static_analyzer/func_level/nn_func_level.log b/traincheck/static_analyzer/func_level/nn_func_level.log similarity index 100% rename from mldaikon/static_analyzer/func_level/nn_func_level.log rename to traincheck/static_analyzer/func_level/nn_func_level.log diff --git a/mldaikon/static_analyzer/func_level/optim_func_level.log b/traincheck/static_analyzer/func_level/optim_func_level.log similarity index 100% rename from mldaikon/static_analyzer/func_level/optim_func_level.log rename to traincheck/static_analyzer/func_level/optim_func_level.log diff --git a/mldaikon/static_analyzer/graph_generator/__init__.py b/traincheck/static_analyzer/graph_generator/__init__.py similarity index 100% rename from mldaikon/static_analyzer/graph_generator/__init__.py rename to traincheck/static_analyzer/graph_generator/__init__.py diff --git a/mldaikon/static_analyzer/graph_generator/analyzer.py b/traincheck/static_analyzer/graph_generator/analyzer.py similarity index 100% rename from mldaikon/static_analyzer/graph_generator/analyzer.py rename to traincheck/static_analyzer/graph_generator/analyzer.py diff --git a/mldaikon/static_analyzer/graph_generator/anutils.py b/traincheck/static_analyzer/graph_generator/anutils.py similarity index 100% rename from mldaikon/static_analyzer/graph_generator/anutils.py rename to traincheck/static_analyzer/graph_generator/anutils.py diff --git a/mldaikon/static_analyzer/graph_generator/call_graph_parser.py b/traincheck/static_analyzer/graph_generator/call_graph_parser.py similarity index 98% rename from mldaikon/static_analyzer/graph_generator/call_graph_parser.py rename to 
traincheck/static_analyzer/graph_generator/call_graph_parser.py index 01a5f1f0..1b66be2a 100644 --- a/mldaikon/static_analyzer/graph_generator/call_graph_parser.py +++ b/traincheck/static_analyzer/graph_generator/call_graph_parser.py @@ -4,7 +4,7 @@ import os import re -from mldaikon.proxy_wrapper.proxy_observer import add_observer_to_func +from traincheck.proxy_wrapper.proxy_observer import add_observer_to_func def unparse_module(module_name, level=0): diff --git a/mldaikon/static_analyzer/graph_generator/main.py b/traincheck/static_analyzer/graph_generator/main.py similarity index 100% rename from mldaikon/static_analyzer/graph_generator/main.py rename to traincheck/static_analyzer/graph_generator/main.py diff --git a/mldaikon/static_analyzer/graph_generator/node.py b/traincheck/static_analyzer/graph_generator/node.py similarity index 100% rename from mldaikon/static_analyzer/graph_generator/node.py rename to traincheck/static_analyzer/graph_generator/node.py diff --git a/mldaikon/toolkit/README.md b/traincheck/toolkit/README.md similarity index 81% rename from mldaikon/toolkit/README.md rename to traincheck/toolkit/README.md index 6bb7b3e7..8230007b 100644 --- a/mldaikon/toolkit/README.md +++ b/traincheck/toolkit/README.md @@ -23,9 +23,9 @@ This would help us to: To use this trace analysis tool, simply go with the folllowing bash script: ### generate bug<->fix diff file -python -m mldaikon.toolkit.analyze_trace -f -o +python -m traincheck.toolkit.analyze_trace -f -o ### generate pre-bug<->fix diff file -python -m mldaikon.toolkit.analyze_trace -f -o +python -m traincheck.toolkit.analyze_trace -f -o ### get rid of false positives -python -m mldaikon.toolkit.detect_anomaly_from_trace_diff -o +python -m traincheck.toolkit.detect_anomaly_from_trace_diff -o I'm currently using this tool to analyze the feasibility to infer bug LT-725 by simply using the API trace. I may further extend it to support VAR traces if needed. 
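The toolkit README above drives the trace-diff workflow through `python -m traincheck.toolkit.analyze_trace` and `python -m traincheck.toolkit.detect_anomaly_from_trace_diff`. As a rough illustration of the kind of record-level dictionary comparison that workflow relies on, here is a minimal, self-contained sketch; the helper name `diff_trace_records` and the flattened example keys are assumptions for illustration only, not the actual `diff_dicts` implementation shipped in `traincheck.toolkit.analyze_trace`.

```python
# Illustrative sketch only -- not the repository's diff_dicts implementation.
from typing import Any


def diff_trace_records(before: dict[str, Any], after: dict[str, Any]) -> dict[str, tuple]:
    """Return {key: (old_value, new_value)} for every key whose value differs.

    Keys present on only one side are reported with None as the missing value.
    """
    changed = {}
    for key in set(before) | set(after):
        old, new = before.get(key), after.get(key)
        if old != new:
            changed[key] = (old, new)
    return changed


if __name__ == "__main__":
    # Hypothetical flattened trace records from a buggy run and a fixed run.
    buggy = {"function": "torch.optim.Adam.step", "args.lr": 0.1}
    fixed = {"function": "torch.optim.Adam.step", "args.lr": 0.001}
    print(diff_trace_records(buggy, fixed))  # {'args.lr': (0.1, 0.001)}
```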
diff --git a/mldaikon/toolkit/analyze_invariant.py b/traincheck/toolkit/analyze_invariant.py
similarity index 100%
rename from mldaikon/toolkit/analyze_invariant.py
rename to traincheck/toolkit/analyze_invariant.py
diff --git a/mldaikon/toolkit/analyze_trace.py b/traincheck/toolkit/analyze_trace.py
similarity index 100%
rename from mldaikon/toolkit/analyze_trace.py
rename to traincheck/toolkit/analyze_trace.py
diff --git a/mldaikon/toolkit/detect_anomaly_from_trace_diff.py b/traincheck/toolkit/detect_anomaly_from_trace_diff.py
similarity index 99%
rename from mldaikon/toolkit/detect_anomaly_from_trace_diff.py
rename to traincheck/toolkit/detect_anomaly_from_trace_diff.py
index c69f0ba3..644d7676 100644
--- a/mldaikon/toolkit/detect_anomaly_from_trace_diff.py
+++ b/traincheck/toolkit/detect_anomaly_from_trace_diff.py
@@ -13,7 +13,7 @@
 import json
 import re
 
-from mldaikon.toolkit.analyze_trace import diff_dicts
+from traincheck.toolkit.analyze_trace import diff_dicts
 
 
 def read_diff_file(diff_file):
diff --git a/mldaikon/trace/__init__.py b/traincheck/trace/__init__.py
similarity index 71%
rename from mldaikon/trace/__init__.py
rename to traincheck/trace/__init__.py
index a5253963..8ec59971 100644
--- a/mldaikon/trace/__init__.py
+++ b/traincheck/trace/__init__.py
@@ -1,10 +1,10 @@
 from typing import Callable, Type
 
-from mldaikon.trace.trace import Trace
-from mldaikon.trace.trace_dict import TraceDict, read_trace_file_dict
-from mldaikon.trace.trace_pandas import TracePandas, read_trace_file_Pandas
-from mldaikon.trace.trace_polars import TracePolars, read_trace_file_polars
-from mldaikon.trace.types import MDNONEJSONDecoder, MDNONEJSONEncoder
+from traincheck.trace.trace import Trace
+from traincheck.trace.trace_dict import TraceDict, read_trace_file_dict
+from traincheck.trace.trace_pandas import TracePandas, read_trace_file_Pandas
+from traincheck.trace.trace_polars import TracePolars, read_trace_file_polars
+from traincheck.trace.types import MDNONEJSONDecoder, MDNONEJSONEncoder
 
 __all__ = ["select_trace_implementation", "MDNONEJSONDecoder", "MDNONEJSONEncoder"]
diff --git a/mldaikon/trace/trace.py b/traincheck/trace/trace.py
similarity index 99%
rename from mldaikon/trace/trace.py
rename to traincheck/trace/trace.py
index 3a75ff71..c04ebeda 100644
--- a/mldaikon/trace/trace.py
+++ b/traincheck/trace/trace.py
@@ -2,9 +2,9 @@
 
 import polars as pl
 
-from mldaikon.config import config
-from mldaikon.instrumentor.tracer import TraceLineType
-from mldaikon.trace.types import (
+from traincheck.config import config
+from traincheck.instrumentor.tracer import TraceLineType
+from traincheck.trace.types import (
     AttrState,
     ContextManagerState,
     FuncCallEvent,
@@ -84,7 +84,7 @@ def __init__(self, events, truncate_incomplete_func_calls=True):
         raise NotImplementedError("This class should not be instantiated directly.")
 
     def _rm_incomplete_trailing_func_calls(self):
-        """Remove incomplete trailing function calls from the trace. For why incomplete function calls exist, refer to https://github.com/OrderLab/ml-daikon/issues/31
+        """Remove incomplete trailing function calls from the trace. For why incomplete function calls exist, refer to https://github.com/OrderLab/traincheck/issues/31
 
         This function would group the function calls by `func_call_id` which is unique for each function call. Thus, each `func_call_id` should exactly correspond to two trace records (one pre-call and one post-call/exception).
         If there is only one record for a `func_call_id`,
diff --git a/mldaikon/trace/trace_dict.py b/traincheck/trace/trace_dict.py
similarity index 99%
rename from mldaikon/trace/trace_dict.py
rename to traincheck/trace/trace_dict.py
index ff678db9..7ae2339b 100644
--- a/mldaikon/trace/trace_dict.py
+++ b/traincheck/trace/trace_dict.py
@@ -5,10 +5,10 @@
 
 from tqdm import tqdm
 
-from mldaikon.config import config
-from mldaikon.instrumentor.tracer import TraceLineType
-from mldaikon.trace.trace import Trace
-from mldaikon.trace.types import (
+from traincheck.config import config
+from traincheck.instrumentor.tracer import TraceLineType
+from traincheck.trace.trace import Trace
+from traincheck.trace.types import (
     AttrState,
     FuncCallEvent,
     FuncCallExceptionEvent,
diff --git a/mldaikon/trace/trace_pandas.py b/traincheck/trace/trace_pandas.py
similarity index 98%
rename from mldaikon/trace/trace_pandas.py
rename to traincheck/trace/trace_pandas.py
index e6d48998..57023a0c 100644
--- a/mldaikon/trace/trace_pandas.py
+++ b/traincheck/trace/trace_pandas.py
@@ -5,11 +5,11 @@
 import pandas as pd
 from tqdm import tqdm
 
-from mldaikon.config import config
-from mldaikon.instrumentor.tracer import TraceLineType
-from mldaikon.instrumentor.types import PTID
-from mldaikon.trace.trace import Trace
-from mldaikon.trace.types import (
+from traincheck.config import config
+from traincheck.instrumentor.tracer import TraceLineType
+from traincheck.instrumentor.types import PTID
+from traincheck.trace.trace import Trace
+from traincheck.trace.types import (
     MD_NONE,
     AttrState,
     ContextManagerState,
@@ -19,14 +19,14 @@
     VarChangeEvent,
     VarInstId,
 )
-from mldaikon.trace.utils import (
+from traincheck.trace.utils import (
     BindedFuncInput,
     bind_args_kwargs_to_signature,
     flatten_dict,
     load_signature_from_class_method_name,
     read_jsonlines_flattened_with_md_none,
 )
-from mldaikon.utils import safe_isnan
+from traincheck.utils import safe_isnan
 
 logger = logging.getLogger(__name__)
 
@@ -102,6 +102,7 @@ def __init__(self, events, truncate_incomplete_func_calls=True):
         )
 
         self.column_dtypes_cached = {}
+        self.stage_traces: dict[str, "TracePandas"] = {}
 
         # HACK: init might not be present at the beginning of the trace due to presence of import-time logs
         self._fill_missing_stage_init()
@@ -113,6 +114,9 @@ def get_traces_for_stage(self) -> dict[str, "TracePandas"]:  # type: ignore
         if not self.is_stage_annotated():
             raise ValueError("Trace is not annotated with stages.")
 
+        if self.stage_traces:
+            return self.stage_traces
+
         traces = {}
         for stage in self.events[STAGE_KEY].unique():
             traces[stage] = TracePandas(
@@ -120,6 +124,8 @@ def get_traces_for_stage(self) -> dict[str, "TracePandas"]:  # type: ignore
                 truncate_incomplete_func_calls=False,
             )
 
+        self.stage_traces = traces
+
         return traces
 
     def get_all_stages(self) -> set[str]:
@@ -192,7 +198,9 @@ def _rm_incomplete_trailing_func_calls(self):
             assert (
                 row["type"] == TraceLineType.FUNC_CALL_PRE
             ), f"Incomplete function call is not a pre-call event, got {row['type']}, id {row['func_call_id']}"
-            logger.warning(f"Incomplete function call detected: {row}")
+            logger.warning(
+                f"Incomplete function call detected: {row['func_call_id']}, {row['function']}"
+            )
 
             process_id = row["process_id"]
             thread_id = row["thread_id"]
@@ -827,7 +835,7 @@ def get_var_insts(self) -> dict[VarInstId, dict[str, list[AttrState]]]:
                 continue
 
             if col.startswith(config.VAR_ATTR_PREFIX):
-                from mldaikon.invariant.base_cls import make_hashable
+                from traincheck.invariant.base_cls import make_hashable
 
                 curr_value = make_hashable(state_change[col])
                 attr_name = get_attr_name(col)
diff --git a/mldaikon/trace/trace_polars.py b/traincheck/trace/trace_polars.py
similarity index 99%
rename from mldaikon/trace/trace_polars.py
rename to traincheck/trace/trace_polars.py
index b17169e4..924637de 100644
--- a/mldaikon/trace/trace_polars.py
+++ b/traincheck/trace/trace_polars.py
@@ -4,10 +4,10 @@
 import polars as pl
 from tqdm import tqdm
 
-from mldaikon.config import config
-from mldaikon.instrumentor.tracer import TraceLineType
-from mldaikon.trace.trace import Trace
-from mldaikon.trace.types import (
+from traincheck.config import config
+from traincheck.instrumentor.tracer import TraceLineType
+from traincheck.trace.trace import Trace
+from traincheck.trace.types import (
     AttrState,
     FuncCallEvent,
     FuncCallExceptionEvent,
diff --git a/mldaikon/trace/types.py b/traincheck/trace/types.py
similarity index 98%
rename from mldaikon/trace/types.py
rename to traincheck/trace/types.py
index b6113008..25c61ccd 100644
--- a/mldaikon/trace/types.py
+++ b/traincheck/trace/types.py
@@ -2,8 +2,8 @@
 from abc import abstractmethod
 from typing import NamedTuple
 
-from mldaikon.instrumentor.tracer import TraceLineType
-from mldaikon.instrumentor.types import PTID
+from traincheck.instrumentor.tracer import TraceLineType
+from traincheck.instrumentor.types import PTID
 
 
 class MD_NONE:
@@ -11,9 +11,14 @@ def __hash__(self) -> int:
         return hash(None)
 
     def __eq__(self, o: object) -> bool:
-        return type(o) == MD_NONE or o is None
+    def __repr__(self):
+        return "None"
+
+    def __str__(self):
+        return "None"
+
     def to_dict(self):
         """Return a serializable dictionary representation of the object."""
         return None
diff --git a/mldaikon/trace/utils.py b/traincheck/trace/utils.py
similarity index 96%
rename from mldaikon/trace/utils.py
rename to traincheck/trace/utils.py
index 1581fd4a..86bf37b4 100644
--- a/mldaikon/trace/utils.py
+++ b/traincheck/trace/utils.py
@@ -4,9 +4,9 @@
 import logging
 from collections.abc import MutableMapping
 
-from mldaikon.instrumentor.dumper import var_to_serializable
-from mldaikon.trace.types import MD_NONE, BindedFuncInput
-from mldaikon.utils import typename
+from traincheck.instrumentor.dumper import var_to_serializable
+from traincheck.trace.types import MD_NONE, BindedFuncInput
+from traincheck.utils import typename
 
 
 def _flatten_dict_gen(d, parent_key, sep, skip_fields=None):
diff --git a/mldaikon/utils.py b/traincheck/utils.py
similarity index 93%
rename from mldaikon/utils.py
rename to traincheck/utils.py
index 482c5967..33d81ab1 100644
--- a/mldaikon/utils.py
+++ b/traincheck/utils.py
@@ -29,6 +29,8 @@ def safe_getattr(obj, attr, default=None):
             in "RuntimeError: Tried to instantiate class '__qualname__.__qualname__', but it does not exist! Ensure that it is registered via torch::class_"
         ):
             return default
+        if isinstance(e, ModuleNotFoundError):
+            return default
         raise
@@ -63,7 +65,7 @@
     Print detailed stack information with local variables
     """
-    logger = logging.getLogger("mldaikon")
+    logger = logging.getLogger("traincheck")
 
     if issubclass(typ, KeyboardInterrupt):
         sys.__excepthook__(typ, message, stack)
@@ -111,8 +113,8 @@ def register_custom_excepthook(add_file_handler=False):
             "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
         )
         file_handler.setFormatter(formatter)
-        # add to the "mldaikon" logger and the "threading" logger
-        logging.getLogger("mldaikon").addHandler(file_handler)
+        # add to the "traincheck" logger and the "threading" logger
+        logging.getLogger("traincheck").addHandler(file_handler)
         logging.getLogger("threading").addHandler(file_handler)
     sys.excepthook = handle_excepthook
     threading.excepthook = thread_excepthook
@@ -131,4 +133,7 @@ def get_unique_id():
 
 
 def safe_isnan(value: Any) -> bool:
+    if value is pd.NA:
+        return True
+
     return isinstance(value, float) and pd.isna(value)
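The final hunk in `traincheck/utils.py` deserves a note: `pd.NA` is not a `float`, so the original `isinstance(value, float)` guard silently reported it as "not NaN". The sketch below reproduces that behavior side by side; it is a standalone illustration (local copies of the pre- and post-patch logic, not imports from `traincheck.utils`) and assumes only that `pandas` is installed.

```python
# Standalone illustration of the safe_isnan change: local copies of the
# pre-patch and post-patch logic, not imports from traincheck.utils.
from typing import Any

import pandas as pd


def safe_isnan_before(value: Any) -> bool:
    # pd.NA is not a float, so this guard short-circuits and reports False.
    return isinstance(value, float) and pd.isna(value)


def safe_isnan_after(value: Any) -> bool:
    if value is pd.NA:
        return True
    return isinstance(value, float) and pd.isna(value)


print(safe_isnan_before(float("nan")), safe_isnan_after(float("nan")))  # True True
print(safe_isnan_before(pd.NA), safe_isnan_after(pd.NA))                # False True
print(safe_isnan_before(3.0), safe_isnan_after(3.0))                    # False False
```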