Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions auto_causal/data_generation/synthetic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

from auto_causal.data_generation.synthetic.generator import PSMGenerator, PSWGenerator, IVGenerator, RDDGenerator, RCTGenerator, DiDGenerator, MultiTreatRCTGenerator
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/sh

#  create_descriptions.sh
#  This script generates the column labels, backstory, and causal query for all the synthetic datasets.
#  It runs one create_context_*.sh script per method, in sequence.
#  All script paths are relative, so run it from the directory that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

echo "Generating context for RCT Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_rct.sh

# Path fixed: the package directory is data_generation (was misspelled "autodata_generation").
echo "Generating context for Multi-RCT Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_multi_rct.sh

echo "Generating context for Front_Door Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_front_door.sh

echo "Generating context for Observational Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_observational.sh

echo "Generating context for Canonical DiD Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_did_canonical.sh

echo "Generating context for TWFE DiD Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_did_twfe.sh

# Path fixed: the package directory is data_generation (was misspelled "rdata_generation").
echo "Generating context for IV Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_iv.sh

echo "Generating context for IV-Encouragement Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_iv_encouragement.sh

echo "Generating context for RDD Data"
bash auto_causal/data_generation/synthetic/create_context/create_context_rdd.sh

Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="did_canonical"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="did_twfe"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="frontdoor"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py \
python auto_causal/data_generation/synthetic/generate_context.py \
-mp ${METADATA_FOLDER} \
-d ${DATA_FOLDER} \
-o ${OUTPUT_FOLDER} \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="iv"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="iv_encouragement"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="multi_rct"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="observational"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="rct"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
# Created by Sawal Acharya on 5/14/25.
#

source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="rdd"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata/${METHOD}.json"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"
OUTPUT_FOLDER="${BASE_FOLDER}/${METHOD}/description"

python main/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
python auto_causal/data_generation/synthetic/generate_context.py -mp ${METADATA_FOLDER} -d ${DATA_FOLDER} -o ${OUTPUT_FOLDER} -m ${METHOD}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_did_canonical_data.sh
#  Generates the synthetic data and metadata for METHOD="did_canonical".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="did_canonical"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY_OTHERS} \
    -mc ${N_CONTINUOUS_DID_CANONICAL} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_did_twfe_data.sh
#  Generates the synthetic data and metadata for METHOD="did_twfe".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="did_twfe"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY_OTHERS} \
    -mc ${N_CONTINUOUS_DID_TWFE} \
    -np ${MAX_PERIODS} \
    -o ${DEFAULT_OBS_TWFE}
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
source reproduce_results/settings.sh
source auto_causal/data_generation/synthetic/settings.sh
METHOD="frontdoor"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

python main/generate_synthetic.py \
python auto_causal/data_generation/synthetic/generate_synthetic.py \
-md ${METADATA_FOLDER} \
-d ${DATA_FOLDER} \
-m ${METHOD} \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_iv_data.sh
#  Generates the synthetic data and metadata for METHOD="iv".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="iv"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY} \
    -mc ${N_CONTINUOUS_IV} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_iv_encouragement_data.sh
#  Generates the synthetic data and metadata for METHOD="iv_encouragement".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="iv_encouragement"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY_OTHERS} \
    -mc ${N_CONTINUOUS_IV_ENCOURAGEMENT} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_multi_rct_data.sh
#  Generates the synthetic data and metadata for METHOD="multi_rct".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="multi_rct"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY} \
    -mc ${N_CONTINUOUS_MULTI} \
    -nt ${MAX_TREATMENTS} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_observational_data.sh
#  Generates the synthetic data and metadata for METHOD="observational".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="observational"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY} \
    -mc ${N_CONTINUOUS} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_rct_data.sh
#  Generates the synthetic data and metadata for METHOD="rct".
#  BASE_FOLDER and the size/count variables are not set here; they are expected to be
#  defined by the sourced settings.sh. Paths are relative, so run from the directory
#  that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="rct"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY} \
    -mc ${N_CONTINUOUS} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

#  create_rdd_data.sh
#  Generates the synthetic data and metadata for METHOD="rdd".
#  BASE_FOLDER, CUTOFF and the size/count variables are not set here; they are expected
#  to be defined by the sourced settings.sh. Paths are relative, so run from the
#  directory that contains auto_causal/.
#
#  Created by Sawal Acharya on 5/14/25.
#

# `source` is a bash builtin, hence the bash shebang above (plain sh may not provide it).
source auto_causal/data_generation/synthetic/settings.sh
METHOD="rdd"
METADATA_FOLDER="${BASE_FOLDER}/${METHOD}/metadata"
DATA_FOLDER="${BASE_FOLDER}/${METHOD}/data"

# Path arguments are quoted so a BASE_FOLDER containing spaces is not word-split.
python auto_causal/data_generation/synthetic/generate_synthetic.py \
    -md "${METADATA_FOLDER}" \
    -d "${DATA_FOLDER}" \
    -m "${METHOD}" \
    -s ${DEFAULT_SIZE} \
    -mb ${N_BINARY_OTHERS} \
    -mc ${N_CONTINUOUS_RDD} \
    -c ${CUTOFF} \
    -o ${DEFAULT_OBS}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/sh

#  create_synthetic_data_all.sh
#  Runs every per-method data-generation script, one after another, announcing each
#  step on stdout first. Script paths are relative to the current working directory.
#
#  Created by Sawal Acharya on 5/14/25.
#

# Directory holding the per-method generator scripts.
CREATE_DATA_DIR="auto_causal/data_generation/synthetic/create_data"

# run_step MESSAGE SCRIPT
#   Prints MESSAGE, then executes SCRIPT (a file name inside CREATE_DATA_DIR) with bash.
run_step() {
    echo "$1"
    bash "${CREATE_DATA_DIR}/$2"
}

run_step "Generating RCT Data"              create_rct_data.sh
run_step "Generating Multi-RCT Data"        create_multi_rct_data.sh
run_step "Generating Front_Door Data"       create_front_door_data.sh
run_step "Generating Observational Data"    create_observational_data.sh
run_step "Generating Canonical DiD Data"    create_did_canonical_data.sh
run_step "Generating TWFE DiD Data"         create_did_twfe_data.sh
run_step "Generating IV Data"               create_iv_data.sh
run_step "Generating IV-Encouragement Data" create_iv_encouragement_data.sh
run_step "Generating RDD Data"              create_rdd_data.sh
68 changes: 68 additions & 0 deletions auto_causal/data_generation/synthetic/finalize_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
## This file contains the functions that creates the final synthetic dataset by renaming the columns. Additionally,
## it also create a csv file summarizing the information about the dataset including description, file name, query, etc.
## This csv file is used to create the input to the pipeline.

from argparse import ArgumentParser
from pathlib import Path
import pandas as pd
import json
import os
from tqdm import tqdm


def argument_parser(argv=None):
    """Parse the command-line options of the dataset finalisation script.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, which is the behaviour
            existing callers rely on.

    Returns:
        An ``argparse.Namespace`` with the string attributes
        ``input_data_path``, ``output_data_path``, ``output_path``,
        ``metadata_path``, ``description_json`` and ``method``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing.
    """
    parser = ArgumentParser()
    parser.add_argument("-id", "--input_data_path", type=str, required=True,
                        help="Path to the folder where the raw data is stored")
    parser.add_argument("-od", "--output_data_path", type=str, required=True,
                        help="Path to the folder where the processed data will be saved")
    parser.add_argument("-o", "--output_path", type=str, required=True,
                        help="Path to the folder where the summary will be saved")
    # The script opens this path and json.load()s it, so it must be a file,
    # not a folder as the help text used to say.
    parser.add_argument("-md", "--metadata_path", type=str, required=True,
                        help="Path to the json file containing the metadata")
    parser.add_argument("-de", "--description_json", type=str, required=True,
                        help="Path to the json file containing the description and queries")
    parser.add_argument("-m", "--method", type=str, required=True,
                        help="Method associated with the dataset")
    return parser.parse_args(argv)


if __name__ == "__main__":
    # Script entry point. For every CSV in the input folder: rename its columns
    # using the labels from the description JSON, write the renamed copy to the
    # output data folder, and record one summary row. The summary rows are then
    # written to "<method>_info.csv" in the output folder.

    args = argument_parser()

    # Both JSON files are read as dicts and indexed by CSV file name below.
    with open(args.description_json, 'r', encoding='utf-8') as f:
        description_json = json.load(f)

    with open(args.metadata_path, 'r', encoding='utf-8') as f:
        metadata_json = json.load(f)

    output_info_path = Path(args.output_path)
    output_info_path.mkdir(parents=True, exist_ok=True)

    output_data_path = Path(args.output_data_path)
    output_data_path.mkdir(parents=True, exist_ok=True)

    info_dict = {"paper_name": [], "data_description": [], "natural_language_query": [], "answer": [],
                 "method": [], "data_files": []}

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the summary CSV reproducible across runs and machines.
    csv_files = sorted(name for name in os.listdir(args.input_data_path)
                       if name.endswith('.csv'))

    for file in tqdm(csv_files):
        dataset_path = os.path.join(args.input_data_path, file)
        raw_df = pd.read_csv(dataset_path)
        # A missing entry raises KeyError on purpose: every dataset needs both
        # its metadata and its description to produce a summary row.
        metadata = metadata_json[file]
        info = description_json[file]
        var_names = info.get('variable_labels')
        # rename() returns a new DataFrame, so no explicit copy is needed.
        renamed_df = raw_df.rename(columns=var_names)
        info_dict["paper_name"].append("Synthetic Dataset")
        info_dict["data_description"].append(info.get('description'))
        info_dict["natural_language_query"].append(info.get('question'))
        info_dict["answer"].append(metadata.get('true_effect'))
        info_dict["method"].append(args.method)
        info_dict["data_files"].append(file)
        renamed_df.to_csv(output_data_path / file, index=False)

    summary_df = pd.DataFrame(info_dict)
    summary_df.to_csv(output_info_path / f"{args.method}_info.csv", index=False)
Loading