diff --git a/gemini/README.md b/gemini/README.md
new file mode 100644
index 0000000..308c0c4
--- /dev/null
+++ b/gemini/README.md
@@ -0,0 +1,59 @@
+# Gemini-generated data
+
+### Overview
+
+You'll find three scripts in this directory for creating egocentric video understanding data using Gemini:
+1. `prepare_ego4d_nlq_for_gemini.py`: prepares the Ego4D NLQ video clips for Gemini prompting.
+2. `generate_gemini_data.py`: prompts Gemini zero-shot with multimodal input to generate the training data. We used version `gemini-1.5-pro-preview-0409` for the published dataset, but have updated the default to `gemini-1.5-pro-001`.
+3. `prepare_ego4d_vqa_gemini_dataset.py`: post-processes the Gemini output to prepare it for training.
+
+### Prerequisites
+
+The following are required to run the above scripts:
+1. Ego4D access; request it [here](https://ego4d-data.org/docs/start-here/). The files `ego4d.json` and `nlq_train.json` are required locally, as are the AWS credentials for access to the videos.
+2. A [Vertex AI](https://cloud.google.com/vertex-ai) API key for prompting Gemini.
+3. A [GCS bucket](https://cloud.google.com/storage) for storing the output Ego4D NLQ clips used for prompting Gemini.
+
+### How to run
+
+Perform the following steps, executing from the `gemini` directory:
+```bash
+# Create and activate a virtual environment if you haven't already
+python -m venv venv
+source venv/bin/activate
+pip install -r ../requirements.txt
+
+# Prepare the Ego4D NLQ data; the video clips will be uploaded to GCS, ready for prompting with Vertex AI
+python ./prepare_ego4d_nlq_for_gemini.py \
+  --ego4d_path [relative path to ego4d.json] \  # Default: ../data/ego4d.json
+  --ego4d_nlq_path [relative path to nlq_train.json] \  # Default: ../data/nlq_train.json
+  --ego4d_output_videos_path [path to output clips] \  # Output video object path on GCS (and local path). Default: ego4d_vqa_gemini_videos
+  --output_json_path [path to output JSON file] \  # Default: ego4d_vqa_gemini.json
+  --ego4d_aws_access_key_id [EGO4D_AWS_ACCESS_KEY_ID] \  # Required, obtained from Ego4D
+  --ego4d_aws_secret_access_key [EGO4D_AWS_SECRET_ACCESS_KEY] \  # Required, obtained from Ego4D
+  --ego4d_aws_region_name [EGO4D_AWS_REGION_NAME] \  # Required, obtained from Ego4D
+  --gcs_bucket_name [GCS_BUCKET_NAME] \  # Required, GCS bucket the clips will be saved to
+  --keep-local-clips  # Optional flag to keep the clips locally (requires about 130 GB of storage)
+
+# Call Vertex AI to generate the training data
+python ./generate_gemini_data.py \
+  --gcs_project_id [GCS_PROJECT_ID] \  # Required, your Google Cloud project ID
+  --gcs_bucket_name [GCS_BUCKET_NAME] \  # Required, GCS bucket with the Ego4D NLQ clips
+  --gcs_location [GCS_LOCATION] \  # Required, GCS location to use with Vertex AI
+  --resume \  # Optional flag to resume from the last processed clip
+  --ego4d_vqa_gemini_path [path to Ego4D clips JSON file] \  # Output of the previous script. Default: ./ego4d_vqa_gemini.json
+  --output_path [path to output JSON file] \  # Default: gemini_responses.json
+  --gemini_model [GEMINI_MODEL] \  # Default: gemini-1.5-pro-001
+  --vertexai_quota [VERTEXAI_QUOTA]  # Vertex AI request quota per minute. Default: 5
+
+# Post-process the Gemini data to create the JSON used for training
+python ./prepare_ego4d_vqa_gemini_dataset.py \
+  --ego4d_path [path to ego4d.json] \  # Default: ../data/ego4d.json
+  --ego4d_nlq_path [path to nlq_train.json] \  # Default: ../data/nlq_train.json
+  --gemini_data_path [path to Gemini responses JSON file] \  # Output of the previous script. Default: gemini_responses.json
+  --output_path [path to output JSON file]  # Default: ego4d_nlq_train.gemini_pro_1.5.json
+```
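+
+You can sanity-check the raw Gemini output before post-processing. Each entry in `gemini_responses.json` pairs the source NLQ example with the raw response (`response.to_dict()` from the Vertex AI SDK). A minimal sketch, assuming the standard `GenerationResponse` dictionary layout of the SDK version above:
+
+```python
+import json
+
+with open("gemini_responses.json") as f:
+    responses = json.load(f)
+
+entry = responses[0]
+print(entry["example"]["id"])  # ID of the NLQ example the clip came from
+# The candidate/part layout follows GenerationResponse.to_dict();
+# adjust the path below if your SDK version differs
+print(entry["response"]["candidates"][0]["content"]["parts"][0]["text"])
+```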
+
+#### Note
+
+After performing human annotation, we manually replaced the Gemini-generated answers with the gold-standard answers for inclusion in the EVUD dataset.
\ No newline at end of file
diff --git a/gemini/generate_gemini_data.py b/gemini/generate_gemini_data.py
new file mode 100644
index 0000000..c75f0a2
--- /dev/null
+++ b/gemini/generate_gemini_data.py
@@ -0,0 +1,161 @@
+import argparse
+import json
+import math
+import os
+import time
+
+import vertexai
+from tqdm import tqdm
+from vertexai.generative_models import GenerativeModel, Part
+
+
+################################################################################
+# Instruction for prompting Gemini 1.5 Pro
+INSTRUCTION = """You are an intelligent embodied agent that can answer questions. You will be shown a video that was collected from a single location.
+
+Your task is to generate a question for each of the following categories: object recognition, attribute recognition, object state recognition, object localisation, spatial reasoning, functional reasoning, world knowledge.
+
+Ask diverse questions and give corresponding short answers. Include questions asking about the visual content of the video. The questions you pose can include the actions and behaviors of people or objects in the video, the chronological order of events, and causal relationships. Only include questions that have definite answers. Do not ask any questions that cannot be answered confidently.
+
+Don't use headers. You should use the following format for each category:
+Category:
+Question:
+Short answer:
+
+Assistant:
+"""
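+
+# NOTE: prepare_ego4d_vqa_gemini_dataset.py is assumed to parse the plain-text
+# "Category: / Question: / Short answer:" triples requested above, so keep
+# this output format stable if you reword the prompt.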
+
+
+################################################################################
+# Parse arguments
+parser = argparse.ArgumentParser(
+    description="Prompt Gemini 1.5 Pro to generate egocentric video understanding training data"
+)
+parser.add_argument(
+    "--gcs_project_id", type=str, required=True, help="Your Google Cloud project ID"
+)
+parser.add_argument(
+    "--gcs_bucket_name", type=str, required=True, help="GCS bucket with Ego4D NLQ clips"
+)
+parser.add_argument(
+    "--gcs_location", type=str, required=True, help="GCS location to use with Vertex AI"
+)
+parser.add_argument("--resume", action="store_true", help="Resume from the last processed clip")
+parser.add_argument(
+    "--ego4d_vqa_gemini_path",
+    type=str,
+    default="./ego4d_vqa_gemini.json",
+    help="Path to ego4d_vqa_gemini.json. Default: ./ego4d_vqa_gemini.json",
+)
+parser.add_argument(
+    "--output_path",
+    type=str,
+    default="gemini_responses.json",
+    help="Output path for Gemini responses. Default: gemini_responses.json",
+)
+parser.add_argument(
+    "--gemini_model",
+    type=str,
+    default="gemini-1.5-pro-001",
+    help="Gemini model ID. Default: gemini-1.5-pro-001",
+)
+parser.add_argument(
+    "--vertexai_quota",
+    type=int,
+    default=5,
+    help="Vertex AI request quota per minute. Default: 5",
+)
+
+args = parser.parse_args()
+GCS_PROJECT_ID = args.gcs_project_id
+GCS_BUCKET_NAME = args.gcs_bucket_name
+GCS_LOCATION = args.gcs_location
+RESUME = args.resume
+NLQ_VQA_PATH = args.ego4d_vqa_gemini_path
+OUTPUT_PATH = args.output_path
+GEMINI_MODEL = args.gemini_model
+QUOTA = args.vertexai_quota
+
+
+################################################################################
+# Load the NLQ VQA data
+with open(NLQ_VQA_PATH, "r") as file:
+    vqa = json.load(file)
+
+
+################################################################################
+# Initialize Vertex AI
+vertexai.init(project=GCS_PROJECT_ID, location=GCS_LOCATION)
+
+# Load the model
+gemini = GenerativeModel(GEMINI_MODEL)
+
+
+################################################################################
+# Resume progress
+if RESUME and os.path.exists(OUTPUT_PATH):
+    with open(OUTPUT_PATH, "r") as file:
+        responses = json.load(file)
+else:
+    responses = []
+
+processed_clips = [r["example"]["id"] for r in responses]
+
+if RESUME:
+    print("-----------------------------------------------------------------------")
+    print(f"Skipping {len(processed_clips)} already processed clips!")
+    print("-----------------------------------------------------------------------")
+
+
+################################################################################
+# Process examples
+time_limit = 60
+time_queue = []
+
+for idx, example in enumerate(tqdm(vqa, total=len(vqa))):
+    if example["id"] in processed_clips:
+        continue
+
+    # Record the start time of this request
+    start_time = time.time()
+
+    # Keep requests within the per-minute quota: once the sliding window holds
+    # QUOTA request durations, sleep off the remainder of the minute
+    if len(time_queue) == QUOTA:
+        total_time = sum(time_queue)
+        sleep_time = math.ceil(time_limit - total_time)
+        if sleep_time > 0:
+            time.sleep(sleep_time)
+        time_queue.pop(0)
+
+    clip_path = f"gs://{GCS_BUCKET_NAME}/{example['video_filename']}"
+    clip = Part.from_uri(uri=clip_path, mime_type="video/mp4")
+
+    # Retry once after a short back-off if the request fails
+    try:
+        response = gemini.generate_content([clip, INSTRUCTION])
+    except Exception:
+        time.sleep(10)
+        response = gemini.generate_content([clip, INSTRUCTION])
+
+    responses.append(
+        {
+            "example": example,
+            "response": response.to_dict(),
+        }
+    )
+
+    # Write intermediate results to the JSON file every five examples
+    if (idx + 1) % 5 == 0:
+        with open(OUTPUT_PATH, "w") as out_file:
+            json.dump(responses, out_file)
+
+    # Append the request duration to the queue used for rate limiting
+    time_queue.append(math.ceil(time.time() - start_time))
+
+# Store the final results to the JSON file
+with open(OUTPUT_PATH, "w") as out_file:
+    json.dump(responses, out_file)
+
+print("Done!")
diff --git a/gemini/prepare_ego4d_nlq_for_gemini.py b/gemini/prepare_ego4d_nlq_for_gemini.py
new file mode 100644
index 0000000..0201e5b
--- /dev/null
+++ b/gemini/prepare_ego4d_nlq_for_gemini.py
@@ -0,0 +1,216 @@
+import argparse
+import json
+import math
+import os
+
+import boto3
+from google.cloud import storage
+from moviepy.editor import VideoFileClip
+from tqdm import tqdm
+
+
+################################################################################
+# GCS utility function
+def upload_blob(bucket_name, source_file_name, destination_blob_name):
+    """Uploads a file to the bucket."""
+    try:
+        storage_client = storage.Client()
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(destination_blob_name)
+        blob.upload_from_filename(source_file_name, timeout=180)
+    except Exception as e:
+        print(
+            f"Failed to upload {source_file_name} to "
+            f"{bucket_name}/{destination_blob_name}: {e}"
+        )
+        raise
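+
+# NOTE: storage.Client() authenticates via Application Default Credentials, so
+# run `gcloud auth application-default login` (or set
+# GOOGLE_APPLICATION_CREDENTIALS) before invoking this script.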
+
+
+################################################################################
+# Parse arguments
+parser = argparse.ArgumentParser(
+    description="Process Ego4D NLQ train clips for use with Gemini"
+)
+parser.add_argument(
+    "--ego4d_path",
+    type=str,
+    default="../data/ego4d.json",
+    help="Path to ego4d.json. Default: ../data/ego4d.json",
+)
+parser.add_argument(
+    "--ego4d_nlq_path",
+    type=str,
+    default="../data/nlq_train.json",
+    help="Path to nlq_train.json. Default: ../data/nlq_train.json",
+)
+parser.add_argument(
+    "--ego4d_output_videos_path",
+    type=str,
+    default="ego4d_vqa_gemini_videos/",
+    help="Output video object path on GCS (and local path). Default: ego4d_vqa_gemini_videos/",
+)
+parser.add_argument(
+    "--output_json_path",
+    type=str,
+    default="ego4d_vqa_gemini.json",
+    help="Path to output JSON file. Default: ego4d_vqa_gemini.json",
+)
+parser.add_argument(
+    "--ego4d_aws_access_key_id",
+    type=str,
+    required=True,
+    help="Ego4D AWS access key ID, obtained from Ego4D",
+)
+parser.add_argument(
+    "--ego4d_aws_secret_access_key",
+    type=str,
+    required=True,
+    help="Ego4D AWS secret access key, obtained from Ego4D",
+)
+parser.add_argument(
+    "--ego4d_aws_region_name",
+    type=str,
+    required=True,
+    help="Ego4D AWS region name, obtained from Ego4D",
+)
+parser.add_argument(
+    "--gcs_bucket_name",
+    type=str,
+    required=True,
+    help="GCS bucket the clips will be saved to",
+)
+parser.add_argument(
+    "--keep-local-clips",
+    action="store_true",
+    help="Keep the clips locally (requires about 130 GB of storage)",
+)
+args = parser.parse_args()
+
+
+################################################################################
+# Load the data
+with open(args.ego4d_path) as in_file:
+    ego4d_videos = json.load(in_file)
+    video_uid2video = {video["video_uid"]: video for video in ego4d_videos["videos"]}
+
+with open(args.ego4d_nlq_path) as in_file:
+    ego4d_nlq = json.load(in_file)
+
+
+################################################################################
+# Process videos
+dataset = []
+
+# Initialize (truncate) the output JSON file
+with open(args.output_json_path, "w") as out_file:
+    pass
+
+s3 = boto3.client(
+    "s3",
+    aws_access_key_id=args.ego4d_aws_access_key_id,
+    aws_secret_access_key=args.ego4d_aws_secret_access_key,
+    region_name=args.ego4d_aws_region_name,
+)
+
+last_downloaded_video_filename = None
+
+for video in tqdm(ego4d_nlq["videos"], total=len(ego4d_nlq["videos"])):
+    for clip in video["clips"]:
+        for annotation in clip["annotations"]:
+            for language_query_index, language_query in enumerate(
+                annotation["language_queries"]
+            ):
+                idx = len(dataset)
+                try:
+                    s3_video_path_parts = video_uid2video[video["video_uid"]][
+                        "s3_path"
+                    ].split("/")
+                    s3_bucket_name = s3_video_path_parts[2]
+                    s3_key = "/".join(s3_video_path_parts[3:])
+                    s3_filename = s3_video_path_parts[-3]
+
+                    video_filename = os.path.join(
+                        args.ego4d_output_videos_path,
+                        video["video_uid"],
+                    )
+
+                    # Download the full video only when it changes, removing
+                    # the previous one to save disk space
+                    if video_filename != last_downloaded_video_filename:
+                        if last_downloaded_video_filename:
+                            os.remove(last_downloaded_video_filename)
+                        s3.download_file(s3_bucket_name, s3_key, video_filename)
+                        last_downloaded_video_filename = video_filename
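+
+                    # Clamp the query window to whole seconds within the
+                    # video: floor the start (bounded below by 0) and ceil
+                    # the end (bounded above by the video duration)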
clip["clip_uid"], + annotation["annotation_uid"], + f"{language_query_index}.mp4", + ) + + video_clip = VideoFileClip(last_downloaded_video_filename) + video_clip = video_clip.subclip(video_start_sec, video_end_sec) + video_clip.write_videofile( + clip_filename, remove_temp=True, logger=None + ) + + # Upload to GCS + if args.gcs_bucket_name: + upload_blob(args.gcs_bucket_name, clip_filename, clip_filename) + + human_value = (f"