Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
modified the code to handle much larger code files
  • Loading branch information
jeffmeloy authored Oct 10, 2023
1 parent dc834f5 commit f465d44
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 183 deletions.
54 changes: 26 additions & 28 deletions get_py2dataset_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import importlib
from typing import Dict, List
from pathlib import Path
from transformers import AutoTokenizer

# Setting up a basic logger
logging.basicConfig(level=logging.INFO)
Expand All @@ -59,104 +60,99 @@ def get_default_questions() -> List[Dict]:
questions = [
{
"id": "file_dependencies",
"text": "What are the dependencies of the Python file: '{filename}'?",
"type": "file"
},
{
"id": "internal_code_graph",
"text": "What is the call code graph of the Python file: '{filename}'?",
"text": "Dependencies of Python file: '{filename}'?",
"type": "file"
},
{
"id": "entire_code_graph",
"text": "What are the structural relationships between the functions and classes defined and used in the Python file: '{filename}'?",
"text": "Call code graph of Python file: '{filename}'?",
"type": "file"
},
{
"id": "file_functions",
"text": "What functions are defined in the Python file: '{filename}'?",
"text": "Functions defined in Python file: '{filename}'?",
"type": "file"
},
{
"id": "file_classes",
"text": "What classes are defined in the Python file: '{filename}'?",
"text": "Classes defined in Python file: '{filename}'?",
"type": "file"
},
{
"id": "function_inputs",
"text": "What are the inputs to the function: '{function_name}' in the Python file: '{filename}'?",
"text": "Inputs to function: '{function_name}' in Python file: '{filename}'?",
"type": "function"
},
{
"id": "function_docstring",
"text": "What is the docstring of the function: '{function_name}' in the Python file: '{filename}'?",
"text": "Docstring of function: '{function_name}' in Python file: '{filename}'?",
"type": "function"
},
{
"id": "function_calls",
"text": "What calls are made in the function: '{function_name}' in the Python file: '{filename}'?",
"text": "Calls made in function: '{function_name}' in Python file: '{filename}'?",
"type": "function"
},
{
"id": "function_variables",
"text": "What variables are defined in the function: '{function_name}' in the Python file: '{filename}'?",
"text": "Variables defined in function: '{function_name}' in Python file: '{filename}'?",
"type": "function"
},
{
"id": "function_returns",
"text": "What are the returned items from the function: '{function_name}' in the Python file: '{filename}'?",
"text": "Returned items from function: '{function_name}' in Python file: '{filename}'?",
"type": "function"
},
{
"id": "class_methods",
"text": "What are the methods defined within the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Methods defined in class: '{class_name}' in Python file: '{filename}'?",
"type": "class"
},
{
"id": "class_docstring",
"text": "What is the docstring of the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Docstring of class: '{class_name}' in Python file: '{filename}'?",
"type": "class"
},
{
"id": "class_attributes",
"text": "What are the attributes of the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Attributes of class: '{class_name}' in Python file: '{filename}'?",
"type": "class"
},
{
"id": "class_variables",
"text": "What variables are defined in the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Variables defined in class: '{class_name}' in Python file: '{filename}'?",
"type": "class"
},
{
"id": "class_inheritance",
"text": "What is the Inheritance of the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Inheritance of class: '{class_name}' in Python file: '{filename}'?",
"type": "class"
},
{
"id": "method_inputs",
"text": "What are the inputs to method: '{method_name}' in the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Inputs to method: '{method_name}' in class: '{class_name}' in Python file: '{filename}'?",
"type": "method"
},
{
"id": "method_docstring",
"text": "What is the docstring of the method: '{method_name}' in the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Docstring of method: '{method_name}' in class: '{class_name}' in Python file: '{filename}'?",
"type": "method"
},
{
"id": "method_calls",
"text": "What calls are made in the method: '{method_name}' in the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Calls made in method: '{method_name}' in class: '{class_name}' in Python file: '{filename}'?",
"type": "method"
},
{
"id": "method_returns",
"text": "What are the returns from the method: '{method_name}' in the class: '{class_name}' in the Python file: '{filename}'?",
"text": "Returns from method: '{method_name}' in class: '{class_name}' in Python file: '{filename}'?",
"type": "method"
},
{
"id": "file_purpose",
"text": "1) Describe the purpose and processing summary of the Python file: '{filename}; 2) Provide an itemized detailed description of each applicable function, class, and method; 3) Explain what each of input, output, and variable do within the file.",
"text": "1) DESCRIBE the purpose and processing summary of Python file: '{filename}'; 2) PROVIDE an itemized and detailed description of each applicable function, class, and method; 3) EXPLAIN what each input, output, and variable does in the code.",
"type": "file"
}
}
]
return questions

Expand All @@ -182,7 +178,7 @@ def get_default_model_config() -> Dict:
#"lib": "avx2",
"threads": 28,
"batch_size": 128,
"context_length": 8400,
"context_length": 14000,
"max_new_tokens": 8092,
"gpu_layers": 100,
"reset": True
Expand Down Expand Up @@ -237,6 +233,7 @@ def instantiate_model(model_config: Dict) -> object:
module_name, class_name = model_config['model_import_path'].rsplit('.', 1)
ModelClass = getattr(importlib.import_module(module_name), class_name)
model_params = model_config['model_params']
model_path = model_params['model_path']
inference_function_name = model_config['model_inference_function']
if inference_function_name != "":
inference_function = getattr(ModelClass, inference_function_name)
Expand All @@ -246,7 +243,7 @@ def instantiate_model(model_config: Dict) -> object:
return model
except ImportError or AttributeError or Exception as e:
logging.info(f"Failed to instantiate the model. Error: {e}")
return None
return None, None


def get_model(model_config_pathname: str) -> tuple[object, str]:
Expand All @@ -266,7 +263,8 @@ def get_model(model_config_pathname: str) -> tuple[object, str]:
except:
logging.info(f'Model config file not valid: {model_config_pathname} Using default model config')
model_config = get_default_model_config()
return instantiate_model(model_config['inference_model']), model_config['prompt_template']
model_config['model'] = instantiate_model(model_config['inference_model'])
return model_config


def write_questions_file(output_dir: str='') -> None:
Expand Down
94 changes: 50 additions & 44 deletions get_python_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,24 @@
Generates JSON format question-answer pairs and instructions for a Python file
Requirements:
[req01] The `DatasetGenerator` class shall:
a. Accept a Python file path (str), file details (Dict), base name (str), list of questions (List[Dict]), use_llm flag (bool), language model (object), and prompt (str) as input during instantiation.
b. Initialize and store the Python file path, file details, base name, list of questions, use_llm flag, language model, and prompt as class attributes.
a. Accept a Python file path (str), file details (Dict), base name (str), list of questions (List[Dict]), and model configuration (Dict) as input during instantiation.
b. Initialize and store the Python file path, file details, base name, list of questions, model configuration, language model, and use_llm flag as instance attributes.
c. Provide the `clean_and_get_unique_elements` method to clean an input string (str) and return a string of unique elements.
d. Provide the `add_to_list` method to add a response (str) to a list (List[Dict]).
e. Provide the `get_response_from_llm` method to retrieve a response from the language model.
f. Provide the `process_question` method to process a question and add the generated response to the instruct_list.
g. Provide the `process_question_type` method to process questions related to a file, function, class, or method.
h. Provide the `generate` method to generate responses for all questions in the list and return the instruct_list.
d. Provide the `get_response_from_llm` method to retrieve a response from the language model based on a query and context.
e. Provide the `process_question` method to process a question based on its type and generate a corresponding response to add to the instruct_list.
f. Provide the `process_question_type` method to process questions related to a file, function, class, or method.
g. Provide the `generate` method to generate responses for all questions in the list and return the instruct_list.
h. Internally manage a question mapping to correlate question types to keys in file details.
[req02] The `get_python_datasets` function shall:
a. Accept a Python file path (str), file details (Dict), base name (str), list of questions (List[Dict]), use_llm flag (bool), language model (object), and prompt (str) as input.
b. Create an instance of the `DatasetGenerator` class using the provided input.
a. Accept a Python file path (str), file details (Dict), base name (str), list of questions (List[Dict]), and model configuration (Dict) as input.
b. Instantiate an object of the `DatasetGenerator` class using the provided input.
c. Generate question-answer pairs and instructions using the `generate` method of the `DatasetGenerator` instance.
d. Return the generated `instruct_list`.
"""
import logging
import re
import math
from typing import Dict, List, Tuple

# Set up logging
Expand All @@ -27,6 +29,7 @@
)
logger = logging.getLogger(__name__)


class DatasetGenerator:
"""
Generate JSON formatted dictionary outputs for a Python file.
Expand Down Expand Up @@ -54,13 +57,13 @@ class DatasetGenerator:
generate() -> Tuple[List[Dict], List[Dict]]:
Generate responses for all the questions and return the instruct_list.
"""
def __init__(self, file_path: str, file_details: Dict, base_name: str, questions: List[Dict], llm: object, prompt: str) -> None:
def __init__(self, file_path: str, file_details: Dict, base_name: str, questions: List[Dict], model_config: Dict) -> None:
self.file_path = file_path
self.file_details = file_details
self.base_name = base_name
self.questions = questions
self.llm = llm
self.prompt = prompt
self.model_config = model_config
self.llm = model_config['model']
if self.llm is None:
self.use_llm = False
else:
Expand All @@ -86,26 +89,6 @@ def clean_and_get_unique_elements(input_str: str) -> str:
for element in re.sub(r'\s+', ' ', input_str).split(','))
return ', '.join(cleaned_elements)

@staticmethod
def add_to_list(list_to_update: List[Dict], query: str, response: str, additional_field=None) -> List[Dict]:
"""
Adds response to instruct list.
Args:
list_to_update (List[Dict]): The list to be updated.
query (str): The query to be added.
response (str): The response to be added.
additional_field (str): The additional field to be added.
Returns:
List[Dict]: The updated list.
"""
if response and response.strip() and response != 'None':
list_to_update.append(
{'instruction': query, 'input' : additional_field, 'output': response}
if additional_field else
{'question': query, 'answer': response}
)
return list_to_update

def get_response_from_llm(self, query: str, context: str) -> str:
"""
Get language model response to query for given context.
Expand All @@ -115,19 +98,39 @@ def get_response_from_llm(self, query: str, context: str) -> str:
Returns:
str: The generated response.
"""
# Update the context with the selected instructions from the instruct_list
excluded_instructions = ["What is the call code graph", "What is the docstring"]
filtered_instruct_list = [item for item in self.instruct_list if not any(item['instruction'].startswith(prefix) for prefix in excluded_instructions)]
past_instructs = "\n".join([f"Instruction: {item['instruction']} \nOutput: {item['output']}" for item in filtered_instruct_list])
full_context = context + "\n" + "Here's some detail about this code:" + "\n" + past_instructs

def get_context_and_prompt(query, context, code_qa):
full_context = f"{context}\nCODE Q and A:\n{code_qa}"
prompt = self.model_config['prompt_template'].format(context=full_context, query=query)
context_size = len(self.llm.tokenize(prompt))
return full_context, prompt, context_size

max_context_length = self.model_config['inference_model']['model_params']['context_length']
excluded_instructions = ["Call code graph", "Docstring"]
code_qa = "\n".join([f"Q: {item['instruction']} \nA: {item['output']}" for item in self.instruct_list if not any(item['instruction'].startswith(prefix) for prefix in excluded_instructions)])

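# Manage the context length for the LLM: try each context strategy in order, from the longest and most comprehensive to the shortest, and use the first whose prompt fits within 70% of the configured context_length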
context_strategies = [
lambda: '```python\n' + str(context) + '\n```',
lambda: '```python\n' + str(self.file_details['file_info']['file_code_simplified']) + '\n```',
lambda: self.get_string_from_info(self.file_details['file_info'], 'file_summary'),
lambda: ''
]
for strategy in context_strategies:
context = strategy()
full_context, prompt, context_size = get_context_and_prompt(query, context, code_qa)
if context_size <= 0.70 * max_context_length:
break
else:
logger.error(f'Failed to generate model response, adjust context_length > {math.ceil(context_size/0.70)} in py2dataset_model_config.yaml')
return ''

try:
prompt = self.prompt.format(context=full_context, query=query)
logging.info(f'Query: {query}')
response = self.llm(prompt)
response = re.sub(r'\n\s*\n', '\n\n', self.llm(prompt))
logging.info(f'Response: {response}')
except:
logger.error('Failed to generate model response')
response = ''
return response

def process_question(self, question_type: str, question_id: str, query: str, context: str, info: Dict) -> None:
Expand All @@ -142,10 +145,12 @@ def process_question(self, question_type: str, question_id: str, query: str, con
Returns:
None
"""
if question_id.endswith('code_graph'):
if question_id.endswith('code_graph') or question_id.endswith('docstring'):
response = info.get(question_id, {})
else:
response = self.get_response_from_llm(query, context) if self.use_llm and question_id.endswith('purpose') else self.clean_and_get_unique_elements(str(info.get(question_id, '')))
if question_type == 'file':
context = self.file_details['file_info']['file_code']
if response and response != 'None':
response_str = str(response).strip()
if response_str:
Expand All @@ -170,8 +175,8 @@ def process_question_type(self, question_type: str, question_id: str, question_t
"""
if question_type == 'file':
query = question_text.format(filename=self.base_name)
context = self.file_details['file_info']['file_code']
info = self.file_details['file_info']
context = self.file_details['file_info']['file_code']
self.process_question(question_type, question_id, query, context, info)
elif question_type == 'method':
for class_name, class_info in self.file_details['classes'].items():
Expand Down Expand Up @@ -200,6 +205,7 @@ def process_question_type(self, question_type: str, question_id: str, question_t
self.process_question(question_type, question_id, query, context, info)

def generate(self) -> Tuple[List[Dict], List[Dict]]:

"""
Generate responses for all the questions and return the instruct_list.
Args:
Expand All @@ -213,7 +219,7 @@ def generate(self) -> Tuple[List[Dict], List[Dict]]:


def get_python_datasets(file_path: str, file_details: Dict, base_name: str, questions: List[Dict],
llm: object, prompt: str) -> Tuple[List[Dict], List[Dict]]:
model_config: Dict) -> Tuple[List[Dict], List[Dict]]:
"""
Extract information from a Python file and return it in JSON format.
Args:
Expand All @@ -226,5 +232,5 @@ def get_python_datasets(file_path: str, file_details: Dict, base_name: str, ques
Returns:
Tuple[List[Dict], List[Dict]]: Extracted information in JSON format.
"""
generator = DatasetGenerator(file_path, file_details, base_name, questions, llm, prompt)
generator = DatasetGenerator(file_path, file_details, base_name, questions, model_config)
return generator.generate()
Loading

0 comments on commit f465d44

Please sign in to comment.