# agents.py -- forked from SamuelSchmidgall/AgentLaboratory
import re
import json

from utils import *
from tools import *
from inference import *


def extract_json_between_markers(llm_output):
    # Regular expression pattern to find JSON content between ```json and ```
    json_pattern = r"```json(.*?)```"
    matches = re.findall(json_pattern, llm_output, re.DOTALL)
    if not matches:
        # Fallback: try to find any JSON-like content in the output
        json_pattern = r"\{.*?\}"
        matches = re.findall(json_pattern, llm_output, re.DOTALL)
    for json_string in matches:
        json_string = json_string.strip()
        try:
            return json.loads(json_string)
        except json.JSONDecodeError:
            # Attempt to fix common JSON issues
            try:
                # Remove invalid control characters
                json_string_clean = re.sub(r"[\x00-\x1F\x7F]", "", json_string)
                return json.loads(json_string_clean)
            except json.JSONDecodeError:
                continue  # Try next match
    return None  # No valid JSON found
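
# Minimal usage sketch (the sample string below is illustrative, not real model output):
#   sample = 'THOUGHT: solid.\n```json\n{"Overall": 7, "Decision": "Accept"}\n```'
#   extract_json_between_markers(sample)  # -> {'Overall': 7, 'Decision': 'Accept'}
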
def get_score(outlined_plan, latex, reward_model_llm, reviewer_type=None, attempts=3, openai_api_key=None):
    e = str()
    for _attempt in range(attempts):
        try:
            # todo: have a reward function here
            # template inherited from the AI Scientist (good work on this prompt Sakana AI team :D)
            template_instructions = """
Respond in the following format:
THOUGHT:
<THOUGHT>
REVIEW JSON:
```json
<JSON>
```
In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
Detail your high-level arguments, necessary choices and desired outcomes of the review.
Do not make generic comments here, but be specific to your current paper.
Treat this as the note-taking phase of your review.
In <JSON>, provide the review in JSON format with the following fields in the order:
- "Summary": A summary of the paper content and its contributions.
- "Strengths": A list of strengths of the paper.
- "Weaknesses": A list of weaknesses of the paper.
- "Originality": A rating from 1 to 4 (low, medium, high, very high).
- "Quality": A rating from 1 to 4 (low, medium, high, very high).
- "Clarity": A rating from 1 to 4 (low, medium, high, very high).
- "Significance": A rating from 1 to 4 (low, medium, high, very high).
- "Questions": A set of clarifying questions to be answered by the paper authors.
- "Limitations": A set of limitations and potential negative societal impacts of the work.
- "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
- "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
- "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
- "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
- "Overall": A rating from 1 to 10 (very strong reject to award quality).
- "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
- "Decision": A decision that has to be one of the following: Accept, Reject.
For the "Decision" field, don't use Weak Accept, Borderline Accept, Borderline Reject, or Strong Reject. Instead, only use Accept or Reject.
This JSON will be automatically parsed, so ensure the format is precise.
"""
neurips_form = ("""
## Review Form
Below is a description of the questions you will be asked on the review form for each paper and some guidelines on what to consider when answering these questions.
When writing your review, please keep in mind that after decisions have been made, reviews and meta-reviews of accepted papers and opted-in rejected papers will be made public.
1. Summary: Briefly summarize the paper and its contributions. This is not the place to critique the paper; the authors should generally agree with a well-written summary.
- Strengths and Weaknesses: Please provide a thorough assessment of the strengths and weaknesses of the paper, touching on each of the following dimensions:
- Originality: Are the tasks or methods new? Is the work a novel combination of well-known techniques? (This can be valuable!) Is it clear how this work differs from previous contributions? Is related work adequately cited?
- Quality: Is the submission technically sound? Are claims well supported (e.g., by theoretical analysis or experimental results)? Are the methods used appropriate? Is this a complete piece of work or work in progress? Are the authors careful and honest about evaluating both the strengths and weaknesses of their work?
- Clarity: Is the submission clearly written? Is it well organized? (If not, please make constructive suggestions for improving its clarity.) Does it adequately inform the reader? (Note that a superbly written paper provides enough information for an expert reader to reproduce its results.)
- Significance: Are the results important? Are others (researchers or practitioners) likely to use the ideas or build on them? Does the submission address a difficult task in a better way than previous work? Does it advance the state of the art in a demonstrable way? Does it provide unique data, unique conclusions about existing data, or a unique theoretical or experimental approach?
2. Questions: Please list and carefully describe any questions and suggestions for the authors. Think of the things where a response from the author can change your opinion, clarify a confusion, or address a limitation. This can be very important for a productive rebuttal and discussion phase with the authors.
3. Limitations: Have the authors adequately addressed the limitations and potential negative societal impact of their work? If not, please include constructive suggestions for improvement.
In general, authors should be rewarded rather than punished for being up front about the limitations of their work and any potential negative societal impact. You are encouraged to think through whether any critical points are missing and provide these as feedback for the authors.
4. Ethical concerns: If there are ethical issues with this paper, please flag the paper for an ethics review. For guidance on when this is appropriate, please review the NeurIPS ethics guidelines.
5. Soundness: Please assign the paper a numerical rating on the following scale to indicate the soundness of the technical claims, experimental and research methodology and on whether the central claims of the paper are adequately supported with evidence.
4: excellent
3: good
2: fair
1: poor
6. Presentation: Please assign the paper a numerical rating on the following scale to indicate the quality of the presentation. This should take into account the writing style and clarity, as well as contextualization relative to prior work.
4: excellent
3: good
2: fair
1: poor
7. Contribution: Please assign the paper a numerical rating on the following scale to indicate the quality of the overall contribution this paper makes to the research area being studied. Are the questions being asked important? Does the paper bring a significant originality of ideas and/or execution? Are the results valuable to share with the broader NeurIPS community?
4: excellent
3: good
2: fair
1: poor
8. Overall: Please provide an "overall score" for this submission. Choices:
10: Award quality: Technically flawless paper with groundbreaking impact on one or more areas of AI, with exceptionally strong evaluation, reproducibility, and resources, and no unaddressed ethical considerations.
9: Very Strong Accept: Technically flawless paper with groundbreaking impact on at least one area of AI and excellent impact on multiple areas of AI, with flawless evaluation, resources, and reproducibility, and no unaddressed ethical considerations.
8: Strong Accept: Technically strong paper, with novel ideas, excellent impact on at least one area of AI or high-to-excellent impact on multiple areas of AI, with excellent evaluation, resources, and reproducibility, and no unaddressed ethical considerations.
7: Accept: Technically solid paper, with high impact on at least one sub-area of AI or moderate-to-high impact on more than one area of AI, with good-to-excellent evaluation, resources, reproducibility, and no unaddressed ethical considerations.
6: Weak Accept: Technically solid, moderate-to-high impact paper, with no major concerns with respect to evaluation, resources, reproducibility, ethical considerations.
5: Borderline accept: Technically solid paper where reasons to accept outweigh reasons to reject, e.g., limited evaluation. Please use sparingly.
4: Borderline reject: Technically solid paper where reasons to reject, e.g., limited evaluation, outweigh reasons to accept, e.g., good evaluation. Please use sparingly.
3: Reject: For instance, a paper with technical flaws, weak evaluation, inadequate reproducibility and incompletely addressed ethical considerations.
2: Strong Reject: For instance, a paper with major technical flaws, and/or poor evaluation, limited impact, poor reproducibility and mostly unaddressed ethical considerations.
1: Very Strong Reject: For instance, a paper with trivial results or unaddressed ethical considerations.
9. Confidence: Please provide a "confidence score" for your assessment of this submission to indicate how confident you are in your evaluation. Choices:
5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully.
4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work.
3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.
You must make sure that all sections are properly created: abstract, introduction, methods, results, and discussion. Points must be deducted from your scores if any of these sections are missing.
            """ + template_instructions)
            if reviewer_type is None: reviewer_type = ""
            sys = (
                "You are an AI researcher who is reviewing a paper that was submitted to a prestigious ML venue. "
                f"Be critical and cautious in your decision. {reviewer_type}\n"
            ) + neurips_form
            scoring = query_model(
                model_str=f"{reward_model_llm}",
                system_prompt=sys,
                openai_api_key=openai_api_key,
                prompt=(
                    f"Outlined in the following text is the research plan that the machine learning engineer was tasked with building: {outlined_plan}\n\n"
                    f"The following text is the research latex that the model produced: \n{latex}\n\n"),
                temp=0.0)
            review_json = extract_json_between_markers(scoring)
            # Normalize each rating by its maximum possible value.
            overall = int(review_json["Overall"]) / 10
            soundness = int(review_json["Soundness"]) / 4
            confidence = int(review_json["Confidence"]) / 5
            contribution = int(review_json["Contribution"]) / 4
            presentation = int(review_json["Presentation"]) / 4
            clarity = int(review_json["Clarity"]) / 4
            originality = int(review_json["Originality"]) / 4
            quality = int(review_json["Quality"]) / 4
            significance = int(review_json["Significance"]) / 4
            clarity_weight = 0.1
            quality_weight = 0.1
            overall_weight = 1.0
            soundness_weight = 0.1
            confidence_weight = 0.1
            originality_weight = 0.1
            significance_weight = 0.1
            contribution_weight = 0.4
            presentation_weight = 0.2
            # Maximum possible weighted score, used to normalize the final score to [0, 10].
            max_score = (
                clarity_weight + quality_weight + overall_weight + soundness_weight + confidence_weight
                + originality_weight + significance_weight + contribution_weight + presentation_weight)
            performance = ((
                soundness_weight * soundness + presentation_weight * presentation
                + confidence_weight * confidence + contribution_weight * contribution
                + overall_weight * overall + originality_weight * originality
                + significance_weight * significance + clarity_weight * clarity
                + quality_weight * quality) / max_score) * 10
            return performance, f"The performance of your submission is: {performance}" + scoring, True
        except Exception as ex:
            # Keep the error and retry instead of failing on the first malformed response.
            e = str(ex)
            print(e)
    return 0, e, False
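
# Worked example of the weighted score (illustrative numbers only): with
# Overall=7, Contribution=3, Presentation=3, Confidence=4, and Soundness,
# Clarity, Originality, Quality, Significance all 3, the normalized ratios are
# 0.7, 0.75, 0.75, 0.8, and 0.75 respectively, so
#   performance = (1.0*0.7 + 0.4*0.75 + 0.2*0.75 + 0.1*(0.8 + 5*0.75)) / 2.2 * 10
# which evaluates to roughly 7.3.
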

class ReviewersAgent:
    def __init__(self, model="gpt-4o-mini", notes=None, openai_api_key=None):
        if notes is None: self.notes = []
        else: self.notes = notes
        self.model = model
        self.openai_api_key = openai_api_key

    def inference(self, plan, report):
        reviewer_1 = "You are a harsh but fair reviewer and expect good experiments that lead to insights for the research topic."
        review_1 = get_score(outlined_plan=plan, latex=report, reward_model_llm=self.model, reviewer_type=reviewer_1, openai_api_key=self.openai_api_key)

        reviewer_2 = "You are a harsh and critical but fair reviewer who is looking for an idea that would be impactful in the field."
        review_2 = get_score(outlined_plan=plan, latex=report, reward_model_llm=self.model, reviewer_type=reviewer_2, openai_api_key=self.openai_api_key)

        reviewer_3 = "You are a harsh but fair open-minded reviewer that is looking for novel ideas that have not been proposed before."
        review_3 = get_score(outlined_plan=plan, latex=report, reward_model_llm=self.model, reviewer_type=reviewer_3, openai_api_key=self.openai_api_key)

        return f"Reviewer #1:\n{review_1}, \nReviewer #2:\n{review_2}, \nReviewer #3:\n{review_3}"

class BaseAgent:
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        if notes is None: self.notes = []
        else: self.notes = notes
        self.max_steps = max_steps
        self.model = model
        self.phases = []
        self.plan = str()
        self.report = str()
        self.history = list()
        self.prev_comm = str()
        self.prev_report = str()
        self.exp_results = str()
        self.dataset_code = str()
        self.results_code = str()
        self.lit_review_sum = str()
        self.interpretation = str()
        self.prev_exp_results = str()
        self.reviewer_response = str()
        self.prev_results_code = str()
        self.prev_interpretation = str()
        self.openai_api_key = openai_api_key
        self.second_round = False
        self.max_hist_len = 15

    def set_model_backbone(self, model):
        self.model = model

    @staticmethod
    def clean_text(text):
        """
        Fix minor corrections
        :return: (str) corrected text
        """
        text = text.replace("```\n", "```")
        return text
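
    # Example: clean_text("a```\nb") -> "a```b" (drops the newline that
    # immediately follows a triple-backtick fence).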

    def inference(self, research_topic, phase, step, feedback="", temp=None):
        sys_prompt = f"""You are {self.role_description()} \nTask instructions: {self.phase_prompt(phase)}\n{self.command_descriptions(phase)}"""  # \n{self.example_command(phase)}
        context = self.context(phase)
        history_str = "\n".join([_[1] for _ in self.history])
        phase_notes = [_note for _note in self.notes if phase in _note["phases"]]
        notes_str = f"Notes for the task objective: {phase_notes}\n" if len(phase_notes) > 0 else ""
        complete_str = str()
        if step / (self.max_steps - 1) > 0.7: complete_str = "You must finish this task and submit as soon as possible!"
        prompt = (
            f"""{context}\n{'~' * 10}\nHistory: {history_str}\n{'~' * 10}\n"""
            f"Current Step #{step}, Phase: {phase}\n{complete_str}\n"
            f"[Objective] Your goal is to perform research on the following topic: {research_topic}\n"
            f"Feedback: {feedback}\nNotes: {notes_str}\nYour previous command was: {self.prev_comm}. Make sure your new output is very different.\nPlease produce a single command below:\n")
        model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, temp=temp, openai_api_key=self.openai_api_key)
        print("^" * 50, phase, "^" * 50)
        model_resp = self.clean_text(model_resp)
        self.prev_comm = model_resp
        steps_exp = None
        if feedback is not None and "```EXPIRATION" in feedback:
            steps_exp = int(feedback.split("\n")[0].replace("```EXPIRATION ", ""))
            feedback = extract_prompt(feedback, "EXPIRATION")
        self.history.append((steps_exp, f"Step #{step}, Phase: {phase}, Feedback: {feedback}, Your response: {model_resp}"))
        # Count down and remove history entries that carry an expiration.
        for _i in reversed(range(len(self.history))):
            if self.history[_i][0] is not None:
                self.history[_i] = (self.history[_i][0] - 1, self.history[_i][1])
                if self.history[_i][0] < 0:
                    self.history.pop(_i)
        if len(self.history) >= self.max_hist_len:
            self.history.pop(0)
        return model_resp
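
    # Feedback whose first line is "```EXPIRATION N" is kept in history for
    # only N further steps before the countdown above drops it. Illustrative
    # (hypothetical) feedback string:
    #   "```EXPIRATION 3\nThis note disappears from history after 3 steps.\n```"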

    def reset(self):
        self.history.clear()  # Clear the history list
        self.prev_comm = ""

    def context(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

    def phase_prompt(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

    def role_description(self):
        raise NotImplementedError("Subclasses should implement this method.")

    def command_descriptions(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

    def example_command(self, phase):
        raise NotImplementedError("Subclasses should implement this method.")

class ProfessorAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = ["report writing"]

    def generate_readme(self):
        sys_prompt = f"""You are {self.role_description()} \n Here is the written paper \n{self.report}. Task instructions: Your goal is to integrate all of the knowledge, code, reports, and notes provided to you and generate a readme.md for a github repository."""
        history_str = "\n".join([_[1] for _ in self.history])
        prompt = (
            f"""History: {history_str}\n{'~' * 10}\n"""
            f"Please produce the readme below in markdown:\n")
        model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, openai_api_key=self.openai_api_key)
        # Strip a surrounding markdown code fence if the model added one.
        model_resp = model_resp.strip()
        if model_resp.startswith("```markdown"):
            model_resp = model_resp[len("```markdown"):]
        if model_resp.endswith("```"):
            model_resp = model_resp[:-3]
        return model_resp.strip()

    def context(self, phase):
        # sr_str = str()
        # if self.second_round:
        #     sr_str = (
        #         f"The following are results from the previous experiments\n"
        #         f"Previous Experiment code: {self.prev_results_code}\n"
        #         f"Previous Results: {self.prev_exp_results}\n"
        #         f"Previous Interpretation of results: {self.prev_interpretation}\n"
        #         f"Previous Report: {self.prev_report}\n"
        #         f"{self.reviewer_response}\n\n\n"
        #     )
        # if phase == "report writing":
        #     return sr_str + (
        #         f"Current Literature Review: {self.lit_review_sum}\n"
        #         f"Current Plan: {self.plan}\n"
        #         f"Current Dataset code: {self.dataset_code}\n"
        #         f"Current Experiment code: {self.results_code}\n"
        #         f"Current Results: {self.exp_results}\n"
        #         f"Current Interpretation of results: {self.interpretation}\n"
        #     )
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return (
            "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
            "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\n<Insert command here>\n``` where COMMAND is the specific command you want to run (e.g. REPORT, DIALOGUE).\n")

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return (
            "When you believe a good report has been arrived at between you and the PhD student you can use the following command to end the dialogue and submit the report ```LATEX\nreport here\n```\n where report here is the actual report written in compilable latex to be transmitted and LATEX is just the word LATEX.\n"
            "Your report should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance. You must propagate this information accurately. You must also submit the report promptly. Do not delay too long.\n"
            "You must be incredibly detailed about what you did for the experiment and all of the findings.\n"
        )

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        phase_str = (
            "You are directing a PhD student to help them write a report in latex based on results from an experiment, and you interact with them through dialogue.\n"
            "Your goal is to write a report in latex for an experiment. You should read through the code, read through the interpretation, and look at the results to understand what occurred. You should then discuss with the PhD student how they can write up the results and give their feedback to improve their thoughts.\n"
        )
        return phase_str

    def role_description(self):
        return "a computer science professor at a top university."

class PostdocAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = ["plan formulation", "results interpretation"]

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "plan formulation":
            return sr_str + f"Current Literature Review: {self.lit_review_sum}"
        elif phase == "results interpretation":
            return sr_str + (
                f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}\n"
                f"Current Dataset code: {self.dataset_code}\n"
                f"Current Experiment code: {self.results_code}\n"
                f"Current Results: {self.exp_results}"
            )
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "plan formulation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When you believe a good plan has been arrived at between you and the PhD student you can use the following command to end the dialogue and submit the plan ```PLAN\nplan here\n```\n where plan here is the actual plan to be transmitted and PLAN is just the word PLAN. Plan here should provide a clear outline for how to achieve the task, including what machine learning models to use and implement, what types of datasets should be searched for and used to train the model, and the exact details of the experiment.\n"
                "You can only use a SINGLE command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, NOT BOTH.\n"
                "Make sure not to produce too much dialogue and to submit a plan in a reasonable amount of time.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. PLAN, DIALOGUE).\n"
            )
        elif phase == "results interpretation":
            return (
                "When you believe a good interpretation has been arrived at between you and the PhD student you can use the following command to end the dialogue and submit the interpretation ```INTERPRETATION\ninterpretation here\n```\n where interpretation here is the actual interpretation to be transmitted and INTERPRETATION is just the word INTERPRETATION. Please provide an INTERPRETATION in a reasonable amount of time.\n"
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "You must submit the interpretation during this phase in a reasonable amount of time. Do not delay the submission.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. INTERPRETATION, DIALOGUE).\n"
            )

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "plan formulation":
            phase_str = (
                "You are directing a PhD student to help them come up with a good plan, and you interact with them through dialogue.\n"
                "Your goal is to produce plans that would make good experiments for the given topic. You should aim for a very simple experiment that showcases your plan, not a complex one. You should integrate the provided literature review and come up with plans on how to expand and build on these works for the given topic. Your plans should provide a clear outline for how to achieve the task, including what machine learning models to use and implement, what types of datasets should be searched for and used to train the model, and the exact details of the experiment.\n"
            )
        elif phase == "results interpretation":
            phase_str = (
                "You are directing a PhD student to help them come up with an interpretation for results from an experiment, and you interact with them through dialogue.\n"
                "Your goal is to interpret results from experiments that were previously run. You should read through the code and look at the results to understand what occurred. You should then discuss with the PhD student how they can interpret the results and give their feedback to improve their thoughts. You should integrate the provided literature review, code, and plans to come up with an exciting interpretation that could make a compelling paper. Your plans should provide a clear outline that can be used to write an academic paper.\n"
                "Your interpretation should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance. You must propagate this information accurately. You must also complete this in a reasonable amount of time and then submit your results.\n"
            )
        return phase_str

    def role_description(self):
        return "a computer science postdoctoral researcher at a top university."

class MLEngineerAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = [
            "data preparation",
            "running experiments",
        ]

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "data preparation":
            return sr_str + (
                f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}")
        # elif phase == "running experiments":
        #     return sr_str + (
        #         f"Current Literature Review: {self.lit_review_sum}\n"
        #         f"Current Plan: {self.plan}\n"
        #         f"Current Dataset code: {self.dataset_code}\n"
        #     )
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "data preparation":
            return (
                "You can produce code using the following command: ```python\ncode here\n```\n where code here is the actual code you will execute in a Python terminal, and python is just the word python. Try to incorporate some print functions. Do not use any classes or functions. If your code returns any errors, they will be provided to you, and you are also able to see print statements. You will receive all print statement results from the code. Make sure function variables are created inside the function or passed as a function parameter.\n"  # Try to avoid creating functions.
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where dialogue here is the actual dialogue you will send, and DIALOGUE is just the word DIALOGUE.\n"
                "You also have access to HuggingFace datasets. You can search the datasets repository using the following command: ```SEARCH_HF\nsearch query here\n``` where search query here is the query used to search HuggingFace datasets, and SEARCH_HF is the word SEARCH_HF. This will return a list of HuggingFace dataset descriptions which can be loaded into Python using the datasets library. Your code MUST use an external HuggingFace dataset.\n"
                "You MUST use a HuggingFace dataset in your code. DO NOT CREATE A MAIN FUNCTION. Try to make the code very simple.\n"
                "You can only use a SINGLE command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, NOT BOTH.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. python, DIALOGUE, SEARCH_HF).\n")
        return ()

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        phase_str = ""
        if phase == "data preparation":
            phase_str = (
                "You are a machine learning engineer being directed by a PhD student who will help you write the code, and you can interact with them through dialogue.\n"
                "Your goal is to produce code that prepares the data for the provided experiment. You should aim for simple code to prepare the data, not complex code. You should integrate the provided literature review and the plan and come up with code to prepare data for this experiment.\n"
            )
        return phase_str

    def role_description(self):
        return "a machine learning engineer working at a top university."

class SWEngineerAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = [
            "data preparation",
        ]

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "data preparation":
            return sr_str + (
                f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}")
        return ""

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "data preparation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When you and the ML engineer have finalized your dataset preparation code and are ready to submit the final code, please use the following command: ```SUBMIT_CODE\ncode here\n```\n where 'code here' is the finalized code you will send and SUBMIT_CODE is just the word SUBMIT_CODE. Do not use any classes or functions. The submitted code must have a HuggingFace dataset import and must use an external HuggingFace dataset. If your code returns any errors, they will be provided to you, and you are also able to see print statements. Make sure function variables are created inside the function or passed as a function parameter. DO NOT CREATE A MAIN FUNCTION.\n"
                "Make sure to submit code in a reasonable amount of time. Do not make the code too complex; keep it simple, and submit it early.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. SUBMIT_CODE, DIALOGUE).\n")
        return ""

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "data preparation":
            phase_str = (
                "You are a software engineer directing a machine learning engineer, where the machine learning engineer will be writing the code, and you can interact with them through dialogue.\n"
                "Your goal is to help the ML engineer produce code that prepares the data for the provided experiment. You should aim for very simple code to prepare the data, not complex code. You should integrate the provided literature review and the plan and come up with code to prepare data for this experiment.\n"
            )
        return phase_str

    def role_description(self):
        return "a software engineer working at a top university."

class PhDStudentAgent(BaseAgent):
    def __init__(self, model="gpt-4o-mini", notes=None, max_steps=100, openai_api_key=None):
        super().__init__(model, notes, max_steps, openai_api_key)
        self.phases = [
            "literature review",
            "plan formulation",
            "running experiments",
            "results interpretation",
            "report writing",
            "report refinement",
        ]
        self.lit_review = []

    def context(self, phase):
        sr_str = str()
        if self.second_round:
            sr_str = (
                f"The following are results from the previous experiments\n"
                f"Previous Experiment code: {self.prev_results_code}\n"
                f"Previous Results: {self.prev_exp_results}\n"
                f"Previous Interpretation of results: {self.prev_interpretation}\n"
                f"Previous Report: {self.prev_report}\n"
                f"{self.reviewer_response}\n\n\n"
            )
        if phase == "plan formulation":
            return sr_str + f"Current Literature Review: {self.lit_review_sum}"
        elif phase == "data preparation":
            return sr_str + (
                f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}"
            )
        elif phase == "results interpretation":
            return sr_str + (
                f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}\n"
                f"Current Dataset code: {self.dataset_code}\n"
                f"Current Experiment code: {self.results_code}\n"
                f"Current Results: {self.exp_results}"
            )
        elif phase == "report refinement":
            return sr_str + (
                f"Current Literature Review: {self.lit_review_sum}\n"
                f"Current Plan: {self.plan}\n"
                f"Current Dataset code: {self.dataset_code}\n"
                f"Current Experiment code: {self.results_code}\n"
                f"Current Results: {self.exp_results}\n"
                f"Current Interpretation of results: {self.interpretation}"
            )
        elif phase == "literature review":
            return sr_str
        else:
            return ""

    def requirements_txt(self):
        sys_prompt = f"""You are {self.role_description()} \nTask instructions: Your goal is to integrate all of the knowledge, code, reports, and notes provided to you and generate a requirements.txt for a github repository for all of the code."""
        history_str = "\n".join([_[1] for _ in self.history])
        prompt = (
            f"""History: {history_str}\n{'~' * 10}\n"""
            f"Please produce the requirements.txt below:\n")
        model_resp = query_model(model_str=self.model, system_prompt=sys_prompt, prompt=prompt, openai_api_key=self.openai_api_key)
        return model_resp

    def example_command(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        return ()

    def command_descriptions(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "literature review":
            return (
                "To collect paper summaries, use the following command: ```SUMMARY\nSEARCH QUERY\n```\n where SEARCH QUERY is a string that will be used to find papers with semantically similar content and SUMMARY is just the word SUMMARY. Make sure your search queries are very short.\n"
                "To get the full paper text for an arXiv paper, use the following command: ```FULL_TEXT\narXiv paper ID\n```\n where arXiv paper ID is the ID of the arXiv paper (which can be found by using the SUMMARY command), and FULL_TEXT is just the word FULL_TEXT. Make sure to read the full text using the FULL_TEXT command before adding it to your list of relevant papers.\n"
                "If you believe a paper is relevant to the research project proposal, you can add it to the official review after reading using the following command: ```ADD_PAPER\narXiv_paper_ID\nPAPER_SUMMARY\n```\nwhere arXiv_paper_ID is the ID of the arXiv paper, PAPER_SUMMARY is a brief summary of the paper, and ADD_PAPER is just the word ADD_PAPER. You can only add one paper at a time.\n"
                "Make sure to use ADD_PAPER when you see a relevant paper. DO NOT use SUMMARY too many times.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "Make sure to extensively discuss the experimental results in your summary.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. ADD_PAPER, FULL_TEXT, SUMMARY). Do not use the word COMMAND; use the actual command, e.g. your command should look exactly like this: ```ADD_PAPER\ntext\n``` (where the command could be ADD_PAPER, FULL_TEXT, or SUMMARY)\n")
        elif phase == "plan formulation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. DIALOGUE).\n"
            )
        elif phase == "data preparation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When you and the ML engineer have finalized your dataset preparation code and are ready to submit the final code, please use the following command: ```SUBMIT_CODE\ncode here\n```\n where 'code here' is the finalized code you will send and SUBMIT_CODE is just the word SUBMIT_CODE. Do not use any classes or functions. The submitted code must have a HuggingFace dataset import and must use an external HuggingFace dataset. If your code returns any errors, they will be provided to you, and you are also able to see print statements. Make sure function variables are created inside the function or passed as a function parameter. DO NOT CREATE A MAIN FUNCTION.\n"
                "Make sure to submit code in a reasonable amount of time. Do not make the code too complex; keep it simple, and submit it early.\n"
                "You can only use a single command per inference turn. Do not use more than one command per inference. If you use multiple commands, then only one of them will be executed, not both.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. SUBMIT_CODE, DIALOGUE).\n")
        elif phase == "results interpretation":
            return (
                "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
                "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. DIALOGUE).\n"
            )
        # elif phase == "report writing":
        #     return (
        #         "You can produce dialogue using the following command: ```DIALOGUE\ndialogue here\n```\n where 'dialogue here' is the actual dialogue you will send and DIALOGUE is just the word DIALOGUE.\n"
        #         "When performing a command, make sure to include the three ticks (```) at the top and bottom ```COMMAND\ntext\n``` where COMMAND is the specific command you want to run (e.g. DIALOGUE).\n")
        elif phase == "report refinement":
            return ""
        return ""

    def phase_prompt(self, phase):
        if phase not in self.phases:
            raise Exception(f"Invalid phase: {phase}")
        if phase == "literature review":
            phase_str = (
                "Your goal is to perform a literature review for the presented task and add papers to the literature review.\n"
                "You have access to arXiv and can perform two search operations: (1) finding many different paper summaries from a search query and (2) getting a single full paper text for an arXiv paper.\n"
            )
            rev_papers = "Papers in your review so far: " + " ".join([_paper["arxiv_id"] for _paper in self.lit_review])
            phase_str += rev_papers if len(self.lit_review) > 0 else ""
        elif phase == "plan formulation":
            phase_str = (
                "You are a PhD student being directed by a postdoc who will help you come up with a good plan, and you interact with them through dialogue.\n"
                "Your goal is to produce plans that would make good experiments for the given topic. You should aim for a very simple experiment that showcases your plan, not a complex one. You should integrate the provided literature review and come up with plans on how to expand and build on these works for the given topic. Your plans should provide a clear outline for how to achieve the task, including what machine learning models to use and implement, what types of datasets should be searched for and used to train the model, and the exact details of the experiment.\n"
            )
        elif phase == "results interpretation":
            phase_str = (
                "You are a PhD student being directed by a postdoc who will help you come up with an interpretation for results from an experiment, and you interact with them through dialogue.\n"
                "Your goal is to interpret results from experiments that were previously run. You should read through the code and look at the results to understand what occurred. You should then discuss with the postdoc your interpretation and use their feedback to improve your thoughts. You should integrate the provided literature review, code, and plans to come up with an exciting interpretation that could make a compelling paper. Your plans should provide a clear outline that can be used to write an academic paper.\n"
                "Your interpretation should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance. You must propagate this information accurately.\n"
                "You must submit the interpretation during this phase in a reasonable amount of time. Do not delay the submission.\n"
            )
        # elif phase == "report writing":
        #     phase_str = (
        #         "You are a PhD student being directed by a professor who will help you write a report based on results from an experiment, and you interact with them through dialogue.\n"
        #         "Your goal is to write a report for an experiment entirely in latex. You should read through the code, read through the interpretation, and look at the results to understand what occurred. You should then discuss with the professor how you can write up the results and receive their feedback to improve your thoughts.\n"
        #         "Your report should include numbers, relevant metrics to the experiment (e.g. accuracy or loss) and measures of significance in latex. You must propagate this information accurately.\n"
        #         "You must be incredibly detailed about what you did for the experiment and all of the findings.\n"
        #     )
        elif phase == "report refinement":
            phase_str = (
                "You are a PhD student who has submitted their paper to an ML conference called ICLR. Your goal was to write a research paper and get high scores from the reviewers so that it gets accepted to the conference.\n"
            )
        else:
            phase_str = ""
        return phase_str

    def role_description(self):
        return "a computer science PhD student at a top university."

    def add_review(self, review, arx_eng):
        try:
            arxiv_id, review_text = review.strip().split("\n", 1)
            full_text = arx_eng.retrieve_full_paper_text(arxiv_id)
            review_entry = {
                "arxiv_id": arxiv_id,
                "full_text": full_text,
                "summary": review_text,
            }
            self.lit_review.append(review_entry)
            return f"Successfully added paper {arxiv_id}", full_text
        except Exception as e:
            return f"Error trying to add review -- bad formatting, try again: {str(e)}", ""

    def format_review(self):
        return "Provided here is a literature review on this topic:\n" + "\n".join(
            f"arXiv ID: {_l['arxiv_id']}, Summary: {_l['summary']}"
            for _l in self.lit_review)
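

# End-to-end usage sketch (hypothetical driver; the real orchestration lives
# elsewhere in this repository):
#   phd = PhDStudentAgent(model="gpt-4o-mini", openai_api_key=api_key)
#   resp = phd.inference("transformer interpretability", "literature review", step=0)
#   print(resp)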