CommitSuite/CMG_eval_binary_metrics.py at master · security-pride/CommitSuite · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import json
import os
from tqdm import tqdm
from openai import OpenAI
import csv
import concurrent.futures
from threading import Lock

# Maximum number of workers (adjust based on API rate limits)
MAX_WORKERS = 15

# Initialize the OpenAI-compatible client against the DeepSeek endpoint.
# The key is taken from the DEEPSEEK_API_KEY environment variable so a real
# secret never has to be written into this file; the "sk-xxx" placeholder is
# kept as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "sk-xxx"),
    base_url="https://api.deepseek.com",
)


def build_evaluation_prompt(entry):
    """Assemble the reviewer prompt for one commit entry.

    The prompt consists of a fixed instruction header, one section per item in
    ``entry["modified_files"]`` (paths, change type, diff), optional JSON dumps
    of ``issues`` / ``prs`` / ``comments`` when those keys hold truthy values,
    the generated commit message from ``entry["CMG_result"]``, and a closing
    instruction.

    Args:
        entry: Mapping describing a single commit.

    Returns:
        The complete prompt as a single string.
    """
    header = """You are a code reviewer evaluating the quality of an AI-generated commit message.
You are given the following information:
- Modified files (with diffs)
- Possibly related issues, pull requests, or comments
- Possibly AST changes
- The generated commit message (CMG_result)

You should assess the generated commit message from five perspectives:
  1. Rationality: Whether it contains "why" information (describe the reasons for the changes).
  2. Comprehensiveness: Whether it contains "what" information (summarize the changes in this commit) and covers all affected files.
  3. Non-redundancy: Whether there is no semantic repetition, mergable details, meaningless content (unrelated to "what" and "why"), or content of little use.
  4. Authenticity: Whether it does not include modifications absent in the actual code changes.
  5. Logicality: Whether the content in it is reasonable and logical.

Please answer using the following JSON format (Use 1 for "yes", 0 for "no"; Don't miss any fields):
{
  "Rationality": 0,
  "Comprehensiveness": 0,
  "Non-redundancy": 0,
  "Authenticity": 0,
  "Logicality": 0
}

Here is the information:

Modified Files:
"""
    pieces = [header]

    # One section per modified file; missing keys render as "None".
    for changed in entry.get("modified_files", []):
        pieces.append(f"\nFile: {changed.get('old_path')} -> {changed.get('new_path')}")
        pieces.append(f"\nChange Type: {changed.get('change_type')}")
        pieces.append(f"\nDiff:\n{changed.get('diff')}\n")

    # Optional context sections, emitted only when the entry carries data.
    optional_sections = (
        ("issues", "Related Issues"),
        ("prs", "Related PRs"),
        ("comments", "Related Comments"),
    )
    for field, title in optional_sections:
        if entry.get(field):
            pieces.append(f"\n{title}:\n{json.dumps(entry[field], indent=2)}")

    # Add this part as appropriate.
    # if entry.get("ast_changes"):
    #     pieces.append(f"\nAST Changes (in JSON format):\n{json.dumps(entry['ast_changes'], indent=2)}")

    pieces.append(f"\n\nGenerated Commit Message:\n{entry.get('CMG_result', '')}\n")
    pieces.append(
        "\nNow, answer the 5 questions in JSON format (please be as strict as possible and don't make any explanations):"
    )
    return "".join(pieces)


# The five binary metrics the model is asked to score, in report order.
_SCORE_KEYS = (
    "Rationality",
    "Comprehensiveness",
    "Non-redundancy",
    "Authenticity",
    "Logicality",
)


def _clean_llm_output(raw):
    """Strip Markdown code fences and surrounding whitespace from a reply."""
    return (raw or "").replace('```json', '').replace('```', '').strip()


def _parse_scores(content):
    """Extract the five integer scores from the model's reply text.

    The whole (fence-stripped) reply is tried as JSON first. If that fails,
    the span from the first ``{`` to the last ``}`` is tried, so a reply that
    wraps the JSON object in extra prose is still usable.

    Args:
        content: Raw or already cleaned reply text; ``None`` is treated as "".

    Returns:
        A dict mapping every metric name to an int (missing metrics default
        to 0), or ``None`` when no usable JSON object could be found.
    """
    text = _clean_llm_output(content)
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(parsed, dict):
            continue
        try:
            return {key: int(parsed.get(key, 0)) for key in _SCORE_KEYS}
        except (TypeError, ValueError, OverflowError):
            # A value such as null, a list or "yes" cannot be scored.
            continue
    return None


def call_llm(prompt):
    """Send the evaluation prompt to the chat model and return its scores.

    Args:
        prompt: The full user prompt text.

    Returns:
        A dict with an int for each of the five metrics. When the reply
        cannot be parsed, a message is printed and every metric is 0.
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a professional software engineer."},
            {"role": "user", "content": prompt}
        ],
        stream=False
    )
    # The message content may be absent; treat that like an unparsable reply
    # instead of failing on ``None.strip()``.
    content = _clean_llm_output(response.choices[0].message.content)
    print(content)

    scores = _parse_scores(content)
    if scores is None:
        print("Failed to parse LLM output:", content)
        return dict.fromkeys(_SCORE_KEYS, 0)
    return scores


class ThreadSafeData:
    """Running per-metric totals guarded by a lock.

    ``summary`` maps each of the five metric names to the sum of all scores
    passed to :meth:`update` so far.
    """

    def __init__(self):
        self.lock = Lock()
        self.summary = dict.fromkeys(
            (
                "Rationality",
                "Comprehensiveness",
                "Non-redundancy",
                "Authenticity",
                "Logicality",
            ),
            0,
        )

    def update(self, scores):
        """Add one entry's scores to the totals while holding the lock.

        Args:
            scores: Mapping that must contain every metric name in
                ``summary``; extra keys are ignored.
        """
        with self.lock:
            for metric, running_total in self.summary.items():
                self.summary[metric] = running_total + scores[metric]


def process_entry(entry, safe_data):
    """Score a single entry and fold the result into the shared totals.

    Args:
        entry: The commit entry dict; it is updated in place with the scores.
        safe_data: Accumulator whose ``update`` receives the scores.

    Returns:
        The same ``entry`` object with score fields added, or ``None`` when
        any step raised (the error is printed, not propagated).
    """
    try:
        scores = call_llm(build_evaluation_prompt(entry))
        safe_data.update(scores)
        entry.update(scores)
    except Exception as exc:
        # Best effort: one failing entry must not abort the whole run.
        print(f"Error processing {entry.get('hash')}: {exc!s}")
        return None
    return entry


def evaluate_file(file_in, file_out_json, file_out_csv, output_dir="./CMG_reports"):
    """Evaluate every entry of a JSON file and write detailed and summary reports.

    Entries are scored concurrently with up to ``MAX_WORKERS`` threads. The
    per-entry results (sorted by their ``"hash"`` value) go to a JSON file and
    the per-metric averages go to a one-row CSV file, both inside
    ``output_dir``.

    Args:
        file_in: Path of the input JSON file; presumably a list of entry
            dicts (it is iterated and passed to ``len``).
        file_out_json: File name for the detailed JSON report.
        file_out_csv: File name for the CSV summary.
        output_dir: Directory that receives both reports; created if missing.

    Raises:
        KeyError: If a successfully processed entry has no ``"hash"`` key.
    """
    with open(file_in, "r", encoding="utf-8") as f:
        data = json.load(f)

    safe_data = ThreadSafeData()
    results = []
    total = len(data)

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_entry, entry, safe_data) for entry in data]

        with tqdm(total=total, desc=f"Evaluating {file_in}") as pbar:
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                # process_entry returns None for entries that failed.
                if result is not None:
                    results.append(result)
                pbar.update(1)

    # Completion order is nondeterministic; sort for a stable report.
    results.sort(key=lambda x: x["hash"])

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, file_out_json), "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Averages are taken over all input entries, including failed ones.
    # An empty input yields zeros rather than a ZeroDivisionError.
    averages = {
        key: round(value / total, 4) if total else 0.0
        for key, value in safe_data.summary.items()
    }

    with open(os.path.join(output_dir, file_out_csv), "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(averages))
        writer.writeheader()
        writer.writerow(averages)


# Run the evaluation: read CMG_result.json and write the detailed JSON report
# plus the CSV summary of per-metric averages.
evaluate_file(
    file_in="CMG_result.json",
    file_out_json="CMG_eval_binary_metrics.json",
    file_out_csv="CMG_eval_binary_metrics.csv",
)