Skip to content

Commit 4181182

Browse files
Clean names script
1 parent 0ea8a32 commit 4181182

File tree

1 file changed

+133
-0
lines changed

1 file changed

+133
-0
lines changed

clean_and_split.py

+133
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import glob
2+
import os
3+
import sys
4+
import random
5+
import javalang
6+
import numpy as np
7+
from tqdm import tqdm
8+
from math import ceil
9+
from shutil import copyfile as cp
10+
11+
# Fraction of the (shuffled) input files that goes to the training set.
TRAIN_SPLIT = 0.8
# Fraction of the input files that goes to the validation set; the test set
# receives whatever is left after training and validation are taken.
TEST_VAL_SPLIT = 0.1

# Minimum number of times a method name must be seen to include it in the
# dataset.
MIN_NUM = 5
16+
17+
def copy_files(files, folder):
    """Copy every path in ``files`` into ``<out_dir>/<folder>``.

    Each copy is named after its position in ``files`` (``0.java``,
    ``1.java``, ...), so the original file names are not preserved.
    ``out_dir`` is read from module scope.
    """
    for index, source in enumerate(files):
        target_name = str(index) + ".java"
        cp(source, os.path.join(out_dir, folder, target_name))
20+
21+
def add_to_method_map(m_name):
    """Increment the occurrence count of ``m_name`` in the module-level ``m_names`` dict.

    A name seen for the first time is inserted with a count of 1.
    """
    m_names[m_name] = m_names.get(m_name, 0) + 1
26+
27+
def get_all_methods(f_name):
    """Return the method declarations found in the Java source file ``f_name``.

    The file is read as raw bytes and handed to ``javalang.parse.parse``; the
    result is the list produced by filtering the parsed tree for
    ``javalang.tree.MethodDeclaration`` (callers in this file unpack each
    element as ``(path, node)``).

    Parsing is best effort: if parsing or filtering raises one of the
    exceptions listed below, an empty list is returned instead. Errors from
    opening or reading the file itself are not caught.
    """
    with open(f_name, "rb") as source_file:
        raw_source = source_file.read()

    # Failures that cause the file to be skipped rather than abort the run.
    skippable_errors = (
        javalang.parser.JavaSyntaxError,
        javalang.tokenizer.LexerError,
        AttributeError,
        TypeError,
        RecursionError,
        StopIteration,
    )

    try:
        parsed = javalang.parse.parse(raw_source)
        return list(parsed.filter(javalang.tree.MethodDeclaration))
    except skippable_errors:
        return []
40+
41+
def split_by_token(name):
    """Split an identifier into its camelCase / snake_case sub-tokens.

    A new token starts at an uppercase character that directly follows a
    lowercase one, and at every underscore. Underscores act purely as
    delimiters: they are dropped and never appear in the returned tokens, and
    leading, trailing or repeated underscores do not produce empty tokens.
    Character case is left untouched, and runs of uppercase letters are not
    split (``"parseXMLFile"`` -> ``["parse", "XMLFile"]``).

    Args:
        name: The identifier to split.

    Returns:
        A list of non-empty token strings, in order of appearance. An empty
        or underscore-only ``name`` yields an empty list.

    >>> split_by_token("getFooBar")
    ['get', 'Foo', 'Bar']
    >>> split_by_token("foo_bar")
    ['foo', 'bar']
    """
    tokens = []
    token = ""
    prev = ""

    for c in name:
        if c == "_":
            # Delimiter: close the current token (if any) and discard the
            # underscore itself so it does not leak into the next token.
            if token:
                tokens.append(token)
            token = ""
        elif c.isupper() and prev.islower() and token:
            # lower -> Upper transition marks a camelCase boundary.
            tokens.append(token)
            token = c
        else:
            token += c

        prev = c

    if token:
        tokens.append(token)

    return tokens
61+
62+
# ---------------------------------------------------------------------------
# Command-line handling.
#
#   python clean_and_split.py IN_DIR OUT_DIR split
#   python clean_and_split.py IN_DIR OUT_DIR clean [vec|seq]
#
# IN_DIR, OUT_DIR and the mode are all required, so anything shorter than
# four argv entries is a usage error and must stop the script.
# ---------------------------------------------------------------------------
if len(sys.argv) < 4:
    print("USAGE: python clean_and_split.py IN_DIR OUT_DIR split|clean [vec|seq]")
    sys.exit(1)

data_dir = sys.argv[1]
out_dir = sys.argv[2]

split_or_clean = sys.argv[3]
split, clean, vec = False, False, False

if split_or_clean == "split":
    split = True
elif split_or_clean == "clean":
    clean = True
    # "vec" counts whole method names; any other value (default "seq")
    # counts the sub-tokens produced by split_by_token.
    vec_or_seq = sys.argv[4] if len(sys.argv) > 4 else "seq"
    vec = vec_or_seq == "vec"
else:
    print("command not accepted")
    sys.exit(1)


# Every file found anywhere below data_dir (no extension filtering).
all_files = []
# Name (or sub-token) -> number of occurrences; filled by add_to_method_map.
m_names = {}

for dirpath, _dirnames, filenames in os.walk(data_dir):
    all_files.extend(os.path.join(dirpath, _file) for _file in filenames)

if clean:
    # Count how often each method name / sub-token occurs across all files.
    for _file in tqdm(all_files):
        for _path, node in get_all_methods(_file):
            names = [node.name] if vec else split_by_token(node.name)

            for name in names:
                add_to_method_map(name)

    # Keep only the names seen at least MIN_NUM times.
    m_clean = {k: v for k, v in m_names.items() if v >= MIN_NUM}
    print("total", len(m_names), "clean", len(m_clean))

    # One name per line. UTF-8 is set explicitly because Java identifiers may
    # contain non-ASCII characters that a platform default codec can reject.
    with open("clean_names.txt", "w", encoding="utf-8") as f:
        f.write("".join(k + "\n" for k in m_clean))

    # TODO: clean files here by putting each method in a new file?

if split:
    random.shuffle(all_files)

    total = len(all_files)
    train_end = ceil(TRAIN_SPLIT * total)
    val_end = train_end + ceil(TEST_VAL_SPLIT * total)

    train = all_files[:train_end]
    val = all_files[train_end:val_end]
    # The test set is whatever remains after training and validation.
    test = all_files[val_end:]

    # makedirs(exist_ok=True) creates out_dir and each subfolder as needed and
    # tolerates ones that already exist, so reruns and pre-created output
    # directories both work.
    for subset in ("training", "test", "validation"):
        os.makedirs(os.path.join(out_dir, subset), exist_ok=True)

    copy_files(train, "training")
    copy_files(test, "test")
    copy_files(val, "validation")

0 commit comments

Comments
 (0)