@@ -20,37 +20,44 @@ def count_file_lines(file_path):
20
20
def prepare():
    """Convert per-language ``{lang}/{split}.jsonl`` files into parallel
    source/target text files under ``processed/``.

    For every (language, split) pair this writes two line-aligned files:
      - ``processed/{split}.{lang}-en_XX.{lang}``: one whitespace-normalized
        code sequence per line (re-tokenized from raw code for python/java)
      - ``processed/{split}.{lang}-en_XX.en_XX``: the matching docstring

    Examples whose code or docstring normalizes to the empty string (or whose
    python/java re-tokenization is empty) are skipped entirely, so the two
    files always stay line-aligned.
    """
    # Collapse any run of whitespace (newline/CR/tab/space) into one space.
    # Compiled once instead of re.sub'ing a fresh pattern per example.
    _ws = re.compile(r"[\n\r\t ]+")

    for lang in ['go', 'java', 'python', 'ruby', 'javascript', 'php']:
        for split in ['train', 'valid', 'test']:
            # Context managers guarantee both writers are closed even if an
            # example raises; the original relied on manual close() calls.
            with open(
                'processed/{}.{}-en_XX.{}'.format(split, lang, lang),
                'w', encoding='utf-8'
            ) as src_writer, open(
                'processed/{}.{}-en_XX.en_XX'.format(split, lang),
                'w', encoding='utf-8'
            ) as tgt_writer:
                filename = '{}/{}.jsonl'.format(lang, split)
                with open(filename) as f:
                    for line in tqdm(
                            f, total=count_file_lines(filename),
                            desc="{}-{}".format(lang, split)
                    ):
                        ex = json.loads(line.strip())
                        code = _ws.sub(" ", ' '.join(ex['code_tokens'])).strip()
                        docstring = _ws.sub(" ", ' '.join(ex['docstring_tokens'])).strip()
                        # Skip examples that would emit an empty line in either
                        # file and break the parallel alignment.
                        if len(code) == 0 or len(docstring) == 0:
                            continue

                        tokenized_code = None
                        if lang in ('python', 'java'):
                            # Prefer re-tokenizing the raw code over the
                            # dataset-provided `code_tokens` for these langs.
                            _tokens = (tokenize_python(ex['code'])
                                       if lang == 'python'
                                       else tokenize_java(ex['code']))
                            tokenized_code = _ws.sub(" ", ' '.join(_tokens)).strip()
                            if len(tokenized_code) == 0:
                                continue

                        if tokenized_code is not None:
                            try:
                                # Writing re-tokenized code can raise
                                # UnicodeEncodeError on exotic characters; fall
                                # back to the dataset tokens in that case.
                                # (Was a bare `except:` that swallowed every
                                # exception, including KeyboardInterrupt.)
                                src_writer.write(tokenized_code + '\n')
                            except UnicodeEncodeError:
                                src_writer.write(code + '\n')
                        else:
                            src_writer.write(code + '\n')

                        tgt_writer.write(docstring + '\n')
0 commit comments