Commit f4278e5

adding baseline tokenization script

1 parent ec0ae30, commit f4278e5
21 files changed: +4158 -1 lines changed

.gitignore
+3 -1

@@ -2,4 +2,6 @@
 *.lst
 .idea/*
 *.iml
-*.xml
+*.xml
+*.pyc
+
+10

@@ -0,0 +1,10 @@
requires landscape|boolean (){ return false; }
get parent key|Object (){ return new ContactsUiKey(); }
get parent key|Object (){ return new ContactsUiKey(); }
get layout id|int (){ return R.layout.loose_screen; }
get parent key|Object (){ return new EditContactKey(contactId); }
to contact|Contact (){ return new Contact(id, name, email); }
to string|String (){ return "Welcome!\nClick to continue."; }
get parent key|Object (){ return new EditContactKey(contactId); }
tear down services|void (@NonNull Services services){ }
get layout id|int (){ return R.layout.landscape_screen; }
+8

@@ -0,0 +1,8 @@

from . import parser
from . import parse
from . import tokenizer
from . import javadoc


__version__ = "0.10.1"

baseline_tokenization/javalang/ast.py

+78
@@ -0,0 +1,78 @@
import pickle

import six


class MetaNode(type):
    def __new__(mcs, name, bases, dict):
        attrs = list(dict['attrs'])
        dict['attrs'] = list()

        for base in bases:
            if hasattr(base, 'attrs'):
                dict['attrs'].extend(base.attrs)

        dict['attrs'].extend(attrs)

        return type.__new__(mcs, name, bases, dict)


@six.add_metaclass(MetaNode)
class Node(object):
    attrs = ()

    def __init__(self, **kwargs):
        values = kwargs.copy()

        for attr_name in self.attrs:
            value = values.pop(attr_name, None)
            setattr(self, attr_name, value)

        if values:
            raise ValueError('Extraneous arguments')

    def __equals__(self, other):
        if type(other) is not type(self):
            return False

        for attr in self.attrs:
            if getattr(other, attr) != getattr(self, attr):
                return False

        return True

    def __repr__(self):
        return type(self).__name__

    def __iter__(self):
        return walk_tree(self)

    def filter(self, pattern):
        for path, node in self:
            if ((isinstance(pattern, type) and isinstance(node, pattern)) or
                    (node == pattern)):
                yield path, node

    @property
    def children(self):
        return [getattr(self, attr_name) for attr_name in self.attrs]

def walk_tree(root):
    children = None

    if isinstance(root, Node):
        yield (), root
        children = root.children
    else:
        children = root

    for child in children:
        if isinstance(child, (Node, list, tuple)):
            for path, node in walk_tree(child):
                yield (root,) + path, node

def dump(ast, file):
    pickle.dump(ast, file)

def load(file):
    return pickle.load(file)
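
For reference, a minimal usage sketch of the Node/walk_tree machinery above (not part of the commit). The Literal and BinaryOp classes are hypothetical and only illustrate how MetaNode accumulates attrs across the class hierarchy and how filter() yields (path, node) pairs; the sketch assumes the vendored package is importable as javalang:

from javalang import ast  # assumed import path for the module above

class Literal(ast.Node):
    attrs = ("value",)  # MetaNode merges this with the base class attrs

class BinaryOp(ast.Node):
    attrs = ("operator", "left", "right")

expr = BinaryOp(operator="+",
                left=Literal(value="1"),
                right=Literal(value="2"))

# filter() walks the tree and yields (path, node) for every Literal it finds
for path, node in expr.filter(Literal):
    print(len(path), node)  # depth of each match and its repr, e.g. "1 Literal"
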
+120
@@ -0,0 +1,120 @@

import re

def join(s):
    return ' '.join(l.strip() for l in s.split('\n'))

class DocBlock(object):
    def __init__(self):
        self.description = ''
        self.return_doc = None
        self.params = []

        self.authors = []
        self.deprecated = False

        # @exception and @throw are equivalent
        self.throws = {}
        self.exceptions = self.throws

        self.tags = {}

    def add_block(self, name, value):
        value = value.strip()

        if name == 'param':
            try:
                param, description = value.split(None, 1)
            except ValueError:
                param, description = value, ''
            self.params.append((param, join(description)))

        elif name in ('throws', 'exception'):
            try:
                ex, description = value.split(None, 1)
            except ValueError:
                ex, description = value, ''
            self.throws[ex] = join(description)

        elif name == 'return':
            self.return_doc = value

        elif name == 'author':
            self.authors.append(value)

        elif name == 'deprecated':
            self.deprecated = True

        self.tags.setdefault(name, []).append(value)

blocks_re = re.compile('(^@)', re.MULTILINE)
leading_space_re = re.compile(r'^\s*\*', re.MULTILINE)
blocks_justify_re = re.compile(r'^\s*@', re.MULTILINE)

def _sanitize(s):
    s = s.strip()

    if not (s[:3] == '/**' and s[-2:] == '*/'):
        raise ValueError('not a valid Javadoc comment')

    s = s.replace('\t', ' ')

    return s

def _uncomment(s):
    # Remove /** and */
    s = s[3:-2].strip()

    return leading_space_re.sub('', s)

def _get_indent_level(s):
    return len(s) - len(s.lstrip())

def _left_justify(s):
    lines = s.rstrip().splitlines()

    if not lines:
        return ''

    indent_levels = []
    for line in lines:
        if line.strip():
            indent_levels.append(_get_indent_level(line))
    indent_levels.sort()

    common_indent = indent_levels[0]
    if common_indent == 0:
        return s
    else:
        lines = [line[common_indent:] for line in lines]
        return '\n'.join(lines)

def _force_blocks_left(s):
    return blocks_justify_re.sub('@', s)

def parse(raw):
    sanitized = _sanitize(raw)
    uncommented = _uncomment(sanitized)
    justified = _left_justify(uncommented)
    justified_fixed = _force_blocks_left(justified)
    prepared = justified_fixed

    blocks = blocks_re.split(prepared)

    doc = DocBlock()

    if blocks[0] != '@':
        doc.description = blocks[0].strip()
        blocks = blocks[2::2]
    else:
        blocks = blocks[1::2]

    for block in blocks:
        try:
            tag, value = block.split(None, 1)
        except ValueError:
            tag, value = block, ''

        doc.add_block(tag, value)

    return doc
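
As a quick illustration (not part of the commit), the parse() helper above turns a raw Javadoc comment into a DocBlock. The comment text below is made up for the example, and the import path assumes the vendored package is importable as javalang:

from javalang import javadoc  # assumed import path for the module above

comment = """/**
 * Returns the contact with the given id.
 *
 * @param contactId id of the contact to look up
 * @return the matching Contact
 * @throws NotFoundException if no contact exists
 */"""

doc = javadoc.parse(comment)
print(doc.description)  # Returns the contact with the given id.
print(doc.params)       # [('contactId', 'id of the contact to look up')]
print(doc.return_doc)   # the matching Contact
print(doc.throws)       # {'NotFoundException': 'if no contact exists'}
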
+53
@@ -0,0 +1,53 @@

from .parser import Parser
from .tokenizer import tokenize

def parse_expression(exp):
    if not exp.endswith(';'):
        exp = exp + ';'

    tokens = tokenize(exp)
    parser = Parser(tokens)

    return parser.parse_expression()

def parse_member_signature(sig):
    if not sig.endswith(';'):
        sig = sig + ';'

    tokens = tokenize(sig)
    parser = Parser(tokens)

    return parser.parse_member_declaration()

def parse_constructor_signature(sig):
    # Add an empty body to the signature, replacing a ; if necessary
    if sig.endswith(';'):
        sig = sig[:-1]
    sig = sig + '{ }'

    tokens = tokenize(sig)
    parser = Parser(tokens)

    return parser.parse_member_declaration()

def parse_type(s):
    tokens = tokenize(s)
    parser = Parser(tokens)

    return parser.parse_type()

def parse_type_signature(sig):
    if sig.endswith(';'):
        sig = sig[:-1]
    sig = sig + '{ }'

    tokens = tokenize(sig)
    parser = Parser(tokens)

    return parser.parse_class_or_interface_declaration()

def parse(s):
    tokens = tokenize(s)
    parser = Parser(tokens)
    return parser.parse()
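
For context, a baseline tokenization script would typically call into these helpers roughly as follows. This is a hedged sketch, not part of the commit: the Java snippet is made up, the import path assumes the vendored package is importable as javalang, and the expected outputs reflect upstream javalang 0.10.1 behavior:

import javalang

# Tokenize a Java snippet; a baseline script would compare these token
# values against its own tokenizer's output.
tokens = list(javalang.tokenizer.tokenize('int total = a + b;'))
print([t.value for t in tokens])  # ['int', 'total', '=', 'a', '+', 'b', ';']

# Parse a lone expression; parse_expression() appends the missing ';' itself.
expr = javalang.parse.parse_expression('a + b')
print(type(expr).__name__)        # BinaryOperation (in upstream javalang)
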
