Skip to content

Commit 07cebb0

Browse files
Add extract functions library and tests
This patch adds the extract_functions_lib library that is used for extracting a specific set of functions from a partially compiled corpus. This allows for only recompiling some functions when training a register allocator policy. Reviewers: mtrofin Reviewed By: mtrofin Pull Request: #486
1 parent b65e859 commit 07cebb0

File tree

2 files changed

+269
-0
lines changed

2 files changed

+269
-0
lines changed
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""A library that contains utilities for extracting functions.
15+
16+
This library contains utilities to find what functions exist within a specific
17+
bitcode module as well as to extract functions of interest to separate bitcode
18+
files for use in the training process.
19+
"""
20+
21+
import subprocess
22+
import os
23+
import shutil
24+
import json
25+
import concurrent.futures
26+
from collections.abc import Sequence
27+
28+
29+
def _get_function_names_in_file(bc_file_path: str,
                                llvm_nm_path: str) -> list[str]:
  """Gets all function names defined in a file.

  Runs llvm-nm with `--defined-only --format=posix` on the bitcode file and
  returns the (mangled) names of every symbol whose type code is `t`, `T`,
  `w` or `W`.

  Args:
    bc_file_path: The path to the bitcode file to find functions in.
    llvm_nm_path: The path to the llvm-nm binary that is used to list the
      symbols defined in the file.

  Returns:
    A list of strings representing the functions in the file, in the order
    llvm-nm printed them.

  Raises:
    subprocess.CalledProcessError: If llvm-nm exits with a non-zero status.
  """
  command_vector = [
      llvm_nm_path,
      "--defined-only",
      "--format=posix",
      bc_file_path,
  ]
  result = subprocess.run(command_vector, capture_output=True, check=True)

  function_symbol_types = {"t", "T", "w", "W"}
  functions_list = []
  # splitlines() keeps the final symbol even when the output does not end in a
  # newline, which split("\n")[:-1] would silently drop.
  for symbol in result.stdout.decode("utf-8").splitlines():
    # Each line is expected to look like "<name> <type> [<value> <size>]".
    symbol_parts = symbol.split()
    if len(symbol_parts) < 2:
      # Blank or malformed line: nothing to classify, so skip it rather than
      # failing with an IndexError.
      continue
    if symbol_parts[1] in function_symbol_types:
      functions_list.append(symbol_parts[0])

  return functions_list
60+
61+
62+
def _extract_function_from_file(
    bc_file_path: str,
    output_file_path: str,
    function_name: str,
    llvm_extract_path: str,
    opt_path: str,
) -> None:
  """Extracts a function from a file.

  The function is first extracted with llvm-extract into a temporary
  `<output_file_path>.fat` file, which is then run through `opt -strip-debug`
  to produce `output_file_path`. Finally, the `.cmd` file that sits next to the
  source bitcode file is copied next to the output file.

  Args:
    bc_file_path: The path to the bitcode file to extract the function from.
    output_file_path: The path of the bitcode file to write the extracted
      function to.
    function_name: The (mangled) name of the function to extract.
    llvm_extract_path: The path to the llvm-extract binary to use to extract
      the function.
    opt_path: The path to the opt binary to use to strip debug information.

  Raises:
    subprocess.CalledProcessError: If llvm-extract or opt exits with a
      non-zero status.
  """
  fat_output_path = output_file_path + ".fat"

  command_vector = [
      llvm_extract_path,
      "-func",
      function_name,
      "-o",
      fat_output_path,
      bc_file_path,
  ]
  subprocess.run(command_vector, capture_output=True, check=True)

  opt_command_vector = [
      opt_path,
      "-strip-debug",
      "-o",
      output_file_path,
      fat_output_path,
  ]
  try:
    subprocess.run(opt_command_vector, capture_output=True, check=True)
  finally:
    # The .fat file is only an intermediate artifact; remove it even when opt
    # fails so that a failed run does not leave it behind in the output corpus.
    if os.path.exists(fat_output_path):
      os.remove(fat_output_path)

  # Derive both .cmd paths by swapping the final extension of the bitcode paths.
  orig_cmd_file_path = os.path.splitext(bc_file_path)[0] + ".cmd"
  output_cmd_file_path = os.path.splitext(output_file_path)[0] + ".cmd"
  shutil.copy(orig_cmd_file_path, output_cmd_file_path)
102+
103+
104+
def get_function_module_map(corpus_path: str,
                            llvm_nm_path: str) -> dict[str, str]:
  """Gets a mapping from function names to module paths.

  Reads `corpus_description.json` from the corpus directory and, for every
  entry in its "modules" list, lists the functions defined in the matching
  `<module>.bc` file.

  If the same function name is defined in more than one module, the module
  that appears last in the corpus description wins.

  Args:
    corpus_path: The path to the corpus to obtain the mapping from.
    llvm_nm_path: The path to the llvm-nm binary to obtain symbols from
      bitcode files.

  Returns:
    A dictionary mapping (mangled) function names to module paths.
  """
  function_to_module_map = {}

  with open(
      os.path.join(corpus_path, "corpus_description.json"),
      encoding="utf-8") as corpus_description_handle:
    corpus_description = json.load(corpus_description_handle)

  for module in corpus_description["modules"]:
    # Module entries are stored without the bitcode extension.
    module_path = os.path.join(corpus_path, module) + ".bc"
    for function_name in _get_function_names_in_file(module_path, llvm_nm_path):
      function_to_module_map[function_name] = module_path

  return function_to_module_map
129+
130+
131+
def extract_functions(functions_to_extract: Sequence[str],
                      function_to_module: dict[str, str],
                      llvm_extract_path: str, opt_path: str, thread_count: int,
                      output_dir: str) -> None:
  """Extracts all the functions specified.

  Each function is extracted into `<output_dir>/<index>.bc`, where `index` is
  the position of the function in `functions_to_extract`. Once every
  extraction has finished, a `corpus_description.json` listing the new modules
  is written to `output_dir`.

  Args:
    functions_to_extract: A string list containing (mangled) names of all the
      functions that should be extracted.
    function_to_module: A dictionary mapping (mangled) function names to module
      paths.
    llvm_extract_path: The path to the llvm-extract binary to use to extract
      function bodies from bitcode files.
    opt_path: The path to the opt binary to use for stripping debug info.
    thread_count: The number of threads to use for extracting functions.
    output_dir: The path to the new corpus where all the extracted functions
      will be placed. It is created if it does not exist yet.

  Raises:
    KeyError: If a requested function is not present in `function_to_module`.
      This is raised before any extraction is started.
    Exception: Whatever the first failing extraction (in submission order)
      raised.
  """
  # Resolve every function up front so that an unknown name fails fast instead
  # of surfacing after some extractions have already been launched.
  source_modules = [function_to_module[name] for name in functions_to_extract]

  os.makedirs(output_dir, exist_ok=True)

  module_paths = []

  with concurrent.futures.ThreadPoolExecutor(thread_count) as thread_pool:
    extract_futures = []

    for index, (function_to_extract, bc_file) in enumerate(
        zip(functions_to_extract, source_modules)):
      output_path = os.path.join(output_dir, f"{index}.bc")
      # The corpus description stores module names without the .bc extension.
      module_paths.append(str(index))
      extract_futures.append(
          thread_pool.submit(_extract_function_from_file, bc_file, output_path,
                             function_to_extract, llvm_extract_path, opt_path))

    for future in extract_futures:
      # result() blocks until the job is done and re-raises its exception.
      future.result()

  corpus_description = {"modules": module_paths, "has_thinlto": False}

  with open(
      os.path.join(output_dir, "corpus_description.json"),
      "w",
      encoding="utf-8") as corpus_description_handle:
    json.dump(corpus_description, corpus_description_handle)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Tests for the extract_functions_lib library."""
15+
16+
import os
17+
18+
from absl.testing import absltest
19+
20+
from compiler_opt.testing import corpus_test_utils
21+
from compiler_opt.tools.regalloc_trace import extract_functions_lib
22+
23+
24+
class ExtractFunctionsTest(absltest.TestCase):
  """Tests for extract_functions_lib using fake llvm-nm/llvm-extract/opt."""

  def test_get_function_module_map(self):
    """Checks that symbols printed by llvm-nm are mapped to module paths."""
    fake_llvm_nm_binary = self.create_tempfile("fake_llvm_nm")
    fake_llvm_nm_invocations = self.create_tempfile("fake_llvm_nm_invocations")
    corpus_test_utils.create_test_binary(
        fake_llvm_nm_binary.full_path, fake_llvm_nm_invocations.full_path,
        ["echo \"a T 1140 b\"", "echo \"main T 1150 16\""])
    corpus_dir = self.create_tempdir("corpus")
    _ = corpus_test_utils.setup_corpus(corpus_dir.full_path)

    # Pass plain string paths, matching the library's annotated signature.
    function_module_map = extract_functions_lib.get_function_module_map(
        corpus_dir.full_path, fake_llvm_nm_binary.full_path)
    # Presumably the fake llvm-nm prints the same symbols for every module, so
    # both names end up attributed to a single module; verify against
    # corpus_test_utils if this expectation changes.
    self.assertDictEqual(
        function_module_map, {
            "a": os.path.join(corpus_dir.full_path, "module_b.o.bc"),
            "main": os.path.join(corpus_dir.full_path, "module_b.o.bc")
        })

  def test_extract_functions(self):
    """Checks the llvm-extract and opt command lines issued per function."""
    # NOTE(review): cleanup is disabled for the output directory and the fake
    # llvm-extract binary; confirm whether that is still intentional.
    output_dir = self.create_tempdir(
        "output", cleanup=absltest.TempFileCleanup.OFF)
    fake_llvm_extract_binary = self.create_tempfile(
        "fake_llvm_extract", cleanup=absltest.TempFileCleanup.OFF)
    fake_llvm_extract_invocations = self.create_tempfile(
        "fake_llvm_extract_invocations")
    # The library deletes the .fat intermediates, so the fake llvm-extract has
    # to create them.
    corpus_test_utils.create_test_binary(
        fake_llvm_extract_binary.full_path,
        fake_llvm_extract_invocations.full_path, [
            f"touch {os.path.join(output_dir.full_path, '0.bc.fat')}",
            f"touch {os.path.join(output_dir.full_path, '1.bc.fat')}"
        ])
    fake_opt_binary = self.create_tempfile("fake_opt")
    fake_opt_invocations = self.create_tempfile("fake_opt_invocations")
    corpus_test_utils.create_test_binary(fake_opt_binary.full_path,
                                         fake_opt_invocations.full_path)
    corpus_dir = self.create_tempdir("corpus")
    _ = corpus_test_utils.setup_corpus(corpus_dir.full_path)
    functions_to_extract = ["a", "b"]
    function_to_module = {
        "a": os.path.join(corpus_dir.full_path, "module_a.o.bc"),
        "b": os.path.join(corpus_dir.full_path, "module_b.o.bc")
    }

    # A single worker thread keeps the recorded invocations in order.
    extract_functions_lib.extract_functions(functions_to_extract,
                                            function_to_module,
                                            fake_llvm_extract_binary.full_path,
                                            fake_opt_binary.full_path, 1,
                                            output_dir.full_path)
    llvm_extract_invocations = fake_llvm_extract_invocations.read_text().split(
        "\n")
    llvm_extract_invocations.remove("")
    self.assertEqual(
        llvm_extract_invocations[0],
        f"-func a -o {os.path.join(output_dir.full_path, '0.bc.fat')} "
        f"{os.path.join(corpus_dir.full_path, 'module_a.o.bc')}")
    self.assertEqual(
        llvm_extract_invocations[1],
        f"-func b -o {os.path.join(output_dir.full_path, '1.bc.fat')} "
        f"{os.path.join(corpus_dir.full_path, 'module_b.o.bc')}")
    opt_invocations = fake_opt_invocations.read_text().split("\n")
    opt_invocations.remove("")
    self.assertEqual(
        opt_invocations[0],
        f"-strip-debug -o {os.path.join(output_dir.full_path, '0.bc')} "
        f"{os.path.join(output_dir.full_path, '0.bc.fat')}")
    self.assertEqual(
        opt_invocations[1],
        f"-strip-debug -o {os.path.join(output_dir.full_path, '1.bc')} "
        f"{os.path.join(output_dir.full_path, '1.bc.fat')}")
94+
95+
96+
# Run the absltest test runner when this file is executed as a script.
if __name__ == "__main__":
  absltest.main()

0 commit comments

Comments
 (0)