@@ -19,8 +19,9 @@ project(Tokenizers)
1919option (TOKENIZERS_BUILD_TEST "Build tests" OFF )
2020option (TOKENIZERS_BUILD_TOOLS "Build tools" OFF )
2121option (SUPPORT_REGEX_LOOKAHEAD
22- "Support regex lookahead patterns (requires PCRE2)" OFF )
22+ "Support regex lookahead patterns (requires PCRE2)" OFF )
2323
24+ include (Utils.cmake)
2425# Ignore weak attribute warning
2526set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes" )
2627
@@ -34,20 +35,6 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
3435add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/re2)
3536add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece)
3637
37- # Configure PCRE2
38- if (SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
39- set (PCRE2_BUILD_PCRE2_8 ON )
40- set (PCRE2_BUILD_PCRE2_16 OFF )
41- set (PCRE2_BUILD_PCRE2_32 OFF )
42- set (PCRE2_BUILD_TESTS OFF )
43- set (PCRE2_BUILD_PCRE2GREP OFF )
44- set (PCRE2_BUILD_PCRE2TEST OFF )
45- set (PCRE2_BUILD_PCRE2GPERF OFF )
46- set (PCRE2_BUILD_DOCS OFF )
47- set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
48- add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2)
49- endif ()
50-
5138set (CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag} )
5239
5340file (GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR} /src/*.cpp)
@@ -60,14 +47,8 @@ set(tokenizers_source_files
6047 ${CMAKE_CURRENT_SOURCE_DIR} /src/regex .cpp
6148 ${CMAKE_CURRENT_SOURCE_DIR} /src/sentencepiece.cpp
6249 ${CMAKE_CURRENT_SOURCE_DIR} /src/tiktoken.cpp
63- ${CMAKE_CURRENT_SOURCE_DIR} /src/token_decoder.cpp
64- )
65- if (SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
66- list (APPEND
67- tokenizers_source_files
68- ${CMAKE_CURRENT_SOURCE_DIR} /src/pcre2_regex.cpp
69- ${CMAKE_CURRENT_SOURCE_DIR} /src/std_regex.cpp)
70- endif ()
50+ ${CMAKE_CURRENT_SOURCE_DIR} /src/token_decoder.cpp)
51+
7152file (GLOB unicode_source_files
7253 ${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/src/*.cpp)
7354add_library (tokenizers STATIC ${tokenizers_source_files}
@@ -85,11 +66,26 @@ target_include_directories(
8566target_link_libraries (tokenizers PUBLIC sentencepiece-static re2::re2)
8667
8768if (SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
88- target_include_directories (tokenizers
89- PUBLIC
90- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src)
91- target_link_libraries (tokenizers PUBLIC pcre2-8)
92- target_compile_definitions (tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
69+ set (PCRE2_BUILD_PCRE2_8 ON )
70+ set (PCRE2_BUILD_PCRE2_16 OFF )
71+ set (PCRE2_BUILD_PCRE2_32 OFF )
72+ set (PCRE2_BUILD_TESTS OFF )
73+ set (PCRE2_BUILD_PCRE2GREP OFF )
74+ set (PCRE2_BUILD_PCRE2TEST OFF )
75+ set (PCRE2_BUILD_PCRE2GPERF OFF )
76+ set (PCRE2_BUILD_DOCS OFF )
77+ set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
78+ add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2)
79+ add_library (
80+ regex_lookahead STATIC
81+ ${CMAKE_CURRENT_SOURCE_DIR} /src/pcre2_regex.cpp
82+ ${CMAKE_CURRENT_SOURCE_DIR} /src/regex_lookahead.cpp
83+ ${CMAKE_CURRENT_SOURCE_DIR} /src/std_regex.cpp)
84+ target_link_libraries (regex_lookahead PUBLIC pcre2-8)
85+ target_include_directories (
86+ regex_lookahead PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} /include
87+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src)
88+ target_link_options_shared_lib(regex_lookahead)
9389endif ()
9490
9591# Build test
@@ -120,9 +116,9 @@ if(TOKENIZERS_BUILD_TEST)
120116 ${CMAKE_CURRENT_SOURCE_DIR} /include
121117 ${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece
122118 ${CMAKE_CURRENT_SOURCE_DIR} /third-party/re2
123- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/json/single_include
124- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src)
125- target_link_libraries ( ${test_name} gtest_main GTest::gmock tokenizers )
119+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/json/single_include)
120+ target_link_libraries ( ${test_name} gtest_main GTest::gmock tokenizers
121+ regex_lookahead )
126122 add_test (${test_name} "${test_name} " )
127123 set_tests_properties (${test_name} PROPERTIES ENVIRONMENT ${test_env} )
128124 endforeach ()
0 commit comments