diff --git a/CMakeLists.txt b/CMakeLists.txt
index c79ccd09e097c..16865e80e08b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
+option(LLAMA_PARQUET "llama: enable Parquet dataset support via Arrow/Parquet C++" OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
@@ -173,6 +174,12 @@ if (MINGW)
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
+if (LLAMA_PARQUET)
+    find_package(Arrow REQUIRED)
+    find_package(Parquet REQUIRED)
+    add_compile_definitions(LLAMA_PARQUET)
+endif()
+
 #
 # build the library
 #
diff --git a/common/arg.cpp b/common/arg.cpp
index 40af7e574830f..2dcd413e4559c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1470,14 +1470,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
            params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
         [](common_params & params, int value) {
             params.n_chunks = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
         {"-fa", "--flash-attn"},
         string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
"enabled" : "disabled"), @@ -2115,70 +2115,70 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.hellaswag = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), [](common_params & params, int value) { params.hellaswag_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", [](common_params & params) { params.winogrande = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--winogrande-tasks"}, "N", string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), [](common_params & params, int value) { params.winogrande_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", [](common_params & params) { params.multiple_choice = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), [](common_params & params, int value) { params.multiple_choice_tasks = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", [](common_params & params) { params.kl_divergence = true; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", [](common_params & params, const std::string & value) { params.logits_file = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--ppl-stride"}, "N", string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), [](common_params & params, int value) { params.ppl_stride = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), [](common_params & params, int value) { params.ppl_output_type = value; } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_FINETUNE})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), @@ -3415,6 +3415,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.n_cache_reuse = 256; } 
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+#ifdef LLAMA_PARQUET
+    add_opt(common_arg(
+        {"--dataset-format"}, "FORMAT",
+        "dataset format: text or parquet (parquet requires a build with LLAMA_PARQUET)",
+        [](common_params & params, const std::string & format) {
+            params.dataset_format = format; // TODO: replace the string with an enum class
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+
+    add_opt(common_arg(
+        {"--parquet-path"}, "FNAME",
+        "path to the Parquet dataset file",
+        [](common_params & params, const std::string & filepath) { // TODO: support reading a directory of files
+            params.parquet_path = filepath;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+    add_opt(common_arg(
+        {"--tokens-column"}, "NAME",
+        "name of the tokens column (a list<int32>) in the Parquet file",
+        [](common_params & params, const std::string & column) {
+            params.tokens_column = column;
+        }
+    ).set_examples({LLAMA_EXAMPLE_FINETUNE}));
+#endif
 
     return ctx_arg;
 }
diff --git a/common/common.h b/common/common.h
index 8922090e7b10d..fc75a972f0c6f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -83,6 +83,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_FINETUNE,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
 enum common_sampler_type {
@@ -282,6 +283,9 @@ struct common_params {
     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding  // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file          = ""; // file for saving *all* logits                         // NOLINT
+    std::string dataset_format       = "text";   // dataset format: "text" | "parquet"
+    std::string parquet_path;                    // path to the Parquet dataset file
+    std::string tokens_column        = "tokens"; // name of the list<int32> tokens column
 
     std::vector<std::string> in_files;   // all input files // NOLINT
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) // NOLINT
diff --git a/examples/training/README.md b/examples/training/README.md
index df425279266e4..dcb05e4643035 100644
--- a/examples/training/README.md
+++ b/examples/training/README.md
@@ -8,10 +8,21 @@ Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
 
 Proof of concept:
 
+Loading data from a plain text file:
+
 ``` sh
 export model_name=llama_3.2-1b && export quantization=f32
 ./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
 ./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
 ```
 
+Loading data from a Parquet file (without batching):
+
+You need to install the Arrow/Parquet C++ libraries and build with `LLAMA_PARQUET=ON`:
+
+``` sh
+mkdir build && cd build && cmake -DLLAMA_PARQUET=ON .. && make
+export model_name=llama_3.2-1b && export quantization=f32
+./build/bin/llama-finetune -ngl 999 --dataset-format parquet --parquet-path parquet.parquet --tokens-column tokens --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
+```
 The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
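The loader introduced below expects the tokens column to be a Parquet `list<int32>`. For readers who need a compatible test file, here is a minimal sketch of writing one with the Arrow C++ API; it is not part of this patch, and `write_tokens_parquet`, the sample token ids, and the output path are illustrative assumptions:

```cpp
// Minimal sketch (not part of this patch): writes a Parquet file with a
// "tokens" list<int32> column, the shape load_parquet_dataset() expects.
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/writer.h>

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

static arrow::Status write_tokens_parquet(const std::string & path) {
    arrow::MemoryPool * pool = arrow::default_memory_pool();

    // one list entry per training sequence
    auto value_builder = std::make_shared<arrow::Int32Builder>(pool);
    arrow::ListBuilder list_builder(pool, value_builder);

    const std::vector<std::vector<int32_t>> sequences = {
        {1, 15043, 3186, 2}, // hypothetical token ids
        {1, 22172, 3186, 2},
    };
    for (const auto & seq : sequences) {
        ARROW_RETURN_NOT_OK(list_builder.Append());            // open a new list
        ARROW_RETURN_NOT_OK(value_builder->AppendValues(seq)); // fill its values
    }

    std::shared_ptr<arrow::Array> tokens_array;
    ARROW_RETURN_NOT_OK(list_builder.Finish(&tokens_array));

    auto schema = arrow::schema({arrow::field("tokens", arrow::list(arrow::int32()))});
    auto table  = arrow::Table::Make(schema, {tokens_array});

    ARROW_ASSIGN_OR_RAISE(auto outfile, arrow::io::FileOutputStream::Open(path));
    return parquet::arrow::WriteTable(*table, pool, outfile, /*chunk_size=*/1024);
}

int main() {
    return write_tokens_parquet("parquet.parquet").ok() ? 0 : 1;
}
```

Any tool that produces the same `list<int32>` schema (for example pyarrow) works equally well; only the column layout matters to the loader.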
diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp
index 23bede49b1362..6dc9415b65e44 100644
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "../../src/parquet_dataset.h"
 
 #include <cmath>
 #include <cstdio>
@@ -18,7 +19,7 @@ int main(int argc, char ** argv) {
 
     params.escape = false;
 
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
         return 1;
     }
 
@@ -57,7 +58,23 @@ int main(int argc, char ** argv) {
 
     constexpr float val_split = 0.05f;
 
-    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
+    std::vector<llama_token> tokens;
+#ifdef LLAMA_PARQUET
+    if (params.dataset_format == "text") {
+#endif
+        tokens = common_tokenize(ctx.get(), params.prompt, true); // load from a plain text file
+#ifdef LLAMA_PARQUET
+    }
+    else if (params.dataset_format == "parquet") {
+        tokens = load_parquet_dataset(params.parquet_path, params.tokens_column);
+        if (tokens.empty()) {
+            LOG_ERR("no tokens in %s, or column %s not found/invalid\n", params.parquet_path.c_str(), params.tokens_column.c_str());
+            return 1;
+        }
+        LOG_INF("loaded %zu tokens from Parquet\n", tokens.size());
+    }
+#endif
+
     ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
 
     struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8f9cd652447ab..d2a49b868dfa7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -32,6 +32,7 @@ add_library(llama
             llama-quant.cpp
             llama-sampling.cpp
             llama-vocab.cpp
+            parquet_dataset.cpp
             unicode-data.cpp
             unicode.cpp
             unicode.h
@@ -41,7 +42,12 @@ target_include_directories(llama PRIVATE .)
 target_include_directories(llama PUBLIC ../include)
 target_compile_features   (llama PRIVATE cxx_std_17) # don't bump
 
-target_link_libraries(llama PUBLIC ggml)
+
+if (LLAMA_PARQUET)
+    target_link_libraries(llama PUBLIC ggml Arrow::arrow_shared Parquet::parquet_shared)
+else()
+    target_link_libraries(llama PUBLIC ggml)
+endif()
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/src/parquet_dataset.cpp b/src/parquet_dataset.cpp
new file mode 100644
index 0000000000000..d04ee331ebedf
--- /dev/null
+++ b/src/parquet_dataset.cpp
@@ -0,0 +1,47 @@
+#ifdef LLAMA_PARQUET
+#include "parquet_dataset.h"
+#include <arrow/api.h>
+#include <arrow/io/api.h>
+#include <parquet/arrow/reader.h>
+#include "llama-impl.h"
+
+std::vector<llama_token> load_parquet_dataset(const std::string & path, const std::string & column) {
+    arrow::MemoryPool * pool = arrow::default_memory_pool();
+    std::shared_ptr<arrow::io::ReadableFile> infile;
+    PARQUET_ASSIGN_OR_THROW(infile, arrow::io::ReadableFile::Open(path));
+    arrow::Result<std::unique_ptr<parquet::arrow::FileReader>> reader_raw;
+    PARQUET_ASSIGN_OR_THROW(reader_raw, parquet::arrow::OpenFile(infile, pool));
+
+    std::unique_ptr<parquet::arrow::FileReader> reader = std::move(reader_raw.ValueUnsafe());
+    std::shared_ptr<arrow::Table> table;
+    PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
+
+    auto field = table->schema()->GetFieldByName(column);
+    if (!field || !field->type()->Equals(arrow::list(arrow::int32()))) {
+        LLAMA_LOG_ERROR("Parquet column '%s' missing or not a list<int32>\n", column.c_str());
+        return {};
+    }
+
+    auto col = table->GetColumnByName(column);
+    std::vector<llama_token> tokens;
+    for (int chunk = 0; chunk < col->num_chunks(); ++chunk) {
+        auto list_arr   = std::static_pointer_cast<arrow::ListArray>(col->chunk(chunk));
+        auto values_arr = std::static_pointer_cast<arrow::Int32Array>(list_arr->values());
+        // raw offsets into the flat values array (int32_t for arrow::ListArray)
+        const auto * offsets = list_arr->raw_value_offsets();
+        // offsets length = list_arr->length() + 1
+        int64_t values_length = values_arr->length();
+        for (int64_t i = 0; i < list_arr->length(); ++i) {
+            int64_t start = offsets[i];
+            int64_t end   = offsets[i + 1];
+            // clamp to the valid range of the values array
+            if (start < 0) start = 0;
+            if (end > values_length) end = values_length;
+            for (int64_t j = start; j < end; ++j) {
+                tokens.push_back(static_cast<llama_token>(values_arr->Value(j)));
+            }
+        }
+    }
+    return tokens;
+}
+#endif // LLAMA_PARQUET
diff --git a/src/parquet_dataset.h b/src/parquet_dataset.h
new file mode 100644
index 0000000000000..00fd5e423a7be
--- /dev/null
+++ b/src/parquet_dataset.h
@@ -0,0 +1,10 @@
+#ifndef PARQUET_DATASET_H
+#define PARQUET_DATASET_H
+#include <string>
+#include <vector>
+#include "llama.h"
+
+#ifdef LLAMA_PARQUET
+std::vector<llama_token> load_parquet_dataset(const std::string & path, const std::string & column);
+#endif
+#endif // PARQUET_DATASET_H
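To sanity-check a dataset file against the loader added above, a small standalone driver along these lines could be used; it is hypothetical, not part of the patch, and assumes a build with `LLAMA_PARQUET` defined that links against llama plus the Arrow/Parquet libraries:

```cpp
// Hypothetical smoke test for load_parquet_dataset() (not part of this patch).
#include "parquet_dataset.h"

#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 3) {
        std::fprintf(stderr, "usage: %s <file.parquet> <tokens-column>\n", argv[0]);
        return 1;
    }
    std::vector<llama_token> tokens = load_parquet_dataset(argv[1], argv[2]);
    if (tokens.empty()) {
        std::fprintf(stderr, "no tokens loaded (missing file or wrong column?)\n");
        return 1;
    }
    std::printf("loaded %zu tokens; first id = %d\n", tokens.size(), (int) tokens[0]);
    return 0;
}
```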