This repository has been archived by the owner on May 12, 2023. It is now read-only.

Commit

Upgraded the code to support GPT4All requirements
ParisNeo committed Apr 26, 2023
1 parent a3b895e commit 0f1f0ad
Showing 3 changed files with 162 additions and 7 deletions.
126 changes: 126 additions & 0 deletions examples/backend_test.py
@@ -0,0 +1,126 @@
######
# Project : GPT4ALL-UI
# File : backend_test.py
# Author : ParisNeo with the help of the community
# Supported by Nomic-AI
# Licence : Apache 2.0
# Description :
# This is an example of a pygpt4all-ui binding for llamacpp and gpt-j models
# Tests generation

# To call :
# python backend_test.py -m <model path> --prompt <your prompt> --trigger_stop_after <number of tokens after which to trigger a stop, to test how the backend handles stopping generation>
######
from pathlib import Path
from typing import Callable
from pyllamacpp.model import Model
import argparse
import sys

__author__ = "parisneo"
__github__ = "https://github.com/nomic-ai/gpt4all-ui"
__copyright__ = "Copyright 2023, "
__license__ = "Apache 2.0"

backend_name = "LLAMACPP"

class LLAMACPP():
    file_extension = '*.bin'
    def __init__(self, model_path, config: dict) -> None:
        """Builds a LLAMACPP backend

        Args:
            model_path (str): Path to the ggml model file
            config (dict): The configuration dictionary
        """
        self.config = config

        self.model = Model(
            ggml_model=model_path,
            n_ctx=self.config['ctx_size'],
            seed=self.config['seed'],
        )

    def get_num_tokens(self, prompt):
        return self.model.num_tokens(prompt)

    def generate(self,
                 prompt: str,
                 n_predict: int = 128,
                 new_text_callback: Callable[[str], bool] = bool,
                 verbose: bool = False,
                 **gpt_params):
        """Generates text out of a prompt

        Args:
            prompt (str): The prompt to use for generation
            n_predict (int, optional): Number of tokens to predict. Defaults to 128.
            new_text_callback (Callable[[str], bool], optional): A callback called every time a new text chunk is generated; return False to stop generation. Defaults to bool (always continue).
            verbose (bool, optional): If true, the code will print detailed information about the generation process. Defaults to False.

        Returns:
            str: The generated text
        """
        try:
            output = self.model.generate(
                prompt,
                new_text_callback=new_text_callback,
                n_predict=n_predict,
                temp=self.config['temp'],
                top_k=self.config['top_k'],
                top_p=self.config['top_p'],
                repeat_penalty=self.config['repeat_penalty'],
                repeat_last_n=self.config['repeat_last_n'],
                n_threads=self.config['n_threads'],
                verbose=verbose
            )
        except Exception as ex:
            print(ex)
            output = ""
        return output

if __name__ == "__main__":
    # create an ArgumentParser object
    parser = argparse.ArgumentParser()

    # add the -m or --model_path argument (./models/llama_cpp/ is the gpt4all-ui default structure)
    parser.add_argument("-m", "--model_path", default="./models/llama_cpp/", help="path to the model file")
    parser.add_argument('--temp', type=float, default=0.5)
    parser.add_argument('--top_k', type=int, default=40)
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--repeat_penalty', type=float, default=1.3)
    parser.add_argument('--repeat_last_n', type=int, default=5)
    parser.add_argument('--n_threads', type=int, default=8)
    parser.add_argument('--ctx_size', type=int, default=512)
    parser.add_argument('--seed', type=int, default=-1)
    parser.add_argument('--prompt', type=str, default='Once upon a time')
    parser.add_argument('--trigger_stop_after', type=int, default=-1)
    # parse the arguments
    args = parser.parse_args()
    config = {
        'temp': args.temp,
        'top_k': args.top_k,
        'top_p': args.top_p,
        'repeat_penalty': args.repeat_penalty,
        'repeat_last_n': args.repeat_last_n,
        'n_threads': args.n_threads,
        'ctx_size': args.ctx_size,
        'seed': args.seed
    }

    backend = LLAMACPP(args.model_path, config)

    # Not great to use a module-level counter, but this is a quick example so never mind
    counter = 0

    def callback(text):
        global counter
        print(text, end="")
        sys.stdout.flush()
        # test stopping generation after a number of tokens
        counter += 1
        if args.trigger_stop_after > 0:
            if counter >= args.trigger_stop_after:
                return False

        return True

    num_tokens = backend.get_num_tokens(args.prompt)
    print(f"Prompt has {num_tokens} tokens")
    output_text = backend.generate(args.prompt, new_text_callback=callback)
    print(f"Text : {output_text}")
20 changes: 16 additions & 4 deletions pyllamacpp/pyllamacpp/model.py
@@ -75,13 +75,16 @@ def _set_params(params, kwargs: dict) -> None:
        for param in kwargs:
            setattr(params, param, kwargs[param])

    def _call_new_text_callback(self, text) -> None:
    def _call_new_text_callback(self, text) -> bool:
        """
        Internal new_segment_callback, it just calls the user's callback with the generated text
        :return: None
        :return: bool (continue generation?)
        """
        # the callback returns either a boolean or None
        if Model._new_text_callback is not None:
            Model._new_text_callback(text)
            continue_gen = Model._new_text_callback(text)
            if not (continue_gen is None or continue_gen == True):
                self._ctx.continue_gen = False
        # save res
        self.res += text

@@ -90,9 +93,18 @@ def _call_grab_text_callback(self) -> str:
            return Model._grab_text_callback()
        return None

    def num_tokens(self, prompt: str):
        """
        Computes the number of tokens in the prompt text
        :param prompt: the prompt
        :return: the number of tokens in the prompt
        """
        return pp.llama_get_nb_tokens(self._ctx, prompt)

    def generate(self, prompt: str,
                 n_predict: int = 128,
                 new_text_callback: Callable[[str], None] = None,
                 new_text_callback: Callable[[str], None] = bool,
                 grab_text_callback: Callable[[], str] = None,
                 verbose: bool = False,
                 **gpt_params) -> str:
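With these changes, a callback passed to Model.generate can stop generation early by returning False (returning True or None keeps generating), and num_tokens exposes the prompt's token count. A minimal sketch of this contract, assuming pyllamacpp is installed with these changes and a local ggml model file exists (the path below is hypothetical):

from pyllamacpp.model import Model

model = Model(ggml_model="./models/llama_cpp/model.bin", n_ctx=512, seed=-1)  # hypothetical path
print("prompt tokens:", model.num_tokens("Hello there"))

def first_sentence_only(text):
    print(text, end="", flush=True)
    # Stop as soon as a period is emitted; returning None (or True) keeps generating.
    return False if "." in text else None

model.generate("Hello there", n_predict=64, new_text_callback=first_sentence_only)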
23 changes: 20 additions & 3 deletions pyllamacpp/src/main.cpp
@@ -33,6 +33,7 @@ static bool is_interacting = false;
py::function py_llama_progress_callback;

struct llama_context_wrapper {
    bool continue_gen = true; // Continue text generation
    llama_context* ptr;
};

@@ -165,10 +166,21 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
return "The";
}

// Returns the number of tokens in the prompt
// Needed by the front end to optimize data size
int llama_get_nb_tokens(struct llama_context_wrapper * ctx_w, std::string prompt){
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize_wrapper(ctx_w, prompt, true);
    return embd_inp.size();
}

// quick and dirty implementation! just copied from main.cpp with some minor changes
// Needs lots of improvements
int llama_generate(struct llama_context_wrapper * ctx_w, gpt_params params, py::function new_text_callback, py::function grab_text_callback, bool verbose){

    // Set continue_gen to true
    ctx_w->continue_gen = true;

    if (params.perplexity) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
@@ -447,7 +459,12 @@ int llama_generate(struct llama_context_wrapper * ctx_w, gpt_params params, py::
        if (!input_noecho) {
            for (auto id : embd) {
                // printf("%s", llama_token_to_str(ctx, id));
                // If the host wants to stop generation, we should stop
                new_text_callback(llama_token_to_str(ctx, id));
                if(!ctx_w->continue_gen){
                    llama_print_timings(ctx);
                    return 0;
                }
            }
            fflush(stdout);
        }
@@ -606,7 +623,8 @@ PYBIND11_MODULE(_pyllamacpp, m) {
.def_readwrite("verbose_prompt", &gpt_params::verbose_prompt)
.def_readwrite("antiprompt", &gpt_params::antiprompt);

py::class_<llama_context_wrapper>(m,"llama_context");
py::class_<llama_context_wrapper>(m,"llama_context")
.def_readwrite("continue_gen", &llama_context_wrapper::continue_gen);

py::class_<llama_token_data>(m,"llama_token_data")
.def(py::init<>())
@@ -657,10 +675,9 @@ PYBIND11_MODULE(_pyllamacpp, m) {

m.def("llama_print_system_info", &llama_print_system_info);

m.def("llama_get_nb_tokens", &llama_get_nb_tokens);
m.def("llama_generate", &llama_generate);



#ifdef VERSION_INFO
m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO);
#else
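Since the llama_context binding now exposes continue_gen, Python code can check after a run whether generation completed on its own or was cut short by the callback. A rough sketch, assuming a local ggml model (the path is hypothetical, and _ctx is an internal attribute of pyllamacpp.Model, so this is for illustration only):

from pyllamacpp.model import Model

model = Model(ggml_model="./models/llama_cpp/model.bin", n_ctx=512, seed=-1)  # hypothetical path

# A callback that returns False immediately, forcing an early stop.
model.generate("Count to ten:", n_predict=32, new_text_callback=lambda text: False)

# continue_gen is reset to true at the start of llama_generate and cleared by the
# Python wrapper when the callback returns False, so it records how the run ended.
print("stopped early" if not model._ctx.continue_gen else "ran to completion")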
