From 06ef430df629c01738475be6e7b568570df31901 Mon Sep 17 00:00:00 2001
From: Peter Staar <taa@zurich.ibm.com>
Date: Tue, 16 Apr 2024 06:08:46 +0200
Subject: [PATCH] added the unsupervised model

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---
 pyproject.toml                                |   2 +-
 src/andromeda/tooling/models.h                |   1 +
 .../fasttext_unsupervised_model.h             | 543 ++++++++++++++++++
 3 files changed, 545 insertions(+), 1 deletion(-)
 create mode 100644 src/andromeda/tooling/models/base_fst_model/fasttext_unsupervised_model.h

diff --git a/pyproject.toml b/pyproject.toml
index b713b2e7..d763e2d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "deepsearch-glm"
-version = "0.17.4"
+version = "0.19.0"
 description = "Graph Language Models"
 authors = ["Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
diff --git a/src/andromeda/tooling/models.h b/src/andromeda/tooling/models.h
index 0c208f94..bd5253e4 100644
--- a/src/andromeda/tooling/models.h
+++ b/src/andromeda/tooling/models.h
@@ -8,6 +8,7 @@
 
 #include <andromeda/tooling/models/base_fst_model.h>
 #include <andromeda/tooling/models/base_fst_model/fasttext_supervised_model.h>
+#include <andromeda/tooling/models/base_fst_model/fasttext_unsupervised_model.h>
 
 #include <andromeda/tooling/models/base_rgx_model.h>
 #include <andromeda/tooling/models/base_dct_model.h>
diff --git a/src/andromeda/tooling/models/base_fst_model/fasttext_unsupervised_model.h b/src/andromeda/tooling/models/base_fst_model/fasttext_unsupervised_model.h
new file mode 100644
index 00000000..b2215ca1
--- /dev/null
+++ b/src/andromeda/tooling/models/base_fst_model/fasttext_unsupervised_model.h
@@ -0,0 +1,543 @@
+//-*-C++-*-
+
+#ifndef ANDROMEDA_MODELS_BASE_FST_MODEL_UNSUPERVISED_TRAINER_H_
+#define ANDROMEDA_MODELS_BASE_FST_MODEL_UNSUPERVISED_TRAINER_H_
+
+namespace andromeda
+{
+
+  class fasttext_unsupervised_model:
+    public base_nlp_model
+  {
+    typedef fasttext::Args ft_args_type;
+    typedef fasttext::Autotune ft_autotune_type;
+
+    typedef fasttext::FastText ft_model_type;
+
+  public:
+
+    fasttext_unsupervised_model();
+    ~fasttext_unsupervised_model();
+    
+    //template<typename subject_type>
+    //bool get(subject_type& subj, base_property& prop);
+
+    /*   IO   */
+    virtual bool load(std::filesystem::path ifile);
+    virtual bool save(std::filesystem::path ofile);
+
+    /*  CONFIG   */
+
+    virtual nlohmann::json create_train_config();
+
+    /* TRAIN */
+    virtual bool is_trainable() { return true; }
+
+    virtual bool prepare_data_for_train(nlohmann::json args,
+                                        std::vector<std::shared_ptr<base_nlp_model> >& dep_models);
+
+    virtual bool train(nlohmann::json args);
+
+    //virtual bool evaluate_model(nlohmann::json args,
+    //std::vector<std::shared_ptr<base_nlp_model> >& dep_models);
+
+    /* PREDICT */
+
+    virtual std::string preprocess(const std::string& orig);
+    //virtual bool classify(const std::string& orig, std::string& label, double& conf);
+
+    virtual bool preprocess(const subject<TEXT>& subj, std::string& text);
+    virtual bool preprocess(const subject<TABLE>& subj, std::string& text);
+
+    std::vector<std::pair<float, std::string> > getNN(const std::string& word,
+						      int32_t k)
+    {
+      LOG_S(ERROR) << "needs implementation: " << __FILE__ << ":" << __LINE__;
+
+      std::vector<std::pair<float, std::string> > res = {};
+      return res;
+    }
+    
+    //bool classify(subject<TEXT>& subj);
+    //bool classify(subject<TABLE>& subj);
+
+  protected:
+
+    bool parse_config(nlohmann::json config);
+
+    bool prepare_data(std::vector<std::shared_ptr<base_nlp_model> >& dep_models);
+    bool read_samples(std::vector<std::shared_ptr<base_nlp_model> >& dep_models,
+                      bool read_train, bool read_eval);
+
+    bool launch_training();
+
+    bool evaluate_training();
+
+  protected:
+
+    std::filesystem::path model_path;
+    std::shared_ptr<ft_model_type> model;
+
+  private:
+
+    double learning_rate;
+    int epoch, dim, ws, ngram;
+
+    bool autotune;
+    std::string modelsize;
+    int duration; // in seconds
+
+    std::set<std::string> explicit_hpo_parameters;
+    std::set<std::string> explicit_train_parameters;
+
+    std::string model_file, metrics_file, config_file;
+
+    std::string train_file, validate_file, test_file;
+
+    std::string fasttext_train_file,
+      fasttext_validation_file;
+
+    nlohmann::json train_args;
+
+    std::vector<std::string> train_samples; // <label, text>
+    std::vector<std::string> eval_samples;
+
+    //confusion_matrix_evaluator conf_matrix;
+  };
+
+  fasttext_unsupervised_model::fasttext_unsupervised_model():    
+    base_nlp_model()
+  {}
+
+  fasttext_unsupervised_model::~fasttext_unsupervised_model()
+  {}
+
+  bool fasttext_unsupervised_model::load(std::filesystem::path ifile)
+  {
+    //LOG_S(INFO) << __FILE__ << ":" << __LINE__;
+    
+    std::string model_path = ifile.string();
+    
+    if(not std::filesystem::exists(ifile))
+      {
+	LOG_S(ERROR) << "file does not exists: " << model_path;
+	return false;
+      }
+
+    if(model==NULL)
+      {
+	model = std::make_shared<ft_model_type>();
+      }    
+
+    model->loadModel(model_path);
+    
+    return true;
+  }
+
+  bool fasttext_unsupervised_model::save(std::filesystem::path ofile)
+  {
+    //LOG_S(INFO) << __FUNCTION__;
+    
+    std::string model_name = ofile.string();
+    
+    LOG_S(INFO) << "fasttext model save to " << model_name << ".bin";
+    model->saveModel(model_name + ".bin");
+    
+    LOG_S(INFO) << "fasttext vectors save to " << model_name << ".vec";
+    model->saveVectors(model_name + ".vec");
+
+    //LOG_S(INFO) << "fasttext output save to " << model_name << ".out";
+    //model->saveOutput(model_name + ".out");
+
+    return true;
+  }
+
+  nlohmann::json fasttext_unsupervised_model::create_train_config()
+  {
+    nlohmann::json config = nlohmann::json::object({});
+
+    config["mode"] = "train";
+    config["model"] = get_key();
+
+    nlohmann::json hpo;
+    {
+      hpo["autotune"] = autotune;
+      hpo["modelsize"] = modelsize;
+      hpo["duration"] = duration;
+    }
+    
+    nlohmann::json args;
+    {
+      args["mode"] = "unsupervised";
+
+      args["learning-rate"] = learning_rate;
+      args["epoch"] = epoch;
+
+      args["dim"] = dim;
+      args["ws"] = ws;
+
+      args["n-gram"] = ngram;
+    }
+
+    nlohmann::json files;
+    {
+      files["train-file"] = "<filename>";
+      files["validate-file"] = "<filename>";
+      //files["test-file"] = "<filename>";
+
+      files["model-file"] = "<filename>";
+      //files["metrics-file"] = "<filename>";
+    }
+
+    config["hpo"] = hpo;
+    config["args"] = args;
+    config["files"] = files;
+
+    return config;
+  }
+
+    bool fasttext_unsupervised_model::parse_config(nlohmann::json config)
+  {
+    auto hpo_args = config["hpo"];
+    auto train_args = config["args"];
+
+    auto train_files = config["files"];    
+
+    for(auto itr:hpo_args.items())
+      {
+	explicit_hpo_parameters.insert(itr.key());
+      }
+
+    for(auto itr:train_args.items())
+      {
+	explicit_train_parameters.insert(itr.key());
+      }
+    
+    // HPO
+    {
+      autotune = hpo_args.value("autotune", autotune);
+      
+      modelsize = hpo_args.value("modelsize", modelsize);
+      duration = hpo_args.value("duration", duration);
+    }
+
+    // parameters
+    {
+      learning_rate = train_args.value("learning-rate", learning_rate);
+      epoch = train_args.value("epoch", epoch);
+      dim = train_args.value("dim", dim);
+      ws = train_args.value("ws", ws);
+      ngram = train_args.value("n-gram", ngram);
+    }
+    
+    // files
+    {
+      train_file = train_files.value("train-file", "null");
+
+      validate_file = train_files.value("validate-file", "null");
+      test_file = train_files.value("test-file", "null");
+      
+      model_file = train_files.value("model-file", "null");
+      metrics_file = train_files.value("metrics-file", "null");    
+
+      if(metrics_file=="null")
+	{
+	  metrics_file = model_file+".metrics.txt";
+	}
+      
+      config_file = model_file+".config.json";
+
+      fasttext_train_file = train_file+".fasttext.train.txt";
+      fasttext_validation_file = train_file+".fasttext.validate.txt";
+    }
+
+    return true;
+  }
+
+  bool fasttext_unsupervised_model::prepare_data_for_train(nlohmann::json config,
+							   std::vector<std::shared_ptr<base_nlp_model> >& dep_models)
+  {
+    LOG_S(INFO) << "preparing data to train FastText classifier ...";
+
+    parse_config(config);
+
+    prepare_data(dep_models);
+    
+    return true;
+  }
+  
+  bool fasttext_unsupervised_model::train(nlohmann::json config)
+  {
+    LOG_S(INFO) << "starting to train FastText encoder ...";
+    
+    parse_config(config);
+
+    //if(not prepare_data())
+    //{
+    //LOG_S(WARNING) << "could not prepare the data for unsupervised Fasttext training ...";
+    //}
+
+    launch_training();
+
+    save(model_file.c_str());    
+
+    /*
+    if(eval_samples.size()==0)
+      {
+	read_samples(dep_models);
+      }
+    
+    evaluate_training();
+    */
+    
+    return true;
+  }
+
+  std::string fasttext_unsupervised_model::preprocess(const std::string& orig)
+  {
+    return orig;
+  }
+
+  bool fasttext_unsupervised_model::preprocess(const subject<TEXT>& subj, std::string& text)
+  {
+    //auto& wtokens = subj.word_tokens;
+    //LOG_S(INFO) << "tokens: \n\n" << tabulate(wtokens); 
+    
+    std::stringstream ss;
+
+    std::size_t MAX = 256;
+    std::size_t LEN = subj.get_num_wtokens();
+
+    for(std::size_t l=0; l<std::min(LEN, MAX); l++)
+      {
+	const auto& token = subj.get_wtoken(l);	    
+	auto tags = token.get_tags();
+	
+	if(tags.size()>0)
+	  {
+	    ss << "__" << *(tags.begin()) << "__";		
+	  }
+	else
+	  {
+	    std::string text = token.get_word();
+	    text = utils::to_lower(text);
+	    
+	    ss << text;
+	  }
+	
+	ss << " ";
+      }
+    
+    text = ss.str();
+
+    return true;
+  }
+
+  bool fasttext_unsupervised_model::preprocess(const subject<TABLE>& subj, std::string& text)
+  {
+    text = subj.get_text();
+    return (text.size()>0);
+  }
+
+  bool fasttext_unsupervised_model::prepare_data(std::vector<std::shared_ptr<base_nlp_model> >& dep_models)
+  {
+    LOG_S(INFO) << __FUNCTION__;
+    
+    std::ifstream ifs(train_file.c_str());
+    if(not ifs.good())
+      {
+	LOG_S(ERROR) << "could not read from file: " << train_file;
+	return 0;
+      }
+
+    std::ofstream ofs_train(fasttext_train_file.c_str());
+    if(not ofs_train.good())
+      {
+	LOG_S(ERROR) << "could not create fasttext train-file: " << fasttext_train_file;
+	return 0;
+      }
+
+    std::ofstream ofs_eval(fasttext_validation_file.c_str());
+    if(not ofs_eval.good())
+      {
+	LOG_S(ERROR) << "could not create fasttext eval-file: " << fasttext_validation_file;
+	return 0;
+      }    
+
+    train_samples.clear();
+    eval_samples.clear();    
+
+    std::random_device rd;  // Will be used to obtain a seed for the random number engine
+    std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
+    std::uniform_real_distribution<> dis(0.0, 1.0);
+    
+    LOG_S(INFO) << "start reading from file: " << train_file;
+
+    auto char_normaliser = text_element::create_char_normaliser(false);
+    auto text_normaliser = text_element::create_text_normaliser(false);
+    
+    std::string line, orig="null", text="null", label="null";
+    while(std::getline(ifs, line))
+      {
+	nlohmann::json item = nlohmann::json::parse(line);
+
+	bool training_sample = bool(dis(gen)<0.9);
+	if(item.count("training-sample"))
+	  {
+	    training_sample = item.at("training-sample").get<bool>();
+	  }
+	
+	bool good = false;	
+	if(item.count("text"))
+	  {
+	    //label = item.at("label").get<std::string>();
+	    orig = item.at("text").get<std::string>();
+	    
+	    subject<TEXT> subj;
+	    subj.set(orig, char_normaliser, text_normaliser);
+
+	    for(auto dep_model:dep_models)
+	      {
+		dep_model->apply(subj);
+	      }
+		
+	    good = this->preprocess(subj, text);
+	  }
+	else
+	  {
+	    LOG_S(WARNING) << "no `text` detected: aborting ...";
+	    return false;
+	  }
+
+	if(not good)
+	  {
+	    continue;
+	  }
+	
+	if(training_sample)
+	  {
+	    ofs_train << text << "\n";
+	    train_samples.push_back({text});
+	  }
+	else
+	  {
+	    ofs_eval << text << "\n";
+	    eval_samples.push_back({text});
+	  }
+
+      }
+
+    LOG_S(INFO) << "read successfully: #-train: " << train_samples.size() << ", #-val: " << eval_samples.size();
+
+    return true;    
+  }
+  
+  bool fasttext_unsupervised_model::launch_training()
+  {
+    LOG_S(INFO) << __FUNCTION__;
+
+    // -autotune-validation ./tmp/semantic-model/nlp-train-semantic.annot.jsonl.fasttext.validate.txt -autotune-duration 360 -autotune-modelsize 100M -dim 64 -wordNgrams 1
+    
+    std::vector<std::string> args_vec
+      = {
+	 "", "unsupervised",
+	 
+	 "-input", fasttext_train_file,
+	 "-output", model_file//,
+
+	 //"-autotune-validation", fasttext_validation_file,
+	 //"-autotune-duration", std::to_string(autotune_duration),
+	 //"-autotune-modelsize", autotune_modelsize,
+	 
+	 //"-lr", std::to_string(learning_rate),
+	 //"-dim", std::to_string(dim),
+	 //"-ws", std::to_string(ws),
+	 //"-epoch", std::to_string(epoch),
+	 //"-wordNgrams", std::to_string(ngram)
+      };
+
+    if(autotune)
+      {
+	args_vec.push_back("-autotune-validation");
+	args_vec.push_back(fasttext_validation_file);
+
+	if(explicit_hpo_parameters.count("duration"))
+	  {
+	    args_vec.push_back("-autotune-duration");
+	    args_vec.push_back(std::to_string(duration));
+	  }
+
+	if(explicit_hpo_parameters.count("modelsize"))
+	  {
+	    args_vec.push_back("-autotune-modelsize");
+	    args_vec.push_back(modelsize);
+	  }
+      }
+
+    if(explicit_train_parameters.count("dim"))
+      {
+	args_vec.push_back("-dim");
+	args_vec.push_back(std::to_string(dim));	
+      }
+
+    if(explicit_train_parameters.count("ws"))
+      {
+	args_vec.push_back("-ws");
+	args_vec.push_back(std::to_string(ws));	
+      }
+
+    if(explicit_train_parameters.count("n-gram"))
+      {
+	args_vec.push_back("-wordNgrams");
+	args_vec.push_back(std::to_string(ngram));	
+      }    
+    
+    if(explicit_train_parameters.count("learning-rate"))
+      {
+	args_vec.push_back("-lr");
+	args_vec.push_back(std::to_string(learning_rate));	
+      }
+
+    if(explicit_train_parameters.count("epoch"))
+      {
+	args_vec.push_back("-epoch");
+	args_vec.push_back(std::to_string(dim));	
+      }    
+    
+    if(model==NULL)
+      {
+	model = std::make_shared<ft_model_type>();
+      }
+
+    {
+      std::stringstream ss;
+      ss << "fasttext ";
+      for(auto _ : args_vec)
+	{
+	  ss << _ << " ";
+	}
+
+      //LOG_S(INFO) << "training with command:\n" << ss.str(); 
+    }
+    
+    ft_args_type ft_args;
+    ft_args.parseArgs(args_vec);
+    
+    if(ft_args.hasAutotune())
+      {
+	//LOG_S(INFO) << "start HPO autotuning ... ";
+	
+	ft_autotune_type autotune(model);
+	autotune.train(ft_args);
+      }
+    else
+      {
+	model->train(ft_args);	
+      }
+
+    return true;
+  }  
+  
+}
+
+#endif