From e71519d6981fe8263f3ebfd6fccf3b0adf15e4c0 Mon Sep 17 00:00:00 2001
From: pengkun <pengkun@xinshiyun.com>
Date: Wed, 20 Nov 2024 14:09:15 +0800
Subject: [PATCH 1/4] =?UTF-8?q?=E4=BF=AE=E6=94=B9FunASR=E5=AE=9E=E6=97=B6?=
 =?UTF-8?q?=E8=AF=86=E5=88=AB=E6=A1=86=E6=9E=B6=EF=BC=8C=E5=AE=9E=E6=97=B6?=
 =?UTF-8?q?=E8=AF=86=E5=88=AB=E6=97=B62pass=E6=A8=A1=E5=BC=8F=E4=B8=8B?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=A1=86=E6=9E=B6=E5=B1=82=E9=9D=A2=E8=BF=94?=
 =?UTF-8?q?=E5=9B=9E=E5=8F=A5=E5=AD=90=E7=BA=A7=E5=88=AB=E7=9A=84=E6=97=B6?=
 =?UTF-8?q?=E9=97=B4=E6=88=B3=EF=BC=8C=E5=8D=95=E4=BD=8D=E6=AF=AB=E7=A7=92?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 runtime/onnxruntime/include/audio.h           |  2 +
 runtime/onnxruntime/include/funasrruntime.h   |  2 +
 runtime/onnxruntime/src/audio.cpp             | 40 ++++++++++++++++++
 runtime/onnxruntime/src/commonfunc.h          |  2 +
 runtime/onnxruntime/src/funasrruntime.cpp     | 20 +++++++++
 .../websocket/bin/websocket-server-2pass.cpp  | 42 +++++++++++++++++--
 .../websocket/bin/websocket-server-2pass.h    | 10 ++++-
 7 files changed, 112 insertions(+), 6 deletions(-)
diff --git a/runtime/onnxruntime/include/audio.h b/runtime/onnxruntime/include/audio.h
index 3011050ed..b14d3b85d 100644
--- a/runtime/onnxruntime/include/audio.h
+++ b/runtime/onnxruntime/include/audio.h
@@ -100,6 +100,8 @@ class DLLAPI Audio {
     int offset = 0;
     int speech_start=-1, speech_end=0;
     int speech_offline_start=-1;
+    int64_t start = 0;  // start of the most recent VAD speech segment, written by Split(); presumably milliseconds — confirm
+    int64_t end = 0;  // end of the most recent VAD speech segment, written by Split(); presumably milliseconds — confirm
 
     int seg_sample = MODEL_SAMPLE_RATE/1000;
     bool LoadPcmwavOnline(const char* buf, int n_file_len, int32_t* sampling_rate);
diff --git a/runtime/onnxruntime/include/funasrruntime.h b/runtime/onnxruntime/include/funasrruntime.h
index 685c0241f..1a3cff607 100644
--- a/runtime/onnxruntime/include/funasrruntime.h
+++ b/runtime/onnxruntime/include/funasrruntime.h
@@ -70,6 +70,8 @@ _FUNASRAPI const char*	FunASRGetResult(FUNASR_RESULT result,int n_index);
 _FUNASRAPI const char*	FunASRGetStamp(FUNASR_RESULT result);
 _FUNASRAPI const char*	FunASRGetStampSents(FUNASR_RESULT result);
 _FUNASRAPI const char*	FunASRGetTpassResult(FUNASR_RESULT result,int n_index);
+_FUNASRAPI const int64_t	FunASRGetTpassStart(FUNASR_RESULT result);	// start value stored in the result; 0 when result is null
+_FUNASRAPI const int64_t	FunASRGetTpassEnd(FUNASR_RESULT result);	// end value stored in the result; 0 when result is null
 _FUNASRAPI const int	FunASRGetRetNumber(FUNASR_RESULT result);
 _FUNASRAPI void			FunASRFreeResult(FUNASR_RESULT result);
 _FUNASRAPI void			FunASRUninit(FUNASR_HANDLE handle);
diff --git a/runtime/onnxruntime/src/audio.cpp b/runtime/onnxruntime/src/audio.cpp
index 22a9ecd29..b73369247 100644
--- a/runtime/onnxruntime/src/audio.cpp
+++ b/runtime/onnxruntime/src/audio.cpp
@@ -1289,6 +1289,33 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
             }
         }
     }else{
+
+        int sample_rate = 16000;  // sample_rate 是音频的采样率 这里固定为16000 Hz
+        float segment_duration =  (static_cast<float>(seg_sample) / sample_rate) * 1000;  // 每个分段的持续时间（毫秒）
+
+        // for (auto vad_segment : vad_segments) {
+        //     int speech_start_i = -1, speech_end_i = -1;
+        //     if (vad_segment[0] != -1) {
+        //         speech_start_i = vad_segment[0];
+        //     }
+        //     if (vad_segment[1] != -1) {
+        //         speech_end_i = vad_segment[1];
+        //     }
+
+        //     // 计算并打印语音片段的开始和结束时间
+        //     if (speech_start_i != -1 && speech_end_i != -1) {
+        //         float start_time = speech_start_i * segment_duration;  // 开始时间（秒）
+        //         float end_time = speech_end_i * segment_duration;      // 结束时间（秒）
+        //         std::cout << "Speech segment: Start time = " << start_time << "s, End time = " << end_time << "s" << std::endl;
+        //     } else if (speech_start_i != -1) {
+        //         float start_time = speech_start_i * segment_duration;  // 仅有开始时间
+        //         std::cout << "Speech segment: Start time = " << start_time << "s, End time = Unknown" << std::endl;
+        //     } else if (speech_end_i != -1) {
+        //         float end_time = speech_end_i * segment_duration;      // 仅有结束时间
+        //         std::cout << "Speech segment: Start time = Unknown, End time = " << end_time << "s" << std::endl;
+        //     }
+        // }
+
         for(auto vad_segment: vad_segments){
             int speech_start_i=-1, speech_end_i=-1;
             if(vad_segment[0] != -1){
@@ -1325,6 +1352,13 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
                     frame = nullptr;
                 }
 
+                // Record the start and end time of this speech segment
+                float start_time = speech_start_i * segment_duration;  // start time (milliseconds)
+                float end_time = speech_end_i * segment_duration;      // end time (milliseconds)
+                // 转换为 int64_t 类型并赋值给类的成员变量
+                this->start = static_cast<int64_t>(start_time);
+                this->end = static_cast<int64_t>(end_time);
+                //std::cout << "Speech segment: Start time = " << this->start << "ms, End time = " << this->end << "ms" << std::endl;
                 speech_start = -1;
                 speech_offline_start = -1;
             // [70, -1]
@@ -1350,6 +1384,9 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
                     }
                 }
 
+                float start_time = speech_start_i * segment_duration;  // 仅有开始时间
+                this->start = static_cast<int64_t>(start_time);
+                //std::cout << "Speech segment: Start time = " << this->start << "ms, End time = Unknown" << std::endl;
             }else if(speech_end_i != -1){ // [-1,100]
                 if(speech_start == -1 || speech_offline_start == -1){
                     LOG(ERROR) <<"Vad start is null while vad end is available. Set vad start 0" ;
@@ -1399,6 +1436,9 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
                         frame = nullptr;
                     }
                 }
+                float end_time = speech_end_i * segment_duration;      // 仅有结束时间
+                this->end = static_cast<int64_t>(end_time);
+                //std::cout << "Speech segment: Start time = Unknown, End time = " << this->end << "ms" << std::endl;
                 speech_start = -1;
                 speech_offline_start = -1;
             }
diff --git a/runtime/onnxruntime/src/commonfunc.h b/runtime/onnxruntime/src/commonfunc.h
index 6fd553fe0..81fa2422e 100644
--- a/runtime/onnxruntime/src/commonfunc.h
+++ b/runtime/onnxruntime/src/commonfunc.h
@@ -12,6 +12,8 @@ typedef struct
     std::string stamp_sents;
     std::string tpass_msg;
     float snippet_time;
+    int64_t start = 0;  // copied from audio->start right after audio->Split()
+    int64_t end = 0;  // copied from audio->end right after audio->Split()
 }FUNASR_RECOG_RESULT;
 
 typedef struct
diff --git a/runtime/onnxruntime/src/funasrruntime.cpp b/runtime/onnxruntime/src/funasrruntime.cpp
index 628641268..1727900e4 100644
--- a/runtime/onnxruntime/src/funasrruntime.cpp
+++ b/runtime/onnxruntime/src/funasrruntime.cpp
@@ -523,6 +523,9 @@
 		p_result->snippet_time = audio->GetTimeLen();
 		
 		audio->Split(vad_online_handle, chunk_len, input_finished, mode);
+		p_result->start = audio->start;
+		p_result->end = audio->end;
+		//std::cout << "p_result: Start time = " << p_result->start << "ms, End time = " << p_result->end << "ms" << std::endl;
 
 		funasr::AudioFrame* frame = nullptr;
 		while(audio->FetchChunck(frame) > 0){
@@ -695,6 +698,23 @@
 		return p_result->tpass_msg.c_str();
 	}
 
+	_FUNASRAPI const int64_t FunASRGetTpassStart(FUNASR_RESULT result)	// returns the start field of a recognition result
+	{
+		funasr::FUNASR_RECOG_RESULT * p_result = (funasr::FUNASR_RECOG_RESULT*)result;
+		if(!p_result)	// null handle: report 0 instead of dereferencing
+			return 0;
+
+		return p_result->start;	// value copied from audio->start after Split()
+	}
+	_FUNASRAPI const int64_t FunASRGetTpassEnd(FUNASR_RESULT result)	// returns the end field of a recognition result
+	{
+		funasr::FUNASR_RECOG_RESULT * p_result = (funasr::FUNASR_RECOG_RESULT*)result;
+		if(!p_result)	// null handle: report 0 instead of dereferencing
+			return 0;
+
+		return p_result->end;	// value copied from audio->end after Split()
+	}
+
 	_FUNASRAPI const char* CTTransformerGetResult(FUNASR_RESULT result,int n_index)
 	{
 		funasr::FUNASR_PUNC_RESULT * p_result = (funasr::FUNASR_PUNC_RESULT*)result;
diff --git a/runtime/websocket/bin/websocket-server-2pass.cpp b/runtime/websocket/bin/websocket-server-2pass.cpp
index ff23e9d41..53f499d06 100644
--- a/runtime/websocket/bin/websocket-server-2pass.cpp
+++ b/runtime/websocket/bin/websocket-server-2pass.cpp
@@ -15,11 +15,19 @@
 #include <thread>
 #include <utility>
 #include <vector>
+#include <iostream>
+#include <chrono>
 
 extern std::unordered_map<std::string, int> hws_map_;
 extern int fst_inc_wts_;
 extern float global_beam_, lattice_beam_, am_scale_;
 
+int64_t getCurrentTimeMillis() {  // current std::chrono::system_clock time as milliseconds since its epoch
+    auto now = std::chrono::system_clock::now();
+    auto millis = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();  // duration_cast truncates to whole milliseconds
+    return millis;
+}
+
 context_ptr WebSocketServer::on_tls_init(tls_mode mode,
                                          websocketpp::connection_hdl hdl,
                                          std::string& s_certfile,
@@ -57,7 +65,13 @@ context_ptr WebSocketServer::on_tls_init(tls_mode mode,
   return ctx;
 }
 
-nlohmann::json handle_result(FUNASR_RESULT result) {
+nlohmann::json handle_result(FUNASR_RESULT result, websocketpp::connection_hdl& hdl, std::map<websocketpp::connection_hdl, std::shared_ptr<FUNASR_MESSAGE>,std::owner_less<websocketpp::connection_hdl>>& data_map) {
+  // Per-connection sentence state. If hdl is not (or no longer) in data_map, fall back to a
+  // throwaway message so the unconditional data_msg-> accesses below never dereference nullptr.
+  const auto it = data_map.find(hdl);
+  std::shared_ptr<FUNASR_MESSAGE> data_msg =
+      (it != data_map.end() && it->second) ? it->second : std::make_shared<FUNASR_MESSAGE>();
+
   websocketpp::lib::error_code ec;
   nlohmann::json jsonresult;
   jsonresult["text"] = "";
@@ -67,12 +81,26 @@ nlohmann::json handle_result(FUNASR_RESULT result) {
     LOG(INFO) << "online_res :" << tmp_online_msg;
     jsonresult["text"] = tmp_online_msg;
     jsonresult["mode"] = "2pass-online";
+
+    // 如果是第一句话的第一个实时结果或新的句子开始
+    if (!data_msg->is_sentence_started) {
+      data_msg->start_time = FunASRGetTpassStart(result);  // 记录句子的开始时间
+      data_msg->is_sentence_started = true;
+    } 
   }
+
+  data_msg->end_time = FunASRGetTpassEnd(result);  // 记录句子的结束时间
+
   std::string tmp_tpass_msg = FunASRGetTpassResult(result, 0);
   if (tmp_tpass_msg != "") {
     LOG(INFO) << "offline results : " << tmp_tpass_msg;
     jsonresult["text"] = tmp_tpass_msg;
     jsonresult["mode"] = "2pass-offline";
+
+    // 句子结束，记录结束时间
+    jsonresult["start_time"] = data_msg->start_time;
+    jsonresult["end_time"] = data_msg->end_time;
+    data_msg->is_sentence_started = false;  // 重置句子状态
   }
 
   std::string tmp_stamp_msg = FunASRGetStamp(result);
@@ -98,6 +126,7 @@ nlohmann::json handle_result(FUNASR_RESULT result) {
 }
 // feed buffer to asr engine for decoder
 void WebSocketServer::do_decoder(
+    std::map<websocketpp::connection_hdl, std::shared_ptr<FUNASR_MESSAGE>,std::owner_less<websocketpp::connection_hdl>>& data_map,
     std::vector<char>& buffer, 
     websocketpp::connection_hdl& hdl,
     nlohmann::json& msg, 
@@ -158,7 +187,7 @@ void WebSocketServer::do_decoder(
       }
       if (Result) {
         websocketpp::lib::error_code ec;
-        nlohmann::json jsonresult = handle_result(Result);
+        nlohmann::json jsonresult = handle_result(Result, hdl, data_map);
         jsonresult["wav_name"] = wav_name;
         jsonresult["is_final"] = false;
         if (jsonresult["text"] != "") {
@@ -200,7 +229,7 @@ void WebSocketServer::do_decoder(
       }
       if (Result) {
         websocketpp::lib::error_code ec;
-        nlohmann::json jsonresult = handle_result(Result);
+        nlohmann::json jsonresult = handle_result(Result, hdl, data_map);
         jsonresult["wav_name"] = wav_name;
         jsonresult["is_final"] = true;
         if (is_ssl) {
@@ -254,7 +283,8 @@ void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
     data_msg->msg["audio_fs"] = 16000; // default is 16k
     data_msg->msg["access_num"] = 0; // the number of access for this object, when it is 0, we can free it saftly
     data_msg->msg["is_eof"]=false; // if this connection is closed
-    data_msg->msg["svs_lang"]="auto";
+    //data_msg->msg["svs_lang"]="auto";
+    data_msg->msg["svs_lang"]="zh"; // SenseVoice使用时，只需要中文则设为zh
     data_msg->msg["svs_itn"]=true;
     FUNASR_DEC_HANDLE decoder_handle =
       FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, global_beam_, lattice_beam_, am_scale_);
@@ -263,6 +293,8 @@ void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
         std::make_shared<std::vector<std::vector<std::string>>>(2);
   	data_msg->strand_ =	std::make_shared<asio::io_context::strand>(io_decoder_);
 
+    data_msg->is_sentence_started = false;
+
     data_map.emplace(hdl, data_msg);
   }catch (std::exception const& e) {
     std::cerr << "Error: " << e.what() << std::endl;
@@ -501,6 +533,7 @@ void WebSocketServer::on_message(websocketpp::connection_hdl hdl,
           std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding));
           msg_data->strand_->post(
               std::bind(&WebSocketServer::do_decoder, this,
+                        data_map,  // NOTE(review): passed without std::ref, so std::bind stores its own copy of the whole map for this post — confirm intended
                         std::move(*(sample_data_p.get())), std::move(hdl),
                         std::ref(msg_data->msg), std::ref(*(punc_cache_p.get())),
                         std::move(hotwords_embedding_),
@@ -550,6 +583,7 @@ void WebSocketServer::on_message(websocketpp::connection_hdl hdl,
               std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding));
               msg_data->strand_->post(
                         std::bind(&WebSocketServer::do_decoder, this,
+                                  data_map,  // NOTE(review): passed without std::ref, so std::bind stores its own copy of the whole map for this post — confirm intended
                                   std::move(subvector), std::move(hdl),
                                   std::ref(msg_data->msg),
                                   std::ref(*(punc_cache_p.get())),
diff --git a/runtime/websocket/bin/websocket-server-2pass.h b/runtime/websocket/bin/websocket-server-2pass.h
index e61a93b2d..d0d0ca85e 100644
--- a/runtime/websocket/bin/websocket-server-2pass.h
+++ b/runtime/websocket/bin/websocket-server-2pass.h
@@ -61,7 +61,11 @@ typedef struct {
   std::string online_res = "";
   std::string tpass_res = "";
   std::shared_ptr<asio::io_context::strand>  strand_; // for data execute in order
-  FUNASR_DEC_HANDLE decoder_handle=nullptr; 
+  FUNASR_DEC_HANDLE decoder_handle=nullptr;
+
+  bool is_sentence_started = false;
+  int64_t start_time = 0;
+  int64_t end_time = 0;
 } FUNASR_MESSAGE;
 
 // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about
@@ -114,7 +118,9 @@ class WebSocketServer {
       server_->clear_access_channels(websocketpp::log::alevel::all);
     }
   }
-  void do_decoder(std::vector<char>& buffer, websocketpp::connection_hdl& hdl,
+  void do_decoder(std::map<websocketpp::connection_hdl, std::shared_ptr<FUNASR_MESSAGE>,std::owner_less<websocketpp::connection_hdl>>& data_map,
+                  std::vector<char>& buffer, 
+                  websocketpp::connection_hdl& hdl,
                   nlohmann::json& msg,
                   std::vector<std::vector<std::string>>& punc_cache,
                   std::vector<std::vector<float>> &hotwords_embedding,

From 2fbfd5f00bf9dc386899d2d2b3df3239bf4641de Mon Sep 17 00:00:00 2001
From: pengkun <pengkun@xinshiyun.com>
Date: Wed, 20 Nov 2024 14:41:32 +0800
Subject: [PATCH 2/4] =?UTF-8?q?=E4=BF=AE=E6=94=B9FunASR=E5=AE=9E=E6=97=B6?=
 =?UTF-8?q?=E8=AF=86=E5=88=AB=E6=A1=86=E6=9E=B6=EF=BC=8C=E5=AE=9E=E6=97=B6?=
 =?UTF-8?q?=E8=AF=86=E5=88=AB=E6=97=B62pass=E6=A8=A1=E5=BC=8F=E4=B8=8B?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81=E6=A1=86=E6=9E=B6=E5=B1=82=E9=9D=A2=E8=BF=94?=
 =?UTF-8?q?=E5=9B=9E=E5=8F=A5=E5=AD=90=E7=BA=A7=E5=88=AB=E7=9A=84=E6=97=B6?=
 =?UTF-8?q?=E9=97=B4=E6=88=B3=EF=BC=8C=E5=8D=95=E4=BD=8D=E6=AF=AB=E7=A7=92?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 runtime/onnxruntime/src/audio.cpp             | 26 -------------------
 runtime/onnxruntime/src/funasrruntime.cpp     |  1 -
 .../websocket/bin/websocket-server-2pass.cpp  |  3 +--
 3 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/runtime/onnxruntime/src/audio.cpp b/runtime/onnxruntime/src/audio.cpp
index b73369247..4ceb56a3a 100644
--- a/runtime/onnxruntime/src/audio.cpp
+++ b/runtime/onnxruntime/src/audio.cpp
@@ -1293,29 +1293,6 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
         int sample_rate = 16000;  // sample_rate 是音频的采样率 这里固定为16000 Hz
         float segment_duration =  (static_cast<float>(seg_sample) / sample_rate) * 1000;  // 每个分段的持续时间（毫秒）
 
-        // for (auto vad_segment : vad_segments) {
-        //     int speech_start_i = -1, speech_end_i = -1;
-        //     if (vad_segment[0] != -1) {
-        //         speech_start_i = vad_segment[0];
-        //     }
-        //     if (vad_segment[1] != -1) {
-        //         speech_end_i = vad_segment[1];
-        //     }
-
-        //     // 计算并打印语音片段的开始和结束时间
-        //     if (speech_start_i != -1 && speech_end_i != -1) {
-        //         float start_time = speech_start_i * segment_duration;  // 开始时间（秒）
-        //         float end_time = speech_end_i * segment_duration;      // 结束时间（秒）
-        //         std::cout << "Speech segment: Start time = " << start_time << "s, End time = " << end_time << "s" << std::endl;
-        //     } else if (speech_start_i != -1) {
-        //         float start_time = speech_start_i * segment_duration;  // 仅有开始时间
-        //         std::cout << "Speech segment: Start time = " << start_time << "s, End time = Unknown" << std::endl;
-        //     } else if (speech_end_i != -1) {
-        //         float end_time = speech_end_i * segment_duration;      // 仅有结束时间
-        //         std::cout << "Speech segment: Start time = Unknown, End time = " << end_time << "s" << std::endl;
-        //     }
-        // }
-
         for(auto vad_segment: vad_segments){
             int speech_start_i=-1, speech_end_i=-1;
             if(vad_segment[0] != -1){
@@ -1358,7 +1335,6 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
                 // 转换为 int64_t 类型并赋值给类的成员变量
                 this->start = static_cast<int64_t>(start_time);
                 this->end = static_cast<int64_t>(end_time);
-                //std::cout << "Speech segment: Start time = " << this->start << "ms, End time = " << this->end << "ms" << std::endl;
                 speech_start = -1;
                 speech_offline_start = -1;
             // [70, -1]
@@ -1386,7 +1362,6 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
 
                 float start_time = speech_start_i * segment_duration;  // 仅有开始时间
                 this->start = static_cast<int64_t>(start_time);
-                //std::cout << "Speech segment: Start time = " << this->start << "ms, End time = Unknown" << std::endl;
             }else if(speech_end_i != -1){ // [-1,100]
                 if(speech_start == -1 || speech_offline_start == -1){
                     LOG(ERROR) <<"Vad start is null while vad end is available. Set vad start 0" ;
@@ -1438,7 +1413,6 @@ void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYP
                 }
                 float end_time = speech_end_i * segment_duration;      // 仅有结束时间
                 this->end = static_cast<int64_t>(end_time);
-                //std::cout << "Speech segment: Start time = Unknown, End time = " << this->end << "ms" << std::endl;
                 speech_start = -1;
                 speech_offline_start = -1;
             }
diff --git a/runtime/onnxruntime/src/funasrruntime.cpp b/runtime/onnxruntime/src/funasrruntime.cpp
index 1727900e4..1eb8230bd 100644
--- a/runtime/onnxruntime/src/funasrruntime.cpp
+++ b/runtime/onnxruntime/src/funasrruntime.cpp
@@ -525,7 +525,6 @@
 		audio->Split(vad_online_handle, chunk_len, input_finished, mode);
 		p_result->start = audio->start;
 		p_result->end = audio->end;
-		//std::cout << "p_result: Start time = " << p_result->start << "ms, End time = " << p_result->end << "ms" << std::endl;
 
 		funasr::AudioFrame* frame = nullptr;
 		while(audio->FetchChunck(frame) > 0){
diff --git a/runtime/websocket/bin/websocket-server-2pass.cpp b/runtime/websocket/bin/websocket-server-2pass.cpp
index 53f499d06..5a04922a9 100644
--- a/runtime/websocket/bin/websocket-server-2pass.cpp
+++ b/runtime/websocket/bin/websocket-server-2pass.cpp
@@ -283,8 +283,7 @@ void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
     data_msg->msg["audio_fs"] = 16000; // default is 16k
     data_msg->msg["access_num"] = 0; // the number of access for this object, when it is 0, we can free it saftly
     data_msg->msg["is_eof"]=false; // if this connection is closed
-    //data_msg->msg["svs_lang"]="auto";
-    data_msg->msg["svs_lang"]="zh"; // SenseVoice使用时，只需要中文则设为zh
+    data_msg->msg["svs_lang"]="auto";
     data_msg->msg["svs_itn"]=true;
     FUNASR_DEC_HANDLE decoder_handle =
       FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, global_beam_, lattice_beam_, am_scale_);

From 550ff418b6d086190acf5de88cd95d45ff3f0a16 Mon Sep 17 00:00:00 2001
From: pengkun <pengkun@xinshiyun.com>
Date: Fri, 14 Mar 2025 09:25:46 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=AF=86=E5=88=AB?=
 =?UTF-8?q?=E7=9B=B8=E5=85=B3=E6=A0=87=E8=AF=86=E5=AD=97=E6=AE=B5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 runtime/websocket/bin/websocket-server-2pass.cpp | 9 ++++++++-
 runtime/websocket/bin/websocket-server-2pass.h   | 1 +
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/runtime/websocket/bin/websocket-server-2pass.cpp b/runtime/websocket/bin/websocket-server-2pass.cpp
index 5a04922a9..23f417841 100644
--- a/runtime/websocket/bin/websocket-server-2pass.cpp
+++ b/runtime/websocket/bin/websocket-server-2pass.cpp
@@ -81,10 +81,13 @@ nlohmann::json handle_result(FUNASR_RESULT result, websocketpp::connection_hdl&
     LOG(INFO) << "online_res :" << tmp_online_msg;
     jsonresult["text"] = tmp_online_msg;
     jsonresult["mode"] = "2pass-online";
+    jsonresult["slice_type"] = 1;  // default for an online result: sentence in progress (overridden to 0 below for the first one)
+    jsonresult["index"] = data_msg->index;  // sentence counter kept in this connection's FUNASR_MESSAGE
 
     // 如果是第一句话的第一个实时结果或新的句子开始
     if (!data_msg->is_sentence_started) {
       data_msg->start_time = FunASRGetTpassStart(result);  // 记录句子的开始时间
+      jsonresult["slice_type"] = 0; // 0: sentence recognition starts; 1: sentence recognition in progress; 2: sentence recognition finished
       data_msg->is_sentence_started = true;
     } 
   }
@@ -100,6 +103,10 @@ nlohmann::json handle_result(FUNASR_RESULT result, websocketpp::connection_hdl&
     // 句子结束，记录结束时间
     jsonresult["start_time"] = data_msg->start_time;
     jsonresult["end_time"] = data_msg->end_time;
+    jsonresult["slice_type"] = 2;  // offline (2pass-offline) result marks the end of the sentence
+    jsonresult["index"] = data_msg->index;  // same index the online results of this sentence carried
+
+    data_msg->index++; // sentence sequence number; advanced once per offline result
     data_msg->is_sentence_started = false;  // 重置句子状态
   }
 
@@ -283,7 +290,7 @@ void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
     data_msg->msg["audio_fs"] = 16000; // default is 16k
     data_msg->msg["access_num"] = 0; // the number of access for this object, when it is 0, we can free it saftly
     data_msg->msg["is_eof"]=false; // if this connection is closed
-    data_msg->msg["svs_lang"]="auto";
+    data_msg->msg["svs_lang"]="zh"; // NOTE(review): replaces the previous "auto" default for every new connection — confirm this change is intended in this commit
     data_msg->msg["svs_itn"]=true;
     FUNASR_DEC_HANDLE decoder_handle =
       FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, global_beam_, lattice_beam_, am_scale_);
diff --git a/runtime/websocket/bin/websocket-server-2pass.h b/runtime/websocket/bin/websocket-server-2pass.h
index d0d0ca85e..6d6be52df 100644
--- a/runtime/websocket/bin/websocket-server-2pass.h
+++ b/runtime/websocket/bin/websocket-server-2pass.h
@@ -66,6 +66,7 @@ typedef struct {
   bool is_sentence_started = false;
   int64_t start_time = 0;
   int64_t end_time = 0;
+  int64_t index = 0;
 } FUNASR_MESSAGE;
 
 // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about

From 6ae8e84574616a6cd36d31ef4c6c5b2f1ed3b059 Mon Sep 17 00:00:00 2001
From: pengkun <pengkun@xinshiyun.com>
Date: Fri, 14 Mar 2025 14:16:53 +0800
Subject: [PATCH 4/4] =?UTF-8?q?2pass=E6=A8=A1=E5=BC=8F=E8=BF=94=E5=9B=9E?=
 =?UTF-8?q?=E5=AD=97=E6=AE=B5=E6=96=B0=E5=A2=9E=E5=AF=B9=E8=AF=9D=E5=BC=80?=
 =?UTF-8?q?=E5=A7=8B=E6=97=B6=E9=97=B4=E6=88=B3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 runtime/websocket/bin/websocket-server-2pass.cpp | 5 +++++
 runtime/websocket/bin/websocket-server-2pass.h   | 1 +
 2 files changed, 6 insertions(+)

diff --git a/runtime/websocket/bin/websocket-server-2pass.cpp b/runtime/websocket/bin/websocket-server-2pass.cpp
index 23f417841..4834f3423 100644
--- a/runtime/websocket/bin/websocket-server-2pass.cpp
+++ b/runtime/websocket/bin/websocket-server-2pass.cpp
@@ -93,6 +93,7 @@ nlohmann::json handle_result(FUNASR_RESULT result, websocketpp::connection_hdl&
   }
 
   data_msg->end_time = FunASRGetTpassEnd(result);  // 记录句子的结束时间
+  jsonresult["timestamp"] = data_msg->timestamp;  // added to every result; value is set once in on_open
 
   std::string tmp_tpass_msg = FunASRGetTpassResult(result, 0);
   if (tmp_tpass_msg != "") {
@@ -198,6 +199,7 @@ void WebSocketServer::do_decoder(
         jsonresult["wav_name"] = wav_name;
         jsonresult["is_final"] = false;
         if (jsonresult["text"] != "") {
+          LOG(INFO) << "jsonresult: " << jsonresult.dump(4);
           if (is_ssl) {
             wss_server_->send(hdl, jsonresult.dump(),
                               websocketpp::frame::opcode::text, ec);
@@ -239,6 +241,7 @@ void WebSocketServer::do_decoder(
         nlohmann::json jsonresult = handle_result(Result, hdl, data_map);
         jsonresult["wav_name"] = wav_name;
         jsonresult["is_final"] = true;
+        LOG(INFO) << "jsonresult: " << jsonresult.dump(4);
         if (is_ssl) {
           wss_server_->send(hdl, jsonresult.dump(),
                             websocketpp::frame::opcode::text, ec);
@@ -301,6 +304,8 @@ void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
 
     data_msg->is_sentence_started = false;
 
+    data_msg->timestamp = getCurrentTimeMillis();  // system_clock time (ms since epoch) at which this connection was opened
+
     data_map.emplace(hdl, data_msg);
   }catch (std::exception const& e) {
     std::cerr << "Error: " << e.what() << std::endl;
diff --git a/runtime/websocket/bin/websocket-server-2pass.h b/runtime/websocket/bin/websocket-server-2pass.h
index 6d6be52df..3ba63089c 100644
--- a/runtime/websocket/bin/websocket-server-2pass.h
+++ b/runtime/websocket/bin/websocket-server-2pass.h
@@ -67,6 +67,7 @@ typedef struct {
   int64_t start_time = 0;
   int64_t end_time = 0;
   int64_t index = 0;
+  int64_t timestamp = 0;  // set in on_open from getCurrentTimeMillis(); copied into the JSON built by handle_result
 } FUNASR_MESSAGE;
 
 // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about