Commit f066384

lvhan028 and lzhangzz authored
Fix get_ppl & get_logits (#3008)
* refactor
* async interface
* update perf metrics & adaptive tokens per tick
* wait-free
* refactor gateway
* optimize throughput
* add cancel cb
* simplify async engine
* simplify async engine
* fix end session
* faster synchronization
* fix async engine
* refactor async engine
* fix semaphore
* refactor inference API
* remove turbomind sync interface
* fix msvc build
* fix msvc build
* fix msvc build
* add extra outputs
* skip stop tokens
* exit gracefully
* cancel all tasks atexit
* refactor profiler
* fix id2step for api server
* save csv
* fix interactive
* fix lint
* fix generate_token_len
* fix async_end
* update pipeline ut
* fix ignore eos
* minor
* refactor profile pipeline api
* fix get_logits
* update get_ppl
* fix benchmark script
* fix get_ppl
* bring get_logits API back
* update user guide
* resolve reviewer's comments
* update
* fix
* update

---------

Co-authored-by: Li Zhang <[email protected]>
1 parent 086481e commit f066384

9 files changed: +239 -430 lines changed

benchmark/profile_pipeline_api.py

+1 -1

@@ -76,7 +76,7 @@ def process_request(self, requests, profiler: Profiler, temperature, top_p,
             top_p=top_p,
             top_k=top_k,
             ignore_eos=True,
-            do_sample=True,
+            do_sample=False,
             max_new_tokens=output_len)
         for _, _, output_len in requests
     ]
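
The switch to `do_sample=False` makes the profiler decode greedily, so repeated benchmark runs produce identical token streams and comparable timings. A minimal sketch of the same configuration outside the profiler; the model name and token budget below are illustrative assumptions, not part of the commit:

```python
from lmdeploy import pipeline, GenerationConfig

# Illustrative model; any chat model served by lmdeploy works the same way.
pipe = pipeline('internlm/internlm2_5-7b-chat')

gen_config = GenerationConfig(
    top_p=0.8,
    top_k=40,
    ignore_eos=True,     # keep generating past EOS, as the profiler does
    do_sample=False,     # greedy decoding: deterministic output
    max_new_tokens=128)  # illustrative output length

response = pipe(['Shanghai is'], gen_config=gen_config)
print(response[0].text)
```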

docs/en/llm/pipeline.md

+43 -14

@@ -6,7 +6,7 @@ You can overview the detailed pipeline API in [this](https://lmdeploy.readthedoc
 
 ## Usage
 
-- **An example using default parameters:**
+### A 'Hello, world' example
 
 ```python
 from lmdeploy import pipeline
@@ -40,7 +40,7 @@ There have been alterations to the strategy for setting the k/v cache ratio thro
 
 The allocation strategy for k/v cache is changed to reserve space from the **GPU free memory** proportionally. The ratio `TurbomindEngineConfig.cache_max_entry_count` has been adjusted to 0.8 by default. If OOM error happens, similar to the method mentioned above, please consider reducing the ratio value to decrease the memory usage of the k/v cache.
 
-- **An example showing how to set tensor parallel num**:
+### Set tensor parallelism
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
@@ -52,7 +52,7 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is'])
 print(response)
 ```
 
-- **An example for setting sampling parameters:**
+### Set sampling parameters
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
@@ -69,7 +69,7 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
 print(response)
 ```
 
-- **An example for OpenAI format prompt input:**
+### Apply OpenAI format prompt
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
@@ -93,7 +93,7 @@ response = pipe(prompts,
 print(response)
 ```
 
-- **An example for streaming mode:**
+### Apply streaming output
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
@@ -116,31 +116,60 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config):
 print(item)
 ```
 
-- **An example to cauculate logits & ppl:**
+### Get logits for generated tokens
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('internlm/internlm2_5-7b-chat')
+
+gen_config = GenerationConfig(output_logits='generation',
+                              max_new_tokens=10)
+response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
+                gen_config=gen_config)
+logits = [x.logits for x in response]
+```
+
+### Get last layer's hidden states for generated tokens
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('internlm/internlm2_5-7b-chat')
+
+gen_config = GenerationConfig(output_last_hidden_state='generation',
+                              max_new_tokens=10)
+response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
+                gen_config=gen_config)
+hidden_states = [x.last_hidden_state for x in response]
+```
+
+### Calculate ppl
 
 ```python
 from transformers import AutoTokenizer
 from lmdeploy import pipeline
-model_repoid_or_path='internlm/internlm2_5-7b-chat'
+
+
+model_repoid_or_path = 'internlm/internlm2_5-7b-chat'
 pipe = pipeline(model_repoid_or_path)
 tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)
-
-# logits
 messages = [
    {"role": "user", "content": "Hello, how are you?"},
 ]
 input_ids = tokenizer.apply_chat_template(messages)
-logits = pipe.get_logits(input_ids)
 
-# ppl
+# ppl is a list of float numbers
 ppl = pipe.get_ppl(input_ids)
+print(ppl)
 ```
 
 ```{note}
-get_ppl returns the cross entropy loss without applying the exponential operation afterwards
+- When input_ids is too long, an OOM (Out Of Memory) error may occur. Please apply it with caution
+- get_ppl returns the cross entropy loss without applying the exponential operation afterwards
 ```
 
-- **Below is an example for pytorch backend. Please install triton first.**
+### Use PyTorchEngine
 
 ```shell
 pip install triton>=2.1.0
@@ -167,7 +196,7 @@ response = pipe(prompts, gen_config=gen_config)
 print(response)
 ```
 
-- **An example for lora.**
+### Inference with LoRA
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig
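
As a usage note on the updated "Calculate ppl" section above: since `get_ppl` returns the cross entropy loss, exponentiating it gives the conventional perplexity. A short sketch under that assumption; the `math.exp` step is our addition, not part of the commit:

```python
import math

from transformers import AutoTokenizer
from lmdeploy import pipeline

model_repoid_or_path = 'internlm/internlm2_5-7b-chat'
pipe = pipeline(model_repoid_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

messages = [
    {"role": "user", "content": "Hello, how are you?"},
]
input_ids = tokenizer.apply_chat_template(messages)

# get_ppl returns a list with one cross-entropy loss per input sequence
losses = pipe.get_ppl(input_ids)
# exponentiate the loss to obtain perplexity
perplexities = [math.exp(loss) for loss in losses]
print(perplexities)
```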

docs/en/multi_modal/vl_pipeline.md

+17 -20

@@ -4,7 +4,7 @@ LMDeploy abstracts the complex inference process of multi-modal Vision-Language
 
 The supported models are listed [here](../supported_models/supported_models.md). We genuinely invite the community to contribute new VLM support to LMDeploy. Your involvement is truly appreciated.
 
-This article showcases the VLM pipeline using the [liuhaotian/llava-v1.6-vicuna-7b](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b) model as a case study.
+This article showcases the VLM pipeline using the [OpenGVLab/InternVL2_5-8B](https://huggingface.co/OpenGVLab/InternVL2_5-8B) model as a case study.
 You'll learn about the simplest ways to leverage the pipeline and how to gradually unlock more advanced features by adjusting engine parameters and generation arguments, such as tensor parallelism, context window sizing, random sampling, and chat template customization.
 Moreover, we will provide practical inference examples tailored to scenarios with multiple images, batch prompts etc.
 
@@ -16,7 +16,7 @@ Using the pipeline interface to infer other VLM models is similar, with the main
 from lmdeploy import pipeline
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b')
+pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
 response = pipe(('describe this image', image))
@@ -30,7 +30,7 @@ In the above example, the inference prompt is a tuple structure consisting of (p
 ```python
 from lmdeploy import pipeline
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b')
+pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
 prompts = [
     {
@@ -53,7 +53,7 @@ Tensor paramllelism can be activated by setting the engine parameter `tp`
 from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b',
+pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(tp=2))
 
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -69,7 +69,7 @@ When creating the pipeline, you can customize the size of the context window by
 from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b',
+pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
 
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -85,7 +85,7 @@ You can change the default sampling parameters of pipeline by passing `Generatio
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b',
+pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(tp=2, session_len=8192))
 gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.6)
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
@@ -139,22 +139,19 @@ response = pipe(('describe this image', image))
 print(response)
 ```
 
-### Calculate logits
-
-We provide support for custom inputs. Users can utilize 'prepare_inputs' to understand how the inputs are organized.
+### Output logits for generated tokens
 
 ```python
-from lmdeploy import pipeline, TurbomindEngineConfig
+from lmdeploy import pipeline, GenerationConfig
 from lmdeploy.vl import load_image
-pipe = pipeline('internlm/internlm-xcomposer2-7b', backend_config=TurbomindEngineConfig(cache_max_entry_count=0.5))
+pipe = pipeline('OpenGVLab/InternVL2_5-8B')
 
-# logits
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
-inputs = pipe.prepare_inputs(('describe this image', image))
-input_ids = inputs['input_ids']
-embeddings = inputs['input_embeddings']
-embedding_ranges = inputs['input_embedding_ranges']
-logits = pipe.get_logits(input_ids, embeddings, embedding_ranges)
+
+response = pipe(('describe this image', image),
+                gen_config=GenerationConfig(output_logits='generation'))
+logits = response.logits
+print(logits)
 ```
 
 ## Multi-images inference
@@ -165,7 +162,7 @@ When dealing with multiple images, you can put them all in one list. Keep in min
 from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b',
+pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
 
 image_urls=[
@@ -186,7 +183,7 @@ Conducting inference with batch prompts is quite straightforward; just place the
 from lmdeploy import pipeline, TurbomindEngineConfig
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b',
+pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
 
 image_urls=[
@@ -206,7 +203,7 @@ There are two ways to do the multi-turn conversations with the pipeline. One is
 from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
 from lmdeploy.vl import load_image
 
-pipe = pipeline('liuhaotian/llava-v1.6-vicuna-7b',
+pipe = pipeline('OpenGVLab/InternVL2_5-8B',
                 backend_config=TurbomindEngineConfig(session_len=8192))
 
 image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
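
For the new "Output logits for generated tokens" section, a hedged sketch of inspecting the result. It assumes the VLM pipeline's response exposes `logits` as a tensor with one row per generated token and carries a `generate_token_len` field like the text-only pipeline; neither assumption is stated by the commit itself:

```python
from lmdeploy import pipeline, GenerationConfig
from lmdeploy.vl import load_image

pipe = pipeline('OpenGVLab/InternVL2_5-8B')

image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image),
                gen_config=GenerationConfig(output_logits='generation',
                                            max_new_tokens=10))

# Expect shape (num_generated_tokens, vocab_size)
print(response.logits.shape)
# The row count should match the number of generated tokens
print(response.generate_token_len)
```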

docs/zh_cn/llm/pipeline.md

+46 -13

@@ -6,7 +6,7 @@ For the detailed pipeline API description, please read [this](https://lmdeploy.readthed
 
 ## Usage
 
-- **An example using default parameters:**
+### A "Hello, world" example
 
 ```python
 from lmdeploy import pipeline
@@ -40,7 +40,7 @@ During the development of LMDeploy, the strategy for setting the k/v cache ratio has changed. The following
 
 The allocation strategy now reserves space for the k/v cache proportionally from the **free GPU memory**. The default ratio is adjusted to 0.8. If OOM occurs, similar to the method above, please reduce the ratio as appropriate to lower the k/v cache memory usage
 
-- **How to set tp:**
+### Set multi-GPU parallelism
 
 ```python
 from lmdeploy import pipeline, TurbomindEngineConfig
@@ -52,7 +52,7 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is'])
 print(response)
 ```
 
-- **How to set sampling parameters:**
+### Set random sampling parameters
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
@@ -69,7 +69,7 @@ response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
 print(response)
 ```
 
-- **How to use OpenAI format input:**
+### Use OpenAI format prompts
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
@@ -93,7 +93,7 @@ response = pipe(prompts,
 print(response)
 ```
 
-- **Streaming the results:**
+### Streaming output
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
@@ -116,31 +116,64 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config):
 print(item)
 ```
 
-- **Calculate logits & ppl:**
+### Get logits for generated tokens
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('internlm/internlm2_5-7b-chat')
+
+gen_config = GenerationConfig(output_logits='generation',
+                              max_new_tokens=10)
+response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
+                gen_config=gen_config)
+logits = [x.logits for x in response]
+```
+
+### Get the last layer's hidden_states for generated tokens
+
+```python
+from lmdeploy import pipeline, GenerationConfig
+
+pipe = pipeline('internlm/internlm2_5-7b-chat')
+
+gen_config = GenerationConfig(output_last_hidden_state='generation',
+                              max_new_tokens=10)
+response = pipe(['Hi, pls intro yourself', 'Shanghai is'],
+                gen_config=gen_config)
+hidden_states = [x.last_hidden_state for x in response]
+```
+
+### Calculate ppl
 
 ```python
 from transformers import AutoTokenizer
 from lmdeploy import pipeline
-model_repoid_or_path='internlm/internlm2_5-7b-chat'
+
+
+model_repoid_or_path = 'internlm/internlm2_5-7b-chat'
 pipe = pipeline(model_repoid_or_path)
 tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)
-
-# logits
 messages = [
    {"role": "user", "content": "Hello, how are you?"},
 ]
 input_ids = tokenizer.apply_chat_template(messages)
+
+# logits is a list of tensor
 logits = pipe.get_logits(input_ids)
+print(logits)
 
-# ppl
+# ppl is a list of float numbers
 ppl = pipe.get_ppl(input_ids)
+print(ppl)
 ```
 
 ```{note}
+When input_ids is too long, an OOM error may occur; please use it with caution
 get_ppl returns the cross entropy loss without the exp operation applied afterwards
 ```
 
-- **Use the pytorch backend**
+### Use PyTorchEngine
 
 Triton needs to be installed first
 
@@ -169,7 +202,7 @@ response = pipe(prompts, gen_config=gen_config)
 print(response)
 ```
 
-- **A lora example**
+### Inference with a LoRA model
 
 ```python
 from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig
@@ -190,7 +223,7 @@ response = pipe(prompts, gen_config=gen_config, adapter_name='lora_name_1')
 print(response)
 ```
 
-## FAQs
+## 常见问题
 
 - **RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase**.
 
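
Both language versions of the note warn that a very long `input_ids` can trigger OOM in `get_ppl`. A hedged workaround sketch: split the token list into windows and evaluate them as a batch. It assumes `get_ppl` accepts a list of token-id lists (as the returned list of floats suggests); the window size and the token-weighted averaging are our choices, not an API guarantee:

```python
from transformers import AutoTokenizer
from lmdeploy import pipeline

model_repoid_or_path = 'internlm/internlm2_5-7b-chat'
pipe = pipeline(model_repoid_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

long_text = 'Shanghai is a city in China. ' * 1000   # stand-in for a long document
input_ids = tokenizer.encode(long_text)

window = 2048                                         # assumed safe chunk length
chunks = [input_ids[i:i + window] for i in range(0, len(input_ids), window)]

# One cross-entropy loss per chunk
losses = pipe.get_ppl(chunks)

# Token-weighted average loss over the whole document (still a cross entropy, not exponentiated)
total_tokens = sum(len(c) for c in chunks)
avg_loss = sum(l * len(c) for l, c in zip(losses, chunks)) / total_tokens
print(avg_loss)
```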
