| 1 | +# Copyright (c) 2022 VisualDL Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +# ======================================================================= |
| 15 | +import json |
| 16 | +import re |
| 17 | + |
| 18 | +import numpy as np |
| 19 | +import requests |
| 20 | +import tritonclient.http as httpclient |
| 21 | +from attrdict import AttrDict |
| 22 | +from tritonclient.utils import InferenceServerException |
| 23 | + |
| 24 | + |
| 25 | +def convert_http_metadata_config(metadata): |
| 26 | + metadata = AttrDict(metadata) |
| 27 | + |
| 28 | + return metadata |
| 29 | + |
| 30 | + |
| 31 | +def prepare_request(inputs_meta, inputs_data, outputs_meta): |
| 32 | + ''' |
| 33 | +    inputs_meta: input metadata reported by the model, each entry gives name/shape/datatype |
| 34 | +    inputs_data: user input data, a dict mapping input name to a numpy array |
| 35 | + ''' |
| 36 | + # Set the input data |
| 37 | + inputs = [] |
| 38 | + for input_dict in inputs_meta: |
| 39 | + input_name = input_dict['name'] |
| 40 | + if input_name not in inputs_data: |
| 41 | + raise RuntimeError( |
| 42 | +                'Input {} required by the model is missing from the ' |
| 43 | +                'input data.'.format(input_name)) |
| 44 | +        if input_dict['datatype'] == 'FP32': |
| 45 | +            # image data returned by gradio is uint8, scale to fp32 in [0, 1] |
| 46 | +            inputs_data[input_name] = inputs_data[input_name].astype( |
| 47 | +                np.float32) / 255 |
| 48 | +            shape = input_dict['shape'] |
| 49 | +            if len(shape) == 3 and shape[0] == 3:  # model expects CHW |
| 50 | +                # take the first image of the gradio batch, HWC -> CHW |
| 51 | +                inputs_data[input_name] = inputs_data[input_name][0].transpose( |
| 52 | +                    2, 0, 1) |
| 53 | +            elif len(shape) == 4 and shape[1] == 3:  # model expects NCHW |
| 54 | +                inputs_data[input_name] = inputs_data[input_name].transpose( |
| 55 | +                    0, 3, 1, 2)  # NHWC -> NCHW |
| 56 | + infer_input = httpclient.InferInput( |
| 57 | + input_name, inputs_data[input_name].shape, input_dict['datatype']) |
| 58 | + infer_input.set_data_from_numpy(inputs_data[input_name]) |
| 59 | + inputs.append(infer_input) |
| 60 | + outputs = [] |
| 61 | + for output_dict in outputs_meta: |
| 62 | + infer_output = httpclient.InferRequestedOutput(output_dict.name) |
| 63 | + outputs.append(infer_output) |
| 64 | + return inputs, outputs |
| 65 | + |
| 66 | + |
| 67 | +metrics_table_head = """ |
| 68 | +<style> |
| 69 | +table, th {{ |
| 70 | + border:0.1px solid black; |
| 71 | +}} |
| 72 | +</style> |
| 73 | + |
| 74 | +<div> |
| 75 | +<table style="width:100%"> |
| 76 | + <tr> |
| 77 | + <th rowspan="2">模型名称</th> |
| 78 | + <th colspan="4">执行统计</th> |
| 79 | + <th colspan="5">延迟统计</th> |
| 80 | + |
| 81 | + </tr> |
| 82 | + <tr> |
| 83 | + <th>请求处理成功数</th> |
| 84 | + <th>请求处理失败数</th> |
| 85 | + <th>推理batch数</th> |
| 86 | + <th>推理样本数</th> |
| 87 | + <th>请求处理时间(ms)</th> |
| 88 | + <th>任务队列等待时间(ms)</th> |
| 89 | + <th>输入处理时间(ms)</th> |
| 90 | + <th>模型推理时间(ms)</th> |
| 91 | + <th>输出处理时间(ms)</th> |
| 92 | + </tr> |
| 93 | + {} |
| 94 | +</table> |
| 95 | +</div> |
| 96 | +<br> |
| 97 | +<br> |
| 98 | +<br> |
| 99 | +<br> |
| 100 | +<br> |
| 101 | +<div> |
| 102 | +<table style="width:100%"> |
| 103 | + <tr> |
| 104 | + <th rowspan="2">GPU</th> |
| 105 | + <th colspan="4">性能指标</th> |
| 106 | + <th colspan="2">显存</th> |
| 107 | + </tr> |
| 108 | + <tr> |
| 109 | + <th>利用率(%)</th> |
| 110 | + <th>功率(W)</th> |
| 111 | + <th>功率限制(W)</th> |
| 112 | +    <th>耗电量(J)</th> |
| 113 | + <th>总量(GB)</th> |
| 114 | + <th>已使用(GB)</th> |
| 115 | + </tr> |
| 116 | + {} |
| 117 | +</table> |
| 118 | +</div> |
| 119 | +""" |
| 120 | + |
| 121 | + |
| 122 | +def get_metric_data(server_addr, metric_port): # noqa:C901 |
| 123 | + ''' |
| 124 | +    Get metrics data from the fastdeployserver and transform it into an html table. |
| 125 | + Args: |
| 126 | + server_addr(str): fastdeployserver ip address |
| 127 | + metric_port(int): fastdeployserver metrics port |
| 128 | + Returns: |
| 129 | + htmltable(str): html table to show metrics data |
| 130 | + ''' |
| 131 | + model_table = {} |
| 132 | + gpu_table = {} |
| 133 | + metric_column_name = { |
| 134 | + "Model": { |
| 135 | + "nv_inference_request_success", "nv_inference_request_failure", |
| 136 | + "nv_inference_count", "nv_inference_exec_count", |
| 137 | + "nv_inference_request_duration_us", |
| 138 | + "nv_inference_queue_duration_us", |
| 139 | + "nv_inference_compute_input_duration_us", |
| 140 | + "nv_inference_compute_infer_duration_us", |
| 141 | + "nv_inference_compute_output_duration_us" |
| 142 | + }, |
| 143 | + "GPU": { |
| 144 | + "nv_gpu_power_usage", "nv_gpu_power_limit", |
| 145 | + "nv_energy_consumption", "nv_gpu_utilization", |
| 146 | + "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" |
| 147 | + }, |
| 148 | + "CPU": { |
| 149 | + "nv_cpu_utilization", "nv_cpu_memory_total_bytes", |
| 150 | + "nv_cpu_memory_used_bytes" |
| 151 | + } |
| 152 | + } |
| 153 | + try: |
| 154 | + res = requests.get("http://{}:{}/metrics".format( |
| 155 | + server_addr, metric_port)) |
| 156 | + except Exception: |
| 157 | + return metrics_table_head.format('', '') |
| 158 | + metric_content = res.text |
| 159 | + for content in metric_content.split('\n'): |
| 160 | + if content.startswith('#'): |
| 161 | + continue |
| 162 | + else: |
| 163 | +            # parse a prometheus exposition line: metric_name{labels} value |
| 164 | +            res = re.match(r'(\w+){(.*)} (\S+)', content) |
| 165 | + if not res: |
| 166 | + continue |
| 167 | + metric_name = res.group(1) |
| 168 | + model = res.group(2) |
| 169 | + value = res.group(3) |
| 170 | + infos = {} |
| 171 | + for info in model.split(','): |
| 172 | + k, v = info.split('=') |
| 173 | + v = v.strip('"') |
| 174 | + infos[k] = v |
| 175 | + if metric_name in [ |
| 176 | + "nv_inference_request_duration_us", |
| 177 | + "nv_inference_queue_duration_us", |
| 178 | + "nv_inference_compute_input_duration_us", |
| 179 | + "nv_inference_compute_infer_duration_us", |
| 180 | + "nv_inference_compute_output_duration_us" |
| 181 | + ]: |
| 182 | + value = str(float(value) / 1000) |
| 183 | + elif metric_name in [ |
| 184 | + "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" |
| 185 | + ]: |
| 186 | + value = str(float(value) / 1024 / 1024 / 1024) |
| 187 | + for key, metric_names in metric_column_name.items(): |
| 188 | + if metric_name in metric_names: |
| 189 | + if key == 'Model': |
| 190 | + model_name = infos['model'] |
| 191 | + if model_name not in model_table: |
| 192 | + model_table[model_name] = {} |
| 193 | + model_table[model_name][metric_name] = value |
| 194 | + elif key == 'GPU': |
| 195 | + gpu_name = infos['gpu_uuid'] |
| 196 | + if gpu_name not in gpu_table: |
| 197 | + gpu_table[gpu_name] = {} |
| 198 | + gpu_table[gpu_name][metric_name] = value |
| 199 | + elif key == 'CPU': |
| 200 | + pass |
| 201 | + model_data_list = [] |
| 202 | + gpu_data_list = [] |
| 203 | + model_data_metric_names = [ |
| 204 | + "nv_inference_request_success", "nv_inference_request_failure", |
| 205 | + "nv_inference_exec_count", "nv_inference_count", |
| 206 | + "nv_inference_request_duration_us", "nv_inference_queue_duration_us", |
| 207 | + "nv_inference_compute_input_duration_us", |
| 208 | + "nv_inference_compute_infer_duration_us", |
| 209 | + "nv_inference_compute_output_duration_us" |
| 210 | + ] |
| 211 | + gpu_data_metric_names = [ |
| 212 | + "nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit", |
| 213 | + "nv_energy_consumption", "nv_gpu_memory_total_bytes", |
| 214 | + "nv_gpu_memory_used_bytes" |
| 215 | + ] |
| 216 | + for k, v in model_table.items(): |
| 217 | + data = [] |
| 218 | + data.append(k) |
| 219 | + for data_metric in model_data_metric_names: |
| 220 | + data.append(v[data_metric]) |
| 221 | + model_data_list.append(data) |
| 222 | + for k, v in gpu_table.items(): |
| 223 | + data = [] |
| 224 | + data.append(k) |
| 225 | + for data_metric in gpu_data_metric_names: |
| 226 | + data.append(v[data_metric]) |
| 227 | + gpu_data_list.append(data) |
| 228 | + model_data = '\n'.join([ |
| 229 | + "<tr>" + '\n'.join(["<td>" + item + "</td>" |
| 230 | + for item in data]) + "</tr>" |
| 231 | + for data in model_data_list |
| 232 | + ]) |
| 233 | + gpu_data = '\n'.join([ |
| 234 | + "<tr>" + '\n'.join(["<td>" + item + "</td>" |
| 235 | + for item in data]) + "</tr>" |
| 236 | + for data in gpu_data_list |
| 237 | + ]) |
| 238 | + return metrics_table_head.format(model_data, gpu_data) |
| 239 | + |
| 240 | + |
| 241 | +class HttpClientManager: |
| 242 | + def __init__(self): |
| 243 | + self.clients = {} # server url: httpclient |
| 244 | + |
| 245 | + def _create_client(self, server_url): |
| 246 | + if server_url in self.clients: |
| 247 | + return self.clients[server_url] |
| 248 | + try: |
| 249 | + fastdeploy_client = httpclient.InferenceServerClient(server_url) |
| 250 | + self.clients[server_url] = fastdeploy_client |
| 251 | + return fastdeploy_client |
| 252 | + except Exception: |
| 253 | + raise RuntimeError( |
| 254 | +                'Cannot connect to server {}, please check the ' |
| 255 | +                'server address.'.format(server_url)) |
| 256 | + |
| 257 | + def infer(self, server_url, model_name, model_version, inputs): |
| 258 | + fastdeploy_client = self._create_client(server_url) |
| 259 | + input_metadata, output_metadata = self.get_model_meta( |
| 260 | + server_url, model_name, model_version) |
| 261 | + inputs, outputs = prepare_request(input_metadata, inputs, |
| 262 | + output_metadata) |
| 263 | + response = fastdeploy_client.infer( |
| 264 | + model_name, inputs, model_version=model_version, outputs=outputs) |
| 265 | + |
| 266 | + results = {} |
| 267 | + for output in output_metadata: |
| 268 | + result = response.as_numpy(output.name) # datatype: numpy |
| 269 | + if output.datatype == 'BYTES': # datatype: bytes |
| 270 | + try: |
| 271 | + value = result |
| 272 | + if len(result.shape) == 1: |
| 273 | + value = result[0] |
| 274 | + elif len(result.shape) == 2: |
| 275 | + value = result[0][0] |
| 276 | + elif len(result.shape) == 3: |
| 277 | + value = result[0][0][0] |
| 278 | + result = json.loads(value) # datatype: json |
| 279 | + except Exception: |
| 280 | + pass |
| 281 | + else: |
| 282 | + result = result[0] |
| 283 | + results[output.name] = result |
| 284 | + return results |
| 285 | + |
| 286 | + def raw_infer(self, server_url, model_name, model_version, raw_input): |
| 287 | + url = 'http://{}/v2/models/{}/versions/{}/infer'.format( |
| 288 | + server_url, model_name, model_version) |
| 289 | + res = requests.post(url, data=json.dumps(json.loads(raw_input))) |
| 290 | + return json.dumps(res.json()) |
| 291 | + |
| 292 | + def get_model_meta(self, server_url, model_name, model_version): |
| 293 | + fastdeploy_client = self._create_client(server_url) |
| 294 | + try: |
| 295 | + model_metadata = fastdeploy_client.get_model_metadata( |
| 296 | + model_name=model_name, model_version=model_version) |
| 297 | + except InferenceServerException as e: |
| 298 | + raise RuntimeError("Failed to retrieve the metadata: " + str(e)) |
| 299 | + |
| 300 | + model_metadata = convert_http_metadata_config(model_metadata) |
| 301 | + |
| 302 | + input_metadata = model_metadata.inputs |
| 303 | + output_metadata = model_metadata.outputs |
| 304 | + return input_metadata, output_metadata |
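
A minimal sketch of driving this client module directly, assuming a fastdeployserver instance reachable at `localhost:8000` (HTTP) with metrics on port `8002`, and a hypothetical model named `yolov5` that takes a single uint8 HWC image input named `images`; the server address, ports, and model details are illustrative, not part of the file above.

```python
import numpy as np

# hypothetical deployment details -- adjust to your own server and model
SERVER = 'localhost:8000'
METRIC_PORT = 8002
MODEL, VERSION = 'yolov5', '1'

manager = HttpClientManager()

# inspect the input/output metadata the server reports for this model
input_meta, output_meta = manager.get_model_meta(SERVER, MODEL, VERSION)

# gradio-style input: a batched uint8 HWC image keyed by the model's input name
image = np.random.randint(0, 255, size=(1, 640, 640, 3), dtype=np.uint8)
results = manager.infer(SERVER, MODEL, VERSION, {'images': image})
print(list(results.keys()))

# render the server's prometheus metrics as the html tables used by the UI
html_table = get_metric_data('localhost', METRIC_PORT)
```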