Updated readme with mlc commands for model, dataset, accuracy and submission generation #2143

Open · wants to merge 4 commits into `master`

4 changes: 4 additions & 0 deletions automotive/3d-object-detection/README.md
@@ -101,3 +101,7 @@ Please click [here](https://github.com/mlcommons/inference/blob/master/automotiv
```
python accuracy_waymo.py --mlperf-accuracy-file <path to accuracy file>/mlperf_log_accuracy.json --waymo-dir /waymo/kitti_format/
```

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.
4 changes: 4 additions & 0 deletions graph/R-GAT/README.md
@@ -181,6 +181,10 @@ mlcr process,mlperf,accuracy,_igbh --result_dir=<Path to directory where files a

Please click [here](https://github.com/mlcommons/inference/blob/dev/graph/R-GAT/tools/accuracy_igbh.py) to view the Python script for evaluating accuracy for the IGBH dataset.

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.

#### Run using docker

Not implemented yet
49 changes: 49 additions & 0 deletions language/bert/README.md
@@ -24,6 +24,44 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/l
## Disclaimer
This benchmark app is a reference implementation that is not meant to be the fastest implementation possible.

## Automated command to run the benchmark via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/bert/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.

You can also run `pip install mlc-scripts` and then use the `mlcr` commands given in the later sections to download the model and datasets.

### Download model through MLCFlow Automation

**Pytorch Framework**

```
mlcr get,ml-model,bert-large,_pytorch --outdirname=<path_to_download> -j
```

**Onnx Framework**

```
mlcr get,ml-model,bert-large,_onnx --outdirname=<path_to_download> -j
```

**TensorFlow Framework**

```
mlcr get,ml-model,bert-large,_tensorflow --outdirname=<path_to_download> -j
```

### Download dataset through MLCFlow Automation

**Validation**
```
mlcr get,dataset,squad,validation --outdirname=<path_to_download> -j
```

**Calibration**
```
mlcr get,dataset,squad,_calib1 --outdirname=<path_to_download> -j
```
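
For reference, a minimal end-to-end download sketch that combines the `pip install` step above with the commands in this section is shown below; the workspace layout and the choice of the ONNX model variant are illustrative only.

```bash
# Illustrative sketch only: install the MLC scripts, then download one model
# variant and both SQuAD splits into a single workspace (paths are examples).
pip install mlc-scripts

WORKSPACE=./bert_workspace
mlcr get,ml-model,bert-large,_onnx --outdirname=${WORKSPACE}/model -j
mlcr get,dataset,squad,validation --outdirname=${WORKSPACE}/data/validation -j
mlcr get,dataset,squad,_calib1 --outdirname=${WORKSPACE}/data/calibration -j
```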

## Commands

Please run the following commands:
@@ -45,6 +83,17 @@ Please run the following commands:
- The script [tf_freeze_bert.py] freezes the TensorFlow model into a pb file.
- The script [bert_tf_to_pytorch.py] converts the TensorFlow model into the PyTorch `BertForQuestionAnswering` module in [HuggingFace Transformers](https://github.com/huggingface/transformers) and also exports the model to [ONNX](https://github.com/onnx/onnx) format.

### Evaluate the accuracy through MLCFlow Automation
```bash
mlcr process,mlperf,accuracy,_squad --result_dir=<Path to directory where files are generated after the benchmark run>
```

Please click [here](https://github.com/mlcommons/inference/blob/master/language/bert/accuracy-squad.py) to view the Python script for evaluating accuracy for the SQuAD dataset.
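
As a usage sketch, `--result_dir` should point at the directory produced by the accuracy run (the path below is illustrative), which typically contains `mlperf_log_accuracy.json` among the generated logs:

```bash
# Illustrative path; substitute the directory generated by your accuracy run.
mlcr process,mlperf,accuracy,_squad --result_dir=./test_results/bert-99/offline/accuracy
```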

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.

## Loadgen over the Network

```
32 changes: 31 additions & 1 deletion language/gpt-j/README.md
@@ -1,9 +1,28 @@
# GPT-J Reference Implementation

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/gpt-j) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
## Automated command to run the benchmark via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/gpt-j/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.

You can also run `pip install mlc-scripts` and then use the `mlcr` commands given in the later sections to download the model and datasets.

### Download model through MLCFlow Automation

```
mlcr get,ml-model,gptj,_pytorch --outdirname=<path_to_download> -j
```

### Download dataset through MLCFlow Automation

**Validation Dataset**
```
mlcr get,dataset,cnndm,_validation --outdirname=<path_to_download> -j
```

**Calibration Dataset**
```
mlcr get,dataset,cnndm,_calibration --outdirname=<path_to_download> -j
```
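
As with the other benchmarks, the model and dataset commands can be combined into a single download step; the sketch below is illustrative and the workspace layout is not required.

```bash
# Illustrative sketch: fetch the GPT-J checkpoint and both CNN/DailyMail splits
# into one workspace (paths are examples only).
WORKSPACE=./gptj_workspace
mlcr get,ml-model,gptj,_pytorch --outdirname=${WORKSPACE}/model -j
mlcr get,dataset,cnndm,_validation --outdirname=${WORKSPACE}/data/validation -j
mlcr get,dataset,cnndm,_calibration --outdirname=${WORKSPACE}/data/calibration -j
```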


### Setup Instructions
@@ -113,6 +132,13 @@ Evaluates the ROUGE scores from the accuracy logs. Only applicable when specifyi
python evaluation.py --mlperf-accuracy-file ./build/logs/mlperf_log_accuracy.json --dataset-file ./data/cnn_eval.json
```

### Evaluate the accuracy through MLCFlow Automation
```bash
mlcr process,mlperf,accuracy,_cnndm --result_dir=<Path to directory where files are generated after the benchmark run>
```

Please click [here](https://github.com/mlcommons/inference/blob/master/language/gpt-j/evaluation.py) to view the Python script for evaluating accuracy for the CNN/DailyMail (cnndm) dataset.

### Reference Model - ROUGE scores
The following are the ROUGE scores obtained when evaluating the GPT-J fp32 model on the entire validation set (13368 samples) using beam search with beam_size=4:

@@ -122,6 +148,10 @@ ROUGE 2 - 20.1235

ROUGE L - 29.9881

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.

### License:
Apache License Version 2.0.

49 changes: 47 additions & 2 deletions language/llama2-70b/README.md
@@ -7,6 +7,9 @@
- For the server scenario, it is necessary to call `lg.FirstTokenComplete(response)` for each query. This way the first token will be reported and its latency will be measured.
- For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each of the elements in `response` is a `lg.QuerySampleResponse` that contains the number of tokens (it can be created this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The number of tokens reported should match the number of tokens in your answer, and this will be checked in [TEST06](../../compliance/nvidia/TEST06/).


## Automated command to run the benchmark via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama2-70b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.

You can also run `pip install mlc-scripts` and then use the `mlcr` commands given in the later sections to download the model and datasets.
@@ -65,9 +68,11 @@ CPU-only setup, as well as any GPU versions for applicable libraries like PyTorc
### MLCommons Members Download
MLCommons hosts the model and preprocessed dataset for download **exclusively by MLCommons Members**. You must first agree to the [confidentiality notice](https://llama2.mlcommons.org) using your organizational email address, then you will receive a link to a directory containing Rclone download instructions. _If you cannot access the form but you are part of an MLCommons Member organization, submit the [MLCommons subscription form](https://mlcommons.org/community/subscribe/) with your organizational email address and [associate a Google account](https://accounts.google.com/SignUpWithoutGmail) with your organizational email address._

Once you have access, you can download the model automatically via the command below.

### Download model through MLCFlow Automation

```
mlcr get,ml-model,llama2 --outdirname=${CHECKPOINT_PATH} -j
mlcr get,ml-model,llama2-70b,_pytorch --outdirname=<Download path> -j
```
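
For example, the download can reuse the `${CHECKPOINT_PATH}` variable referenced elsewhere in this README; the path value below is illustrative.

```bash
# Illustrative: download the checkpoint into the same CHECKPOINT_PATH used in the other commands.
export CHECKPOINT_PATH=./models/Llama-2-70b-chat-hf
mlcr get,ml-model,llama2-70b,_pytorch --outdirname=${CHECKPOINT_PATH} -j
```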

### External Download (Not recommended for official submission)
@@ -82,6 +87,34 @@ git clone https://huggingface.co/meta-llama/Llama-2-70b-chat-hf ${CHECKPOINT_PAT

## Get Dataset

### Download Preprocessed dataset through MLCFlow Automation

**Validation**

```
mlcr get,dataset,preprocessed,openorca,_validation --outdirname=<path_to_download> -j
```

**Calibration**

```
mlcr get,dataset,preprocessed,openorca,_calibration --outdirname=<path_to_download> -j
```

### Download Unprocessed dataset through MLCFlow Automation

**Validation**

```
mlcr get,dataset,openorca,_validation --outdirname=<path_to_download> -j
```

**Calibration**

```
mlcr get,dataset,openorca,_calibration --outdirname=<path_to_download> -j
```
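
A small usage sketch (directory names are illustrative) that keeps the preprocessed and unprocessed copies separate:

```bash
# Illustrative layout: preprocessed data for the benchmark run, raw data kept alongside it.
mlcr get,dataset,preprocessed,openorca,_validation --outdirname=./data/openorca/preprocessed -j
mlcr get,dataset,openorca,_validation --outdirname=./data/openorca/unprocessed -j
```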

### Preprocessed

You can use Rclone to download the preprocessed dataset from a Cloudflare R2 bucket.
@@ -244,6 +277,18 @@ scale from a 0.0-1.0 scale):

This was run on a DGX-H100 node. Total runtime was ~4.5 days.

### Evaluate the accuracy through MLCFlow Automation
```bash
mlcr process,mlperf,accuracy,_openorca --result_dir=<Path to directory where files are generated after the benchmark run>
```

Please click [here](https://github.com/mlcommons/inference/blob/master/language/llama2-70b/evaluate-accuracy.py) to view the Python script for evaluating accuracy for the OpenOrca dataset.

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.


# Run llama2-70b-interactive benchmark

For official Llama2-70b submissions, it is also possible to submit in the interactive category. This sets stricter latency requirements for Time to First Token (ttft) and Time per Output Token (tpot). Specifically, the interactive category requires loadgen to enforce `ttft <= 450ms` and `tpot <= 40ms`.
50 changes: 32 additions & 18 deletions language/llama3.1-405b/README.md
@@ -7,11 +7,12 @@
- For the server scenario, it is necessary to call `lg.FirstTokenComplete(response)` for each query. This way the first token will be reported and its latency will be measured.
- For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each of the elements in `response` is a `lg.QuerySampleResponse` that contains the number of tokens (it can be created this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The number of tokens reported should match the number of tokens in your answer, and this will be checked in [TEST06](../../compliance/nvidia/TEST06/).

## Automated command to run the benchmark via MLFlow
## Automated command to run the benchmark via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-405b/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.

You can also do pip install mlc-scripts and then use `mlcr` commands for downloading the model and datasets using the commands given in the later sections.
You can also run `pip install mlc-scripts` and then use the `mlcr` commands given in the later sections to download the model and datasets.


## Prepare environment

@@ -99,11 +100,24 @@ pip install -e ../../loadgen
## Get Model
### MLCommons Members Download (Recommended for official submission)

You need to request for access to [MLcommons](http://llama3-1.mlcommons.org/) and you'll receive an email with the download instructions. You can download the model automatically via the below command
You need to request access to [MLCommons](http://llama3-1.mlcommons.org/), and you'll receive an email with the download instructions.

### Download model through MLCFlow Automation

**From MLCommons Google Drive**

```
mlcr get,ml-model,llama3 --outdirname=${CHECKPOINT_PATH} -j
```

**From HuggingFace**

```
mlcr get,ml-model,llama3,_hf --outdirname=${CHECKPOINT_PATH} --hf_token=<huggingface access token> -j
```

**Note:**
Downloading the llama3.1-405B model from Hugging Face requires an [**access token**](https://huggingface.co/settings/tokens), which can be generated for your account. Additionally, ensure that your account has access to the [llama3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) model.
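
One way to avoid pasting the token inline is to export it first; `HF_TOKEN` below is just an illustrative shell variable, not a name or flag required by `mlcr`.

```bash
# Sketch: keep the Hugging Face access token in a shell variable (variable name is illustrative).
export HF_TOKEN=<huggingface access token>
mlcr get,ml-model,llama3,_hf --outdirname=${CHECKPOINT_PATH} --hf_token=${HF_TOKEN} -j
```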

### External Download (Not recommended for official submission)
+ First go to [llama3.1-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, then sign in to HuggingFace (if you don't have an account, you'll need to create one). **Please note your authentication credentials**, as you may be required to provide them when cloning below.
@@ -115,16 +129,22 @@ git clone https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct ${CHECKPOINT
cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1
```

### Download huggingface model through MLC

## Get Dataset

### Download dataset through MLCFlow Automation

**Validation**

```
mlcr get,ml-model,llama3,_hf --outdirname=${CHECKPOINT_PATH} --hf_token=<huggingface access token> -j
mlcr get,dataset,mlperf,inference,llama3,_validation --outdirname=<path to download> -j
```

**Note:**
Downloading llama3.1-405B model from Hugging Face will require an [**access token**](https://huggingface.co/settings/tokens) which could be generated for your account. Additionally, ensure that your account has access to the [llama3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) model.
**Calibration**

## Get Dataset
```
mlcr get,dataset,mlperf,inference,llama3,_calibration --outdirname=<path to download> -j
```

### Preprocessed

@@ -144,23 +164,13 @@ You can then navigate in the terminal to your desired download directory and run
```
rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl ./ -P
```
**MLC Command**

```
mlcr get,dataset,mlperf,inference,llama3,_validation --outdirname=<path to download> -j
```

You can also download the calibration dataset from the Cloudflare R2 bucket by running the following command:

```
rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_calibration_dataset_512_processed_fp16_eval.pkl ./ -P
```

**MLC Command**
```
mlcr get,dataset,mlperf,inference,llama3,_calibration --outdirname=<path to download> -j
```


## Run Performance Benchmarks

@@ -267,3 +277,7 @@ Running the GPU implementation in FP16 precision resulted in the following FP16
}
```
The accuracy target is 99% for rougeL and exact_match, and 90% for tokens_per_sample.

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.
41 changes: 40 additions & 1 deletion language/mixtral-8x7b/README.md
@@ -9,7 +9,11 @@
- For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each of the elements in `response` is a `lg.QuerySampleResponse` that contains the number of tokens (it can be created this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The number of tokens reported should match the number of tokens in your answer, and this will be checked in [TEST06](../../compliance/nvidia/TEST06/).


Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/mixtral-8x7b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.
## Automated command to run the benchmark via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/mixtral-8x7b/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker.

You can also run `pip install mlc-scripts` and then use the `mlcr` commands given in the later sections to download the model and datasets.

## Prepare environment

@@ -66,6 +70,12 @@ CPU-only setup, as well as any GPU versions for applicable libraries like PyTorc

**Important Note:** Files and configurations of the model have changed, and might change in the future. If you are going to get the model from Hugging Face or any external source, use a version of the model that exactly matches the one in this [commit](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/commit/a60832cb6c88d5cb6e507680d0e9996fbad77050). We strongly recommend getting the model by following the steps in the next section:

### Download model through MLCFlow Automation

```
mlcr get,ml-model,mixtral --outdirname=<path_to_download> -j
```

### Get Checkpoint

#### Using Rclone
@@ -87,6 +97,22 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/mixtral_8x7b/mixtral-8x7

## Get Dataset

### Download Preprocessed dataset through MLCFlow Automation

**Validation**

```
mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined,_validation --outdirname=<path to download> -j
```

**Calibration**

```
mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined,_calibration --outdirname=<path to download> -j
```

- Adding the `_wget` tag to the run command changes the download tool from `rclone` to `wget` (see the example below).
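
For example, assuming tags are comma-separated as in the other commands in this README, the validation download via `wget` would look like:

```
mlcr get,dataset-mixtral,openorca-mbxp-gsm8k-combined,_validation,_wget --outdirname=<path to download> -j
```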

### Preprocessed

#### Using Rclone
@@ -228,6 +254,15 @@ fi

The ServerSUT was not tested for GPU runs.

## Accuracy Evaluation

### Evaluate the accuracy through MLCFlow Automation
```bash
mlcr process,mlperf,accuracy,_openorca-gsm8k-mbxp-combined --result_dir=<Path to directory where files are generated after the benchmark run>
```

Please click [here](https://github.com/mlcommons/inference/blob/master/language/mixtral-8x7b/evaluate-accuracy.py) to view the Python script for evaluating accuracy for the combined OpenOrca, MBXP, and GSM8K dataset.
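
Usage sketch (the results path is illustrative); `--result_dir` should point at the directory containing the logs generated by the accuracy run:

```bash
# Illustrative path; substitute the directory produced by your accuracy run.
mlcr process,mlperf,accuracy,_openorca-gsm8k-mbxp-combined --result_dir=./test_results/mixtral-8x7b/offline/accuracy
```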

### Evaluation
Recreating the environment for evaluating the quality metrics can be quite tedious. Therefore, we provide a dockerfile and recommend using docker for this task.
1. Build the evaluation container
@@ -269,3 +304,7 @@ For official submissions, 99% of each reference score is enforced. Additionally,
```json
{'tokens_per_sample': 144.84}
```

## Automated command for submission generation via MLCFlow

Please see the [new docs site](https://docs.mlcommons.org/inference/submission/) for an automated way to generate a submission through MLCFlow.