Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

script to export benchmark information as Line Protocol format #14662

Merged
merged 4 commits into from
Feb 16, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 188 additions & 0 deletions benchmarks/lineprotocol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


"""
Converts a given json to LineProtocol format that can be
visualised by grafana/other systems that support LineProtocol.

Usage example:
$ python3 lineprotocol.py sort.json
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=0,row_count=10838832,elapsed_ms=85626006 1691105678000000000
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000
benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000
"""

# sort.json
"""
{
"queries": [
{
"iterations": [
{
"elapsed": 85626.006132,
"row_count": 10838832
},
{
"elapsed": 68694.467851,
"row_count": 10838832
},
{
"elapsed": 63392.883406,
"row_count": 10838832
},
{
"elapsed": 66388.367387,
"row_count": 10838832
},
],
"query": "sort utf8",
"start_time": 1691105678
},
],
"context": {
"arguments": [
"sort",
"--path",
"benchmarks/data",
"--scale-factor",
"1.0",
"--iterations",
"4",
"-o",
"sort.json"
],
"benchmark_version": "28.0.0",
"datafusion_version": "28.0.0",
"num_cpus": 8,
"start_time": 1691105678
}
}
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from typing import Dict, List, Any
from pathlib import Path
from argparse import ArgumentParser
import sys
print = sys.stdout.write


@dataclass
class QueryResult:
elapsed: float
row_count: int

@classmethod
def load_from(cls, data: Dict[str, Any]) -> QueryResult:
return cls(elapsed=data["elapsed"], row_count=data["row_count"])


@dataclass
class QueryRun:
query: int
iterations: List[QueryResult]
start_time: int

@classmethod
def load_from(cls, data: Dict[str, Any]) -> QueryRun:
return cls(
query=data["query"],
iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
start_time=data["start_time"],
)

@property
def execution_time(self) -> float:
assert len(self.iterations) >= 1

# Use minimum execution time to account for variations / other
# things the system was doing
return min(iteration.elapsed for iteration in self.iterations)


@dataclass
class Context:
benchmark_version: str
datafusion_version: str
num_cpus: int
start_time: int
arguments: List[str]
name: str

@classmethod
def load_from(cls, data: Dict[str, Any]) -> Context:
return cls(
benchmark_version=data["benchmark_version"],
datafusion_version=data["datafusion_version"],
num_cpus=data["num_cpus"],
start_time=data["start_time"],
arguments=data["arguments"],
name=data["arguments"][0]
)


@dataclass
class BenchmarkRun:
context: Context
queries: List[QueryRun]

@classmethod
def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
return cls(
context=Context.load_from(data["context"]),
queries=[QueryRun.load_from(result) for result in data["queries"]],
)

@classmethod
def load_from_file(cls, path: Path) -> BenchmarkRun:
with open(path, "r") as f:
return cls.load_from(json.load(f))


def lineformat(
baseline: Path,
) -> None:
baseline = BenchmarkRun.load_from_file(baseline)
context = baseline.context
benchamrk_str = f"benchmark,name={context.name},version={context.benchmark_version},datafusion_version={context.datafusion_version},num_cpus={context.num_cpus}"
for query in baseline.queries:
query_str = f"query=\"{query.query}\""
timestamp = f"{query.start_time*10**9}"
for iter_num, result in enumerate(query.iterations):
print(f"{benchamrk_str} {query_str},iteration={iter_num},row_count={result.row_count},elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n")

def main() -> None:
parser = ArgumentParser()
parser.add_argument(
"path",
type=Path,
help="Path to the benchmark file.",
)
options = parser.parse_args()

lineformat(options.baseline_path)



if __name__ == "__main__":
main()
Loading