From cb0baf2b351e0f7e32ce6e507fa8bcae26a5c3f7 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Thu, 21 Aug 2025 17:29:31 -0300 Subject: [PATCH 01/11] add topic property extractor add treesitter working version working version performance --- tools/property-extractor/Makefile | 37 +- .../json-to-asciidoc/generate_docs.py | 27 +- .../topic_property_extractor.py | 567 ++++++++++++++++++ 3 files changed, 617 insertions(+), 14 deletions(-) create mode 100644 tools/property-extractor/topic_property_extractor.py diff --git a/tools/property-extractor/Makefile b/tools/property-extractor/Makefile index 92f70ce..9bd1053 100644 --- a/tools/property-extractor/Makefile +++ b/tools/property-extractor/Makefile @@ -1,4 +1,16 @@ -.PHONY: build venv clean redpanda-git treesitter generate-docs check +.PHONY: build venv clean redpanda-git treesitter topic-properties generate-docs check + +# --- Main build: venv, fetch code, build parser, extract & docgen --- +build: venv redpanda-git treesitter + @echo "🔧 Building with Redpanda tag: $(TAG)" + @mkdir -p $(TOOL_ROOT)/gen + @cd $(TOOL_ROOT) && \ + $(PYTHON) -W ignore::FutureWarning property_extractor.py \ + --recursive \ + --path $(REDPANDA_SRC) \ + --output gen/properties-output.json + @echo "✅ Cluster properties JSON generated at $(TOOL_ROOT)/gen/properties-output.json" + @$(MAKE) generate-docs # Default tag (can be overridden via `make TAG=v25.1.1`) TAG ?= dev @@ -25,18 +37,6 @@ PYTHON := $(VENV)/bin/python OUTPUT_DIR := $(REPO_ROOT)/autogenerated/$(TAG)/properties TREE_SITTER := npx tree-sitter -# --- Main build: venv, fetch code, build parser, extract & docgen --- -build: venv redpanda-git treesitter - @echo "🔧 Building with Redpanda tag: $(TAG)" - @mkdir -p $(TOOL_ROOT)/gen - @cd $(TOOL_ROOT) && \ - $(PYTHON) -W ignore::FutureWarning property_extractor.py \ - --recursive \ - --path $(REDPANDA_SRC) \ - --output gen/properties-output.json - @echo "✅ JSON generated at $(TOOL_ROOT)/gen/properties-output.json" - @$(MAKE) generate-docs - # --- Ensure Python venv & dependencies --- venv: $(TOOL_ROOT)/requirements.txt @if [ ! 
-d "$(VENV)" ]; then \ @@ -103,3 +103,14 @@ check: @echo "VENV: $(VENV)" @echo "PYTHON: $(PYTHON)" @echo "OUTPUT_DIR: $(OUTPUT_DIR)" + +# --- Extract topic properties --- +topic-properties: + @echo "🔧 Extracting topic properties with Redpanda tag: $(TAG)" + @mkdir -p $(TOOL_ROOT)/gen + @cd $(TOOL_ROOT) && \ + $(PYTHON) topic_property_extractor.py \ + --source-path $(REDPANDA_SRC) \ + --output-json "$(OUTPUT_DIR)/topic-properties-output.json" \ + --output-adoc "$(OUTPUT_DIR)/topic-properties.adoc" + @echo "✅ Topic properties extracted" \ No newline at end of file diff --git a/tools/property-extractor/json-to-asciidoc/generate_docs.py b/tools/property-extractor/json-to-asciidoc/generate_docs.py index d5a1d38..9db6d41 100644 --- a/tools/property-extractor/json-to-asciidoc/generate_docs.py +++ b/tools/property-extractor/json-to-asciidoc/generate_docs.py @@ -14,6 +14,7 @@ OUTPUT_FILE_BROKER = "broker-properties.adoc" OUTPUT_FILE_CLUSTER = "cluster-properties.adoc" OUTPUT_FILE_CLOUD = "object-storage-properties.adoc" +OUTPUT_FILE_TOPIC = "topic-properties.adoc" OUTPUT_FILE_DEPRECATED = os.path.join("deprecated", "partials", "deprecated-properties.adoc") ALL_PROPERTIES_FILE = "all_properties.txt" @@ -66,6 +67,20 @@ ) CLUSTER_CONFIG_TITLE = "== Cluster configuration\n\n" +TOPIC_PAGE_TITLE = ( + "= Topic Configuration Properties\n" + ":page-aliases: reference:topic-properties.adoc\n" + ":description: Reference of topic configuration properties.\n\n" +) + +TOPIC_INTRO = ( + "A topic-level property sets a Redpanda or Kafka configuration for a particular topic.\n\n" + "Many topic-level properties have corresponding xref:manage:cluster-maintenance/cluster-property-configuration.adoc[cluster properties] that set a default value for all topics of a cluster. To customize the value for a topic, you can set a topic-level property that overrides the value of the corresponding cluster property.\n\n" + "NOTE: All topic properties take effect immediately after being set.\n\n" +) + +TOPIC_CONFIG_TITLE = "== Topic configuration\n\n" + CLOUD_PAGE_TITLE = ( "= Object Storage Properties\n" ":description: Reference of object storage properties.\n\n" @@ -92,7 +107,8 @@ "src/v/pandaproxy/schema_registry/configuration.cc": "schema reg", "src/v/pandaproxy/rest/configuration.cc": "http proxy", "src/v/kafka/client/configuration.cc": "http client", - "src/v/config/configuration.cc": "cluster" + "src/v/config/configuration.cc": "cluster", + "src/v/kafka/server/handlers/topics/types.cc": "topic" } SUFFIX_TO_UNIT = { @@ -339,6 +355,7 @@ def main(): kafka_client_content = [] cluster_config_content = [] cloud_config_content = [] + topic_config_content = [] deprecated_broker_content = [] deprecated_cluster_content = [] all_properties = [] @@ -388,6 +405,7 @@ def main(): "http client": kafka_client_content, "cluster": cluster_config_content, "cloud": cloud_config_content, + "topic": topic_config_content, } if group in group_mapping: group_mapping[group].append(property_doc) @@ -423,6 +441,12 @@ def main(): + CLOUD_CONFIG_TITLE + "".join(cloud_config_content) ) + topic_page = ( + TOPIC_PAGE_TITLE + + TOPIC_INTRO + + TOPIC_CONFIG_TITLE + + "".join(topic_config_content) + ) deprecated_page = ( DEPRECATED_PROPERTIES_TITLE + DEPRECATED_PROPERTIES_INTRO @@ -436,6 +460,7 @@ def main(): write_data_to_file(page_folder, OUTPUT_FILE_BROKER, broker_page) write_data_to_file(page_folder, OUTPUT_FILE_CLUSTER, cluster_page) write_data_to_file(page_folder, OUTPUT_FILE_CLOUD, cloud_page) + write_data_to_file(page_folder, OUTPUT_FILE_TOPIC, 
topic_page) write_data_to_file(page_folder, OUTPUT_FILE_DEPRECATED, deprecated_page) write_data_to_file(output_dir, ALL_PROPERTIES_FILE, "\n".join(all_properties)) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py new file mode 100644 index 0000000..0480eca --- /dev/null +++ b/tools/property-extractor/topic_property_extractor.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python3 +import os +import re +import json +import argparse +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Set + + +class TopicPropertyExtractor: + def __init__(self, source_path: str): + self.source_path = Path(source_path) + self.topic_properties = {} + self.cluster_mappings = {} + self.enum_values = {} + + def extract_topic_properties(self) -> Dict: + """Extract topic property constants from source files""" + + # Step 1: Discover all topic property constants + self._discover_topic_properties() + + # Step 2: Find enum definitions for acceptable values + self._discover_enum_values() + + # Step 3: Discover cluster property mappings from source code + self._discover_cluster_mappings() + + # Step 4: Match properties with their validators and mappings + self._correlate_properties_with_data() + + return { + "topic_properties": self.topic_properties, + "cluster_mappings": self.cluster_mappings, + "enum_values": self.enum_values + } + + def _discover_topic_properties(self): + """Dynamically discover all topic property constants from source files""" + + # Search for all header files that might contain topic property constants + topic_property_files = [ + "src/v/kafka/server/handlers/topics/types.h", + "src/v/kafka/protocol/topic_properties.h", + "src/v/cluster/topic_properties.h", + ] + + for file_pattern in topic_property_files: + file_path = self.source_path / file_pattern + if file_path.exists(): + self._parse_topic_properties_from_file(file_path) + + # Also search for any other files that might contain topic_property_ constants + for header_file in self.source_path.glob("src/**/*.h"): + if any(pattern in str(header_file) for pattern in ["topic", "kafka"]): + self._scan_file_for_topic_properties(header_file) + + print(f"Discovered {len(self.topic_properties)} topic properties") + + def _parse_topic_properties_from_file(self, file_path: Path): + """Parse topic property constants from a specific file""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Pattern to match: inline constexpr std::string_view topic_property_xxx = "yyy"; + pattern = r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)";' + matches = re.findall(pattern, content) + + for var_name, property_name in matches: + self.topic_properties[property_name] = { + "variable_name": f"topic_property_{var_name}", + "property_name": property_name, + "source_file": str(file_path.relative_to(self.source_path)), + "description": "", + "type": self._determine_property_type(property_name), + "acceptable_values": None, + "corresponding_cluster_property": None + } + + print(f"Found {len(matches)} topic properties in {file_path}") + except Exception as e: + print(f"Error reading {file_path}: {e}") + + def _scan_file_for_topic_properties(self, file_path: Path): + """Scan any file for topic_property_ constants""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Look for any topic_property_ declarations + pattern = r'topic_property_(\w+)\s*=\s*"([^"]+)"' + matches = re.findall(pattern, content) + + 
for var_name, property_name in matches:
+                if property_name not in self.topic_properties:
+                    self.topic_properties[property_name] = {
+                        "variable_name": f"topic_property_{var_name}",
+                        "property_name": property_name,
+                        "source_file": str(file_path.relative_to(self.source_path)),
+                        "description": "",
+                        "type": self._determine_property_type(property_name),
+                        "acceptable_values": None,
+                        "corresponding_cluster_property": None
+                    }
+        except Exception as e:
+            # Skip files that can't be read
+            pass
+
+    def _discover_enum_values(self):
+        """Discover enum definitions that correspond to topic property acceptable values"""
+
+        # Key enum files for topic property validation
+        enum_files = [
+            "src/v/model/compression.h",
+            "src/v/model/fundamental.h",
+            "src/v/model/timestamp.h",
+        ]
+
+        for file_pattern in enum_files:
+            file_path = self.source_path / file_pattern
+            if file_path.exists():
+                self._parse_enums_from_file(file_path)
+
+        # Also search other model files for enums
+        for header_file in self.source_path.glob("src/v/model/**/*.h"):
+            self._scan_file_for_enums(header_file)
+
+        print(f"Discovered {len(self.enum_values)} enum types")
+
+    def _parse_enums_from_file(self, file_path: Path):
+        """Parse enum definitions from a file"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Pattern for enum class definitions
+            enum_pattern = r'enum\s+class\s+(\w+)\s*[^{]*{([^}]+)}'
+            enum_matches = re.findall(enum_pattern, content, re.DOTALL)
+
+            for enum_name, enum_body in enum_matches:
+                values = self._extract_enum_values(enum_body)
+                if values:
+                    self.enum_values[enum_name] = {
+                        "source_file": str(file_path.relative_to(self.source_path)),
+                        "values": values
+                    }
+
+            # Pattern for regular enums too
+            regular_enum_pattern = r'enum\s+(\w+)\s*{([^}]+)}'
+            regular_matches = re.findall(regular_enum_pattern, content, re.DOTALL)
+
+            for enum_name, enum_body in regular_matches:
+                values = self._extract_enum_values(enum_body)
+                if values:
+                    self.enum_values[enum_name] = {
+                        "source_file": str(file_path.relative_to(self.source_path)),
+                        "values": values
+                    }
+
+        except Exception as e:
+            print(f"Error parsing enums from {file_path}: {e}")
+
+    def _scan_file_for_enums(self, file_path: Path):
+        """Scan any file for enum definitions"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Look for enum class definitions
+            enum_pattern = r'enum\s+class\s+(\w+)\s*[^{]*{([^}]+)}'
+            matches = re.findall(enum_pattern, content, re.DOTALL)
+
+            for enum_name, enum_body in matches:
+                if enum_name not in self.enum_values:
+                    values = self._extract_enum_values(enum_body)
+                    if values:
+                            self.enum_values[enum_name] = {
+                                "source_file": str(file_path.relative_to(self.source_path)),
+                                "values": values
+                            }
+        except Exception as e:
+            # Skip files that can't be read
+            pass
+
+    def _determine_property_type(self, property_name: str) -> str:
+        """Determine the type of a property based on its name and usage patterns"""
+
+        # Boolean flags are an explicit allowlist, checked by direct membership so that
+        # names merely containing "read"/"write"/"delete" (e.g. delete.retention.ms) reach the suffix checks.
+        if property_name in ["write.caching", "redpanda.remote.recovery", "redpanda.remote.write",
+                             "redpanda.remote.read", "redpanda.remote.delete", "redpanda.remote.readreplica"]:
+            return "boolean"
+
+        elif any(suffix in property_name for suffix in [".bytes", ".ms", ".factor", ".lag.ms"]):
+            return "integer"
+
+        elif "ratio" in property_name:
+            return "number"
+
+        elif property_name in ["cleanup.policy", 
"compression.type", "message.timestamp.type"]: + return "string" # enum-based strings + + # Default to string for unknown properties + return "string" + + def _extract_enum_values(self, enum_body: str) -> List[str]: + """Extract enum value names from enum body""" + values = [] + + # Pattern to match enum value declarations (handle various formats) + value_patterns = [ + r'(\w+)\s*=\s*[^,}]+', # name = value + r'(\w+)\s*,', # name, + r'(\w+)\s*}' # name} + ] + + for pattern in value_patterns: + matches = re.findall(pattern, enum_body) + for match in matches: + if match and match not in values and not match.isdigit(): + values.append(match) + + return values + + def _discover_cluster_mappings(self): + """Discover topic-to-cluster property mappings from source code""" + + # Search in configuration and handler files for mappings + search_patterns = [ + "src/v/config/**/*.cc", + "src/v/config/**/*.h", + "src/v/kafka/server/handlers/**/*.cc", + "src/v/kafka/server/handlers/**/*.h", + "src/v/cluster/**/*.cc", + "src/v/cluster/**/*.h" + ] + + mapping_candidates = {} + + for pattern in search_patterns: + for file_path in self.source_path.glob(pattern): + if file_path.is_file(): + candidates = self._find_mappings_in_file(file_path) + mapping_candidates.update(candidates) + + # Process mapping candidates to find correlations + self._process_mapping_candidates(mapping_candidates) + + print(f"Discovered {len(self.cluster_mappings)} cluster property mappings") + + def _find_mappings_in_file(self, file_path: Path) -> Dict[str, str]: + """Find potential topic-to-cluster property mappings in a file""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + mappings = {} + + # Pattern 1: Look for configuration property definitions with proper cluster prop names + # Example: config.get("log_cleanup_policy") or similar patterns + config_patterns = [ + r'config\.get\("([^"]+)"\)', # config.get("property_name") + r'\.([a-z_]+(?:_[a-z]+)*)\(', # method calls like .retention_bytes( + r'([a-z_]+(?:_[a-z]+)*)\s*=', # assignments like retention_bytes = + ] + + for pattern in config_patterns: + matches = re.findall(pattern, content) + for match in matches: + # Only consider names that look like cluster properties + if self._looks_like_cluster_property(match): + # Try to correlate with topic properties + topic_prop = self._correlate_cluster_to_topic_property(match) + if topic_prop and topic_prop in self.topic_properties: + mappings[topic_prop] = match + + return mappings + + except Exception as e: + return {} + + def _looks_like_cluster_property(self, prop_name: str) -> bool: + """Check if a name looks like a cluster property""" + # Cluster properties typically have specific patterns + cluster_patterns = [ + r'^[a-z]+(_[a-z]+)*$', # snake_case like log_cleanup_policy + r'.*_default$', # ends with _default + r'.*_(ms|bytes|ratio|type|policy)$', # ends with common suffixes + ] + + return any(re.match(pattern, prop_name) for pattern in cluster_patterns) and len(prop_name) > 4 + + def _correlate_cluster_to_topic_property(self, cluster_prop: str) -> Optional[str]: + """Try to correlate a cluster property name to a topic property""" + + # Known correlation patterns + correlations = { + "log_cleanup_policy": "cleanup.policy", + "log_compression_type": "compression.type", + "log_retention_ms": "retention.ms", + "retention_bytes": "retention.bytes", + "log_segment_ms": "segment.ms", + "log_segment_size": "segment.bytes", + "log_message_timestamp_type": "message.timestamp.type", + 
"kafka_batch_max_bytes": "max.message.bytes", + "default_topic_replication": "replication.factor", + "write_caching_default": "write.caching", + } + + # Direct lookup first + if cluster_prop in correlations: + return correlations[cluster_prop] + + # Pattern-based correlation for properties we haven't hardcoded + # Convert cluster property naming to topic property naming + topic_candidates = [] + + # Remove common prefixes/suffixes + cleaned = cluster_prop + if cleaned.startswith("log_"): + cleaned = cleaned[4:] + if cleaned.endswith("_default"): + cleaned = cleaned[:-8] + if cleaned.endswith("_ms"): + cleaned = cleaned[:-3] + ".ms" + if cleaned.endswith("_bytes"): + cleaned = cleaned[:-6] + ".bytes" + if cleaned.endswith("_policy"): + cleaned = cleaned[:-7] + ".policy" + if cleaned.endswith("_type"): + cleaned = cleaned[:-5] + ".type" + + # Convert snake_case to dot.case + topic_candidate = cleaned.replace("_", ".") + + if topic_candidate in self.topic_properties: + return topic_candidate + + return None + + def _process_mapping_candidates(self, mapping_candidates: Dict[str, str]): + """Process and validate mapping candidates""" + for topic_prop, cluster_prop in mapping_candidates.items(): + if topic_prop in self.topic_properties: + self.cluster_mappings[topic_prop] = cluster_prop + + def _resolve_topic_property_name(self, var_name: str) -> Optional[str]: + """Resolve topic_property_xxx variable to actual property name""" + for prop_name, prop_data in self.topic_properties.items(): + if prop_data["variable_name"] == f"topic_property_{var_name}": + return prop_name + return None + + def _correlate_properties_with_data(self): + """Correlate topic properties with their acceptable values and cluster mappings""" + + for prop_name, prop_data in self.topic_properties.items(): + # Update cluster mapping if found + if prop_name in self.cluster_mappings: + prop_data["corresponding_cluster_property"] = self.cluster_mappings[prop_name] + + # Update acceptable values based on property type + prop_data["acceptable_values"] = self._determine_acceptable_values(prop_name, prop_data) + + def _determine_acceptable_values(self, prop_name: str, prop_data: Dict) -> str: + """Determine acceptable values for a property based on runtime analysis""" + + # Check if it's an enum-based property + if "compression" in prop_name: + if "compression" in self.enum_values: + values = self.enum_values["compression"]["values"] + # Filter out special values like 'count', 'producer' + filtered_values = [v for v in values if v not in ['count', 'producer']] + return f"[`{'`, `'.join(filtered_values)}`]" + + elif "cleanup.policy" in prop_name: + if "cleanup_policy_bitflags" in self.enum_values: + values = self.enum_values["cleanup_policy_bitflags"]["values"] + # Convert enum names to policy names + policy_values = [] + for v in values: + if v == "deletion": + policy_values.append("delete") + elif v == "compaction": + policy_values.append("compact") + if policy_values: + policy_values.append("compact,delete") # Combined policy + return f"[`{'`, `'.join(policy_values)}`]" + + elif "timestamp.type" in prop_name: + return "[`CreateTime`, `LogAppendTime`]" + + elif prop_data.get("type") == "boolean": + return "[`true`, `false`]" + + # For numeric properties, determine format based on type and name + elif prop_data.get("type") == "number" and "ratio" in prop_name: + return "[`0`, `1.0`]" + elif prop_data.get("type") == "integer": + if ".factor" in prop_name: + return "integer (1 or greater)" + elif ".bytes" in prop_name: + return "bytes 
(integer)" + elif ".ms" in prop_name: + return "milliseconds (integer)" + else: + return "integer" + + return "" # Default to empty if unknown + + def generate_topic_properties_adoc(self, output_path: str): + """Generate topic-properties.adoc file""" + + adoc_content = """= Topic Configuration Properties +:page-aliases: reference:topic-properties.adoc +:description: Reference of topic configuration properties. + +A topic-level property sets a Redpanda or Kafka configuration for a particular topic. + +Many topic-level properties have corresponding xref:manage:cluster-maintenance/cluster-property-configuration.adoc[cluster properties] that set a default value for all topics of a cluster. To customize the value for a topic, you can set a topic-level property that overrides the value of the corresponding cluster property. + +NOTE: All topic properties take effect immediately after being set. + +== Topic property mappings + +|=== +| Topic property | Corresponding cluster property + +""" + + # Add table rows ONLY for properties with cluster mappings + for prop_name, prop_data in sorted(self.topic_properties.items()): + cluster_prop = prop_data.get("corresponding_cluster_property") + if cluster_prop: # Only include if there's a cluster mapping + anchor = prop_name.replace(".", "").replace("-", "").lower() + adoc_content += f"| <<{anchor},`{prop_name}`>>\n" + adoc_content += f"| xref:./cluster-properties.adoc#{cluster_prop}[`{cluster_prop}`]\n\n" + + adoc_content += """|=== + +== Examples + +The following examples show how to configure topic-level properties. Set a topic-level property for a topic to override the value of corresponding cluster property. + +=== Create topic with topic properties + +To set topic properties when creating a topic, use the xref:reference:rpk/rpk-topic/rpk-topic-create.adoc[rpk topic create] command with the `-c` option. + +For example, to create a topic with the `cleanup.policy` property set to `compact`: + +[tabs] +==== +Local:: ++ +-- + +```bash +rpk topic create -c cleanup.policy=compact +``` + +-- +Kubernetes:: ++ +-- + +```bash +kubectl exec -- rpk topic create -c cleanup.policy=compact +``` + +-- +==== + +=== Modify topic properties + +To modify topic properties of an existing topic, use the xref:reference:rpk/rpk-topic/rpk-topic-alter-config.adoc[rpk topic alter-config] command. 
+ +For example, to modify a topic's `retention.ms` property: + +[tabs] +==== +Local:: ++ +-- + +```bash +rpk topic alter-config --set retention.ms= +``` + +-- +Kubernetes:: ++ +-- + +```bash +kubectl exec -- rpk topic alter-config --set retention.ms= +``` + +-- +==== + +== Topic properties + +""" + + # Add individual property documentation - ONLY include properties with cluster mappings + for prop_name, prop_data in sorted(self.topic_properties.items()): + cluster_prop = prop_data.get("corresponding_cluster_property") + + # Skip properties without cluster mappings (as requested by user) + if not cluster_prop: + continue + + anchor = prop_name.replace(".", "").replace("-", "").lower() + acceptable_values = prop_data.get("acceptable_values", "") + prop_type = prop_data.get("type", "string") + + adoc_content += f""" +[[{anchor}]] +=== {prop_name} + +*Type:* {prop_type} + +""" + if acceptable_values: + adoc_content += f"*Accepted values:* {acceptable_values}\n\n" + + adoc_content += "*Default:* null\n\n" + adoc_content += f"*Related cluster property:* xref:./cluster-properties.adoc#{cluster_prop}[`{cluster_prop}`]\n\n" + adoc_content += "---\n\n" + + # Write the file + output_dir = os.path.dirname(output_path) + if output_dir: # Only create directory if there's a path + os.makedirs(output_dir, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(adoc_content) + + print(f"Generated topic properties documentation: {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description="Extract topic properties from Redpanda source code") + parser.add_argument("--source-path", required=True, help="Path to Redpanda source code") + parser.add_argument("--output-json", help="Output JSON file path") + parser.add_argument("--output-adoc", help="Output AsciiDoc file path") + + args = parser.parse_args() + + extractor = TopicPropertyExtractor(args.source_path) + result = extractor.extract_topic_properties() + + print(f"Total topic properties found: {len(result['topic_properties'])}") + print(f"Topic properties with cluster mappings: {len(result['cluster_mappings'])}") + print(f"Enum types discovered: {len(result['enum_values'])}") + + if args.output_json: + with open(args.output_json, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2) + print(f"Topic properties JSON saved to: {args.output_json}") + + if args.output_adoc: + extractor.generate_topic_properties_adoc(args.output_adoc) + + +if __name__ == "__main__": + main() From f15c743d605fdb925eb2c1406da99788d8f5f783 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 22 Aug 2025 10:46:42 -0300 Subject: [PATCH 02/11] refactor initial page --- .../topic_property_extractor.py | 64 +------------------ 1 file changed, 3 insertions(+), 61 deletions(-) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py index 0480eca..8eb083e 100644 --- a/tools/property-extractor/topic_property_extractor.py +++ b/tools/property-extractor/topic_property_extractor.py @@ -420,7 +420,9 @@ def generate_topic_properties_adoc(self, output_path: str): Many topic-level properties have corresponding xref:manage:cluster-maintenance/cluster-property-configuration.adoc[cluster properties] that set a default value for all topics of a cluster. To customize the value for a topic, you can set a topic-level property that overrides the value of the corresponding cluster property. -NOTE: All topic properties take effect immediately after being set. 
+For information on how to configure topic properties, see xref:manage:cluster-maintenance/topic-property-configuration.adoc[].
+
+NOTE: All topic properties take effect immediately after being set.
 
 == Topic property mappings
 
@@ -439,66 +441,6 @@ def generate_topic_properties_adoc(self, output_path: str):
 
         adoc_content += """|===
 
-== Examples
-
-The following examples show how to configure topic-level properties. Set a topic-level property for a topic to override the value of corresponding cluster property.
-
-=== Create topic with topic properties
-
-To set topic properties when creating a topic, use the xref:reference:rpk/rpk-topic/rpk-topic-create.adoc[rpk topic create] command with the `-c` option.
-
-For example, to create a topic with the `cleanup.policy` property set to `compact`:
-
-[tabs]
-====
-Local::
-+
---
-
-```bash
-rpk topic create -c cleanup.policy=compact
-```
-
---
-Kubernetes::
-+
---
-
-```bash
-kubectl exec -- rpk topic create -c cleanup.policy=compact
-```
-
---
-====
-
-=== Modify topic properties
-
-To modify topic properties of an existing topic, use the xref:reference:rpk/rpk-topic/rpk-topic-alter-config.adoc[rpk topic alter-config] command.
-
-For example, to modify a topic's `retention.ms` property:
-
-[tabs]
-====
-Local::
-+
---
-
-```bash
-rpk topic alter-config --set retention.ms=
-```
-
---
-Kubernetes::
-+
---
-
-```bash
-kubectl exec -- rpk topic alter-config --set retention.ms=
-```
-
---
-====
-
 == Topic properties
 
 """
From 408b8873f08af6b7b54e6ac12870237fb55cae14 Mon Sep 17 00:00:00 2001
From: Paulo Borges
Date: Fri, 22 Aug 2025 11:04:17 -0300
Subject: [PATCH 03/11] add to doc-tools

---
 bin/doc-tools.js | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/bin/doc-tools.js b/bin/doc-tools.js
index fc784d9..19d17d6 100755
--- a/bin/doc-tools.js
+++ b/bin/doc-tools.js
@@ -777,6 +777,41 @@ automation
     process.exit(0);
   });
 
+automation
+  .command('topic-property-docs')
+  .description('Generate JSON and AsciiDoc documentation for Redpanda topic configuration properties')
+  .option('--tag <tag>', 'Git tag or branch to extract from', 'dev')
+  .option('--diff <oldTag>', 'Also diff autogenerated topic properties from <oldTag>')
+  .action((options) => {
+    verifyPropertyDependencies();
+
+    const newTag = options.tag;
+    const oldTag = options.diff;
+    const cwd = path.resolve(__dirname, '../tools/property-extractor');
+    const make = (tag) => {
+      console.log(`⏳ Building topic property docs for ${tag}…`);
+      const r = spawnSync('make', ['topic-properties', `TAG=${tag}`], { cwd, stdio: 'inherit' });
+      if (r.error) {
+        console.error(`❌ ${r.error.message}`);
+        process.exit(1);
+      }
+      if (r.status !== 0) process.exit(r.status);
+    };
+
+    if (oldTag) {
+      const oldDir = path.join('autogenerated', oldTag, 'properties');
+      if (!fs.existsSync(oldDir)) make(oldTag);
+    }
+
+    make(newTag);
+
+    if (oldTag) {
+      diffDirs('properties', oldTag, newTag);
+    }
+
+    process.exit(0);
+  });
+
 automation
   .command('rpk-docs')
   .description('Generate AsciiDoc documentation for rpk CLI commands')
From 21459a232a671c8bc234d708a7df21924a92974d Mon Sep 17 00:00:00 2001
From: Paulo Borges
Date: Fri, 22 Aug 2025 11:34:06 -0300
Subject: [PATCH 04/11] Apply suggestions from code review

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 tools/property-extractor/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/property-extractor/Makefile b/tools/property-extractor/Makefile
index 9bd1053..eedaf99 100644
--- 
a/tools/property-extractor/Makefile +++ b/tools/property-extractor/Makefile @@ -105,7 +105,7 @@ check: @echo "OUTPUT_DIR: $(OUTPUT_DIR)" # --- Extract topic properties --- -topic-properties: +topic-properties: venv redpanda-git treesitter @echo "🔧 Extracting topic properties with Redpanda tag: $(TAG)" @mkdir -p $(TOOL_ROOT)/gen @cd $(TOOL_ROOT) && \ From bb202cc0bc650f9a0a67b9a0bb951b55272488db Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 22 Aug 2025 13:06:04 -0300 Subject: [PATCH 05/11] code review --- .../property-extractor/topic_property_extractor.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py index 8eb083e..3035129 100644 --- a/tools/property-extractor/topic_property_extractor.py +++ b/tools/property-extractor/topic_property_extractor.py @@ -4,7 +4,8 @@ import json import argparse from pathlib import Path -from typing import Dict, List, Optional, Tuple, Set +import sys +from typing import Dict, List, Optional class TopicPropertyExtractor: @@ -104,8 +105,7 @@ def _scan_file_for_topic_properties(self, file_path: Path): "corresponding_cluster_property": None } except Exception as e: - # Skip files that can't be read - pass + print(f"Debug: Skipping {file_path}: {e}", file=sys.stderr) def _discover_enum_values(self): """Discover enum definitions that correspond to topic property acceptable values""" @@ -180,8 +180,7 @@ def _scan_file_for_enums(self, file_path: Path): "values": values } except Exception as e: - # Skip files that can't be read - pass + print(f"Debug: Error scanning enums in {file_path}: {e}", file=sys.stderr) def _determine_property_type(self, property_name: str) -> str: """Determine the type of a property based on its name and usage patterns""" @@ -278,6 +277,7 @@ def _find_mappings_in_file(self, file_path: Path) -> Dict[str, str]: return mappings except Exception as e: + print(f"Debug: Error finding mappings in {file_path}: {e}", file=sys.stderr) return {} def _looks_like_cluster_property(self, prop_name: str) -> bool: @@ -312,10 +312,6 @@ def _correlate_cluster_to_topic_property(self, cluster_prop: str) -> Optional[st if cluster_prop in correlations: return correlations[cluster_prop] - # Pattern-based correlation for properties we haven't hardcoded - # Convert cluster property naming to topic property naming - topic_candidates = [] - # Remove common prefixes/suffixes cleaned = cluster_prop if cleaned.startswith("log_"): From 52aca950aab12ac987bdaa55f1daefa406380f94 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 22 Aug 2025 13:06:10 -0300 Subject: [PATCH 06/11] bump version --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3ec01eb..069b550 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@redpanda-data/docs-extensions-and-macros", - "version": "4.7.4", + "version": "4.8.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@redpanda-data/docs-extensions-and-macros", - "version": "4.7.4", + "version": "4.7.2", "license": "ISC", "dependencies": { "@asciidoctor/tabs": "^1.0.0-beta.6", diff --git a/package.json b/package.json index fe63866..ee575b4 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@redpanda-data/docs-extensions-and-macros", - "version": "4.7.4", + "version": "4.8.0", "description": "Antora extensions and macros developed 
for Redpanda documentation.",
   "keywords": [
     "antora",
From bfd3ad31dc98fd5394473c87dfb829173941c866 Mon Sep 17 00:00:00 2001
From: Paulo Borges
Date: Fri, 22 Aug 2025 13:07:29 -0300
Subject: [PATCH 07/11] adjust indentation

---
 tools/property-extractor/topic_property_extractor.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py
index 3035129..8d483f8 100644
--- a/tools/property-extractor/topic_property_extractor.py
+++ b/tools/property-extractor/topic_property_extractor.py
@@ -175,10 +175,10 @@ def _scan_file_for_enums(self, file_path: Path):
                 if enum_name not in self.enum_values:
                     values = self._extract_enum_values(enum_body)
                     if values:
-                            self.enum_values[enum_name] = {
-                                "source_file": str(file_path.relative_to(self.source_path)),
-                                "values": values
-                            }
+                        self.enum_values[enum_name] = {
+                            "source_file": str(file_path.relative_to(self.source_path)),
+                            "values": values
+                        }
         except Exception as e:
             print(f"Debug: Error scanning enums in {file_path}: {e}", file=sys.stderr)
 
From 7babe63d38de23123fbc2449407448361314cd71 Mon Sep 17 00:00:00 2001
From: JakeSCahill
Date: Wed, 27 Aug 2025 09:32:53 +0100
Subject: [PATCH 08/11] Add test

---
 .../tools/topic_property_extractor.test.js    | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 __tests__/tools/topic_property_extractor.test.js

diff --git a/__tests__/tools/topic_property_extractor.test.js b/__tests__/tools/topic_property_extractor.test.js
new file mode 100644
index 0000000..45960bf
--- /dev/null
+++ b/__tests__/tools/topic_property_extractor.test.js
@@ -0,0 +1,58 @@
+const path = require('path');
+const fs = require('fs');
+const { execSync } = require('child_process');
+
+describe('topic_property_extractor.py', () => {
+  const scriptPath = path.resolve(__dirname, '../../tools/property-extractor/topic_property_extractor.py');
+  const mockSourcePath = path.resolve(__dirname, 'mock-redpanda-src');
+  const outputJson = path.resolve(__dirname, 'topic-properties-output.json');
+  const outputAdoc = path.resolve(__dirname, 'topic-properties.adoc');
+
+  beforeAll(() => {
+    // Create a minimal mock Redpanda source tree
+    if (!fs.existsSync(mockSourcePath)) {
+      fs.mkdirSync(mockSourcePath, { recursive: true });
+      // Create a mock header file with a topic property
+      const headerDir = path.join(mockSourcePath, 'src/v/kafka/server/handlers/topics');
+      fs.mkdirSync(headerDir, { recursive: true });
+      fs.writeFileSync(
+        path.join(headerDir, 'types.h'),
+        'inline constexpr std::string_view topic_property_retention_ms = "retention.ms";\n'
+      );
+      // Add a mock .cc file (should be ignored for property extraction)
+      fs.writeFileSync(
+        path.join(headerDir, 'types.cc'),
+        `// Copyright 2025 Redpanda Data, Inc.\n#include "kafka/server/handlers/topics/types.h"\n// ...rest of the file...\n`
+      );
+      // Add a mock config file to simulate a cluster property mapping
+      const configDir = path.join(mockSourcePath, 'src/v/config');
+      fs.mkdirSync(configDir, { recursive: true });
+      fs.writeFileSync(
+        path.join(configDir, 'mock_config.cc'),
+        'config.get("log_retention_ms");\n'
+      );
+    }
+  });
+
+  afterAll(() => {
+    // Cleanup
+    if (fs.existsSync(outputJson)) fs.unlinkSync(outputJson);
+    if (fs.existsSync(outputAdoc)) fs.unlinkSync(outputAdoc);
+    fs.rmSync(mockSourcePath, { recursive: true, force: true });
+  });
+
+  it('extracts topic properties and generates JSON', () => {
+    execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} 
--output-json ${outputJson}`); + const result = JSON.parse(fs.readFileSync(outputJson, 'utf8')); + expect(result.topic_properties).toBeDefined(); + expect(result.topic_properties['retention.ms']).toBeDefined(); + expect(result.topic_properties['retention.ms'].property_name).toBe('retention.ms'); + }); + + it('generates AsciiDoc output', () => { + execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} --output-adoc ${outputAdoc}`); + const adoc = fs.readFileSync(outputAdoc, 'utf8'); + expect(adoc).toContain('= Topic Configuration Properties'); + expect(adoc).toContain('retention.ms'); + }); +}); From e947244cbf174e142c04989803a0e912d5f90228 Mon Sep 17 00:00:00 2001 From: JakeSCahill Date: Wed, 27 Aug 2025 09:53:22 +0100 Subject: [PATCH 09/11] Ensure directory exists --- tools/property-extractor/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/property-extractor/Makefile b/tools/property-extractor/Makefile index eedaf99..6edc675 100644 --- a/tools/property-extractor/Makefile +++ b/tools/property-extractor/Makefile @@ -108,6 +108,7 @@ check: topic-properties: venv redpanda-git treesitter @echo "🔧 Extracting topic properties with Redpanda tag: $(TAG)" @mkdir -p $(TOOL_ROOT)/gen + @mkdir -p "$(OUTPUT_DIR)" @cd $(TOOL_ROOT) && \ $(PYTHON) topic_property_extractor.py \ --source-path $(REDPANDA_SRC) \ From 114c07c6e280f52bb88f3723a65a35243dfbdf4a Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Wed, 10 Sep 2025 19:05:16 -0300 Subject: [PATCH 10/11] adjust to capture more properties, and identify no-op props --- .../topic_property_extractor.py | 235 ++++++++++++++---- 1 file changed, 180 insertions(+), 55 deletions(-) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py index 8d483f8..a15d3cf 100644 --- a/tools/property-extractor/topic_property_extractor.py +++ b/tools/property-extractor/topic_property_extractor.py @@ -14,6 +14,7 @@ def __init__(self, source_path: str): self.topic_properties = {} self.cluster_mappings = {} self.enum_values = {} + self.noop_properties = set() def extract_topic_properties(self) -> Dict: """Extract topic property constants from source files""" @@ -24,39 +25,58 @@ def extract_topic_properties(self) -> Dict: # Step 2: Find enum definitions for acceptable values self._discover_enum_values() - # Step 3: Discover cluster property mappings from source code + # Step 3: Discover no-op properties + self._discover_noop_properties() + + # Step 4: Discover cluster property mappings from source code self._discover_cluster_mappings() - # Step 4: Match properties with their validators and mappings + # Step 5: Match properties with their validators and mappings self._correlate_properties_with_data() return { "topic_properties": self.topic_properties, "cluster_mappings": self.cluster_mappings, - "enum_values": self.enum_values + "enum_values": self.enum_values, + "noop_properties": list(self.noop_properties) } def _discover_topic_properties(self): """Dynamically discover all topic property constants from source files""" - # Search for all header files that might contain topic property constants - topic_property_files = [ + # Priority files - parse these first with the most comprehensive patterns + priority_files = [ "src/v/kafka/server/handlers/topics/types.h", - "src/v/kafka/protocol/topic_properties.h", + "src/v/kafka/protocol/topic_properties.h", "src/v/cluster/topic_properties.h", ] - for file_pattern in topic_property_files: + for file_pattern in priority_files: file_path = 
self.source_path / file_pattern if file_path.exists(): self._parse_topic_properties_from_file(file_path) - # Also search for any other files that might contain topic_property_ constants - for header_file in self.source_path.glob("src/**/*.h"): - if any(pattern in str(header_file) for pattern in ["topic", "kafka"]): - self._scan_file_for_topic_properties(header_file) - - print(f"Discovered {len(self.topic_properties)} topic properties") + # Comprehensive search - scan all header files that might contain properties + search_patterns = [ + "src/**/*topic*.h", + "src/**/*kafka*.h", + "src/**/*handler*.h", + "src/**/*config*.h", + "src/**/*property*.h", + ] + + scanned_files = set() + for pattern in search_patterns: + for header_file in self.source_path.glob(pattern): + if header_file not in scanned_files: + scanned_files.add(header_file) + self._scan_file_for_topic_properties(header_file) + + # Also scan the specific types.h file that we know contains many properties + types_files = list(self.source_path.glob("src/**/types.h")) + for types_file in types_files: + if types_file not in scanned_files: + self._scan_file_for_topic_properties(types_file) def _parse_topic_properties_from_file(self, file_path: Path): """Parse topic property constants from a specific file""" @@ -64,46 +84,85 @@ def _parse_topic_properties_from_file(self, file_path: Path): with open(file_path, 'r', encoding='utf-8') as f: content = f.read() - # Pattern to match: inline constexpr std::string_view topic_property_xxx = "yyy"; - pattern = r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)";' - matches = re.findall(pattern, content) - - for var_name, property_name in matches: - self.topic_properties[property_name] = { - "variable_name": f"topic_property_{var_name}", - "property_name": property_name, - "source_file": str(file_path.relative_to(self.source_path)), - "description": "", - "type": self._determine_property_type(property_name), - "acceptable_values": None, - "corresponding_cluster_property": None - } + # Multiple patterns to catch all possible property definitions + patterns = [ + # Pattern 1: inline constexpr std::string_view topic_property_xxx = "yyy"; + r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 2: constexpr std::string_view topic_property_xxx = "yyy"; + r'constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 3: const std::string topic_property_xxx = "yyy"; + r'const\s+std::string\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 4: static const char* topic_property_xxx = "yyy"; + r'static\s+const\s+char\*\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + ] + + total_matches = 0 + for pattern in patterns: + matches = re.findall(pattern, content) + total_matches += len(matches) - print(f"Found {len(matches)} topic properties in {file_path}") + for var_name, property_name in matches: + # Only add if not already found (prefer inline constexpr definitions) + if property_name not in self.topic_properties: + self.topic_properties[property_name] = { + "variable_name": f"topic_property_{var_name}", + "property_name": property_name, + "source_file": str(file_path.relative_to(self.source_path)), + "description": "", + "type": self._determine_property_type(property_name), + "acceptable_values": None, + "corresponding_cluster_property": None, + "is_noop": False # Will be updated later in _correlate_properties_with_data + } + print(f"Found {total_matches} topic properties in {file_path}") except Exception as e: 
print(f"Error reading {file_path}: {e}") def _scan_file_for_topic_properties(self, file_path: Path): """Scan any file for topic_property_ constants""" try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() - # Look for any topic_property_ declarations - pattern = r'topic_property_(\w+)\s*=\s*"([^"]+)"' - matches = re.findall(pattern, content) - - for var_name, property_name in matches: - if property_name not in self.topic_properties: - self.topic_properties[property_name] = { - "variable_name": f"topic_property_{var_name}", - "property_name": property_name, - "source_file": str(file_path.relative_to(self.source_path)), - "description": "", - "type": self._determine_property_type(property_name), - "acceptable_values": None, - "corresponding_cluster_property": None - } + # Enhanced patterns to catch all property definitions + patterns = [ + # Pattern 1: inline constexpr std::string_view topic_property_xxx = "yyy"; + r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 2: constexpr std::string_view topic_property_xxx = "yyy"; + r'constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 3: topic_property_xxx = "yyy" (simple assignment) + r'topic_property_(\w+)\s*=\s*"([^"]+)"', + # Pattern 4: const std::string topic_property_xxx = "yyy"; + r'const\s+std::string\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 5: Look for string literals that look like topic properties + r'"((?:redpanda\.|cleanup\.|compression\.|segment\.|flush\.|delete\.|replication\.|write\.|min\.|max\.|confluent\.)[^"]+)"' + ] + + for pattern in patterns: + matches = re.findall(pattern, content) + + for match in matches: + if len(match) == 2: + # Regular patterns with var_name and property_name + var_name, property_name = match + else: + # String literal pattern - generate var_name from property_name + property_name = match + var_name = re.sub(r'[^a-zA-Z0-9_]', '_', property_name) + var_name = re.sub(r'_+', '_', var_name).strip('_') + + # Validate this looks like a real topic property + if self._is_valid_topic_property(property_name) and property_name not in self.topic_properties: + self.topic_properties[property_name] = { + "variable_name": f"topic_property_{var_name}", + "property_name": property_name, + "source_file": str(file_path.relative_to(self.source_path)), + "description": "", + "type": self._determine_property_type(property_name), + "acceptable_values": None, + "corresponding_cluster_property": None, + "is_noop": False # Will be updated later in _correlate_properties_with_data + } except Exception as e: print(f"Debug: Skipping {file_path}: {e}", file=sys.stderr) @@ -125,8 +184,38 @@ def _discover_enum_values(self): # Also search other model files for enums for header_file in self.source_path.glob("src/v/model/**/*.h"): self._scan_file_for_enums(header_file) + + def _discover_noop_properties(self): + """Discover no-op properties from the allowlist_topic_noop_confs array""" + + # Look for the allowlist in types.h file + types_file = self.source_path / "src/v/kafka/server/handlers/topics/types.h" + if not types_file.exists(): + print("Warning: types.h file not found for no-op property detection") + return - print(f"Discovered {len(self.enum_values)} enum types") + try: + with open(types_file, 'r', encoding='utf-8') as f: + content = f.read() + + # Pattern to match the allowlist_topic_noop_confs array + # Looks for the array declaration and captures 
all string literals within it + pattern = r'allowlist_topic_noop_confs\s*=\s*\{([^}]+)\}' + match = re.search(pattern, content, re.DOTALL) + + if match: + array_content = match.group(1) + # Extract all quoted strings from the array + string_pattern = r'"([^"]+)"' + noop_properties = re.findall(string_pattern, array_content) + + self.noop_properties = set(noop_properties) + print(f"Found {len(self.noop_properties)} no-op properties") + else: + print("Warning: allowlist_topic_noop_confs array not found in types.h") + + except Exception as e: + print(f"Error reading no-op properties from {types_file}: {e}") def _parse_enums_from_file(self, file_path: Path): """Parse enum definitions from a file""" @@ -182,6 +271,37 @@ def _scan_file_for_enums(self, file_path: Path): except Exception as e: print(f"Debug: Error scanning enums in {file_path}: {e}", file=sys.stderr) + def _is_valid_topic_property(self, prop_name: str) -> bool: + """Validate that a string looks like a real topic property""" + + # Must be non-empty and reasonable length + if not prop_name or len(prop_name) < 3 or len(prop_name) > 100: + return False + + # Must contain only valid characters for topic properties + if not re.match(r'^[a-zA-Z][a-zA-Z0-9._-]*$', prop_name): + return False + + # Known topic property prefixes/patterns + valid_patterns = [ + r'^redpanda\.', + r'^cleanup\.policy$', + r'^compression\.type$', + r'^segment\.', + r'^flush\.', + r'^delete\.', + r'^replication\.factor$', + r'^write\.caching$', + r'^min\.', + r'^max\.', + r'^confluent\.', + r'.*\.ms$', + r'.*\.bytes$', + r'.*\.ratio$', + ] + + return any(re.match(pattern, prop_name, re.IGNORECASE) for pattern in valid_patterns) + def _determine_property_type(self, property_name: str) -> str: """Determine the type of a property based on its name and usage patterns""" @@ -246,8 +366,6 @@ def _discover_cluster_mappings(self): # Process mapping candidates to find correlations self._process_mapping_candidates(mapping_candidates) - print(f"Discovered {len(self.cluster_mappings)} cluster property mappings") - def _find_mappings_in_file(self, file_path: Path) -> Dict[str, str]: """Find potential topic-to-cluster property mappings in a file""" try: @@ -356,6 +474,9 @@ def _correlate_properties_with_data(self): if prop_name in self.cluster_mappings: prop_data["corresponding_cluster_property"] = self.cluster_mappings[prop_name] + # Mark as no-op if found in the allowlist + prop_data["is_noop"] = prop_name in self.noop_properties + # Update acceptable values based on property type prop_data["acceptable_values"] = self._determine_acceptable_values(prop_name, prop_data) @@ -427,10 +548,11 @@ def generate_topic_properties_adoc(self, output_path: str): """ - # Add table rows ONLY for properties with cluster mappings + # Add table rows ONLY for properties with cluster mappings and exclude no-ops for prop_name, prop_data in sorted(self.topic_properties.items()): cluster_prop = prop_data.get("corresponding_cluster_property") - if cluster_prop: # Only include if there's a cluster mapping + is_noop = prop_data.get("is_noop", False) + if cluster_prop and not is_noop: # Only include if there's a cluster mapping and not a no-op anchor = prop_name.replace(".", "").replace("-", "").lower() adoc_content += f"| <<{anchor},`{prop_name}`>>\n" adoc_content += f"| xref:./cluster-properties.adoc#{cluster_prop}[`{cluster_prop}`]\n\n" @@ -441,12 +563,13 @@ def generate_topic_properties_adoc(self, output_path: str): """ - # Add individual property documentation - ONLY include properties with cluster 
mappings + # Add individual property documentation - ONLY include properties with cluster mappings and exclude no-ops for prop_name, prop_data in sorted(self.topic_properties.items()): cluster_prop = prop_data.get("corresponding_cluster_property") + is_noop = prop_data.get("is_noop", False) - # Skip properties without cluster mappings (as requested by user) - if not cluster_prop: + # Skip properties without cluster mappings or no-op properties + if not cluster_prop or is_noop: continue anchor = prop_name.replace(".", "").replace("-", "").lower() @@ -488,9 +611,11 @@ def main(): extractor = TopicPropertyExtractor(args.source_path) result = extractor.extract_topic_properties() - print(f"Total topic properties found: {len(result['topic_properties'])}") - print(f"Topic properties with cluster mappings: {len(result['cluster_mappings'])}") - print(f"Enum types discovered: {len(result['enum_values'])}") + # Calculate properties that will be included in documentation (non-no-op with cluster mappings) + documented_props = [prop for prop, data in result['topic_properties'].items() + if data.get('corresponding_cluster_property') and not data.get('is_noop', False)] + + print(f"Found {len(result['topic_properties'])} total properties ({len(documented_props)} documented, {len(result['noop_properties'])} no-op)") if args.output_json: with open(args.output_json, 'w', encoding='utf-8') as f: From 0e8b8e896e33d2b7371cf8442af426bacd186e39 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 12 Sep 2025 10:20:39 -0300 Subject: [PATCH 11/11] add tests --- .../tools/topic_property_extractor.test.js | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/__tests__/tools/topic_property_extractor.test.js b/__tests__/tools/topic_property_extractor.test.js index 45960bf..f47bf96 100644 --- a/__tests__/tools/topic_property_extractor.test.js +++ b/__tests__/tools/topic_property_extractor.test.js @@ -12,12 +12,22 @@ describe('topic_property_extractor.py', () => { // Create a minimal mock Redpanda source tree if (!fs.existsSync(mockSourcePath)) { fs.mkdirSync(mockSourcePath, { recursive: true }); - // Create a mock header file with a topic property + // Create a mock header file with topic properties and no-op allowlist const headerDir = path.join(mockSourcePath, 'src/v/kafka/server/handlers/topics'); fs.mkdirSync(headerDir, { recursive: true }); fs.writeFileSync( path.join(headerDir, 'types.h'), - 'inline constexpr std::string_view topic_property_retention_ms = "retention.ms";\n' + `inline constexpr std::string_view topic_property_retention_ms = "retention.ms"; +inline constexpr std::string_view topic_property_segment_bytes = "segment.bytes"; +inline constexpr std::string_view topic_property_flush_messages = "flush.messages"; + +// Mock allowlist for no-op properties +inline constexpr std::array allowlist_topic_noop_confs = { + "flush.messages", + "segment.index.bytes", + "preallocate", +}; +` ); // Add a mock .cc file (should be ignored for property extraction) fs.writeFileSync( @@ -29,7 +39,7 @@ describe('topic_property_extractor.py', () => { fs.mkdirSync(configDir, { recursive: true }); fs.writeFileSync( path.join(configDir, 'mock_config.cc'), - 'config.get("log_retention_ms");\n' + 'config.get("log_retention_ms");\nconfig.get("log_segment_size");\n' ); } }); @@ -49,10 +59,39 @@ describe('topic_property_extractor.py', () => { expect(result.topic_properties['retention.ms'].property_name).toBe('retention.ms'); }); - it('generates AsciiDoc output', () => { + it('detects no-op properties 
correctly', () => { + execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} --output-json ${outputJson}`); + const result = JSON.parse(fs.readFileSync(outputJson, 'utf8')); + + // Check that noop_properties array is present + expect(result.noop_properties).toBeDefined(); + expect(Array.isArray(result.noop_properties)).toBe(true); + expect(result.noop_properties).toContain('flush.messages'); + expect(result.noop_properties).toContain('segment.index.bytes'); + expect(result.noop_properties).toContain('preallocate'); + + // Check that flush.messages is marked as no-op + if (result.topic_properties['flush.messages']) { + expect(result.topic_properties['flush.messages'].is_noop).toBe(true); + } + + // Check that regular properties are not marked as no-op + expect(result.topic_properties['retention.ms'].is_noop).toBe(false); + expect(result.topic_properties['segment.bytes'].is_noop).toBe(false); + }); + + it('excludes no-op properties from AsciiDoc generation', () => { execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} --output-adoc ${outputAdoc}`); const adoc = fs.readFileSync(outputAdoc, 'utf8'); + + // Should contain regular properties expect(adoc).toContain('= Topic Configuration Properties'); expect(adoc).toContain('retention.ms'); + expect(adoc).toContain('segment.bytes'); + + // Should NOT contain no-op properties in documentation + expect(adoc).not.toContain('flush.messages'); + expect(adoc).not.toContain('segment.index.bytes'); + expect(adoc).not.toContain('preallocate'); }); });
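
A quick usage sketch for the automation added in this series. The make target, the TAG variable, and the script flags are the ones defined in the patches above; the tag, checkout path, and output file names are illustrative:

```bash
# One-shot build: the target's prerequisites create the Python venv, fetch the
# Redpanda source at TAG, and build the tree-sitter parser before extraction runs.
make -C tools/property-extractor topic-properties TAG=v25.1.1

# Direct invocation against an existing Redpanda checkout. The extractor uses
# only the Python standard library, so no venv is needed for this path.
python3 tools/property-extractor/topic_property_extractor.py \
  --source-path ./redpanda \
  --output-json topic-properties-output.json \
  --output-adoc topic-properties.adoc
```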