From cb0baf2b351e0f7e32ce6e507fa8bcae26a5c3f7 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Thu, 21 Aug 2025 17:29:31 -0300 Subject: [PATCH 01/11] add topic property extractor add treesitter working version working version performance --- tools/property-extractor/Makefile | 37 +- .../json-to-asciidoc/generate_docs.py | 27 +- .../topic_property_extractor.py | 567 ++++++++++++++++++ 3 files changed, 617 insertions(+), 14 deletions(-) create mode 100644 tools/property-extractor/topic_property_extractor.py diff --git a/tools/property-extractor/Makefile b/tools/property-extractor/Makefile index 92f70ce..9bd1053 100644 --- a/tools/property-extractor/Makefile +++ b/tools/property-extractor/Makefile @@ -1,4 +1,16 @@ -.PHONY: build venv clean redpanda-git treesitter generate-docs check +.PHONY: build venv clean redpanda-git treesitter topic-properties generate-docs check + +# --- Main build: venv, fetch code, build parser, extract & docgen --- +build: venv redpanda-git treesitter + @echo "🔧 Building with Redpanda tag: $(TAG)" + @mkdir -p $(TOOL_ROOT)/gen + @cd $(TOOL_ROOT) && \ + $(PYTHON) -W ignore::FutureWarning property_extractor.py \ + --recursive \ + --path $(REDPANDA_SRC) \ + --output gen/properties-output.json + @echo "✅ Cluster properties JSON generated at $(TOOL_ROOT)/gen/properties-output.json" + @$(MAKE) generate-docs # Default tag (can be overridden via `make TAG=v25.1.1`) TAG ?= dev @@ -25,18 +37,6 @@ PYTHON := $(VENV)/bin/python OUTPUT_DIR := $(REPO_ROOT)/autogenerated/$(TAG)/properties TREE_SITTER := npx tree-sitter -# --- Main build: venv, fetch code, build parser, extract & docgen --- -build: venv redpanda-git treesitter - @echo "🔧 Building with Redpanda tag: $(TAG)" - @mkdir -p $(TOOL_ROOT)/gen - @cd $(TOOL_ROOT) && \ - $(PYTHON) -W ignore::FutureWarning property_extractor.py \ - --recursive \ - --path $(REDPANDA_SRC) \ - --output gen/properties-output.json - @echo "✅ JSON generated at $(TOOL_ROOT)/gen/properties-output.json" - @$(MAKE) generate-docs - # --- Ensure Python venv & dependencies --- venv: $(TOOL_ROOT)/requirements.txt @if [ ! 
-d "$(VENV)" ]; then \ @@ -103,3 +103,14 @@ check: @echo "VENV: $(VENV)" @echo "PYTHON: $(PYTHON)" @echo "OUTPUT_DIR: $(OUTPUT_DIR)" + +# --- Extract topic properties --- +topic-properties: + @echo "🔧 Extracting topic properties with Redpanda tag: $(TAG)" + @mkdir -p $(TOOL_ROOT)/gen + @cd $(TOOL_ROOT) && \ + $(PYTHON) topic_property_extractor.py \ + --source-path $(REDPANDA_SRC) \ + --output-json "$(OUTPUT_DIR)/topic-properties-output.json" \ + --output-adoc "$(OUTPUT_DIR)/topic-properties.adoc" + @echo "✅ Topic properties extracted" \ No newline at end of file diff --git a/tools/property-extractor/json-to-asciidoc/generate_docs.py b/tools/property-extractor/json-to-asciidoc/generate_docs.py index d5a1d38..9db6d41 100644 --- a/tools/property-extractor/json-to-asciidoc/generate_docs.py +++ b/tools/property-extractor/json-to-asciidoc/generate_docs.py @@ -14,6 +14,7 @@ OUTPUT_FILE_BROKER = "broker-properties.adoc" OUTPUT_FILE_CLUSTER = "cluster-properties.adoc" OUTPUT_FILE_CLOUD = "object-storage-properties.adoc" +OUTPUT_FILE_TOPIC = "topic-properties.adoc" OUTPUT_FILE_DEPRECATED = os.path.join("deprecated", "partials", "deprecated-properties.adoc") ALL_PROPERTIES_FILE = "all_properties.txt" @@ -66,6 +67,20 @@ ) CLUSTER_CONFIG_TITLE = "== Cluster configuration\n\n" +TOPIC_PAGE_TITLE = ( + "= Topic Configuration Properties\n" + ":page-aliases: reference:topic-properties.adoc\n" + ":description: Reference of topic configuration properties.\n\n" +) + +TOPIC_INTRO = ( + "A topic-level property sets a Redpanda or Kafka configuration for a particular topic.\n\n" + "Many topic-level properties have corresponding xref:manage:cluster-maintenance/cluster-property-configuration.adoc[cluster properties] that set a default value for all topics of a cluster. To customize the value for a topic, you can set a topic-level property that overrides the value of the corresponding cluster property.\n\n" + "NOTE: All topic properties take effect immediately after being set.\n\n" +) + +TOPIC_CONFIG_TITLE = "== Topic configuration\n\n" + CLOUD_PAGE_TITLE = ( "= Object Storage Properties\n" ":description: Reference of object storage properties.\n\n" @@ -92,7 +107,8 @@ "src/v/pandaproxy/schema_registry/configuration.cc": "schema reg", "src/v/pandaproxy/rest/configuration.cc": "http proxy", "src/v/kafka/client/configuration.cc": "http client", - "src/v/config/configuration.cc": "cluster" + "src/v/config/configuration.cc": "cluster", + "src/v/kafka/server/handlers/topics/types.cc": "topic" } SUFFIX_TO_UNIT = { @@ -339,6 +355,7 @@ def main(): kafka_client_content = [] cluster_config_content = [] cloud_config_content = [] + topic_config_content = [] deprecated_broker_content = [] deprecated_cluster_content = [] all_properties = [] @@ -388,6 +405,7 @@ def main(): "http client": kafka_client_content, "cluster": cluster_config_content, "cloud": cloud_config_content, + "topic": topic_config_content, } if group in group_mapping: group_mapping[group].append(property_doc) @@ -423,6 +441,12 @@ def main(): + CLOUD_CONFIG_TITLE + "".join(cloud_config_content) ) + topic_page = ( + TOPIC_PAGE_TITLE + + TOPIC_INTRO + + TOPIC_CONFIG_TITLE + + "".join(topic_config_content) + ) deprecated_page = ( DEPRECATED_PROPERTIES_TITLE + DEPRECATED_PROPERTIES_INTRO @@ -436,6 +460,7 @@ def main(): write_data_to_file(page_folder, OUTPUT_FILE_BROKER, broker_page) write_data_to_file(page_folder, OUTPUT_FILE_CLUSTER, cluster_page) write_data_to_file(page_folder, OUTPUT_FILE_CLOUD, cloud_page) + write_data_to_file(page_folder, OUTPUT_FILE_TOPIC, 
topic_page) write_data_to_file(page_folder, OUTPUT_FILE_DEPRECATED, deprecated_page) write_data_to_file(output_dir, ALL_PROPERTIES_FILE, "\n".join(all_properties)) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py new file mode 100644 index 0000000..0480eca --- /dev/null +++ b/tools/property-extractor/topic_property_extractor.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python3 +import os +import re +import json +import argparse +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Set + + +class TopicPropertyExtractor: + def __init__(self, source_path: str): + self.source_path = Path(source_path) + self.topic_properties = {} + self.cluster_mappings = {} + self.enum_values = {} + + def extract_topic_properties(self) -> Dict: + """Extract topic property constants from source files""" + + # Step 1: Discover all topic property constants + self._discover_topic_properties() + + # Step 2: Find enum definitions for acceptable values + self._discover_enum_values() + + # Step 3: Discover cluster property mappings from source code + self._discover_cluster_mappings() + + # Step 4: Match properties with their validators and mappings + self._correlate_properties_with_data() + + return { + "topic_properties": self.topic_properties, + "cluster_mappings": self.cluster_mappings, + "enum_values": self.enum_values + } + + def _discover_topic_properties(self): + """Dynamically discover all topic property constants from source files""" + + # Search for all header files that might contain topic property constants + topic_property_files = [ + "src/v/kafka/server/handlers/topics/types.h", + "src/v/kafka/protocol/topic_properties.h", + "src/v/cluster/topic_properties.h", + ] + + for file_pattern in topic_property_files: + file_path = self.source_path / file_pattern + if file_path.exists(): + self._parse_topic_properties_from_file(file_path) + + # Also search for any other files that might contain topic_property_ constants + for header_file in self.source_path.glob("src/**/*.h"): + if any(pattern in str(header_file) for pattern in ["topic", "kafka"]): + self._scan_file_for_topic_properties(header_file) + + print(f"Discovered {len(self.topic_properties)} topic properties") + + def _parse_topic_properties_from_file(self, file_path: Path): + """Parse topic property constants from a specific file""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Pattern to match: inline constexpr std::string_view topic_property_xxx = "yyy"; + pattern = r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)";' + matches = re.findall(pattern, content) + + for var_name, property_name in matches: + self.topic_properties[property_name] = { + "variable_name": f"topic_property_{var_name}", + "property_name": property_name, + "source_file": str(file_path.relative_to(self.source_path)), + "description": "", + "type": self._determine_property_type(property_name), + "acceptable_values": None, + "corresponding_cluster_property": None + } + + print(f"Found {len(matches)} topic properties in {file_path}") + except Exception as e: + print(f"Error reading {file_path}: {e}") + + def _scan_file_for_topic_properties(self, file_path: Path): + """Scan any file for topic_property_ constants""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Look for any topic_property_ declarations + pattern = r'topic_property_(\w+)\s*=\s*"([^"]+)"' + matches = re.findall(pattern, content) + + 
for var_name, property_name in matches:
+                if property_name not in self.topic_properties:
+                    self.topic_properties[property_name] = {
+                        "variable_name": f"topic_property_{var_name}",
+                        "property_name": property_name,
+                        "source_file": str(file_path.relative_to(self.source_path)),
+                        "description": "",
+                        "type": self._determine_property_type(property_name),
+                        "acceptable_values": None,
+                        "corresponding_cluster_property": None
+                    }
+        except Exception as e:
+            # Skip files that can't be read
+            pass
+
+    def _discover_enum_values(self):
+        """Discover enum definitions that correspond to topic property acceptable values"""
+
+        # Key enum files for topic property validation
+        enum_files = [
+            "src/v/model/compression.h",
+            "src/v/model/fundamental.h",
+            "src/v/model/timestamp.h",
+        ]
+
+        for file_pattern in enum_files:
+            file_path = self.source_path / file_pattern
+            if file_path.exists():
+                self._parse_enums_from_file(file_path)
+
+        # Also search other model files for enums
+        for header_file in self.source_path.glob("src/v/model/**/*.h"):
+            self._scan_file_for_enums(header_file)
+
+        print(f"Discovered {len(self.enum_values)} enum types")
+
+    def _parse_enums_from_file(self, file_path: Path):
+        """Parse enum definitions from a file"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Pattern for enum class definitions
+            enum_pattern = r'enum\s+class\s+(\w+)\s*[^{]*{([^}]+)}'
+            enum_matches = re.findall(enum_pattern, content, re.DOTALL)
+
+            for enum_name, enum_body in enum_matches:
+                values = self._extract_enum_values(enum_body)
+                if values:
+                    self.enum_values[enum_name] = {
+                        "source_file": str(file_path.relative_to(self.source_path)),
+                        "values": values
+                    }
+
+            # Pattern for regular enums too
+            regular_enum_pattern = r'enum\s+(\w+)\s*{([^}]+)}'
+            regular_matches = re.findall(regular_enum_pattern, content, re.DOTALL)
+
+            for enum_name, enum_body in regular_matches:
+                values = self._extract_enum_values(enum_body)
+                if values:
+                    self.enum_values[enum_name] = {
+                        "source_file": str(file_path.relative_to(self.source_path)),
+                        "values": values
+                    }
+
+        except Exception as e:
+            print(f"Error parsing enums from {file_path}: {e}")
+
+    def _scan_file_for_enums(self, file_path: Path):
+        """Scan any file for enum definitions"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Look for enum class definitions
+            enum_pattern = r'enum\s+class\s+(\w+)\s*[^{]*{([^}]+)}'
+            matches = re.findall(enum_pattern, content, re.DOTALL)
+
+            for enum_name, enum_body in matches:
+                if enum_name not in self.enum_values:
+                    values = self._extract_enum_values(enum_body)
+                    if values:
+                            self.enum_values[enum_name] = {
+                                "source_file": str(file_path.relative_to(self.source_path)),
+                                "values": values
+                            }
+        except Exception as e:
+            # Skip files that can't be read
+            pass
+
+    def _determine_property_type(self, property_name: str) -> str:
+        """Determine the type of a property based on its name and usage patterns"""
+
+        # Boolean flags are an explicit allowlist, checked by direct membership so that
+        # names merely containing "read"/"write"/"delete" (e.g. delete.retention.ms) reach the suffix checks.
+        if property_name in ["write.caching", "redpanda.remote.recovery", "redpanda.remote.write",
+                             "redpanda.remote.read", "redpanda.remote.delete", "redpanda.remote.readreplica"]:
+            return "boolean"
+
+        elif any(suffix in property_name for suffix in [".bytes", ".ms", ".factor", ".lag.ms"]):
+            return "integer"
+
+        elif "ratio" in property_name:
+            return "number"
+
+        elif property_name in ["cleanup.policy", 
"compression.type", "message.timestamp.type"]: + return "string" # enum-based strings + + # Default to string for unknown properties + return "string" + + def _extract_enum_values(self, enum_body: str) -> List[str]: + """Extract enum value names from enum body""" + values = [] + + # Pattern to match enum value declarations (handle various formats) + value_patterns = [ + r'(\w+)\s*=\s*[^,}]+', # name = value + r'(\w+)\s*,', # name, + r'(\w+)\s*}' # name} + ] + + for pattern in value_patterns: + matches = re.findall(pattern, enum_body) + for match in matches: + if match and match not in values and not match.isdigit(): + values.append(match) + + return values + + def _discover_cluster_mappings(self): + """Discover topic-to-cluster property mappings from source code""" + + # Search in configuration and handler files for mappings + search_patterns = [ + "src/v/config/**/*.cc", + "src/v/config/**/*.h", + "src/v/kafka/server/handlers/**/*.cc", + "src/v/kafka/server/handlers/**/*.h", + "src/v/cluster/**/*.cc", + "src/v/cluster/**/*.h" + ] + + mapping_candidates = {} + + for pattern in search_patterns: + for file_path in self.source_path.glob(pattern): + if file_path.is_file(): + candidates = self._find_mappings_in_file(file_path) + mapping_candidates.update(candidates) + + # Process mapping candidates to find correlations + self._process_mapping_candidates(mapping_candidates) + + print(f"Discovered {len(self.cluster_mappings)} cluster property mappings") + + def _find_mappings_in_file(self, file_path: Path) -> Dict[str, str]: + """Find potential topic-to-cluster property mappings in a file""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + mappings = {} + + # Pattern 1: Look for configuration property definitions with proper cluster prop names + # Example: config.get("log_cleanup_policy") or similar patterns + config_patterns = [ + r'config\.get\("([^"]+)"\)', # config.get("property_name") + r'\.([a-z_]+(?:_[a-z]+)*)\(', # method calls like .retention_bytes( + r'([a-z_]+(?:_[a-z]+)*)\s*=', # assignments like retention_bytes = + ] + + for pattern in config_patterns: + matches = re.findall(pattern, content) + for match in matches: + # Only consider names that look like cluster properties + if self._looks_like_cluster_property(match): + # Try to correlate with topic properties + topic_prop = self._correlate_cluster_to_topic_property(match) + if topic_prop and topic_prop in self.topic_properties: + mappings[topic_prop] = match + + return mappings + + except Exception as e: + return {} + + def _looks_like_cluster_property(self, prop_name: str) -> bool: + """Check if a name looks like a cluster property""" + # Cluster properties typically have specific patterns + cluster_patterns = [ + r'^[a-z]+(_[a-z]+)*$', # snake_case like log_cleanup_policy + r'.*_default$', # ends with _default + r'.*_(ms|bytes|ratio|type|policy)$', # ends with common suffixes + ] + + return any(re.match(pattern, prop_name) for pattern in cluster_patterns) and len(prop_name) > 4 + + def _correlate_cluster_to_topic_property(self, cluster_prop: str) -> Optional[str]: + """Try to correlate a cluster property name to a topic property""" + + # Known correlation patterns + correlations = { + "log_cleanup_policy": "cleanup.policy", + "log_compression_type": "compression.type", + "log_retention_ms": "retention.ms", + "retention_bytes": "retention.bytes", + "log_segment_ms": "segment.ms", + "log_segment_size": "segment.bytes", + "log_message_timestamp_type": "message.timestamp.type", + 
"kafka_batch_max_bytes": "max.message.bytes", + "default_topic_replication": "replication.factor", + "write_caching_default": "write.caching", + } + + # Direct lookup first + if cluster_prop in correlations: + return correlations[cluster_prop] + + # Pattern-based correlation for properties we haven't hardcoded + # Convert cluster property naming to topic property naming + topic_candidates = [] + + # Remove common prefixes/suffixes + cleaned = cluster_prop + if cleaned.startswith("log_"): + cleaned = cleaned[4:] + if cleaned.endswith("_default"): + cleaned = cleaned[:-8] + if cleaned.endswith("_ms"): + cleaned = cleaned[:-3] + ".ms" + if cleaned.endswith("_bytes"): + cleaned = cleaned[:-6] + ".bytes" + if cleaned.endswith("_policy"): + cleaned = cleaned[:-7] + ".policy" + if cleaned.endswith("_type"): + cleaned = cleaned[:-5] + ".type" + + # Convert snake_case to dot.case + topic_candidate = cleaned.replace("_", ".") + + if topic_candidate in self.topic_properties: + return topic_candidate + + return None + + def _process_mapping_candidates(self, mapping_candidates: Dict[str, str]): + """Process and validate mapping candidates""" + for topic_prop, cluster_prop in mapping_candidates.items(): + if topic_prop in self.topic_properties: + self.cluster_mappings[topic_prop] = cluster_prop + + def _resolve_topic_property_name(self, var_name: str) -> Optional[str]: + """Resolve topic_property_xxx variable to actual property name""" + for prop_name, prop_data in self.topic_properties.items(): + if prop_data["variable_name"] == f"topic_property_{var_name}": + return prop_name + return None + + def _correlate_properties_with_data(self): + """Correlate topic properties with their acceptable values and cluster mappings""" + + for prop_name, prop_data in self.topic_properties.items(): + # Update cluster mapping if found + if prop_name in self.cluster_mappings: + prop_data["corresponding_cluster_property"] = self.cluster_mappings[prop_name] + + # Update acceptable values based on property type + prop_data["acceptable_values"] = self._determine_acceptable_values(prop_name, prop_data) + + def _determine_acceptable_values(self, prop_name: str, prop_data: Dict) -> str: + """Determine acceptable values for a property based on runtime analysis""" + + # Check if it's an enum-based property + if "compression" in prop_name: + if "compression" in self.enum_values: + values = self.enum_values["compression"]["values"] + # Filter out special values like 'count', 'producer' + filtered_values = [v for v in values if v not in ['count', 'producer']] + return f"[`{'`, `'.join(filtered_values)}`]" + + elif "cleanup.policy" in prop_name: + if "cleanup_policy_bitflags" in self.enum_values: + values = self.enum_values["cleanup_policy_bitflags"]["values"] + # Convert enum names to policy names + policy_values = [] + for v in values: + if v == "deletion": + policy_values.append("delete") + elif v == "compaction": + policy_values.append("compact") + if policy_values: + policy_values.append("compact,delete") # Combined policy + return f"[`{'`, `'.join(policy_values)}`]" + + elif "timestamp.type" in prop_name: + return "[`CreateTime`, `LogAppendTime`]" + + elif prop_data.get("type") == "boolean": + return "[`true`, `false`]" + + # For numeric properties, determine format based on type and name + elif prop_data.get("type") == "number" and "ratio" in prop_name: + return "[`0`, `1.0`]" + elif prop_data.get("type") == "integer": + if ".factor" in prop_name: + return "integer (1 or greater)" + elif ".bytes" in prop_name: + return "bytes 
(integer)" + elif ".ms" in prop_name: + return "milliseconds (integer)" + else: + return "integer" + + return "" # Default to empty if unknown + + def generate_topic_properties_adoc(self, output_path: str): + """Generate topic-properties.adoc file""" + + adoc_content = """= Topic Configuration Properties +:page-aliases: reference:topic-properties.adoc +:description: Reference of topic configuration properties. + +A topic-level property sets a Redpanda or Kafka configuration for a particular topic. + +Many topic-level properties have corresponding xref:manage:cluster-maintenance/cluster-property-configuration.adoc[cluster properties] that set a default value for all topics of a cluster. To customize the value for a topic, you can set a topic-level property that overrides the value of the corresponding cluster property. + +NOTE: All topic properties take effect immediately after being set. + +== Topic property mappings + +|=== +| Topic property | Corresponding cluster property + +""" + + # Add table rows ONLY for properties with cluster mappings + for prop_name, prop_data in sorted(self.topic_properties.items()): + cluster_prop = prop_data.get("corresponding_cluster_property") + if cluster_prop: # Only include if there's a cluster mapping + anchor = prop_name.replace(".", "").replace("-", "").lower() + adoc_content += f"| <<{anchor},`{prop_name}`>>\n" + adoc_content += f"| xref:./cluster-properties.adoc#{cluster_prop}[`{cluster_prop}`]\n\n" + + adoc_content += """|=== + +== Examples + +The following examples show how to configure topic-level properties. Set a topic-level property for a topic to override the value of corresponding cluster property. + +=== Create topic with topic properties + +To set topic properties when creating a topic, use the xref:reference:rpk/rpk-topic/rpk-topic-create.adoc[rpk topic create] command with the `-c` option. + +For example, to create a topic with the `cleanup.policy` property set to `compact`: + +[tabs] +==== +Local:: ++ +-- + +```bash +rpk topic create -c cleanup.policy=compact +``` + +-- +Kubernetes:: ++ +-- + +```bash +kubectl exec -- rpk topic create -c cleanup.policy=compact +``` + +-- +==== + +=== Modify topic properties + +To modify topic properties of an existing topic, use the xref:reference:rpk/rpk-topic/rpk-topic-alter-config.adoc[rpk topic alter-config] command. 
+ +For example, to modify a topic's `retention.ms` property: + +[tabs] +==== +Local:: ++ +-- + +```bash +rpk topic alter-config --set retention.ms= +``` + +-- +Kubernetes:: ++ +-- + +```bash +kubectl exec -- rpk topic alter-config --set retention.ms= +``` + +-- +==== + +== Topic properties + +""" + + # Add individual property documentation - ONLY include properties with cluster mappings + for prop_name, prop_data in sorted(self.topic_properties.items()): + cluster_prop = prop_data.get("corresponding_cluster_property") + + # Skip properties without cluster mappings (as requested by user) + if not cluster_prop: + continue + + anchor = prop_name.replace(".", "").replace("-", "").lower() + acceptable_values = prop_data.get("acceptable_values", "") + prop_type = prop_data.get("type", "string") + + adoc_content += f""" +[[{anchor}]] +=== {prop_name} + +*Type:* {prop_type} + +""" + if acceptable_values: + adoc_content += f"*Accepted values:* {acceptable_values}\n\n" + + adoc_content += "*Default:* null\n\n" + adoc_content += f"*Related cluster property:* xref:./cluster-properties.adoc#{cluster_prop}[`{cluster_prop}`]\n\n" + adoc_content += "---\n\n" + + # Write the file + output_dir = os.path.dirname(output_path) + if output_dir: # Only create directory if there's a path + os.makedirs(output_dir, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(adoc_content) + + print(f"Generated topic properties documentation: {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description="Extract topic properties from Redpanda source code") + parser.add_argument("--source-path", required=True, help="Path to Redpanda source code") + parser.add_argument("--output-json", help="Output JSON file path") + parser.add_argument("--output-adoc", help="Output AsciiDoc file path") + + args = parser.parse_args() + + extractor = TopicPropertyExtractor(args.source_path) + result = extractor.extract_topic_properties() + + print(f"Total topic properties found: {len(result['topic_properties'])}") + print(f"Topic properties with cluster mappings: {len(result['cluster_mappings'])}") + print(f"Enum types discovered: {len(result['enum_values'])}") + + if args.output_json: + with open(args.output_json, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2) + print(f"Topic properties JSON saved to: {args.output_json}") + + if args.output_adoc: + extractor.generate_topic_properties_adoc(args.output_adoc) + + +if __name__ == "__main__": + main() From f15c743d605fdb925eb2c1406da99788d8f5f783 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 22 Aug 2025 10:46:42 -0300 Subject: [PATCH 02/11] refactor initial page --- .../topic_property_extractor.py | 64 +------------------ 1 file changed, 3 insertions(+), 61 deletions(-) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py index 0480eca..8eb083e 100644 --- a/tools/property-extractor/topic_property_extractor.py +++ b/tools/property-extractor/topic_property_extractor.py @@ -420,7 +420,9 @@ def generate_topic_properties_adoc(self, output_path: str): Many topic-level properties have corresponding xref:manage:cluster-maintenance/cluster-property-configuration.adoc[cluster properties] that set a default value for all topics of a cluster. To customize the value for a topic, you can set a topic-level property that overrides the value of the corresponding cluster property. -NOTE: All topic properties take effect immediately after being set. 
+For information on how to configure topic properties, see xref:manage:cluster-maintenance/topic-property-configuration.adoc[].
+
+NOTE: All topic properties take effect immediately after being set.
 
 == Topic property mappings
 
@@ -439,66 +441,6 @@ def generate_topic_properties_adoc(self, output_path: str):
 
         adoc_content += """|===
 
-== Examples
-
-The following examples show how to configure topic-level properties. Set a topic-level property for a topic to override the value of corresponding cluster property.
-
-=== Create topic with topic properties
-
-To set topic properties when creating a topic, use the xref:reference:rpk/rpk-topic/rpk-topic-create.adoc[rpk topic create] command with the `-c` option.
-
-For example, to create a topic with the `cleanup.policy` property set to `compact`:
-
-[tabs]
-====
-Local::
-+
---
-
-```bash
-rpk topic create -c cleanup.policy=compact
-```
-
---
-Kubernetes::
-+
---
-
-```bash
-kubectl exec -- rpk topic create -c cleanup.policy=compact
-```
-
---
-====
-
-=== Modify topic properties
-
-To modify topic properties of an existing topic, use the xref:reference:rpk/rpk-topic/rpk-topic-alter-config.adoc[rpk topic alter-config] command.
-
-For example, to modify a topic's `retention.ms` property:
-
-[tabs]
-====
-Local::
-+
---
-
-```bash
-rpk topic alter-config --set retention.ms=
-```
-
---
-Kubernetes::
-+
---
-
-```bash
-kubectl exec -- rpk topic alter-config --set retention.ms=
-```
-
---
-====
-
 == Topic properties
 
 """
From 408b8873f08af6b7b54e6ac12870237fb55cae14 Mon Sep 17 00:00:00 2001
From: Paulo Borges
Date: Fri, 22 Aug 2025 11:04:17 -0300
Subject: [PATCH 03/11] add to doc-tools

---
 bin/doc-tools.js | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/bin/doc-tools.js b/bin/doc-tools.js
index fc784d9..19d17d6 100755
--- a/bin/doc-tools.js
+++ b/bin/doc-tools.js
@@ -777,6 +777,41 @@ automation
     process.exit(0);
   });
 
+automation
+  .command('topic-property-docs')
+  .description('Generate JSON and AsciiDoc documentation for Redpanda topic configuration properties')
+  .option('--tag <tag>', 'Git tag or branch to extract from', 'dev')
+  .option('--diff <oldTag>', 'Also diff autogenerated topic properties from <oldTag>')
+  .action((options) => {
+    verifyPropertyDependencies();
+
+    const newTag = options.tag;
+    const oldTag = options.diff;
+    const cwd = path.resolve(__dirname, '../tools/property-extractor');
+    const make = (tag) => {
+      console.log(`⏳ Building topic property docs for ${tag}…`);
+      const r = spawnSync('make', ['topic-properties', `TAG=${tag}`], { cwd, stdio: 'inherit' });
+      if (r.error) {
+        console.error(`❌ ${r.error.message}`);
+        process.exit(1);
+      }
+      if (r.status !== 0) process.exit(r.status);
+    };
+
+    if (oldTag) {
+      const oldDir = path.join('autogenerated', oldTag, 'properties');
+      if (!fs.existsSync(oldDir)) make(oldTag);
+    }
+
+    make(newTag);
+
+    if (oldTag) {
+      diffDirs('properties', oldTag, newTag);
+    }
+
+    process.exit(0);
+  });
+
 automation
   .command('rpk-docs')
   .description('Generate AsciiDoc documentation for rpk CLI commands')
From 21459a232a671c8bc234d708a7df21924a92974d Mon Sep 17 00:00:00 2001
From: Paulo Borges
Date: Fri, 22 Aug 2025 11:34:06 -0300
Subject: [PATCH 04/11] Apply suggestions from code review

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 tools/property-extractor/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/property-extractor/Makefile b/tools/property-extractor/Makefile
index 9bd1053..eedaf99 100644
--- 
a/tools/property-extractor/Makefile +++ b/tools/property-extractor/Makefile @@ -105,7 +105,7 @@ check: @echo "OUTPUT_DIR: $(OUTPUT_DIR)" # --- Extract topic properties --- -topic-properties: +topic-properties: venv redpanda-git treesitter @echo "🔧 Extracting topic properties with Redpanda tag: $(TAG)" @mkdir -p $(TOOL_ROOT)/gen @cd $(TOOL_ROOT) && \ From bb202cc0bc650f9a0a67b9a0bb951b55272488db Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 22 Aug 2025 13:06:04 -0300 Subject: [PATCH 05/11] code review --- .../property-extractor/topic_property_extractor.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py index 8eb083e..3035129 100644 --- a/tools/property-extractor/topic_property_extractor.py +++ b/tools/property-extractor/topic_property_extractor.py @@ -4,7 +4,8 @@ import json import argparse from pathlib import Path -from typing import Dict, List, Optional, Tuple, Set +import sys +from typing import Dict, List, Optional class TopicPropertyExtractor: @@ -104,8 +105,7 @@ def _scan_file_for_topic_properties(self, file_path: Path): "corresponding_cluster_property": None } except Exception as e: - # Skip files that can't be read - pass + print(f"Debug: Skipping {file_path}: {e}", file=sys.stderr) def _discover_enum_values(self): """Discover enum definitions that correspond to topic property acceptable values""" @@ -180,8 +180,7 @@ def _scan_file_for_enums(self, file_path: Path): "values": values } except Exception as e: - # Skip files that can't be read - pass + print(f"Debug: Error scanning enums in {file_path}: {e}", file=sys.stderr) def _determine_property_type(self, property_name: str) -> str: """Determine the type of a property based on its name and usage patterns""" @@ -278,6 +277,7 @@ def _find_mappings_in_file(self, file_path: Path) -> Dict[str, str]: return mappings except Exception as e: + print(f"Debug: Error finding mappings in {file_path}: {e}", file=sys.stderr) return {} def _looks_like_cluster_property(self, prop_name: str) -> bool: @@ -312,10 +312,6 @@ def _correlate_cluster_to_topic_property(self, cluster_prop: str) -> Optional[st if cluster_prop in correlations: return correlations[cluster_prop] - # Pattern-based correlation for properties we haven't hardcoded - # Convert cluster property naming to topic property naming - topic_candidates = [] - # Remove common prefixes/suffixes cleaned = cluster_prop if cleaned.startswith("log_"): From 52aca950aab12ac987bdaa55f1daefa406380f94 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 22 Aug 2025 13:06:10 -0300 Subject: [PATCH 06/11] bump version --- package-lock.json | 4 ++-- package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/package-lock.json b/package-lock.json index 3ec01eb..069b550 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "@redpanda-data/docs-extensions-and-macros", - "version": "4.7.4", + "version": "4.8.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@redpanda-data/docs-extensions-and-macros", - "version": "4.7.4", + "version": "4.7.2", "license": "ISC", "dependencies": { "@asciidoctor/tabs": "^1.0.0-beta.6", diff --git a/package.json b/package.json index fe63866..ee575b4 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@redpanda-data/docs-extensions-and-macros", - "version": "4.7.4", + "version": "4.8.0", "description": "Antora extensions and macros developed 
for Redpanda documentation.",
   "keywords": [
     "antora",
From bfd3ad31dc98fd5394473c87dfb829173941c866 Mon Sep 17 00:00:00 2001
From: Paulo Borges
Date: Fri, 22 Aug 2025 13:07:29 -0300
Subject: [PATCH 07/11] adjust indentation

---
 tools/property-extractor/topic_property_extractor.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py
index 3035129..8d483f8 100644
--- a/tools/property-extractor/topic_property_extractor.py
+++ b/tools/property-extractor/topic_property_extractor.py
@@ -175,10 +175,10 @@ def _scan_file_for_enums(self, file_path: Path):
                 if enum_name not in self.enum_values:
                     values = self._extract_enum_values(enum_body)
                     if values:
-                            self.enum_values[enum_name] = {
-                                "source_file": str(file_path.relative_to(self.source_path)),
-                                "values": values
-                            }
+                        self.enum_values[enum_name] = {
+                            "source_file": str(file_path.relative_to(self.source_path)),
+                            "values": values
+                        }
         except Exception as e:
             print(f"Debug: Error scanning enums in {file_path}: {e}", file=sys.stderr)
 
From 7babe63d38de23123fbc2449407448361314cd71 Mon Sep 17 00:00:00 2001
From: JakeSCahill
Date: Wed, 27 Aug 2025 09:32:53 +0100
Subject: [PATCH 08/11] Add test

---
 .../tools/topic_property_extractor.test.js    | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 __tests__/tools/topic_property_extractor.test.js

diff --git a/__tests__/tools/topic_property_extractor.test.js b/__tests__/tools/topic_property_extractor.test.js
new file mode 100644
index 0000000..45960bf
--- /dev/null
+++ b/__tests__/tools/topic_property_extractor.test.js
@@ -0,0 +1,58 @@
+const path = require('path');
+const fs = require('fs');
+const { execSync } = require('child_process');
+
+describe('topic_property_extractor.py', () => {
+  const scriptPath = path.resolve(__dirname, '../../tools/property-extractor/topic_property_extractor.py');
+  const mockSourcePath = path.resolve(__dirname, 'mock-redpanda-src');
+  const outputJson = path.resolve(__dirname, 'topic-properties-output.json');
+  const outputAdoc = path.resolve(__dirname, 'topic-properties.adoc');
+
+  beforeAll(() => {
+    // Create a minimal mock Redpanda source tree
+    if (!fs.existsSync(mockSourcePath)) {
+      fs.mkdirSync(mockSourcePath, { recursive: true });
+      // Create a mock header file with a topic property
+      const headerDir = path.join(mockSourcePath, 'src/v/kafka/server/handlers/topics');
+      fs.mkdirSync(headerDir, { recursive: true });
+      fs.writeFileSync(
+        path.join(headerDir, 'types.h'),
+        'inline constexpr std::string_view topic_property_retention_ms = "retention.ms";\n'
+      );
+      // Add a mock .cc file (should be ignored for property extraction)
+      fs.writeFileSync(
+        path.join(headerDir, 'types.cc'),
+        `// Copyright 2025 Redpanda Data, Inc.\n#include "kafka/server/handlers/topics/types.h"\n// ...rest of the file...\n`
+      );
+      // Add a mock config file to simulate a cluster property mapping
+      const configDir = path.join(mockSourcePath, 'src/v/config');
+      fs.mkdirSync(configDir, { recursive: true });
+      fs.writeFileSync(
+        path.join(configDir, 'mock_config.cc'),
+        'config.get("log_retention_ms");\n'
+      );
+    }
+  });
+
+  afterAll(() => {
+    // Cleanup
+    if (fs.existsSync(outputJson)) fs.unlinkSync(outputJson);
+    if (fs.existsSync(outputAdoc)) fs.unlinkSync(outputAdoc);
+    fs.rmSync(mockSourcePath, { recursive: true, force: true });
+  });
+
+  it('extracts topic properties and generates JSON', () => {
+    execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} 
--output-json ${outputJson}`); + const result = JSON.parse(fs.readFileSync(outputJson, 'utf8')); + expect(result.topic_properties).toBeDefined(); + expect(result.topic_properties['retention.ms']).toBeDefined(); + expect(result.topic_properties['retention.ms'].property_name).toBe('retention.ms'); + }); + + it('generates AsciiDoc output', () => { + execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} --output-adoc ${outputAdoc}`); + const adoc = fs.readFileSync(outputAdoc, 'utf8'); + expect(adoc).toContain('= Topic Configuration Properties'); + expect(adoc).toContain('retention.ms'); + }); +}); From e947244cbf174e142c04989803a0e912d5f90228 Mon Sep 17 00:00:00 2001 From: JakeSCahill Date: Wed, 27 Aug 2025 09:53:22 +0100 Subject: [PATCH 09/11] Ensure directory exists --- tools/property-extractor/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/property-extractor/Makefile b/tools/property-extractor/Makefile index eedaf99..6edc675 100644 --- a/tools/property-extractor/Makefile +++ b/tools/property-extractor/Makefile @@ -108,6 +108,7 @@ check: topic-properties: venv redpanda-git treesitter @echo "🔧 Extracting topic properties with Redpanda tag: $(TAG)" @mkdir -p $(TOOL_ROOT)/gen + @mkdir -p "$(OUTPUT_DIR)" @cd $(TOOL_ROOT) && \ $(PYTHON) topic_property_extractor.py \ --source-path $(REDPANDA_SRC) \ From 114c07c6e280f52bb88f3723a65a35243dfbdf4a Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Wed, 10 Sep 2025 19:05:16 -0300 Subject: [PATCH 10/11] adjust to capture more properties, and identify no-op props --- .../topic_property_extractor.py | 235 ++++++++++++++---- 1 file changed, 180 insertions(+), 55 deletions(-) diff --git a/tools/property-extractor/topic_property_extractor.py b/tools/property-extractor/topic_property_extractor.py index 8d483f8..a15d3cf 100644 --- a/tools/property-extractor/topic_property_extractor.py +++ b/tools/property-extractor/topic_property_extractor.py @@ -14,6 +14,7 @@ def __init__(self, source_path: str): self.topic_properties = {} self.cluster_mappings = {} self.enum_values = {} + self.noop_properties = set() def extract_topic_properties(self) -> Dict: """Extract topic property constants from source files""" @@ -24,39 +25,58 @@ def extract_topic_properties(self) -> Dict: # Step 2: Find enum definitions for acceptable values self._discover_enum_values() - # Step 3: Discover cluster property mappings from source code + # Step 3: Discover no-op properties + self._discover_noop_properties() + + # Step 4: Discover cluster property mappings from source code self._discover_cluster_mappings() - # Step 4: Match properties with their validators and mappings + # Step 5: Match properties with their validators and mappings self._correlate_properties_with_data() return { "topic_properties": self.topic_properties, "cluster_mappings": self.cluster_mappings, - "enum_values": self.enum_values + "enum_values": self.enum_values, + "noop_properties": list(self.noop_properties) } def _discover_topic_properties(self): """Dynamically discover all topic property constants from source files""" - # Search for all header files that might contain topic property constants - topic_property_files = [ + # Priority files - parse these first with the most comprehensive patterns + priority_files = [ "src/v/kafka/server/handlers/topics/types.h", - "src/v/kafka/protocol/topic_properties.h", + "src/v/kafka/protocol/topic_properties.h", "src/v/cluster/topic_properties.h", ] - for file_pattern in topic_property_files: + for file_pattern in priority_files: file_path = 
self.source_path / file_pattern if file_path.exists(): self._parse_topic_properties_from_file(file_path) - # Also search for any other files that might contain topic_property_ constants - for header_file in self.source_path.glob("src/**/*.h"): - if any(pattern in str(header_file) for pattern in ["topic", "kafka"]): - self._scan_file_for_topic_properties(header_file) - - print(f"Discovered {len(self.topic_properties)} topic properties") + # Comprehensive search - scan all header files that might contain properties + search_patterns = [ + "src/**/*topic*.h", + "src/**/*kafka*.h", + "src/**/*handler*.h", + "src/**/*config*.h", + "src/**/*property*.h", + ] + + scanned_files = set() + for pattern in search_patterns: + for header_file in self.source_path.glob(pattern): + if header_file not in scanned_files: + scanned_files.add(header_file) + self._scan_file_for_topic_properties(header_file) + + # Also scan the specific types.h file that we know contains many properties + types_files = list(self.source_path.glob("src/**/types.h")) + for types_file in types_files: + if types_file not in scanned_files: + self._scan_file_for_topic_properties(types_file) def _parse_topic_properties_from_file(self, file_path: Path): """Parse topic property constants from a specific file""" @@ -64,46 +84,85 @@ def _parse_topic_properties_from_file(self, file_path: Path): with open(file_path, 'r', encoding='utf-8') as f: content = f.read() - # Pattern to match: inline constexpr std::string_view topic_property_xxx = "yyy"; - pattern = r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)";' - matches = re.findall(pattern, content) - - for var_name, property_name in matches: - self.topic_properties[property_name] = { - "variable_name": f"topic_property_{var_name}", - "property_name": property_name, - "source_file": str(file_path.relative_to(self.source_path)), - "description": "", - "type": self._determine_property_type(property_name), - "acceptable_values": None, - "corresponding_cluster_property": None - } + # Multiple patterns to catch all possible property definitions + patterns = [ + # Pattern 1: inline constexpr std::string_view topic_property_xxx = "yyy"; + r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 2: constexpr std::string_view topic_property_xxx = "yyy"; + r'constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 3: const std::string topic_property_xxx = "yyy"; + r'const\s+std::string\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 4: static const char* topic_property_xxx = "yyy"; + r'static\s+const\s+char\*\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + ] + + total_matches = 0 + for pattern in patterns: + matches = re.findall(pattern, content) + total_matches += len(matches) - print(f"Found {len(matches)} topic properties in {file_path}") + for var_name, property_name in matches: + # Only add if not already found (prefer inline constexpr definitions) + if property_name not in self.topic_properties: + self.topic_properties[property_name] = { + "variable_name": f"topic_property_{var_name}", + "property_name": property_name, + "source_file": str(file_path.relative_to(self.source_path)), + "description": "", + "type": self._determine_property_type(property_name), + "acceptable_values": None, + "corresponding_cluster_property": None, + "is_noop": False # Will be updated later in _correlate_properties_with_data + } + print(f"Found {total_matches} topic properties in {file_path}") except Exception as e: 
print(f"Error reading {file_path}: {e}") def _scan_file_for_topic_properties(self, file_path: Path): """Scan any file for topic_property_ constants""" try: - with open(file_path, 'r', encoding='utf-8') as f: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() - # Look for any topic_property_ declarations - pattern = r'topic_property_(\w+)\s*=\s*"([^"]+)"' - matches = re.findall(pattern, content) - - for var_name, property_name in matches: - if property_name not in self.topic_properties: - self.topic_properties[property_name] = { - "variable_name": f"topic_property_{var_name}", - "property_name": property_name, - "source_file": str(file_path.relative_to(self.source_path)), - "description": "", - "type": self._determine_property_type(property_name), - "acceptable_values": None, - "corresponding_cluster_property": None - } + # Enhanced patterns to catch all property definitions + patterns = [ + # Pattern 1: inline constexpr std::string_view topic_property_xxx = "yyy"; + r'inline\s+constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 2: constexpr std::string_view topic_property_xxx = "yyy"; + r'constexpr\s+std::string_view\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 3: topic_property_xxx = "yyy" (simple assignment) + r'topic_property_(\w+)\s*=\s*"([^"]+)"', + # Pattern 4: const std::string topic_property_xxx = "yyy"; + r'const\s+std::string\s+topic_property_(\w+)\s*=\s*"([^"]+)"\s*;', + # Pattern 5: Look for string literals that look like topic properties + r'"((?:redpanda\.|cleanup\.|compression\.|segment\.|flush\.|delete\.|replication\.|write\.|min\.|max\.|confluent\.)[^"]+)"' + ] + + for pattern in patterns: + matches = re.findall(pattern, content) + + for match in matches: + if len(match) == 2: + # Regular patterns with var_name and property_name + var_name, property_name = match + else: + # String literal pattern - generate var_name from property_name + property_name = match + var_name = re.sub(r'[^a-zA-Z0-9_]', '_', property_name) + var_name = re.sub(r'_+', '_', var_name).strip('_') + + # Validate this looks like a real topic property + if self._is_valid_topic_property(property_name) and property_name not in self.topic_properties: + self.topic_properties[property_name] = { + "variable_name": f"topic_property_{var_name}", + "property_name": property_name, + "source_file": str(file_path.relative_to(self.source_path)), + "description": "", + "type": self._determine_property_type(property_name), + "acceptable_values": None, + "corresponding_cluster_property": None, + "is_noop": False # Will be updated later in _correlate_properties_with_data + } except Exception as e: print(f"Debug: Skipping {file_path}: {e}", file=sys.stderr) @@ -125,8 +184,38 @@ def _discover_enum_values(self): # Also search other model files for enums for header_file in self.source_path.glob("src/v/model/**/*.h"): self._scan_file_for_enums(header_file) + + def _discover_noop_properties(self): + """Discover no-op properties from the allowlist_topic_noop_confs array""" + + # Look for the allowlist in types.h file + types_file = self.source_path / "src/v/kafka/server/handlers/topics/types.h" + if not types_file.exists(): + print("Warning: types.h file not found for no-op property detection") + return - print(f"Discovered {len(self.enum_values)} enum types") + try: + with open(types_file, 'r', encoding='utf-8') as f: + content = f.read() + + # Pattern to match the allowlist_topic_noop_confs array + # Looks for the array declaration and captures 
all string literals within it + pattern = r'allowlist_topic_noop_confs\s*=\s*\{([^}]+)\}' + match = re.search(pattern, content, re.DOTALL) + + if match: + array_content = match.group(1) + # Extract all quoted strings from the array + string_pattern = r'"([^"]+)"' + noop_properties = re.findall(string_pattern, array_content) + + self.noop_properties = set(noop_properties) + print(f"Found {len(self.noop_properties)} no-op properties") + else: + print("Warning: allowlist_topic_noop_confs array not found in types.h") + + except Exception as e: + print(f"Error reading no-op properties from {types_file}: {e}") def _parse_enums_from_file(self, file_path: Path): """Parse enum definitions from a file""" @@ -182,6 +271,37 @@ def _scan_file_for_enums(self, file_path: Path): except Exception as e: print(f"Debug: Error scanning enums in {file_path}: {e}", file=sys.stderr) + def _is_valid_topic_property(self, prop_name: str) -> bool: + """Validate that a string looks like a real topic property""" + + # Must be non-empty and reasonable length + if not prop_name or len(prop_name) < 3 or len(prop_name) > 100: + return False + + # Must contain only valid characters for topic properties + if not re.match(r'^[a-zA-Z][a-zA-Z0-9._-]*$', prop_name): + return False + + # Known topic property prefixes/patterns + valid_patterns = [ + r'^redpanda\.', + r'^cleanup\.policy$', + r'^compression\.type$', + r'^segment\.', + r'^flush\.', + r'^delete\.', + r'^replication\.factor$', + r'^write\.caching$', + r'^min\.', + r'^max\.', + r'^confluent\.', + r'.*\.ms$', + r'.*\.bytes$', + r'.*\.ratio$', + ] + + return any(re.match(pattern, prop_name, re.IGNORECASE) for pattern in valid_patterns) + def _determine_property_type(self, property_name: str) -> str: """Determine the type of a property based on its name and usage patterns""" @@ -246,8 +366,6 @@ def _discover_cluster_mappings(self): # Process mapping candidates to find correlations self._process_mapping_candidates(mapping_candidates) - print(f"Discovered {len(self.cluster_mappings)} cluster property mappings") - def _find_mappings_in_file(self, file_path: Path) -> Dict[str, str]: """Find potential topic-to-cluster property mappings in a file""" try: @@ -356,6 +474,9 @@ def _correlate_properties_with_data(self): if prop_name in self.cluster_mappings: prop_data["corresponding_cluster_property"] = self.cluster_mappings[prop_name] + # Mark as no-op if found in the allowlist + prop_data["is_noop"] = prop_name in self.noop_properties + # Update acceptable values based on property type prop_data["acceptable_values"] = self._determine_acceptable_values(prop_name, prop_data) @@ -427,10 +548,11 @@ def generate_topic_properties_adoc(self, output_path: str): """ - # Add table rows ONLY for properties with cluster mappings + # Add table rows ONLY for properties with cluster mappings and exclude no-ops for prop_name, prop_data in sorted(self.topic_properties.items()): cluster_prop = prop_data.get("corresponding_cluster_property") - if cluster_prop: # Only include if there's a cluster mapping + is_noop = prop_data.get("is_noop", False) + if cluster_prop and not is_noop: # Only include if there's a cluster mapping and not a no-op anchor = prop_name.replace(".", "").replace("-", "").lower() adoc_content += f"| <<{anchor},`{prop_name}`>>\n" adoc_content += f"| xref:./cluster-properties.adoc#{cluster_prop}[`{cluster_prop}`]\n\n" @@ -441,12 +563,13 @@ def generate_topic_properties_adoc(self, output_path: str): """ - # Add individual property documentation - ONLY include properties with cluster 
mappings + # Add individual property documentation - ONLY include properties with cluster mappings and exclude no-ops for prop_name, prop_data in sorted(self.topic_properties.items()): cluster_prop = prop_data.get("corresponding_cluster_property") + is_noop = prop_data.get("is_noop", False) - # Skip properties without cluster mappings (as requested by user) - if not cluster_prop: + # Skip properties without cluster mappings or no-op properties + if not cluster_prop or is_noop: continue anchor = prop_name.replace(".", "").replace("-", "").lower() @@ -488,9 +611,11 @@ def main(): extractor = TopicPropertyExtractor(args.source_path) result = extractor.extract_topic_properties() - print(f"Total topic properties found: {len(result['topic_properties'])}") - print(f"Topic properties with cluster mappings: {len(result['cluster_mappings'])}") - print(f"Enum types discovered: {len(result['enum_values'])}") + # Calculate properties that will be included in documentation (non-no-op with cluster mappings) + documented_props = [prop for prop, data in result['topic_properties'].items() + if data.get('corresponding_cluster_property') and not data.get('is_noop', False)] + + print(f"Found {len(result['topic_properties'])} total properties ({len(documented_props)} documented, {len(result['noop_properties'])} no-op)") if args.output_json: with open(args.output_json, 'w', encoding='utf-8') as f: From 0e8b8e896e33d2b7371cf8442af426bacd186e39 Mon Sep 17 00:00:00 2001 From: Paulo Borges Date: Fri, 12 Sep 2025 10:20:39 -0300 Subject: [PATCH 11/11] add tests --- .../tools/topic_property_extractor.test.js | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/__tests__/tools/topic_property_extractor.test.js b/__tests__/tools/topic_property_extractor.test.js index 45960bf..f47bf96 100644 --- a/__tests__/tools/topic_property_extractor.test.js +++ b/__tests__/tools/topic_property_extractor.test.js @@ -12,12 +12,22 @@ describe('topic_property_extractor.py', () => { // Create a minimal mock Redpanda source tree if (!fs.existsSync(mockSourcePath)) { fs.mkdirSync(mockSourcePath, { recursive: true }); - // Create a mock header file with a topic property + // Create a mock header file with topic properties and no-op allowlist const headerDir = path.join(mockSourcePath, 'src/v/kafka/server/handlers/topics'); fs.mkdirSync(headerDir, { recursive: true }); fs.writeFileSync( path.join(headerDir, 'types.h'), - 'inline constexpr std::string_view topic_property_retention_ms = "retention.ms";\n' + `inline constexpr std::string_view topic_property_retention_ms = "retention.ms"; +inline constexpr std::string_view topic_property_segment_bytes = "segment.bytes"; +inline constexpr std::string_view topic_property_flush_messages = "flush.messages"; + +// Mock allowlist for no-op properties +inline constexpr std::array allowlist_topic_noop_confs = { + "flush.messages", + "segment.index.bytes", + "preallocate", +}; +` ); // Add a mock .cc file (should be ignored for property extraction) fs.writeFileSync( @@ -29,7 +39,7 @@ describe('topic_property_extractor.py', () => { fs.mkdirSync(configDir, { recursive: true }); fs.writeFileSync( path.join(configDir, 'mock_config.cc'), - 'config.get("log_retention_ms");\n' + 'config.get("log_retention_ms");\nconfig.get("log_segment_size");\n' ); } }); @@ -49,10 +59,39 @@ describe('topic_property_extractor.py', () => { expect(result.topic_properties['retention.ms'].property_name).toBe('retention.ms'); }); - it('generates AsciiDoc output', () => { + it('detects no-op properties 
correctly', () => { + execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} --output-json ${outputJson}`); + const result = JSON.parse(fs.readFileSync(outputJson, 'utf8')); + + // Check that noop_properties array is present + expect(result.noop_properties).toBeDefined(); + expect(Array.isArray(result.noop_properties)).toBe(true); + expect(result.noop_properties).toContain('flush.messages'); + expect(result.noop_properties).toContain('segment.index.bytes'); + expect(result.noop_properties).toContain('preallocate'); + + // Check that flush.messages is marked as no-op + if (result.topic_properties['flush.messages']) { + expect(result.topic_properties['flush.messages'].is_noop).toBe(true); + } + + // Check that regular properties are not marked as no-op + expect(result.topic_properties['retention.ms'].is_noop).toBe(false); + expect(result.topic_properties['segment.bytes'].is_noop).toBe(false); + }); + + it('excludes no-op properties from AsciiDoc generation', () => { execSync(`python3 ${scriptPath} --source-path ${mockSourcePath} --output-adoc ${outputAdoc}`); const adoc = fs.readFileSync(outputAdoc, 'utf8'); + + // Should contain regular properties expect(adoc).toContain('= Topic Configuration Properties'); expect(adoc).toContain('retention.ms'); + expect(adoc).toContain('segment.bytes'); + + // Should NOT contain no-op properties in documentation + expect(adoc).not.toContain('flush.messages'); + expect(adoc).not.toContain('segment.index.bytes'); + expect(adoc).not.toContain('preallocate'); }); });
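
A quick usage sketch for the automation added in this series. The make target, the TAG variable, and the script flags are the ones defined in the patches above; the tag, checkout path, and output file names are illustrative:

```bash
# One-shot build: the target's prerequisites create the Python venv, fetch the
# Redpanda source at TAG, and build the tree-sitter parser before extraction runs.
make -C tools/property-extractor topic-properties TAG=v25.1.1

# Direct invocation against an existing Redpanda checkout. The extractor uses
# only the Python standard library, so no venv is needed for this path.
python3 tools/property-extractor/topic_property_extractor.py \
  --source-path ./redpanda \
  --output-json topic-properties-output.json \
  --output-adoc topic-properties.adoc
```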