Skip to content

Commit c3973c6

Browse files
author
Peter Simpson
committed
initial commit for new code snippets project
1 parent 7d71b63 commit c3973c6

File tree

8 files changed

+658
-0
lines changed

8 files changed

+658
-0
lines changed

.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,21 @@
1010
src/components/page/reference/ReleaseNotes/release-notes.json
1111
/.vs
1212

13+
# Code utilities - generated files
14+
code_utils/files_with_code.txt
15+
code_utils/temp/
16+
17+
# Python
18+
__pycache__/
19+
*.py[cod]
20+
*$py.class
21+
*.egg-info/
22+
.venv/
23+
venv/
24+
env/
25+
dist/
26+
build/
27+
1328

1429
# Misc
1530
.DS_Store

code_utils/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.11

code_utils/README.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Codat Documentation Code Utils
2+
3+
Utilities for extracting and managing code snippets from Codat documentation.
4+
Currently consists of a single script `extract_code_from_files.py` which will find every markdown file
5+
in the docs directory containing a code snippet. It will then extract those snippets into files under a `temp/` directory.
6+
7+
8+
## Usage
9+
10+
```
11+
12+
# Run the code extractor
13+
14+
uv run extract_code_from_files.py
15+
```
16+
## Development
17+
18+
This project uses [uv](https://astral.sh/uv) for dependency management.
19+
20+
```bash
21+
# Install dependencies
22+
uv sync
23+
24+
# Install development dependencies
25+
uv sync --extra dev
26+
```
27+
28+
## Structure
29+
30+
- `code_finder.py` - Main CodeFinder class
31+
- `extract_code_from_files.py` - Entrypoint script.
32+
- `temp/` - Generated code snippets (gitignored)
33+
- `files_with_code.txt` - List of files containing code (gitignored)

code_utils/__init__.py

Whitespace-only changes.

code_utils/code_finder.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
import os
2+
import sys
3+
import re
4+
import shutil
5+
from pathlib import Path
6+
7+
8+
class CodeFinder:
    """Find fenced code snippets in the docs markdown files and extract them.

    Typical use is ``find_files_with_code()`` followed by ``extract_code()``.
    All paths are derived from the location of this file: markdown is read
    from the ``docs`` directory next to this file's parent directory, and
    results are written alongside this file.
    """

    all_languages = {'python', 'javascript', 'csharp', 'go'}
    target_languages = {'python', 'javascript', 'csharp'}
    deprecated_languages = {'go'}

    def __init__(self, output_file_name: str = "files_with_code.txt"):
        """
        Set up the paths used by the finder.

        Args:
            output_file_name (str): Name of the file (created next to this
                script) that receives the list of matching markdown files.
        """
        self.matching_files = []

        self.script_dir = Path(__file__).resolve().parent
        self.temp_dir = self.script_dir / 'temp'
        self.project_root = self.script_dir.parent
        self.docs_path = self.project_root / 'docs'
        self.output_file = self.script_dir / output_file_name

    @staticmethod
    def _language_alternation(languages):
        """Return a regex alternation of the given language names."""
        # Sorted so the generated pattern is deterministic; escaped so a
        # language name containing regex metacharacters is matched literally.
        return '|'.join(re.escape(lang) for lang in sorted(languages))

    def has_code_snippets(self, file_path: str, target_languages: set):
        """
        Check if a markdown file contains code snippets with specified languages.

        Args:
            file_path (str): Path to the markdown file
            target_languages (set): Set of programming languages to look for

        Returns:
            bool: True if file contains code snippets with target languages.
                False when the file cannot be read or no languages are given.
        """
        if not target_languages:
            # An empty alternation would match every fence, so bail out early.
            return False

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except OSError as e:
            print(f"Error reading file {file_path}: {e}")
            return False

        # Matches an opening fence such as ```python or ```csharp, using the
        # languages the caller asked for.
        pattern = r'```(?:' + self._language_alternation(target_languages) + r')\b'
        return re.search(pattern, content, re.IGNORECASE) is not None

    def find_files_with_code(self):
        """
        Find markdown files in docs directory that contain code snippets with
        the target languages.

        Populates ``self.matching_files`` with sorted, docs-relative POSIX
        paths and writes them, one per line, to ``self.output_file``.
        Exits the process with status 1 if the docs directory is missing or
        the output file cannot be written.
        """
        # Check if docs directory exists
        if not self.docs_path.exists():
            print(f"Error: docs directory not found at {self.docs_path}")
            sys.exit(1)

        print(f"Searching for markdown files with code snippets in: {self.docs_path}")
        print(f"Looking for languages: {', '.join(sorted(CodeFinder.target_languages))}")

        # Find all markdown files recursively using pathlib
        markdown_files = list(self.docs_path.rglob('*.md')) + list(self.docs_path.rglob('*.mdx'))

        # Start from a clean list so a repeated call does not duplicate entries.
        self.matching_files = []

        for file_path in markdown_files:
            # Relative path from docs directory for output (forward slashes)
            rel_path = file_path.relative_to(self.docs_path).as_posix()

            if self.has_code_snippets(file_path, CodeFinder.target_languages):
                self.matching_files.append(rel_path)
                print(f"Found: {rel_path}")

        # Sort the paths for better readability
        self.matching_files.sort()

        # Write to output file
        try:
            with open(self.output_file, 'w', encoding='utf-8') as f:
                f.writelines(path + '\n' for path in self.matching_files)
        except OSError as e:
            print(f"Error writing to output file: {e}")
            sys.exit(1)

        print("\nSummary:")
        print(f"- Total markdown files processed: {len(markdown_files)}")
        print(f"- Files with target code snippets: {len(self.matching_files)}")
        print(f"- Results saved to: {self.output_file}")

    def extract_code(self):
        """
        Extract code snippets from matching files and save them to temp directory.

        Creates one subdirectory per target language under ``self.temp_dir``.
        Each snippet is written to
        ``<language>/<docs-relative path>_snippet_<n><ext>``, where the
        relative path has its suffix removed and unsafe characters replaced
        with underscores, and ``n`` is the 1-based position of the fence among
        the target-language fences in that file. Files that cannot be read or
        written are reported and skipped.
        """
        if not self.matching_files:
            print("No matching files found. Run find_files_with_code() first.")
            return

        # Language to file extension mapping
        # NOTE(review): javascript snippets are saved with a .ts extension;
        # confirm this is intentional.
        lang_extensions = {
            'python': '.py',
            'javascript': '.ts',
            'csharp': '.cs',
        }

        # Create subdirectories for each target language
        for lang in CodeFinder.target_languages:
            (self.temp_dir / lang).mkdir(parents=True, exist_ok=True)

        print(f"Created temp directory structure at: {self.temp_dir}")

        # Counters for summary
        total_snippets = 0
        snippets_by_lang = {lang: 0 for lang in CodeFinder.target_languages}

        # Opening fence with a target language, then the rest of the info
        # string (e.g. title="...") up to the end of that line, then the body
        # up to the next closing fence. Skipping the info string keeps fence
        # attributes out of the extracted code.
        fence_re = re.compile(
            r'```(' + self._language_alternation(CodeFinder.target_languages)
            + r')\b[^\n`]*\n(.*?)```',
            re.DOTALL | re.IGNORECASE,
        )

        # Process each matching file
        for file_path in self.matching_files:
            full_file_path = self.docs_path / file_path

            # Build the name from the whole relative path (minus suffix) so
            # same-named files in different directories do not overwrite each
            # other's snippets.
            source_name = Path(file_path).with_suffix('').as_posix()
            source_name = re.sub(r'[^\w\-]', '_', source_name)

            try:
                with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                for i, (language, code_content) in enumerate(fence_re.findall(content)):
                    language = language.lower()
                    extension = lang_extensions.get(language)
                    if extension is None:
                        print(f"Error processing file {file_path}: no file extension configured for {language}")
                        continue

                    code_content = code_content.strip()
                    if not code_content:
                        # Only save non-empty snippets
                        continue

                    filename = f"{source_name}_snippet_{i+1}{extension}"
                    snippet_path = self.temp_dir / language / filename

                    with open(snippet_path, 'w', encoding='utf-8') as snippet_file:
                        snippet_file.write(code_content)

                    total_snippets += 1
                    snippets_by_lang[language] += 1

                    print(f"Extracted {language} snippet: {Path(language) / filename}")

            except OSError as e:
                print(f"Error processing file {file_path}: {e}")
                continue

        # Print summary
        print("\nExtraction Summary:")
        print(f"- Total code snippets extracted: {total_snippets}")
        for lang, count in snippets_by_lang.items():
            if count > 0:
                print(f"- {lang.capitalize()} snippets: {count}")
        print(f"- Snippets saved to: {self.temp_dir}")
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/usr/bin/env python3
"""
Entrypoint for the docs code-snippet extractor.

Scans the 'docs' directory for markdown files that contain fenced code
snippets, records the matching file paths in files_with_code.txt, and then
extracts the snippets into per-language folders under temp/.
"""
from code_finder import CodeFinder


def main():
    """Run the file discovery pass, then the snippet extraction pass."""
    code_finder = CodeFinder()
    code_finder.find_files_with_code()
    code_finder.extract_code()


if __name__ == "__main__":
    main()

code_utils/pyproject.toml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[project]
2+
name = "code-utils"
3+
version = "0.1.0"
4+
description = "Utilities for extracting and managing code snippets from Codat documentation"
5+
authors = [
6+
{name = "Codat Documentation Team"}
7+
]
8+
readme = "README.md"
9+
requires-python = ">=3.8"
10+
dependencies = []
11+
12+
[project.optional-dependencies]
13+
dev = [
14+
"pytest",
15+
"black",
16+
"ruff",
17+
]
18+
19+
[build-system]
20+
requires = ["hatchling"]
21+
build-backend = "hatchling.build"
22+
23+
[tool.hatch.build.targets.wheel]
24+
packages = ["."]
25+
26+
[tool.black]
27+
line-length = 88
28+
target-version = ['py38']
29+
30+
[tool.ruff]
31+
target-version = "py38"
32+
line-length = 88
33+
select = ["E", "F", "W", "I"]
34+
ignore = []
35+
36+
[tool.ruff.isort]
37+
known-first-party = ["code_finder"]

0 commit comments

Comments
 (0)