1+ import os
2+ import sys
3+ import re
4+ import shutil
5+ from pathlib import Path
6+
7+
class CodeFinder:
    """Find markdown docs containing fenced code snippets and extract them.

    Paths are derived from this script's location: the ``docs`` directory is
    expected next to the script's parent directory, snippets are written to a
    ``temp`` directory beside the script, and the list of matching files is
    written beside the script as well.
    """

    # Union of target_languages and deprecated_languages.
    all_languages = {'python', 'javascript', 'csharp', 'go'}
    # Languages whose fenced snippets are searched for and extracted.
    target_languages = {'python', 'javascript', 'csharp'}
    # Not read by any method of this class.
    deprecated_languages = {'go'}

    def __init__(self, output_file_name: str = "files_with_code.txt"):
        """Set up the paths used by the finder.

        Args:
            output_file_name (str): Name of the file (created beside this
                script) that receives the list of matching markdown files.
        """
        # Relative (posix-style) paths of docs files that contain target snippets.
        self.matching_files = []

        self.script_dir = Path(__file__).resolve().parent
        self.temp_dir = self.script_dir / 'temp'
        self.project_root = self.script_dir.parent
        self.docs_path = self.project_root / 'docs'
        self.output_file = self.script_dir / output_file_name

    @staticmethod
    def _language_alternation(languages):
        """Return a regex alternation (``a|b|c``) of the escaped language names."""
        # Sorted so the generated pattern does not depend on set iteration order.
        return '|'.join(re.escape(lang) for lang in sorted(languages))

    def has_code_snippets(self, file_path: str, target_languages: set):
        """
        Check if a markdown file contains code snippets with specified languages.

        A snippet is recognised by an opening fence of three backticks followed
        immediately by one of the language names (case-insensitive).

        Args:
            file_path (str): Path to the markdown file (a ``Path`` works too)
            target_languages (set): Set of programming languages to look for

        Returns:
            bool: True if file contains code snippets with target languages;
                False if it does not, if ``target_languages`` is empty, or if
                the file cannot be read (the error is printed).
        """
        if not target_languages:
            # An empty alternation would match any fence; nothing was asked for.
            return False

        # Pattern matches e.g. ```python, ```javascript, ```csharp
        pattern = r'```(?:' + self._language_alternation(target_languages) + r')\b'

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        except Exception as e:
            # Best-effort scan: an unreadable file is reported and treated as no match.
            print(f"Error reading file {file_path}: {e}")
            return False

        return re.search(pattern, content, re.IGNORECASE) is not None

    def find_files_with_code(self):
        """Find markdown files in docs directory that contain code snippets with specified languages.

        Rebuilds ``self.matching_files`` (sorted, relative to the docs
        directory, forward slashes) and writes one path per line to
        ``self.output_file``. Exits the process with status 1 if the docs
        directory is missing or the output file cannot be written.
        """
        # Check if docs directory exists
        if not self.docs_path.exists():
            print(f"Error: docs directory not found at {self.docs_path}")
            sys.exit(1)

        print(f"Searching for markdown files with code snippets in: {self.docs_path}")
        print(f"Looking for languages: {', '.join(sorted(CodeFinder.target_languages))}")

        # Find all markdown files recursively; skip directories that merely
        # happen to be named like markdown files.
        markdown_files = [
            path
            for glob_pattern in ('*.md', '*.mdx')
            for path in self.docs_path.rglob(glob_pattern)
            if path.is_file()
        ]

        # Start from a fresh list so calling this method twice does not
        # duplicate entries.
        matches = []
        for file_path in markdown_files:
            if self.has_code_snippets(file_path, CodeFinder.target_languages):
                # Relative path from docs directory for output (forward slashes)
                rel_path = file_path.relative_to(self.docs_path).as_posix()
                matches.append(rel_path)
                print(f"Found: {rel_path}")

        # Sort the paths for better readability
        matches.sort()
        self.matching_files = matches

        # Write to output file
        try:
            with open(self.output_file, 'w', encoding='utf-8') as f:
                f.writelines(path + '\n' for path in self.matching_files)
        except Exception as e:
            print(f"Error writing to output file: {e}")
            sys.exit(1)

        print("\nSummary:")
        print(f"- Total markdown files processed: {len(markdown_files)}")
        print(f"- Files with target code snippets: {len(self.matching_files)}")
        print(f"- Results saved to: {self.output_file}")

    def extract_code(self):
        """
        Extract code snippets from matching files and save them to temp directory.
        Creates subdirectories for each programming language.

        Each non-empty snippet is written to
        ``<temp_dir>/<language>/<source>_snippet_<n><ext>`` where ``<source>``
        is the file's path relative to the docs directory (without suffix,
        non-word characters replaced by ``_``) and ``<n>`` is the 1-based
        position of the fence among the target-language fences of that file.
        Files that fail to process are reported and skipped.
        """
        if not self.matching_files:
            print("No matching files found. Run find_files_with_code() first.")
            return

        # Language to file extension mapping
        # NOTE(review): javascript snippets are saved as '.ts'; kept as-is,
        # confirm this is intentional.
        lang_extensions = {
            'python': '.py',
            'javascript': '.ts',
            'csharp': '.cs',
        }

        # Create subdirectories for each target language
        for lang in CodeFinder.target_languages:
            (self.temp_dir / lang).mkdir(parents=True, exist_ok=True)

        print(f"Created temp directory structure at: {self.temp_dir}")

        # Counters for summary
        total_snippets = 0
        snippets_by_lang = {lang: 0 for lang in CodeFinder.target_languages}

        # Pattern: ```language followed by code until closing ```
        # Compiled once because it is the same for every file.
        snippet_re = re.compile(
            r'```('
            + self._language_alternation(CodeFinder.target_languages)
            + r')\b\n?(.*?)\n?```',
            re.DOTALL | re.IGNORECASE,
        )

        # Process each matching file
        for file_path in self.matching_files:
            full_file_path = self.docs_path / file_path

            # Use the whole relative path (not just the stem) so that files
            # with the same name in different directories do not overwrite
            # each other's snippets.
            source_name = Path(file_path).with_suffix('').as_posix()
            source_name = re.sub(r'[^\w\-_]', '_', source_name)

            try:
                with open(full_file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                for i, (language, code_content) in enumerate(snippet_re.findall(content)):
                    # Fence language is matched case-insensitively; normalise it.
                    language = language.lower()
                    code_content = code_content.strip()

                    if not code_content:  # Only save non-empty snippets
                        continue

                    filename = f"{source_name}_snippet_{i + 1}{lang_extensions[language]}"
                    snippet_path = self.temp_dir / language / filename

                    # Save the code snippet
                    with open(snippet_path, 'w', encoding='utf-8') as snippet_file:
                        snippet_file.write(code_content)

                    total_snippets += 1
                    snippets_by_lang[language] += 1

                    print(f"Extracted {language} snippet: {Path(language) / filename}")

            except Exception as e:
                # Best-effort: report the failing file and carry on with the rest.
                print(f"Error processing file {file_path}: {e}")
                continue

        # Print summary
        print("\nExtraction Summary:")
        print(f"- Total code snippets extracted: {total_snippets}")
        for lang, count in snippets_by_lang.items():
            if count > 0:
                print(f"- {lang.capitalize()} snippets: {count}")
        print(f"- Snippets saved to: {self.temp_dir}")
0 commit comments