1+ import os
2+ import sys
3+ import argparse
4+ from docx import Document
5+ from docx .shared import Inches
6+ from docx .oxml .ns import qn
7+ from docx .oxml import OxmlElement
8+ import markdown
9+ from bs4 import BeautifulSoup
10+
def add_hyperlink(paragraph, url, text, color="0000FF", underline=True):
    """Append a hyperlink to ``paragraph`` and return the new element.

    Args:
        paragraph: python-docx paragraph that receives the link.
        url: Link target. A value of the form ``#name`` is treated as a jump
            to the bookmark ``name`` inside the same document; any other
            value is registered as an external hyperlink relationship on the
            paragraph's part.
        text: Visible text of the link.
        color: Hex RGB value written to ``w:color``; pass a falsy value to
            leave the colour unset.
        underline: When true, the link text gets a single underline.

    Returns:
        The ``w:hyperlink`` element that was appended to the paragraph.
    """
    hyperlink = OxmlElement('w:hyperlink')

    if url.startswith('#') and len(url) > 1:
        # In-document targets are expressed with w:anchor (the bookmark
        # name). Registering "#name" as an external relationship does not
        # produce a bookmark jump.
        hyperlink.set(qn('w:anchor'), url[1:])
    else:
        r_id = paragraph.part.relate_to(
            url,
            'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
            is_external=True,
        )
        hyperlink.set(qn('r:id'), r_id)

    new_run = OxmlElement('w:r')
    rPr = OxmlElement('w:rPr')

    # Child order matters inside w:rPr: w:color precedes w:u in the schema.
    if color:
        c = OxmlElement('w:color')
        c.set(qn('w:val'), color)
        rPr.append(c)

    if underline:
        u = OxmlElement('w:u')
        u.set(qn('w:val'), 'single')
        rPr.append(u)

    new_run.append(rPr)
    new_run.text = text
    hyperlink.append(new_run)

    paragraph._p.append(hyperlink)
    return hyperlink
37+
def add_table_of_contents(soup, doc):
    """Add one hyperlink paragraph per in-page link of the first ``<ul>``.

    The first ``<ul>`` found in ``soup`` is taken to be the table of
    contents. For every ``<li>`` in it whose first ``<a>`` has an ``href``
    starting with ``#``, a new paragraph is added to ``doc`` containing a
    link whose text is the anchor's text and whose target is ``#`` followed
    by that same text.

    Args:
        soup: Parsed HTML (BeautifulSoup) of the whole document.
        doc: python-docx document that receives the paragraphs.
    """
    toc = soup.find('ul')
    if toc is None:
        return

    for li in toc.find_all('li'):
        link = li.find('a')
        if link is None:
            continue
        # .get() instead of ['href']: anchors such as <a name="..."> carry no
        # href and must be skipped rather than raise KeyError.
        if not link.get('href', '').startswith('#'):
            continue

        heading_text = link.text
        toc_paragraph = doc.add_paragraph()
        # The target must equal the heading text exactly (no trailing
        # whitespace) to be able to match a bookmark named after the heading.
        add_hyperlink(toc_paragraph, f'#{heading_text}', heading_text)
47+
def add_markdown_to_docx(md_content, doc, base_path):
    """Render Markdown text into ``doc`` and bookmark every heading.

    The Markdown is converted to HTML and the top-level elements are mapped
    as follows: ``h1``-``h3`` become headings of the same level, ``p``
    becomes a paragraph (followed by any images it contains), ``ul``/``ol``
    become bulleted/numbered list paragraphs, and a top-level ``a`` becomes a
    paragraph holding a hyperlink. An ``h2`` whose text is
    "table of contents" (case-insensitive) triggers
    ``add_table_of_contents`` once; the list it consumes is not rendered
    again as bullets. Other element types are ignored.

    Args:
        md_content: Markdown source text.
        doc: python-docx document to append to.
        base_path: Directory against which image paths are resolved.
    """
    html = markdown.markdown(md_content)
    soup = BeautifulSoup(html, 'html.parser')

    heading_levels = {'h1': 1, 'h2': 2, 'h3': 3}
    heading_map = {}
    toc_inserted = False
    toc_list = None  # the <ul> rendered as the TOC, if any

    for element in soup:
        tag = element.name

        if tag in heading_levels:
            paragraph = doc.add_heading(element.text, level=heading_levels[tag])
            heading_map[element.text] = paragraph
            if (
                tag == 'h2'
                and not toc_inserted
                and element.text.lower() == 'table of contents'
            ):
                add_table_of_contents(soup, doc)
                # add_table_of_contents reads the first <ul>; remember it so
                # only that list is skipped below, not every later list.
                toc_list = soup.find('ul')
                toc_inserted = True

        elif tag == 'p':
            paragraph = doc.add_paragraph(element.text)
            for img in element.find_all('img'):
                img_src = img.get('src', '')
                if not img_src:
                    continue
                # Drop only a leading "./" and leading slashes. str.lstrip('./')
                # strips a character set and would also eat "../" and the
                # dot of names like ".github/logo.png".
                while img_src.startswith('./'):
                    img_src = img_src[2:]
                img_src = img_src.lstrip('/')
                img_path = os.path.join(base_path, img_src)
                if os.path.isfile(img_path):
                    doc.add_picture(img_path, width=Inches(5.0))
                else:
                    paragraph.add_run(f"[Image not found: {img_path} ]")

        elif tag == 'ul':
            if element is toc_list:
                continue
            for li in element.find_all('li'):
                # Look styles up by name; 'ListBullet' is the style id, whose
                # lookup python-docx only supports as a deprecated fallback.
                doc.add_paragraph(li.text, style='List Bullet')

        elif tag == 'ol':
            for li in element.find_all('li'):
                doc.add_paragraph(li.text, style='List Number')

        elif tag == 'a':
            href = element.get('href')
            if href:
                paragraph = doc.add_paragraph()
                add_hyperlink(paragraph, href, element.text)

    # Wrap each heading paragraph's content in a bookmark named after the
    # heading text so in-document links can target it.
    # NOTE(review): ids are numbered from 0 and assume ``doc`` holds no
    # pre-existing bookmarks - confirm for non-default templates.
    for bookmark_id, (heading_text, paragraph) in enumerate(heading_map.items()):
        # hash() is signed, 64-bit and randomized per process; a small
        # sequential integer is stable and unique within this document.
        id_value = str(bookmark_id)
        p = paragraph._p

        bookmark = OxmlElement('w:bookmarkStart')
        bookmark.set(qn('w:id'), id_value)
        bookmark.set(qn('w:name'), heading_text)
        # w:pPr has to stay the first child of w:p, so the bookmark start
        # goes right after it when it exists.
        p_pr = p.find(qn('w:pPr'))
        if p_pr is not None:
            p_pr.addnext(bookmark)
        else:
            p.insert(0, bookmark)

        bookmark_end = OxmlElement('w:bookmarkEnd')
        bookmark_end.set(qn('w:id'), id_value)
        p.append(bookmark_end)
95+
def convert_readme_to_docx(readme_dir, output_path):
    """Convert ``README.md`` inside ``readme_dir`` to a DOCX file.

    If ``README.md`` does not exist in ``readme_dir`` a message is printed
    and nothing is written.

    Args:
        readme_dir: Directory expected to contain ``README.md``; also used as
            the base directory when the Markdown is rendered.
        output_path: Path the resulting document is saved to.

    Returns:
        None.
    """
    readme_path = os.path.join(readme_dir, 'README.md')
    if not os.path.exists(readme_path):
        print(f"README.md not found in {readme_dir} ")
        return

    # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on
    # Windows) mis-decodes or fails on non-ASCII README content.
    with open(readme_path, 'r', encoding='utf-8') as file:
        md_content = file.read()

    doc = Document()
    add_markdown_to_docx(md_content, doc, readme_dir)
    doc.save(output_path)
108+
if __name__ == "__main__":
    # Command-line entry point: takes the directory holding README.md and
    # writes README.docx next to it.
    parser = argparse.ArgumentParser(description='Convert a README.md file to a DOCX file.')
    parser.add_argument('readme_dir', type=str, help='Directory containing the README.md file')
    args = parser.parse_args()

    readme_dir = args.readme_dir
    output_path = os.path.join(readme_dir, 'README.docx')

    # Record whether the input exists before converting, so a missing README
    # is not followed by a misleading success message and a zero exit code.
    readme_found = os.path.exists(os.path.join(readme_dir, 'README.md'))
    convert_readme_to_docx(readme_dir, output_path)
    if not readme_found:
        sys.exit(1)
    print(f"Converted README.md in {readme_dir} to {output_path} ")
0 commit comments