Skip to content

Commit 96463a2

Browse files
committed
initial commit
0 parents  commit 96463a2

8 files changed

+69
-0
lines changed

DocProcessor.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import glob
import os
import shutil
import subprocess
import tempfile
2+
3+
class ProcessDocuments:
    """Batch-convert the ``.docx`` files in the current directory to PDF and HTML.

    Intended call order is ``readFiles()``, ``convert2PDF()``, then
    ``convert2HTML()``. All paths are relative to the current working
    directory: PDFs are collected in ``PDFs/`` and HTML pages in ``HTMLs/``.
    """

    def __init__(self):
        # Names of the .docx files discovered so far by readFiles().
        self.fileList = []

    @staticmethod
    def _run(command):
        """Run *command* (an argument list) without a shell and return its exit status.

        Passing a list instead of a shell string keeps file names containing
        spaces or shell metacharacters intact. A missing executable is
        reported and ``None`` is returned instead of raising, so the rest of
        the batch still runs (the previous ``os.system`` calls behaved the
        same way: they printed an error and carried on).
        """
        try:
            return subprocess.call(command)
        except OSError as err:
            print("Could not run %s: %s" % (command[0], err))
            return None

    @staticmethod
    def _moveInto(pattern, directory):
        """Move every path matching the glob *pattern* into *directory*.

        *directory* is created when it does not exist yet. An existing file
        of the same name inside *directory* is replaced, like ``mv`` does.
        """
        if not os.path.isdir(directory):
            os.makedirs(directory)
        for source in sorted(glob.glob(pattern)):
            # Spell out the full target path: shutil.move() refuses to
            # overwrite when handed only the destination directory.
            target = os.path.join(directory, os.path.basename(source))
            shutil.move(source, target)

    def readFiles(self):
        """Collect the ``.docx`` files of the current directory.

        Newly found names are appended to ``self.fileList`` in sorted order;
        names already present are skipped, so calling this method again does
        not create duplicates.

        Returns:
            ``self.fileList`` (the same list object on every call).
        """
        for name in sorted(glob.glob("*.docx")):
            if os.path.isfile(name) and name not in self.fileList:
                self.fileList.append(name)
        return self.fileList

    def convert2PDF(self):
        """Convert every file in ``self.fileList`` to PDF with AbiWord.

        Afterwards all ``*.pdf`` files in the current directory are moved
        into ``PDFs/`` (created on demand).
        """
        for name in self.fileList:
            print("Converting %s to pdf " % (name,))
            self._run(["abiword", "--to=pdf", name])  # Convert docx to pdf

        self._moveInto("*.pdf", "PDFs")

    def convert2HTML(self):
        """Convert the PDFs in ``PDFs/`` to HTML using the pdf2htmlEX docker image.

        The PDFs are copied into a private scratch directory that is mounted
        into the container as ``/pdf``; the resulting ``*.html`` files are
        moved into ``HTMLs/`` (created on demand) and the scratch directory
        is removed again.
        """
        pdfNames = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join("PDFs", "*.pdf"))
        )

        # A uniquely named scratch directory under the home directory, so an
        # existing ~/pdf directory of the user is never touched or deleted.
        workDir = tempfile.mkdtemp(prefix="pdf2html-",
                                   dir=os.path.expanduser("~"))
        try:
            for name in pdfNames:
                shutil.copy(os.path.join("PDFs", name), workDir)

            for name in pdfNames:
                # NOTE(review): the bare file name presumably resolves inside
                # the container because the image works in /pdf - confirm
                # against the image documentation.
                self._run(["docker", "run", "-ti", "--rm",
                           "-v", "%s:/pdf" % workDir,
                           "bwits/pdf2htmlex", "pdf2htmlEX",
                           "--zoom", "1.3", name])

            self._moveInto(os.path.join(workDir, "*.html"), "HTMLs")
        finally:
            # Best-effort cleanup: leftovers must not mask an earlier error.
            shutil.rmtree(workDir, ignore_errors=True)
53+
54+
55+
56+
57+
58+

README.md

Whitespace-only changes.

Sample/RESEARCHPAPER.docx

20.8 KB
Binary file not shown.
Binary file not shown.

Sample/marketresearchpaper.docx

164 KB
Binary file not shown.

Sample/researchpaper.docx

36.5 KB
Binary file not shown.

__init__.py

Whitespace-only changes.

run.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from DocProcessor import ProcessDocuments
2+
3+
4+
5+
6+
if __name__ == '__main__':
    # Run the complete pipeline on the current directory: discover the
    # .docx inputs, render them to PDF, then render those PDFs to HTML.
    processor = ProcessDocuments()
    processor.readFiles()
    processor.convert2PDF()
    processor.convert2HTML()

0 commit comments

Comments
 (0)