-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsing_pdf.py
110 lines (97 loc) · 5.07 KB
/
parsing_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# %%
import fitz
import pytesseract
import cv2
import pandas as pd
from pathlib import Path
import imutils
import numpy as np
# The Tesseract executable must be installed first.
# Path to the Tesseract executable that pytesseract will invoke:
pytesseract.pytesseract.tesseract_cmd = r'D:\soft\tesseract\tesseract.exe'
# PDF to process; only its first page is rendered and OCR'd.
file = Path("D:/2022_12_15_pdf_parsing/2023_04_05_demande_carine_P/liste_des_modules4_concernés.pdf")
# Directory of the PDF; the functions below write their PNG and CSV outputs here.
parent_dir = file.parent
def crop_part(original_image, x, y, w, h):
    """Return the w-by-h rectangle of ``original_image`` whose corner is (x, y).

    The image is indexed rows first, so ``y``/``h`` select the rows and
    ``x``/``w`` select the columns. The result is obtained by plain
    slicing of ``original_image``.
    """
    row_span = slice(y, y + h)
    col_span = slice(x, x + w)
    return original_image[row_span, col_span]
def from_pdf_to_png(input_pdf_file, png_output_name):
    """Render the first page of a PDF to a PNG file at 300 dpi.

    Args:
        input_pdf_file: Path of the PDF, passed to ``fitz.open``.
        png_output_name: File name of the PNG; it is joined onto the
            module-level ``parent_dir`` to build the output path.

    Raises:
        IndexError: If the document has no first page.
    """
    # Use the document as a context manager so it is closed as soon as the
    # page has been rendered and saved, even if one of those steps raises.
    with fitz.open(input_pdf_file) as pdf_file:
        # Only the first page is rendered.
        first_page = pdf_file[0]
        pixmap = first_page.get_pixmap(dpi=300)
        pixmap.save(parent_dir / png_output_name)
def enhance_img(png_output_name):
    """Isolate character-sized shapes in a page image and save the result.

    Pipeline: grayscale -> inverted Otsu threshold -> distance transform
    -> Otsu threshold -> 2x2 morphological opening -> keep contours that
    are at least 5x5 px -> AND the opened image with the dilated convex
    hulls of those contours.

    Args:
        png_output_name: Path of the image to read. The cleaned image is
            written to ``parent_dir / png_output_name`` (so an absolute
            ``png_output_name`` overwrites the input file).

    Raises:
        FileNotFoundError: If OpenCV cannot load ``png_output_name``.
    """
    img = cv2.imread(png_output_name)
    # cv2.imread reports failure by returning None rather than raising;
    # fail here with a clear message instead of deep inside cvtColor.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {png_output_name}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    # apply a distance transform which calculates the distance to the
    # closest zero pixel for each pixel in the input image
    dist = cv2.distanceTransform(thresh, cv2.DIST_L2, 5)
    # normalize the distance transform such that the distances lie in
    # the range [0, 1] and then convert the distance transform back to
    # an unsigned 8-bit integer in the range [0, 255]
    dist = cv2.normalize(dist, dist, 0, 1.0, cv2.NORM_MINMAX)
    dist = (dist * 255).astype("uint8")
    # threshold the distance transform using Otsu's method
    dist = cv2.threshold(dist, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    kernel = np.ones((2, 2), np.uint8)
    opening = cv2.morphologyEx(dist, cv2.MORPH_OPEN, kernel)

    # find contours in the opening image; the ones that are large enough
    # are treated as the characters that will be OCR'd
    cnts = cv2.findContours(opening.copy(), cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)

    # A single shared mask receives every hull. Allocating one full-page
    # mask per contour and summing them costs memory proportional to the
    # number of contours, wraps around in uint8 where hulls overlap, and
    # fails outright when no contour qualifies.
    mask = np.zeros(img.shape[:2], dtype="uint8")
    found_character = False
    for c in cnts:
        # compute the bounding box of the contour
        (x, y, w, h) = cv2.boundingRect(c)
        # keep only contours at least 5px wide and 5px tall
        if w >= 5 and h >= 5:
            hull = cv2.convexHull(c)
            # Each hull is filled on its own so overlapping hulls simply
            # merge into one region.
            cv2.drawContours(mask, [hull], -1, 255, -1)
            found_character = True

    if found_character:
        # Dilating the union once is equivalent to dilating each hull
        # separately and then taking the union.
        mask = cv2.dilate(mask, None, iterations=3)

    # take the bitwise AND of the opening image and the mask to reveal
    # *just* the characters in the image
    final = cv2.bitwise_and(opening, opening, mask=mask)
    cv2.imwrite(str(parent_dir / png_output_name), final)
def get_pandas_serie_from_part(frame, x, y, w, h, output_name):
    """OCR one rectangular region of ``frame`` and return its text lines.

    The region is cut out with ``crop_part``, saved as
    ``<output_name>.png`` under the module-level ``parent_dir``, and that
    file is passed to Tesseract. The recognised text is split on newline
    characters; every piece (empty ones included) becomes one element of
    the returned Series, which is named ``output_name``.
    """
    # Page segmentation mode 12, recognition limited to digits and
    # unaccented ASCII letters.
    tesseract_config = "--psm 12 -c tessedit_char_whitelist=0123456789azertyuiopqsdfghjklmwxcvbnAZERTYUIOPQSDFGHJKLMWXCVBN"
    region_png = str(parent_dir / f"{output_name}.png")
    cv2.imwrite(region_png, crop_part(frame, x, y, w, h))
    recognised_text = pytesseract.image_to_string(region_png, config=tesseract_config)
    text_lines = recognised_text.split("\n")
    return pd.Series(text_lines, name=output_name)
def main(file):
    """Render, clean and OCR the first page of ``file``.

    The page is rendered to ``page.png`` under the module-level
    ``parent_dir``, cleaned in place by ``enhance_img``, and then OCR'd
    region by region at fixed pixel coordinates.

    Returns:
        A tuple ``(header, df_data)``: ``header`` is the Series read from
        the header region, ``df_data`` is a DataFrame with one column per
        table column region.
    """
    page_png = str(parent_dir / "page.png")
    from_pdf_to_png(file, "page.png")
    enhance_img(page_png)
    frame = cv2.imread(page_png)

    header = get_pandas_serie_from_part(
        frame, x=920, y=318, w=820, h=100, output_name="header"
    )

    # (column name, left edge x, width w); every column region starts at
    # y=680 and is 2010 px tall.
    column_regions = (
        ("NS_ensemble_Roue_TL_1", 325, 255),
        ("NS_module_M04_1", 578, 200),
        ("NS_moteur_1", 795, 195),
        ("NS_ensemble_Roue_TL_2", 1036, 235),
        ("NS_module_M04_2", 1260, 200),
        ("NS_moteur_2", 1477, 200),
        ("NS_ensemble_Roue_TL_3", 1709, 235),
        ("NS_module_M04_3", 1996, 180),
        ("NS_moteur_3", 2170, 200),
    )
    column_series = [
        get_pandas_serie_from_part(
            frame, x=left, y=680, w=width, h=2010, output_name=name
        )
        for name, left, width in column_regions
    ]

    df_data = pd.concat(column_series, ignore_index=True, axis=1)
    df_data.columns = [name for name, _, _ in column_regions]
    return (header, df_data)
# Run the extraction on the configured PDF, then export the header Series and
# the table DataFrame as semicolon-separated CSV files (without the index
# column) into parent_dir.
header, df_data = main(file)
header.to_csv(parent_dir / 'header.csv', sep=';', index=False)
df_data.to_csv(parent_dir / 'df_data.csv', sep=';', index=False)
# %%