LJSthu · ben0i0d · Jun 28, 2024 · Jun 28, 2024 · Jun 28, 2024 · Sep 30, 2024
diff --git a/README.md b/README.md
@@ -1,35 +1,29 @@
 # Python-Remove-Watermark
-A simple program to remove the watermark from a PDF file. 
-
+A simple program to remove the watermark from a PDF/JPG/PNG file. 
 
 ### How?
 
-1. convert the PDF file into images using `pdf2image`
-2. convert the images to numpy array
+1. convert the PDF file into images using `pymupdf`
+2. convert the images (including .jpg/.png inputs) to numpy arrays
 3. find the specific pixel by watermarks' rgb values and change them into (255,255,255)
 4. save the modified images
 
-
-### Environment
+### How to use?
 
 First you need to install the dependencies:
 ```
-$ pip install pdf2image
-```
-```
-$ pip install scikit-image
+$ pip install pillow pymupdf scikit-image numba
 ```
 
-Inside the repository create a directory that will receive the modified images:
+For .pdf execute:
 ```
-$ mkdir jiangyi3
+$ python watermark.py --source source.pdf --target out
 ```
-To execute:
+For .jpg/.png execute:
 ```
-$ python watermark.py
+$ python watermark.py --source source.png --target out
 ```
-Don't forget to indicate the pdf's path you want to convert.
-
+Don't forget to specify the path of the source file you want to convert; the script will automatically create the output directory.
 
 ### Results
 ![image](./result.png)
diff --git a/util.py b/util.py
diff --git a/watermark.py b/watermark.py
@@ -1,68 +1,68 @@
-from skimage import io
-from pdf2image import convert_from_path
+# Import necessary libraries
+import os
+import fitz
+import numba
+import argparse
 import numpy as np
-# imgs = io.imread('./test.png')
-# io.imsave('./hh.png',imgs)
-# imgs = np.array(imgs)
-# print(imgs.shape)
-# r = []
-# g = []
-# b = []
-# alpha = []
+from PIL import Image
+from skimage import io
 
-def judge(x,y):
-    temp = -(600.0/1575.0) * x
-    if y > 1350 + temp and y < 1500 + temp:
-        return True
-    else:
-        return False
+# Command line interface: the parser is built at import time and only parsed when the file runs as a script
+parser = argparse.ArgumentParser(description='Remove Watermark')
+# Path of the input file; the __main__ block picks the processing branch from its extension
+parser.add_argument('--source', default='source.pdf', type=str, help='source file')
+# Output directory for the processed images; the __main__ block creates it when it is missing
+parser.add_argument('--target', default='out', type=str, help='target directory')
 
-# for  i in range(imgs.shape[0]):
-#     for j in range(imgs.shape[1]):
-#         if not judge(j,i):
-#             continue
-#         if imgs[i][j][1] > 100 and imgs[i][j][1] < 250 and imgs[i][j][2] > 100 and imgs[i][j][2] < 250:
-#             imgs[i][j][0] =  imgs[i][j][1] = imgs[i][j][2] = 255
-#         if imgs[i][j][1] < 10 and imgs[i][j][2] < 100:
-#             imgs[i][j][0] =  imgs[i][j][1] = imgs[i][j][2] = 0 
+# Whiten watermark-coloured pixels of an image array; compiled by numba in nopython mode for speed
+@numba.jit(nopython=True) 
+def handle(img):
+    shape = img.shape
+    # Flatten to one row per pixel so a single loop suffices; the (-1, 3) shape assumes exactly 3 channels per pixel
+    img = img.reshape(-1, 3)
+    for i in range(len(img)):
+        # A pixel whose R, G and B values all lie strictly between 175 and 250 is treated as watermark and set to white
+        if 175 < img[i][0] < 250 and 175 < img[i][1] < 250 and 175 < img[i][2] < 250:
+            img[i] = [255, 255, 255]
+    # Restore the original shape before handing the array back to the caller
+    img = img.reshape(shape)
+    return img
 
-# io.imsave('./hh.png',imgs)
-# print(r)
-# print(g)
-# print(b)
-# print(alpha)
+if __name__ == '__main__':
+    # Parse command line arguments
+    args = parser.parse_args()
 
-def select_pixel(r,g,b):
-    if (r == 208 and g == 208 and b == 208 ) or (r == 196 and g == 196 and b == 196) \
-        or (r == 206 and g == 206 and b == 206 ):
-        return True
-    else:
-        return False
-def select_pixel2(r,g,b):
-    if r > 175 and r < 250 and g > 175 and g < 250 and b > 175 and b < 250:
-        return True
-    else:
-        return False
-def handle(imgs):
-    for  i in range(imgs.shape[0]):
-        for j in range(imgs.shape[1]):
-            # if not judge(j,i):
-            #     continue
-            # if imgs[i][j][1] > 100 and imgs[i][j][1] < 250 and imgs[i][j][2] > 100 and imgs[i][j][2] < 250:
-            if select_pixel2(imgs[i][j][0],imgs[i][j][1],imgs[i][j][2]):
-                imgs[i][j][0] =  imgs[i][j][1] = imgs[i][j][2] = 255
-            # if not select_pixel(imgs[i][j][0],imgs[i][j][1],imgs[i][j][2]):
-            #     imgs[i][j][0] =  imgs[i][j][1] = imgs[i][j][2] = 0 
-    return imgs
+    # Create the target directory together with any missing parent directories (e.g. out/run1);
+    # exist_ok avoids both the exists()/mkdir() race and an error when the directory is already there
+    os.makedirs(args.target, exist_ok=True)
 
-images = convert_from_path('./jiangyi3.pdf')
-# images = np.array(images)
-index = 0
-for img in images:
-    index += 1
-    img = np.array(img)
-    print(img.shape)
-    img = handle(img)
-    io.imsave('./jiangyi3/img'+str(index)+'.jpg', img)
-    # break
-    print(index)
+    # Get the file extension, lower-cased so that e.g. 'SCAN.JPG' or 'Doc.PDF' is accepted too
+    fileext = os.path.splitext(os.path.basename(args.source))[1].lower()
+    if fileext in ('.jpg', '.jpeg', '.png'):
+        # Decode every raster format through PIL and force RGB: handle() reshapes the
+        # pixels to (-1, 3), so grayscale, palette, CMYK or RGBA input must first be
+        # converted to exactly three channels
+        with Image.open(args.source) as src:
+            img = np.array(src.convert('RGB'))
+        io.imsave(os.path.join(args.target, 'result{}'.format(fileext)), handle(img))
+    elif fileext == '.pdf':
+        # Zoom 1 renders at 72 dpi (792x612 for the reference page);
+        # 1.33333333 --> 1056x816, 2 --> 1584x1224
+        zoom_x = zoom_y = 1.33333333
+        mat = fitz.Matrix(zoom_x, zoom_y)
+        pdf = fitz.open(args.source)
+        try:
+            # Render, clean and save one page at a time instead of buffering every page in memory
+            for i in range(len(pdf)):
+                page = pdf.load_page(i)
+                pix = page.get_pixmap(matrix=mat, alpha=False)
+                # pix.samples is read-only; copy() gives handle() a writable array
+                img = np.frombuffer(buffer=pix.samples, dtype=np.uint8).reshape((pix.h, pix.w, -1)).copy()
+                # Format only the file name so that braces in the target path cannot break it
+                io.imsave(os.path.join(args.target, '{}.jpg'.format(i + 1)), handle(img))
+        finally:
+            # Release the document even if rendering or saving a page fails
+            pdf.close()
+    else:
+        # If the file format is not supported, print an error message
+        print('Unsupported file format')