diff --git a/README.md b/README.md index 31a888e..a1b8bd5 100755 --- a/README.md +++ b/README.md @@ -1,35 +1,29 @@ # Python-Remove-Watermark -A simple program to remove the watermark from a PDF file. - +A simple program to remove the watermark from a PDF/JPG/PNG file. ### How? -1. convert the PDF file into images using `pdf2image` -2. convert the images to numpy array +1. convert the PDF file into images using `pymupdf` +2. convert the images (including .jpg/.png inputs) to numpy arrays 3. find the specific pixel by watermarks' rgb values and change them into (255,255,255) 4. save the modified images - -### Environment +### How to use? First you need to install the dependencies: ``` -$ pip install pdf2image -``` -``` -$ pip install scikit-image +$ pip install pillow pymupdf scikit-image numba ``` -Inside the repository create a directory that will receive the modified images: +For .pdf execute: ``` -$ mkdir jiangyi3 +$ python watermark.py --source source.pdf --target out ``` -To execute: +For .jpg/.png execute: ``` -$ python watermark.py +$ python watermark.py --source source.png --target out ``` -Don't forget to indicate the pdf's path you want to convert. - +Don't forget to indicate the source path you want to convert; the script will automatically create the output directory if it does not exist. 
### Results ![image](./result.png) \ No newline at end of file diff --git a/util.py b/util.py deleted file mode 100644 index d4d13b1..0000000 --- a/util.py +++ /dev/null @@ -1,3 +0,0 @@ -for i in range(1,49): - s = '![image](./jiangyi3/img'+str(i)+'.jpg)' - print(s) \ No newline at end of file diff --git a/watermark.py b/watermark.py index 8e35eb3..21583b7 100644 --- a/watermark.py +++ b/watermark.py @@ -1,68 +1,68 @@ -from skimage import io -from pdf2image import convert_from_path +# Import necessary libraries +import os +import fitz +import numba +import argparse import numpy as np -# imgs = io.imread('./test.png') -# io.imsave('./hh.png',imgs) -# imgs = np.array(imgs) -# print(imgs.shape) -# r = [] -# g = [] -# b = [] -# alpha = [] +from PIL import Image +from skimage import io -def judge(x,y): - temp = -(600.0/1575.0) * x - if y > 1350 + temp and y < 1500 + temp: - return True - else: - return False +# Create an argument parser for command line arguments +parser = argparse.ArgumentParser(description='Remove Watermark') +# source file path +parser.add_argument('--source', default='source.pdf', type=str, help='source file') +# target directory +parser.add_argument('--target', default='out', type=str, help='target directory') -# for i in range(imgs.shape[0]): -# for j in range(imgs.shape[1]): -# if not judge(j,i): -# continue -# if imgs[i][j][1] > 100 and imgs[i][j][1] < 250 and imgs[i][j][2] > 100 and imgs[i][j][2] < 250: -# imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 255 -# if imgs[i][j][1] < 10 and imgs[i][j][2] < 100: -# imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 0 +# numba's JIT decorator for speed +@numba.jit(nopython=True) +def handle(img): + shape = img.shape + # Turn the image into a one-dimensional array and avoid loops that are too deep for nesting + img = img.reshape(-1, 3) + for i in range(len(img)): + # If the RGB values of the pixel are between 175 and 250, assume it's part of the watermark and set it to white + if 175 < img[i][0] < 250 and 
175 < img[i][1] < 250 and 175 < img[i][2] < 250: + img[i] = [255, 255, 255] + # Reshape the image back + img = img.reshape(shape) + return img -# io.imsave('./hh.png',imgs) -# print(r) -# print(g) -# print(b) -# print(alpha) +if __name__ == '__main__': + # Parse command line arguments + args = parser.parse_args() -def select_pixel(r,g,b): - if (r == 208 and g == 208 and b == 208 ) or (r == 196 and g == 196 and b == 196) \ - or (r == 206 and g == 206 and b == 206 ): - return True - else: - return False -def select_pixel2(r,g,b): - if r > 175 and r < 250 and g > 175 and g < 250 and b > 175 and b < 250: - return True - else: - return False -def handle(imgs): - for i in range(imgs.shape[0]): - for j in range(imgs.shape[1]): - # if not judge(j,i): - # continue - # if imgs[i][j][1] > 100 and imgs[i][j][1] < 250 and imgs[i][j][2] > 100 and imgs[i][j][2] < 250: - if select_pixel2(imgs[i][j][0],imgs[i][j][1],imgs[i][j][2]): - imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 255 - # if not select_pixel(imgs[i][j][0],imgs[i][j][1],imgs[i][j][2]): - # imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 0 - return imgs + # Check if the target directory exists, and create it if it does not + if not os.path.exists(args.target): + os.mkdir(args.target) -images = convert_from_path('./jiangyi3.pdf') -# images = np.array(images) -index = 0 -for img in images: - index += 1 - img = np.array(img) - print(img.shape) - img = handle(img) - io.imsave('./jiangyi3/img'+str(index)+'.jpg', img) - # break - print(index) \ No newline at end of file + # Get the file extension of the source file + fileext = os.path.splitext(os.path.basename(args.source))[1] + # Handle different image formats based on the file extension + if fileext == '.jpg': + img = io.imread(args.source) + io.imsave(os.path.join(args.target, 'result{}'.format(fileext)), handle(img)) + elif fileext == '.png': + # Read the png and convert it to RGB mode + img = np.array(Image.open(args.source).convert('RGB')) + 
io.imsave(os.path.join(args.target, 'result{}'.format(fileext)), handle(img)) + elif fileext == '.pdf': + # Convert the pdf file to a list of images + imgs = [] + pdf = fitz.open(args.source) + # zoom_factor = 1.33333333 + # default size :792X612, dpi=72 + # (1.33333333-->1056x816) (2-->1584x1224) + zoom_x = zoom_y = 1.33333333 + mat = fitz.Matrix(zoom_x, zoom_y) + for i in range(len(pdf)): + page = pdf.load_page(i) + pix = page.get_pixmap(matrix=mat, alpha=False) + img = np.frombuffer(buffer=pix.samples, dtype=np.uint8).reshape((pix.h, pix.w, -1)).copy() + imgs.append(img) + # Iterate over the list of images, process each one, and save it + for i in range(len(imgs)): + io.imsave(os.path.join(args.target, '{}.jpg').format(i + 1), handle(imgs[i])) + else: + # If the file format is not supported, print an error message + print('Unsupported file format') \ No newline at end of file