Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,35 +1,29 @@
# Python-Remove-Watermark
A simple program to remove the watermark from a PDF file.

A simple program to remove the watermark from a PDF/JPG/PNG file.

### How?

1. convert the PDF file into images using `pdf2image`
2. convert the images to NumPy arrays
1. convert the PDF file into images using `pymupdf`
2. convert the images (including .jpg/.png inputs) to NumPy arrays
3. find the pixels that match the watermark's RGB values and set them to (255, 255, 255)
4. save the modified images


### Environment
### How to use?

First you need to install the dependencies:
```
$ pip install pdf2image
```
```
$ pip install scikit-image
$ pip install pillow pymupdf scikit-image numba
```

Inside the repository create a directory that will receive the modified images:
For .pdf execute:
```
$ mkdir jiangyi3
$ python watermark.py --source source.pdf --target out
```
To execute:
For .jpg/.png execute:
```
$ python watermark.py
$ python watermark.py --source source.png --target out
```
Don't forget to specify the path of the PDF you want to convert.

Don't forget to specify the path of the source file you want to convert; the script will automatically create the output directory.

### Results
![image](./result.png)
3 changes: 0 additions & 3 deletions util.py

This file was deleted.

124 changes: 62 additions & 62 deletions watermark.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,68 @@
from skimage import io
from pdf2image import convert_from_path
# Import necessary libraries
import os
import fitz
import numba
import argparse
import numpy as np
# imgs = io.imread('./test.png')
# io.imsave('./hh.png',imgs)
# imgs = np.array(imgs)
# print(imgs.shape)
# r = []
# g = []
# b = []
# alpha = []
from PIL import Image
from skimage import io

def judge(x, y):
    """Return True when the point (x, y) lies strictly inside a diagonal band.

    The band is bounded by two parallel lines of slope ``-600/1575``:
    ``y = 1350 - (600/1575) * x`` and ``y = 1500 - (600/1575) * x``.
    Points lying exactly on either line are treated as outside.

    NOTE(review): presumably this band is the region of the page where the
    watermark is drawn -- confirm against the sample document.

    Args:
        x: Horizontal pixel coordinate (column).
        y: Vertical pixel coordinate (row).

    Returns:
        bool: True if ``y`` is strictly between the two boundary lines at ``x``.
    """
    offset = -(600.0 / 1575.0) * x
    # bool() keeps the return type a plain Python bool even for NumPy scalars.
    return bool(1350 + offset < y < 1500 + offset)
# Command-line interface: the input file to clean and the directory that
# receives the cleaned output.
parser = argparse.ArgumentParser(description='Remove Watermark')
parser.add_argument(
    '--source',
    type=str,
    default='source.pdf',
    help='source file',
)
parser.add_argument(
    '--target',
    type=str,
    default='out',
    help='target directory',
)

# for i in range(imgs.shape[0]):
# for j in range(imgs.shape[1]):
# if not judge(j,i):
# continue
# if imgs[i][j][1] > 100 and imgs[i][j][1] < 250 and imgs[i][j][2] > 100 and imgs[i][j][2] < 250:
# imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 255
# if imgs[i][j][1] < 10 and imgs[i][j][2] < 100:
# imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 0
def handle(img):
    """Whiten watermark-coloured pixels of an image array in place.

    A pixel counts as watermark when each of its colour channels lies
    strictly between 175 and 250; every such pixel is set to 255 (white).
    The work is done with one vectorised NumPy mask, so no JIT compilation
    is needed and the channel count is not hard-coded to three.

    Args:
        img: Writable NumPy array, either 2-D (grayscale, ``H x W``) or 3-D
            with at least three channels on the last axis (``H x W x C``).
            Only the first three channels are examined and modified, so an
            alpha channel is left untouched.

    Returns:
        The same array object, modified in place.

    Raises:
        ValueError: If the array is neither 2-D nor 3-D with >= 3 channels.
    """
    if img.ndim == 2:
        # Grayscale: the single intensity stands in for all three channels.
        img[(img > 175) & (img < 250)] = 255
        return img
    if img.ndim != 3 or img.shape[-1] < 3:
        raise ValueError(
            'expected a grayscale or RGB(A) image, got shape {}'.format(img.shape)
        )
    # Basic slicing returns a view, so writes through `rgb` land in `img`.
    rgb = img[..., :3]
    watermark = np.all((rgb > 175) & (rgb < 250), axis=-1)
    rgb[watermark] = 255
    return img

# io.imsave('./hh.png',imgs)
# print(r)
# print(g)
# print(b)
# print(alpha)
# Script entry point: the statements guarded here run only when the file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
# Read --source / --target from the command line using the module-level parser
args = parser.parse_args()

def select_pixel(r, g, b):
    """Return True if (r, g, b) is exactly one of three specific gray levels.

    The recognised colours are the pure grays 196, 206 and 208: all three
    channels must be equal to each other and to one of those values.

    NOTE(review): presumably these values were sampled from the watermark in
    the sample document -- confirm before reusing on other files.

    Args:
        r: Red channel value.
        g: Green channel value.
        b: Blue channel value.

    Returns:
        bool: True for (196, 196, 196), (206, 206, 206) or (208, 208, 208).
    """
    # bool() keeps the return type a plain Python bool even for NumPy scalars.
    return bool(r == g == b and r in (196, 206, 208))
def select_pixel2(r, g, b):
    """Return True when every channel lies strictly between 175 and 250.

    Args:
        r: Red channel value.
        g: Green channel value.
        b: Blue channel value.

    Returns:
        bool: True if ``175 < value < 250`` holds for all three channels;
        the bounds 175 and 250 themselves do not match.
    """
    return all(175 < channel < 250 for channel in (r, g, b))
def handle(imgs):
    """Set every pixel accepted by ``select_pixel2`` to white, in place.

    Walks the array row by row and column by column, testing the first three
    channel values of each pixel, and returns the same array.
    """
    height, width = imgs.shape[0], imgs.shape[1]
    for row in range(height):
        for col in range(width):
            # Indexing a 3-D array twice yields a view of the pixel's channels,
            # so assigning through it updates `imgs` itself.
            pixel = imgs[row][col]
            if select_pixel2(pixel[0], pixel[1], pixel[2]):
                pixel[0] = pixel[1] = pixel[2] = 255
    return imgs
# Create the target directory together with any missing parent directories.
# exist_ok=True avoids the check-then-create race of exists() + mkdir() and
# makes nested targets such as "out/run1" work.
os.makedirs(args.target, exist_ok=True)

# Convert the PDF into page images with pdf2image, clean each page and write
# it out as a numbered JPEG; finally report how many pages were processed.
images = convert_from_path('./jiangyi3.pdf')
index = 0  # stays 0 when the PDF yields no pages
for index, img in enumerate(images, start=1):
    img = np.array(img)
    print(img.shape)
    img = handle(img)
    io.imsave('./jiangyi3/img' + str(index) + '.jpg', img)
print(index)
# Pick the processing path from the source file's extension. The comparison
# is case-insensitive so that e.g. "scan.JPG" or "photo.jpeg" are accepted.
fileext = os.path.splitext(os.path.basename(args.source))[1].lower()
if fileext in ('.jpg', '.jpeg'):
    img = io.imread(args.source)
    io.imsave(os.path.join(args.target, 'result{}'.format(fileext)), handle(img))
elif fileext == '.png':
    # Convert to RGB so palette/alpha PNGs reach handle() as H x W x 3 arrays
    img = np.array(Image.open(args.source).convert('RGB'))
    io.imsave(os.path.join(args.target, 'result{}'.format(fileext)), handle(img))
elif fileext == '.pdf':
    # Render scale relative to the 72 dpi default page size (792x612):
    # 1.33333333 -> 1056x816, 2 -> 1584x1224
    zoom_x = zoom_y = 1.33333333
    mat = fitz.Matrix(zoom_x, zoom_y)
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(args.source) as pdf:
        # Process and save one page at a time instead of buffering every
        # rendered page in memory first.
        for number, page in enumerate(pdf, start=1):
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # frombuffer() over bytes is read-only; copy() gives handle() a
            # writable array.
            img = np.frombuffer(buffer=pix.samples, dtype=np.uint8).reshape((pix.h, pix.w, -1)).copy()
            # Format only the file name, so braces in the target path are safe.
            io.imsave(os.path.join(args.target, '{}.jpg'.format(number)), handle(img))
else:
    # If the file format is not supported, print an error message
    print('Unsupported file format')