forked from muskaarora4446/LPRnet
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
119 lines (105 loc) · 3.91 KB
/
preprocessor.py
File metadata and controls
119 lines (105 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
'''
created:31-08-2020
Author:https://github.com/bernard0047
'''
import os
import shutil
import pandas as pd
import numpy as np
import cv2 as cv
from PIL import Image
import argparse
import re
def get_parser():
parser = argparse.ArgumentParser(description='parameters for dataset preprocessing')
parser.add_argument('--input_dir', default="./IMAGES3/", help='Input path (contains imgfolder and label csv)')
parser.add_argument('--output_dir', default="./images/", help='a folder containing train and test folders will be created here') #don't pass for easy training
args = parser.parse_args()
return args
def label_check(label):
if len(label)< 8:
return 0
if label[0:2]=='DL':
delhi_pt = "\d{4,4}$"
dlval = re.search(delhi_pt,label)
if dlval is None or len(dlval.group())<8:
return 0
else:
return 1
pattern = "(([A-Za-z]){2,3}(|-)(?:[0-9]){1,2}(|-)(?:[A-Za-z]){2}(|-)([0-9]){1,4})|(([A-Za-z]){2,3}(|-)([0-9]){1,4})"
val = re.search(pattern,label)
if val is None or len(val.group())<8:
return 0
else:
return 1
def size_check(ipath):
img = cv.imread(ipath)
height = img.shape[0]
width = img.shape[1]
#print(height,width)
if height<24 or width<90:
return 0
return 1
def preprocess():
args = get_parser()
idr = os.path.expanduser(args.input_dir)
odr = os.path.expanduser(args.output_dir)
if os.path.exists(odr):
print("Error: Path exists")
return
else:
os.mkdir(odr)
#print(input_dir,output_dir)
imgfolder,dfpath = None, None
for item in os.listdir(idr):
if os.path.isdir(os.path.join(idr,item)):
imgfolder = os.path.join(idr,item)
else:
dfpath = os.path.join(idr,item)
if dfpath[-3:]!="csv":
df = pd.read_excel(dfpath)
else:
df = pd.read_csv(dfpath,encoding = "ISO-8859-1")
df = df.astype(str)
allFileNames = os.listdir(imgfolder)
np.random.shuffle(allFileNames)
train_FileNames, test_FileNames = np.split(np.array(allFileNames),
[int(len(allFileNames)*0.9)])
print('Total images: ', len(allFileNames))
print('Training: ', len(train_FileNames))
print('Testing: ', len(test_FileNames))
train_FileNames = [imgfolder+'/'+ name for name in train_FileNames.tolist()]
test_FileNames = [imgfolder+'/' + name for name in test_FileNames.tolist()]
os.makedirs(odr +'/train')
os.makedirs(odr +'/test')
for name in train_FileNames:
shutil.copy(name, odr+"/train")
for name in test_FileNames:
shutil.copy(name, odr+"/test")
count=0
for dirs in os.listdir(odr):
for img in os.listdir(os.path.join(odr+dirs)):
ipath = os.path.join(odr,dirs,img)
img,_ = os.path.splitext(img) #uncomment this line if your dataset has imgname without label and modify to accomodate imgname of type(int)
_,ext = os.path.splitext(ipath)
label = df[df.iloc[:,0]==img].iloc[0,1]
label = ''.join(e for e in label if e.isalnum())
if img not in df.iloc[:,0].tolist() or label_check(label)==0 or size_check(ipath)==0:
count+=1
print(f"Image not found/ Image too small Label error: Discarding image:{img}")
os.remove(ipath)
continue
tpath = os.path.join(odr,dirs,label+ext)
if os.path.exists(tpath):
if os.path.getsize(tpath)>os.path.getsize(ipath):
os.remove(ipath)
else :
os.remove(tpath)
os.rename(ipath,tpath)
count+=1
print(f"Discarding duplicate image:{img}")
continue
os.rename(ipath,tpath)
print(f"Discarded {count} images in total.")
if __name__ == "__main__":
preprocess()