@@ -254,122 +254,91 @@ def scan_folder(self, scan_dir: str) -> bool:
254254 raise Exception (f"ERROR: Please specify a folder to scan" )
255255 if not os .path .exists (scan_dir ) or not os .path .isdir (scan_dir ):
256256 raise Exception (f"ERROR: Specified folder does not exist or is not a folder: { scan_dir } " )
257- wfps = ''
257+
258258 scan_dir_len = len (scan_dir ) if scan_dir .endswith (os .path .sep ) else len (scan_dir )+ 1
259259 self .print_msg (f'Searching { scan_dir } for files to fingerprint...' )
260260 spinner = None
261261 if not self .quiet and self .isatty :
262262 spinner = Spinner ('Fingerprinting ' )
263+ wfp_list = []
264+ scan_block = ''
265+ scan_size = 0
266+ queue_size = 0
267+ file_count = 0
268+ scan_started = False
263269 for root , dirs , files in os .walk (scan_dir ):
264- self .print_debug (f'U Root: { root } , Dirs: { dirs } , Files { files } ' )
265- dirs [:] = Scanner .__filter_dirs (dirs ) # Strip out unwanted directories
266- filtered_files = Scanner .__filter_files (files ) # Strip out unwanted files
270+ self .print_trace (f'U Root: { root } , Dirs: { dirs } , Files { files } ' )
271+ dirs [:] = Scanner .__filter_dirs (dirs ) # Strip out unwanted directories
272+ filtered_files = Scanner .__filter_files (files ) # Strip out unwanted files
267273 self .print_debug (f'F Root: { root } , Dirs: { dirs } , Files { filtered_files } ' )
268- for file in filtered_files :
274+ for file in filtered_files : # Cycle through each filtered file
269275 path = os .path .join (root , file )
270- file_stat = os .stat (path )
271- if file_stat .st_size > 0 : # Ignore empty files
272- self .print_debug (f'Fingerprinting { path } ...' )
276+ f_size = 0
277+ try :
278+ f_size = os .stat (path ).st_size
279+ except :
280+ self .print_trace (f'Ignoring missing symlink file: { file } ' ) # Can fail if there is a broken symlink
281+ if f_size > 0 : # Ignore broken links and empty files
282+ self .print_trace (f'Fingerprinting { path } ...' )
273283 if spinner :
274284 spinner .next ()
275- wfps += self .winnowing .wfp_for_file (path , Scanner .__strip_dir (scan_dir , scan_dir_len , path ))
285+ wfp = self .winnowing .wfp_for_file (path , Scanner .__strip_dir (scan_dir , scan_dir_len , path ))
286+ wfp_list .append (wfp )
287+ file_count += 1
288+ if self .threaded_scan :
289+ wfp_size = len (wfp .encode ("utf-8" ))
290+ if (wfp_size + scan_size ) >= MAX_POST_SIZE :
291+ self .threaded_scan .queue_add (scan_block )
292+ queue_size += 1
293+ scan_block = ''
294+ scan_block += wfp
295+ scan_size = len (scan_block .encode ("utf-8" ))
296+ if scan_size >= MAX_POST_SIZE :
297+ self .threaded_scan .queue_add (scan_block )
298+ queue_size += 1
299+ scan_block = ''
300+ if queue_size > self .nb_threads and not scan_started : # Start scanning if we have something to do
301+ scan_started = True
302+ if not self .threaded_scan .run (wait = False ):
303+ self .print_stderr (
304+ f'Warning: Some errors encounted while scanning. Results might be incomplete.' )
305+ success = False
306+ # End for loop
307+ if self .threaded_scan and scan_block :
308+ self .threaded_scan .queue_add (scan_block ) # Make sure all files have been submitted
276309 if spinner :
277310 spinner .finish ()
278- if wfps :
311+
312+ if wfp_list :
279313 self .print_debug (f'Writing fingerprints to { self .wfp } ' )
280314 with open (self .wfp , 'w' ) as f :
281- f .write (wfps )
282- self .print_msg (f'Scanning fingerprints...' )
315+ f .write ('' .join (wfp_list ))
283316 if self .scan_output :
284317 self .print_msg (f'Writing results to { self .scan_output } ...' )
285318 if self .threaded_scan :
286- success = self .scan_wfp_file_threaded ( )
319+ success = self .__finish_scan_threaded ( scan_started , file_count )
287320 else :
288321 success = self .scan_wfp_file ()
289322 else :
290323 Scanner .print_stderr (f'Warning: No files found to scan in folder: { scan_dir } ' )
291324 return success
292325
293- def scan_file (self , file : str ) -> bool :
326+ def __finish_scan_threaded (self , scan_started : bool , file_count : int ) -> bool :
294327 """
295- Scan the specified file and produce a result
296- Parameters
297- ----------
298- file: str
299- File to fingerprint and scan/identify
300- :return True if successful, False otherwise
328+ Finish scanning the filtered files and wait for the threads to complete
329+ :param scan_started: If the scan has already started or not
330+ :param file_count: Number of total files to be scanned
331+ :return: True if successful, False otherwise
301332 """
302333 success = True
303- if not file :
304- raise Exception (f"ERROR: Please specify a file to scan" )
305- if not os .path .exists (file ) or not os .path .isfile (file ):
306- raise Exception (f"ERROR: Specified files does not exist or is not a file: { file } " )
307- self .print_debug (f'Fingerprinting { file } ...' )
308- wfps = self .winnowing .wfp_for_file (file , file )
309- if wfps :
310- self .print_debug (f'Scanning { file } ...' )
311- if self .scan_output :
312- self .print_msg (f'Writing results to { self .scan_output } ...' )
313- success = self .scan_wfp (wfps )
334+ self .threaded_scan .update_bar (create = True , file_count = file_count )
335+ if not scan_started :
336+ if not self .threaded_scan .run (): # Run the scan and wait for it to complete
337+ self .print_stderr (f'Warning: Some errors encounted while scanning. Results might be incomplete.' )
338+ success = False
314339 else :
315- success = False
316-
317- return success
318-
319- def scan_wfp_file_threaded (self , file : str = None ) -> bool :
320- """
321- Scan the supplied WFP file in multiple threads
322- :param file: WFP file to scan
323- :return True if scuccessful, False otherwise
324- """
325- success = True
326- wfp_file = file if file else self .wfp # If a WFP file is specified, use it, otherwise us the default
327- if not os .path .exists (wfp_file ) or not os .path .isfile (wfp_file ):
328- raise Exception (f"ERROR: Specified WFP file does not exist or is not a file: { wfp_file } " )
329- file_count = Scanner .__count_files_in_wfp_file (wfp_file )
330- cur_files = 0
331- cur_size = 0
332- batch_files = 0
333- wfp = ''
334- self .print_debug (f'Found { file_count } files to process.' )
335- file_print = ''
336- bar = None
337- if not self .quiet and self .isatty :
338- bar = Bar ('Scanning' , max = file_count )
339- bar .next (0 )
340- self .threaded_scan .set_bar (bar )
341-
342- with open (wfp_file ) as f :
343- for line in f :
344- if line .startswith (WFP_FILE_START ):
345- if file_print :
346- wfp += file_print # Store the WFP for the current file
347- cur_size = len (wfp .encode ("utf-8" ))
348- file_print = line # Start storing the next file
349- cur_files += 1
350- batch_files += 1
351- else :
352- file_print += line # Store the rest of the WFP for this file
353- l_size = cur_size + len (file_print .encode ('utf-8' ))
354- # Hit the max post size, so sending the current batch and continue processing
355- if l_size >= MAX_POST_SIZE and wfp :
356- self .print_debug (f'Added { batch_files } ({ cur_files } ) of'
357- f' { file_count } ({ len (wfp .encode ("utf-8" ))} bytes) files to the pending queue.' )
358- if cur_size > MAX_POST_SIZE :
359- Scanner .print_stderr (f'Warning: Post size { cur_size } greater than limit { MAX_POST_SIZE } ' )
360- self .threaded_scan .queue_add (wfp )
361- batch_files = 0
362- wfp = ''
363- if file_print :
364- wfp += file_print # Store the WFP for the current file
365- if wfp :
366- self .print_debug (f'Adding { batch_files } ({ cur_files } ) of'
367- f' { file_count } ({ len (wfp .encode ("utf-8" ))} bytes) files to the pending queue.' )
368- self .threaded_scan .queue_add (wfp )
369-
370- if not self .threaded_scan .run ():
371- self .print_stderr (f'Warning: Some errors encounted while scanning. Result might not be complete.' )
372- success = False
340+ self .threaded_scan .complete () # Wait for the scans to complete
341+ self .threaded_scan .complete_bar ()
373342 responses = self .threaded_scan .responses
374343 raw_output = "{\n "
375344 if responses :
@@ -382,11 +351,10 @@ def scan_wfp_file_threaded(self, file: str = None) -> bool:
382351 first = False
383352 else :
384353 raw_output += ",\n \" %s\" :%s" % (key , json .dumps (value , indent = 2 ))
354+ # End for loop
385355 else :
386356 success = False
387357 raw_output += "\n }"
388- if bar :
389- bar .finish ()
390358 parsed_json = None
391359 try :
392360 parsed_json = json .loads (raw_output )
@@ -404,6 +372,32 @@ def scan_wfp_file_threaded(self, file: str = None) -> bool:
404372 success = cdx .produce_from_json (parsed_json )
405373 else :
406374 success = cdx .produce_from_str (raw_output )
375+ return success
376+
377+
def scan_file(self, file: str) -> bool:
    """
    Fingerprint a single file and scan/identify it.

    :param file: Path of the file to fingerprint and scan
    :return: True if successful, False otherwise (also False when no fingerprint was generated)
    :raises Exception: If no file is given, or the path does not exist or is not a file
    """
    if not file:
        raise Exception("ERROR: Please specify a file to scan")
    # isfile() is already False for a missing path, so a single check covers both cases
    if not os.path.isfile(file):
        raise Exception(f"ERROR: Specified file does not exist or is not a file: {file}")
    self.print_debug(f'Fingerprinting {file}...')
    # The path is passed twice: once as the file to read and once as the name to record
    wfp = self.winnowing.wfp_for_file(file, file)
    if not wfp:
        return False  # Nothing was fingerprinted, so there is nothing to scan
    self.print_debug(f'Scanning {file}...')
    if self.scan_output:
        self.print_msg(f'Writing results to {self.scan_output}...')
    return self.scan_wfp(wfp)
409403
@@ -538,14 +532,14 @@ def wfp_file(self, scan_file: str, wfp_file: str = None):
538532 raise Exception (f"ERROR: Specified file does not exist or is not a file: { scan_file } " )
539533
540534 self .print_debug (f'Fingerprinting { scan_file } ...' )
541- wfps = self .winnowing .wfp_for_file (scan_file , scan_file )
542- if wfps :
535+ wfp = self .winnowing .wfp_for_file (scan_file , scan_file )
536+ if wfp :
543537 if wfp_file :
544538 self .print_stderr (f'Writing fingerprints to { wfp_file } ' )
545539 with open (wfp_file , 'w' ) as f :
546- f .write (wfps )
540+ f .write (wfp )
547541 else :
548- print (wfps )
542+ print (wfp )
549543 else :
550544 Scanner .print_stderr (f'Warning: No fingerprints generated for: { scan_file } ' )
551545
@@ -561,7 +555,7 @@ def wfp_folder(self, scan_dir: str, wfp_file: str = None):
561555 scan_dir_len = len (scan_dir ) if scan_dir .endswith (os .path .sep ) else len (scan_dir )+ 1
562556 self .print_msg (f'Searching { scan_dir } for files to fingerprint...' )
563557 for root , dirs , files in os .walk (scan_dir ):
564- dirs = Scanner .__filter_dirs (dirs ) # Strip out unwanted directories
558+ dirs [:] = Scanner .__filter_dirs (dirs ) # Strip out unwanted directories
565559 filtered_files = Scanner .__filter_files (files ) # Strip out unwanted files
566560 self .print_trace (f'Root: { root } , Dirs: { dirs } , Files { filtered_files } ' )
567561 for file in filtered_files :
0 commit comments