Skip to content

Commit 9895286

Browse files
committed
added scan and wfp filter options
1 parent b6fa3f7 commit 9895286

File tree

5 files changed

+74
-35
lines changed

5 files changed

+74
-35
lines changed

.github/workflows/python-local-test.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,17 @@ jobs:
6464
echo "Error: Scan test did not produce any results. Failing"
6565
exit 1
6666
fi
67+
68+
- name: Run Tests HPSM (fast winnowing)
69+
run: |
70+
pip install scanoss_winnowing
71+
which scanoss-py
72+
scanoss-py version
73+
scanoss-py utils fast
74+
scanoss-py scan -H tests > results.json
75+
id_count=$(cat results.json | grep '"id":' | wc -l)
76+
echo "ID Count: $id_count"
77+
if [[ $id_count -lt 1 ]]; then
78+
echo "Error: Scan test did not produce any results. Failing"
79+
exit 1
80+
fi

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99
### Added
1010
- Upcoming changes...
1111

12+
## [1.10.0] - 2024-02-06
13+
### Added
14+
- Added scan/wfp file filtering options
15+
- Exclude file extensions `--skip-extension` (repeat as needed)
16+
- Exclude folder `--skip-folder` (repeat as needed)
17+
- Exclude files smaller than the specified size `--skip-size`
18+
- Added `scan_files_with_options` SDK capability
19+
- Enables a programmer to supply a specific list of files to scan
20+
1221
## [1.9.0] - 2023-12-29
1322
### Added
1423
- Added dependency file decoration option to scanning (`scan`) using `--dep`
@@ -281,3 +290,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
281290
[1.7.0]: https://github.com/scanoss/scanoss.py/compare/v1.6.3...v1.7.0
282291
[1.8.0]: https://github.com/scanoss/scanoss.py/compare/v1.7.0...v1.8.0
283292
[1.9.0]: https://github.com/scanoss/scanoss.py/compare/v1.8.0...v1.9.0
293+
[1.10.0]: https://github.com/scanoss/scanoss.py/compare/v1.9.0...v1.10.0

src/scanoss/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@
2222
THE SOFTWARE.
2323
"""
2424

25-
__version__ = '1.9.0'
25+
__version__ = '1.10.0'

src/scanoss/cli.py

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -86,25 +86,19 @@ def setup_args() -> None:
8686
'256: disable best match only, 512: hide identified files, '
8787
'1024: enable download_url, 2048: enable GitHub full path, '
8888
'4096: disable extended server stats)')
89-
p_scan.add_argument('--skip-snippets', '-S', action='store_true', help='Skip the generation of snippets')
9089
p_scan.add_argument('--post-size', '-P', type=int, default=32,
9190
help='Number of kilobytes to limit the post to while scanning (optional - default 32)')
9291
p_scan.add_argument('--timeout', '-M', type=int, default=180,
9392
help='Timeout (in seconds) for API communication (optional - default 180)')
9493
p_scan.add_argument('--retry', '-R', type=int, default=5,
9594
help='Retry limit for API communication (optional - default 5)')
9695
p_scan.add_argument('--no-wfp-output', action='store_true', help='Skip WFP file generation')
97-
p_scan.add_argument('--all-extensions', action='store_true', help='Scan all file extensions')
98-
p_scan.add_argument('--all-folders', action='store_true', help='Scan all folders')
99-
p_scan.add_argument('--all-hidden', action='store_true', help='Scan all hidden files/folders')
100-
p_scan.add_argument('--obfuscate', action='store_true', help='Obfuscate file paths and names')
10196
p_scan.add_argument('--dependencies', '-D', action='store_true', help='Add Dependency scanning')
10297
p_scan.add_argument('--dependencies-only', action='store_true', help='Run Dependency scanning only')
10398
p_scan.add_argument('--sc-command', type=str,
10499
help='Scancode command and path if required (optional - default scancode).')
105100
p_scan.add_argument('--sc-timeout', type=int, default=600,
106101
help='Timeout (in seconds) for scancode to complete (optional - default 600)')
107-
p_scan.add_argument('--hpsm', '-H', action='store_true', help='Scan using High Precision Snippet Matching')
108102

109103
# Sub-command: fingerprint
110104
p_wfp = subparsers.add_parser('fingerprint', aliases=['fp', 'wfp'],
@@ -116,12 +110,6 @@ def setup_args() -> None:
116110
p_wfp.add_argument('--stdin', '-s', metavar='STDIN-FILENAME', type=str,
117111
help='Fingerprint the file contents supplied via STDIN (optional)')
118112
p_wfp.add_argument('--output', '-o', type=str, help='Output result file name (optional - default stdout).')
119-
p_wfp.add_argument('--obfuscate', action='store_true', help='Obfuscate fingerprints')
120-
p_wfp.add_argument('--skip-snippets', '-S', action='store_true', help='Skip the generation of snippets')
121-
p_wfp.add_argument('--all-extensions', action='store_true', help='Fingerprint all file extensions')
122-
p_wfp.add_argument('--all-folders', action='store_true', help='Fingerprint all folders')
123-
p_wfp.add_argument('--all-hidden', action='store_true', help='Fingerprint all hidden files/folders')
124-
p_wfp.add_argument('--hpsm', '-H', action='store_true', help='Use High Precision Snippet Matching algorithm.')
125113

126114
# Sub-command: dependency
127115
p_dep = subparsers.add_parser('dependencies', aliases=['dp', 'dep'],
@@ -260,6 +248,19 @@ def setup_args() -> None:
260248
help='SCANOSS API URL (optional - default: https://osskb.org/api/scan/direct)')
261249
p.add_argument('--ignore-cert-errors', action='store_true', help='Ignore certificate errors')
262250

251+
# Global Scan/Fingerprint filter options
252+
for p in [p_scan, p_wfp]:
253+
p.add_argument('--obfuscate', action='store_true', help='Obfuscate fingerprints')
254+
p.add_argument('--all-extensions', action='store_true', help='Fingerprint all file extensions')
255+
p.add_argument('--all-folders', action='store_true', help='Fingerprint all folders')
256+
p.add_argument('--all-hidden', action='store_true', help='Fingerprint all hidden files/folders')
257+
p.add_argument('--hpsm', '-H', action='store_true', help='Use High Precision Snippet Matching algorithm.')
258+
p.add_argument('--skip-snippets', '-S', action='store_true', help='Skip the generation of snippets')
259+
p.add_argument('--skip-extension', '-E', type=str, action='append', help='File Extension to skip.')
260+
p.add_argument('--skip-folder', '-O', type=str, action='append', help='Folder to skip.')
261+
p.add_argument('--skip-size', '-Z', type=int, default=0,
262+
help='Minimum file size to consider for fingerprinting (optional - default 0 bytes [unlimited])')
263+
263264
# Global Scan/GRPC options
264265
for p in [p_scan, c_crypto, c_vulns, c_search, c_versions, c_semgrep]:
265266
p.add_argument('--key', '-k', type=str,
@@ -374,8 +375,9 @@ def wfp(parser, args):
374375
scan_options = 0 if args.skip_snippets else ScanType.SCAN_SNIPPETS.value # Skip snippet generation or not
375376
scanner = Scanner(debug=args.debug, trace=args.trace, quiet=args.quiet, obfuscate=args.obfuscate,
376377
scan_options=scan_options, all_extensions=args.all_extensions,
377-
all_folders=args.all_folders, hidden_files_folders=args.all_hidden, hpsm=args.hpsm)
378-
378+
all_folders=args.all_folders, hidden_files_folders=args.all_hidden, hpsm=args.hpsm,
379+
skip_size=args.skip_size, skip_extensions=args.skip_extension, skip_folders=args.skip_folder
380+
)
379381
if args.stdin:
380382
contents = sys.stdin.buffer.read()
381383
scanner.wfp_contents(args.stdin, contents, scan_output)
@@ -530,14 +532,15 @@ def scan(parser, args):
530532
scan_options=scan_options, sc_timeout=args.sc_timeout, sc_command=args.sc_command,
531533
grpc_url=args.api2url, obfuscate=args.obfuscate,
532534
ignore_cert_errors=args.ignore_cert_errors, proxy=args.proxy, grpc_proxy=args.grpc_proxy,
533-
pac=pac_file, ca_cert=args.ca_cert, retry=args.retry, hpsm=args.hpsm
535+
pac=pac_file, ca_cert=args.ca_cert, retry=args.retry, hpsm=args.hpsm,
536+
skip_size=args.skip_size, skip_extensions=args.skip_extension, skip_folders=args.skip_folder
534537
)
535538
if args.wfp:
536539
if not scanner.is_file_or_snippet_scan():
537540
print_stderr(f'Error: Cannot specify WFP scanning if file/snippet options are disabled ({scan_options})')
538541
exit(1)
539542
if scanner.is_dependency_scan() and not args.dep:
540-
print_stderr(f'Error: Cannot specify WFP & Dependency scanning without a dependency file ({--dep})')
543+
print_stderr(f'Error: Cannot specify WFP & Dependency scanning without a dependency file (--dep)')
541544
exit(1)
542545
scanner.scan_wfp_with_options(args.wfp, args.dep)
543546
elif args.stdin:

src/scanoss/scanner.py

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
FILTERED_DIR_EXT = { # Folder endings to skip
5959
".egg-info"
6060
}
61-
FILTERED_EXT = { # File extensions to skip
61+
FILTERED_EXT = [ # File extensions to skip
6262
".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".ac", ".adoc", ".am",
6363
".asciidoc", ".bmp", ".build", ".cfg", ".chm", ".class", ".cmake", ".cnf",
6464
".conf", ".config", ".contributors", ".copying", ".crt", ".csproj", ".css",
@@ -78,7 +78,7 @@
7878
# File endings
7979
"-doc", "changelog", "config", "copying", "license", "authors", "news", "licenses", "notice",
8080
"readme", "swiftdoc", "texidoc", "todo", "version", "ignore", "manifest", "sqlite", "sqlite3"
81-
}
81+
]
8282
FILTERED_FILES = { # Files to skip
8383
"gradlew", "gradlew.bat", "mvnw", "mvnw.cmd", "gradle-wrapper.jar", "maven-wrapper.jar",
8484
"thumbs.db", "babel.config.js", "license.txt", "license.md", "copying.lib", "makefile"
@@ -100,12 +100,17 @@ def __init__(self, wfp: str = None, scan_output: str = None, output_format: str
100100
all_extensions: bool = False, all_folders: bool = False, hidden_files_folders: bool = False,
101101
scan_options: int = 7, sc_timeout: int = 600, sc_command: str = None, grpc_url: str = None,
102102
obfuscate: bool = False, ignore_cert_errors: bool = False, proxy: str = None, grpc_proxy: str = None,
103-
ca_cert: str = None, pac: PACFile = None, retry: int = 5, hpsm: bool = False
103+
ca_cert: str = None, pac: PACFile = None, retry: int = 5, hpsm: bool = False,
104+
skip_size: int = 0, skip_extensions=None, skip_folders=None
104105
):
105106
"""
106107
Initialise scanning class, including Winnowing, ScanossApi and ThreadedScanning
107108
"""
108109
super().__init__(debug, trace, quiet)
110+
if skip_folders is None:
111+
skip_folders = []
112+
if skip_extensions is None:
113+
skip_extensions = []
109114
self.wfp = wfp if wfp else "scanner_output.wfp"
110115
self.scan_output = scan_output
111116
self.output_format = output_format
@@ -117,6 +122,8 @@ def __init__(self, wfp: str = None, scan_output: str = None, output_format: str
117122
self.scan_options = scan_options
118123
self._skip_snippets = True if not scan_options & ScanType.SCAN_SNIPPETS.value else False
119124
self.hpsm = hpsm
125+
self.skip_folders = skip_folders
126+
self.skip_size = skip_size
120127
ver_details = Scanner.version_details()
121128

122129
self.winnowing = Winnowing(debug=debug, quiet=quiet, skip_snippets=self._skip_snippets,
@@ -143,6 +150,9 @@ def __init__(self, wfp: str = None, scan_output: str = None, output_format: str
143150
self.post_file_count = post_size if post_size > 0 else 32 # Max number of files for any given POST (default 32)
144151
if self._skip_snippets:
145152
self.max_post_size = 8 * 1024 # 8k Max post size if we're skipping snippets
153+
self.skip_extensions = FILTERED_EXT
154+
if skip_extensions: # Append extra file extensions to skip
155+
self.skip_extensions.extend(skip_extensions)
146156

147157
def __filter_files(self, files: list) -> list:
148158
"""
@@ -160,8 +170,8 @@ def __filter_files(self, files: list) -> list:
160170
if f_lower in FILTERED_FILES: # Check for exact files to ignore
161171
ignore = True
162172
if not ignore:
163-
for ending in FILTERED_EXT: # Check for file endings to ignore
164-
if f_lower.endswith(ending):
173+
for ending in self.skip_extensions: # Check for file endings to ignore (static and user supplied)
174+
if ending and f_lower.endswith(ending):
165175
ignore = True
166176
break
167177
if not ignore:
@@ -181,10 +191,12 @@ def __filter_dirs(self, dirs: list) -> list:
181191
ignore = True
182192
if not ignore and not self.all_folders: # Skip this check if we're allowing all folders
183193
d_lower = d.lower()
184-
if d_lower in FILTERED_DIRS: # Ignore specific folders
194+
if d_lower in FILTERED_DIRS: # Ignore specific folders (case insensitive)
195+
ignore = True
196+
elif self.skip_folders and d in self.skip_folders: # Ignore user-supplied folders (case sensitive)
185197
ignore = True
186198
if not ignore:
187-
for de in FILTERED_DIR_EXT: # Ignore specific folder endings
199+
for de in FILTERED_DIR_EXT: # Ignore specific folder endings (case insensitive)
188200
if d_lower.endswith(de):
189201
ignore = True
190202
break
@@ -385,7 +397,8 @@ def scan_folder(self, scan_dir: str) -> bool:
385397
except Exception as e:
386398
self.print_trace(
387399
f'Ignoring missing symlink file: {file} ({e})') # Can fail if there is a broken symlink
388-
if f_size > 0: # Ignore broken links and empty files
400+
# Ignore broken links and empty files, and skip files no larger than the user-specified size limit (when one is set)
401+
if f_size > 0 and (self.skip_size <= 0 or f_size > self.skip_size):
389402
self.print_trace(f'Fingerprinting {path}...')
390403
if spinner:
391404
spinner.next()
@@ -598,7 +611,7 @@ def scan_file(self, file: str) -> bool:
598611
success = False
599612
return success
600613

601-
def scan_files(self, files: list[str]) -> bool:
614+
def scan_files(self, files: []) -> bool:
602615
"""
603616
Scan the specified list of files, producing fingerprints, send to the SCANOSS API and return results
604617
Please note that by providing an explicit list you bypass any exclusions that may be defined on the scanner
@@ -637,7 +650,7 @@ def scan_files(self, files: list[str]) -> bool:
637650
spinner.next()
638651
wfp = self.winnowing.wfp_for_file(file, file)
639652
if wfp is None or wfp == '':
640-
self.print_stderr(f'Warning: No WFP returned for {path}')
653+
self.print_stderr(f'Warning: No WFP returned for {file}')
641654
continue
642655
if save_wfps_for_print:
643656
wfp_list.append(wfp)
@@ -681,12 +694,12 @@ def scan_files(self, files: list[str]) -> bool:
681694
if self.threaded_scan:
682695
success = self.__run_scan_threaded(scan_started, file_count)
683696
else:
684-
Scanner.print_stderr(f'Warning: No files found to scan in folder: {scan_dir}')
697+
Scanner.print_stderr(f'Warning: No files found to scan from: {files}')
685698
return success
686699

687-
def scan_files_with_options(self, files: list[str], deps_file: str = None, file_map: dict = None) -> bool:
700+
def scan_files_with_options(self, files: [], deps_file: str = None, file_map: dict = None) -> bool:
688701
"""
689-
Scan the given folder for whatever scaning options that have been configured
702+
Scan the given list of files using whichever scanning options have been configured
690703
:param files: list of files to scan
691704
:param deps_file: pre-parsed dependency file to decorate
692705
:param file_map: mapping of obfuscated files back into originals
@@ -697,7 +710,7 @@ def scan_files_with_options(self, files: list[str], deps_file: str = None, file_
697710
raise Exception(f"ERROR: Please specify a list of files to scan")
698711
if not self.is_file_or_snippet_scan():
699712
raise Exception(f"ERROR: file or snippet scan options have to be set to scan files: {files}")
700-
if self.is_dependency_scan():
713+
if self.is_dependency_scan() or deps_file:
701714
raise Exception(f"ERROR: The dependency scan option is currently not supported when scanning a list of files")
702715
if self.scan_output:
703716
self.print_msg(f'Writing results to {self.scan_output}...')
@@ -852,26 +865,25 @@ def scan_wfp_with_options(self, wfp: str, deps_file: str, file_map: dict = None)
852865
raise Exception(f"ERROR: Specified WFP file does not exist or is not a file: {wfp_file}")
853866

854867
if not self.is_file_or_snippet_scan() and not self.is_dependency_scan():
855-
raise Exception(f"ERROR: No scan options defined to scan folder: {scan_dir}")
868+
raise Exception(f"ERROR: No scan options defined to scan WFP: {wfp}")
856869

857870
if self.scan_output:
858871
self.print_msg(f'Writing results to {self.scan_output}...')
859872
if self.is_dependency_scan():
860873
if not self.threaded_deps.run(deps_file=deps_file, wait=False): # Kick off a background dependency scan
861874
success = False
862875
if self.is_file_or_snippet_scan():
863-
if not self.scan_wfp_file_threaded(wfp_file, file_map):
876+
if not self.scan_wfp_file_threaded(wfp_file):
864877
success = False
865878
if self.threaded_scan:
866879
if not self.__finish_scan_threaded(file_map):
867880
success = False
868881
return success
869882

870-
def scan_wfp_file_threaded(self, file: str = None, file_map: dict = None) -> bool:
883+
def scan_wfp_file_threaded(self, file: str = None) -> bool:
871884
"""
872885
Scan the contents of the specified WFP file (threaded)
873886
:param file: WFP file to scan (optional)
874-
:param file_map: mapping of obfuscated files back into originals (optional)
875887
return: True if successful, False otherwise
876888
"""
877889
success = True

0 commit comments

Comments
 (0)