Skip to content

Commit f892b38

Browse files
committed
Allow processing directories, reports may contain warnings
1 parent 2c9ccc8 commit f892b38

File tree

5 files changed

+70
-18
lines changed

5 files changed

+70
-18
lines changed

lib/src/dolos.ts

+44-14
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,11 @@ import { Result } from "./lib/util/result";
66
import { csvParse, DSVRowString } from "d3-dsv";
77
import * as path from "path";
88
import { Tokenizer } from "./lib/tokenizer/tokenizer";
9-
import { default as fsWithCallbacks, constants } from "fs";
9+
import { constants, default as fsWithCallbacks } from "fs";
1010
import { spawnSync as spawn } from "child_process";
1111
import { tmpdir } from "os";
12-
import {
13-
Language,
14-
LanguagePicker
15-
} from "./lib/util/language";
12+
import { Language, LanguagePicker } from "./lib/util/language";
13+
1614
const fs = fsWithCallbacks.promises;
1715

1816

@@ -31,8 +29,22 @@ export class Dolos {
3129
}
3230

3331
private async fromDirectory(dirPath: string): Promise<Result<File[]>> {
34-
const entries = await fs.readdir(dirPath, { withFileTypes: true });
35-
const files = entries.filter(ent => ent.isFile()).map(ent => File.fromPath(ent.name));
32+
const dirs = [dirPath];
33+
const files = [];
34+
35+
let i = 0;
36+
37+
while(i < dirs.length) {
38+
for (const entry of await fs.readdir(dirs[i], { withFileTypes: true })) {
39+
if (entry.isDirectory()) {
40+
dirs.push(path.join(dirs[i], entry.name));
41+
} else if (entry.isFile()) {
42+
files.push(File.fromPath(path.join(dirs[i], entry.name)));
43+
}
44+
}
45+
i += 1;
46+
}
47+
3648
return await Result.all(files);
3749
}
3850

@@ -46,10 +58,10 @@ export class Dolos {
4658
throw new Error(`Unzipping failed with exit status ${ status }, stderr: \n${stderr}`);
4759
}
4860
const infoPath = path.join(tmpDir, "info.csv");
49-
if (await fs.access(infoPath, constants.R_OK).catch(() => false)) {
50-
return await this.fromDirectory(tmpDir);
51-
} else {
61+
if (await fs.access(infoPath, constants.R_OK).then(() => true).catch(() => false)) {
5262
return await this.fromCSV(infoPath);
63+
} else {
64+
return await this.fromDirectory(tmpDir);
5365
}
5466
} finally {
5567
await fs.rm(tmpDir, { recursive: true });
@@ -121,13 +133,19 @@ export class Dolos {
121133
this.index = new Index(this.tokenizer, this.options);
122134
}
123135

136+
const warnings = [];
137+
let filteredFiles;
124138
if (this.languageDetected) {
125-
const filteredFiles = files.filter(file => this.language?.extensionMatches(file.path));
139+
filteredFiles = files.filter(file => this.language?.extensionMatches(file.path));
126140
const diff = files.length - filteredFiles.length;
127141
if (diff > 0) {
128-
console.warn(`Ignoring ${diff} files because the file extension did not match the detected language ${this.language!.name}. Specify the language explicitly to avoid this.`);
142+
warnings.push(
143+
`The language of the files was detected as ${this.language?.name} ` +
144+
`but ${diff} files were ignored because they did not have a matching extension.`
145+
);
129146
}
130-
files = filteredFiles;
147+
} else {
148+
filteredFiles = files;
131149
}
132150

133151
if (files.length < 2) {
@@ -138,6 +156,18 @@ export class Dolos {
138156
"be present in 100% of the files. This option does only" +
139157
"make sense when comparing more than two files.");
140158
}
141-
return this.index.compareFiles(files, nameCandidate);
159+
160+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
161+
const tokenizedFiles = filteredFiles.map(f => this.tokenizer!.tokenizeFile(f));
162+
const fingerprints = await this.index.createMatches(tokenizedFiles);
163+
164+
return new Report(
165+
this.options,
166+
this.language,
167+
tokenizedFiles,
168+
fingerprints,
169+
nameCandidate,
170+
warnings
171+
);
142172
}
143173
}

lib/src/lib/analyze/report.ts

+3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ type Hash = number;
1010
export interface Metadata extends DolosOptions {
1111
languageDetected: boolean;
1212
createdAt: string;
13+
warnings: string[];
1314
}
1415

1516
export class Report {
@@ -28,6 +29,7 @@ export class Report {
2829
public readonly files: TokenizedFile[],
2930
fingerprints: Map<Hash, SharedFingerprint>,
3031
name?: string,
32+
public readonly warnings: string[] = [],
3133
) {
3234
if (this.options.maxFingerprintCount != null) {
3335
this.kgramMaxFileOccurrences = this.options.maxFingerprintCount;
@@ -87,6 +89,7 @@ export class Report {
8789
createdAt: this.createdAt,
8890
language: this.language?.name ?? null,
8991
languageDetected: this.options.language == undefined,
92+
warnings: this.warnings,
9093
};
9194
}
9295
}

lib/src/lib/util/language.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ export class LanguagePicker {
153153

154154
if (language == undefined) {
155155
throw new LanguageError(
156-
`Could not detect language based on extension.`
156+
"Could not detect language based on extension."
157157
);
158158
}
159159

lib/src/test/dolos.test.ts

+18-1
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,9 @@ test("should read ZIP-files without info.csv", async t => {
255255
const report = await dolos.analyzePaths(["../samples/javascript/simple-dataset-no-csv.zip"]);
256256

257257
t.is(4, report.files.length);
258-
t.is(report.name, "simple-dataset");
258+
t.is(report.name, "simple-dataset-no-csv");
259259
t.is(report.metadata()["reportName"], "simple-dataset-no-csv");
260+
t.is(report.metadata()["warnings"].length, 1);
260261

261262
const pairs = report.allPairs();
262263
t.is(6, pairs.length);
@@ -271,3 +272,19 @@ test("empty files should match 0%", async t => {
271272
t.is(0, pairs[0].overlap);
272273
t.is(0, pairs[0].longest);
273274
});
275+
276+
test("should generate warning when not all files match detected language", async t => {
277+
const dolos = new Dolos();
278+
279+
const report = await dolos.analyzePaths([
280+
"../samples/javascript/sample.js",
281+
"../samples/javascript/copied_function.js",
282+
"../samples/java/Caesar.java"
283+
]);
284+
285+
t.is(report.metadata()["warnings"].length, 1);
286+
t.is(2, report.files.length);
287+
288+
const pairs = report.allPairs();
289+
t.is(1, pairs.length);
290+
});

lib/src/test/tokenizer.test.ts

+4-2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ test("language picker should throw an error for unknown extension", t => {
5353
t.throws(() => new LanguagePicker().detectLanguage([new File("unknown.extension", "")]));
5454
});
5555

56-
test("language picker should throw an error for different languages", t => {
57-
t.throws(() => new LanguagePicker().detectLanguage([new File("file.py", ""), new File("file.js", "")]));
56+
test("language picker should detect most common language", t => {
57+
const files = [new File("file.py", ""), new File("otherfile.py", ""), new File("file.js", "")];
58+
const detected = new LanguagePicker().detectLanguage(files);
59+
t.deepEqual(detected.name, "python");
5860
});

0 commit comments

Comments
 (0)