Skip to content

Commit ce7ab2f

Browse files
authored
Merge pull request #1612 from dodona-edu/csv-template-parse
Add support for template files into CSV parsing
2 parents 44f9b33 + 1277c1d commit ce7ab2f

File tree

5 files changed

+180
-99
lines changed

5 files changed

+180
-99
lines changed

core/src/file/file.ts

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export interface ExtraInfo {
1111
exerciseID: string;
1212
createdAt: Date;
1313
labels: string;
14+
ignored: string;
1415
}
1516

1617
/**

lib/src/lib/dataset.ts

+141
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import { readFiles, readPath } from "./reader.js";
2+
3+
import { ExtraInfo, File, Result } from "@dodona/dolos-core";
4+
import { csvParse, DSVRowString } from "d3-dsv";
5+
6+
import { constants } from "node:fs";
7+
import fs from "node:fs/promises";
8+
import path from "node:path";
9+
import { spawnSync as spawn } from "node:child_process";
10+
import { tmpdir } from "node:os";
11+
12+
/**
 * A named collection of files to analyze, optionally accompanied by a single
 * template/boilerplate file that should be ignored during analysis.
 *
 * Use {@link Dataset.create} to build a dataset from user-supplied paths
 * (a ZIP archive, a CSV info file, or a plain list of source files).
 */
export class Dataset {
  /**
   * @param name human-readable name of the dataset
   * @param files the files that take part in the analysis
   * @param ignore optional template/boilerplate file
   */
  constructor(
    public name: string,
    public files: File[],
    public ignore?: File) {
  }

  /**
   * Collect every regular file below `dirPath`, descending into
   * subdirectories.
   *
   * @param dirPath directory to start from
   * @returns the combined result of reading every file that was found
   */
  private static async fromDirectory(dirPath: string): Promise<Result<File[]>> {
    // `dirs` doubles as a work queue: subdirectories are appended while we
    // iterate, so the loop ends once every discovered directory was listed.
    const dirs = [dirPath];
    const files = [];

    for (let i = 0; i < dirs.length; i += 1) {
      const entries = await fs.readdir(dirs[i], { withFileTypes: true });
      for (const entry of entries) {
        const entryPath = path.join(dirs[i], entry.name);
        if (entry.isDirectory()) {
          dirs.push(entryPath);
        } else if (entry.isFile()) {
          files.push(readPath(entryPath));
        }
      }
    }

    return await Result.all(files);
  }

  /**
   * Determine which file (if any) acts as template/boilerplate code.
   *
   * An explicitly given `ignore` path takes precedence over a file that is
   * flagged with `ignored === "true"` in its extra info.
   *
   * @param resolvedFiles files that were already read
   * @param ignore optional path to a template file
   * @returns the template file, or `undefined` when there is none
   * @throws Error when more than one file is flagged as ignored
   */
  private static async setIgnoredFile(resolvedFiles: File[], ignore?: string): Promise<File | undefined> {
    const ignoredFiles = resolvedFiles.filter(file => file.extra?.ignored === "true");
    if (ignoredFiles.length > 1) {
      throw new Error(
        "More than one file has the ignored field set to true. " +
        "Only one template/boilerplate code file is allowed at this moment."
      );
    }
    if (ignore) {
      return (await readPath(ignore)).ok();
    }
    return ignoredFiles.length === 1 ? ignoredFiles[0] : undefined;
  }

  /**
   * Build a dataset from a ZIP archive.
   *
   * The archive is extracted with the external `unzip` command into a
   * temporary directory that is removed again afterwards. When the archive
   * contains a readable `info.csv` at its root, that file describes the
   * dataset; otherwise every file in the archive is used.
   *
   * @param zipPath path to the archive
   * @param ignore optional path to a template file
   * @throws Error when `unzip` cannot be started or exits with a non-zero status
   */
  private static async fromZIP(
    zipPath: string,
    ignore?: string
  ): Promise<Dataset> {
    const tmpDir = await fs.mkdtemp(path.join(tmpdir(), "dolos-unzip-"));
    try {
      const { status, error, stderr } = spawn("unzip", [zipPath, "-d", tmpDir]);
      if (error) {
        throw error;
      }
      if (status !== 0) {
        throw new Error(`Unzipping failed with exit status ${ status }, stderr: \n${stderr}`);
      }

      // Strip the extension as it was actually written (".zip", ".ZIP", ...);
      // the caller accepts the extension case-insensitively.
      const name = path.basename(zipPath, path.extname(zipPath));

      const infoPath = path.join(tmpDir, "info.csv");
      let hasInfo = true;
      try {
        await fs.access(infoPath, constants.R_OK);
      } catch {
        hasInfo = false;
      }

      if (hasInfo) {
        const dataset = await Dataset.fromCSV(infoPath, ignore);
        dataset.name = name;
        return dataset;
      }

      const files = (await Dataset.fromDirectory(tmpDir)).ok();
      // Honour the explicit `ignore` path for archives without an info.csv too.
      const ignoredFile = await Dataset.setIgnoredFile(files, ignore);
      return new Dataset(name, files, ignoredFile);
    } finally {
      // `force` keeps a cleanup problem (directory already gone) from masking
      // the error that is actually being propagated.
      await fs.rm(tmpDir, { recursive: true, force: true });
    }
  }

  /**
   * Build a dataset from a CSV info file.
   *
   * Every row names a file (relative to the directory of the CSV) plus extra
   * information about it. Rows whose `ignored` column equals `"true"` are kept
   * out of the analyzed files and considered as template candidates.
   *
   * @param infoPath path to the CSV file
   * @param ignore optional path to a template file
   * @throws Error when the CSV file or the files it lists cannot be read, or
   *   when more than one row is flagged as ignored
   */
  private static async fromCSV(
    infoPath: string,
    ignore?: string
  ): Promise<Dataset> {
    const dirname = path.dirname(infoPath);

    let resolvedFiles: File[];
    try {
      const csvFiles = csvParse((await fs.readFile(infoPath)).toString())
        .map((row: DSVRowString) => ({
          filename: row.filename as string,
          fullName: row.full_name as string,
          id: row.id as string,
          status: row.status as string,
          submissionID: row.submission_id as string,
          nameEN: row.name_en as string,
          nameNL: row.name_nl as string,
          exerciseID: row.exercise_id as string,
          createdAt: new Date(row.created_at as string),
          labels: row.label as string || row.labels as string,
          ignored: row.ignored as string
        }))
        .map((row: ExtraInfo) => readPath(path.join(dirname, row.filename), row));
      resolvedFiles = (await Result.all(csvFiles)).ok();
    } catch {
      throw new Error("The given '.csv'-file could not be opened");
    }

    // Deliberately outside the try-block above: a conflict between ignored
    // files (or an unreadable `ignore` path) must be reported as such instead
    // of being disguised as an unreadable CSV file.
    const ignoredFile = await Dataset.setIgnoredFile(resolvedFiles, ignore);
    const files = resolvedFiles.filter(file => file.extra?.ignored !== "true");
    const nameCandidate = path.basename(dirname) || "undefined";
    return new Dataset(nameCandidate, files, ignoredFile);
  }

  /**
   * Create a dataset from the given paths.
   *
   * - A single path must be a ZIP archive or a CSV info file.
   * - Any other number of paths is treated as a list of source files.
   *
   * @param paths input paths
   * @param ignore optional path to a template/boilerplate file
   * @throws Error when a single path is given that is neither `.zip` nor `.csv`
   */
  public static async create(paths: string[], ignore?: string): Promise<Dataset> {
    if (paths.length === 1) {
      const inputFile = paths[0];
      const lowerCased = inputFile.toLowerCase();
      if (lowerCased.endsWith(".zip")) {
        return Dataset.fromZIP(inputFile, ignore);
      }
      if (lowerCased.endsWith(".csv")) {
        return Dataset.fromCSV(inputFile, ignore);
      }
      throw new Error("You gave one input file, but it is not a CSV file or a ZIP archive.");
    }

    const files = (await readFiles(paths)).ok();
    const ignoredFile = await Dataset.setIgnoredFile(files, ignore);
    // The name is derived from the first two paths; without any path there is
    // nothing to derive it from, so fall back to the placeholder name.
    const name = paths.length >= 2
      ? path.basename(paths[0]) + " & " + path.basename(paths[1])
      : "undefined";
    return new Dataset(name, files, ignoredFile);
  }
}

lib/src/lib/dolos.ts

+4-99
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,9 @@ import { Report } from "./report.js";
22
import { CustomOptions, Options } from "./options.js";
33
import { Tokenizer } from "./tokenizer/tokenizer.js";
44
import { Language, LanguagePicker } from "./language.js";
5-
import { readFiles, readPath } from "./reader.js";
5+
import { Dataset } from "./dataset.js";
66

7-
import { FingerprintIndex, ExtraInfo, File, Result } from "@dodona/dolos-core";
8-
import { csvParse, DSVRowString } from "d3-dsv";
9-
10-
import { constants } from "node:fs";
11-
import fs from "node:fs/promises";
12-
import path from "node:path";
13-
import { spawnSync as spawn } from "node:child_process";
14-
import { tmpdir } from "node:os";
7+
import { FingerprintIndex, File } from "@dodona/dolos-core";
158

169
export class Dolos {
1710
readonly options: Options;
@@ -27,97 +20,9 @@ export class Dolos {
2720
this.options = new Options(customOptions);
2821
}
2922

30-
private async fromDirectory(dirPath: string): Promise<Result<File[]>> {
31-
const dirs = [dirPath];
32-
const files = [];
33-
34-
let i = 0;
35-
36-
while(i < dirs.length) {
37-
for (const entry of await fs.readdir(dirs[i], { withFileTypes: true })) {
38-
if (entry.isDirectory()) {
39-
dirs.push(path.join(dirs[i], entry.name));
40-
} else if (entry.isFile()) {
41-
files.push(readPath(path.join(dirs[i], entry.name)));
42-
}
43-
}
44-
i += 1;
45-
}
46-
47-
return await Result.all(files);
48-
}
49-
50-
private async fromZIP(zipPath: string): Promise<Result<File[]>> {
51-
const tmpDir = await fs.mkdtemp(path.join(tmpdir(), "dolos-unzip-"));
52-
try {
53-
const { status, error, stderr } = spawn("unzip", [zipPath, "-d", tmpDir]);
54-
if (error) {
55-
throw error;
56-
} else if (status != 0) {
57-
throw new Error(`Unzipping failed with exit status ${ status }, stderr: \n${stderr}`);
58-
}
59-
const infoPath = path.join(tmpDir, "info.csv");
60-
if (await fs.access(infoPath, constants.R_OK).then(() => true).catch(() => false)) {
61-
return await this.fromCSV(infoPath);
62-
} else {
63-
return await this.fromDirectory(tmpDir);
64-
}
65-
} finally {
66-
await fs.rm(tmpDir, { recursive: true });
67-
}
68-
}
69-
70-
private async fromCSV(infoPath: string): Promise<Result<File[]>> {
71-
const dirname = path.dirname(infoPath);
72-
try {
73-
const files = csvParse((await fs.readFile(infoPath)).toString())
74-
.map((row: DSVRowString) => ({
75-
filename: row.filename as string,
76-
fullName: row.full_name as string,
77-
id: row.id as string,
78-
status: row.status as string,
79-
submissionID: row.submission_id as string,
80-
nameEN: row.name_en as string,
81-
nameNL: row.name_nl as string,
82-
exerciseID: row.exercise_id as string,
83-
createdAt: new Date(row.created_at as string),
84-
labels: row.label as string || row.labels as string
85-
}))
86-
.map((row: ExtraInfo) => readPath(path.join(dirname, row.filename), row));
87-
return await Result.all(files);
88-
} catch(e) {
89-
throw new Error("The given '.csv'-file could not be opened");
90-
}
91-
}
92-
93-
9423
public async analyzePaths(paths: string[], ignore?: string): Promise<Report> {
95-
let files = null;
96-
let nameCandidate = undefined;
97-
if(paths.length == 1) {
98-
const inputFile = paths[0];
99-
if(inputFile.toLowerCase().endsWith(".zip")) {
100-
files = this.fromZIP(inputFile);
101-
nameCandidate = path.basename(inputFile, ".zip");
102-
} else if(inputFile.toLowerCase().endsWith(".csv")) {
103-
files = this.fromCSV(inputFile);
104-
if (inputFile.endsWith("info.csv")) {
105-
nameCandidate = path.dirname(inputFile).split(path.sep).pop();
106-
}
107-
} else {
108-
throw new Error("You gave one input file, but is not a CSV file or a ZIP archive.");
109-
}
110-
} else {
111-
files = readFiles(paths);
112-
if (paths.length === 2) {
113-
nameCandidate = path.basename(paths[0]) + " & " + path.basename(paths[1]);
114-
}
115-
}
116-
let ignoredFile;
117-
if (ignore) {
118-
ignoredFile = (await readPath(ignore)).ok();
119-
}
120-
return this.analyze((await files).ok(), nameCandidate, ignoredFile);
24+
const dataset = await Dataset.create(paths, ignore);
25+
return this.analyze(dataset.files, dataset.name, dataset.ignore);
12126
}
12227

12328
public async analyze(

lib/src/test/dolos.test.ts

+27
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,33 @@ test("should read CSV-files", async t => {
234234
t.true(pairs[0].similarity > 0.75);
235235
});
236236

237+
// Analyzes a CSV info file in which one row is flagged as template code and
// checks that code shared with that template no longer produces matches.
test("should read CSV-files with template code", async t => {
  const report = await new Dolos().analyzePaths(["./src/test/fixtures/javascript/info_with_template.csv"]);

  t.is(5, report.files.length);

  // Only the first four reported files are inspected below.
  const [boilerplate, unique, alternative, similar] = report.files;

  // Boilerplate copy should not have a match
  for (const other of [unique, alternative, similar]) {
    t.is(0, report.getPair(boilerplate, other).similarity);
  }

  // The two related implementations must stand out against the unique one.
  const referenceSimilarity = report.getPair(alternative, similar).similarity;
  const uniqueVsAlternative = report.getPair(unique, alternative).similarity;
  const uniqueVsSimilar = report.getPair(unique, similar).similarity;

  t.true(uniqueVsAlternative < referenceSimilarity, "Pairs with unique should be less similar");
  t.true(uniqueVsSimilar < referenceSimilarity, "Pairs with unique should be less similar");
  t.true(referenceSimilarity > 0.5, "Pairs with similar code should have a similarity above 50%");
});
263+
237264
test("should read ZIP-files with info.csv", async t => {
238265
const dolos = new Dolos();
239266

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
filename,id,labels,created_at,ignored
2+
boilerplate.js,1,original,2019-07-23 17:12:33 +0200,true
3+
boilerplate_copy.js,1,original,2019-07-23 17:12:33 +0200
4+
implementation-unique.js,2,copy,2019-07-25 11:02:57 +0200
5+
implementation-alternative.js,3,copy,2019-07-25 14:43:20 +0200
6+
implementation-alternative-similar.js,4,copy,2019-07-27 19:22:39 +0200
7+
boilerplate_copy.js,5,template,2019-07-27 19:22:39 +0200

0 commit comments

Comments
 (0)