Skip to content

Commit 9a598de

Browse files
committed
Create dolos-core with pure JS code
1 parent 341121f commit 9a598de

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

58 files changed

+6730
-685
lines changed

.gitignore

+12-7
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
1-
node_modules/
2-
*.tsbuildinfo
3-
vetur.config.js
1+
!docs/yarn.lock
42
**/package-lock.json
53
**/yarn.lock
6-
*.log
7-
*.txt
84
*.csv
5+
*.log
96
*.tgz
7+
*.tsbuildinfo
8+
*.txt
109
*.zip
11-
dolos-report-*/
12-
!docs/yarn.lock
1310
.nvmrc
11+
.nyc_output
12+
.vscode
13+
dist/
14+
dolos-analysis-*
15+
dolos-report-*/
16+
node_modules/
17+
vetur.config.js
18+
yarn-error.log

core/.eslintrc

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"parser": "@typescript-eslint/parser",
3+
"plugins": ["@typescript-eslint"],
4+
"parserOptions": {
5+
"ecmaVersion": 2019,
6+
"sourceType": "module"
7+
},
8+
"extends": [
9+
"eslint:recommended",
10+
"plugin:@typescript-eslint/eslint-recommended",
11+
"plugin:@typescript-eslint/recommended",
12+
"../.eslintrc"
13+
],
14+
"env": {
15+
"node": true,
16+
"es6": true
17+
},
18+
"globals": {
19+
"Promise": "true"
20+
}
21+
}

core/.gitignore

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
node_modules
2+
dist
3+
coverage
4+
yarn-error.log
5+
.vscode
6+
dolos-analysis-*
7+
dolos-report-*
8+
.nyc_output
9+
*.tgz
10+
*.tsbuildinfo
11+
../cli/data/testfiles

core/.npmignore

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/*
2+
!dist/
3+
dist/tsconfig.tsbuildinfo
4+
dist/test/
5+
dist/bin/
6+
*.map

core/LICENSE

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2019 Team Dodona <[email protected]>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

core/ava.config.js

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
export default {
2+
typescript: {
3+
rewritePaths: {
4+
"src/": "dist/"
5+
},
6+
compile: false,
7+
},
8+
files: [
9+
"src/test/**.ts"
10+
],
11+
workerThreads: false
12+
};

core/package.json

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
{
2+
"name": "@dodona/dolos-core",
3+
"version": "1.0.0",
4+
"exports": "./dist/index.js",
5+
"type": "module",
6+
"description": "Core classes and algorithms for Dolos source code similarity checker",
7+
"types": "dist/index.d.ts",
8+
"engines": {
9+
"node": ">=16"
10+
},
11+
"scripts": {
12+
"test": "tsc --build && ava",
13+
"build": "tsc --build --verbose",
14+
"force-build": "tsc --build --verbose --force",
15+
"lint": "eslint --ext .ts src/"
16+
},
17+
"repository": {
18+
"type": "git",
19+
"url": "git+ssh://[email protected]/dodona-edu/dolos.git"
20+
},
21+
"license": "MIT",
22+
"private": false,
23+
"publishConfig": {
24+
"registry": "https://registry.npmjs.org",
25+
"access": "public"
26+
},
27+
"devDependencies": {
28+
"@ava/typescript": "4.1.0",
29+
"@typescript-eslint/eslint-plugin": "5.60.1",
30+
"@typescript-eslint/parser": "5.60.1",
31+
"ava": "5.3.1",
32+
"eslint": "8.44.0",
33+
"np": "7.7.0",
34+
"typescript": "5.1.6"
35+
},
36+
"bugs": {
37+
"url": "https://github.com/dodona-edu/dolos/issues"
38+
},
39+
"homepage": "https://dolos.ugent.be",
40+
"keywords": [
41+
"plagiarism",
42+
"plagiarism-checker",
43+
"plagiarism detection",
44+
"similarity",
45+
"code similarity"
46+
]
47+
}
+151
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
import { HashFilter } from "../hashing/hashFilter.js";
2+
import { Range } from "../util/range.js";
3+
import { Region } from "../util/region.js";
4+
import { WinnowFilter } from "../hashing/winnowFilter.js";
5+
import { TokenizedFile } from "..//file/tokenizedFile.js";
6+
import { SharedFingerprint } from "./sharedFingerprint.js";
7+
import { ASTRegion } from "./pairedOccurrence.js";
8+
import { Pair } from "./pair.js";
9+
import { assert, assertDefined, closestMatch } from "../util/utils.js";
10+
11+
export type Hash = number;
12+
13+
export interface FileEntry {
14+
file: TokenizedFile;
15+
kgrams: Array<Range>,
16+
shared: Set<SharedFingerprint>;
17+
}
18+
19+
export interface Occurrence {
20+
file: TokenizedFile;
21+
side: ASTRegion;
22+
}
23+
24+
export class FingerprintIndex {
25+
private readonly hashFilter: HashFilter;
26+
private readonly files: Map<number, FileEntry>;
27+
private readonly index: Map<Hash, SharedFingerprint>;
28+
29+
/**
30+
* Creates a Fingerprint Index which is able to compare files with each other
31+
* based on their winnowed fingerprints (kgrams of tokens).
32+
*
33+
*/
34+
constructor(
35+
private readonly kgramLength: number,
36+
private readonly kgramsInWindow: number,
37+
kgramData?: boolean
38+
) {
39+
this.hashFilter = new WinnowFilter(this.kgramLength, this.kgramsInWindow, kgramData);
40+
this.files = new Map<number, FileEntry>();
41+
this.index = new Map<Hash, SharedFingerprint>();
42+
}
43+
44+
public async addFiles(tokenizedFiles: TokenizedFile[]): Promise<Map<Hash, SharedFingerprint>> {
45+
46+
for (const f of tokenizedFiles) {
47+
assert(!this.files.has(f.id), `This file has already been analyzed: ${f.file.path}`);
48+
}
49+
50+
for (const file of tokenizedFiles) {
51+
let kgram = 0;
52+
53+
const entry: FileEntry = {
54+
file,
55+
kgrams: [],
56+
shared: new Set<SharedFingerprint>()
57+
};
58+
59+
this.files.set(file.id, entry);
60+
61+
for await (
62+
const { data, hash, start, stop }
63+
of this.hashFilter.fingerprints(file.tokens)
64+
) {
65+
66+
// add kgram to file
67+
entry.kgrams.push(new Range(start, stop));
68+
69+
// sanity check
70+
assert(
71+
Region.isInOrder(
72+
file.mapping[start],
73+
file.mapping[stop]
74+
)
75+
// If we end our kgram on a ')', the location of the opening token is used.
76+
// However, the location of this token in the file might be before
77+
// the location of the starting token of the kgram.
78+
// For example: the last token of every AST is ')', closing the program.
79+
// The location of this token is always (0, 0), since the program root is the first token.
80+
// In this way, the 'end' token is before any other token in the AST.
81+
|| file.tokens[stop] === ")" ,
82+
`Invalid ordering:
83+
expected ${file.mapping[start]}
84+
to start be before the end of ${file.mapping[stop]}`
85+
);
86+
87+
const location = Region.merge(
88+
file.mapping[start],
89+
file.mapping[stop]
90+
);
91+
92+
const part: Occurrence = {
93+
file,
94+
side: { index: kgram, start, stop, data, location }
95+
};
96+
97+
// check whether the index already contains the given hash
98+
let shared: SharedFingerprint | undefined = this.index.get(hash);
99+
100+
if (!shared) {
101+
// if the hash does not yet exist in the index, add it
102+
shared = new SharedFingerprint(hash, data);
103+
this.index.set(hash, shared);
104+
}
105+
106+
shared.add(part);
107+
entry.shared.add(shared);
108+
109+
kgram += 1;
110+
}
111+
}
112+
113+
return this.index;
114+
}
115+
116+
public sharedFingerprints(): Array<SharedFingerprint> {
117+
return Array.from(this.index.values());
118+
}
119+
120+
public getPair(file1: TokenizedFile, file2: TokenizedFile): Pair {
121+
const entry1 = this.files.get(file1.id);
122+
const entry2 = this.files.get(file2.id);
123+
assertDefined(entry1, `File ${file1.path} not found in index`);
124+
assertDefined(entry2, `File ${file2.path} not found in index`);
125+
return new Pair(entry1, entry2);
126+
}
127+
128+
public allPairs(sortBy?: string): Array<Pair> {
129+
const pairs = [];
130+
const entries = Array.from(this.files.values());
131+
for (let i = 0; i < entries.length; i++) {
132+
for (let j = i + 1; j < entries.length; j++) {
133+
pairs.push(new Pair(entries[i], entries[j]));
134+
}
135+
}
136+
137+
if (sortBy) {
138+
type SortFn = (a: Pair, b: Pair) => number;
139+
const sortfn = closestMatch<SortFn>(sortBy || "similarity", {
140+
"total overlap": (a, b) => b.overlap - a.overlap,
141+
"longest fragment": (a, b) => b.longest - a.longest,
142+
similarity: (a, b) => b.similarity - a.similarity,
143+
});
144+
145+
assertDefined(sortfn, `${sortBy} is not a valid field to sort on`);
146+
147+
pairs.sort(sortfn);
148+
}
149+
return pairs;
150+
}
151+
}
File renamed without changes.

lib/src/lib/analyze/pair.ts core/src/algorithm/pair.ts

+15-17
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import { Range } from "../util/range.js";
22
import { PairedOccurrence } from "./pairedOccurrence.js";
33
import { Fragment } from "./fragment.js";
4-
import { TokenizedFile } from "../file/tokenizedFile.js";
5-
import Identifiable from "../util/identifiable.js";
4+
import { Identifiable } from "../util/identifiable.js";
65
import { SharedFingerprint } from "./sharedFingerprint.js";
7-
import { Occurrence } from "./index.js";
6+
import { FileEntry, Occurrence } from "./fingerprintIndex.js";
87

98
type LeftRight = string;
109

@@ -13,7 +12,6 @@ interface Kgram {
1312
index: number,
1413
}
1514

16-
1715
/**
1816
* This class represents all the fragments between two files (i.e. the
1917
* pair of their hashes).
@@ -30,18 +28,18 @@ export class Pair extends Identifiable {
3028
public readonly similarity;
3129

3230
constructor(
33-
public readonly leftFile: TokenizedFile,
34-
public readonly rightFile: TokenizedFile
31+
public readonly leftEntry: FileEntry,
32+
public readonly rightEntry: FileEntry,
3533
) {
3634
super();
3735
let small, large;
3836
this.shared = [];
39-
if (leftFile.shared.size < rightFile.shared.size) {
40-
small = leftFile;
41-
large = rightFile;
37+
if (leftEntry.shared.size < rightEntry.shared.size) {
38+
small = leftEntry;
39+
large = rightEntry;
4240
} else {
43-
small = rightFile;
44-
large = leftFile;
41+
small = rightEntry;
42+
large = leftEntry;
4543
}
4644

4745
for (const fingeprint of small.shared) {
@@ -53,10 +51,10 @@ export class Pair extends Identifiable {
5351
const left: Kgram[] = [];
5452
const right: Kgram[] = [];
5553
for (const fingerprint of this.shared) {
56-
for (const occurrence of fingerprint.occurrencesOf(this.leftFile)) {
54+
for (const occurrence of fingerprint.occurrencesOf(this.leftEntry.file)) {
5755
left.push({ hash: fingerprint.hash, index: occurrence.side.index });
5856
}
59-
for (const occurrence of fingerprint.occurrencesOf(this.rightFile)) {
57+
for (const occurrence of fingerprint.occurrencesOf(this.rightEntry.file)) {
6058
right.push({ hash: fingerprint.hash, index: occurrence.side.index });
6159
}
6260
}
@@ -67,8 +65,8 @@ export class Pair extends Identifiable {
6765

6866
this.leftCovered = left.length;
6967
this.rightCovered = right.length;
70-
this.leftTotal = leftFile.kgrams.length;
71-
this.rightTotal = rightFile.kgrams.length;
68+
this.leftTotal = leftEntry.kgrams.length;
69+
this.rightTotal = rightEntry.kgrams.length;
7270
if (this.leftTotal + this.rightTotal > 0) {
7371
this.similarity = (this.leftCovered + this.rightCovered) / (this.leftTotal + this.rightTotal);
7472
} else {
@@ -122,8 +120,8 @@ export class Pair extends Identifiable {
122120
const fragmentEnd: Map<LeftRight, Fragment> = new Map();
123121

124122
for (const fingerprint of this.shared) {
125-
const left = Array.from(fingerprint.occurrencesOf(this.leftFile).values());
126-
const right = Array.from(fingerprint.occurrencesOf(this.rightFile).values());
123+
const left = Array.from(fingerprint.occurrencesOf(this.leftEntry.file).values());
124+
const right = Array.from(fingerprint.occurrencesOf(this.rightEntry.file).values());
127125
for (let i = 0; i < left.length; i++) {
128126
const leftOcc: Occurrence = left[i];
129127
for (let j = 0; j < right.length; j++) {
File renamed without changes.

lib/src/lib/analyze/sharedFingerprint.ts core/src/algorithm/sharedFingerprint.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
import { Occurrence } from "./index.js";
1+
import { Occurrence } from "./fingerprintIndex.js";
22
import { TokenizedFile } from "../file/tokenizedFile.js";
3-
import Identifiable from "../util/identifiable.js";
3+
import { Identifiable } from "../util/identifiable.js";
44

55
export class SharedFingerprint extends Identifiable {
66

0 commit comments

Comments
 (0)