dodona-edu
diff --git a/‎.gitignore
+12-7 b/‎.gitignore
+12-7
diff --git a/‎core/.eslintrc
+21 b/‎core/.eslintrc
+21
diff --git a/‎core/.gitignore
+11 b/‎core/.gitignore
+11
diff --git a/‎core/.npmignore
+6 b/‎core/.npmignore
+6
diff --git a/‎core/LICENSE
+21 b/‎core/LICENSE
+21
diff --git a/‎core/ava.config.js
+12 b/‎core/ava.config.js
+12
diff --git a/‎core/package.json
+47 b/‎core/package.json
+47
diff --git a/‎core/src/algorithm/fingerprintIndex.ts
+151 b/‎core/src/algorithm/fingerprintIndex.ts
+151
diff --git a/‎lib/src/lib/analyze/fragment.ts ‎core/src/algorithm/fragment.ts b/‎lib/src/lib/analyze/fragment.ts ‎core/src/algorithm/fragment.ts
diff --git a/‎lib/src/lib/analyze/pair.ts ‎core/src/algorithm/pair.ts
+15-17 b/‎lib/src/lib/analyze/pair.ts ‎core/src/algorithm/pair.ts
+15-17
diff --git a/‎lib/src/lib/analyze/pairedOccurrence.ts ‎core/src/algorithm/pairedOccurrence.ts b/‎lib/src/lib/analyze/pairedOccurrence.ts ‎core/src/algorithm/pairedOccurrence.ts
diff --git a/‎lib/src/lib/analyze/sharedFingerprint.ts ‎core/src/algorithm/sharedFingerprint.ts
+2-2 b/‎lib/src/lib/analyze/sharedFingerprint.ts ‎core/src/algorithm/sharedFingerprint.ts
+2-2
@@ -1,13 +1,18 @@
-node_modules/
-*.tsbuildinfo
-vetur.config.js
+!docs/yarn.lock
 **/package-lock.json
 **/yarn.lock
-*.log
-*.txt
 *.csv
+*.log
 *.tgz
+*.tsbuildinfo
+*.txt
 *.zip
-dolos-report-*/
-!docs/yarn.lock
 .nvmrc
+.nyc_output
+.vscode
+dist/
+dolos-analysis-*
+dolos-report-*/
+node_modules/
+vetur.config.js
+yarn-error.log
@@ -0,0 +1,21 @@
+{
+    "parser": "@typescript-eslint/parser",
+        "plugins": ["@typescript-eslint"],
+        "parserOptions": {
+            "ecmaVersion": 2019,
+            "sourceType": "module"
+        },
+        "extends": [
+            "eslint:recommended",
+            "plugin:@typescript-eslint/eslint-recommended",
+            "plugin:@typescript-eslint/recommended",
+            "../.eslintrc"
+        ],
+        "env": {
+            "node": true,
+            "es6": true
+        },
+        "globals": {
+            "Promise": "true"
+        }
+}
@@ -0,0 +1,11 @@
+node_modules
+dist
+coverage
+yarn-error.log
+.vscode
+dolos-analysis-*
+dolos-report-*
+.nyc_output
+*.tgz
+*.tsbuildinfo
+../cli/data/testfiles
@@ -0,0 +1,6 @@
+/*
+!dist/
+dist/tsconfig.tsbuildinfo
+dist/test/
+dist/bin/
+*.map
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Team Dodona <dodona@ugent.be>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,12 @@
+export default {
+  typescript: { // settings for the @ava/typescript provider
+    rewritePaths: {
+        "src/": "dist/" // source path prefix -> build output path prefix
+    },
+    compile: false, // AVA does not compile; the package's "test" script runs `tsc --build` first
+  },
+  files: [
+    "src/test/**.ts"
+  ],
+  workerThreads: false // AVA option: do not run test files in worker threads
+};
@@ -0,0 +1,47 @@
+{
+  "name": "@dodona/dolos-core",
+  "version": "1.0.0",
+  "exports": "./dist/index.js",
+  "type": "module",
+  "description": "Core classes and algorithms for Dolos source code similarity checker",
+  "types": "dist/index.d.ts",
+  "engines": {
+    "node": ">=16"
+  },
+  "scripts": {
+    "test": "tsc --build && ava",
+    "build": "tsc --build --verbose",
+    "force-build": "tsc --build --verbose --force",
+    "lint": "eslint --ext .ts src/"
+  },
+  "repository": {
+    "type": "git",
+    "url": "git+ssh://git@github.com/dodona-edu/dolos.git"
+  },
+  "license": "MIT",
+  "private": false,
+  "publishConfig": {
+    "registry": "https://registry.npmjs.org",
+    "access": "public"
+  },
+  "devDependencies": {
+    "@ava/typescript": "4.1.0",
+    "@typescript-eslint/eslint-plugin": "5.60.1",
+    "@typescript-eslint/parser": "5.60.1",
+    "ava": "5.3.1",
+    "eslint": "8.44.0",
+    "np": "7.7.0",
+    "typescript": "5.1.6"
+  },
+  "bugs": {
+    "url": "https://github.com/dodona-edu/dolos/issues"
+  },
+  "homepage": "https://dolos.ugent.be",
+  "keywords": [
+    "plagiarism",
+    "plagiarism-checker",
+    "plagiarism detection",
+    "similarity",
+    "code similarity"
+  ]
+}
@@ -0,0 +1,151 @@
+import { HashFilter } from "../hashing/hashFilter.js";
+import { Range } from "../util/range.js";
+import { Region } from "../util/region.js";
+import { WinnowFilter } from "../hashing/winnowFilter.js";
+import { TokenizedFile } from "..//file/tokenizedFile.js";
+import { SharedFingerprint } from "./sharedFingerprint.js";
+import { ASTRegion } from "./pairedOccurrence.js";
+import { Pair } from "./pair.js";
+import { assert, assertDefined, closestMatch } from "../util/utils.js";
+
+export type Hash = number; // key type of FingerprintIndex's hash-to-fingerprint map
+
+export interface FileEntry {
+  file: TokenizedFile; // the file this entry belongs to
+  kgrams: Array<Range>; // one Range(start, stop) per fingerprint of the file, in the order produced
+  shared: Set<SharedFingerprint>; // fingerprints of the index that have an occurrence in this file
+}
+
+export interface Occurrence {
+  file: TokenizedFile; // file in which the fingerprint was found
+  side: ASTRegion; // FingerprintIndex fills this with kgram index, token start/stop, data and location
+}
+
+export class FingerprintIndex {
+  private readonly hashFilter: HashFilter;
+  /** Entries of all files added so far, keyed by file id. */
+  private readonly files: Map<number, FileEntry>;
+  /** Every fingerprint seen so far, keyed by its hash. */
+  private readonly index: Map<Hash, SharedFingerprint>;
+
+  /**
+   * Creates a Fingerprint Index which is able to compare files with each other
+   * based on their winnowed fingerprints (kgrams of tokens).
+   *
+   * @param kgramLength first argument handed to the WinnowFilter
+   * @param kgramsInWindow second argument handed to the WinnowFilter
+   * @param kgramData optional flag passed on unchanged to the WinnowFilter
+   */
+  constructor(
+    private readonly kgramLength: number,
+    private readonly kgramsInWindow: number,
+    kgramData?: boolean
+  ) {
+    this.hashFilter = new WinnowFilter(this.kgramLength, this.kgramsInWindow, kgramData);
+    this.files = new Map<number, FileEntry>();
+    this.index = new Map<Hash, SharedFingerprint>();
+  }
+
+  /**
+   * Fingerprints the given files and records every fingerprint in the index.
+   *
+   * @param tokenizedFiles files to add; asserted not to be in the index already
+   * @returns the hash-to-fingerprint map of this index itself, not a copy
+   */
+  public async addFiles(tokenizedFiles: TokenizedFile[]): Promise<Map<Hash, SharedFingerprint>> {
+    for (const f of tokenizedFiles) {
+      assert(!this.files.has(f.id), `This file has already been analyzed: ${f.file.path}`);
+    }
+
+    for (const file of tokenizedFiles) {
+      // position of the next kgram within this file
+      let kgram = 0;
+      const entry: FileEntry = { file, kgrams: [], shared: new Set<SharedFingerprint>() };
+      this.files.set(file.id, entry);
+
+      for await (const { data, hash, start, stop } of this.hashFilter.fingerprints(file.tokens)) {
+        // add kgram to file
+        entry.kgrams.push(new Range(start, stop));
+
+        // sanity check
+        assert(
+          Region.isInOrder(file.mapping[start], file.mapping[stop])
+            // If we end our kgram on a ')', the location of the opening token is used.
+            // However, the location of this token in the file might be before
+            // the location of the starting token of the kmer
+            // For example: the last token of every ast is ')', closing the program.
+            // The location of this token is always (0, 0), since the program root is the first token.
+            // In this way, the 'end' token is before any other token in the AST.
+            || file.tokens[stop] === ")",
+          `Invalid ordering:
+             expected ${file.mapping[start]}
+             to start before the end of ${file.mapping[stop]}`
+        );
+
+        const location = Region.merge(file.mapping[start], file.mapping[stop]);
+        const part: Occurrence = {
+          file,
+          side: { index: kgram, start, stop, data, location }
+        };
+
+        // look if the index already contains the given hashing
+        let shared: SharedFingerprint | undefined = this.index.get(hash);
+        if (!shared) {
+          // if the hashing does not yet exist in the index, add it
+          shared = new SharedFingerprint(hash, data);
+          this.index.set(hash, shared);
+        }
+
+        shared.add(part);
+        entry.shared.add(shared);
+        kgram += 1;
+      }
+    }
+
+    return this.index;
+  }
+
+  /** Returns a new array holding every fingerprint currently in the index. */
+  public sharedFingerprints(): Array<SharedFingerprint> {
+    return Array.from(this.index.values());
+  }
+
+  /** Builds the Pair of two files; both are asserted to be present in this index. */
+  public getPair(file1: TokenizedFile, file2: TokenizedFile): Pair {
+    const entry1 = this.files.get(file1.id);
+    const entry2 = this.files.get(file2.id);
+    assertDefined(entry1, `File ${file1.path} not found in index`);
+    assertDefined(entry2, `File ${file2.path} not found in index`);
+    return new Pair(entry1, entry2);
+  }
+
+  /**
+   * Builds a Pair for every unordered combination of two files in the index.
+   *
+   * @param sortBy optional sort key, matched with closestMatch against "total overlap",
+   *   "longest fragment" and "similarity"; pairs are sorted descending on that value
+   */
+  public allPairs(sortBy?: string): Array<Pair> {
+    const pairs: Array<Pair> = [];
+    const entries = Array.from(this.files.values());
+    for (let i = 0; i < entries.length; i++) {
+      for (let j = i + 1; j < entries.length; j++) {
+        pairs.push(new Pair(entries[i], entries[j]));
+      }
+    }
+
+    if (sortBy) {
+      type SortFn = (a: Pair, b: Pair) => number;
+      // sortBy is truthy here, so it is passed on as is
+      const sortfn = closestMatch<SortFn>(sortBy, {
+        "total overlap": (a, b) => b.overlap - a.overlap,
+        "longest fragment": (a, b) => b.longest - a.longest,
+        similarity: (a, b) => b.similarity - a.similarity,
+      });
+
+      assertDefined(sortfn, `${sortBy} is not a valid field to sort on`);
+      pairs.sort(sortfn);
+    }
+    return pairs;
+  }
+}
@@ -1,10 +1,9 @@
 import { Range } from "../util/range.js";
 import { PairedOccurrence } from "./pairedOccurrence.js";
 import { Fragment } from "./fragment.js";
-import { TokenizedFile } from "../file/tokenizedFile.js";
-import Identifiable from "../util/identifiable.js";
+import { Identifiable } from "../util/identifiable.js";
 import { SharedFingerprint } from "./sharedFingerprint.js";
-import { Occurrence } from "./index.js";
+import { FileEntry, Occurrence } from "./fingerprintIndex.js";
 
 type LeftRight = string;
 
@@ -13,7 +12,6 @@ interface Kgram {
   index: number,
 }
 
-
 /**
  * This class represents all the fragments between two files (i.e. the
  * pair of their hashes).
@@ -30,18 +28,18 @@ export class Pair extends Identifiable {
   public readonly similarity;
 
   constructor(
-    public readonly leftFile: TokenizedFile,
-    public readonly rightFile: TokenizedFile
+    public readonly leftEntry: FileEntry,
+    public readonly rightEntry: FileEntry,
   ) {
     super();
     let small, large;
     this.shared = [];
-    if (leftFile.shared.size < rightFile.shared.size) {
-      small = leftFile;
-      large = rightFile;
+    if (leftEntry.shared.size < rightEntry.shared.size) {
+      small = leftEntry;
+      large = rightEntry;
     } else {
-      small = rightFile;
-      large = leftFile;
+      small = rightEntry;
+      large = leftEntry;
     }
 
     for (const fingeprint of small.shared) {
@@ -53,10 +51,10 @@ export class Pair extends Identifiable {
     const left: Kgram[] = [];
     const right: Kgram[] = [];
     for (const fingerprint of this.shared) {
-      for (const occurrence of fingerprint.occurrencesOf(this.leftFile)) {
+      for (const occurrence of fingerprint.occurrencesOf(this.leftEntry.file)) {
         left.push({ hash: fingerprint.hash, index: occurrence.side.index });
       }
-      for (const occurrence of fingerprint.occurrencesOf(this.rightFile)) {
+      for (const occurrence of fingerprint.occurrencesOf(this.rightEntry.file)) {
         right.push({ hash: fingerprint.hash, index: occurrence.side.index });
       }
     }
@@ -67,8 +65,8 @@ export class Pair extends Identifiable {
 
     this.leftCovered = left.length;
     this.rightCovered = right.length;
-    this.leftTotal = leftFile.kgrams.length;
-    this.rightTotal = rightFile.kgrams.length;
+    this.leftTotal = leftEntry.kgrams.length;
+    this.rightTotal = rightEntry.kgrams.length;
     if (this.leftTotal + this.rightTotal > 0) {
       this.similarity = (this.leftCovered + this.rightCovered) / (this.leftTotal + this.rightTotal);
     } else {
@@ -122,8 +120,8 @@ export class Pair extends Identifiable {
     const fragmentEnd: Map<LeftRight, Fragment> = new Map();
 
     for (const fingerprint of this.shared) {
-      const left = Array.from(fingerprint.occurrencesOf(this.leftFile).values());
-      const right = Array.from(fingerprint.occurrencesOf(this.rightFile).values());
+      const left = Array.from(fingerprint.occurrencesOf(this.leftEntry.file).values());
+      const right = Array.from(fingerprint.occurrencesOf(this.rightEntry.file).values());
       for (let i = 0; i < left.length; i++) {
         const leftOcc: Occurrence = left[i];
         for (let j = 0; j < right.length; j++) {
 
@@ -1,6 +1,6 @@
-import { Occurrence } from "./index.js";
+import { Occurrence } from "./fingerprintIndex.js";
 import { TokenizedFile } from "../file/tokenizedFile.js";
-import Identifiable from "../util/identifiable.js";
+import { Identifiable } from "../util/identifiable.js";
 
 export class SharedFingerprint extends Identifiable {